index-transformer 1.0.0

//! Site adapters for task-oriented terminal documents.

use std::collections::BTreeSet;

use index_core::{
    AdapterId, ButtonAction, DiagnosticAction, DiagnosticConfidence, DiagnosticRecord,
    DiagnosticSeverity, DiagnosticSource, DocumentQuality, DocumentQualityCategory, FailureCause,
    FailureDiagnostic, Form, IndexDocument, IndexNode, Input, Link, SectionRole,
};
use index_readability::{ReadableNode, ReadablePage, ReadableSectionRole};
use url::Url;

/// Context passed to site adapters.
///
/// The context only borrows the page, so it is `Copy` and can be handed to
/// every adapter in turn.
#[derive(Debug, Clone, Copy)]
pub struct AdapterContext<'a> {
    /// Readable page emitted by the generic extraction pipeline.
    pub page: &'a ReadablePage,
}

/// Result of matching an adapter.
///
/// Returned by [`SiteAdapter::detect`] and passed back to
/// [`SiteAdapter::transform`] for the same page.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AdapterMatch {
    /// Adapter identifier. Most adapters report their own [`SiteAdapter::id`];
    /// an adapter may also report a more specific identifier.
    pub id: AdapterId,
    /// Human-readable page classification, such as an `owner/repo` pair or
    /// the page title.
    pub page_type: String,
}

/// Site-specific semantic transformer.
///
/// An adapter first decides whether it applies to a page (`detect`) and, if
/// so, turns that page into an [`IndexDocument`] (`transform`).
pub trait SiteAdapter {
    /// Returns the stable adapter identifier.
    fn id(&self) -> AdapterId;

    /// Detects whether this adapter applies.
    ///
    /// Returns `None` when the page in `context` is not handled by this
    /// adapter.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch>;

    /// Emits an Index document for this adapter.
    ///
    /// `matched` is the value this adapter's `detect` returned for the same
    /// `context`.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument;
}

/// Registry of available site adapters.
///
/// Adapters are consulted in storage order and the first one that matches
/// wins, so more specific adapters should be placed before broader ones.
pub struct AdapterRegistry {
    // Ordered list of adapters; position determines matching priority.
    adapters: Vec<Box<dyn SiteAdapter>>,
}

impl AdapterRegistry {
    /// Builds a registry that consults `adapters` in the order given.
    #[must_use]
    pub fn new(adapters: Vec<Box<dyn SiteAdapter>>) -> Self {
        Self { adapters }
    }

    /// Builds the default Milestone 5 adapter registry.
    ///
    /// The list order is the matching priority: lookups stop at the first
    /// adapter whose `detect` succeeds.
    ///
    /// NOTE(review): `CompatibilityPackAdapter` and `Top100BaselineAdapter`
    /// are registered before `ArxivAdapter` and `InternetArchiveAdapter`;
    /// confirm the two broad adapters cannot shadow the two specific ones.
    #[must_use]
    pub fn default_registry() -> Self {
        let adapters: Vec<Box<dyn SiteAdapter>> = vec![
            Box::new(GitHubRepositoryAdapter),
            Box::new(GitHubIssueAdapter),
            Box::new(GitLabAdapter),
            Box::new(SourceHutAdapter),
            Box::new(ForgeAdapter),
            Box::new(DocsRsAdapter),
            Box::new(ReadTheDocsAdapter),
            Box::new(MdnAdapter),
            Box::new(CratesIoAdapter),
            Box::new(WikipediaAdapter),
            Box::new(HackerNewsAdapter),
            Box::new(StackOverflowAdapter),
            Box::new(RedditAdapter),
            Box::new(SlashdotAdapter),
            Box::new(DiscourseAdapter),
            Box::new(XenForoAdapter),
            Box::new(LegacyForumAdapter),
            Box::new(CompatibilityPackAdapter),
            Box::new(Top100BaselineAdapter),
            Box::new(ArxivAdapter),
            Box::new(InternetArchiveAdapter),
        ];
        Self { adapters }
    }

    /// Returns the match reported by the first adapter that accepts the page,
    /// or `None` when no adapter does.
    #[must_use]
    pub fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        for adapter in &self.adapters {
            if let Some(matched) = adapter.detect(context) {
                return Some(matched);
            }
        }
        None
    }

    /// Runs `transform` on the first adapter that accepts the page.
    ///
    /// Returns `None` when no adapter matches.
    #[must_use]
    pub fn transform(&self, context: &AdapterContext<'_>) -> Option<IndexDocument> {
        let (adapter, matched) = self
            .adapters
            .iter()
            .find_map(|adapter| adapter.detect(context).map(|matched| (adapter, matched)))?;
        Some(adapter.transform(context, &matched))
    }
}

impl Default for AdapterRegistry {
    fn default() -> Self {
        Self::default_registry()
    }
}

/// GitHub repository adapter.
///
/// Registered as `github.repository`. Handles `github.com/<owner>/<repo>`
/// pages; issue URLs are left to [`GitHubIssueAdapter`].
#[derive(Debug, Clone, Copy)]
pub struct GitHubRepositoryAdapter;

/// GitHub issue adapter.
///
/// Registered as `github.issue`. Handles
/// `github.com/<owner>/<repo>/issues/<number>` pages.
#[derive(Debug, Clone, Copy)]
pub struct GitHubIssueAdapter;

/// GitLab project and issue adapter.
///
/// Registered as `gitlab`. Handles `gitlab.com` URLs with at least two path
/// segments and recognises the `/-/issues/<number>` route.
#[derive(Debug, Clone, Copy)]
pub struct GitLabAdapter;

/// SourceHut project, ticket, and mailing-list adapter.
///
/// Registered as `sourcehut`. Handles hosts ending in `.sr.ht`.
#[derive(Debug, Clone, Copy)]
pub struct SourceHutAdapter;

/// Forgejo and Gitea project adapter.
///
/// Registered as `forge`. Handles `codeberg.org` and `gitea.com` URLs with at
/// least two path segments.
#[derive(Debug, Clone, Copy)]
pub struct ForgeAdapter;

/// docs.rs adapter.
///
/// Registered as `docs.rs`. Handles every page on the `docs.rs` host.
#[derive(Debug, Clone, Copy)]
pub struct DocsRsAdapter;

/// Read the Docs adapter.
///
/// Registered as `read-the-docs`. Handles hosts ending in `.readthedocs.io`.
#[derive(Debug, Clone, Copy)]
pub struct ReadTheDocsAdapter;

/// MDN documentation adapter.
///
/// Registered as `mdn`. Handles pages on `developer.mozilla.org`.
#[derive(Debug, Clone, Copy)]
pub struct MdnAdapter;

/// crates.io adapter.
///
/// Registered as `crates.io`. Handles pages on the `crates.io` host.
#[derive(Debug, Clone, Copy)]
pub struct CratesIoAdapter;

/// Wikipedia adapter.
///
/// Registered as `wikipedia`. Handles pages served from Wikipedia hosts and
/// classifies them by page title.
#[derive(Debug, Clone, Copy)]
pub struct WikipediaAdapter;

/// Hacker News adapter.
///
/// Registered as `hacker-news`. Handles `news.ycombinator.com` pages and,
/// failing that, pages accepted by the `looks_like_hacker_news` heuristic.
#[derive(Debug, Clone, Copy)]
pub struct HackerNewsAdapter;

/// Stack Overflow adapter.
///
/// Registered as `stackoverflow`. Handles any host accepted by
/// `stackexchange_host` and renders it as a StackExchange forum thread.
#[derive(Debug, Clone, Copy)]
pub struct StackOverflowAdapter;

/// Discourse read-only thread adapter.
///
/// Registered as `discourse`. Handles topic URLs (first path segment `t`) on
/// `meta.discourse.org`, `discuss.*` hosts, and hosts containing
/// `.discourse.`.
#[derive(Debug, Clone, Copy)]
pub struct DiscourseAdapter;

/// Reddit read-focused thread adapter.
///
/// Registered as `reddit`. Handles `reddit.com`, `www.reddit.com`, and
/// `old.reddit.com` URLs whose path contains an `r` segment.
#[derive(Debug, Clone, Copy)]
pub struct RedditAdapter;

/// Slashdot read-focused story adapter.
///
/// Registered as `slashdot`. Handles pages on the `slashdot.org` host.
#[derive(Debug, Clone, Copy)]
pub struct SlashdotAdapter;

/// XenForo-like thread adapter for known forum families.
///
/// Registered as `forum-xenforo`. Only a fixed allow-list of hosts is
/// handled, and only for `threads`, `posts`, or `members` routes.
#[derive(Debug, Clone, Copy)]
pub struct XenForoAdapter;

/// Legacy forum adapter for older board structures.
///
/// Registered as `forum-legacy`. Only a fixed allow-list of hosts is handled,
/// and only for URLs that look like forum or thread routes.
#[derive(Debug, Clone, Copy)]
pub struct LegacyForumAdapter;

/// Compatibility pack adapter for generic site-family coverage.
///
/// Its base id is `family-pack`; matches are reported with the pack-specific
/// id `family-pack.<pack>`.
#[derive(Debug, Clone, Copy)]
pub struct CompatibilityPackAdapter;

/// Baseline adapter for supported top-100 web families that do not yet have a
/// dedicated task adapter.
///
/// Registered as `top100.baseline`. Applies when the normalized host is
/// classified into a top-100 family.
#[derive(Debug, Clone, Copy)]
pub struct Top100BaselineAdapter;

/// arXiv abstract adapter.
///
/// Registered as `arxiv`.
#[derive(Debug, Clone, Copy)]
pub struct ArxivAdapter;

/// Internet Archive item adapter.
///
/// Consulted last in the default registry.
#[derive(Debug, Clone, Copy)]
pub struct InternetArchiveAdapter;

/// Kind of forum page, identified by a kebab-case slug.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ForumIntent {
    /// Slug `front-page`.
    FrontPage,
    /// Slug `thread-page`.
    ThreadPage,
    /// Slug `paginated-thread`.
    PaginatedThread,
    /// Slug `reply-form`.
    ReplyForm,
    /// Slug `profile-noise`.
    ProfileNoise,
}

impl ForumIntent {
    /// Returns the kebab-case slug of this intent.
    fn as_str(self) -> &'static str {
        match self {
            ForumIntent::ProfileNoise => "profile-noise",
            ForumIntent::ReplyForm => "reply-form",
            ForumIntent::PaginatedThread => "paginated-thread",
            ForumIntent::ThreadPage => "thread-page",
            ForumIntent::FrontPage => "front-page",
        }
    }
}

/// Forum software or site family, with a slug and a display label each.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum ForumFamily {
    /// Slug `hacker-news`.
    HackerNews,
    /// Slug `stackexchange`.
    StackExchange,
    /// Slug `reddit`.
    Reddit,
    /// Slug `slashdot`.
    Slashdot,
    /// Slug `discourse`.
    Discourse,
    /// Slug `xenforo-like`.
    XenForoLike,
    /// Slug `legacy-forum`.
    Legacy,
}

impl ForumFamily {
    /// Returns the kebab-case slug of this family.
    fn as_str(self) -> &'static str {
        match self {
            ForumFamily::Legacy => "legacy-forum",
            ForumFamily::XenForoLike => "xenforo-like",
            ForumFamily::Discourse => "discourse",
            ForumFamily::Slashdot => "slashdot",
            ForumFamily::Reddit => "reddit",
            ForumFamily::StackExchange => "stackexchange",
            ForumFamily::HackerNews => "hacker-news",
        }
    }

    /// Returns the human-readable name of this family.
    fn label(self) -> &'static str {
        match self {
            ForumFamily::Legacy => "Legacy Forum",
            ForumFamily::XenForoLike => "Forum",
            ForumFamily::Discourse => "Discourse",
            ForumFamily::Slashdot => "Slashdot",
            ForumFamily::Reddit => "Reddit",
            ForumFamily::StackExchange => "StackExchange",
            ForumFamily::HackerNews => "Hacker News",
        }
    }
}

/// Generic site-family pack, with a slug, a display label, and a priority.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum CompatibilityPack {
    /// Slug `forums`, priority 8.
    Forums,
    /// Slug `qa`, priority 7.
    Qa,
    /// Slug `docs`, priority 6.
    Docs,
    /// Slug `news-media`, priority 5.
    NewsMedia,
    /// Slug `portal`, priority 4.
    Portal,
    /// Slug `app-shell`, priority 3.
    AppShell,
    /// Slug `commerce-cards`, priority 2.
    CommerceCards,
    /// Slug `mixed-media`, priority 1.
    MixedMedia,
}

impl CompatibilityPack {
    /// Returns the kebab-case slug of this pack.
    fn as_str(self) -> &'static str {
        match self {
            CompatibilityPack::MixedMedia => "mixed-media",
            CompatibilityPack::CommerceCards => "commerce-cards",
            CompatibilityPack::AppShell => "app-shell",
            CompatibilityPack::Portal => "portal",
            CompatibilityPack::NewsMedia => "news-media",
            CompatibilityPack::Docs => "docs",
            CompatibilityPack::Qa => "qa",
            CompatibilityPack::Forums => "forums",
        }
    }

    /// Returns the human-readable name of this pack.
    fn label(self) -> &'static str {
        match self {
            CompatibilityPack::MixedMedia => "Mixed Media Pack",
            CompatibilityPack::CommerceCards => "Commerce Cards Pack",
            CompatibilityPack::AppShell => "App Shell Pack",
            CompatibilityPack::Portal => "Portal Pack",
            CompatibilityPack::NewsMedia => "News Pack",
            CompatibilityPack::Docs => "Docs Pack",
            CompatibilityPack::Qa => "Q&A Pack",
            CompatibilityPack::Forums => "Forum Pack",
        }
    }

    /// Returns the numeric priority of this pack, from 1 (`MixedMedia`) up to
    /// 8 (`Forums`).
    const fn priority(self) -> u8 {
        match self {
            CompatibilityPack::MixedMedia => 1,
            CompatibilityPack::CommerceCards => 2,
            CompatibilityPack::AppShell => 3,
            CompatibilityPack::Portal => 4,
            CompatibilityPack::NewsMedia => 5,
            CompatibilityPack::Docs => 6,
            CompatibilityPack::Qa => 7,
            CompatibilityPack::Forums => 8,
        }
    }
}

/// A compatibility pack proposed for a page, with its confidence and signals.
#[derive(Clone, Debug, Eq, PartialEq)]
struct CompatibilityPackCandidate {
    /// Pack being proposed.
    pack: CompatibilityPack,
    /// Numeric confidence; see [`Self::confidence_label`] for the buckets.
    confidence: u8,
    /// Free-text signals recorded alongside the candidate.
    signals: Vec<String>,
}

impl CompatibilityPackCandidate {
    /// Buckets the numeric confidence: 5 and above is `"high"`, 3 or 4 is
    /// `"medium"`, anything lower is `"low"`.
    fn confidence_label(&self) -> &'static str {
        if self.confidence >= 5 {
            "high"
        } else if self.confidence >= 3 {
            "medium"
        } else {
            "low"
        }
    }
}

/// Site family used by the top-100 baseline adapter.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Top100Family {
    /// Slug `search-portal`.
    SearchPortal,
    /// Slug `knowledge-reference`.
    KnowledgeReference,
    /// Slug `social-community`.
    SocialCommunity,
    /// Slug `media-streaming`.
    MediaStreaming,
    /// Slug `commerce-marketplace`.
    CommerceMarketplace,
    /// Slug `services-utility`.
    ServicesUtility,
    /// Slug `ai-assistant`.
    AiAssistant,
}

impl Top100Family {
    /// Returns the kebab-case slug of this family.
    fn as_str(self) -> &'static str {
        match self {
            Top100Family::AiAssistant => "ai-assistant",
            Top100Family::ServicesUtility => "services-utility",
            Top100Family::CommerceMarketplace => "commerce-marketplace",
            Top100Family::MediaStreaming => "media-streaming",
            Top100Family::SocialCommunity => "social-community",
            Top100Family::KnowledgeReference => "knowledge-reference",
            Top100Family::SearchPortal => "search-portal",
        }
    }

    /// Returns the human-readable name of this family.
    fn label(self) -> &'static str {
        match self {
            Top100Family::AiAssistant => "AI Assistant",
            Top100Family::ServicesUtility => "Services and Utility",
            Top100Family::CommerceMarketplace => "Commerce and Marketplace",
            Top100Family::MediaStreaming => "Media and Streaming",
            Top100Family::SocialCommunity => "Social Community",
            Top100Family::KnowledgeReference => "Knowledge Reference",
            Top100Family::SearchPortal => "Search Portal",
        }
    }
}

/// Page intent used by the top-100 baseline adapter.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Top100Intent {
    /// Slug `portal-landing`.
    PortalLanding,
    /// Slug `search-results`.
    SearchResults,
    /// Slug `article-or-reference`.
    ArticleOrReference,
    /// Slug `app-shell`.
    AppShell,
    /// Slug `feed-or-thread`.
    FeedOrThread,
    /// Slug `video-hub`.
    VideoHub,
    /// Slug `marketplace-listing`.
    MarketplaceListing,
}

impl Top100Intent {
    /// Returns the kebab-case slug of this intent.
    fn as_str(self) -> &'static str {
        match self {
            Top100Intent::MarketplaceListing => "marketplace-listing",
            Top100Intent::VideoHub => "video-hub",
            Top100Intent::FeedOrThread => "feed-or-thread",
            Top100Intent::AppShell => "app-shell",
            Top100Intent::ArticleOrReference => "article-or-reference",
            Top100Intent::SearchResults => "search-results",
            Top100Intent::PortalLanding => "portal-landing",
        }
    }
}

/// Classification of a blocked page flow; `None` means the page is not blocked.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum BlockedFlowClass {
    /// Slug `none`.
    None,
    /// Slug `auth-wall`.
    AuthWall,
    /// Slug `script-gate`.
    ScriptGate,
    /// Slug `bot-gate`.
    BotGate,
    /// Slug `geo-gate`.
    GeoGate,
    /// Slug `age-gate`.
    AgeGate,
    /// Slug `policy-blocked` (note: not `policy-gate`).
    PolicyGate,
}

impl BlockedFlowClass {
    /// Returns the kebab-case slug of this class.
    fn as_str(self) -> &'static str {
        match self {
            BlockedFlowClass::PolicyGate => "policy-blocked",
            BlockedFlowClass::AgeGate => "age-gate",
            BlockedFlowClass::GeoGate => "geo-gate",
            BlockedFlowClass::BotGate => "bot-gate",
            BlockedFlowClass::ScriptGate => "script-gate",
            BlockedFlowClass::AuthWall => "auth-wall",
            BlockedFlowClass::None => "none",
        }
    }
}

impl SiteAdapter for GitHubRepositoryAdapter {
    /// Stable identifier: `github.repository`.
    fn id(&self) -> AdapterId {
        AdapterId::new("github.repository")
    }

    /// Matches `github.com/<owner>/<repo>[/...]` URLs, excluding the
    /// repository's issue routes (`/<owner>/<repo>/issues[/...]`).
    ///
    /// Only the route segment directly after `<owner>/<repo>` is compared
    /// with `issues`, mirroring the position [`GitHubIssueAdapter`] checks.
    /// A path that merely contains a segment named `issues` elsewhere (for
    /// example a directory in a `tree/` or `blob/` URL, or a repository
    /// called `issues`) is still treated as a repository page.
    ///
    /// The reported `page_type` is `<owner>/<repo>`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        if url.host_str() != Some("github.com") {
            return None;
        }
        let segments = path_segments(&url);
        // Issue list and issue detail pages are not repository pages.
        let is_issue_route = segments.get(2) == Some(&"issues");
        if segments.len() < 2 || is_issue_route {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: format!("{}/{}", segments[0], segments[1]),
        })
    }

    /// Builds the document via `task_document`, passing
    /// `GitHub repository: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        task_document(
            context.page,
            &matched.id,
            format!("GitHub repository: {}", matched.page_type),
            vec![
                "Browse code".to_owned(),
                "Open issues".to_owned(),
                "Review pull requests".to_owned(),
                "Read project documentation".to_owned(),
            ],
        )
    }
}

impl SiteAdapter for GitHubIssueAdapter {
    /// Stable identifier: `github.issue`.
    fn id(&self) -> AdapterId {
        AdapterId::new("github.issue")
    }

    /// Matches `github.com/<owner>/<repo>/issues/<number>[/...]` URLs.
    ///
    /// The reported `page_type` is `<owner>/<repo> issue #<number>`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let segments = path_segments(&url);
        let on_github = url.host_str() == Some("github.com");
        // The length check must come first so the index below cannot panic.
        if !on_github || segments.len() < 4 || segments[2] != "issues" {
            return None;
        }
        let (owner, repo, number) = (segments[0], segments[1], segments[3]);
        Some(AdapterMatch {
            id: self.id(),
            page_type: format!("{owner}/{repo} issue #{number}"),
        })
    }

    /// Builds the document via `task_document`, passing
    /// `GitHub issue: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("GitHub issue: {}", matched.page_type);
        let tasks = vec![
            String::from("Read issue summary"),
            String::from("Inspect labels and status"),
            String::from("Review discussion links"),
        ];
        task_document(context.page, &matched.id, label, tasks)
    }
}

impl SiteAdapter for GitLabAdapter {
    /// Stable identifier: `gitlab`.
    fn id(&self) -> AdapterId {
        AdapterId::new("gitlab")
    }

    /// Matches `gitlab.com` URLs with at least two path segments.
    ///
    /// The first two segments form the project label. When the path contains
    /// an `issues` segment directly preceded by `-`, the `page_type` becomes
    /// `<a>/<b> issue #<number>`, where `<number>` is the segment after
    /// `issues` or `unknown` if there is none. Otherwise it is `<a>/<b>`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let segments = path_segments(&url);
        if url.host_str() != Some("gitlab.com") || segments.len() < 2 {
            return None;
        }

        let project = format!("{}/{}", segments[0], segments[1]);
        // Index of the first `issues` segment that follows a `-` separator.
        let issue_index = (1..segments.len())
            .find(|&index| segments[index] == "issues" && segments[index - 1] == "-");
        let page_type = match issue_index {
            Some(index) => {
                let number = segments.get(index + 1).copied().unwrap_or("unknown");
                format!("{project} issue #{number}")
            }
            None => project,
        };

        Some(AdapterMatch {
            id: self.id(),
            page_type,
        })
    }

    /// Builds the document via `task_document`, passing
    /// `GitLab: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("GitLab: {}", matched.page_type);
        let tasks = vec![
            String::from("Browse project files"),
            String::from("Open issues and merge requests"),
            String::from("Read project documentation"),
        ];
        task_document(context.page, &matched.id, label, tasks)
    }
}

impl SiteAdapter for SourceHutAdapter {
    /// Stable identifier: `sourcehut`.
    fn id(&self) -> AdapterId {
        AdapterId::new("sourcehut")
    }

    /// Matches any host ending in `.sr.ht`.
    ///
    /// The reported `page_type` is `<host> <first path segment>`, using
    /// `root` when the path has no segments.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        if !host.ends_with(".sr.ht") {
            return None;
        }
        let segments = path_segments(&url);
        let section = segments.first().copied().unwrap_or("root");
        Some(AdapterMatch {
            id: self.id(),
            page_type: format!("{host} {section}"),
        })
    }

    /// Builds the document via `task_document`, passing
    /// `SourceHut: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("SourceHut: {}", matched.page_type);
        let tasks = vec![
            String::from("Open project summary"),
            String::from("Read mailing-list context"),
            String::from("Inspect tickets or patches"),
        ];
        task_document(context.page, &matched.id, label, tasks)
    }
}

impl SiteAdapter for ForgeAdapter {
    /// Stable identifier: `forge`.
    fn id(&self) -> AdapterId {
        AdapterId::new("forge")
    }

    /// Matches `codeberg.org` and `gitea.com` URLs with at least two path
    /// segments.
    ///
    /// The reported `page_type` is `<host>/<owner>/<repo> issue #<number>`
    /// for `/<owner>/<repo>/issues/<number>` paths and
    /// `<host>/<owner>/<repo>` otherwise.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let segments = path_segments(&url);
        if !matches!(host, "codeberg.org" | "gitea.com") || segments.len() < 2 {
            return None;
        }

        let (owner, repo) = (segments[0], segments[1]);
        // An issue page needs both the `issues` route and a fourth segment.
        let page_type = match (segments.get(2), segments.get(3)) {
            (Some(&"issues"), Some(number)) => format!("{host}/{owner}/{repo} issue #{number}"),
            _ => format!("{host}/{owner}/{repo}"),
        };

        Some(AdapterMatch {
            id: self.id(),
            page_type,
        })
    }

    /// Builds the document via `task_document`, passing
    /// `Forge: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("Forge: {}", matched.page_type);
        let tasks = vec![
            String::from("Browse repository"),
            String::from("Open issues and pull requests"),
            String::from("Read release notes"),
        ];
        task_document(context.page, &matched.id, label, tasks)
    }
}

impl SiteAdapter for DocsRsAdapter {
    /// Stable identifier: `docs.rs`.
    fn id(&self) -> AdapterId {
        AdapterId::new("docs.rs")
    }

    /// Matches every URL on the `docs.rs` host.
    ///
    /// The reported `page_type` is the first path segment, or
    /// `crate documentation` when the path has no segments.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        if url.host_str() != Some("docs.rs") {
            return None;
        }
        let page_type = match path_segments(&url).first() {
            Some(name) => (*name).to_owned(),
            None => "crate documentation".to_owned(),
        };
        Some(AdapterMatch {
            id: self.id(),
            page_type,
        })
    }

    /// Builds the document via `task_document`, passing
    /// `docs.rs: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("docs.rs: {}", matched.page_type);
        let tasks = vec![
            String::from("Open crate modules"),
            String::from("Search items"),
            String::from("Read examples"),
        ];
        task_document(context.page, &matched.id, label, tasks)
    }
}

impl SiteAdapter for ReadTheDocsAdapter {
    /// Stable identifier: `read-the-docs`.
    fn id(&self) -> AdapterId {
        AdapterId::new("read-the-docs")
    }

    /// Matches any host ending in `.readthedocs.io`; the host itself becomes
    /// the `page_type`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        host.ends_with(".readthedocs.io").then(|| AdapterMatch {
            id: self.id(),
            page_type: host.to_owned(),
        })
    }

    /// Builds the document via `task_document`, passing
    /// `Read the Docs: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("Read the Docs: {}", matched.page_type);
        let tasks = vec![
            String::from("Read current section"),
            String::from("Open table of contents"),
            String::from("Search documentation links"),
        ];
        task_document(context.page, &matched.id, label, tasks)
    }
}

impl SiteAdapter for MdnAdapter {
    /// Stable identifier: `mdn`.
    fn id(&self) -> AdapterId {
        AdapterId::new("mdn")
    }

    /// Matches every URL on `developer.mozilla.org`; the page title becomes
    /// the `page_type`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        match url.host_str() {
            Some("developer.mozilla.org") => Some(AdapterMatch {
                id: self.id(),
                page_type: context.page.title.clone(),
            }),
            _ => None,
        }
    }

    /// Builds the document via `task_document`, passing
    /// `MDN: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("MDN: {}", matched.page_type);
        let tasks = vec![
            String::from("Read API summary"),
            String::from("Inspect examples"),
            String::from("Open browser compatibility notes"),
        ];
        task_document(context.page, &matched.id, label, tasks)
    }
}

impl SiteAdapter for CratesIoAdapter {
    /// Stable identifier: `crates.io`.
    fn id(&self) -> AdapterId {
        AdapterId::new("crates.io")
    }

    /// Matches every URL on the `crates.io` host.
    ///
    /// The reported `page_type` is the second path segment, or `crate` when
    /// the path has fewer than two segments.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        if url.host_str() != Some("crates.io") {
            return None;
        }
        let segments = path_segments(&url);
        let crate_name = segments.get(1).copied().unwrap_or("crate");
        Some(AdapterMatch {
            id: self.id(),
            page_type: crate_name.to_owned(),
        })
    }

    /// Builds the document via `task_document`, passing
    /// `crates.io: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("crates.io: {}", matched.page_type);
        let tasks = vec![
            String::from("Inspect crate metadata"),
            String::from("Open documentation"),
            String::from("Review repository and versions"),
        ];
        task_document(context.page, &matched.id, label, tasks)
    }
}

impl SiteAdapter for WikipediaAdapter {
    /// Stable identifier: `wikipedia`.
    fn id(&self) -> AdapterId {
        AdapterId::new("wikipedia")
    }

    /// Matches `wikipedia.org` and its subdomains (for example
    /// `en.wikipedia.org`); the page title becomes the `page_type`.
    ///
    /// The suffix check includes the leading dot so that unrelated domains
    /// which merely end in the same letters (such as `notwikipedia.org`) are
    /// not claimed by this adapter.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let is_wikipedia = host == "wikipedia.org" || host.ends_with(".wikipedia.org");
        is_wikipedia.then(|| AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds the document via `task_document`, passing
    /// `Wikipedia: <page_type>` and a fixed list of task strings.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        task_document(
            context.page,
            &matched.id,
            format!("Wikipedia: {}", matched.page_type),
            vec![
                "Read article lead".to_owned(),
                "Open references".to_owned(),
                "Inspect related links".to_owned(),
            ],
        )
    }
}

impl SiteAdapter for HackerNewsAdapter {
    /// Stable identifier: `hacker-news`.
    fn id(&self) -> AdapterId {
        AdapterId::new("hacker-news")
    }

    /// Matches pages on `news.ycombinator.com`, or any other page accepted by
    /// `looks_like_hacker_news`.
    ///
    /// The `page_type` comes from `hacker_news_page_type`; the URL is passed
    /// to it only when the host matched.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let title = &context.page.title;
        let hn_url = page_url(context.page)
            .filter(|url| url.host_str() == Some("news.ycombinator.com"));

        let page_type = match hn_url.as_ref() {
            Some(url) => hacker_news_page_type(Some(url), title),
            // The content heuristic runs only when the host did not match.
            None if looks_like_hacker_news(context.page) => hacker_news_page_type(None, title),
            None => return None,
        };

        Some(AdapterMatch {
            id: self.id(),
            page_type,
        })
    }

    /// Delegates to `hacker_news_document`.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        hacker_news_document(context.page, matched)
    }
}

impl SiteAdapter for StackOverflowAdapter {
    /// Stable identifier: `stackoverflow`.
    fn id(&self) -> AdapterId {
        AdapterId::new("stackoverflow")
    }

    /// Matches any host accepted by `stackexchange_host`; the page title
    /// becomes the `page_type`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        if !stackexchange_host(host) {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a StackExchange-family document via `forum_thread_document`,
    /// with no link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("StackExchange: {}", matched.page_type);
        let tasks = vec![
            String::from("Read question"),
            String::from("Review accepted answer"),
            String::from("Inspect comments and related answers"),
            String::from("Open outbound references"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::StackExchange,
            label,
            tasks,
            None,
        )
    }
}

impl SiteAdapter for DiscourseAdapter {
    /// Stable identifier: `discourse`.
    fn id(&self) -> AdapterId {
        AdapterId::new("discourse")
    }

    /// Matches topic URLs (first path segment `t`) on `meta.discourse.org`,
    /// on hosts starting with `discuss.`, or on hosts containing
    /// `.discourse.`; the page title becomes the `page_type`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let discourse_host = host == "meta.discourse.org"
            || host.starts_with("discuss.")
            || host.contains(".discourse.");
        let topic_route = path_segments(&url).first() == Some(&"t");
        if discourse_host && topic_route {
            Some(AdapterMatch {
                id: self.id(),
                page_type: context.page.title.clone(),
            })
        } else {
            None
        }
    }

    /// Builds a Discourse-family document via `forum_thread_document`, with
    /// no link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("Discourse thread: {}", matched.page_type);
        let tasks = vec![
            String::from("Read topic summary"),
            String::from("Inspect replies"),
            String::from("Open related discussion links"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::Discourse,
            label,
            tasks,
            None,
        )
    }
}

impl SiteAdapter for RedditAdapter {
    /// Stable identifier: `reddit`.
    fn id(&self) -> AdapterId {
        AdapterId::new("reddit")
    }

    /// Matches `reddit.com`, `www.reddit.com`, and `old.reddit.com` URLs
    /// whose path contains an `r` segment; the page title becomes the
    /// `page_type`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        if !matches!(host, "reddit.com" | "www.reddit.com" | "old.reddit.com") {
            return None;
        }
        if !path_segments(&url).contains(&"r") {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a Reddit-family document via `forum_thread_document`, using
    /// `is_reddit_actionable_link` as the link filter.
    ///
    /// When `reddit_script_gated` reports the page as gated, a blocked
    /// document from `blocked_forum_document` is returned instead.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let page = context.page;
        if reddit_script_gated(page) {
            return blocked_forum_document(
                page,
                &matched.id,
                "Reddit requires additional script/cookie flow for this page shape",
            );
        }

        let label = format!("Reddit thread: {}", matched.page_type);
        let tasks = vec![
            String::from("Read post summary"),
            String::from("Inspect nested comments"),
            String::from("Open outbound links"),
        ];
        forum_thread_document(
            page,
            &matched.id,
            ForumFamily::Reddit,
            label,
            tasks,
            Some(is_reddit_actionable_link),
        )
    }
}

impl SiteAdapter for SlashdotAdapter {
    /// Stable identifier: `slashdot`.
    fn id(&self) -> AdapterId {
        AdapterId::new("slashdot")
    }

    /// Matches `slashdot.org` pages when the path contains a `story`
    /// segment, the first segment is `index2.pl`, or the title contains
    /// `slashdot` (ASCII case-insensitive); the page title becomes the
    /// `page_type`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        if host != "slashdot.org" {
            return None;
        }
        let segments = path_segments(&url);
        let title = &context.page.title;
        let story_like = segments.contains(&"story")
            || segments.first() == Some(&"index2.pl")
            || title.to_ascii_lowercase().contains("slashdot");
        story_like.then(|| AdapterMatch {
            id: self.id(),
            page_type: title.clone(),
        })
    }

    /// Builds a Slashdot-family document via `forum_thread_document`, with no
    /// link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("Slashdot: {}", matched.page_type);
        let tasks = vec![
            String::from("Read story summary"),
            String::from("Inspect comment thread"),
            String::from("Open source links"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::Slashdot,
            label,
            tasks,
            None,
        )
    }
}

impl SiteAdapter for XenForoAdapter {
    /// Stable identifier: `forum-xenforo`.
    fn id(&self) -> AdapterId {
        AdapterId::new("forum-xenforo")
    }

    /// Matches allow-listed forum hosts when the path contains a `threads`,
    /// `posts`, or `members` segment; the page title becomes the
    /// `page_type`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        // Hosts this adapter is allowed to claim.
        const KNOWN_HOSTS: [&str; 7] = [
            "resetera.com",
            "www.resetera.com",
            "www.neogaf.com",
            "neogaf.com",
            "forums.spacebattles.com",
            "forums.sufficientvelocity.com",
            "forums.overclock.net",
        ];
        // Path segments that mark a page this adapter handles.
        const ROUTES: [&str; 3] = ["threads", "posts", "members"];

        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let segments = path_segments(&url);
        if !KNOWN_HOSTS.contains(&host) {
            return None;
        }
        if !ROUTES.iter().any(|route| segments.contains(route)) {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a XenForo-like document via `forum_thread_document`, with no
    /// link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("Forum thread: {}", matched.page_type);
        let tasks = vec![
            String::from("Read thread posts"),
            String::from("Inspect quote/reply chain"),
            String::from("Open pagination links"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::XenForoLike,
            label,
            tasks,
            None,
        )
    }
}

impl SiteAdapter for LegacyForumAdapter {
    /// Stable identifier: `forum-legacy`.
    fn id(&self) -> AdapterId {
        AdapterId::new("forum-legacy")
    }

    /// Matches allow-listed hosts whose URL looks like a forum route: a
    /// `forum` or `forums` path segment, or `showthread` / `viewtopic`
    /// anywhere in the URL string. The page title becomes the `page_type`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        // Hosts this adapter is allowed to claim.
        const KNOWN_HOSTS: &[&str] = &[
            "forum.xda-developers.com",
            "forums.tomshardware.com",
            "forums.anandtech.com",
            "forums.macrumors.com",
            "www.avsforum.com",
            "www.city-data.com",
            "www.skyscrapercity.com",
            "forums.digitalpoint.com",
            "www.webhostingtalk.com",
            "www.sitepoint.com",
            "www.namepros.com",
            "www.mumsnet.com",
            "www.thestudentroom.co.uk",
            "www.boards.ie",
            "forums.overclockers.co.uk",
            "www.badcaps.net",
            "forums.moneysavingexpert.com",
            "www.dslreports.com",
            "forum.bodybuilding.com",
            "forums.sherdog.com",
            "www.mtgsalvation.com",
            "www.alternatehistory.com",
            "forums.futura-sciences.com",
            "www.physicsforums.com",
            "forums.whirlpool.net.au",
            "forums.somethingawful.com",
            "www.gaiaonline.com",
            "arstechnica.com",
        ];

        let url = page_url(context.page)?;
        let host = url.host_str()?;
        if !KNOWN_HOSTS.contains(&host) {
            return None;
        }

        let segments = path_segments(&url);
        let raw_url = url.as_str();
        let forum_route = segments.contains(&"forum")
            || segments.contains(&"forums")
            || raw_url.contains("showthread")
            || raw_url.contains("viewtopic");
        if !forum_route {
            return None;
        }

        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a legacy-forum document via `forum_thread_document`, using
    /// `is_legacy_actionable_link` as the link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let label = format!("Legacy forum thread: {}", matched.page_type);
        let tasks = vec![
            String::from("Read thread body"),
            String::from("Inspect quotes/code blocks"),
            String::from("Open next/previous pages"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::Legacy,
            label,
            tasks,
            Some(is_legacy_actionable_link),
        )
    }
}

impl SiteAdapter for CompatibilityPackAdapter {
    /// Base identifier: `family-pack`.
    ///
    /// Matches produced by [`Self::detect`] carry the more specific id
    /// `family-pack.<pack slug>` rather than this base id.
    fn id(&self) -> AdapterId {
        AdapterId::new("family-pack")
    }

    /// Matches when `detect_compatibility_pack` proposes a pack for the page.
    ///
    /// The match id is `family-pack.<pack slug>` and the `page_type` is
    /// `<pack label> (confidence=<n>)`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let candidate = detect_compatibility_pack(&url, context.page)?;
        Some(AdapterMatch {
            id: AdapterId::new(format!("family-pack.{}", candidate.pack.as_str())),
            page_type: format!(
                "{} (confidence={})",
                candidate.pack.label(),
                candidate.confidence
            ),
        })
    }

    /// Re-runs pack detection and renders the result via
    /// `compatibility_pack_document`.
    ///
    /// If detection no longer yields a candidate, a fallback candidate is
    /// derived from the last dot-separated component of `matched.id`
    /// (defaulting to the portal pack) with confidence 3. The fallback is
    /// only built when it is actually needed.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let candidate = page_url(context.page)
            .and_then(|url| detect_compatibility_pack(&url, context.page))
            .unwrap_or_else(|| {
                // Recover the pack from the id suffix written by `detect`.
                let pack = match matched.id.as_str().rsplit_once('.') {
                    Some((_, "forums")) => CompatibilityPack::Forums,
                    Some((_, "qa")) => CompatibilityPack::Qa,
                    Some((_, "docs")) => CompatibilityPack::Docs,
                    Some((_, "news-media")) => CompatibilityPack::NewsMedia,
                    Some((_, "app-shell")) => CompatibilityPack::AppShell,
                    Some((_, "commerce-cards")) => CompatibilityPack::CommerceCards,
                    Some((_, "mixed-media")) => CompatibilityPack::MixedMedia,
                    // `.portal`, unknown suffixes, and ids without a dot.
                    _ => CompatibilityPack::Portal,
                };
                CompatibilityPackCandidate {
                    pack,
                    confidence: 3,
                    signals: vec!["adapter match persisted".to_owned()],
                }
            });
        compatibility_pack_document(context.page, &matched.id, &candidate)
    }
}

impl SiteAdapter for Top100BaselineAdapter {
    /// Returns the fixed identifier of the top-100 baseline adapter.
    fn id(&self) -> AdapterId {
        AdapterId::new("top100.baseline")
    }

    /// Matches pages whose normalized host belongs to a known top-100 family.
    ///
    /// The page type is the family name followed by the normalized domain.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let domain = normalize_top100_domain(url.host_str()?);
        classify_top100_family(&domain).map(|family| AdapterMatch {
            id: self.id(),
            page_type: format!("{} {}", family.as_str(), domain),
        })
    }

    /// Emits the baseline document, or a blocked-flow diagnostic when the page is gated.
    ///
    /// A page without a usable URL is reported as a script-gated flow for the domain
    /// `"unknown"`.
    fn transform(&self, context: &AdapterContext<'_>, _matched: &AdapterMatch) -> IndexDocument {
        let page = context.page;
        let adapter_id = self.id();

        let Some(url) = page_url(page) else {
            return blocked_top100_document(
                page,
                &adapter_id,
                "unknown",
                Top100Family::ServicesUtility,
                Top100Intent::AppShell,
                BlockedFlowClass::ScriptGate,
            );
        };

        let domain = normalize_top100_domain(url.host_str().unwrap_or_default());
        // Hosts outside every family list fall back to the services/utility family.
        let family = classify_top100_family(&domain).unwrap_or(Top100Family::ServicesUtility);
        let intent = classify_top100_intent(&domain, &url, page);
        let gate = classify_blocked_flow(page);

        if gate == BlockedFlowClass::None {
            top100_baseline_document(page, &adapter_id, &domain, family, intent)
        } else {
            blocked_top100_document(page, &adapter_id, &domain, family, intent, gate)
        }
    }
}

impl SiteAdapter for ArxivAdapter {
    /// Returns the fixed identifier of the arXiv adapter.
    fn id(&self) -> AdapterId {
        AdapterId::new("arxiv")
    }

    /// Matches `arxiv.org` URLs whose first path segment is `abs`.
    ///
    /// The page type is the second path segment, or `"abstract"` when it is absent.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        if url.host_str() != Some("arxiv.org") {
            return None;
        }
        let segments = path_segments(&url);
        if segments.first() != Some(&"abs") {
            return None;
        }
        let paper = segments.get(1).copied().unwrap_or("abstract");
        Some(AdapterMatch {
            id: self.id(),
            page_type: paper.to_owned(),
        })
    }

    /// Builds the task document for an arXiv abstract page.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let tasks = [
            "Read abstract",
            "Open PDF or source",
            "Inspect authors and categories",
        ];
        task_document(
            context.page,
            &matched.id,
            format!("arXiv abstract: {}", matched.page_type),
            Vec::from(tasks.map(String::from)),
        )
    }
}

impl SiteAdapter for InternetArchiveAdapter {
    /// Returns the fixed identifier of the Internet Archive adapter.
    fn id(&self) -> AdapterId {
        AdapterId::new("internet-archive")
    }

    /// Matches `archive.org` URLs whose first path segment is `details`.
    ///
    /// The page type is the second path segment, or `"item"` when it is absent.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        if url.host_str() != Some("archive.org") {
            return None;
        }
        let segments = path_segments(&url);
        if segments.first() != Some(&"details") {
            return None;
        }
        let item = segments.get(1).copied().unwrap_or("item");
        Some(AdapterMatch {
            id: self.id(),
            page_type: item.to_owned(),
        })
    }

    /// Builds the task document for an Internet Archive item page.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let tasks = [
            "Read item metadata",
            "Open available files",
            "Inspect collection links",
        ];
        task_document(
            context.page,
            &matched.id,
            format!("Internet Archive item: {}", matched.page_type),
            Vec::from(tasks.map(String::from)),
        )
    }
}

/// Returns `true` when `host` is one of the listed Stack Exchange network hosts or any
/// subdomain of `stackexchange.com`.
///
/// The comparison is exact and case-sensitive; no `www.` stripping is performed here.
fn stackexchange_host(host: &str) -> bool {
    match host {
        "stackoverflow.com"
        | "serverfault.com"
        | "superuser.com"
        | "askubuntu.com"
        | "mathoverflow.net"
        | "stackexchange.com" => true,
        other => other.ends_with(".stackexchange.com"),
    }
}

/// Canonicalizes a host name for top-100 family lookups.
///
/// Trailing dots are removed, the host is ASCII-lowercased, a single leading `www.` is
/// stripped, and a fixed set of mobile/alias hosts is folded into its primary domain.
/// Any other host is returned unchanged after those cleanup steps.
fn normalize_top100_domain(host: &str) -> String {
    let lowered = host.trim_end_matches('.').to_ascii_lowercase();
    let bare = lowered.strip_prefix("www.").unwrap_or(lowered.as_str());
    let canonical = match bare {
        "old.reddit.com" | "m.reddit.com" => "reddit.com",
        // `www.twitter.com` is still reachable here for hosts such as `www.www.twitter.com`,
        // because only one `www.` prefix is stripped above.
        "twitter.com" | "mobile.twitter.com" | "www.twitter.com" => "x.com",
        "m.facebook.com" | "touch.facebook.com" => "facebook.com",
        "m.instagram.com" => "instagram.com",
        "m.youtube.com" | "youtu.be" => "youtube.com",
        "m.wikipedia.org" | "en.wikipedia.org" | "es.wikipedia.org" | "fr.wikipedia.org" => {
            "wikipedia.org"
        }
        "m.bing.com" => "bing.com",
        "m.duckduckgo.com" => "duckduckgo.com",
        "m.amazon.com" => "amazon.com",
        unchanged => unchanged,
    };
    canonical.to_owned()
}

/// Maps a normalized domain to its top-100 family.
///
/// Matching is exact: the domain must equal one of the listed entries, so subdomains are
/// only recognized when they are listed explicitly. Returns `None` for any other domain.
fn classify_top100_family(domain: &str) -> Option<Top100Family> {
    let family = match domain {
        "google.com"
        | "bing.com"
        | "duckduckgo.com"
        | "brave.com"
        | "yahoo.com"
        | "yahoo.co.jp"
        | "news.yahoo.co.jp"
        | "yandex.ru"
        | "ya.ru"
        | "baidu.com"
        | "naver.com"
        | "mail.ru"
        | "qq.com"
        | "msn.com" => Top100Family::SearchPortal,

        "wikipedia.org"
        | "fandom.com"
        | "nytimes.com"
        | "bbc.com"
        | "bbc.co.uk"
        | "cnn.com"
        | "espn.com"
        | "globo.com"
        | "indiatimes.com"
        | "theguardian.com"
        | "weather.com"
        | "dzen.ru" => Top100Family::KnowledgeReference,

        "facebook.com"
        | "instagram.com"
        | "x.com"
        | "reddit.com"
        | "discord.com"
        | "quora.com"
        | "linkedin.com"
        | "t.me"
        | "telegram.org"
        | "vk.com"
        | "ok.ru" => Top100Family::SocialCommunity,

        "youtube.com"
        | "music.youtube.com"
        | "tiktok.com"
        | "bilibili.com"
        | "twitch.tv"
        | "spotify.com"
        | "imdb.com"
        | "netflix.com"
        | "disneyplus.com"
        | "hbomax.com"
        | "rutube.ru"
        | "douyin.com" => Top100Family::MediaStreaming,

        "amazon.com"
        | "amazon.in"
        | "amazon.co.jp"
        | "amazon.de"
        | "amazon.co.uk"
        | "ebay.com"
        | "walmart.com"
        | "etsy.com"
        | "rakuten.co.jp"
        | "aliexpress.com"
        | "temu.com"
        | "shein.com"
        | "ozon.ru"
        | "wildberries.ru"
        | "booking.com"
        | "zillow.com" => Top100Family::CommerceMarketplace,

        "microsoft.com"
        | "office.com"
        | "cloud.microsoft"
        | "live.com"
        | "paypal.com"
        | "zoom.us"
        | "canva.com"
        | "usps.com"
        | "apple.com"
        | "adobe.com"
        | "samsung.com"
        | "indeed.com"
        | "docomo.ne.jp"
        | "instructure.com" => Top100Family::ServicesUtility,

        "chatgpt.com"
        | "gemini.google.com"
        | "claude.ai"
        | "deepseek.com"
        | "chat.deepseek.com"
        | "grok.com" => Top100Family::AiAssistant,

        _ => return None,
    };
    Some(family)
}

/// Chooses the page intent for a top-100 site from its domain, URL and title.
///
/// Signals are evaluated in a fixed priority order: video, marketplace, search, article or
/// reference, feed or thread, app shell. The first group with a positive signal wins; pages
/// with no signal are classified as a portal landing page.
fn classify_top100_intent(domain: &str, url: &Url, page: &ReadablePage) -> Top100Intent {
    let segments = path_segments(url);
    let lowered_title = page.title.to_ascii_lowercase();

    // Video: known media hosts or a video-like path segment.
    let video_host = matches!(
        domain,
        "youtube.com" | "music.youtube.com" | "twitch.tv" | "spotify.com" | "bilibili.com"
    );
    let video_path = segments
        .iter()
        .any(|part| matches!(*part, "watch" | "video" | "videos" | "playlist"));

    // Marketplace: known commerce hosts or a product/listing path segment.
    let marketplace_host = matches!(
        domain,
        "amazon.com"
            | "amazon.in"
            | "amazon.co.jp"
            | "amazon.de"
            | "amazon.co.uk"
            | "ebay.com"
            | "walmart.com"
            | "etsy.com"
            | "rakuten.co.jp"
            | "aliexpress.com"
            | "temu.com"
            | "shein.com"
            | "ozon.ru"
            | "wildberries.ru"
            | "booking.com"
            | "zillow.com"
    );
    let marketplace_path = segments.iter().any(|part| {
        matches!(
            *part,
            "dp" | "product" | "products" | "item" | "listing" | "list"
        )
    });

    // Search: a non-blank query parameter or a search/results path segment.
    let search_query = url.query_pairs().any(|(key, value)| {
        matches!(key.as_ref(), "q" | "query" | "search") && !value.trim().is_empty()
    });
    let search_path = segments
        .iter()
        .any(|part| matches!(*part, "search" | "results"));

    // Article/reference: known reference hosts or an article-like path segment.
    let reference_host = matches!(
        domain,
        "wikipedia.org" | "fandom.com" | "nytimes.com" | "bbc.com" | "cnn.com"
    );
    let reference_path = segments
        .iter()
        .any(|part| matches!(*part, "wiki" | "article" | "news"));

    // Feed/thread: a feed-like path segment or a thread/comments title.
    let feed_path = segments
        .iter()
        .any(|part| matches!(*part, "feed" | "timeline" | "thread" | "comments"));
    let feed_title = lowered_title.contains("thread") || lowered_title.contains("comments");

    // App shell: substring match on the lowercased title.
    let app_title = ["dashboard", "workspace", "app", "account"]
        .iter()
        .any(|keyword| lowered_title.contains(*keyword));

    if video_host || video_path {
        Top100Intent::VideoHub
    } else if marketplace_host || marketplace_path {
        Top100Intent::MarketplaceListing
    } else if search_query || search_path {
        Top100Intent::SearchResults
    } else if reference_host || reference_path {
        Top100Intent::ArticleOrReference
    } else if feed_path || feed_title {
        Top100Intent::FeedOrThread
    } else if app_title {
        Top100Intent::AppShell
    } else {
        Top100Intent::PortalLanding
    }
}

/// Classifies why a page appears gated, based on phrases in its title and paragraphs.
///
/// The title and every paragraph are ASCII-lowercased and joined with newlines, then
/// searched for known phrases. Gate classes are checked in priority order (bot, geo, age,
/// policy, script, auth); the first class with a matching phrase is returned, and
/// `BlockedFlowClass::None` when nothing matches.
fn classify_blocked_flow(page: &ReadablePage) -> BlockedFlowClass {
    // True when `text` contains at least one of `phrases` as a substring.
    fn mentions_any(text: &str, phrases: &[&str]) -> bool {
        phrases.iter().any(|phrase| text.contains(*phrase))
    }

    let haystack = page.paragraphs.iter().fold(
        page.title.to_ascii_lowercase(),
        |mut text, paragraph| {
            text.push('\n');
            text.push_str(&paragraph.to_ascii_lowercase());
            text
        },
    );

    if mentions_any(
        &haystack,
        &[
            "captcha",
            "verify you are human",
            "robot check",
            "cloudflare",
            "anti-bot",
        ],
    ) {
        BlockedFlowClass::BotGate
    } else if mentions_any(
        &haystack,
        &[
            "not available in your region",
            "not available in your country",
            "geo-restricted",
            "geoblocked",
        ],
    ) {
        BlockedFlowClass::GeoGate
    } else if mentions_any(
        &haystack,
        &[
            "age verification",
            "adults only",
            "18+",
            "confirm your age",
        ],
    ) {
        BlockedFlowClass::AgeGate
    } else if mentions_any(
        &haystack,
        &[
            "access denied",
            "forbidden",
            "blocked by policy",
            "violates our terms",
            "not permitted",
        ],
    ) {
        BlockedFlowClass::PolicyGate
    } else if mentions_any(
        &haystack,
        &[
            "enable javascript",
            "requires javascript",
            "continue in app",
            "app is not available",
        ],
    ) {
        BlockedFlowClass::ScriptGate
    } else if mentions_any(
        &haystack,
        &[
            "log in",
            "sign in",
            "create account",
            "authentication required",
            "please log in",
        ],
    ) {
        BlockedFlowClass::AuthWall
    } else {
        BlockedFlowClass::None
    }
}

fn detect_compatibility_pack(url: &Url, page: &ReadablePage) -> Option<CompatibilityPackCandidate> {
    const MIN_PACK_CONFIDENCE: u8 = 3;

    let path = path_segments(url);
    let title = page.title.to_ascii_lowercase();
    let has_code = page
        .nodes
        .iter()
        .any(|node| matches!(node, ReadableNode::CodeBlock { .. }));
    let heading_count = page
        .nodes
        .iter()
        .filter(|node| matches!(node, ReadableNode::Heading { .. }))
        .count();
    let list_count = page
        .nodes
        .iter()
        .filter(|node| matches!(node, ReadableNode::List { .. }))
        .count();
    let table_count = page
        .nodes
        .iter()
        .filter(|node| matches!(node, ReadableNode::Table { .. }))
        .count();
    let paragraph_count = page.paragraphs.len();
    let link_count = page.links.len();
    let mut candidates = Vec::new();

    let mut forum_signals = Vec::new();
    if path.iter().any(|segment| {
        matches!(
            *segment,
            "forum" | "forums" | "thread" | "threads" | "topic" | "topics"
        )
    }) || title.contains("forum")
        || title.contains("thread")
    {
        forum_signals.push("forum-thread path/title signal".to_owned());
    }
    if link_count >= 5 {
        forum_signals.push("forum link density >= 5".to_owned());
    }
    if !page.forms.is_empty() || list_count >= 1 {
        forum_signals.push("forum list/form structure signal".to_owned());
    }
    if !forum_signals.is_empty() {
        candidates.push(CompatibilityPackCandidate {
            pack: CompatibilityPack::Forums,
            confidence: forum_signals.len() as u8,
            signals: forum_signals,
        });
    }

    let mut qa_signals = Vec::new();
    if path.iter().any(|segment| {
        matches!(
            *segment,
            "question" | "questions" | "answer" | "answers" | "qa"
        )
    }) || title.contains("q&a")
        || title.contains("question")
    {
        qa_signals.push("qa path/title signal".to_owned());
    }
    if link_count >= 3 {
        qa_signals.push("qa link density >= 3".to_owned());
    }
    if paragraph_count >= 1 || list_count >= 1 {
        qa_signals.push("qa body/list structure signal".to_owned());
    }
    if !qa_signals.is_empty() {
        candidates.push(CompatibilityPackCandidate {
            pack: CompatibilityPack::Qa,
            confidence: qa_signals.len() as u8,
            signals: qa_signals,
        });
    }

    let mut docs_signals = Vec::new();
    if path
        .iter()
        .any(|segment| matches!(*segment, "docs" | "reference" | "api" | "guide" | "manual"))
        || url.host_str().is_some_and(|host| host.starts_with("docs."))
    {
        docs_signals.push("docs path/host signal".to_owned());
    }
    if has_code || heading_count >= 2 {
        docs_signals.push("docs semantic structure signal".to_owned());
    }
    if link_count >= 3 {
        docs_signals.push("docs link density >= 3".to_owned());
    }
    if !docs_signals.is_empty() {
        candidates.push(CompatibilityPackCandidate {
            pack: CompatibilityPack::Docs,
            confidence: docs_signals.len() as u8,
            signals: docs_signals,
        });
    }

    let mut news_signals = Vec::new();
    if path
        .iter()
        .any(|segment| matches!(*segment, "news" | "article" | "stories"))
        || title.contains("news")
        || title.contains("analysis")
    {
        news_signals.push("news path/title signal".to_owned());
    }
    if paragraph_count >= 4 {
        news_signals.push("news paragraph depth >= 4".to_owned());
    }
    if link_count >= 3 {
        news_signals.push("news link density >= 3".to_owned());
    }
    if !news_signals.is_empty() {
        candidates.push(CompatibilityPackCandidate {
            pack: CompatibilityPack::NewsMedia,
            confidence: news_signals.len() as u8,
            signals: news_signals,
        });
    }

    let mut portal_signals = Vec::new();
    if link_count >= 20 {
        portal_signals.push("portal link density >= 20".to_owned());
    }
    if paragraph_count <= 3 {
        portal_signals.push("portal compact body signal".to_owned());
    }
    if path.first().is_none_or(|segment| {
        matches!(
            *segment,
            "" | "home" | "discover" | "explore" | "index" | "portal"
        )
    }) {
        portal_signals.push("portal landing path signal".to_owned());
    }
    if !portal_signals.is_empty() {
        candidates.push(CompatibilityPackCandidate {
            pack: CompatibilityPack::Portal,
            confidence: portal_signals.len() as u8,
            signals: portal_signals,
        });
    }

    let mut app_shell_signals = Vec::new();
    if path.iter().any(|segment| {
        matches!(
            *segment,
            "app" | "dashboard" | "workspace" | "account" | "settings"
        )
    }) {
        app_shell_signals.push("app-shell path signal".to_owned());
    }
    if title.contains("dashboard")
        || title.contains("workspace")
        || title.contains("account")
        || title.contains("app")
    {
        app_shell_signals.push("app-shell title signal".to_owned());
    }
    if !page.forms.is_empty() || link_count >= 6 {
        app_shell_signals.push("app-shell workflow surface signal".to_owned());
    }
    if !app_shell_signals.is_empty() {
        candidates.push(CompatibilityPackCandidate {
            pack: CompatibilityPack::AppShell,
            confidence: app_shell_signals.len() as u8,
            signals: app_shell_signals,
        });
    }

    let mut commerce_signals = Vec::new();
    if path.iter().any(|segment| {
        matches!(
            *segment,
            "shop" | "store" | "products" | "product" | "category" | "deals" | "listing"
        )
    }) || title.contains("shop")
        || title.contains("deals")
    {
        commerce_signals.push("commerce path/title signal".to_owned());
    }
    if list_count >= 1 || table_count >= 1 {
        commerce_signals.push("commerce list/table signal".to_owned());
    }
    if page.paragraphs.iter().any(|paragraph| {
        paragraph.contains('$')
            || paragraph.contains('€')
            || paragraph.contains('£')
            || paragraph.contains('¥')
            || paragraph.contains('₹')
            || paragraph.to_ascii_lowercase().contains("price")
    }) {
        commerce_signals.push("commerce pricing signal".to_owned());
    }
    if link_count >= 6 {
        commerce_signals.push("commerce link density >= 6".to_owned());
    }
    if !commerce_signals.is_empty() {
        candidates.push(CompatibilityPackCandidate {
            pack: CompatibilityPack::CommerceCards,
            confidence: commerce_signals.len() as u8,
            signals: commerce_signals,
        });
    }

    let mut mixed_media_signals = Vec::new();
    if path.iter().any(|segment| {
        matches!(
            *segment,
            "watch" | "video" | "media" | "gallery" | "podcast" | "live"
        )
    }) || title.contains("video")
        || title.contains("podcast")
        || title.contains("gallery")
    {
        mixed_media_signals.push("mixed-media path/title signal".to_owned());
    }
    if paragraph_count >= 3 {
        mixed_media_signals.push("mixed-media paragraph depth >= 3".to_owned());
    }
    if list_count >= 1 || link_count >= 8 {
        mixed_media_signals.push("mixed-media list/link surface signal".to_owned());
    }
    if !mixed_media_signals.is_empty() {
        candidates.push(CompatibilityPackCandidate {
            pack: CompatibilityPack::MixedMedia,
            confidence: mixed_media_signals.len() as u8,
            signals: mixed_media_signals,
        });
    }

    let best = candidates.into_iter().max_by_key(|candidate| {
        (
            candidate.confidence,
            candidate.pack.priority(),
            std::cmp::Reverse(candidate.pack.as_str()),
        )
    })?;

    (best.confidence >= MIN_PACK_CONFIDENCE).then_some(best)
}

/// Renders the Index document for a page matched by a compatibility pack.
///
/// The document carries the page metadata, an adapter quality record (score 88), a heading
/// and pack summary, a collapsed diagnostics section listing the confidence and fired
/// signals, the pack's task list, summaries of the first 8 readable nodes, every form, and
/// the first 24 links.
fn compatibility_pack_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    candidate: &CompatibilityPackCandidate,
) -> IndexDocument {
    let heading = format!("Compatibility pack: {}", candidate.pack.label());

    // Diagnostics: confidence first, then one entry per signal, then the fallback note.
    let mut diagnostics = Vec::with_capacity(candidate.signals.len() + 2);
    diagnostics.push(format!(
        "confidence: {} ({})",
        candidate.confidence,
        candidate.confidence_label()
    ));
    diagnostics.extend(
        candidate
            .signals
            .iter()
            .map(|signal| format!("signal: {signal}")),
    );
    diagnostics.push("fallback: confidence below 3 returns generic transformer".to_owned());

    let quality_notes = [
        format!("matched adapter: {}", adapter_id.as_str()),
        format!("compatibility pack: {}", candidate.pack.as_str()),
        format!("pack confidence: {}", candidate.confidence),
    ];

    let mut document = IndexDocument::titled(heading.clone());
    let source = &page.metadata;
    document.metadata.canonical_url = source.canonical_url.clone();
    document.metadata.language = source.language.clone();
    document.metadata.description = source.description.clone();
    document.metadata.open_graph_title = source.open_graph_title.clone();
    document.metadata.open_graph_description = source.open_graph_description.clone();
    document.metadata.adapter_id = Some(adapter_id.clone());
    document.metadata.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Adapter,
        88,
        quality_notes,
    ));

    document.push(IndexNode::Heading {
        level: 1,
        text: heading,
    });
    document.push(IndexNode::Paragraph(format!(
        "Family pack: {}",
        candidate.pack.label()
    )));
    document.push(IndexNode::Section {
        role: SectionRole::Unknown,
        title: Some("Pack diagnostics".to_owned()),
        collapsed: true,
        nodes: vec![IndexNode::List {
            ordered: false,
            items: diagnostics,
        }],
    });
    document.push(IndexNode::List {
        ordered: false,
        items: compatibility_pack_tasks(candidate.pack),
    });

    // Only nodes that produce a summary are emitted; the cap applies before filtering.
    for readable in page.nodes.iter().take(8) {
        match summary_node(readable) {
            Some(summary) => {
                document.push(summary);
            }
            None => continue,
        }
    }
    for form in &page.forms {
        let converted = index_form_from_html(form);
        document.push(IndexNode::Form(converted));
    }
    for link in page.links.iter().take(24) {
        let entry = Link::new(&link.text, &link.href);
        document.push(IndexNode::Link(entry));
    }

    document
}

/// Returns the three suggested reader tasks for a compatibility pack.
fn compatibility_pack_tasks(pack: CompatibilityPack) -> Vec<String> {
    let tasks: [&str; 3] = match pack {
        CompatibilityPack::Forums => [
            "Read thread context",
            "Inspect pagination and replies",
            "Open outbound references",
        ],
        CompatibilityPack::Qa => [
            "Read question and accepted answers",
            "Inspect answer ranking",
            "Open cited references",
        ],
        CompatibilityPack::Docs => [
            "Read API/reference sections",
            "Inspect code examples",
            "Open navigation and related topics",
        ],
        CompatibilityPack::NewsMedia => [
            "Read article body",
            "Inspect related coverage links",
            "Open citations and sources",
        ],
        CompatibilityPack::Portal => [
            "Scan major entry points",
            "Open primary categories",
            "Inspect utility/search actions",
        ],
        CompatibilityPack::AppShell => [
            "Inspect workspace navigation",
            "Open primary actions and settings",
            "Review account or session requirements",
        ],
        CompatibilityPack::CommerceCards => [
            "Scan product cards and prices",
            "Open product detail links",
            "Inspect filters, sorting, and pagination",
        ],
        CompatibilityPack::MixedMedia => [
            "Read media summaries and metadata",
            "Open video/audio/gallery entries",
            "Inspect related links and next-step navigation",
        ],
    };
    tasks.iter().map(|task| (*task).to_owned()).collect()
}

/// Returns the three suggested reader tasks for a top-100 site family.
fn top100_family_tasks(family: Top100Family) -> Vec<String> {
    let tasks: [&str; 3] = match family {
        Top100Family::SearchPortal => [
            "Scan ranked results",
            "Open result targets",
            "Inspect query refinements",
        ],
        Top100Family::KnowledgeReference => [
            "Read primary article content",
            "Open references and sources",
            "Inspect related links",
        ],
        Top100Family::SocialCommunity => [
            "Read post/thread context",
            "Inspect reply chains",
            "Open outbound links",
        ],
        Top100Family::MediaStreaming => [
            "Read media metadata",
            "Open channel or playlist links",
            "Inspect related media links",
        ],
        Top100Family::CommerceMarketplace => [
            "Read listing and product metadata",
            "Open product links",
            "Inspect filters and pagination",
        ],
        Top100Family::ServicesUtility => [
            "Read public utility or help content",
            "Open relevant workflow links",
            "Inspect authentication requirements",
        ],
        Top100Family::AiAssistant => [
            "Read public product/help content",
            "Open documentation or policy links",
            "Inspect account-gated workflow boundaries",
        ],
    };
    tasks.iter().map(|task| (*task).to_owned()).collect()
}

fn top100_baseline_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    domain: &str,
    family: Top100Family,
    intent: Top100Intent,
) -> IndexDocument {
    let title = format!("Top site baseline: {domain}");
    let mut document = IndexDocument::titled(title.clone());
    document.metadata.canonical_url = page.metadata.canonical_url.clone();
    document.metadata.language = page.metadata.language.clone();
    document.metadata.description = page.metadata.description.clone();
    document.metadata.open_graph_title = page.metadata.open_graph_title.clone();
    document.metadata.open_graph_description = page.metadata.open_graph_description.clone();
    document.metadata.adapter_id = Some(adapter_id.clone());
    document.metadata.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Adapter,
        90,
        [
            format!("matched adapter: {}", adapter_id.as_str()),
            format!("domain: {domain}"),
            format!("family: {}", family.as_str()),
            format!("intent: {}", intent.as_str()),
        ],
    ));

    document.push(IndexNode::Heading {
        level: 1,
        text: title,
    });
    document.push(IndexNode::Paragraph(format!(
        "Family: {} | Intent: {}",
        family.label(),
        intent.as_str()
    )));
    document.push(IndexNode::List {
        ordered: false,
        items: top100_family_tasks(family),
    });

    match intent {
        Top100Intent::SearchResults => {
            let mut result_nodes = Vec::new();
            for link in page.links.iter().take(20) {
                if default_forum_link_filter(link) {
                    result_nodes.push(IndexNode::Link(Link::new(&link.text, &link.href)));
                }
            }
            if !result_nodes.is_empty() {
                document.push(IndexNode::Section {
                    role: SectionRole::Main,
                    title: Some("Results".to_owned()),
                    collapsed: false,
                    nodes: result_nodes,
                });
            }
        }
        Top100Intent::VideoHub => {
            let metadata = top100_media_metadata(page);
            if !metadata.is_empty() {
                document.push(IndexNode::Section {
                    role: SectionRole::Main,
                    title: Some("Media Metadata".to_owned()),
                    collapsed: false,
                    nodes: vec![IndexNode::List {
                        ordered: false,
                        items: metadata,
                    }],
                });
            }
        }
        Top100Intent::MarketplaceListing => {
            let listing_nodes = top100_listing_nodes(page);
            if !listing_nodes.is_empty() {
                document.push(IndexNode::Section {
                    role: SectionRole::Main,
                    title: Some("Listings".to_owned()),
                    collapsed: false,
                    nodes: listing_nodes,
                });
            }
        }
        Top100Intent::PortalLanding
        | Top100Intent::ArticleOrReference
        | Top100Intent::AppShell
        | Top100Intent::FeedOrThread => {}
    }

    for node in page.nodes.iter().take(6) {
        if let Some(summary) = summary_node(node) {
            document.push(summary);
        }
    }
    for form in &page.forms {
        document.push(IndexNode::Form(index_form_from_html(form)));
    }
    for link in page.links.iter().take(16) {
        document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
    }

    document
}

/// Collects up to eight paragraphs that look like `label: value` media metadata.
///
/// A paragraph qualifies when, after trimming, it starts with one of the known labels
/// (compared ASCII case-insensitively). The trimmed text is returned with its original
/// casing.
fn top100_media_metadata(page: &ReadablePage) -> Vec<String> {
    const FIELD_PREFIXES: [&str; 5] = ["creator:", "author:", "duration:", "channel:", "views:"];

    page.paragraphs
        .iter()
        .map(|paragraph| paragraph.trim())
        .filter(|line| {
            let lowered = line.to_ascii_lowercase();
            FIELD_PREFIXES
                .iter()
                .any(|prefix| lowered.starts_with(*prefix))
        })
        .take(8)
        .map(str::to_owned)
        .collect()
}

/// Extracts listing-like nodes from the first eight readable nodes of the page.
///
/// Non-empty lists and tables are copied as-is. Paragraphs are kept only when their
/// ASCII-lowercased text contains `"results for"` or `"price"`. Everything else is dropped.
fn top100_listing_nodes(page: &ReadablePage) -> Vec<IndexNode> {
    page.nodes
        .iter()
        .take(8)
        .filter_map(|readable| match readable {
            ReadableNode::List { ordered, items } if !items.is_empty() => Some(IndexNode::List {
                ordered: *ordered,
                items: items.clone(),
            }),
            ReadableNode::Table { rows } if !rows.is_empty() => {
                Some(IndexNode::Table { rows: rows.clone() })
            }
            ReadableNode::Paragraph(text) => {
                let lowered = text.to_ascii_lowercase();
                (lowered.contains("results for") || lowered.contains("price"))
                    .then(|| IndexNode::Paragraph(text.clone()))
            }
            _ => None,
        })
        .collect()
}

/// Builds the fallback diagnostic document for a top-site page that sits behind a gate.
///
/// The document carries a low-confidence `FailureDiagnostic` (cause: blocked by policy), a
/// warning record tagged `INDEX-TOP100-BLOCKED`, a fallback quality score of 25, and a
/// "Remediation" section listing the guidance for the given blocked-flow class.
fn blocked_top100_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    domain: &str,
    family: Top100Family,
    intent: Top100Intent,
    blocked: BlockedFlowClass,
) -> IndexDocument {
    let gate = blocked.as_str();
    let reason = format!(
        "{domain} page is gated by {gate} and cannot be transformed safely as an interactive flow"
    );
    let headline = format!("Top site blocked flow: {domain}");
    let preview_command =
        format!(":capture --preview --redact https://{domain}/ blocked-flow.html");

    // Structured record mirroring the human-readable reason.
    let record =
        DiagnosticRecord::new(DiagnosticSeverity::Warning, "INDEX-TOP100-BLOCKED", &reason)
            .with_field("adapter", adapter_id.as_str())
            .with_field("domain", domain)
            .with_field("family", family.as_str())
            .with_field("intent", intent.as_str())
            .with_field("blocked_flow_class", blocked.as_str());

    let diagnostic = FailureDiagnostic::new(
        headline,
        DiagnosticSource::Adapter,
        DiagnosticConfidence::Low,
        reason.clone(),
    )
    .with_likely_cause(FailureCause::BlockedByPolicy)
    .with_fallback("read-only generic extraction")
    .with_tried("top100 baseline family/intent detection")
    .with_actions([
        DiagnosticAction::Retry,
        DiagnosticAction::Extract,
        DiagnosticAction::Capture,
        DiagnosticAction::AddFixture,
    ])
    .with_command(":extract links")
    .with_command(":capture save top100-blocked.capture")
    .with_command(preview_command)
    .with_record(record);

    let quality_reasons = [
        format!("matched adapter: {}", adapter_id.as_str()),
        format!("blocked-flow class: {}", blocked.as_str()),
        format!("family: {}", family.as_str()),
        format!("intent: {}", intent.as_str()),
    ];

    let mut document = diagnostic.into_document();
    document.metadata.canonical_url = page.metadata.canonical_url.clone();
    document.metadata.language = page.metadata.language.clone();
    document.metadata.adapter_id = Some(adapter_id.clone());
    document.metadata.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Fallback,
        25,
        quality_reasons,
    ));

    let remediation = IndexNode::List {
        ordered: false,
        items: blocked_flow_guidance(blocked),
    };
    document.push(IndexNode::Section {
        role: SectionRole::Unknown,
        title: Some("Remediation".to_owned()),
        collapsed: false,
        nodes: vec![remediation],
    });
    document
}

/// Returns the remediation hints shown for a blocked flow.
///
/// Two generic hints always come first, followed by exactly one hint chosen by the
/// blocked-flow class.
fn blocked_flow_guidance(blocked: BlockedFlowClass) -> Vec<String> {
    let class_specific = match blocked {
        BlockedFlowClass::AuthWall => "log in with a supported session scope before retry",
        BlockedFlowClass::ScriptGate => {
            "retry with headless snapshot fallback enabled for script-gated pages"
        }
        BlockedFlowClass::BotGate => "respect anti-bot policy and avoid automated bypass attempts",
        BlockedFlowClass::GeoGate => "verify legal region availability before requesting content",
        BlockedFlowClass::AgeGate => {
            "confirm age-gated policy allows read-only access before retry"
        }
        BlockedFlowClass::PolicyGate => {
            "page appears policy-blocked; treat as unsupported and preserve diagnostics"
        }
        BlockedFlowClass::None => "retry generic extraction",
    };

    vec![
        "collect a sanitized capture artifact for fixture review".to_owned(),
        "use :extract links for deterministic fallback output".to_owned(),
        class_specific.to_owned(),
    ]
}

/// Classifies what a forum page is for, based on its forms, URL, and links.
///
/// Checks run in priority order: a reply/login form wins outright; without a URL the page
/// counts as a front page; otherwise profile paths, pagination, and thread paths are tested
/// in that order, with the front page as the final fallback.
fn classify_forum_intent(url: Option<&Url>, page: &ReadablePage) -> ForumIntent {
    let has_reply_or_login_form = page.forms.iter().any(is_reply_or_login_form);
    if has_reply_or_login_form {
        return ForumIntent::ReplyForm;
    }

    match url {
        None => ForumIntent::FrontPage,
        Some(url) => {
            let segments = path_segments(url);
            let raw = url.as_str();
            if looks_like_profile_path(&segments, raw) {
                ForumIntent::ProfileNoise
            } else if looks_like_paginated_path(url, page) {
                ForumIntent::PaginatedThread
            } else if looks_like_thread_path(&segments, raw) {
                ForumIntent::ThreadPage
            } else {
                ForumIntent::FrontPage
            }
        }
    }
}

/// Returns `true` when the URL looks like a user-profile page.
///
/// A path segment equal to `user`, `users`, or `members` qualifies, as does a raw URL
/// containing `/~`.
fn looks_like_profile_path(segments: &[&str], raw_url: &str) -> bool {
    let has_profile_segment = segments
        .iter()
        .any(|segment| matches!(*segment, "user" | "users" | "members"));
    has_profile_segment || raw_url.contains("/~")
}

/// Returns `true` when the URL looks like a discussion thread.
///
/// Matches a known thread segment anywhere in the path, a leading `t` segment, or a raw URL
/// containing `showthread` / `viewtopic`.
fn looks_like_thread_path(segments: &[&str], raw_url: &str) -> bool {
    const THREAD_SEGMENTS: [&str; 7] = [
        "item",
        "comments",
        "question",
        "questions",
        "threads",
        "thread",
        "topic",
    ];
    const THREAD_URL_MARKERS: [&str; 2] = ["showthread", "viewtopic"];

    let has_thread_segment = segments
        .iter()
        .any(|segment| THREAD_SEGMENTS.contains(segment));
    let starts_with_t = segments.first().copied() == Some("t");
    let has_url_marker = THREAD_URL_MARKERS
        .iter()
        .any(|marker| raw_url.contains(*marker));

    has_thread_segment || starts_with_t || has_url_marker
}

/// Returns `true` when the page appears to be one page of a paginated listing.
///
/// Either the URL carries a non-empty `p`, `page`, or `start` query parameter, or at least
/// one of the page's links is recognized as a forum pagination link.
fn looks_like_paginated_path(url: &Url, page: &ReadablePage) -> bool {
    let has_page_parameter = url
        .query_pairs()
        .any(|(key, value)| matches!(&*key, "p" | "page" | "start") && !value.is_empty());

    has_page_parameter || page.links.iter().any(is_forum_pagination_link)
}

/// Returns `true` when a form looks like a reply box or a login form.
///
/// The decision is made on ASCII-lowercased substrings: any input name containing a
/// reply/credential marker, the form action containing `reply`, `comment`, or `login`, or
/// the form name containing `reply` or `login`.
fn is_reply_or_login_form(form: &index_dom::HtmlForm) -> bool {
    const INPUT_MARKERS: [&str; 5] = ["reply", "comment", "message", "password", "username"];
    const ACTION_MARKERS: [&str; 3] = ["reply", "comment", "login"];
    const NAME_MARKERS: [&str; 2] = ["reply", "login"];

    /// Substring test of `haystack` against every marker.
    fn contains_any(haystack: &str, markers: &[&str]) -> bool {
        markers.iter().any(|marker| haystack.contains(*marker))
    }

    let input_hit = form
        .inputs
        .iter()
        .any(|input| contains_any(&input.name.to_ascii_lowercase(), &INPUT_MARKERS));
    if input_hit {
        return true;
    }

    contains_any(&form.action.to_ascii_lowercase(), &ACTION_MARKERS)
        || contains_any(&form.name.to_ascii_lowercase(), &NAME_MARKERS)
}

/// Builds the adapter document for a forum-family page.
///
/// The output is assembled in a fixed order: level-1 heading, a "Family | Intent" paragraph,
/// the task list, an optional "Breadcrumbs" section, a "Thread" section, an optional
/// "Pagination" section, an optional "Next steps" section, every page form, up to 20
/// remaining outbound links, and finally a "Diagnostic" section for profile-like pages.
///
/// `filter` decides which page links are considered at all; when it is `None`,
/// `default_forum_link_filter` is used.
fn forum_thread_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    family: ForumFamily,
    title: String,
    tasks: Vec<String>,
    filter: Option<fn(&index_dom::HtmlLink) -> bool>,
) -> IndexDocument {
    let canonical = page_url(page);
    let intent = classify_forum_intent(canonical.as_ref(), page);
    let mut document = IndexDocument::titled(title.clone());
    // Carry the page metadata over and stamp the adapter identity and quality score.
    document.metadata.canonical_url = page.metadata.canonical_url.clone();
    document.metadata.language = page.metadata.language.clone();
    document.metadata.description = page.metadata.description.clone();
    document.metadata.open_graph_title = page.metadata.open_graph_title.clone();
    document.metadata.open_graph_description = page.metadata.open_graph_description.clone();
    document.metadata.adapter_id = Some(adapter_id.clone());
    document.metadata.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Adapter,
        94,
        [
            format!("matched adapter: {}", adapter_id.as_str()),
            format!("forum family: {}", family.as_str()),
            format!("forum intent: {}", intent.as_str()),
        ],
    ));
    document.push(IndexNode::Heading {
        level: 1,
        text: title,
    });
    document.push(IndexNode::Paragraph(format!(
        "Family: {} | Intent: {}",
        family.label(),
        intent.as_str()
    )));
    document.push(IndexNode::List {
        ordered: false,
        items: tasks,
    });

    // Partition the accepted links into three buckets. The breadcrumb check runs first, so
    // a link matching both predicates is treated as a breadcrumb.
    let link_filter = filter.unwrap_or(default_forum_link_filter);
    let mut breadcrumbs = Vec::new();
    let mut pagination = Vec::new();
    let mut outbound = Vec::new();
    for link in &page.links {
        if !link_filter(link) {
            continue;
        }
        if is_forum_breadcrumb_link(link) {
            breadcrumbs.push(link);
        } else if is_forum_pagination_link(link) {
            pagination.push(link);
        } else {
            outbound.push(link);
        }
    }

    if !breadcrumbs.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some("Breadcrumbs".to_owned()),
            collapsed: true,
            nodes: breadcrumbs
                .into_iter()
                .map(|link| IndexNode::Link(Link::new(&link.text, &link.href)))
                .collect(),
        });
    }

    // Thread body: converted readable nodes, or the first three paragraphs when the
    // conversion yields nothing.
    let mut thread_nodes = page
        .nodes
        .iter()
        .filter_map(forum_node_from_readable)
        .collect::<Vec<_>>();
    if thread_nodes.is_empty() {
        thread_nodes.extend(
            page.paragraphs
                .iter()
                .take(3)
                .map(|paragraph| IndexNode::Paragraph(paragraph.clone())),
        );
    }
    if !thread_nodes.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Comments,
            title: Some("Thread".to_owned()),
            collapsed: false,
            nodes: thread_nodes,
        });
    }

    // `pagination` is iterated by reference here because it is reused for "Next steps".
    if !pagination.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some("Pagination".to_owned()),
            collapsed: true,
            nodes: pagination
                .iter()
                .map(|link| IndexNode::Link(Link::new(&link.text, &link.href)))
                .collect(),
        });
    }

    // "Next steps": at most six links, pagination first, deduplicated by href. An href is
    // recorded as seen even when the link itself is not selected.
    let mut seen_step_hrefs = BTreeSet::new();
    let mut next_steps = Vec::new();
    for link in pagination.iter().chain(outbound.iter()) {
        if next_steps.len() >= 6 {
            break;
        }
        if !seen_step_hrefs.insert(link.href.clone()) {
            continue;
        }
        if is_forum_pagination_link(link) || looks_like_forum_next_step(link) {
            next_steps.push(IndexNode::Link(Link::new(&link.text, &link.href)));
        }
    }
    if !next_steps.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some("Next steps".to_owned()),
            collapsed: false,
            nodes: next_steps,
        });
    }

    for form in &page.forms {
        document.push(IndexNode::Form(index_form_from_html(form)));
    }

    // Outbound links are emitted at the top level, capped at 20; links already listed under
    // "Next steps" are not excluded here.
    for link in outbound.into_iter().take(20) {
        document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
    }

    // Profile-like pages get a collapsed diagnostic explaining that thread extraction may
    // be incomplete.
    if matches!(intent, ForumIntent::ProfileNoise) {
        document.push(IndexNode::Section {
            role: SectionRole::Unknown,
            title: Some("Diagnostic".to_owned()),
            collapsed: true,
            nodes: vec![
                IndexNode::Error(
                    "profile-like page detected; discussion thread extraction may be partial"
                        .to_owned(),
                ),
                IndexNode::List {
                    ordered: false,
                    items: vec![
                        "use :extract links to inspect profile actions".to_owned(),
                        "capture a fixture if thread navigation is expected".to_owned(),
                    ],
                },
            ],
        });
    }

    document
}

/// Converts a readable node into its Index counterpart for forum thread output.
///
/// Returns `None` for paragraphs that look like signature lines, for empty tables, and for
/// sections whose children all convert to nothing. Sections are converted recursively.
fn forum_node_from_readable(node: &ReadableNode) -> Option<IndexNode> {
    let converted = match node {
        ReadableNode::Heading { level, text } => IndexNode::Heading {
            level: *level,
            text: text.clone(),
        },
        ReadableNode::Paragraph(text) => {
            if looks_like_signature_line(text) {
                return None;
            }
            IndexNode::Paragraph(text.clone())
        }
        ReadableNode::Link(link) => IndexNode::Link(Link::new(&link.text, &link.href)),
        ReadableNode::List { ordered, items } => IndexNode::List {
            ordered: *ordered,
            items: items.clone(),
        },
        ReadableNode::CodeBlock { language, code } => IndexNode::CodeBlock {
            language: language.clone(),
            code: code.clone(),
        },
        ReadableNode::Table { rows } => {
            if rows.is_empty() {
                return None;
            }
            IndexNode::Table { rows: rows.clone() }
        }
        ReadableNode::Spacer { lines } => IndexNode::Spacer { lines: *lines },
        ReadableNode::Section {
            role,
            title,
            collapsed,
            nodes,
        } => {
            let children: Vec<IndexNode> =
                nodes.iter().filter_map(forum_node_from_readable).collect();
            if children.is_empty() {
                return None;
            }
            IndexNode::Section {
                role: section_role(*role),
                title: title.clone(),
                collapsed: *collapsed,
                nodes: children,
            }
        }
        ReadableNode::Image { alt, src } => IndexNode::Image {
            alt: alt.clone(),
            src: src.clone(),
        },
        ReadableNode::Form(form) => IndexNode::Form(index_form_from_html(form)),
    };
    Some(converted)
}

/// Returns `true` when a paragraph looks like a post signature rather than content.
///
/// The text is trimmed and ASCII-lowercased, then matched against:
/// - the prefixes `sent from my`, `signature:`, and `posted via`;
/// - a dash delimiter: a run of two or more `-` that either ends the line (`--`, `-----`)
///   or is followed by whitespace (`-- Jane`).
///
/// Text such as `--verbose enables logging` is deliberately *not* treated as a signature:
/// a dash run glued to a word is far more likely to be a command-line flag or prose than a
/// signature separator, and dropping it would silently lose post content.
fn looks_like_signature_line(text: &str) -> bool {
    const SIGNATURE_PREFIXES: [&str; 3] = ["sent from my", "signature:", "posted via"];

    let lower = text.trim().to_ascii_lowercase();
    if SIGNATURE_PREFIXES
        .iter()
        .any(|prefix| lower.starts_with(*prefix))
    {
        return true;
    }

    match lower.strip_prefix("--") {
        Some(after_delimiter) => {
            // Swallow any further dashes so `---` and `----- name` behave like `--`.
            let rest = after_delimiter.trim_start_matches('-');
            rest.is_empty() || rest.starts_with(char::is_whitespace)
        }
        None => false,
    }
}

/// Returns `true` when the link text (trimmed, ASCII case-insensitive) is one of the
/// generic forum breadcrumb labels.
fn is_forum_breadcrumb_link(link: &index_dom::HtmlLink) -> bool {
    const BREADCRUMB_LABELS: [&str; 6] = [
        "home",
        "forums",
        "forum",
        "boards",
        "topics",
        "discussions",
    ];

    let label = link.text.trim().to_ascii_lowercase();
    BREADCRUMB_LABELS.contains(&label.as_str())
}

/// Returns `true` when a link navigates between pages of a thread or listing.
///
/// A link qualifies when:
/// - its trimmed, ASCII-lowercased text is a pager label (`next`, `prev`, `previous`,
///   `older`, `newer`, `more`) or starts with `page `;
/// - its href contains a `/page-` path marker; or
/// - its href's query string has a parameter named exactly `page`, `start`, or `p`.
///
/// Query parameters are compared by exact key rather than by substring, so hrefs such as
/// `?step=2`, `?group=rust`, or `?homepage=1` are no longer mistaken for pagination. This
/// uses the same parameter names that `looks_like_paginated_path` checks on the page URL.
fn is_forum_pagination_link(link: &index_dom::HtmlLink) -> bool {
    let text = link.text.trim().to_ascii_lowercase();
    let is_pager_label = matches!(
        text.as_str(),
        "next" | "prev" | "previous" | "older" | "newer" | "more"
    );
    if is_pager_label || text.starts_with("page ") {
        return true;
    }

    link.href.contains("/page-") || href_has_pagination_query(&link.href)
}

/// Returns `true` when the query string of `href` has a `page`, `start`, or `p` parameter.
///
/// Works on relative hrefs as well as absolute ones: the fragment is dropped, then
/// everything after the first `?` is split on `&` (and the legacy `;` separator).
fn href_has_pagination_query(href: &str) -> bool {
    let without_fragment = href.split('#').next().unwrap_or(href);
    let Some((_, query)) = without_fragment.split_once('?') else {
        return false;
    };

    query
        .split(|separator: char| separator == '&' || separator == ';')
        .filter_map(|pair| pair.split_once('='))
        .any(|(key, _)| matches!(key, "page" | "start" | "p"))
}

/// Default predicate deciding whether a forum link is worth surfacing.
///
/// Rejects links with blank text, `javascript:` / `mailto:` hrefs (ASCII case-insensitive),
/// and per-post action labels (`reply`, `quote`, `report`, `like`).
fn default_forum_link_filter(link: &index_dom::HtmlLink) -> bool {
    let label = link.text.trim();
    let href = link.href.to_ascii_lowercase();

    let is_blank = label.is_empty();
    let is_pseudo_href = href.starts_with("javascript:") || href.starts_with("mailto:");
    let is_post_action = matches!(
        label.to_ascii_lowercase().as_str(),
        "reply" | "quote" | "report" | "like"
    );

    !(is_blank || is_pseudo_href || is_post_action)
}

/// Returns `true` when the link text hints at a useful follow-up destination.
///
/// The trimmed, ASCII-lowercased text must contain one of the navigation keywords.
fn looks_like_forum_next_step(link: &index_dom::HtmlLink) -> bool {
    const NEXT_STEP_KEYWORDS: [&str; 6] = [
        "thread",
        "topic",
        "discussion",
        "result",
        "latest",
        "archive",
    ];

    let label = link.text.trim().to_ascii_lowercase();
    NEXT_STEP_KEYWORDS
        .iter()
        .any(|keyword| label.contains(*keyword))
}

/// Link filter for Reddit pages: the default forum filter plus suppression of the
/// `give award`, `share`, `save`, and `hide` labels.
fn is_reddit_actionable_link(link: &index_dom::HtmlLink) -> bool {
    if !default_forum_link_filter(link) {
        return false;
    }

    let label = link.text.trim().to_ascii_lowercase();
    !matches!(label.as_str(), "give award" | "share" | "save" | "hide")
}

/// Link filter for legacy forum pages: the default forum filter plus suppression of the
/// `pm`, `warn`, `ignore`, and `report post` labels.
fn is_legacy_actionable_link(link: &index_dom::HtmlLink) -> bool {
    if !default_forum_link_filter(link) {
        return false;
    }

    let label = link.text.trim().to_ascii_lowercase();
    !matches!(label.as_str(), "pm" | "warn" | "ignore" | "report post")
}

/// Returns `true` when a Reddit page appears to be gated rather than rendered.
///
/// Two signals are used: a page titled "reddit" with no readable nodes and at most two
/// links, or any paragraph containing one of the known gate phrases (ASCII
/// case-insensitive).
fn reddit_script_gated(page: &ReadablePage) -> bool {
    const GATE_MARKERS: [&str; 4] = [
        "continue in app",
        "enable javascript",
        "you've been blocked",
        "log in to reddit",
    ];

    let is_empty_shell = page.nodes.is_empty()
        && page.links.len() <= 2
        && page.title.to_ascii_lowercase().contains("reddit");
    if is_empty_shell {
        return true;
    }

    for paragraph in &page.paragraphs {
        let lowered = paragraph.to_ascii_lowercase();
        if GATE_MARKERS.iter().any(|marker| lowered.contains(*marker)) {
            return true;
        }
    }
    false
}

/// Builds the fallback diagnostic document for a forum page that could not be transformed.
///
/// The document carries a low-confidence `FailureDiagnostic` titled after the page (cause:
/// blocked by policy), a warning record tagged `INDEX-FORUM-BLOCKED`, and a fallback
/// quality score of 30.
fn blocked_forum_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    reason: &str,
) -> IndexDocument {
    let record =
        DiagnosticRecord::new(DiagnosticSeverity::Warning, "INDEX-FORUM-BLOCKED", reason)
            .with_field("adapter", adapter_id.as_str());

    let diagnostic = FailureDiagnostic::new(
        page.title.clone(),
        DiagnosticSource::Adapter,
        DiagnosticConfidence::Low,
        reason,
    )
    .with_likely_cause(FailureCause::BlockedByPolicy)
    .with_fallback("generic read-only extraction")
    .with_tried("adapter forum-family detection")
    .with_actions([
        DiagnosticAction::Retry,
        DiagnosticAction::Capture,
        DiagnosticAction::AddFixture,
    ])
    .with_command(":extract links")
    .with_command(":capture save forum-blocked.capture")
    .with_record(record);

    let quality_reasons = [
        format!("matched adapter: {}", adapter_id.as_str()),
        "blocked forum flow emitted deterministic diagnostic".to_owned(),
    ];

    let mut document = diagnostic.into_document();
    document.metadata.canonical_url = page.metadata.canonical_url.clone();
    document.metadata.language = page.metadata.language.clone();
    document.metadata.adapter_id = Some(adapter_id.clone());
    document.metadata.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Fallback,
        30,
        quality_reasons,
    ));
    document
}

fn task_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    title: String,
    tasks: Vec<String>,
) -> IndexDocument {
    let mut document = IndexDocument::titled(title.clone());
    document.metadata.canonical_url = page.metadata.canonical_url.clone();
    document.metadata.language = page.metadata.language.clone();
    document.metadata.description = page.metadata.description.clone();
    document.metadata.open_graph_title = page.metadata.open_graph_title.clone();
    document.metadata.open_graph_description = page.metadata.open_graph_description.clone();
    document.metadata.adapter_id = Some(adapter_id.clone());
    document.metadata.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Adapter,
        95,
        [
            format!("matched adapter: {}", adapter_id.as_str()),
            "fixture-backed task view".to_owned(),
        ],
    ));
    document.push(IndexNode::Heading {
        level: 1,
        text: title,
    });
    document.push(IndexNode::List {
        ordered: false,
        items: tasks,
    });

    for node in page.nodes.iter().take(3) {
        if let Some(summary) = summary_node(node) {
            document.push(summary);
        }
    }

    for link in page.links.iter().take(8) {
        document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
    }

    document
}

/// Builds the adapter document for a Hacker News page.
///
/// Output order: level-1 heading (`Hacker News: <page type>`), a "Family | Intent"
/// paragraph, the task list for the page type, the readable content with tables and forms
/// stripped, story links, a collapsed "HN Navigation" section, a collapsed "HN Footer"
/// section, and finally every page form. Quality is recorded as an adapter match with
/// score 96.
fn hacker_news_document(page: &ReadablePage, matched: &AdapterMatch) -> IndexDocument {
    let title = format!("Hacker News: {}", matched.page_type);
    let intent = classify_forum_intent(page_url(page).as_ref(), page);
    let mut document = IndexDocument::titled(title.clone());
    // Carry the page metadata over and stamp the adapter identity and quality score.
    document.metadata.canonical_url = page.metadata.canonical_url.clone();
    document.metadata.language = page.metadata.language.clone();
    document.metadata.description = page.metadata.description.clone();
    document.metadata.open_graph_title = page.metadata.open_graph_title.clone();
    document.metadata.open_graph_description = page.metadata.open_graph_description.clone();
    document.metadata.adapter_id = Some(matched.id.clone());
    document.metadata.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Adapter,
        96,
        [
            "matched adapter: hacker-news".to_owned(),
            "preserved actionable links and forms".to_owned(),
            "suppressed raw table-layout dump".to_owned(),
        ],
    ));
    document.push(IndexNode::Heading {
        level: 1,
        text: title,
    });
    document.push(IndexNode::Paragraph(format!(
        "Family: {} | Intent: {}",
        ForumFamily::HackerNews.label(),
        intent.as_str()
    )));
    document.push(IndexNode::List {
        ordered: false,
        items: hacker_news_tasks(&matched.page_type),
    });

    // Preserve readable content except the raw table layout that makes HN hard to scan.
    // The first level-1 heading that repeats the page title is skipped as well; only one
    // such heading is dropped.
    let mut skipped_title_heading = false;
    for node in &page.nodes {
        if !skipped_title_heading
            && matches!(
                node,
                ReadableNode::Heading { level: 1, text } if text == &page.title
            )
        {
            skipped_title_heading = true;
            continue;
        }
        if let Some(index_node) = index_node_from_readable_without_tables(node) {
            document.push(index_node);
        }
    }

    // Bucket the actionable links. The navigation check runs before the footer check, and
    // anything matching neither is treated as a story link.
    let mut story_links = Vec::new();
    let mut navigation_links = Vec::new();
    let mut footer_links = Vec::new();
    for link in &page.links {
        if !is_hacker_news_link_actionable(link) {
            continue;
        }
        if is_hacker_news_navigation_link(link) {
            navigation_links.push(link);
        } else if is_hacker_news_footer_link(link) {
            footer_links.push(link);
        } else {
            story_links.push(link);
        }
    }

    // Story links are emitted at the top level, uncapped.
    for link in story_links {
        document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
    }

    if !navigation_links.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some("HN Navigation".to_owned()),
            collapsed: true,
            nodes: navigation_links
                .into_iter()
                .map(|link| IndexNode::Link(Link::new(&link.text, &link.href)))
                .collect(),
        });
    }

    if !footer_links.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Related,
            title: Some("HN Footer".to_owned()),
            collapsed: true,
            nodes: footer_links
                .into_iter()
                .map(|link| IndexNode::Link(Link::new(&link.text, &link.href)))
                .collect(),
        });
    }

    // Forms were dropped from the readable content above; they are appended here instead.
    for form in &page.forms {
        document.push(IndexNode::Form(index_form_from_html(form)));
    }

    document
}

/// Returns the task list shown for a Hacker News page type.
///
/// The page type is matched ASCII case-insensitively by substring: `discussion` takes
/// precedence over `login`, and anything else gets the front-page task list.
fn hacker_news_tasks(page_type: &str) -> Vec<String> {
    const DISCUSSION_TASKS: &[&str] = &[
        "Read story context",
        "Read comment threads",
        "Open author and parent links",
        "Use HN search form",
    ];
    const LOGIN_TASKS: &[&str] = &[
        "Open login workflow",
        "Use HN search form",
        "Jump to front-page discussions",
    ];
    const LISTING_TASKS: &[&str] = &[
        "Open top stories",
        "Open discussion threads",
        "Filter by source or author",
        "Use HN search form",
    ];

    let normalized = page_type.to_ascii_lowercase();
    let tasks = if normalized.contains("discussion") {
        DISCUSSION_TASKS
    } else if normalized.contains("login") {
        LOGIN_TASKS
    } else {
        LISTING_TASKS
    };

    tasks.iter().map(|task| (*task).to_owned()).collect()
}

/// Derives a short page-type label for a Hacker News URL.
///
/// The first non-empty path segment selects the label. `item` and `user` pages append the
/// `id` query parameter when present. Without a URL, or for an unrecognized segment, the
/// page title is returned unchanged.
fn hacker_news_page_type(url: Option<&Url>, title: &str) -> String {
    let Some(url) = url else {
        return title.to_owned();
    };

    let segment = path_segments(url).first().copied().unwrap_or("");

    // Segments that map to a fixed label.
    let fixed_label = match segment {
        "" | "news" => Some("front-page"),
        "newest" => Some("new"),
        "front" => Some("past"),
        "newcomments" => Some("comments"),
        "ask" => Some("ask"),
        "show" => Some("show"),
        "jobs" => Some("jobs"),
        "login" => Some("login"),
        "submit" => Some("submit"),
        _ => None,
    };
    if let Some(label) = fixed_label {
        return label.to_owned();
    }

    // First `id` query parameter, if any.
    let query_id = || {
        url.query_pairs()
            .find_map(|(key, value)| (key == "id").then(|| value.to_string()))
    };

    match segment {
        "item" => match query_id() {
            Some(id) => format!("discussion #{id}"),
            None => "discussion".to_owned(),
        },
        "user" => match query_id() {
            Some(id) => format!("user {id}"),
            None => "user".to_owned(),
        },
        _ => title.to_owned(),
    }
}

/// Heuristically decides whether a page is a Hacker News page.
///
/// Any one signal is enough: the title mentions "hacker news" (ASCII case-insensitive), a
/// form posts to `news.ycombinator.com`, at least three links point at that host, or at
/// least three links carry Hacker News top-bar labels.
fn looks_like_hacker_news(page: &ReadablePage) -> bool {
    const HN_HOST: &str = "news.ycombinator.com";

    let title_mentions_hn = page.title.to_ascii_lowercase().contains("hacker news");

    let form_targets_hn = || page.forms.iter().any(|form| form.action.contains(HN_HOST));

    let enough_hn_links = || {
        let count = page
            .links
            .iter()
            .filter(|link| link.href.contains(HN_HOST))
            .count();
        count >= 3
    };

    let enough_nav_labels = || {
        let count = page
            .links
            .iter()
            .filter(|link| {
                let label = link.text.to_ascii_lowercase();
                matches!(
                    label.as_str(),
                    "hacker news"
                        | "new"
                        | "past"
                        | "comments"
                        | "ask"
                        | "show"
                        | "jobs"
                        | "submit"
                        | "login"
                )
            })
            .count();
        count >= 3
    };

    title_mentions_hn || form_targets_hn() || enough_hn_links() || enough_nav_labels()
}

/// Returns `true` when a Hacker News link should appear in the document.
///
/// Rejects links with blank text, `mailto:` / `javascript:` hrefs, hide and vote
/// endpoints, and the per-item control labels `hide`, `favorite`, `parent`, `next`, and
/// `root` (ASCII case-insensitive).
fn is_hacker_news_link_actionable(link: &index_dom::HtmlLink) -> bool {
    const CONTROL_LABELS: [&str; 5] = ["hide", "favorite", "parent", "next", "root"];

    let label = link.text.trim();
    let href = link.href.to_ascii_lowercase();

    let is_blank = label.is_empty();
    let is_pseudo_href = href.starts_with("mailto:") || href.starts_with("javascript:");
    let is_hide_or_vote =
        href.contains("news.ycombinator.com/hide?id=") || href.contains("vote?id=");
    let is_control_label = CONTROL_LABELS
        .iter()
        .any(|control| label.eq_ignore_ascii_case(*control));

    !(is_blank || is_pseudo_href || is_hide_or_vote || is_control_label)
}

/// Returns `true` when the link text is one of the Hacker News top-bar labels.
///
/// The text is trimmed and ASCII-lowercased before comparison. Trimming matters because
/// `is_hacker_news_link_actionable` already trims link text: without it, a label padded
/// with whitespace would pass that filter and then be misfiled as a story link.
fn is_hacker_news_navigation_link(link: &index_dom::HtmlLink) -> bool {
    matches!(
        link.text.trim().to_ascii_lowercase().as_str(),
        "hacker news"
            | "new"
            | "past"
            | "comments"
            | "ask"
            | "show"
            | "jobs"
            | "submit"
            | "login"
            | "more"
    )
}

/// Returns `true` when the link text is one of the Hacker News footer labels.
///
/// The text is trimmed and ASCII-lowercased before comparison. Trimming matters because
/// `is_hacker_news_link_actionable` already trims link text: without it, a label padded
/// with whitespace would pass that filter and then be misfiled as a story link.
fn is_hacker_news_footer_link(link: &index_dom::HtmlLink) -> bool {
    matches!(
        link.text.trim().to_ascii_lowercase().as_str(),
        "guidelines" | "faq" | "lists" | "api" | "security" | "legal" | "apply to yc"
    )
}

/// Converts a readable node into its Index counterpart, dropping tables and forms.
///
/// Sections are converted recursively and omitted when none of their children survive.
fn index_node_from_readable_without_tables(node: &ReadableNode) -> Option<IndexNode> {
    let converted = match node {
        // Tables and forms are intentionally left out of this view.
        ReadableNode::Table { .. } | ReadableNode::Form(_) => return None,
        ReadableNode::Heading { level, text } => IndexNode::Heading {
            level: *level,
            text: text.clone(),
        },
        ReadableNode::Paragraph(text) => IndexNode::Paragraph(text.clone()),
        ReadableNode::Link(link) => IndexNode::Link(Link::new(&link.text, &link.href)),
        ReadableNode::List { ordered, items } => IndexNode::List {
            ordered: *ordered,
            items: items.clone(),
        },
        ReadableNode::CodeBlock { language, code } => IndexNode::CodeBlock {
            language: language.clone(),
            code: code.clone(),
        },
        ReadableNode::Spacer { lines } => IndexNode::Spacer { lines: *lines },
        ReadableNode::Section {
            role,
            title,
            collapsed,
            nodes,
        } => {
            let children: Vec<IndexNode> = nodes
                .iter()
                .filter_map(index_node_from_readable_without_tables)
                .collect();
            if children.is_empty() {
                return None;
            }
            IndexNode::Section {
                role: section_role(*role),
                title: title.clone(),
                collapsed: *collapsed,
                nodes: children,
            }
        }
        ReadableNode::Image { alt, src } => IndexNode::Image {
            alt: alt.clone(),
            src: src.clone(),
        },
    };
    Some(converted)
}

fn section_role(role: ReadableSectionRole) -> SectionRole {
    match role {
        ReadableSectionRole::Main => SectionRole::Main,
        ReadableSectionRole::Navigation => SectionRole::Navigation,
        ReadableSectionRole::Aside => SectionRole::Aside,
        ReadableSectionRole::Footer => SectionRole::Footer,
        ReadableSectionRole::Comments => SectionRole::Comments,
        ReadableSectionRole::Related => SectionRole::Related,
        ReadableSectionRole::Unknown => SectionRole::Unknown,
    }
}

/// Copies an HTML form into the Index form model, field by field.
///
/// Inputs and buttons are cloned in their original order.
fn index_form_from_html(form: &index_dom::HtmlForm) -> Form {
    let inputs = form
        .inputs
        .iter()
        .map(|field| Input {
            name: field.name.clone(),
            kind: field.kind.clone(),
            value: field.value.clone(),
            required: field.required,
        })
        .collect();

    let buttons = form
        .buttons
        .iter()
        .map(|control| ButtonAction {
            name: control.name.clone(),
            value: control.value.clone(),
            label: control.label.clone(),
        })
        .collect();

    Form {
        name: form.name.clone(),
        method: form.method.clone(),
        action: form.action.clone(),
        inputs,
        buttons,
    }
}

/// Converts a readable node into a summary node, keeping only headings and paragraphs.
///
/// Every other node kind yields `None`.
fn summary_node(node: &ReadableNode) -> Option<IndexNode> {
    let summary = match node {
        ReadableNode::Paragraph(text) => IndexNode::Paragraph(text.clone()),
        ReadableNode::Heading { level, text } => IndexNode::Heading {
            level: *level,
            text: text.clone(),
        },
        // Listed explicitly so a new node kind forces a decision here.
        ReadableNode::Link(_)
        | ReadableNode::CodeBlock { .. }
        | ReadableNode::List { .. }
        | ReadableNode::Table { .. }
        | ReadableNode::Spacer { .. }
        | ReadableNode::Section { .. }
        | ReadableNode::Image { .. }
        | ReadableNode::Form(_) => return None,
    };
    Some(summary)
}

/// Parses the page's canonical URL, returning `None` when it is absent or fails to parse.
fn page_url(page: &ReadablePage) -> Option<Url> {
    let raw = page.metadata.canonical_url.as_deref()?;
    Url::parse(raw).ok()
}

/// Returns the non-empty path segments of `url`, in order.
///
/// When `Url::path_segments` yields `None`, the result is an empty vector.
fn path_segments(url: &Url) -> Vec<&str> {
    match url.path_segments() {
        Some(parts) => parts.filter(|part| !part.is_empty()).collect(),
        None => Vec::new(),
    }
}

#[cfg(test)]
mod tests {
    use index_dom::HtmlLink;
    use index_dom::{HtmlButton, HtmlForm, HtmlInput};
    use index_readability::{ReadableMetadata, ReadableNode, ReadablePage};

    use super::{AdapterContext, AdapterRegistry, page_url, path_segments};

    /// Reports whether any code block in `nodes`, including those nested inside
    /// sections, has code containing `needle`.
    fn contains_code_block(nodes: &[index_core::IndexNode], needle: &str) -> bool {
        for node in nodes {
            let found = match node {
                index_core::IndexNode::CodeBlock { code, .. } => code.contains(needle),
                index_core::IndexNode::Section {
                    nodes: children, ..
                } => contains_code_block(children, needle),
                _ => false,
            };
            if found {
                return true;
            }
        }
        false
    }

    /// Reports whether any paragraph in `nodes`, including those nested inside
    /// sections, has text containing `needle`.
    fn contains_paragraph_text(nodes: &[index_core::IndexNode], needle: &str) -> bool {
        for node in nodes {
            let found = match node {
                index_core::IndexNode::Paragraph(text) => text.contains(needle),
                index_core::IndexNode::Section {
                    nodes: children, ..
                } => contains_paragraph_text(children, needle),
                _ => false,
            };
            if found {
                return true;
            }
        }
        false
    }

    /// Reports whether any error node in `nodes`, including those nested inside
    /// sections, has text containing `needle`.
    fn contains_error_text(nodes: &[index_core::IndexNode], needle: &str) -> bool {
        for node in nodes {
            let found = match node {
                index_core::IndexNode::Error(text) => text.contains(needle),
                index_core::IndexNode::Section {
                    nodes: children, ..
                } => contains_error_text(children, needle),
                _ => false,
            };
            if found {
                return true;
            }
        }
        false
    }

    /// Builds a minimal page fixture: one summary paragraph, one "Primary" link
    /// pointing at `url`, no forms, and metadata whose canonical URL is `url`.
    fn page(url: &str, title: &str) -> ReadablePage {
        let summary = "Summary paragraph.".to_owned();
        let primary_link = HtmlLink {
            text: "Primary".to_owned(),
            href: url.to_owned(),
        };
        let metadata = ReadableMetadata {
            canonical_url: Some(url.to_owned()),
            language: Some("en".to_owned()),
            description: Some("Description".to_owned()),
            open_graph_title: None,
            open_graph_description: None,
        };

        ReadablePage {
            title: title.to_owned(),
            paragraphs: vec![summary.clone()],
            nodes: vec![ReadableNode::Paragraph(summary)],
            links: vec![primary_link],
            forms: Vec::new(),
            metadata,
        }
    }

    // Checks that the default registry maps each supported canonical URL to the
    // expected adapter id.
    #[test]
    fn registry_detects_initial_supported_sites() {
        let registry = AdapterRegistry::default_registry();
        // Rows are (canonical URL, expected adapter id).
        for (url, expected) in [
            ("https://github.com/index-rs/index", "github.repository"),
            (
                "https://github.com/index-rs/index/issues/42",
                "github.issue",
            ),
            ("https://gitlab.com/index-rs/index", "gitlab"),
            ("https://git.sr.ht/~index/index", "sourcehut"),
            ("https://codeberg.org/index/index", "forge"),
            ("https://docs.rs/scraper/latest/scraper/", "docs.rs"),
            (
                "https://index.readthedocs.io/en/latest/guide/",
                "read-the-docs",
            ),
            (
                "https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector",
                "mdn",
            ),
            ("https://crates.io/crates/scraper", "crates.io"),
            (
                "https://en.wikipedia.org/wiki/Rust_(programming_language)",
                "wikipedia",
            ),
            ("https://news.ycombinator.com/item?id=1", "hacker-news"),
            (
                "https://stackoverflow.com/questions/1/example",
                "stackoverflow",
            ),
            (
                "https://old.reddit.com/r/rust/comments/abc123/example_thread/",
                "reddit",
            ),
            (
                "https://slashdot.org/story/26/01/01/1230201/example",
                "slashdot",
            ),
            ("https://discuss.example.org/t/topic/42", "discourse"),
            (
                "https://resetera.com/threads/example-thread.42/",
                "forum-xenforo",
            ),
            (
                "https://forums.tomshardware.com/forum/threads/example.42/",
                "forum-legacy",
            ),
            ("https://arxiv.org/abs/2601.00001", "arxiv"),
            (
                "https://archive.org/details/community-manuals",
                "internet-archive",
            ),
        ] {
            let page = page(url, "Example");
            let matched = registry.detect(&AdapterContext { page: &page });
            // Name the URL in the failure output; without it a failing row is
            // indistinguishable from the others in this table.
            assert_eq!(
                matched.map(|matched| matched.id.to_string()),
                Some(expected.to_owned()),
                "unexpected adapter detected for {url}"
            );
        }
    }

    // Expects no adapter match for an example.com article URL.
    #[test]
    fn registry_returns_none_for_unknown_site() {
        let unknown = page("https://example.com/article", "Example");
        let registry = AdapterRegistry::default_registry();

        let matched = registry.detect(&AdapterContext { page: &unknown });

        assert_eq!(matched, None);
    }

    #[test]
    fn adapter_output_is_task_oriented_document_model() {
        let readable = page("https://github.com/index-rs/index", "Index");
        let registry = AdapterRegistry::default_registry();
        let output = registry.transform(&AdapterContext { page: &readable });

        // The document must be attributed to the GitHub repository adapter.
        let adapter_id = output
            .as_ref()
            .and_then(|document| document.metadata.adapter_id.as_ref());
        assert!(matches!(
            adapter_id,
            Some(id) if id.as_str() == "github.repository"
        ));

        // The node at index 1 must be a list that offers the "Open issues" task.
        let second_node = output.as_ref().and_then(|document| document.nodes.get(1));
        assert!(matches!(
            second_node,
            Some(index_core::IndexNode::List { items, .. }) if items.iter().any(|item| item == "Open issues")
        ));
    }

    // Checks, per supported site, that the transformed document's title starts with
    // the adapter's prefix and that some top-level list node offers the expected task.
    #[test]
    fn each_supported_adapter_emits_expected_task_view() {
        let registry = AdapterRegistry::default_registry();
        // Rows are (canonical URL, page title, expected title prefix, expected task).
        for (url, title, expected_title, expected_task) in [
            (
                "https://github.com/index-rs/index/issues/42",
                "Issue",
                "GitHub issue:",
                "Inspect labels and status",
            ),
            (
                "https://docs.rs/scraper/latest/scraper/",
                "Docs",
                "docs.rs:",
                "Search items",
            ),
            (
                "https://gitlab.com/index-rs/index",
                "GitLab",
                "GitLab:",
                "Open issues and merge requests",
            ),
            (
                "https://git.sr.ht/~index/index",
                "SourceHut",
                "SourceHut:",
                "Read mailing-list context",
            ),
            (
                "https://codeberg.org/index/index",
                "Forge",
                "Forge:",
                "Open issues and pull requests",
            ),
            (
                "https://index.readthedocs.io/en/latest/guide/",
                "Read the Docs",
                "Read the Docs:",
                "Open table of contents",
            ),
            (
                "https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector",
                "MDN",
                "MDN:",
                "Open browser compatibility notes",
            ),
            (
                "https://crates.io/crates/scraper",
                "Crate",
                "crates.io:",
                "Open documentation",
            ),
            (
                "https://en.wikipedia.org/wiki/Rust_(programming_language)",
                "Rust",
                "Wikipedia:",
                "Open references",
            ),
            (
                "https://news.ycombinator.com/item?id=1",
                "Story",
                "Hacker News:",
                "Read comment threads",
            ),
            (
                "https://stackoverflow.com/questions/1/example",
                "Question",
                "StackExchange:",
                "Review accepted answer",
            ),
            (
                "https://old.reddit.com/r/rust/comments/abc123/example_thread/",
                "Reddit Thread",
                "Reddit thread:",
                "Inspect nested comments",
            ),
            (
                "https://slashdot.org/story/26/01/01/1230201/example",
                "Slashdot Story",
                "Slashdot:",
                "Inspect comment thread",
            ),
            (
                "https://discuss.example.org/t/topic/42",
                "Discourse",
                "Discourse thread:",
                "Inspect replies",
            ),
            (
                "https://resetera.com/threads/example-thread.42/",
                "ResetEra",
                "Forum thread:",
                "Inspect quote/reply chain",
            ),
            (
                "https://forums.tomshardware.com/forum/threads/example.42/",
                "Tom's Hardware",
                "Legacy forum thread:",
                "Inspect quotes/code blocks",
            ),
            (
                "https://arxiv.org/abs/2601.00001",
                "arXiv",
                "arXiv abstract:",
                "Open PDF or source",
            ),
            (
                "https://archive.org/details/community-manuals",
                "Archive",
                "Internet Archive item:",
                "Open available files",
            ),
        ] {
            let page = page(url, title);
            let document = registry.transform(&AdapterContext { page: &page });

            // Both assertions also fail when no document was produced at all; the
            // messages name the row so the failing site is identifiable.
            assert!(
                matches!(
                    document.as_ref(),
                    Some(document) if document.title.starts_with(expected_title)
                ),
                "expected a document titled with prefix {expected_title:?} for {url}"
            );
            assert!(
                matches!(
                    document.as_ref(),
                    Some(document) if document.nodes.iter().any(|node| matches!(node, index_core::IndexNode::List { items, .. } if items.iter().any(|item| item == expected_task)))
                ),
                "expected task {expected_task:?} in a list node for {url}"
            );
        }
    }

    #[test]
    fn adapter_summary_keeps_headings_and_skips_non_summary_nodes() {
        let mut readable = page("https://docs.rs/index/latest/index/", "Index docs");
        // One heading followed by a code block, a table, and an image.
        readable.nodes = vec![
            ReadableNode::Heading {
                level: 2,
                text: "Module index".to_owned(),
            },
            ReadableNode::CodeBlock {
                language: Some("rust".to_owned()),
                code: "fn main() {}".to_owned(),
            },
            ReadableNode::Table {
                rows: vec![vec!["Name".to_owned()]],
            },
            ReadableNode::Image {
                alt: "Logo".to_owned(),
                src: None,
            },
        ];

        let registry = AdapterRegistry::default_registry();
        let output = registry.transform(&AdapterContext { page: &readable });

        // The heading must appear at index 2 of the emitted nodes.
        let third_node = output.as_ref().and_then(|document| document.nodes.get(2));
        assert!(matches!(
            third_node,
            Some(index_core::IndexNode::Heading { level: 2, text }) if text == "Module index"
        ));

        // Whatever follows at index 3 (if anything) must not be the code block.
        let fourth_node = output.as_ref().and_then(|document| document.nodes.get(3));
        assert!(!matches!(
            fourth_node,
            Some(index_core::IndexNode::CodeBlock { .. })
        ));
    }

    #[test]
    fn detection_handles_missing_or_sparse_urls() {
        let registry = AdapterRegistry::default_registry();

        // With the canonical URL removed, no URL is parsed and no adapter matches.
        let mut without_url = page("https://example.com/article", "No URL");
        without_url.metadata.canonical_url = None;
        assert_eq!(page_url(&without_url), None);
        assert_eq!(
            registry.detect(&AdapterContext { page: &without_url }),
            None
        );

        // Bare site roots are still expected to report a page type.
        for (url, title, expected_page_type) in [
            ("https://docs.rs/", "Docs root", "crate documentation"),
            ("https://crates.io/", "Crates root", "crate"),
        ] {
            let root = page(url, title);
            let matched = registry.detect(&AdapterContext { page: &root });
            assert_eq!(
                matched.map(|matched| matched.page_type),
                Some(expected_page_type.to_owned())
            );
        }
    }

    #[test]
    fn path_segments_ignores_empty_segments() -> Result<(), Box<dyn std::error::Error>> {
        // Repeated and trailing slashes must not show up as empty segments.
        let parsed = url::Url::parse("https://example.com//a///b/")?;
        let segments = path_segments(&parsed);

        assert_eq!(segments, vec!["a", "b"]);
        Ok(())
    }

    #[test]
    fn hacker_news_detects_without_canonical_url() {
        let mut readable = page("https://news.ycombinator.com/news", "Hacker News");
        // Remove the canonical URL; the title and links are all that remain.
        readable.metadata.canonical_url = None;

        let newest_link = HtmlLink {
            text: "new".to_owned(),
            href: "https://news.ycombinator.com/newest".to_owned(),
        };
        let show_hn_link = HtmlLink {
            text: "Show HN: Example".to_owned(),
            href: "https://example.org/show-hn".to_owned(),
        };
        readable.links = vec![newest_link, show_hn_link];

        let registry = AdapterRegistry::default_registry();
        let detected_id = registry
            .detect(&AdapterContext { page: &readable })
            .map(|matched| matched.id.as_str().to_owned());

        assert_eq!(detected_id, Some("hacker-news".to_owned()));
    }

    #[test]
    fn hacker_news_task_view_preserves_forms_and_actionable_links()
    -> Result<(), Box<dyn std::error::Error>> {
        // Shorthand for the link fixtures below.
        let link = |text: &str, href: &str| HtmlLink {
            text: text.to_owned(),
            href: href.to_owned(),
        };

        let mut readable = page(
            "https://news.ycombinator.com/item?id=42",
            "Story | Hacker News",
        );
        readable.nodes = vec![
            ReadableNode::Table {
                rows: vec![
                    vec!["1.".to_owned(), "Story".to_owned()],
                    vec!["12 points".to_owned(), "8 comments".to_owned()],
                ],
            },
            ReadableNode::Paragraph("A discussion page.".to_owned()),
        ];
        readable.links = vec![
            link("Story", "https://example.org/story"),
            link("8 comments", "https://news.ycombinator.com/item?id=42"),
            link("alice", "https://news.ycombinator.com/user?id=alice"),
            link(
                "hide",
                "https://news.ycombinator.com/hide?id=42&goto=item%3Fid%3D42",
            ),
            link("FAQ", "https://news.ycombinator.com/newsfaq.html"),
        ];
        let search_input = HtmlInput {
            name: "q".to_owned(),
            kind: "text".to_owned(),
            value: None,
            required: true,
        };
        let search_button = HtmlButton {
            name: Some("go".to_owned()),
            value: Some("1".to_owned()),
            label: "Search".to_owned(),
        };
        readable.forms = vec![HtmlForm {
            name: "search".to_owned(),
            method: "GET".to_owned(),
            action: "https://news.ycombinator.com/search".to_owned(),
            inputs: vec![search_input],
            buttons: vec![search_button],
        }];

        let registry = AdapterRegistry::default_registry();
        let document = registry
            .transform(&AdapterContext { page: &readable })
            .ok_or("adapter should transform")?;

        let adapter_id = document.metadata.adapter_id.as_ref().map(|id| id.as_str());
        assert_eq!(adapter_id, Some("hacker-news"));

        // No top-level table node may be emitted.
        let has_table = document
            .nodes
            .iter()
            .any(|node| matches!(node, index_core::IndexNode::Table { .. }));
        assert!(!has_table);

        // The search form must be present with its name and action intact.
        let keeps_search_form = document.nodes.iter().any(
            |node| matches!(node, index_core::IndexNode::Form(form) if form.name == "search" && form.action == "https://news.ycombinator.com/search"),
        );
        assert!(keeps_search_form);

        // The story link must be present; the "hide" link must not.
        let keeps_story_link = document.nodes.iter().any(
            |node| matches!(node, index_core::IndexNode::Link(link) if link.text == "Story" && link.href == "https://example.org/story"),
        );
        assert!(keeps_story_link);

        let keeps_hide_link = document
            .nodes
            .iter()
            .any(|node| matches!(node, index_core::IndexNode::Link(link) if link.text == "hide"));
        assert!(!keeps_hide_link);
        Ok(())
    }

    #[test]
    fn hacker_news_detects_login_form_without_canonical_or_title_hint() {
        // The page uses default metadata and a generic title; only the link and form
        // URLs point at news.ycombinator.com.
        let forgot_link = HtmlLink {
            text: "Forgot your password?".to_owned(),
            href: "https://news.ycombinator.com/forgot".to_owned(),
        };
        let account_input = HtmlInput {
            name: "acct".to_owned(),
            kind: "text".to_owned(),
            value: None,
            required: true,
        };
        let login_button = HtmlButton {
            name: None,
            value: None,
            label: "login".to_owned(),
        };
        let login_form = HtmlForm {
            name: "login".to_owned(),
            method: "POST".to_owned(),
            action: "https://news.ycombinator.com/login".to_owned(),
            inputs: vec![account_input],
            buttons: vec![login_button],
        };
        let readable = ReadablePage {
            title: "Untitled".to_owned(),
            paragraphs: Vec::new(),
            nodes: Vec::new(),
            links: vec![forgot_link],
            forms: vec![login_form],
            metadata: ReadableMetadata::default(),
        };

        let registry = AdapterRegistry::default_registry();
        let detected_id = registry
            .detect(&AdapterContext { page: &readable })
            .map(|matched| matched.id.as_str().to_owned());

        assert_eq!(detected_id, Some("hacker-news".to_owned()));
    }

    #[test]
    fn forum_intent_classifier_handles_common_shapes() -> Result<(), Box<dyn std::error::Error>> {
        // Case 1: a thread page carrying a POST form named "reply".
        let thread_url =
            url::Url::parse("https://old.reddit.com/r/rust/comments/abc123/thread?page=2")?;
        let comment_input = HtmlInput {
            name: "comment".to_owned(),
            kind: "text".to_owned(),
            value: None,
            required: true,
        };
        let reply_form = HtmlForm {
            name: "reply".to_owned(),
            method: "POST".to_owned(),
            action: "https://old.reddit.com/comment".to_owned(),
            inputs: vec![comment_input],
            buttons: Vec::new(),
        };
        let next_link = HtmlLink {
            text: "Next".to_owned(),
            href: "https://old.reddit.com/r/rust/comments/abc123/thread?page=3".to_owned(),
        };
        let reply_page = ReadablePage {
            title: "Thread".to_owned(),
            paragraphs: vec!["Body".to_owned()],
            nodes: vec![ReadableNode::Paragraph("Body".to_owned())],
            links: vec![next_link],
            forms: vec![reply_form],
            metadata: ReadableMetadata::default(),
        };
        let reply_intent = super::classify_forum_intent(Some(&thread_url), &reply_page);
        assert_eq!(reply_intent, super::ForumIntent::ReplyForm);

        // Case 2: a user-profile URL paired with a generic page fixture.
        let profile_url = url::Url::parse("https://news.ycombinator.com/user?id=alice")?;
        let generic_page = page("https://example.com", "x");
        let profile_intent = super::classify_forum_intent(Some(&profile_url), &generic_page);
        assert_eq!(profile_intent, super::ForumIntent::ProfileNoise);
        Ok(())
    }

    #[test]
    fn reddit_script_gated_page_emits_deterministic_fallback_document()
    -> Result<(), Box<dyn std::error::Error>> {
        // The only page content is a "Continue in app" prompt plus a login link.
        let gate_text = "Continue in app".to_owned();
        let login_link = HtmlLink {
            text: "continue".to_owned(),
            href: "https://www.reddit.com/login/".to_owned(),
        };
        let metadata = ReadableMetadata {
            canonical_url: Some(
                "https://www.reddit.com/r/rust/comments/abc123/example/".to_owned(),
            ),
            language: Some("en".to_owned()),
            description: None,
            open_graph_title: None,
            open_graph_description: None,
        };
        let readable = ReadablePage {
            title: "Reddit".to_owned(),
            paragraphs: vec![gate_text.clone()],
            nodes: vec![ReadableNode::Paragraph(gate_text)],
            links: vec![login_link],
            forms: Vec::new(),
            metadata,
        };

        let registry = AdapterRegistry::default_registry();
        let document = registry
            .transform(&AdapterContext { page: &readable })
            .ok_or("adapter transform missing")?;

        let adapter_id = document.metadata.adapter_id.as_ref().map(|id| id.as_str());
        assert_eq!(adapter_id, Some("reddit"));

        // The fallback document must explain the script/cookie requirement.
        let explains_gate =
            contains_paragraph_text(&document.nodes, "additional script/cookie flow");
        assert!(explains_gate);
        Ok(())
    }

    #[test]
    fn legacy_forum_thread_filters_signature_lines() -> Result<(), Box<dyn std::error::Error>> {
        // A useful answer, a "Sent from my phone" signature line, and a code block.
        let thread_nodes = vec![
            ReadableNode::Paragraph("Useful answer.".to_owned()),
            ReadableNode::Paragraph("Sent from my phone".to_owned()),
            ReadableNode::CodeBlock {
                language: None,
                code: "fn legacy() {}".to_owned(),
            },
        ];
        let next_page_link = HtmlLink {
            text: "next".to_owned(),
            href: "https://forums.tomshardware.com/forum/threads/example.42/?page=2".to_owned(),
        };
        let metadata = ReadableMetadata {
            canonical_url: Some(
                "https://forums.tomshardware.com/forum/threads/example.42/".to_owned(),
            ),
            language: Some("en".to_owned()),
            description: None,
            open_graph_title: None,
            open_graph_description: None,
        };
        let readable = ReadablePage {
            title: "Legacy thread".to_owned(),
            paragraphs: Vec::new(),
            nodes: thread_nodes,
            links: vec![next_page_link],
            forms: Vec::new(),
            metadata,
        };

        let registry = AdapterRegistry::default_registry();
        let document = registry
            .transform(&AdapterContext { page: &readable })
            .ok_or("adapter transform missing")?;

        // Code survives; the signature paragraph does not.
        assert!(contains_code_block(&document.nodes, "legacy"));
        assert!(!contains_paragraph_text(&document.nodes, "Sent from my"));

        // A top-level "Next steps" section must directly contain the page-2 link.
        let offers_next_page = document.nodes.iter().any(|node| match node {
            index_core::IndexNode::Section {
                title: Some(title),
                nodes: children,
                ..
            } if title == "Next steps" => children.iter().any(|child| {
                matches!(
                    child,
                    index_core::IndexNode::Link(link) if link.href.contains("page=2")
                )
            }),
            _ => false,
        });
        assert!(offers_next_page);
        Ok(())
    }

    #[test]
    fn top100_domain_normalization_handles_common_aliases() {
        // Rows are (host as given, expected normalized domain).
        for (host, expected) in [
            ("www.google.com", "google.com"),
            ("old.reddit.com", "reddit.com"),
            ("twitter.com", "x.com"),
            ("m.youtube.com", "youtube.com"),
            ("en.wikipedia.org", "wikipedia.org"),
        ] {
            assert_eq!(super::normalize_top100_domain(host), expected);
        }
    }

    #[test]
    fn blocked_flow_classifier_covers_required_classes() {
        // The same page (title "Sign in to continue") is reused for every case; only
        // its single paragraph changes between classifications.
        let mut readable = page("https://x.com/home", "Sign in to continue");

        for (paragraph, expected) in [
            (
                "Please log in to continue",
                super::BlockedFlowClass::AuthWall,
            ),
            (
                "Enable JavaScript to continue in app",
                super::BlockedFlowClass::ScriptGate,
            ),
            (
                "Captcha: verify you are human",
                super::BlockedFlowClass::BotGate,
            ),
            (
                "This content is not available in your region",
                super::BlockedFlowClass::GeoGate,
            ),
            (
                "Confirm your age to continue (18+)",
                super::BlockedFlowClass::AgeGate,
            ),
            (
                "Access denied: blocked by policy",
                super::BlockedFlowClass::PolicyGate,
            ),
        ] {
            readable.paragraphs = vec![paragraph.to_owned()];
            assert_eq!(super::classify_blocked_flow(&readable), expected);
        }
    }

    #[test]
    fn blocked_top100_document_emits_remediation_and_capture_guidance() {
        let blocked = page("https://office.com/home", "Access denied");
        let adapter_id = index_core::AdapterId::new("top100.baseline");
        let document = super::blocked_top100_document(
            &blocked,
            &adapter_id,
            "office.com",
            super::Top100Family::ServicesUtility,
            super::Top100Intent::AppShell,
            super::BlockedFlowClass::PolicyGate,
        );

        // Search the Debug rendering of the nodes for each expected fragment.
        let rendered = format!("{:?}", document.nodes);
        for fragment in [
            "Remediation",
            "policy-blocked",
            ":capture save top100-blocked.capture",
            ":capture --preview --redact https://office.com/ blocked-flow.html",
        ] {
            assert!(rendered.contains(fragment));
        }
    }

    #[test]
    fn top100_baseline_adapter_catches_supported_domains_without_dedicated_adapters()
    -> Result<(), Box<dyn std::error::Error>> {
        let registry = AdapterRegistry::default_registry();

        // Search results page: additionally checks the baseline title prefix.
        let search_page = page(
            "https://google.com/search?q=index+browser",
            "Search results - Google",
        );
        let search_doc = registry
            .transform(&AdapterContext { page: &search_page })
            .ok_or("missing top100 adapter output")?;
        let search_adapter = search_doc
            .metadata
            .adapter_id
            .as_ref()
            .map(|id| id.as_str());
        assert_eq!(search_adapter, Some("top100.baseline"));
        assert!(
            search_doc
                .title
                .starts_with("Top site baseline: google.com")
        );
        assert!(contains_paragraph_text(
            &search_doc.nodes,
            "Family: Search Portal | Intent: search-results"
        ));

        // Rows are (canonical URL, page title, error when no document is produced,
        // expected family/intent paragraph text).
        for (url, title, missing, classification) in [
            (
                "https://brave.com/",
                "Brave Search portal",
                "missing search portal output",
                "Family: Search Portal | Intent: portal-landing",
            ),
            (
                "https://dzen.ru/",
                "Dzen — Discover",
                "missing dzen output",
                "Family: Knowledge Reference | Intent: portal-landing",
            ),
            (
                "https://indiatimes.com/",
                "Indiatimes home",
                "missing indiatimes output",
                "Family: Knowledge Reference | Intent: portal-landing",
            ),
            (
                "https://rakuten.co.jp/search?f=1",
                "Rakuten search listing",
                "missing marketplace output",
                "Family: Commerce and Marketplace | Intent: marketplace-listing",
            ),
        ] {
            let readable = page(url, title);
            let document = registry
                .transform(&AdapterContext { page: &readable })
                .ok_or(missing)?;
            let adapter = document.metadata.adapter_id.as_ref().map(|id| id.as_str());
            assert_eq!(adapter, Some("top100.baseline"));
            assert!(contains_paragraph_text(&document.nodes, classification));
        }

        // Sign-in wall on a baseline domain: expects an "auth-wall" error node.
        let sign_in_prompt = "Please sign in to continue".to_owned();
        let blocked_page = ReadablePage {
            title: "Office".to_owned(),
            paragraphs: vec![sign_in_prompt.clone()],
            nodes: vec![ReadableNode::Paragraph(sign_in_prompt)],
            links: vec![HtmlLink {
                text: "Sign in".to_owned(),
                href: "https://office.com/login".to_owned(),
            }],
            forms: Vec::new(),
            metadata: ReadableMetadata {
                canonical_url: Some("https://office.com/".to_owned()),
                language: Some("en".to_owned()),
                description: None,
                open_graph_title: None,
                open_graph_description: None,
            },
        };
        let blocked_doc = registry
            .transform(&AdapterContext {
                page: &blocked_page,
            })
            .ok_or("missing blocked output")?;
        let blocked_adapter = blocked_doc
            .metadata
            .adapter_id
            .as_ref()
            .map(|id| id.as_str());
        assert_eq!(blocked_adapter, Some("top100.baseline"));
        assert!(contains_error_text(&blocked_doc.nodes, "auth-wall"));
        Ok(())
    }

    #[test]
    fn top100_helpers_extract_media_metadata_and_listing_nodes() {
        // Two metadata-style paragraphs plus one unrelated paragraph.
        let paragraphs = vec![
            "Creator: Index Channel".to_owned(),
            "Duration: 12:34".to_owned(),
            "Unrelated paragraph".to_owned(),
        ];
        // A "Results for ..." paragraph followed by a two-item list.
        let nodes = vec![
            ReadableNode::Paragraph("Results for keyboard".to_owned()),
            ReadableNode::List {
                ordered: false,
                items: vec![
                    "Mechanical keyboard".to_owned(),
                    "Compact keyboard".to_owned(),
                ],
            },
        ];
        let readable = ReadablePage {
            title: "Video".to_owned(),
            paragraphs,
            nodes,
            links: Vec::new(),
            forms: Vec::new(),
            metadata: ReadableMetadata::default(),
        };

        let media = super::top100_media_metadata(&readable);
        let expected_media = vec![
            "Creator: Index Channel".to_owned(),
            "Duration: 12:34".to_owned(),
        ];
        assert_eq!(media, expected_media);

        let listing = super::top100_listing_nodes(&readable);
        let keeps_results_paragraph = listing.iter().any(
            |node| matches!(node, index_core::IndexNode::Paragraph(text) if text.contains("Results for")),
        );
        assert!(keeps_results_paragraph);

        let keeps_two_item_list = listing.iter().any(
            |node| matches!(node, index_core::IndexNode::List { items, .. } if items.len() == 2),
        );
        assert!(keeps_two_item_list);
    }

    /// Runs one representative page per compatibility family through the default
    /// registry and checks that each resulting document carries the matching
    /// `family-pack.*` adapter identifier.
    ///
    /// The mixed-media document is additionally checked for the fallback note in the
    /// debug rendering of its nodes.
    ///
    /// # Errors
    ///
    /// Returns an error naming the family when the registry produces no document for
    /// that family's page.
    #[test]
    fn compatibility_pack_detects_major_families_without_bespoke_adapters()
    -> Result<(), Box<dyn std::error::Error>> {
        let registry = AdapterRegistry::default_registry();

        // Every fixture shares the same English metadata and differs only in canonical URL.
        let english_metadata = |canonical: &str| ReadableMetadata {
            canonical_url: Some(canonical.to_owned()),
            language: Some("en".to_owned()),
            description: None,
            open_graph_title: None,
            open_graph_description: None,
        };
        // Builds `count` links labelled "<label> <index>" whose href is the prefix
        // followed directly by the index.
        let numbered_links = |count: usize, label: &str, href_prefix: &str| -> Vec<HtmlLink> {
            (0..count)
                .map(|index| HtmlLink {
                    text: format!("{label} {index}"),
                    href: format!("{href_prefix}{index}"),
                })
                .collect()
        };
        let run_registry = |page: &ReadablePage| registry.transform(&AdapterContext { page });

        // Forums family.
        let forum = ReadablePage {
            title: String::from("Community thread"),
            paragraphs: vec![String::from("Forum intro")],
            nodes: vec![ReadableNode::List {
                ordered: false,
                items: vec![String::from("Reply"), String::from("Next")],
            }],
            links: numbered_links(
                6,
                "Forum link",
                "https://talk.example.org/forums/thread/42?page=",
            ),
            forms: Vec::new(),
            metadata: english_metadata("https://talk.example.org/forums/thread/42"),
        };
        let forum_doc = run_registry(&forum)
            .ok_or_else(|| std::io::Error::other("forum pack output missing"))?;
        let forum_adapter = forum_doc.metadata.adapter_id.as_ref().map(|id| id.as_str());
        assert_eq!(forum_adapter, Some("family-pack.forums"));

        // Question-and-answer family.
        let qa = ReadablePage {
            title: String::from("How do I parse forms?"),
            paragraphs: vec![String::from("Q body")],
            nodes: vec![ReadableNode::Paragraph(String::from("Answer body"))],
            links: numbered_links(
                4,
                "QA link",
                "https://answers.example.org/questions/42#answer-",
            ),
            forms: Vec::new(),
            metadata: english_metadata("https://answers.example.org/questions/42"),
        };
        let qa_doc =
            run_registry(&qa).ok_or_else(|| std::io::Error::other("qa pack output missing"))?;
        let qa_adapter = qa_doc.metadata.adapter_id.as_ref().map(|id| id.as_str());
        assert_eq!(qa_adapter, Some("family-pack.qa"));

        // Documentation family.
        let docs_links = vec![
            HtmlLink {
                text: String::from("Reference"),
                href: String::from("https://kb.example.org/docs/reference"),
            },
            HtmlLink {
                text: String::from("Guide"),
                href: String::from("https://kb.example.org/docs/guide"),
            },
            HtmlLink {
                text: String::from("Tutorial"),
                href: String::from("https://kb.example.org/docs/tutorial"),
            },
        ];
        let docs = ReadablePage {
            title: String::from("Parser API"),
            paragraphs: vec![String::from("Reference intro")],
            nodes: vec![
                ReadableNode::Heading {
                    level: 1,
                    text: String::from("API"),
                },
                ReadableNode::Heading {
                    level: 2,
                    text: String::from("parse()"),
                },
                ReadableNode::CodeBlock {
                    language: Some(String::from("rust")),
                    code: String::from("fn parse() {}"),
                },
            ],
            links: docs_links,
            forms: Vec::new(),
            metadata: english_metadata("https://kb.example.org/docs/api/parser"),
        };
        let docs_doc = run_registry(&docs)
            .ok_or_else(|| std::io::Error::other("docs pack output missing"))?;
        let docs_adapter = docs_doc.metadata.adapter_id.as_ref().map(|id| id.as_str());
        assert_eq!(docs_adapter, Some("family-pack.docs"));

        // News and media family.
        let news = ReadablePage {
            title: String::from("Tech news analysis"),
            paragraphs: vec![
                String::from("Lead paragraph"),
                String::from("Context paragraph"),
                String::from("Quote paragraph"),
                String::from("Conclusion paragraph"),
            ],
            nodes: vec![ReadableNode::Paragraph(String::from("Lead paragraph"))],
            links: numbered_links(4, "Source", "https://media.example.org/news/story-"),
            forms: Vec::new(),
            metadata: english_metadata("https://media.example.org/news/story-1"),
        };
        let news_doc = run_registry(&news)
            .ok_or_else(|| std::io::Error::other("news pack output missing"))?;
        let news_adapter = news_doc.metadata.adapter_id.as_ref().map(|id| id.as_str());
        assert_eq!(news_adapter, Some("family-pack.news-media"));

        // Portal family.
        let portal = ReadablePage {
            title: String::from("Explore"),
            paragraphs: vec![String::from("Quick index")],
            nodes: vec![ReadableNode::Paragraph(String::from("Quick index"))],
            links: numbered_links(24, "Portal", "https://portal.example.org/entry/"),
            forms: Vec::new(),
            metadata: english_metadata("https://portal.example.org/"),
        };
        let portal_doc = run_registry(&portal)
            .ok_or_else(|| std::io::Error::other("portal pack output missing"))?;
        let portal_adapter = portal_doc.metadata.adapter_id.as_ref().map(|id| id.as_str());
        assert_eq!(portal_adapter, Some("family-pack.portal"));

        // App-shell family: the only fixture that carries a form.
        let quick_action_form = HtmlForm {
            name: String::from("quick-action"),
            method: String::from("POST"),
            action: String::from("https://app.example.org/app/dashboard/action"),
            inputs: vec![HtmlInput {
                name: String::from("command"),
                kind: String::from("text"),
                value: None,
                required: false,
            }],
            buttons: vec![HtmlButton {
                name: Some(String::from("run")),
                label: String::from("Run"),
                value: Some(String::from("run")),
            }],
        };
        let app_shell = ReadablePage {
            title: String::from("Workspace dashboard"),
            paragraphs: vec![String::from("Workspace summary")],
            nodes: vec![ReadableNode::Paragraph(String::from("Workspace summary"))],
            links: numbered_links(
                8,
                "Action",
                "https://app.example.org/app/dashboard/action/",
            ),
            forms: vec![quick_action_form],
            metadata: english_metadata("https://app.example.org/app/dashboard"),
        };
        let app_shell_doc = run_registry(&app_shell)
            .ok_or_else(|| std::io::Error::other("app-shell pack output missing"))?;
        let app_shell_adapter = app_shell_doc
            .metadata
            .adapter_id
            .as_ref()
            .map(|id| id.as_str());
        assert_eq!(app_shell_adapter, Some("family-pack.app-shell"));

        // Commerce family.
        let commerce = ReadablePage {
            title: String::from("Shop deals"),
            paragraphs: vec![
                String::from("Price: $19.99"),
                String::from("Price: $49.99"),
                String::from("Public catalog"),
            ],
            nodes: vec![ReadableNode::List {
                ordered: false,
                items: vec![String::from("Keyboard"), String::from("Mouse")],
            }],
            links: numbered_links(8, "Product", "https://shop.example.org/store/products/"),
            forms: Vec::new(),
            metadata: english_metadata("https://shop.example.org/store/deals"),
        };
        let commerce_doc = run_registry(&commerce)
            .ok_or_else(|| std::io::Error::other("commerce pack output missing"))?;
        let commerce_adapter = commerce_doc
            .metadata
            .adapter_id
            .as_ref()
            .map(|id| id.as_str());
        assert_eq!(commerce_adapter, Some("family-pack.commerce-cards"));

        // Mixed-media family.
        let mixed_media = ReadablePage {
            title: String::from("Video gallery highlights"),
            paragraphs: vec![
                String::from("Roundup introduction"),
                String::from("Episode summary"),
                String::from("Playlist notes"),
            ],
            nodes: vec![ReadableNode::List {
                ordered: false,
                items: vec![String::from("Episode 1"), String::from("Episode 2")],
            }],
            links: numbered_links(9, "Media", "https://media.example.org/watch/highlights/"),
            forms: Vec::new(),
            metadata: english_metadata("https://media.example.org/watch/highlights"),
        };
        let mixed_media_doc = run_registry(&mixed_media)
            .ok_or_else(|| std::io::Error::other("mixed media pack output missing"))?;
        let mixed_media_adapter = mixed_media_doc
            .metadata
            .adapter_id
            .as_ref()
            .map(|id| id.as_str());
        assert_eq!(mixed_media_adapter, Some("family-pack.mixed-media"));

        // The fallback note is looked up in the debug rendering of the emitted nodes.
        let mixed_rendered = format!("{:?}", mixed_media_doc.nodes);
        assert!(
            mixed_rendered.contains("fallback: confidence below 3 returns generic transformer")
        );

        Ok(())
    }

    /// The default registry must not detect any adapter for the short generic page
    /// built by the `page` test helper.
    #[test]
    fn compatibility_pack_skips_weak_generic_pages() {
        let registry = AdapterRegistry::default_registry();
        let weak_page = page("https://example.com/article", "Short note");
        let context = AdapterContext { page: &weak_page };

        let outcome = registry.detect(&context);

        assert_eq!(outcome, None);
    }

    /// A dashboard-titled page with a single link, one short paragraph and no forms
    /// must not be detected by any adapter in the default registry.
    #[test]
    fn compatibility_pack_weak_signals_fall_back_to_generic() {
        let registry = AdapterRegistry::default_registry();

        let home_link = HtmlLink {
            text: String::from("Home"),
            href: String::from("https://app.example.org/app"),
        };
        let page_metadata = ReadableMetadata {
            canonical_url: Some(String::from("https://app.example.org/app")),
            language: Some(String::from("en")),
            description: None,
            open_graph_title: None,
            open_graph_description: None,
        };
        let weak_app_shell = ReadablePage {
            title: String::from("Dashboard"),
            paragraphs: vec![String::from("Short note")],
            nodes: vec![ReadableNode::Paragraph(String::from("Short note"))],
            links: vec![home_link],
            forms: Vec::new(),
            metadata: page_metadata,
        };

        let context = AdapterContext {
            page: &weak_app_shell,
        };
        let outcome = registry.detect(&context);

        assert_eq!(outcome, None);
    }
}