use std::collections::BTreeSet;
use index_core::{
AdapterId, ButtonAction, DiagnosticAction, DiagnosticConfidence, DiagnosticRecord,
DiagnosticSeverity, DiagnosticSource, DocumentQuality, DocumentQualityCategory, FailureCause,
FailureDiagnostic, Form, IndexDocument, IndexNode, Input, Link, SectionRole,
};
use index_readability::{ReadableNode, ReadablePage, ReadableSectionRole};
use url::Url;
/// Borrowed inputs handed to a [`SiteAdapter`] when it detects or transforms a page.
#[derive(Debug, Clone, Copy)]
pub struct AdapterContext<'a> {
/// The readable page the adapters inspect.
pub page: &'a ReadablePage,
}
/// Outcome of a successful [`SiteAdapter::detect`] call.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AdapterMatch {
/// Identifier of the adapter (or adapter variant) that matched.
pub id: AdapterId,
/// Human-readable label for the matched page; adapters in this file embed it in
/// the heading they pass to their document builders.
pub page_type: String,
}
/// A site-specific strategy that recognises a page and turns it into an [`IndexDocument`].
pub trait SiteAdapter {
/// Stable identifier of this adapter.
fn id(&self) -> AdapterId;
/// Returns `Some` match when this adapter recognises the page in `context`, `None` otherwise.
fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch>;
/// Builds the document for a page; `matched` is the value previously returned by `detect`.
fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument;
}
/// Ordered collection of site adapters.
///
/// Adapters are consulted in registration order and the first one whose
/// `detect` succeeds wins.
pub struct AdapterRegistry {
    adapters: Vec<Box<dyn SiteAdapter>>,
}

impl AdapterRegistry {
    /// Creates a registry that consults `adapters` in the given order.
    #[must_use]
    pub fn new(adapters: Vec<Box<dyn SiteAdapter>>) -> Self {
        Self { adapters }
    }

    /// Builds the registry containing every adapter defined in this file.
    ///
    /// Registration order is the match priority.
    // NOTE(review): `ArxivAdapter` and `InternetArchiveAdapter` are registered after the
    // generic `CompatibilityPackAdapter` / `Top100BaselineAdapter`, so a generic match
    // would shadow them — confirm this ordering is intended.
    #[must_use]
    pub fn default_registry() -> Self {
        let ordered: Vec<Box<dyn SiteAdapter>> = vec![
            Box::new(GitHubRepositoryAdapter),
            Box::new(GitHubIssueAdapter),
            Box::new(GitLabAdapter),
            Box::new(SourceHutAdapter),
            Box::new(ForgeAdapter),
            Box::new(DocsRsAdapter),
            Box::new(ReadTheDocsAdapter),
            Box::new(MdnAdapter),
            Box::new(CratesIoAdapter),
            Box::new(WikipediaAdapter),
            Box::new(HackerNewsAdapter),
            Box::new(StackOverflowAdapter),
            Box::new(RedditAdapter),
            Box::new(SlashdotAdapter),
            Box::new(DiscourseAdapter),
            Box::new(XenForoAdapter),
            Box::new(LegacyForumAdapter),
            Box::new(CompatibilityPackAdapter),
            Box::new(Top100BaselineAdapter),
            Box::new(ArxivAdapter),
            Box::new(InternetArchiveAdapter),
        ];
        Self::new(ordered)
    }

    /// Returns the match of the first adapter that recognises the page, if any.
    #[must_use]
    pub fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        for candidate in &self.adapters {
            if let Some(found) = candidate.detect(context) {
                return Some(found);
            }
        }
        None
    }

    /// Runs detection and, for the first adapter that matches, returns its transformed
    /// document. Returns `None` when no adapter recognises the page.
    #[must_use]
    pub fn transform(&self, context: &AdapterContext<'_>) -> Option<IndexDocument> {
        for candidate in &self.adapters {
            if let Some(found) = candidate.detect(context) {
                return Some(candidate.transform(context, &found));
            }
        }
        None
    }
}

impl Default for AdapterRegistry {
    /// Equivalent to [`AdapterRegistry::default_registry`].
    fn default() -> Self {
        Self::default_registry()
    }
}
/// Adapter for repository pages on `github.com` (id `github.repository`).
#[derive(Debug, Clone, Copy)]
pub struct GitHubRepositoryAdapter;
/// Adapter for individual issue pages on `github.com` (id `github.issue`).
#[derive(Debug, Clone, Copy)]
pub struct GitHubIssueAdapter;
/// Adapter for project and issue pages on `gitlab.com` (id `gitlab`).
#[derive(Debug, Clone, Copy)]
pub struct GitLabAdapter;
/// Adapter for hosts ending in `.sr.ht` (id `sourcehut`).
#[derive(Debug, Clone, Copy)]
pub struct SourceHutAdapter;
/// Adapter for `codeberg.org` and `gitea.com` repositories (id `forge`).
#[derive(Debug, Clone, Copy)]
pub struct ForgeAdapter;
/// Adapter for `docs.rs` pages (id `docs.rs`).
#[derive(Debug, Clone, Copy)]
pub struct DocsRsAdapter;
/// Adapter for hosts ending in `.readthedocs.io` (id `read-the-docs`).
#[derive(Debug, Clone, Copy)]
pub struct ReadTheDocsAdapter;
/// Adapter for `developer.mozilla.org` pages (id `mdn`).
#[derive(Debug, Clone, Copy)]
pub struct MdnAdapter;
/// Adapter for `crates.io` pages (id `crates.io`).
#[derive(Debug, Clone, Copy)]
pub struct CratesIoAdapter;
/// Adapter for Wikipedia hosts (id `wikipedia`).
#[derive(Debug, Clone, Copy)]
pub struct WikipediaAdapter;
/// Adapter for `news.ycombinator.com` and pages that look like Hacker News (id `hacker-news`).
#[derive(Debug, Clone, Copy)]
pub struct HackerNewsAdapter;
/// Adapter for Stack Overflow and the other hosts accepted by `stackexchange_host`
/// (id `stackoverflow`).
#[derive(Debug, Clone, Copy)]
pub struct StackOverflowAdapter;
/// Adapter for Discourse topic pages, i.e. paths starting with `t` (id `discourse`).
#[derive(Debug, Clone, Copy)]
pub struct DiscourseAdapter;
/// Adapter for Reddit pages whose path contains an `r` segment (id `reddit`).
#[derive(Debug, Clone, Copy)]
pub struct RedditAdapter;
/// Adapter for `slashdot.org` pages (id `slashdot`).
#[derive(Debug, Clone, Copy)]
pub struct SlashdotAdapter;
/// Adapter for a fixed list of forum hosts with `threads`/`posts`/`members` paths
/// (id `forum-xenforo`).
#[derive(Debug, Clone, Copy)]
pub struct XenForoAdapter;
/// Adapter for a fixed list of forum hosts with forum-style paths (id `forum-legacy`).
#[derive(Debug, Clone, Copy)]
pub struct LegacyForumAdapter;
/// Generic adapter driven by `detect_compatibility_pack` (ids `family-pack.<pack>`).
#[derive(Debug, Clone, Copy)]
pub struct CompatibilityPackAdapter;
/// Baseline adapter for domains recognised by `classify_top100_family` (id `top100.baseline`).
#[derive(Debug, Clone, Copy)]
pub struct Top100BaselineAdapter;
/// Adapter for `arxiv.org` abstract pages under `/abs/` (id `arxiv`).
#[derive(Debug, Clone, Copy)]
pub struct ArxivAdapter;
/// Adapter for `archive.org` item pages under `/details/` (id `internet-archive`).
#[derive(Debug, Clone, Copy)]
pub struct InternetArchiveAdapter;
/// Kind of forum page a document represents.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ForumIntent {
    FrontPage,
    ThreadPage,
    PaginatedThread,
    ReplyForm,
    ProfileNoise,
}

impl ForumIntent {
    /// Returns the kebab-case slug for this intent.
    fn as_str(self) -> &'static str {
        let slug = match self {
            ForumIntent::FrontPage => "front-page",
            ForumIntent::ThreadPage => "thread-page",
            ForumIntent::PaginatedThread => "paginated-thread",
            ForumIntent::ReplyForm => "reply-form",
            ForumIntent::ProfileNoise => "profile-noise",
        };
        slug
    }
}
/// Forum software / site family a forum adapter belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ForumFamily {
    HackerNews,
    StackExchange,
    Reddit,
    Slashdot,
    Discourse,
    XenForoLike,
    Legacy,
}

impl ForumFamily {
    /// Returns the kebab-case slug for this family.
    fn as_str(self) -> &'static str {
        let slug = match self {
            ForumFamily::HackerNews => "hacker-news",
            ForumFamily::StackExchange => "stackexchange",
            ForumFamily::Reddit => "reddit",
            ForumFamily::Slashdot => "slashdot",
            ForumFamily::Discourse => "discourse",
            ForumFamily::XenForoLike => "xenforo-like",
            ForumFamily::Legacy => "legacy-forum",
        };
        slug
    }

    /// Returns the display name for this family.
    fn label(self) -> &'static str {
        let display = match self {
            ForumFamily::HackerNews => "Hacker News",
            ForumFamily::StackExchange => "StackExchange",
            ForumFamily::Reddit => "Reddit",
            ForumFamily::Slashdot => "Slashdot",
            ForumFamily::Discourse => "Discourse",
            ForumFamily::XenForoLike => "Forum",
            ForumFamily::Legacy => "Legacy Forum",
        };
        display
    }
}
/// Generic page families handled by the compatibility-pack adapter.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CompatibilityPack {
    Forums,
    Qa,
    Docs,
    NewsMedia,
    Portal,
    AppShell,
    CommerceCards,
    MixedMedia,
}

impl CompatibilityPack {
    /// Returns the kebab-case slug for this pack.
    fn as_str(self) -> &'static str {
        let slug = match self {
            CompatibilityPack::Forums => "forums",
            CompatibilityPack::Qa => "qa",
            CompatibilityPack::Docs => "docs",
            CompatibilityPack::NewsMedia => "news-media",
            CompatibilityPack::Portal => "portal",
            CompatibilityPack::AppShell => "app-shell",
            CompatibilityPack::CommerceCards => "commerce-cards",
            CompatibilityPack::MixedMedia => "mixed-media",
        };
        slug
    }

    /// Returns the display name for this pack.
    fn label(self) -> &'static str {
        let display = match self {
            CompatibilityPack::Forums => "Forum Pack",
            CompatibilityPack::Qa => "Q&A Pack",
            CompatibilityPack::Docs => "Docs Pack",
            CompatibilityPack::NewsMedia => "News Pack",
            CompatibilityPack::Portal => "Portal Pack",
            CompatibilityPack::AppShell => "App Shell Pack",
            CompatibilityPack::CommerceCards => "Commerce Cards Pack",
            CompatibilityPack::MixedMedia => "Mixed Media Pack",
        };
        display
    }

    /// Returns the numeric priority of this pack, from 8 (`Forums`) down to 1 (`MixedMedia`).
    const fn priority(self) -> u8 {
        match self {
            CompatibilityPack::Forums => 8,
            CompatibilityPack::Qa => 7,
            CompatibilityPack::Docs => 6,
            CompatibilityPack::NewsMedia => 5,
            CompatibilityPack::Portal => 4,
            CompatibilityPack::AppShell => 3,
            CompatibilityPack::CommerceCards => 2,
            CompatibilityPack::MixedMedia => 1,
        }
    }
}
/// A compatibility pack proposed for a page, together with the evidence behind it.
#[derive(Debug, Clone, PartialEq, Eq)]
struct CompatibilityPackCandidate {
    /// The proposed pack.
    pack: CompatibilityPack,
    /// Confidence score for the proposal.
    confidence: u8,
    /// Human-readable descriptions of the signals that fired.
    signals: Vec<String>,
}

impl CompatibilityPackCandidate {
    /// Buckets the numeric confidence: 5 and above is `"high"`, 3–4 is `"medium"`,
    /// anything lower is `"low"`.
    fn confidence_label(&self) -> &'static str {
        let score = self.confidence;
        if score >= 5 {
            "high"
        } else if score >= 3 {
            "medium"
        } else {
            "low"
        }
    }
}
/// Broad site family assigned to a top-100 domain.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Top100Family {
    SearchPortal,
    KnowledgeReference,
    SocialCommunity,
    MediaStreaming,
    CommerceMarketplace,
    ServicesUtility,
    AiAssistant,
}

impl Top100Family {
    /// Returns the kebab-case slug for this family.
    fn as_str(self) -> &'static str {
        let slug = match self {
            Top100Family::SearchPortal => "search-portal",
            Top100Family::KnowledgeReference => "knowledge-reference",
            Top100Family::SocialCommunity => "social-community",
            Top100Family::MediaStreaming => "media-streaming",
            Top100Family::CommerceMarketplace => "commerce-marketplace",
            Top100Family::ServicesUtility => "services-utility",
            Top100Family::AiAssistant => "ai-assistant",
        };
        slug
    }

    /// Returns the display name for this family.
    fn label(self) -> &'static str {
        let display = match self {
            Top100Family::SearchPortal => "Search Portal",
            Top100Family::KnowledgeReference => "Knowledge Reference",
            Top100Family::SocialCommunity => "Social Community",
            Top100Family::MediaStreaming => "Media and Streaming",
            Top100Family::CommerceMarketplace => "Commerce and Marketplace",
            Top100Family::ServicesUtility => "Services and Utility",
            Top100Family::AiAssistant => "AI Assistant",
        };
        display
    }
}
/// Page intent inferred for a top-100 site page.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Top100Intent {
    PortalLanding,
    SearchResults,
    ArticleOrReference,
    AppShell,
    FeedOrThread,
    VideoHub,
    MarketplaceListing,
}

impl Top100Intent {
    /// Returns the kebab-case slug for this intent.
    fn as_str(self) -> &'static str {
        let slug = match self {
            Top100Intent::PortalLanding => "portal-landing",
            Top100Intent::SearchResults => "search-results",
            Top100Intent::ArticleOrReference => "article-or-reference",
            Top100Intent::AppShell => "app-shell",
            Top100Intent::FeedOrThread => "feed-or-thread",
            Top100Intent::VideoHub => "video-hub",
            Top100Intent::MarketplaceListing => "marketplace-listing",
        };
        slug
    }
}
/// Reason a page's content is gated; `None` means no gate was detected.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BlockedFlowClass {
    None,
    AuthWall,
    ScriptGate,
    BotGate,
    GeoGate,
    AgeGate,
    PolicyGate,
}

impl BlockedFlowClass {
    /// Returns the kebab-case slug for this class. Note that `PolicyGate` maps to
    /// `"policy-blocked"`, not `"policy-gate"`.
    fn as_str(self) -> &'static str {
        let slug = match self {
            BlockedFlowClass::None => "none",
            BlockedFlowClass::AuthWall => "auth-wall",
            BlockedFlowClass::ScriptGate => "script-gate",
            BlockedFlowClass::BotGate => "bot-gate",
            BlockedFlowClass::GeoGate => "geo-gate",
            BlockedFlowClass::AgeGate => "age-gate",
            BlockedFlowClass::PolicyGate => "policy-blocked",
        };
        slug
    }
}
/// Repository-level pages on `github.com`.
impl SiteAdapter for GitHubRepositoryAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("github.repository")
    }

    /// Matches `github.com` URLs whose path has at least two segments and no
    /// `issues` segment anywhere; the page type is `<first>/<second>`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        if url.host_str() != Some("github.com") {
            return None;
        }
        let parts = path_segments(&url);
        if parts.len() < 2 || parts.contains(&"issues") {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: format!("{}/{}", parts[0], parts[1]),
        })
    }

    /// Builds a task document headed by the repository name.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("GitHub repository: {}", matched.page_type);
        let tasks = vec![
            String::from("Browse code"),
            String::from("Open issues"),
            String::from("Review pull requests"),
            String::from("Read project documentation"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Individual issue pages on `github.com`.
impl SiteAdapter for GitHubIssueAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("github.issue")
    }

    /// Matches `github.com` URLs with at least four path segments where the third
    /// is `issues`; the page type is `<first>/<second> issue #<fourth>`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let parts = path_segments(&url);
        // The length check must come first so the indexing below cannot panic.
        let is_issue_page =
            url.host_str() == Some("github.com") && parts.len() >= 4 && parts[2] == "issues";
        if !is_issue_page {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: format!("{}/{} issue #{}", parts[0], parts[1], parts[3]),
        })
    }

    /// Builds a task document headed by the issue reference.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("GitHub issue: {}", matched.page_type);
        let tasks = vec![
            String::from("Read issue summary"),
            String::from("Inspect labels and status"),
            String::from("Review discussion links"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Project and issue pages on `gitlab.com`.
impl SiteAdapter for GitLabAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("gitlab")
    }

    /// Matches `gitlab.com` URLs with at least two path segments.
    ///
    /// When the path contains an `issues` segment directly preceded by `-`, the page
    /// type is `<first>/<second> issue #<n>`, with `unknown` standing in when nothing
    /// follows `issues`. Otherwise it is `<first>/<second>`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let parts = path_segments(&url);
        if url.host_str() != Some("gitlab.com") || parts.len() < 2 {
            return None;
        }
        // `saturating_sub` keeps index 0 from underflowing; it then compares the
        // segment with itself, which can never be both "issues" and "-".
        let issue_position = (0..parts.len()).find(|&position| {
            parts[position] == "issues" && parts.get(position.saturating_sub(1)) == Some(&"-")
        });
        let page_type = match issue_position {
            Some(position) => {
                let number = parts.get(position + 1).copied().unwrap_or("unknown");
                format!("{}/{} issue #{}", parts[0], parts[1], number)
            }
            None => format!("{}/{}", parts[0], parts[1]),
        };
        Some(AdapterMatch {
            id: self.id(),
            page_type,
        })
    }

    /// Builds a task document headed by the project or issue reference.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("GitLab: {}", matched.page_type);
        let tasks = vec![
            String::from("Browse project files"),
            String::from("Open issues and merge requests"),
            String::from("Read project documentation"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Pages served from `*.sr.ht` hosts.
impl SiteAdapter for SourceHutAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("sourcehut")
    }

    /// Matches any host ending in `.sr.ht`; the page type is the host followed by the
    /// first path segment, or `root` when the path is empty.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        if !host.ends_with(".sr.ht") {
            return None;
        }
        let parts = path_segments(&url);
        let leading = parts.first().copied().unwrap_or("root");
        Some(AdapterMatch {
            id: self.id(),
            page_type: format!("{host} {leading}"),
        })
    }

    /// Builds a task document headed by the SourceHut host and section.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("SourceHut: {}", matched.page_type);
        let tasks = vec![
            String::from("Open project summary"),
            String::from("Read mailing-list context"),
            String::from("Inspect tickets or patches"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Repository and issue pages on `codeberg.org` and `gitea.com`.
impl SiteAdapter for ForgeAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("forge")
    }

    /// Matches the two supported hosts when the path has at least two segments.
    /// Paths shaped `<a>/<b>/issues/<n>` yield an issue page type; everything else
    /// yields `<host>/<a>/<b>`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let parts = path_segments(&url);
        let supported_host = matches!(host, "codeberg.org" | "gitea.com");
        if !supported_host || parts.len() < 2 {
            return None;
        }
        let is_issue = parts.len() >= 4 && parts[2] == "issues";
        let page_type = if is_issue {
            format!("{host}/{}/{} issue #{}", parts[0], parts[1], parts[3])
        } else {
            format!("{host}/{}/{}", parts[0], parts[1])
        };
        Some(AdapterMatch {
            id: self.id(),
            page_type,
        })
    }

    /// Builds a task document headed by the forge reference.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("Forge: {}", matched.page_type);
        let tasks = vec![
            String::from("Browse repository"),
            String::from("Open issues and pull requests"),
            String::from("Read release notes"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Documentation pages on `docs.rs`.
impl SiteAdapter for DocsRsAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("docs.rs")
    }

    /// Matches the `docs.rs` host; the page type is the first path segment, or
    /// `crate documentation` when the path is empty.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        if url.host_str() != Some("docs.rs") {
            return None;
        }
        let page_type = match path_segments(&url).first() {
            Some(name) => (*name).to_owned(),
            None => "crate documentation".to_owned(),
        };
        Some(AdapterMatch {
            id: self.id(),
            page_type,
        })
    }

    /// Builds a task document headed by the crate name.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("docs.rs: {}", matched.page_type);
        let tasks = vec![
            String::from("Open crate modules"),
            String::from("Search items"),
            String::from("Read examples"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Documentation sites hosted under `*.readthedocs.io`.
impl SiteAdapter for ReadTheDocsAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("read-the-docs")
    }

    /// Matches any host ending in `.readthedocs.io`; the page type is the host itself.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        if host.ends_with(".readthedocs.io") {
            Some(AdapterMatch {
                id: self.id(),
                page_type: host.to_owned(),
            })
        } else {
            None
        }
    }

    /// Builds a task document headed by the documentation host.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("Read the Docs: {}", matched.page_type);
        let tasks = vec![
            String::from("Read current section"),
            String::from("Open table of contents"),
            String::from("Search documentation links"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Reference pages on `developer.mozilla.org`.
impl SiteAdapter for MdnAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("mdn")
    }

    /// Matches the MDN host; the page type is the page title.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        if url.host_str() != Some("developer.mozilla.org") {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a task document headed by the page title.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("MDN: {}", matched.page_type);
        let tasks = vec![
            String::from("Read API summary"),
            String::from("Inspect examples"),
            String::from("Open browser compatibility notes"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Crate pages on `crates.io`.
impl SiteAdapter for CratesIoAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("crates.io")
    }

    /// Matches the `crates.io` host; the page type is the second path segment, or
    /// `crate` when the path has fewer than two segments.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        if url.host_str() != Some("crates.io") {
            return None;
        }
        let page_type = match path_segments(&url).get(1) {
            Some(name) => (*name).to_owned(),
            None => "crate".to_owned(),
        };
        Some(AdapterMatch {
            id: self.id(),
            page_type,
        })
    }

    /// Builds a task document headed by the crate name.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("crates.io: {}", matched.page_type);
        let tasks = vec![
            String::from("Inspect crate metadata"),
            String::from("Open documentation"),
            String::from("Review repository and versions"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Article pages on Wikipedia.
impl SiteAdapter for WikipediaAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("wikipedia")
    }

    /// Matches `wikipedia.org` itself and any of its subdomains (for example
    /// `en.wikipedia.org`); the page type is the page title.
    ///
    /// The suffix check requires a label boundary, so unrelated hosts that merely
    /// end in the same characters (for example `notwikipedia.org`) are rejected.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let is_wikipedia = host == "wikipedia.org" || host.ends_with(".wikipedia.org");
        if !is_wikipedia {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a task document headed by the article title.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        task_document(
            context.page,
            &matched.id,
            format!("Wikipedia: {}", matched.page_type),
            vec![
                "Read article lead".to_owned(),
                "Open references".to_owned(),
                "Inspect related links".to_owned(),
            ],
        )
    }
}
/// Hacker News pages, recognised either by host or by page shape.
impl SiteAdapter for HackerNewsAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("hacker-news")
    }

    /// Matches when the page URL is on `news.ycombinator.com`, or failing that when
    /// `looks_like_hacker_news` accepts the page. The URL is passed to
    /// `hacker_news_page_type` only in the host-matched case.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let title = &context.page.title;
        let hn_url = page_url(context.page)
            .filter(|url| url.host_str() == Some("news.ycombinator.com"));
        let page_type = match hn_url {
            Some(url) => hacker_news_page_type(Some(&url), title),
            None if looks_like_hacker_news(context.page) => hacker_news_page_type(None, title),
            None => return None,
        };
        Some(AdapterMatch {
            id: self.id(),
            page_type,
        })
    }

    /// Delegates to `hacker_news_document`.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        hacker_news_document(context.page, matched)
    }
}
/// Question pages on Stack Overflow and the other hosts accepted by `stackexchange_host`.
impl SiteAdapter for StackOverflowAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("stackoverflow")
    }

    /// Matches any host accepted by `stackexchange_host`; the page type is the page title.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        if !stackexchange_host(host) {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a forum-thread document for the StackExchange family with no link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("StackExchange: {}", matched.page_type);
        let tasks = vec![
            String::from("Read question"),
            String::from("Review accepted answer"),
            String::from("Inspect comments and related answers"),
            String::from("Open outbound references"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::StackExchange,
            heading,
            tasks,
            None,
        )
    }
}
/// Topic pages on Discourse-powered sites.
impl SiteAdapter for DiscourseAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("discourse")
    }

    /// Matches when the host is `meta.discourse.org`, starts with `discuss.`, or
    /// contains `.discourse.`, and the first path segment is `t`. The page type is
    /// the page title.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let parts = path_segments(&url);
        let discourse_host = host == "meta.discourse.org"
            || host.starts_with("discuss.")
            || host.contains(".discourse.");
        let topic_path = parts.first() == Some(&"t");
        if !(discourse_host && topic_path) {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a forum-thread document for the Discourse family with no link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("Discourse thread: {}", matched.page_type);
        let tasks = vec![
            String::from("Read topic summary"),
            String::from("Inspect replies"),
            String::from("Open related discussion links"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::Discourse,
            heading,
            tasks,
            None,
        )
    }
}
/// Subreddit and thread pages on Reddit.
impl SiteAdapter for RedditAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("reddit")
    }

    /// Matches `reddit.com`, `www.reddit.com` and `old.reddit.com` when the path
    /// contains an `r` segment; the page type is the page title.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let reddit_host = matches!(host, "reddit.com" | "www.reddit.com" | "old.reddit.com");
        if !(reddit_host && path_segments(&url).contains(&"r")) {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Returns a blocked-forum document when `reddit_script_gated` flags the page;
    /// otherwise builds a Reddit forum-thread document using
    /// `is_reddit_actionable_link` as the link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        if reddit_script_gated(context.page) {
            blocked_forum_document(
                context.page,
                &matched.id,
                "Reddit requires additional script/cookie flow for this page shape",
            )
        } else {
            let heading = format!("Reddit thread: {}", matched.page_type);
            let tasks = vec![
                String::from("Read post summary"),
                String::from("Inspect nested comments"),
                String::from("Open outbound links"),
            ];
            forum_thread_document(
                context.page,
                &matched.id,
                ForumFamily::Reddit,
                heading,
                tasks,
                Some(is_reddit_actionable_link),
            )
        }
    }
}
/// Story pages on `slashdot.org`.
impl SiteAdapter for SlashdotAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("slashdot")
    }

    /// Matches the exact host `slashdot.org` when the path contains a `story`
    /// segment, starts with `index2.pl`, or the page title mentions "slashdot"
    /// (ASCII case-insensitive). The page type is the page title.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        if host != "slashdot.org" {
            return None;
        }
        let parts = path_segments(&url);
        let story_like = parts.contains(&"story")
            || parts.first() == Some(&"index2.pl")
            || context.page.title.to_ascii_lowercase().contains("slashdot");
        if !story_like {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a forum-thread document for the Slashdot family with no link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("Slashdot: {}", matched.page_type);
        let tasks = vec![
            String::from("Read story summary"),
            String::from("Inspect comment thread"),
            String::from("Open source links"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::Slashdot,
            heading,
            tasks,
            None,
        )
    }
}
/// Thread, post and member pages on a fixed list of forum hosts.
impl SiteAdapter for XenForoAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("forum-xenforo")
    }

    /// Matches one of the listed hosts when the path contains a `threads`, `posts`
    /// or `members` segment; the page type is the page title.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        const KNOWN_HOSTS: &[&str] = &[
            "resetera.com",
            "www.resetera.com",
            "www.neogaf.com",
            "neogaf.com",
            "forums.spacebattles.com",
            "forums.sufficientvelocity.com",
            "forums.overclock.net",
        ];
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let parts = path_segments(&url);
        if !KNOWN_HOSTS.contains(&host) {
            return None;
        }
        let forum_path =
            parts.contains(&"threads") || parts.contains(&"posts") || parts.contains(&"members");
        if !forum_path {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a forum-thread document for the XenForo-like family with no link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("Forum thread: {}", matched.page_type);
        let tasks = vec![
            String::from("Read thread posts"),
            String::from("Inspect quote/reply chain"),
            String::from("Open pagination links"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::XenForoLike,
            heading,
            tasks,
            None,
        )
    }
}
/// Thread pages on a fixed list of forum hosts.
impl SiteAdapter for LegacyForumAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("forum-legacy")
    }

    /// Matches one of the listed hosts when the URL looks forum-like: a `forum` or
    /// `forums` path segment, or `showthread` / `viewtopic` anywhere in the URL
    /// string. The page type is the page title.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        const KNOWN_HOSTS: &[&str] = &[
            "forum.xda-developers.com",
            "forums.tomshardware.com",
            "forums.anandtech.com",
            "forums.macrumors.com",
            "www.avsforum.com",
            "www.city-data.com",
            "www.skyscrapercity.com",
            "forums.digitalpoint.com",
            "www.webhostingtalk.com",
            "www.sitepoint.com",
            "www.namepros.com",
            "www.mumsnet.com",
            "www.thestudentroom.co.uk",
            "www.boards.ie",
            "forums.overclockers.co.uk",
            "www.badcaps.net",
            "forums.moneysavingexpert.com",
            "www.dslreports.com",
            "forum.bodybuilding.com",
            "forums.sherdog.com",
            "www.mtgsalvation.com",
            "www.alternatehistory.com",
            "forums.futura-sciences.com",
            "www.physicsforums.com",
            "forums.whirlpool.net.au",
            "forums.somethingawful.com",
            "www.gaiaonline.com",
            "arstechnica.com",
        ];
        let url = page_url(context.page)?;
        let host = url.host_str()?;
        let parts = path_segments(&url);
        if !KNOWN_HOSTS.contains(&host) {
            return None;
        }
        let raw_url = url.as_str();
        let forum_path = parts.contains(&"forum")
            || parts.contains(&"forums")
            || raw_url.contains("showthread")
            || raw_url.contains("viewtopic");
        if !forum_path {
            return None;
        }
        Some(AdapterMatch {
            id: self.id(),
            page_type: context.page.title.clone(),
        })
    }

    /// Builds a forum-thread document for the legacy family using
    /// `is_legacy_actionable_link` as the link filter.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("Legacy forum thread: {}", matched.page_type);
        let tasks = vec![
            String::from("Read thread body"),
            String::from("Inspect quotes/code blocks"),
            String::from("Open next/previous pages"),
        ];
        forum_thread_document(
            context.page,
            &matched.id,
            ForumFamily::Legacy,
            heading,
            tasks,
            Some(is_legacy_actionable_link),
        )
    }
}
impl SiteAdapter for CompatibilityPackAdapter {
fn id(&self) -> AdapterId {
AdapterId::new("family-pack")
}
fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
let url = page_url(context.page)?;
let candidate = detect_compatibility_pack(&url, context.page)?;
Some(AdapterMatch {
id: AdapterId::new(format!("family-pack.{}", candidate.pack.as_str())),
page_type: format!(
"{} (confidence={})",
candidate.pack.label(),
candidate.confidence
),
})
}
fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
let fallback = CompatibilityPackCandidate {
pack: if matched.id.as_str().ends_with(".forums") {
CompatibilityPack::Forums
} else if matched.id.as_str().ends_with(".qa") {
CompatibilityPack::Qa
} else if matched.id.as_str().ends_with(".docs") {
CompatibilityPack::Docs
} else if matched.id.as_str().ends_with(".news-media") {
CompatibilityPack::NewsMedia
} else if matched.id.as_str().ends_with(".app-shell") {
CompatibilityPack::AppShell
} else if matched.id.as_str().ends_with(".commerce-cards") {
CompatibilityPack::CommerceCards
} else if matched.id.as_str().ends_with(".mixed-media") {
CompatibilityPack::MixedMedia
} else {
CompatibilityPack::Portal
},
confidence: 3,
signals: vec!["adapter match persisted".to_owned()],
};
let candidate = page_url(context.page)
.and_then(|url| detect_compatibility_pack(&url, context.page))
.unwrap_or(fallback);
compatibility_pack_document(context.page, &matched.id, &candidate)
}
}
/// Baseline handling for domains recognised by `classify_top100_family`.
impl SiteAdapter for Top100BaselineAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("top100.baseline")
    }

    /// Matches when the normalised host belongs to a known top-100 family; the page
    /// type is `<family slug> <normalised domain>`.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let domain = normalize_top100_domain(url.host_str()?);
        classify_top100_family(&domain).map(|family| AdapterMatch {
            id: self.id(),
            page_type: format!("{} {}", family.as_str(), domain),
        })
    }

    /// Builds the baseline document, or a blocked document when the page is gated.
    ///
    /// Without a page URL the result is a blocked document for domain `unknown`
    /// classified as a script gate. An unclassifiable domain falls back to the
    /// `ServicesUtility` family.
    fn transform(&self, context: &AdapterContext<'_>, _matched: &AdapterMatch) -> IndexDocument {
        let url = match page_url(context.page) {
            Some(url) => url,
            None => {
                return blocked_top100_document(
                    context.page,
                    &self.id(),
                    "unknown",
                    Top100Family::ServicesUtility,
                    Top100Intent::AppShell,
                    BlockedFlowClass::ScriptGate,
                );
            }
        };
        let domain = normalize_top100_domain(url.host_str().unwrap_or_default());
        let family = classify_top100_family(&domain).unwrap_or(Top100Family::ServicesUtility);
        let intent = classify_top100_intent(&domain, &url, context.page);
        match classify_blocked_flow(context.page) {
            BlockedFlowClass::None => {
                top100_baseline_document(context.page, &self.id(), &domain, family, intent)
            }
            gate => blocked_top100_document(
                context.page,
                &self.id(),
                &domain,
                family,
                intent,
                gate,
            ),
        }
    }
}
/// Abstract pages on `arxiv.org`.
impl SiteAdapter for ArxivAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("arxiv")
    }

    /// Matches `arxiv.org` URLs whose first path segment is `abs`; the page type is
    /// the second segment, or `abstract` when it is missing.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let parts = path_segments(&url);
        let is_abstract = url.host_str() == Some("arxiv.org") && parts.first() == Some(&"abs");
        if !is_abstract {
            return None;
        }
        let paper = parts.get(1).copied().unwrap_or("abstract");
        Some(AdapterMatch {
            id: self.id(),
            page_type: paper.to_owned(),
        })
    }

    /// Builds a task document headed by the paper identifier.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("arXiv abstract: {}", matched.page_type);
        let tasks = vec![
            String::from("Read abstract"),
            String::from("Open PDF or source"),
            String::from("Inspect authors and categories"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Item pages on `archive.org`.
impl SiteAdapter for InternetArchiveAdapter {
    fn id(&self) -> AdapterId {
        AdapterId::new("internet-archive")
    }

    /// Matches `archive.org` URLs whose first path segment is `details`; the page
    /// type is the second segment, or `item` when it is missing.
    fn detect(&self, context: &AdapterContext<'_>) -> Option<AdapterMatch> {
        let url = page_url(context.page)?;
        let parts = path_segments(&url);
        let is_item = url.host_str() == Some("archive.org") && parts.first() == Some(&"details");
        if !is_item {
            return None;
        }
        let item = parts.get(1).copied().unwrap_or("item");
        Some(AdapterMatch {
            id: self.id(),
            page_type: item.to_owned(),
        })
    }

    /// Builds a task document headed by the item identifier.
    fn transform(&self, context: &AdapterContext<'_>, matched: &AdapterMatch) -> IndexDocument {
        let heading = format!("Internet Archive item: {}", matched.page_type);
        let tasks = vec![
            String::from("Read item metadata"),
            String::from("Open available files"),
            String::from("Inspect collection links"),
        ];
        task_document(context.page, &matched.id, heading, tasks)
    }
}
/// Returns `true` when `host` is one of the listed Stack Exchange network hosts or
/// any subdomain of `stackexchange.com`. The comparison is exact and case-sensitive.
fn stackexchange_host(host: &str) -> bool {
    const NETWORK_HOSTS: [&str; 6] = [
        "stackoverflow.com",
        "serverfault.com",
        "superuser.com",
        "askubuntu.com",
        "mathoverflow.net",
        "stackexchange.com",
    ];
    NETWORK_HOSTS.contains(&host) || host.ends_with(".stackexchange.com")
}
/// Canonicalises a host name for top-100 classification.
///
/// Trailing dots are removed, the host is ASCII-lowercased, one leading `www.` is
/// stripped, and a handful of mobile/alias hosts are folded onto their primary
/// domain. Any other host is returned as-is after those steps.
fn normalize_top100_domain(host: &str) -> String {
    let lowered = host.trim_end_matches('.').to_ascii_lowercase();
    let bare = lowered.strip_prefix("www.").unwrap_or(&lowered);
    let canonical = match bare {
        "old.reddit.com" | "m.reddit.com" => "reddit.com",
        // `www.twitter.com` can only appear here when the input carried a doubled
        // `www.www.` prefix, because one `www.` has already been stripped.
        "twitter.com" | "mobile.twitter.com" | "www.twitter.com" => "x.com",
        "m.facebook.com" | "touch.facebook.com" => "facebook.com",
        "m.instagram.com" => "instagram.com",
        "m.youtube.com" | "youtu.be" => "youtube.com",
        "m.wikipedia.org" | "en.wikipedia.org" | "es.wikipedia.org" | "fr.wikipedia.org" => {
            "wikipedia.org"
        }
        "m.bing.com" => "bing.com",
        "m.duckduckgo.com" => "duckduckgo.com",
        "m.amazon.com" => "amazon.com",
        unchanged => unchanged,
    };
    canonical.to_owned()
}
/// Looks up the top-100 family of `domain`.
///
/// The comparison is an exact string match against the listed domains, so callers
/// should pass an already-normalised host. Returns `None` for unlisted domains.
fn classify_top100_family(domain: &str) -> Option<Top100Family> {
    // Families are checked in table order; the first list containing the domain wins.
    const FAMILY_DOMAINS: &[(Top100Family, &[&str])] = &[
        (
            Top100Family::SearchPortal,
            &[
                "google.com",
                "bing.com",
                "duckduckgo.com",
                "brave.com",
                "yahoo.com",
                "yahoo.co.jp",
                "news.yahoo.co.jp",
                "yandex.ru",
                "ya.ru",
                "baidu.com",
                "naver.com",
                "mail.ru",
                "qq.com",
                "msn.com",
            ],
        ),
        (
            Top100Family::KnowledgeReference,
            &[
                "wikipedia.org",
                "fandom.com",
                "nytimes.com",
                "bbc.com",
                "bbc.co.uk",
                "cnn.com",
                "espn.com",
                "globo.com",
                "indiatimes.com",
                "theguardian.com",
                "weather.com",
                "dzen.ru",
            ],
        ),
        (
            Top100Family::SocialCommunity,
            &[
                "facebook.com",
                "instagram.com",
                "x.com",
                "reddit.com",
                "discord.com",
                "quora.com",
                "linkedin.com",
                "t.me",
                "telegram.org",
                "vk.com",
                "ok.ru",
            ],
        ),
        (
            Top100Family::MediaStreaming,
            &[
                "youtube.com",
                "music.youtube.com",
                "tiktok.com",
                "bilibili.com",
                "twitch.tv",
                "spotify.com",
                "imdb.com",
                "netflix.com",
                "disneyplus.com",
                "hbomax.com",
                "rutube.ru",
                "douyin.com",
            ],
        ),
        (
            Top100Family::CommerceMarketplace,
            &[
                "amazon.com",
                "amazon.in",
                "amazon.co.jp",
                "amazon.de",
                "amazon.co.uk",
                "ebay.com",
                "walmart.com",
                "etsy.com",
                "rakuten.co.jp",
                "aliexpress.com",
                "temu.com",
                "shein.com",
                "ozon.ru",
                "wildberries.ru",
                "booking.com",
                "zillow.com",
            ],
        ),
        (
            Top100Family::ServicesUtility,
            &[
                "microsoft.com",
                "office.com",
                "cloud.microsoft",
                "live.com",
                "paypal.com",
                "zoom.us",
                "canva.com",
                "usps.com",
                "apple.com",
                "adobe.com",
                "samsung.com",
                "indeed.com",
                "docomo.ne.jp",
                "instructure.com",
            ],
        ),
        (
            Top100Family::AiAssistant,
            &[
                "chatgpt.com",
                "gemini.google.com",
                "claude.ai",
                "deepseek.com",
                "chat.deepseek.com",
                "grok.com",
            ],
        ),
    ];

    FAMILY_DOMAINS
        .iter()
        .find(|(_, domains)| domains.contains(&domain))
        .map(|(family, _)| *family)
}
/// Infers the intent of a top-100 page from its domain, URL and title.
///
/// Rules are evaluated in a fixed priority order and the first hit wins: video,
/// marketplace, search, article/reference, feed/thread, app shell. Pages matching
/// nothing are treated as portal landings. Title checks are plain substring
/// matches on the ASCII-lowercased title.
fn classify_top100_intent(domain: &str, url: &Url, page: &ReadablePage) -> Top100Intent {
    const VIDEO_DOMAINS: &[&str] = &[
        "youtube.com",
        "music.youtube.com",
        "twitch.tv",
        "spotify.com",
        "bilibili.com",
    ];
    const VIDEO_SEGMENTS: &[&str] = &["watch", "video", "videos", "playlist"];
    const MARKETPLACE_DOMAINS: &[&str] = &[
        "amazon.com",
        "amazon.in",
        "amazon.co.jp",
        "amazon.de",
        "amazon.co.uk",
        "ebay.com",
        "walmart.com",
        "etsy.com",
        "rakuten.co.jp",
        "aliexpress.com",
        "temu.com",
        "shein.com",
        "ozon.ru",
        "wildberries.ru",
        "booking.com",
        "zillow.com",
    ];
    const MARKETPLACE_SEGMENTS: &[&str] =
        &["dp", "product", "products", "item", "listing", "list"];
    const SEARCH_SEGMENTS: &[&str] = &["search", "results"];
    const REFERENCE_DOMAINS: &[&str] = &[
        "wikipedia.org",
        "fandom.com",
        "nytimes.com",
        "bbc.com",
        "cnn.com",
    ];
    const REFERENCE_SEGMENTS: &[&str] = &["wiki", "article", "news"];
    const FEED_SEGMENTS: &[&str] = &["feed", "timeline", "thread", "comments"];

    let segments = path_segments(url);
    let lowered_title = page.title.to_ascii_lowercase();
    // True when any path segment equals one of the wanted names.
    let path_has =
        |wanted: &[&str]| segments.iter().any(|segment| wanted.contains(segment));
    // A non-blank `q`, `query` or `search` parameter counts as a search.
    let searched = url.query_pairs().any(|(key, value)| {
        matches!(key.as_ref(), "q" | "query" | "search") && !value.trim().is_empty()
    });

    if VIDEO_DOMAINS.contains(&domain) || path_has(VIDEO_SEGMENTS) {
        Top100Intent::VideoHub
    } else if MARKETPLACE_DOMAINS.contains(&domain) || path_has(MARKETPLACE_SEGMENTS) {
        Top100Intent::MarketplaceListing
    } else if searched || path_has(SEARCH_SEGMENTS) {
        Top100Intent::SearchResults
    } else if REFERENCE_DOMAINS.contains(&domain) || path_has(REFERENCE_SEGMENTS) {
        Top100Intent::ArticleOrReference
    } else if path_has(FEED_SEGMENTS)
        || lowered_title.contains("thread")
        || lowered_title.contains("comments")
    {
        Top100Intent::FeedOrThread
    } else if lowered_title.contains("dashboard")
        || lowered_title.contains("workspace")
        || lowered_title.contains("app")
        || lowered_title.contains("account")
    {
        Top100Intent::AppShell
    } else {
        Top100Intent::PortalLanding
    }
}
/// Classifies why a page's content appears to be gated, if at all.
///
/// The title and every paragraph are ASCII-lowercased and joined with newlines,
/// then scanned for marker phrases. Rule groups are checked in a fixed priority
/// order (bot, geo, age, policy, script, auth) and the first group with a hit
/// wins. Returns `BlockedFlowClass::None` when no marker is present.
///
/// Matching is plain substring search, so ordinary prose that happens to contain a
/// marker (for example "sign in") also triggers the corresponding class.
fn classify_blocked_flow(page: &ReadablePage) -> BlockedFlowClass {
    // Ordered by priority: earlier entries take precedence over later ones.
    const RULES: &[(BlockedFlowClass, &[&str])] = &[
        (
            BlockedFlowClass::BotGate,
            &[
                "captcha",
                "verify you are human",
                "robot check",
                "cloudflare",
                "anti-bot",
            ],
        ),
        (
            BlockedFlowClass::GeoGate,
            &[
                "not available in your region",
                "not available in your country",
                "geo-restricted",
                "geoblocked",
            ],
        ),
        (
            BlockedFlowClass::AgeGate,
            &[
                "age verification",
                "adults only",
                "18+",
                "confirm your age",
            ],
        ),
        (
            BlockedFlowClass::PolicyGate,
            &[
                "access denied",
                "forbidden",
                "blocked by policy",
                "violates our terms",
                "not permitted",
            ],
        ),
        (
            BlockedFlowClass::ScriptGate,
            &[
                "enable javascript",
                "requires javascript",
                "continue in app",
                "app is not available",
            ],
        ),
        (
            BlockedFlowClass::AuthWall,
            &[
                "log in",
                "sign in",
                "create account",
                "authentication required",
                "please log in",
            ],
        ),
    ];

    let mut haystack = page.title.to_ascii_lowercase();
    for paragraph in &page.paragraphs {
        haystack.push('\n');
        haystack.push_str(&paragraph.to_ascii_lowercase());
    }

    RULES
        .iter()
        .find(|(_, needles)| needles.iter().any(|needle| haystack.contains(*needle)))
        .map_or(BlockedFlowClass::None, |(class, _)| *class)
}
fn detect_compatibility_pack(url: &Url, page: &ReadablePage) -> Option<CompatibilityPackCandidate> {
const MIN_PACK_CONFIDENCE: u8 = 3;
let path = path_segments(url);
let title = page.title.to_ascii_lowercase();
let has_code = page
.nodes
.iter()
.any(|node| matches!(node, ReadableNode::CodeBlock { .. }));
let heading_count = page
.nodes
.iter()
.filter(|node| matches!(node, ReadableNode::Heading { .. }))
.count();
let list_count = page
.nodes
.iter()
.filter(|node| matches!(node, ReadableNode::List { .. }))
.count();
let table_count = page
.nodes
.iter()
.filter(|node| matches!(node, ReadableNode::Table { .. }))
.count();
let paragraph_count = page.paragraphs.len();
let link_count = page.links.len();
let mut candidates = Vec::new();
let mut forum_signals = Vec::new();
if path.iter().any(|segment| {
matches!(
*segment,
"forum" | "forums" | "thread" | "threads" | "topic" | "topics"
)
}) || title.contains("forum")
|| title.contains("thread")
{
forum_signals.push("forum-thread path/title signal".to_owned());
}
if link_count >= 5 {
forum_signals.push("forum link density >= 5".to_owned());
}
if !page.forms.is_empty() || list_count >= 1 {
forum_signals.push("forum list/form structure signal".to_owned());
}
if !forum_signals.is_empty() {
candidates.push(CompatibilityPackCandidate {
pack: CompatibilityPack::Forums,
confidence: forum_signals.len() as u8,
signals: forum_signals,
});
}
let mut qa_signals = Vec::new();
if path.iter().any(|segment| {
matches!(
*segment,
"question" | "questions" | "answer" | "answers" | "qa"
)
}) || title.contains("q&a")
|| title.contains("question")
{
qa_signals.push("qa path/title signal".to_owned());
}
if link_count >= 3 {
qa_signals.push("qa link density >= 3".to_owned());
}
if paragraph_count >= 1 || list_count >= 1 {
qa_signals.push("qa body/list structure signal".to_owned());
}
if !qa_signals.is_empty() {
candidates.push(CompatibilityPackCandidate {
pack: CompatibilityPack::Qa,
confidence: qa_signals.len() as u8,
signals: qa_signals,
});
}
let mut docs_signals = Vec::new();
if path
.iter()
.any(|segment| matches!(*segment, "docs" | "reference" | "api" | "guide" | "manual"))
|| url.host_str().is_some_and(|host| host.starts_with("docs."))
{
docs_signals.push("docs path/host signal".to_owned());
}
if has_code || heading_count >= 2 {
docs_signals.push("docs semantic structure signal".to_owned());
}
if link_count >= 3 {
docs_signals.push("docs link density >= 3".to_owned());
}
if !docs_signals.is_empty() {
candidates.push(CompatibilityPackCandidate {
pack: CompatibilityPack::Docs,
confidence: docs_signals.len() as u8,
signals: docs_signals,
});
}
let mut news_signals = Vec::new();
if path
.iter()
.any(|segment| matches!(*segment, "news" | "article" | "stories"))
|| title.contains("news")
|| title.contains("analysis")
{
news_signals.push("news path/title signal".to_owned());
}
if paragraph_count >= 4 {
news_signals.push("news paragraph depth >= 4".to_owned());
}
if link_count >= 3 {
news_signals.push("news link density >= 3".to_owned());
}
if !news_signals.is_empty() {
candidates.push(CompatibilityPackCandidate {
pack: CompatibilityPack::NewsMedia,
confidence: news_signals.len() as u8,
signals: news_signals,
});
}
let mut portal_signals = Vec::new();
if link_count >= 20 {
portal_signals.push("portal link density >= 20".to_owned());
}
if paragraph_count <= 3 {
portal_signals.push("portal compact body signal".to_owned());
}
if path.first().is_none_or(|segment| {
matches!(
*segment,
"" | "home" | "discover" | "explore" | "index" | "portal"
)
}) {
portal_signals.push("portal landing path signal".to_owned());
}
if !portal_signals.is_empty() {
candidates.push(CompatibilityPackCandidate {
pack: CompatibilityPack::Portal,
confidence: portal_signals.len() as u8,
signals: portal_signals,
});
}
let mut app_shell_signals = Vec::new();
if path.iter().any(|segment| {
matches!(
*segment,
"app" | "dashboard" | "workspace" | "account" | "settings"
)
}) {
app_shell_signals.push("app-shell path signal".to_owned());
}
if title.contains("dashboard")
|| title.contains("workspace")
|| title.contains("account")
|| title.contains("app")
{
app_shell_signals.push("app-shell title signal".to_owned());
}
if !page.forms.is_empty() || link_count >= 6 {
app_shell_signals.push("app-shell workflow surface signal".to_owned());
}
if !app_shell_signals.is_empty() {
candidates.push(CompatibilityPackCandidate {
pack: CompatibilityPack::AppShell,
confidence: app_shell_signals.len() as u8,
signals: app_shell_signals,
});
}
let mut commerce_signals = Vec::new();
if path.iter().any(|segment| {
matches!(
*segment,
"shop" | "store" | "products" | "product" | "category" | "deals" | "listing"
)
}) || title.contains("shop")
|| title.contains("deals")
{
commerce_signals.push("commerce path/title signal".to_owned());
}
if list_count >= 1 || table_count >= 1 {
commerce_signals.push("commerce list/table signal".to_owned());
}
if page.paragraphs.iter().any(|paragraph| {
paragraph.contains('$')
|| paragraph.contains('€')
|| paragraph.contains('£')
|| paragraph.contains('Â¥')
|| paragraph.contains('₹')
|| paragraph.to_ascii_lowercase().contains("price")
}) {
commerce_signals.push("commerce pricing signal".to_owned());
}
if link_count >= 6 {
commerce_signals.push("commerce link density >= 6".to_owned());
}
if !commerce_signals.is_empty() {
candidates.push(CompatibilityPackCandidate {
pack: CompatibilityPack::CommerceCards,
confidence: commerce_signals.len() as u8,
signals: commerce_signals,
});
}
let mut mixed_media_signals = Vec::new();
if path.iter().any(|segment| {
matches!(
*segment,
"watch" | "video" | "media" | "gallery" | "podcast" | "live"
)
}) || title.contains("video")
|| title.contains("podcast")
|| title.contains("gallery")
{
mixed_media_signals.push("mixed-media path/title signal".to_owned());
}
if paragraph_count >= 3 {
mixed_media_signals.push("mixed-media paragraph depth >= 3".to_owned());
}
if list_count >= 1 || link_count >= 8 {
mixed_media_signals.push("mixed-media list/link surface signal".to_owned());
}
if !mixed_media_signals.is_empty() {
candidates.push(CompatibilityPackCandidate {
pack: CompatibilityPack::MixedMedia,
confidence: mixed_media_signals.len() as u8,
signals: mixed_media_signals,
});
}
let best = candidates.into_iter().max_by_key(|candidate| {
(
candidate.confidence,
candidate.pack.priority(),
std::cmp::Reverse(candidate.pack.as_str()),
)
})?;
(best.confidence >= MIN_PACK_CONFIDENCE).then_some(best)
}
/// Builds the task-oriented document for a page matched by a compatibility pack.
///
/// Output order: a level-1 heading, a "Family pack" paragraph, a collapsed
/// "Pack diagnostics" section (confidence, fired signals, fallback note), the
/// pack's task list, summaries of up to 8 readable nodes, every form, and up to
/// 24 links. Page metadata is copied over and a quality score of 88 is recorded.
fn compatibility_pack_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    candidate: &CompatibilityPackCandidate,
) -> IndexDocument {
    let pack = candidate.pack;
    let heading = format!("Compatibility pack: {}", pack.label());
    let mut document = IndexDocument::titled(heading.clone());

    let source = &page.metadata;
    let target = &mut document.metadata;
    target.canonical_url = source.canonical_url.clone();
    target.language = source.language.clone();
    target.description = source.description.clone();
    target.open_graph_title = source.open_graph_title.clone();
    target.open_graph_description = source.open_graph_description.clone();
    target.adapter_id = Some(adapter_id.clone());
    target.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Adapter,
        88,
        [
            format!("matched adapter: {}", adapter_id.as_str()),
            format!("compatibility pack: {}", pack.as_str()),
            format!("pack confidence: {}", candidate.confidence),
        ],
    ));

    document.push(IndexNode::Heading {
        level: 1,
        text: heading,
    });
    document.push(IndexNode::Paragraph(format!(
        "Family pack: {}",
        pack.label()
    )));

    // Diagnostics list: confidence line, one line per signal, then the fallback note.
    let mut diagnostics = Vec::with_capacity(candidate.signals.len() + 2);
    diagnostics.push(format!(
        "confidence: {} ({})",
        candidate.confidence,
        candidate.confidence_label()
    ));
    for signal in &candidate.signals {
        diagnostics.push(format!("signal: {signal}"));
    }
    diagnostics.push("fallback: confidence below 3 returns generic transformer".to_owned());
    document.push(IndexNode::Section {
        role: SectionRole::Unknown,
        title: Some("Pack diagnostics".to_owned()),
        collapsed: true,
        nodes: vec![IndexNode::List {
            ordered: false,
            items: diagnostics,
        }],
    });

    document.push(IndexNode::List {
        ordered: false,
        items: compatibility_pack_tasks(pack),
    });

    let summaries = page
        .nodes
        .iter()
        .take(8)
        .filter_map(|node| summary_node(node));
    for summary in summaries {
        document.push(summary);
    }
    for form in page.forms.iter() {
        let converted = index_form_from_html(form);
        document.push(IndexNode::Form(converted));
    }
    for link in page.links.iter().take(24) {
        let entry = Link::new(&link.text, &link.href);
        document.push(IndexNode::Link(entry));
    }
    document
}
/// Returns the three suggested reader tasks for the given compatibility pack.
fn compatibility_pack_tasks(pack: CompatibilityPack) -> Vec<String> {
    let steps: [&str; 3] = match pack {
        CompatibilityPack::Forums => [
            "Read thread context",
            "Inspect pagination and replies",
            "Open outbound references",
        ],
        CompatibilityPack::Qa => [
            "Read question and accepted answers",
            "Inspect answer ranking",
            "Open cited references",
        ],
        CompatibilityPack::Docs => [
            "Read API/reference sections",
            "Inspect code examples",
            "Open navigation and related topics",
        ],
        CompatibilityPack::NewsMedia => [
            "Read article body",
            "Inspect related coverage links",
            "Open citations and sources",
        ],
        CompatibilityPack::Portal => [
            "Scan major entry points",
            "Open primary categories",
            "Inspect utility/search actions",
        ],
        CompatibilityPack::AppShell => [
            "Inspect workspace navigation",
            "Open primary actions and settings",
            "Review account or session requirements",
        ],
        CompatibilityPack::CommerceCards => [
            "Scan product cards and prices",
            "Open product detail links",
            "Inspect filters, sorting, and pagination",
        ],
        CompatibilityPack::MixedMedia => [
            "Read media summaries and metadata",
            "Open video/audio/gallery entries",
            "Inspect related links and next-step navigation",
        ],
    };
    steps.iter().map(|step| (*step).to_owned()).collect()
}
/// Returns the three suggested reader tasks for the given top-site family.
fn top100_family_tasks(family: Top100Family) -> Vec<String> {
    let steps: [&str; 3] = match family {
        Top100Family::SearchPortal => [
            "Scan ranked results",
            "Open result targets",
            "Inspect query refinements",
        ],
        Top100Family::KnowledgeReference => [
            "Read primary article content",
            "Open references and sources",
            "Inspect related links",
        ],
        Top100Family::SocialCommunity => [
            "Read post/thread context",
            "Inspect reply chains",
            "Open outbound links",
        ],
        Top100Family::MediaStreaming => [
            "Read media metadata",
            "Open channel or playlist links",
            "Inspect related media links",
        ],
        Top100Family::CommerceMarketplace => [
            "Read listing and product metadata",
            "Open product links",
            "Inspect filters and pagination",
        ],
        Top100Family::ServicesUtility => [
            "Read public utility or help content",
            "Open relevant workflow links",
            "Inspect authentication requirements",
        ],
        Top100Family::AiAssistant => [
            "Read public product/help content",
            "Open documentation or policy links",
            "Inspect account-gated workflow boundaries",
        ],
    };
    steps.iter().map(|step| (*step).to_owned()).collect()
}
fn top100_baseline_document(
page: &ReadablePage,
adapter_id: &AdapterId,
domain: &str,
family: Top100Family,
intent: Top100Intent,
) -> IndexDocument {
let title = format!("Top site baseline: {domain}");
let mut document = IndexDocument::titled(title.clone());
document.metadata.canonical_url = page.metadata.canonical_url.clone();
document.metadata.language = page.metadata.language.clone();
document.metadata.description = page.metadata.description.clone();
document.metadata.open_graph_title = page.metadata.open_graph_title.clone();
document.metadata.open_graph_description = page.metadata.open_graph_description.clone();
document.metadata.adapter_id = Some(adapter_id.clone());
document.metadata.quality = Some(DocumentQuality::new(
DocumentQualityCategory::Adapter,
90,
[
format!("matched adapter: {}", adapter_id.as_str()),
format!("domain: {domain}"),
format!("family: {}", family.as_str()),
format!("intent: {}", intent.as_str()),
],
));
document.push(IndexNode::Heading {
level: 1,
text: title,
});
document.push(IndexNode::Paragraph(format!(
"Family: {} | Intent: {}",
family.label(),
intent.as_str()
)));
document.push(IndexNode::List {
ordered: false,
items: top100_family_tasks(family),
});
match intent {
Top100Intent::SearchResults => {
let mut result_nodes = Vec::new();
for link in page.links.iter().take(20) {
if default_forum_link_filter(link) {
result_nodes.push(IndexNode::Link(Link::new(&link.text, &link.href)));
}
}
if !result_nodes.is_empty() {
document.push(IndexNode::Section {
role: SectionRole::Main,
title: Some("Results".to_owned()),
collapsed: false,
nodes: result_nodes,
});
}
}
Top100Intent::VideoHub => {
let metadata = top100_media_metadata(page);
if !metadata.is_empty() {
document.push(IndexNode::Section {
role: SectionRole::Main,
title: Some("Media Metadata".to_owned()),
collapsed: false,
nodes: vec![IndexNode::List {
ordered: false,
items: metadata,
}],
});
}
}
Top100Intent::MarketplaceListing => {
let listing_nodes = top100_listing_nodes(page);
if !listing_nodes.is_empty() {
document.push(IndexNode::Section {
role: SectionRole::Main,
title: Some("Listings".to_owned()),
collapsed: false,
nodes: listing_nodes,
});
}
}
Top100Intent::PortalLanding
| Top100Intent::ArticleOrReference
| Top100Intent::AppShell
| Top100Intent::FeedOrThread => {}
}
for node in page.nodes.iter().take(6) {
if let Some(summary) = summary_node(node) {
document.push(summary);
}
}
for form in &page.forms {
document.push(IndexNode::Form(index_form_from_html(form)));
}
for link in page.links.iter().take(16) {
document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
}
document
}
/// Collects up to 8 trimmed paragraphs that look like media metadata fields.
///
/// A paragraph qualifies when, after trimming and ASCII-lowercasing, it starts
/// with one of the known field prefixes; the trimmed original casing is kept.
fn top100_media_metadata(page: &ReadablePage) -> Vec<String> {
    const FIELD_PREFIXES: [&str; 5] = ["creator:", "author:", "duration:", "channel:", "views:"];

    let mut fields = Vec::new();
    for paragraph in &page.paragraphs {
        if fields.len() == 8 {
            break;
        }
        let line = paragraph.trim();
        let folded = line.to_ascii_lowercase();
        let is_field = FIELD_PREFIXES
            .iter()
            .any(|prefix| folded.starts_with(*prefix));
        if is_field {
            fields.push(line.to_owned());
        }
    }
    fields
}
/// Extracts listing-like content from the first 8 readable nodes.
///
/// Keeps non-empty lists, non-empty tables, and paragraphs whose lowercased
/// text mentions "results for" or "price"; everything else is dropped.
fn top100_listing_nodes(page: &ReadablePage) -> Vec<IndexNode> {
    page.nodes
        .iter()
        .take(8)
        .filter_map(|readable| match readable {
            ReadableNode::List { ordered, items } if !items.is_empty() => Some(IndexNode::List {
                ordered: *ordered,
                items: items.clone(),
            }),
            ReadableNode::Table { rows } if !rows.is_empty() => {
                Some(IndexNode::Table { rows: rows.clone() })
            }
            ReadableNode::Paragraph(text) => {
                let folded = text.to_ascii_lowercase();
                let relevant = folded.contains("results for") || folded.contains("price");
                relevant.then(|| IndexNode::Paragraph(text.clone()))
            }
            _ => None,
        })
        .collect()
}
/// Builds the failure-diagnostic document for a top-site page that sits behind a gate.
///
/// The diagnostic carries the gate class, suggested actions and commands, and a
/// warning record (`INDEX-TOP100-BLOCKED`) tagged with adapter, domain, family,
/// intent and gate class. The document is marked as a fallback with a quality
/// score of 25 and ends with a "Remediation" section of gate-specific guidance.
fn blocked_top100_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    domain: &str,
    family: Top100Family,
    intent: Top100Intent,
    blocked: BlockedFlowClass,
) -> IndexDocument {
    let reason = format!(
        "{domain} page is gated by {} and cannot be transformed safely as an interactive flow",
        blocked.as_str()
    );

    let record =
        DiagnosticRecord::new(DiagnosticSeverity::Warning, "INDEX-TOP100-BLOCKED", &reason)
            .with_field("adapter", adapter_id.as_str())
            .with_field("domain", domain)
            .with_field("family", family.as_str())
            .with_field("intent", intent.as_str())
            .with_field("blocked_flow_class", blocked.as_str());

    let diagnostic = FailureDiagnostic::new(
        format!("Top site blocked flow: {domain}"),
        DiagnosticSource::Adapter,
        DiagnosticConfidence::Low,
        reason.clone(),
    )
    .with_likely_cause(FailureCause::BlockedByPolicy)
    .with_fallback("read-only generic extraction")
    .with_tried("top100 baseline family/intent detection")
    .with_actions([
        DiagnosticAction::Retry,
        DiagnosticAction::Extract,
        DiagnosticAction::Capture,
        DiagnosticAction::AddFixture,
    ])
    .with_command(":extract links")
    .with_command(":capture save top100-blocked.capture")
    .with_command(format!(
        ":capture --preview --redact https://{domain}/ blocked-flow.html"
    ))
    .with_record(record);

    let mut document = diagnostic.into_document();

    let target = &mut document.metadata;
    target.canonical_url = page.metadata.canonical_url.clone();
    target.language = page.metadata.language.clone();
    target.adapter_id = Some(adapter_id.clone());
    target.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Fallback,
        25,
        [
            format!("matched adapter: {}", adapter_id.as_str()),
            format!("blocked-flow class: {}", blocked.as_str()),
            format!("family: {}", family.as_str()),
            format!("intent: {}", intent.as_str()),
        ],
    ));

    let guidance = IndexNode::List {
        ordered: false,
        items: blocked_flow_guidance(blocked),
    };
    document.push(IndexNode::Section {
        role: SectionRole::Unknown,
        title: Some("Remediation".to_owned()),
        collapsed: false,
        nodes: vec![guidance],
    });
    document
}
/// Returns remediation hints for a blocked flow: two generic hints followed by
/// one hint specific to the gate class.
fn blocked_flow_guidance(blocked: BlockedFlowClass) -> Vec<String> {
    let gate_specific = match blocked {
        BlockedFlowClass::AuthWall => "log in with a supported session scope before retry",
        BlockedFlowClass::ScriptGate => {
            "retry with headless snapshot fallback enabled for script-gated pages"
        }
        BlockedFlowClass::BotGate => "respect anti-bot policy and avoid automated bypass attempts",
        BlockedFlowClass::GeoGate => "verify legal region availability before requesting content",
        BlockedFlowClass::AgeGate => {
            "confirm age-gated policy allows read-only access before retry"
        }
        BlockedFlowClass::PolicyGate => {
            "page appears policy-blocked; treat as unsupported and preserve diagnostics"
        }
        BlockedFlowClass::None => "retry generic extraction",
    };
    vec![
        "collect a sanitized capture artifact for fixture review".to_owned(),
        "use :extract links for deterministic fallback output".to_owned(),
        gate_specific.to_owned(),
    ]
}
fn classify_forum_intent(url: Option<&Url>, page: &ReadablePage) -> ForumIntent {
if page.forms.iter().any(is_reply_or_login_form) {
return ForumIntent::ReplyForm;
}
let Some(url) = url else {
return ForumIntent::FrontPage;
};
let segments = path_segments(url);
if looks_like_profile_path(&segments, url.as_str()) {
return ForumIntent::ProfileNoise;
}
if looks_like_paginated_path(url, page) {
return ForumIntent::PaginatedThread;
}
if looks_like_thread_path(&segments, url.as_str()) {
return ForumIntent::ThreadPage;
}
ForumIntent::FrontPage
}
/// Reports whether the URL points at a user profile: a `user`, `users` or
/// `members` path segment, or a `/~` home-directory marker anywhere in the URL.
fn looks_like_profile_path(segments: &[&str], raw_url: &str) -> bool {
    let has_profile_segment = segments
        .iter()
        .any(|segment| matches!(*segment, "user" | "users" | "members"));
    has_profile_segment || raw_url.contains("/~")
}
/// Reports whether the URL points at a discussion thread.
///
/// Matches a known thread-like path segment anywhere, a leading `t` segment,
/// or the legacy `showthread` / `viewtopic` script names anywhere in the URL.
fn looks_like_thread_path(segments: &[&str], raw_url: &str) -> bool {
    let has_thread_segment = segments.iter().any(|segment| {
        matches!(
            *segment,
            "item" | "comments" | "question" | "questions" | "threads" | "thread" | "topic"
        )
    });
    if has_thread_segment {
        return true;
    }
    if segments.first().is_some_and(|head| *head == "t") {
        return true;
    }
    ["showthread", "viewtopic"]
        .iter()
        .any(|script| raw_url.contains(*script))
}
fn looks_like_paginated_path(url: &Url, page: &ReadablePage) -> bool {
if url
.query_pairs()
.any(|(k, v)| (k == "p" || k == "page" || k == "start") && !v.is_empty())
{
return true;
}
page.links.iter().any(is_forum_pagination_link)
}
/// Reports whether a form is a reply/comment box or a login form.
///
/// Checks, case-insensitively (ASCII), for marker substrings in any input
/// name, in the form action, and in the form name.
fn is_reply_or_login_form(form: &index_dom::HtmlForm) -> bool {
    const FIELD_MARKERS: [&str; 5] = ["reply", "comment", "message", "password", "username"];
    const ACTION_MARKERS: [&str; 3] = ["reply", "comment", "login"];
    const NAME_MARKERS: [&str; 2] = ["reply", "login"];

    let mentions = |haystack: &str, markers: &[&str]| {
        markers.iter().any(|marker| haystack.contains(*marker))
    };

    let has_marked_field = form
        .inputs
        .iter()
        .any(|input| mentions(&input.name.to_ascii_lowercase(), &FIELD_MARKERS));
    if has_marked_field {
        return true;
    }
    if mentions(&form.action.to_ascii_lowercase(), &ACTION_MARKERS) {
        return true;
    }
    mentions(&form.name.to_ascii_lowercase(), &NAME_MARKERS)
}
/// Builds the reading-oriented document for a forum-family page.
///
/// Links that pass `filter` (or `default_forum_link_filter` when `None`) are
/// split into breadcrumbs, pagination and outbound links. Output order: a
/// level-1 heading, a family/intent paragraph, the task list, a collapsed
/// "Breadcrumbs" section, the "Thread" section (readable nodes, or the first
/// three paragraphs when no node converts), a collapsed "Pagination" section,
/// a "Next steps" section (at most 6 links, unique by href), every form, up to
/// 20 outbound links, and a collapsed "Diagnostic" section for profile-like
/// pages. Empty sections are omitted. Page metadata is copied over and a
/// quality score of 94 is recorded.
fn forum_thread_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    family: ForumFamily,
    title: String,
    tasks: Vec<String>,
    filter: Option<fn(&index_dom::HtmlLink) -> bool>,
) -> IndexDocument {
    let canonical = page_url(page);
    let intent = classify_forum_intent(canonical.as_ref(), page);
    let mut document = IndexDocument::titled(title.clone());
    document.metadata.canonical_url = page.metadata.canonical_url.clone();
    document.metadata.language = page.metadata.language.clone();
    document.metadata.description = page.metadata.description.clone();
    document.metadata.open_graph_title = page.metadata.open_graph_title.clone();
    document.metadata.open_graph_description = page.metadata.open_graph_description.clone();
    document.metadata.adapter_id = Some(adapter_id.clone());
    document.metadata.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Adapter,
        94,
        [
            format!("matched adapter: {}", adapter_id.as_str()),
            format!("forum family: {}", family.as_str()),
            format!("forum intent: {}", intent.as_str()),
        ],
    ));
    document.push(IndexNode::Heading {
        level: 1,
        text: title,
    });
    document.push(IndexNode::Paragraph(format!(
        "Family: {} | Intent: {}",
        family.label(),
        intent.as_str()
    )));
    document.push(IndexNode::List {
        ordered: false,
        items: tasks,
    });

    // Partition the actionable links; breadcrumb classification takes
    // precedence over pagination.
    let link_filter = filter.unwrap_or(default_forum_link_filter);
    let mut breadcrumbs = Vec::new();
    let mut pagination = Vec::new();
    let mut outbound = Vec::new();
    for link in &page.links {
        if !link_filter(link) {
            continue;
        }
        if is_forum_breadcrumb_link(link) {
            breadcrumbs.push(link);
        } else if is_forum_pagination_link(link) {
            pagination.push(link);
        } else {
            outbound.push(link);
        }
    }

    if !breadcrumbs.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some("Breadcrumbs".to_owned()),
            collapsed: true,
            nodes: breadcrumbs
                .into_iter()
                .map(|link| IndexNode::Link(Link::new(&link.text, &link.href)))
                .collect(),
        });
    }

    let mut thread_nodes = page
        .nodes
        .iter()
        .filter_map(forum_node_from_readable)
        .collect::<Vec<_>>();
    if thread_nodes.is_empty() {
        // Nothing converted: fall back to the leading paragraphs.
        thread_nodes.extend(
            page.paragraphs
                .iter()
                .take(3)
                .map(|paragraph| IndexNode::Paragraph(paragraph.clone())),
        );
    }
    if !thread_nodes.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Comments,
            title: Some("Thread".to_owned()),
            collapsed: false,
            nodes: thread_nodes,
        });
    }

    if !pagination.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some("Pagination".to_owned()),
            collapsed: true,
            nodes: pagination
                .iter()
                .map(|link| IndexNode::Link(Link::new(&link.text, &link.href)))
                .collect(),
        });
    }

    let mut seen_step_hrefs = BTreeSet::new();
    let mut next_steps = Vec::new();
    for link in pagination.iter().chain(outbound.iter()) {
        if next_steps.len() >= 6 {
            break;
        }
        // Qualify first, then dedupe: an href is only marked as seen once a link
        // for it is actually emitted, so a non-qualifying link cannot hide a
        // later qualifying link that shares its href.
        let qualifies = is_forum_pagination_link(link) || looks_like_forum_next_step(link);
        if qualifies && seen_step_hrefs.insert(&link.href) {
            next_steps.push(IndexNode::Link(Link::new(&link.text, &link.href)));
        }
    }
    if !next_steps.is_empty() {
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: Some("Next steps".to_owned()),
            collapsed: false,
            nodes: next_steps,
        });
    }

    for form in &page.forms {
        document.push(IndexNode::Form(index_form_from_html(form)));
    }
    for link in outbound.into_iter().take(20) {
        document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
    }

    if matches!(intent, ForumIntent::ProfileNoise) {
        document.push(IndexNode::Section {
            role: SectionRole::Unknown,
            title: Some("Diagnostic".to_owned()),
            collapsed: true,
            nodes: vec![
                IndexNode::Error(
                    "profile-like page detected; discussion thread extraction may be partial"
                        .to_owned(),
                ),
                IndexNode::List {
                    ordered: false,
                    items: vec![
                        "use :extract links to inspect profile actions".to_owned(),
                        "capture a fixture if thread navigation is expected".to_owned(),
                    ],
                },
            ],
        });
    }
    document
}
/// Converts a readable node into its index-node counterpart for a forum thread.
///
/// Returns `None` for signature-like paragraphs, empty tables, and sections
/// whose children all convert to nothing; every other node maps one-to-one.
fn forum_node_from_readable(node: &ReadableNode) -> Option<IndexNode> {
    let converted = match node {
        ReadableNode::Paragraph(text) => {
            if looks_like_signature_line(text) {
                return None;
            }
            IndexNode::Paragraph(text.clone())
        }
        ReadableNode::Table { rows } => {
            if rows.is_empty() {
                return None;
            }
            IndexNode::Table { rows: rows.clone() }
        }
        ReadableNode::Section {
            role,
            title,
            collapsed,
            nodes,
        } => {
            // Convert children recursively and drop the section if none survive.
            let children: Vec<IndexNode> = nodes
                .iter()
                .filter_map(forum_node_from_readable)
                .collect();
            if children.is_empty() {
                return None;
            }
            IndexNode::Section {
                role: section_role(*role),
                title: title.clone(),
                collapsed: *collapsed,
                nodes: children,
            }
        }
        ReadableNode::Heading { level, text } => IndexNode::Heading {
            level: *level,
            text: text.clone(),
        },
        ReadableNode::Link(link) => IndexNode::Link(Link::new(&link.text, &link.href)),
        ReadableNode::List { ordered, items } => IndexNode::List {
            ordered: *ordered,
            items: items.clone(),
        },
        ReadableNode::CodeBlock { language, code } => IndexNode::CodeBlock {
            language: language.clone(),
            code: code.clone(),
        },
        ReadableNode::Spacer { lines } => IndexNode::Spacer { lines: *lines },
        ReadableNode::Image { alt, src } => IndexNode::Image {
            alt: alt.clone(),
            src: src.clone(),
        },
        ReadableNode::Form(form) => IndexNode::Form(index_form_from_html(form)),
    };
    Some(converted)
}
/// Reports whether a paragraph looks like a post signature or client footer.
///
/// The text is trimmed and ASCII-lowercased before its prefix is checked.
fn looks_like_signature_line(text: &str) -> bool {
    const SIGNATURE_PREFIXES: [&str; 4] = ["sent from my", "--", "signature:", "posted via"];

    let normalized = text.trim().to_ascii_lowercase();
    SIGNATURE_PREFIXES
        .iter()
        .any(|prefix| normalized.starts_with(*prefix))
}
/// Reports whether the link text (trimmed, ASCII case-insensitive) is one of
/// the generic breadcrumb labels.
fn is_forum_breadcrumb_link(link: &index_dom::HtmlLink) -> bool {
    const BREADCRUMB_LABELS: [&str; 6] = [
        "home",
        "forums",
        "forum",
        "boards",
        "topics",
        "discussions",
    ];

    let label = link.text.trim().to_ascii_lowercase();
    BREADCRUMB_LABELS.contains(&label.as_str())
}
/// Reports whether a link navigates between pages of a listing or thread.
///
/// A link qualifies when its trimmed, ASCII-lowercased text is a paging word
/// (`next`, `prev`, `previous`, `older`, `newer`, `more`) or starts with
/// `page `, or when its href contains a `/page-` path marker or a `page=`,
/// `start=` or `p=` parameter.
fn is_forum_pagination_link(link: &index_dom::HtmlLink) -> bool {
    // True when `key` occurs in `href` at a parameter boundary, i.e. it is not
    // the tail of a longer name. This keeps `?p=2` and `&amp;page=3` matching
    // while rejecting `?group=1`, `?step=2` or `?homepage=x`.
    fn has_parameter(href: &str, key: &str) -> bool {
        href.match_indices(key).any(|(start, _)| {
            href[..start].chars().next_back().is_none_or(|previous| {
                !(previous.is_ascii_alphanumeric() || matches!(previous, '_' | '-'))
            })
        })
    }

    let text = link.text.trim().to_ascii_lowercase();
    if matches!(
        text.as_str(),
        "next" | "prev" | "previous" | "older" | "newer" | "more"
    ) {
        return true;
    }
    if text.starts_with("page ") {
        return true;
    }
    link.href.contains("/page-")
        || ["page=", "start=", "p="]
            .iter()
            .any(|key| has_parameter(&link.href, key))
}
/// Default link filter for forum pages.
///
/// Rejects links with blank text, `javascript:` or `mailto:` targets, and the
/// per-post action labels `reply`, `quote`, `report` and `like`
/// (ASCII case-insensitive); everything else is kept.
fn default_forum_link_filter(link: &index_dom::HtmlLink) -> bool {
    const INERT_SCHEMES: [&str; 2] = ["javascript:", "mailto:"];
    const POST_ACTIONS: [&str; 4] = ["reply", "quote", "report", "like"];

    let label = link.text.trim();
    if label.is_empty() {
        return false;
    }
    let target = link.href.to_ascii_lowercase();
    if INERT_SCHEMES
        .iter()
        .any(|scheme| target.starts_with(*scheme))
    {
        return false;
    }
    let folded = label.to_ascii_lowercase();
    !POST_ACTIONS.contains(&folded.as_str())
}
/// Reports whether the link text (trimmed, ASCII-lowercased) contains a
/// keyword suggesting a useful follow-up destination.
fn looks_like_forum_next_step(link: &index_dom::HtmlLink) -> bool {
    const STEP_KEYWORDS: [&str; 6] = [
        "thread",
        "topic",
        "discussion",
        "result",
        "latest",
        "archive",
    ];

    let label = link.text.trim().to_ascii_lowercase();
    STEP_KEYWORDS
        .iter()
        .any(|keyword| label.contains(*keyword))
}
/// Link filter for Reddit: the default forum filter, additionally rejecting
/// the `give award`, `share`, `save` and `hide` post actions.
fn is_reddit_actionable_link(link: &index_dom::HtmlLink) -> bool {
    if !default_forum_link_filter(link) {
        return false;
    }
    let label = link.text.trim().to_ascii_lowercase();
    !matches!(label.as_str(), "give award" | "share" | "save" | "hide")
}
/// Link filter for legacy forums: the default forum filter, additionally
/// rejecting the `pm`, `warn`, `ignore` and `report post` member actions.
fn is_legacy_actionable_link(link: &index_dom::HtmlLink) -> bool {
    if !default_forum_link_filter(link) {
        return false;
    }
    let label = link.text.trim().to_ascii_lowercase();
    !matches!(label.as_str(), "pm" | "warn" | "ignore" | "report post")
}
/// Reports whether a Reddit page is an empty shell or an interstitial gate.
///
/// True when the title mentions "reddit" but the page has no readable nodes and
/// at most two links, or when any paragraph contains one of the gate phrases
/// (ASCII case-insensitive).
fn reddit_script_gated(page: &ReadablePage) -> bool {
    const GATE_MARKERS: [&str; 4] = [
        "continue in app",
        "enable javascript",
        "you've been blocked",
        "log in to reddit",
    ];

    let empty_shell = page.title.to_ascii_lowercase().contains("reddit")
        && page.nodes.is_empty()
        && page.links.len() <= 2;
    if empty_shell {
        return true;
    }
    for paragraph in &page.paragraphs {
        let folded = paragraph.to_ascii_lowercase();
        if GATE_MARKERS.iter().any(|marker| folded.contains(*marker)) {
            return true;
        }
    }
    false
}
/// Builds the failure-diagnostic document for a forum page that is blocked.
///
/// The diagnostic is titled with the page title, carries `reason`, suggested
/// actions and commands, and a warning record (`INDEX-FORUM-BLOCKED`) tagged
/// with the adapter id. The document is marked as a fallback with a quality
/// score of 30.
fn blocked_forum_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    reason: &str,
) -> IndexDocument {
    let record =
        DiagnosticRecord::new(DiagnosticSeverity::Warning, "INDEX-FORUM-BLOCKED", reason)
            .with_field("adapter", adapter_id.as_str());

    let diagnostic = FailureDiagnostic::new(
        page.title.clone(),
        DiagnosticSource::Adapter,
        DiagnosticConfidence::Low,
        reason,
    )
    .with_likely_cause(FailureCause::BlockedByPolicy)
    .with_fallback("generic read-only extraction")
    .with_tried("adapter forum-family detection")
    .with_actions([
        DiagnosticAction::Retry,
        DiagnosticAction::Capture,
        DiagnosticAction::AddFixture,
    ])
    .with_command(":extract links")
    .with_command(":capture save forum-blocked.capture")
    .with_record(record);

    let mut document = diagnostic.into_document();

    let target = &mut document.metadata;
    target.canonical_url = page.metadata.canonical_url.clone();
    target.language = page.metadata.language.clone();
    target.adapter_id = Some(adapter_id.clone());
    target.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Fallback,
        30,
        [
            format!("matched adapter: {}", adapter_id.as_str()),
            "blocked forum flow emitted deterministic diagnostic".to_owned(),
        ],
    ));
    document
}
/// Builds a compact task view: a level-1 heading, the task list, summaries of
/// up to 3 readable nodes, and up to 8 links.
///
/// Page metadata is copied over and a quality score of 95 is recorded.
fn task_document(
    page: &ReadablePage,
    adapter_id: &AdapterId,
    title: String,
    tasks: Vec<String>,
) -> IndexDocument {
    let mut document = IndexDocument::titled(title.clone());

    let source = &page.metadata;
    let target = &mut document.metadata;
    target.canonical_url = source.canonical_url.clone();
    target.language = source.language.clone();
    target.description = source.description.clone();
    target.open_graph_title = source.open_graph_title.clone();
    target.open_graph_description = source.open_graph_description.clone();
    target.adapter_id = Some(adapter_id.clone());
    target.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Adapter,
        95,
        [
            format!("matched adapter: {}", adapter_id.as_str()),
            "fixture-backed task view".to_owned(),
        ],
    ));

    document.push(IndexNode::Heading {
        level: 1,
        text: title,
    });
    document.push(IndexNode::List {
        ordered: false,
        items: tasks,
    });

    let summaries = page
        .nodes
        .iter()
        .take(3)
        .filter_map(|node| summary_node(node));
    for summary in summaries {
        document.push(summary);
    }
    for link in page.links.iter().take(8) {
        let entry = Link::new(&link.text, &link.href);
        document.push(IndexNode::Link(entry));
    }
    document
}
/// Builds the Hacker News document for the matched page type.
///
/// Output order: a level-1 heading, a family/intent paragraph, the task list,
/// the readable nodes converted without tables (skipping the first level-1
/// heading that repeats the page title), story links, a collapsed
/// "HN Navigation" section, a collapsed "HN Footer" section, and every form.
/// Empty link sections are omitted. Page metadata is copied over and a quality
/// score of 96 is recorded.
fn hacker_news_document(page: &ReadablePage, matched: &AdapterMatch) -> IndexDocument {
    let heading = format!("Hacker News: {}", matched.page_type);
    let intent = classify_forum_intent(page_url(page).as_ref(), page);
    let mut document = IndexDocument::titled(heading.clone());

    let source = &page.metadata;
    let target = &mut document.metadata;
    target.canonical_url = source.canonical_url.clone();
    target.language = source.language.clone();
    target.description = source.description.clone();
    target.open_graph_title = source.open_graph_title.clone();
    target.open_graph_description = source.open_graph_description.clone();
    target.adapter_id = Some(matched.id.clone());
    target.quality = Some(DocumentQuality::new(
        DocumentQualityCategory::Adapter,
        96,
        [
            "matched adapter: hacker-news".to_owned(),
            "preserved actionable links and forms".to_owned(),
            "suppressed raw table-layout dump".to_owned(),
        ],
    ));

    document.push(IndexNode::Heading {
        level: 1,
        text: heading,
    });
    document.push(IndexNode::Paragraph(format!(
        "Family: {} | Intent: {}",
        ForumFamily::HackerNews.label(),
        intent.as_str()
    )));
    document.push(IndexNode::List {
        ordered: false,
        items: hacker_news_tasks(&matched.page_type),
    });

    // Only the first heading that duplicates the page title is dropped.
    let mut title_heading_pending = true;
    for node in &page.nodes {
        let repeats_title = matches!(
            node,
            ReadableNode::Heading { level: 1, text } if text == &page.title
        );
        if title_heading_pending && repeats_title {
            title_heading_pending = false;
            continue;
        }
        if let Some(index_node) = index_node_from_readable_without_tables(node) {
            document.push(index_node);
        }
    }

    // Story links are emitted inline as they are found; navigation and footer
    // links are held back for their own sections.
    let mut navigation_links = Vec::new();
    let mut footer_links = Vec::new();
    for link in page.links.iter() {
        if !is_hacker_news_link_actionable(link) {
            continue;
        }
        if is_hacker_news_navigation_link(link) {
            navigation_links.push(link);
        } else if is_hacker_news_footer_link(link) {
            footer_links.push(link);
        } else {
            document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
        }
    }

    let grouped = [
        (navigation_links, "HN Navigation", SectionRole::Navigation),
        (footer_links, "HN Footer", SectionRole::Related),
    ];
    for (links, label, role) in grouped {
        if links.is_empty() {
            continue;
        }
        let nodes = links
            .into_iter()
            .map(|link| IndexNode::Link(Link::new(&link.text, &link.href)))
            .collect();
        document.push(IndexNode::Section {
            role,
            title: Some(label.to_owned()),
            collapsed: true,
            nodes,
        });
    }

    for form in page.forms.iter() {
        let converted = index_form_from_html(form);
        document.push(IndexNode::Form(converted));
    }
    document
}
/// Returns the suggested tasks for a Hacker News page type.
///
/// Page types containing "discussion" get the discussion tasks, those
/// containing "login" get the login tasks, and everything else gets the
/// listing tasks. Matching is ASCII case-insensitive.
fn hacker_news_tasks(page_type: &str) -> Vec<String> {
    let kind = page_type.to_ascii_lowercase();
    let steps: &[&str] = if kind.contains("discussion") {
        &[
            "Read story context",
            "Read comment threads",
            "Open author and parent links",
            "Use HN search form",
        ]
    } else if kind.contains("login") {
        &[
            "Open login workflow",
            "Use HN search form",
            "Jump to front-page discussions",
        ]
    } else {
        &[
            "Open top stories",
            "Open discussion threads",
            "Filter by source or author",
            "Use HN search form",
        ]
    };
    steps.iter().map(|step| (*step).to_owned()).collect()
}
/// Derives a short page-type label for a Hacker News page.
///
/// Without a URL the page title is returned unchanged. Otherwise the first
/// non-empty path segment selects the label; `item` and `user` pages append the
/// value of the first `id` query parameter when one is present. Unrecognised
/// segments fall back to the title.
fn hacker_news_page_type(url: Option<&Url>, title: &str) -> String {
    let Some(url) = url else {
        return title.to_owned();
    };
    // First `id` query parameter; shared by the `item` and `user` arms.
    let id_param = || {
        url.query_pairs()
            .find_map(|(key, value)| (key == "id").then(|| value.to_string()))
    };
    let first_segment = path_segments(url).first().copied().unwrap_or_default();
    let label = match first_segment {
        "" | "news" => "front-page",
        "newest" => "new",
        "front" => "past",
        "newcomments" => "comments",
        // These segments double as their own label.
        "ask" | "show" | "jobs" | "login" | "submit" => first_segment,
        "item" => {
            return match id_param() {
                Some(id) => format!("discussion #{id}"),
                None => "discussion".to_owned(),
            };
        }
        "user" => {
            return match id_param() {
                Some(id) => format!("user {id}"),
                None => "user".to_owned(),
            };
        }
        _ => title,
    };
    label.to_owned()
}
/// Heuristically decides whether `page` is a Hacker News page using only its
/// content (title, forms and links), without consulting a canonical URL.
///
/// Any one of these signals is sufficient, checked in order:
/// 1. the title contains "hacker news" (ASCII case-insensitive);
/// 2. some form action contains `news.ycombinator.com`;
/// 3. at least three link hrefs contain `news.ycombinator.com`;
/// 4. at least three links carry a known HN navigation label.
fn looks_like_hacker_news(page: &ReadablePage) -> bool {
    const HN_HOST: &str = "news.ycombinator.com";
    const MIN_LINK_SIGNALS: usize = 3;

    // Each signal is a closure so later ones are only evaluated when needed.
    let title_mentions_hn = || page.title.to_ascii_lowercase().contains("hacker news");
    let form_targets_hn = || page.forms.iter().any(|form| form.action.contains(HN_HOST));
    let enough_hn_links = || {
        let hits = page
            .links
            .iter()
            .filter(|link| link.href.contains(HN_HOST))
            .count();
        hits >= MIN_LINK_SIGNALS
    };
    let enough_nav_labels = || {
        let hits = page
            .links
            .iter()
            .filter(|link| {
                let label = link.text.to_ascii_lowercase();
                matches!(
                    label.as_str(),
                    "hacker news"
                        | "new"
                        | "past"
                        | "comments"
                        | "ask"
                        | "show"
                        | "jobs"
                        | "submit"
                        | "login"
                )
            })
            .count();
        hits >= MIN_LINK_SIGNALS
    };

    title_mentions_hn() || form_targets_hn() || enough_hn_links() || enough_nav_labels()
}
/// Returns `true` when `link` should be kept in the Hacker News task view.
///
/// A link is rejected when its trimmed text is empty, when its lower-cased href
/// starts with `mailto:` or `javascript:`, when the href is a hide/vote action,
/// or when its text is one of the per-item control labels
/// (`hide`, `favorite`, `parent`, `next`, `root`; ASCII case-insensitive).
fn is_hacker_news_link_actionable(link: &index_dom::HtmlLink) -> bool {
    const REJECTED_SCHEMES: [&str; 2] = ["mailto:", "javascript:"];
    const REJECTED_HREF_PARTS: [&str; 2] = ["news.ycombinator.com/hide?id=", "vote?id="];
    const REJECTED_LABELS: [&str; 5] = ["hide", "favorite", "parent", "next", "root"];

    let label = link.text.trim();
    if label.is_empty() {
        return false;
    }
    let target = link.href.to_ascii_lowercase();
    let has_rejected_scheme = REJECTED_SCHEMES
        .iter()
        .any(|scheme| target.starts_with(*scheme));
    let is_item_action = REJECTED_HREF_PARTS
        .iter()
        .any(|part| target.contains(*part));
    let is_control_label = REJECTED_LABELS
        .iter()
        .any(|control| label.eq_ignore_ascii_case(control));

    !(has_rejected_scheme || is_item_action || is_control_label)
}
/// Returns `true` when the link text is one of the Hacker News top-bar labels
/// (or the `more` pager label).
///
/// The text is trimmed before comparison so that whitespace-padded labels are
/// classified the same way the actionable-link filter sees them (that filter
/// also trims). The comparison is ASCII case-insensitive and does not allocate.
fn is_hacker_news_navigation_link(link: &index_dom::HtmlLink) -> bool {
    const NAVIGATION_LABELS: [&str; 10] = [
        "hacker news",
        "new",
        "past",
        "comments",
        "ask",
        "show",
        "jobs",
        "submit",
        "login",
        "more",
    ];
    let label = link.text.trim();
    NAVIGATION_LABELS
        .iter()
        .any(|known| label.eq_ignore_ascii_case(known))
}
/// Returns `true` when the link text is one of the Hacker News footer labels.
///
/// The text is trimmed before comparison so that whitespace-padded labels are
/// classified the same way the actionable-link filter sees them (that filter
/// also trims). The comparison is ASCII case-insensitive and does not allocate.
fn is_hacker_news_footer_link(link: &index_dom::HtmlLink) -> bool {
    const FOOTER_LABELS: [&str; 7] = [
        "guidelines",
        "faq",
        "lists",
        "api",
        "security",
        "legal",
        "apply to yc",
    ];
    let label = link.text.trim();
    FOOTER_LABELS
        .iter()
        .any(|known| label.eq_ignore_ascii_case(known))
}
/// Converts a readable node into its index-document counterpart, dropping
/// tables and forms.
///
/// Sections are converted recursively; a section whose children all get dropped
/// is itself dropped, so no empty section is ever produced.
fn index_node_from_readable_without_tables(node: &ReadableNode) -> Option<IndexNode> {
    let converted = match node {
        // Tables and forms have no counterpart in this view.
        ReadableNode::Table { .. } | ReadableNode::Form(_) => return None,
        ReadableNode::Heading { level, text } => IndexNode::Heading {
            level: *level,
            text: text.clone(),
        },
        ReadableNode::Paragraph(body) => IndexNode::Paragraph(body.clone()),
        ReadableNode::Link(link) => IndexNode::Link(Link::new(&link.text, &link.href)),
        ReadableNode::List { ordered, items } => IndexNode::List {
            ordered: *ordered,
            items: items.clone(),
        },
        ReadableNode::CodeBlock { language, code } => IndexNode::CodeBlock {
            language: language.clone(),
            code: code.clone(),
        },
        ReadableNode::Spacer { lines } => IndexNode::Spacer { lines: *lines },
        ReadableNode::Image { alt, src } => IndexNode::Image {
            alt: alt.clone(),
            src: src.clone(),
        },
        ReadableNode::Section {
            role,
            title,
            collapsed,
            nodes,
        } => {
            let children: Vec<_> = nodes
                .iter()
                .filter_map(index_node_from_readable_without_tables)
                .collect();
            if children.is_empty() {
                return None;
            }
            IndexNode::Section {
                role: section_role(*role),
                title: title.clone(),
                collapsed: *collapsed,
                nodes: children,
            }
        }
    };
    Some(converted)
}
/// Maps a readability section role onto the index-document role of the same name.
///
/// The match is exhaustive on purpose: adding a variant to
/// `ReadableSectionRole` must fail compilation here rather than fall through.
fn section_role(role: ReadableSectionRole) -> SectionRole {
    match role {
        // Primary content.
        ReadableSectionRole::Main => SectionRole::Main,
        ReadableSectionRole::Comments => SectionRole::Comments,
        // Page chrome and peripheral regions.
        ReadableSectionRole::Navigation => SectionRole::Navigation,
        ReadableSectionRole::Aside => SectionRole::Aside,
        ReadableSectionRole::Footer => SectionRole::Footer,
        ReadableSectionRole::Related => SectionRole::Related,
        // Unclassified.
        ReadableSectionRole::Unknown => SectionRole::Unknown,
    }
}
/// Copies an HTML form into the index-document `Form` model, field by field.
///
/// Every input (name, kind, value, required flag) and every button
/// (name, value, label) is cloned in its original order.
fn index_form_from_html(form: &index_dom::HtmlForm) -> Form {
    let inputs = form
        .inputs
        .iter()
        .map(|field| Input {
            name: field.name.clone(),
            kind: field.kind.clone(),
            value: field.value.clone(),
            required: field.required,
        })
        .collect();
    let buttons = form
        .buttons
        .iter()
        .map(|control| ButtonAction {
            name: control.name.clone(),
            value: control.value.clone(),
            label: control.label.clone(),
        })
        .collect();
    Form {
        name: form.name.clone(),
        method: form.method.clone(),
        action: form.action.clone(),
        inputs,
        buttons,
    }
}
/// Converts a readable node into a summary node, keeping only headings and
/// paragraphs; every other node kind yields `None`.
fn summary_node(node: &ReadableNode) -> Option<IndexNode> {
    let summary = match node {
        ReadableNode::Paragraph(body) => IndexNode::Paragraph(body.clone()),
        ReadableNode::Heading { level, text } => IndexNode::Heading {
            level: *level,
            text: text.clone(),
        },
        // Listed explicitly (no wildcard) so a new variant forces a decision here.
        ReadableNode::Link(_)
        | ReadableNode::CodeBlock { .. }
        | ReadableNode::List { .. }
        | ReadableNode::Table { .. }
        | ReadableNode::Spacer { .. }
        | ReadableNode::Section { .. }
        | ReadableNode::Image { .. }
        | ReadableNode::Form(_) => return None,
    };
    Some(summary)
}
/// Parses the page's canonical URL from its metadata.
///
/// Returns `None` when no canonical URL is recorded or when it fails to parse.
fn page_url(page: &ReadablePage) -> Option<Url> {
    let canonical = page.metadata.canonical_url.as_deref()?;
    Url::parse(canonical).ok()
}
/// Returns the non-empty path segments of `url`, in order.
///
/// URLs for which `Url::path_segments` yields nothing produce an empty vector;
/// empty segments (from doubled or trailing slashes) are filtered out.
fn path_segments(url: &Url) -> Vec<&str> {
    let Some(segments) = url.path_segments() else {
        return Vec::new();
    };
    segments.filter(|segment| !segment.is_empty()).collect()
}
#[cfg(test)]
mod tests {
use index_dom::HtmlLink;
use index_dom::{HtmlButton, HtmlForm, HtmlInput};
use index_readability::{ReadableMetadata, ReadableNode, ReadablePage};
use super::{AdapterContext, AdapterRegistry, page_url, path_segments};
/// Test helper: reports whether any code block in `nodes`, searching nested
/// sections recursively, contains `needle`.
fn contains_code_block(nodes: &[index_core::IndexNode], needle: &str) -> bool {
    for node in nodes {
        let found = match node {
            index_core::IndexNode::CodeBlock { code, .. } => code.contains(needle),
            index_core::IndexNode::Section {
                nodes: children, ..
            } => contains_code_block(children, needle),
            _ => false,
        };
        if found {
            return true;
        }
    }
    false
}
/// Test helper: reports whether any paragraph in `nodes`, searching nested
/// sections recursively, contains `needle`.
fn contains_paragraph_text(nodes: &[index_core::IndexNode], needle: &str) -> bool {
    for node in nodes {
        let found = match node {
            index_core::IndexNode::Paragraph(body) => body.contains(needle),
            index_core::IndexNode::Section {
                nodes: children, ..
            } => contains_paragraph_text(children, needle),
            _ => false,
        };
        if found {
            return true;
        }
    }
    false
}
fn contains_error_text(nodes: &[index_core::IndexNode], needle: &str) -> bool {
nodes.iter().any(|node| match node {
index_core::IndexNode::Error(text) => text.contains(needle),
index_core::IndexNode::Section { nodes, .. } => contains_error_text(nodes, needle),
_ => false,
})
}
/// Test fixture: a minimal readable page whose canonical URL and single
/// "Primary" link both point at `url`, with one summary paragraph and no forms.
fn page(url: &str, title: &str) -> ReadablePage {
    let summary = "Summary paragraph.";
    let primary_link = HtmlLink {
        text: "Primary".to_owned(),
        href: url.to_owned(),
    };
    let metadata = ReadableMetadata {
        canonical_url: Some(url.to_owned()),
        language: Some("en".to_owned()),
        description: Some("Description".to_owned()),
        open_graph_title: None,
        open_graph_description: None,
    };
    ReadablePage {
        title: title.to_owned(),
        paragraphs: vec![summary.to_owned()],
        nodes: vec![ReadableNode::Paragraph(summary.to_owned())],
        links: vec![primary_link],
        forms: Vec::new(),
        metadata,
    }
}
#[test]
fn registry_detects_initial_supported_sites() {
let registry = AdapterRegistry::default_registry();
for (url, expected) in [
("https://github.com/index-rs/index", "github.repository"),
(
"https://github.com/index-rs/index/issues/42",
"github.issue",
),
("https://gitlab.com/index-rs/index", "gitlab"),
("https://git.sr.ht/~index/index", "sourcehut"),
("https://codeberg.org/index/index", "forge"),
("https://docs.rs/scraper/latest/scraper/", "docs.rs"),
(
"https://index.readthedocs.io/en/latest/guide/",
"read-the-docs",
),
(
"https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector",
"mdn",
),
("https://crates.io/crates/scraper", "crates.io"),
(
"https://en.wikipedia.org/wiki/Rust_(programming_language)",
"wikipedia",
),
("https://news.ycombinator.com/item?id=1", "hacker-news"),
(
"https://stackoverflow.com/questions/1/example",
"stackoverflow",
),
(
"https://old.reddit.com/r/rust/comments/abc123/example_thread/",
"reddit",
),
(
"https://slashdot.org/story/26/01/01/1230201/example",
"slashdot",
),
("https://discuss.example.org/t/topic/42", "discourse"),
(
"https://resetera.com/threads/example-thread.42/",
"forum-xenforo",
),
(
"https://forums.tomshardware.com/forum/threads/example.42/",
"forum-legacy",
),
("https://arxiv.org/abs/2601.00001", "arxiv"),
(
"https://archive.org/details/community-manuals",
"internet-archive",
),
] {
let page = page(url, "Example");
let matched = registry.detect(&AdapterContext { page: &page });
assert_eq!(
matched.map(|matched| matched.id.to_string()),
Some(expected.to_owned())
);
}
}
/// The default registry must report no match for the `example.com` fixture page.
#[test]
fn registry_returns_none_for_unknown_site() {
let registry = AdapterRegistry::default_registry();
let page = page("https://example.com/article", "Example");
assert_eq!(registry.detect(&AdapterContext { page: &page }), None);
}
/// Transforming a GitHub repository page must tag the document with the
/// `github.repository` adapter id and place a task list at node index 1 that
/// includes the "Open issues" task.
#[test]
fn adapter_output_is_task_oriented_document_model() {
let registry = AdapterRegistry::default_registry();
let page = page("https://github.com/index-rs/index", "Index");
let document = registry.transform(&AdapterContext { page: &page });
// The adapter id recorded in the metadata identifies which adapter produced the document.
assert!(matches!(
document.as_ref().and_then(|document| document.metadata.adapter_id.as_ref()),
Some(id) if id.as_str() == "github.repository"
));
// The second node is expected to be the task list.
assert!(matches!(
document.as_ref().and_then(|document| document.nodes.get(1)),
Some(index_core::IndexNode::List { items, .. }) if items.iter().any(|item| item == "Open issues")
));
}
#[test]
fn each_supported_adapter_emits_expected_task_view() {
let registry = AdapterRegistry::default_registry();
for (url, title, expected_title, expected_task) in [
(
"https://github.com/index-rs/index/issues/42",
"Issue",
"GitHub issue:",
"Inspect labels and status",
),
(
"https://docs.rs/scraper/latest/scraper/",
"Docs",
"docs.rs:",
"Search items",
),
(
"https://gitlab.com/index-rs/index",
"GitLab",
"GitLab:",
"Open issues and merge requests",
),
(
"https://git.sr.ht/~index/index",
"SourceHut",
"SourceHut:",
"Read mailing-list context",
),
(
"https://codeberg.org/index/index",
"Forge",
"Forge:",
"Open issues and pull requests",
),
(
"https://index.readthedocs.io/en/latest/guide/",
"Read the Docs",
"Read the Docs:",
"Open table of contents",
),
(
"https://developer.mozilla.org/en-US/docs/Web/API/Document/querySelector",
"MDN",
"MDN:",
"Open browser compatibility notes",
),
(
"https://crates.io/crates/scraper",
"Crate",
"crates.io:",
"Open documentation",
),
(
"https://en.wikipedia.org/wiki/Rust_(programming_language)",
"Rust",
"Wikipedia:",
"Open references",
),
(
"https://news.ycombinator.com/item?id=1",
"Story",
"Hacker News:",
"Read comment threads",
),
(
"https://stackoverflow.com/questions/1/example",
"Question",
"StackExchange:",
"Review accepted answer",
),
(
"https://old.reddit.com/r/rust/comments/abc123/example_thread/",
"Reddit Thread",
"Reddit thread:",
"Inspect nested comments",
),
(
"https://slashdot.org/story/26/01/01/1230201/example",
"Slashdot Story",
"Slashdot:",
"Inspect comment thread",
),
(
"https://discuss.example.org/t/topic/42",
"Discourse",
"Discourse thread:",
"Inspect replies",
),
(
"https://resetera.com/threads/example-thread.42/",
"ResetEra",
"Forum thread:",
"Inspect quote/reply chain",
),
(
"https://forums.tomshardware.com/forum/threads/example.42/",
"Tom's Hardware",
"Legacy forum thread:",
"Inspect quotes/code blocks",
),
(
"https://arxiv.org/abs/2601.00001",
"arXiv",
"arXiv abstract:",
"Open PDF or source",
),
(
"https://archive.org/details/community-manuals",
"Archive",
"Internet Archive item:",
"Open available files",
),
] {
let page = page(url, title);
let document = registry.transform(&AdapterContext { page: &page });
assert!(matches!(
document.as_ref(),
Some(document) if document.title.starts_with(expected_title)
));
assert!(matches!(
document.as_ref(),
Some(document) if document.nodes.iter().any(|node| matches!(node, index_core::IndexNode::List { items, .. } if items.iter().any(|item| item == expected_task)))
));
}
}
#[test]
fn adapter_summary_keeps_headings_and_skips_non_summary_nodes() {
let registry = AdapterRegistry::default_registry();
let mut page = page("https://docs.rs/index/latest/index/", "Index docs");
page.nodes = vec![
ReadableNode::Heading {
level: 2,
text: "Module index".to_owned(),
},
ReadableNode::CodeBlock {
language: Some("rust".to_owned()),
code: "fn main() {}".to_owned(),
},
ReadableNode::Table {
rows: vec![vec!["Name".to_owned()]],
},
ReadableNode::Image {
alt: "Logo".to_owned(),
src: None,
},
];
let document = registry.transform(&AdapterContext { page: &page });
assert!(matches!(
document.as_ref().and_then(|document| document.nodes.get(2)),
Some(index_core::IndexNode::Heading { level: 2, text }) if text == "Module index"
));
assert!(!matches!(
document.as_ref().and_then(|document| document.nodes.get(3)),
Some(index_core::IndexNode::CodeBlock { .. })
));
}
/// Detection must cope with a missing canonical URL (no match, no parsed URL)
/// and with host-only URLs, for which docs.rs and crates.io report their
/// default page types.
#[test]
fn detection_handles_missing_or_sparse_urls() {
let registry = AdapterRegistry::default_registry();
// Case 1: canonical URL removed from the fixture.
let mut no_url = page("https://example.com/article", "No URL");
no_url.metadata.canonical_url = None;
assert_eq!(page_url(&no_url), None);
assert_eq!(registry.detect(&AdapterContext { page: &no_url }), None);
// Case 2: docs.rs root, no path segments.
let docs_root = page("https://docs.rs/", "Docs root");
let matched = registry.detect(&AdapterContext { page: &docs_root });
assert_eq!(
matched.map(|matched| matched.page_type),
Some("crate documentation".to_owned())
);
// Case 3: crates.io root, no path segments.
let crates_root = page("https://crates.io/", "Crates root");
let matched = registry.detect(&AdapterContext { page: &crates_root });
assert_eq!(
matched.map(|matched| matched.page_type),
Some("crate".to_owned())
);
}
/// Doubled, tripled and trailing slashes must not produce empty path segments.
#[test]
fn path_segments_ignores_empty_segments() -> Result<(), Box<dyn std::error::Error>> {
let url = url::Url::parse("https://example.com//a///b/")?;
assert_eq!(path_segments(&url), vec!["a", "b"]);
Ok(())
}
/// With the canonical URL removed, a page titled "Hacker News" must still be
/// detected as `hacker-news`.
#[test]
fn hacker_news_detects_without_canonical_url() {
let registry = AdapterRegistry::default_registry();
let mut page = page("https://news.ycombinator.com/news", "Hacker News");
// Drop the URL so detection has to rely on page content alone.
page.metadata.canonical_url = None;
page.links = vec![
HtmlLink {
text: "new".to_owned(),
href: "https://news.ycombinator.com/newest".to_owned(),
},
HtmlLink {
text: "Show HN: Example".to_owned(),
href: "https://example.org/show-hn".to_owned(),
},
];
let matched = registry.detect(&AdapterContext { page: &page });
assert_eq!(
matched.map(|matched| matched.id.as_str().to_owned()),
Some("hacker-news".to_owned())
);
}
#[test]
fn hacker_news_task_view_preserves_forms_and_actionable_links()
-> Result<(), Box<dyn std::error::Error>> {
let registry = AdapterRegistry::default_registry();
let mut page = page(
"https://news.ycombinator.com/item?id=42",
"Story | Hacker News",
);
page.nodes = vec![
ReadableNode::Table {
rows: vec![
vec!["1.".to_owned(), "Story".to_owned()],
vec!["12 points".to_owned(), "8 comments".to_owned()],
],
},
ReadableNode::Paragraph("A discussion page.".to_owned()),
];
page.links = vec![
HtmlLink {
text: "Story".to_owned(),
href: "https://example.org/story".to_owned(),
},
HtmlLink {
text: "8 comments".to_owned(),
href: "https://news.ycombinator.com/item?id=42".to_owned(),
},
HtmlLink {
text: "alice".to_owned(),
href: "https://news.ycombinator.com/user?id=alice".to_owned(),
},
HtmlLink {
text: "hide".to_owned(),
href: "https://news.ycombinator.com/hide?id=42&goto=item%3Fid%3D42".to_owned(),
},
HtmlLink {
text: "FAQ".to_owned(),
href: "https://news.ycombinator.com/newsfaq.html".to_owned(),
},
];
page.forms = vec![HtmlForm {
name: "search".to_owned(),
method: "GET".to_owned(),
action: "https://news.ycombinator.com/search".to_owned(),
inputs: vec![HtmlInput {
name: "q".to_owned(),
kind: "text".to_owned(),
value: None,
required: true,
}],
buttons: vec![HtmlButton {
name: Some("go".to_owned()),
value: Some("1".to_owned()),
label: "Search".to_owned(),
}],
}];
let document = registry
.transform(&AdapterContext { page: &page })
.ok_or("adapter should transform")?;
assert_eq!(
document.metadata.adapter_id.as_ref().map(|id| id.as_str()),
Some("hacker-news")
);
assert!(
!document
.nodes
.iter()
.any(|node| matches!(node, index_core::IndexNode::Table { .. }))
);
assert!(document.nodes.iter().any(
|node| matches!(node, index_core::IndexNode::Form(form) if form.name == "search" && form.action == "https://news.ycombinator.com/search")
));
assert!(document.nodes.iter().any(
|node| matches!(node, index_core::IndexNode::Link(link) if link.text == "Story" && link.href == "https://example.org/story")
));
assert!(
!document.nodes.iter().any(
|node| matches!(node, index_core::IndexNode::Link(link) if link.text == "hide")
)
);
Ok(())
}
#[test]
fn hacker_news_detects_login_form_without_canonical_or_title_hint() {
let registry = AdapterRegistry::default_registry();
let page = ReadablePage {
title: "Untitled".to_owned(),
paragraphs: Vec::new(),
nodes: Vec::new(),
links: vec![HtmlLink {
text: "Forgot your password?".to_owned(),
href: "https://news.ycombinator.com/forgot".to_owned(),
}],
forms: vec![HtmlForm {
name: "login".to_owned(),
method: "POST".to_owned(),
action: "https://news.ycombinator.com/login".to_owned(),
inputs: vec![HtmlInput {
name: "acct".to_owned(),
kind: "text".to_owned(),
value: None,
required: true,
}],
buttons: vec![HtmlButton {
name: None,
value: None,
label: "login".to_owned(),
}],
}],
metadata: ReadableMetadata::default(),
};
let matched = registry.detect(&AdapterContext { page: &page });
assert_eq!(
matched.map(|matched| matched.id.as_str().to_owned()),
Some("hacker-news".to_owned())
);
}
#[test]
fn forum_intent_classifier_handles_common_shapes() -> Result<(), Box<dyn std::error::Error>> {
let url = url::Url::parse("https://old.reddit.com/r/rust/comments/abc123/thread?page=2")?;
let reply_page = ReadablePage {
title: "Thread".to_owned(),
paragraphs: vec!["Body".to_owned()],
nodes: vec![ReadableNode::Paragraph("Body".to_owned())],
links: vec![HtmlLink {
text: "Next".to_owned(),
href: "https://old.reddit.com/r/rust/comments/abc123/thread?page=3".to_owned(),
}],
forms: vec![HtmlForm {
name: "reply".to_owned(),
method: "POST".to_owned(),
action: "https://old.reddit.com/comment".to_owned(),
inputs: vec![HtmlInput {
name: "comment".to_owned(),
kind: "text".to_owned(),
value: None,
required: true,
}],
buttons: Vec::new(),
}],
metadata: ReadableMetadata::default(),
};
assert_eq!(
super::classify_forum_intent(Some(&url), &reply_page),
super::ForumIntent::ReplyForm
);
let profile_url = url::Url::parse("https://news.ycombinator.com/user?id=alice")?;
assert_eq!(
super::classify_forum_intent(Some(&profile_url), &page("https://example.com", "x")),
super::ForumIntent::ProfileNoise
);
Ok(())
}
#[test]
fn reddit_script_gated_page_emits_deterministic_fallback_document()
-> Result<(), Box<dyn std::error::Error>> {
let registry = AdapterRegistry::default_registry();
let page = ReadablePage {
title: "Reddit".to_owned(),
paragraphs: vec!["Continue in app".to_owned()],
nodes: vec![ReadableNode::Paragraph("Continue in app".to_owned())],
links: vec![HtmlLink {
text: "continue".to_owned(),
href: "https://www.reddit.com/login/".to_owned(),
}],
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some(
"https://www.reddit.com/r/rust/comments/abc123/example/".to_owned(),
),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let document = registry
.transform(&AdapterContext { page: &page })
.ok_or("adapter transform missing")?;
assert_eq!(
document.metadata.adapter_id.as_ref().map(|id| id.as_str()),
Some("reddit")
);
assert!(contains_paragraph_text(
&document.nodes,
"additional script/cookie flow"
));
Ok(())
}
#[test]
fn legacy_forum_thread_filters_signature_lines() -> Result<(), Box<dyn std::error::Error>> {
let registry = AdapterRegistry::default_registry();
let page = ReadablePage {
title: "Legacy thread".to_owned(),
paragraphs: Vec::new(),
nodes: vec![
ReadableNode::Paragraph("Useful answer.".to_owned()),
ReadableNode::Paragraph("Sent from my phone".to_owned()),
ReadableNode::CodeBlock {
language: None,
code: "fn legacy() {}".to_owned(),
},
],
links: vec![HtmlLink {
text: "next".to_owned(),
href: "https://forums.tomshardware.com/forum/threads/example.42/?page=2".to_owned(),
}],
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some(
"https://forums.tomshardware.com/forum/threads/example.42/".to_owned(),
),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let document = registry
.transform(&AdapterContext { page: &page })
.ok_or("adapter transform missing")?;
assert!(contains_code_block(&document.nodes, "legacy"));
assert!(!contains_paragraph_text(&document.nodes, "Sent from my"));
assert!(document.nodes.iter().any(|node| matches!(
node,
index_core::IndexNode::Section {
title: Some(title),
nodes,
..
} if title == "Next steps"
&& nodes.iter().any(|child| matches!(
child,
index_core::IndexNode::Link(link) if link.href.contains("page=2")
))
)));
Ok(())
}
/// `normalize_top100_domain` must collapse common host aliases (`www.`, `old.`,
/// `m.`, language subdomains) and map `twitter.com` to `x.com`.
#[test]
fn top100_domain_normalization_handles_common_aliases() {
assert_eq!(
super::normalize_top100_domain("www.google.com"),
"google.com"
);
assert_eq!(
super::normalize_top100_domain("old.reddit.com"),
"reddit.com"
);
// Renamed service: the legacy domain maps onto the current one.
assert_eq!(super::normalize_top100_domain("twitter.com"), "x.com");
assert_eq!(
super::normalize_top100_domain("m.youtube.com"),
"youtube.com"
);
assert_eq!(
super::normalize_top100_domain("en.wikipedia.org"),
"wikipedia.org"
);
}
#[test]
fn blocked_flow_classifier_covers_required_classes() {
let mut page = page("https://x.com/home", "Sign in to continue");
page.paragraphs = vec!["Please log in to continue".to_owned()];
assert_eq!(
super::classify_blocked_flow(&page),
super::BlockedFlowClass::AuthWall
);
page.paragraphs = vec!["Enable JavaScript to continue in app".to_owned()];
assert_eq!(
super::classify_blocked_flow(&page),
super::BlockedFlowClass::ScriptGate
);
page.paragraphs = vec!["Captcha: verify you are human".to_owned()];
assert_eq!(
super::classify_blocked_flow(&page),
super::BlockedFlowClass::BotGate
);
page.paragraphs = vec!["This content is not available in your region".to_owned()];
assert_eq!(
super::classify_blocked_flow(&page),
super::BlockedFlowClass::GeoGate
);
page.paragraphs = vec!["Confirm your age to continue (18+)".to_owned()];
assert_eq!(
super::classify_blocked_flow(&page),
super::BlockedFlowClass::AgeGate
);
page.paragraphs = vec!["Access denied: blocked by policy".to_owned()];
assert_eq!(
super::classify_blocked_flow(&page),
super::BlockedFlowClass::PolicyGate
);
}
#[test]
fn blocked_top100_document_emits_remediation_and_capture_guidance() {
let page = page("https://office.com/home", "Access denied");
let document = super::blocked_top100_document(
&page,
&index_core::AdapterId::new("top100.baseline"),
"office.com",
super::Top100Family::ServicesUtility,
super::Top100Intent::AppShell,
super::BlockedFlowClass::PolicyGate,
);
let rendered = format!("{:?}", document.nodes);
assert!(rendered.contains("Remediation"));
assert!(rendered.contains("policy-blocked"));
assert!(rendered.contains(":capture save top100-blocked.capture"));
assert!(
rendered.contains(":capture --preview --redact https://office.com/ blocked-flow.html")
);
}
#[test]
fn top100_baseline_adapter_catches_supported_domains_without_dedicated_adapters()
-> Result<(), Box<dyn std::error::Error>> {
let registry = AdapterRegistry::default_registry();
let search_page = page(
"https://google.com/search?q=index+browser",
"Search results - Google",
);
let search_doc = registry
.transform(&AdapterContext { page: &search_page })
.ok_or("missing top100 adapter output")?;
assert_eq!(
search_doc
.metadata
.adapter_id
.as_ref()
.map(|id| id.as_str()),
Some("top100.baseline")
);
assert!(
search_doc
.title
.starts_with("Top site baseline: google.com")
);
assert!(contains_paragraph_text(
&search_doc.nodes,
"Family: Search Portal | Intent: search-results"
));
let search_portal_page = page("https://brave.com/", "Brave Search portal");
let search_portal_doc = registry
.transform(&AdapterContext {
page: &search_portal_page,
})
.ok_or("missing search portal output")?;
assert_eq!(
search_portal_doc
.metadata
.adapter_id
.as_ref()
.map(|id| id.as_str()),
Some("top100.baseline")
);
assert!(contains_paragraph_text(
&search_portal_doc.nodes,
"Family: Search Portal | Intent: portal-landing"
));
let dzen_page = page("https://dzen.ru/", "Dzen — Discover");
let dzen_doc = registry
.transform(&AdapterContext { page: &dzen_page })
.ok_or("missing dzen output")?;
assert_eq!(
dzen_doc.metadata.adapter_id.as_ref().map(|id| id.as_str()),
Some("top100.baseline")
);
assert!(contains_paragraph_text(
&dzen_doc.nodes,
"Family: Knowledge Reference | Intent: portal-landing"
));
let indiatimes_page = page("https://indiatimes.com/", "Indiatimes home");
let indiatimes_doc = registry
.transform(&AdapterContext {
page: &indiatimes_page,
})
.ok_or("missing indiatimes output")?;
assert_eq!(
indiatimes_doc
.metadata
.adapter_id
.as_ref()
.map(|id| id.as_str()),
Some("top100.baseline")
);
assert!(contains_paragraph_text(
&indiatimes_doc.nodes,
"Family: Knowledge Reference | Intent: portal-landing"
));
let marketplace_page = page("https://rakuten.co.jp/search?f=1", "Rakuten search listing");
let marketplace_doc = registry
.transform(&AdapterContext {
page: &marketplace_page,
})
.ok_or("missing marketplace output")?;
assert_eq!(
marketplace_doc
.metadata
.adapter_id
.as_ref()
.map(|id| id.as_str()),
Some("top100.baseline")
);
assert!(contains_paragraph_text(
&marketplace_doc.nodes,
"Family: Commerce and Marketplace | Intent: marketplace-listing"
));
let blocked_page = ReadablePage {
title: "Office".to_owned(),
paragraphs: vec!["Please sign in to continue".to_owned()],
nodes: vec![ReadableNode::Paragraph(
"Please sign in to continue".to_owned(),
)],
links: vec![HtmlLink {
text: "Sign in".to_owned(),
href: "https://office.com/login".to_owned(),
}],
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some("https://office.com/".to_owned()),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let blocked_doc = registry
.transform(&AdapterContext {
page: &blocked_page,
})
.ok_or("missing blocked output")?;
assert_eq!(
blocked_doc
.metadata
.adapter_id
.as_ref()
.map(|id| id.as_str()),
Some("top100.baseline")
);
assert!(contains_error_text(&blocked_doc.nodes, "auth-wall"));
Ok(())
}
#[test]
fn top100_helpers_extract_media_metadata_and_listing_nodes() {
let page = ReadablePage {
title: "Video".to_owned(),
paragraphs: vec![
"Creator: Index Channel".to_owned(),
"Duration: 12:34".to_owned(),
"Unrelated paragraph".to_owned(),
],
nodes: vec![
ReadableNode::Paragraph("Results for keyboard".to_owned()),
ReadableNode::List {
ordered: false,
items: vec![
"Mechanical keyboard".to_owned(),
"Compact keyboard".to_owned(),
],
},
],
links: Vec::new(),
forms: Vec::new(),
metadata: ReadableMetadata::default(),
};
let metadata = super::top100_media_metadata(&page);
assert_eq!(
metadata,
vec![
"Creator: Index Channel".to_owned(),
"Duration: 12:34".to_owned()
]
);
let listing_nodes = super::top100_listing_nodes(&page);
assert!(listing_nodes
.iter()
.any(|node| matches!(node, index_core::IndexNode::Paragraph(text) if text.contains("Results for"))));
assert!(listing_nodes.iter().any(
|node| matches!(node, index_core::IndexNode::List { items, .. } if items.len() == 2)
));
}
#[test]
fn compatibility_pack_detects_major_families_without_bespoke_adapters()
-> Result<(), Box<dyn std::error::Error>> {
let registry = AdapterRegistry::default_registry();
let forum = ReadablePage {
title: "Community thread".to_owned(),
paragraphs: vec!["Forum intro".to_owned()],
nodes: vec![ReadableNode::List {
ordered: false,
items: vec!["Reply".to_owned(), "Next".to_owned()],
}],
links: (0..6)
.map(|index| HtmlLink {
text: format!("Forum link {index}"),
href: format!("https://talk.example.org/forums/thread/42?page={index}"),
})
.collect(),
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some("https://talk.example.org/forums/thread/42".to_owned()),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let Some(forum_doc) = registry.transform(&AdapterContext { page: &forum }) else {
return Err(std::io::Error::other("forum pack output missing").into());
};
assert_eq!(
forum_doc.metadata.adapter_id.as_ref().map(|id| id.as_str()),
Some("family-pack.forums")
);
let qa = ReadablePage {
title: "How do I parse forms?".to_owned(),
paragraphs: vec!["Q body".to_owned()],
nodes: vec![ReadableNode::Paragraph("Answer body".to_owned())],
links: (0..4)
.map(|index| HtmlLink {
text: format!("QA link {index}"),
href: format!("https://answers.example.org/questions/42#answer-{index}"),
})
.collect(),
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some("https://answers.example.org/questions/42".to_owned()),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let Some(qa_doc) = registry.transform(&AdapterContext { page: &qa }) else {
return Err(std::io::Error::other("qa pack output missing").into());
};
assert_eq!(
qa_doc.metadata.adapter_id.as_ref().map(|id| id.as_str()),
Some("family-pack.qa")
);
let docs = ReadablePage {
title: "Parser API".to_owned(),
paragraphs: vec!["Reference intro".to_owned()],
nodes: vec![
ReadableNode::Heading {
level: 1,
text: "API".to_owned(),
},
ReadableNode::Heading {
level: 2,
text: "parse()".to_owned(),
},
ReadableNode::CodeBlock {
language: Some("rust".to_owned()),
code: "fn parse() {}".to_owned(),
},
],
links: vec![
HtmlLink {
text: "Reference".to_owned(),
href: "https://kb.example.org/docs/reference".to_owned(),
},
HtmlLink {
text: "Guide".to_owned(),
href: "https://kb.example.org/docs/guide".to_owned(),
},
HtmlLink {
text: "Tutorial".to_owned(),
href: "https://kb.example.org/docs/tutorial".to_owned(),
},
],
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some("https://kb.example.org/docs/api/parser".to_owned()),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let Some(docs_doc) = registry.transform(&AdapterContext { page: &docs }) else {
return Err(std::io::Error::other("docs pack output missing").into());
};
assert_eq!(
docs_doc.metadata.adapter_id.as_ref().map(|id| id.as_str()),
Some("family-pack.docs")
);
let news = ReadablePage {
title: "Tech news analysis".to_owned(),
paragraphs: vec![
"Lead paragraph".to_owned(),
"Context paragraph".to_owned(),
"Quote paragraph".to_owned(),
"Conclusion paragraph".to_owned(),
],
nodes: vec![ReadableNode::Paragraph("Lead paragraph".to_owned())],
links: (0..4)
.map(|index| HtmlLink {
text: format!("Source {index}"),
href: format!("https://media.example.org/news/story-{index}"),
})
.collect(),
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some("https://media.example.org/news/story-1".to_owned()),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let Some(news_doc) = registry.transform(&AdapterContext { page: &news }) else {
return Err(std::io::Error::other("news pack output missing").into());
};
assert_eq!(
news_doc.metadata.adapter_id.as_ref().map(|id| id.as_str()),
Some("family-pack.news-media")
);
let portal = ReadablePage {
title: "Explore".to_owned(),
paragraphs: vec!["Quick index".to_owned()],
nodes: vec![ReadableNode::Paragraph("Quick index".to_owned())],
links: (0..24)
.map(|index| HtmlLink {
text: format!("Portal {index}"),
href: format!("https://portal.example.org/entry/{index}"),
})
.collect(),
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some("https://portal.example.org/".to_owned()),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let Some(portal_doc) = registry.transform(&AdapterContext { page: &portal }) else {
return Err(std::io::Error::other("portal pack output missing").into());
};
assert_eq!(
portal_doc
.metadata
.adapter_id
.as_ref()
.map(|id| id.as_str()),
Some("family-pack.portal")
);
let app_shell = ReadablePage {
title: "Workspace dashboard".to_owned(),
paragraphs: vec!["Workspace summary".to_owned()],
nodes: vec![ReadableNode::Paragraph("Workspace summary".to_owned())],
links: (0..8)
.map(|index| HtmlLink {
text: format!("Action {index}"),
href: format!("https://app.example.org/app/dashboard/action/{index}"),
})
.collect(),
forms: vec![HtmlForm {
name: "quick-action".to_owned(),
method: "POST".to_owned(),
action: "https://app.example.org/app/dashboard/action".to_owned(),
inputs: vec![HtmlInput {
name: "command".to_owned(),
kind: "text".to_owned(),
value: None,
required: false,
}],
buttons: vec![HtmlButton {
name: Some("run".to_owned()),
label: "Run".to_owned(),
value: Some("run".to_owned()),
}],
}],
metadata: ReadableMetadata {
canonical_url: Some("https://app.example.org/app/dashboard".to_owned()),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let Some(app_shell_doc) = registry.transform(&AdapterContext { page: &app_shell }) else {
return Err(std::io::Error::other("app-shell pack output missing").into());
};
assert_eq!(
app_shell_doc
.metadata
.adapter_id
.as_ref()
.map(|id| id.as_str()),
Some("family-pack.app-shell")
);
let commerce = ReadablePage {
title: "Shop deals".to_owned(),
paragraphs: vec![
"Price: $19.99".to_owned(),
"Price: $49.99".to_owned(),
"Public catalog".to_owned(),
],
nodes: vec![ReadableNode::List {
ordered: false,
items: vec!["Keyboard".to_owned(), "Mouse".to_owned()],
}],
links: (0..8)
.map(|index| HtmlLink {
text: format!("Product {index}"),
href: format!("https://shop.example.org/store/products/{index}"),
})
.collect(),
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some("https://shop.example.org/store/deals".to_owned()),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let Some(commerce_doc) = registry.transform(&AdapterContext { page: &commerce }) else {
return Err(std::io::Error::other("commerce pack output missing").into());
};
assert_eq!(
commerce_doc
.metadata
.adapter_id
.as_ref()
.map(|id| id.as_str()),
Some("family-pack.commerce-cards")
);
let mixed_media = ReadablePage {
title: "Video gallery highlights".to_owned(),
paragraphs: vec![
"Roundup introduction".to_owned(),
"Episode summary".to_owned(),
"Playlist notes".to_owned(),
],
nodes: vec![ReadableNode::List {
ordered: false,
items: vec!["Episode 1".to_owned(), "Episode 2".to_owned()],
}],
links: (0..9)
.map(|index| HtmlLink {
text: format!("Media {index}"),
href: format!("https://media.example.org/watch/highlights/{index}"),
})
.collect(),
forms: Vec::new(),
metadata: ReadableMetadata {
canonical_url: Some("https://media.example.org/watch/highlights".to_owned()),
language: Some("en".to_owned()),
description: None,
open_graph_title: None,
open_graph_description: None,
},
};
let Some(mixed_media_doc) = registry.transform(&AdapterContext { page: &mixed_media })
else {
return Err(std::io::Error::other("mixed media pack output missing").into());
};
assert_eq!(
mixed_media_doc
.metadata
.adapter_id
.as_ref()
.map(|id| id.as_str()),
Some("family-pack.mixed-media")
);
let mixed_rendered = format!("{:?}", mixed_media_doc.nodes);
assert!(
mixed_rendered.contains("fallback: confidence below 3 returns generic transformer")
);
Ok(())
}
/// The default registry must not claim a minimal generic article page.
///
/// The fixture comes from the `page` test helper (defined elsewhere in this
/// module; presumably it builds a `ReadablePage` from a URL and a short text —
/// confirm against the helper). Detection is expected to yield `None`.
#[test]
fn compatibility_pack_skips_weak_generic_pages() {
    let weak_page = page("https://example.com/article", "Short note");
    let context = AdapterContext { page: &weak_page };
    let outcome = AdapterRegistry::default_registry().detect(&context);
    // No adapter in the default registry should match this page.
    assert_eq!(outcome, None);
}
/// A sparse "Dashboard" page under `/app` must not be detected by any adapter.
///
/// The fixture has a single paragraph, a single link, and no forms. The
/// expectation is that `detect` on the default registry returns `None`.
/// NOTE(review): presumably this is the low-signal counterpart of the
/// app-shell fixture used by the compatibility-pack tests — confirm.
#[test]
fn compatibility_pack_weak_signals_fall_back_to_generic() {
    // Build the pieces of the fixture separately, then assemble the page.
    let home_link = HtmlLink {
        text: String::from("Home"),
        href: String::from("https://app.example.org/app"),
    };
    let metadata = ReadableMetadata {
        canonical_url: Some(String::from("https://app.example.org/app")),
        language: Some(String::from("en")),
        description: None,
        open_graph_title: None,
        open_graph_description: None,
    };
    let sparse_page = ReadablePage {
        title: String::from("Dashboard"),
        paragraphs: vec![String::from("Short note")],
        nodes: vec![ReadableNode::Paragraph(String::from("Short note"))],
        links: vec![home_link],
        forms: vec![],
        metadata,
    };

    let context = AdapterContext { page: &sparse_page };
    let outcome = AdapterRegistry::default_registry().detect(&context);
    // With so little content, no adapter is expected to match.
    assert_eq!(outcome, None);
}
}