// index-transformer 1.0.0

//! Orthogonal transform instructions.

use std::collections::{BTreeSet, btree_map::Entry};

use index_core::{ButtonAction, Form, IndexDocument, IndexNode, Input, Link, SectionRole};
use index_readability::{ReadableNode, ReadablePage, ReadableSectionRole};
use url::Url;

const MAX_EMITTED_LINKS: usize = 300;

/// A small, composable transformation instruction.
///
/// An instruction reads from a [`ReadablePage`] and writes its contribution
/// into an [`IndexDocument`].
pub trait Instruction {
    /// Executes the instruction against a readable page and mutable document.
    ///
    /// `page` is borrowed immutably and is only read; all output is written
    /// into `document`.
    fn execute(&self, page: &ReadablePage, document: &mut IndexDocument);
}

/// Copies the readable page's metadata onto the document.
#[derive(Debug, Clone, Copy)]
pub struct ApplyMetadata;

impl Instruction for ApplyMetadata {
    /// Overwrites the document's canonical URL, language, description and
    /// Open Graph title/description with clones of the page's values.
    fn execute(&self, page: &ReadablePage, document: &mut IndexDocument) {
        let source = &page.metadata;
        let target = &mut document.metadata;

        // Page-level descriptors.
        target.canonical_url = source.canonical_url.clone();
        target.language = source.language.clone();
        target.description = source.description.clone();

        // Open Graph descriptors.
        target.open_graph_title = source.open_graph_title.clone();
        target.open_graph_description = source.open_graph_description.clone();
    }
}

/// Appends the page title to the document as a level-one heading.
#[derive(Debug, Clone, Copy)]
pub struct EmitTitle;

impl Instruction for EmitTitle {
    /// Pushes a single `Heading { level: 1, .. }` node carrying a copy of
    /// `page.title`.
    fn execute(&self, page: &ReadablePage, document: &mut IndexDocument) {
        let text = page.title.clone();
        let heading = IndexNode::Heading { level: 1, text };
        document.push(heading);
    }
}

/// Emits readable semantic content nodes.
#[derive(Debug, Clone, Copy)]
pub struct EmitReadableNodes;

impl Instruction for EmitReadableNodes {
    fn execute(&self, page: &ReadablePage, document: &mut IndexDocument) {
        let mut skipped_title_heading = false;

        for node in &page.nodes {
            if !skipped_title_heading
                && matches!(
                    node,
                    ReadableNode::Heading { level: 1, text } if text == &page.title
                )
            {
                skipped_title_heading = true;
                continue;
            }

            match node {
                ReadableNode::Heading { level, text } => document.push(IndexNode::Heading {
                    level: *level,
                    text: text.clone(),
                }),
                ReadableNode::Paragraph(text) => {
                    document.push(IndexNode::Paragraph(text.clone()));
                }
                ReadableNode::Link(link) => {
                    document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
                }
                ReadableNode::List { ordered, items } => {
                    document.push(IndexNode::List {
                        ordered: *ordered,
                        items: items.clone(),
                    });
                }
                ReadableNode::CodeBlock { language, code } => {
                    document.push(IndexNode::CodeBlock {
                        language: language.clone(),
                        code: code.clone(),
                    });
                }
                ReadableNode::Table { rows } => {
                    document.push(IndexNode::Table { rows: rows.clone() });
                }
                ReadableNode::Spacer { lines } => {
                    document.push(IndexNode::Spacer { lines: *lines });
                }
                ReadableNode::Section {
                    role,
                    title,
                    collapsed,
                    nodes,
                } => {
                    document.push(IndexNode::Section {
                        role: section_role(*role),
                        title: title.clone(),
                        collapsed: *collapsed,
                        nodes: nodes.iter().map(index_node_from_readable).collect(),
                    });
                }
                ReadableNode::Image { alt, src } => {
                    document.push(IndexNode::Image {
                        alt: alt.clone(),
                        src: src.clone(),
                    });
                }
                ReadableNode::Form(form) => {
                    document.push(IndexNode::Form(Form {
                        name: form.name.clone(),
                        method: form.method.clone(),
                        action: form.action.clone(),
                        inputs: form
                            .inputs
                            .iter()
                            .map(|input| Input {
                                name: input.name.clone(),
                                kind: input.kind.clone(),
                                value: input.value.clone(),
                                required: input.required,
                            })
                            .collect(),
                        buttons: form
                            .buttons
                            .iter()
                            .map(|button| ButtonAction {
                                name: button.name.clone(),
                                value: button.value.clone(),
                                label: button.label.clone(),
                            })
                            .collect(),
                    }));
                }
            }
        }
    }
}

/// Converts one readable node into the equivalent document node.
///
/// Leaf variants are copied field by field. Sections are converted
/// recursively, so arbitrarily nested sections keep their structure.
fn index_node_from_readable(node: &ReadableNode) -> IndexNode {
    match node {
        // Containers: convert the children first, then wrap them.
        ReadableNode::Section {
            role,
            title,
            collapsed,
            nodes,
        } => {
            let children = nodes.iter().map(index_node_from_readable).collect();
            IndexNode::Section {
                role: section_role(*role),
                title: title.clone(),
                collapsed: *collapsed,
                nodes: children,
            }
        }

        // Interactive content.
        ReadableNode::Form(form) => {
            let converted = index_form_from_html(form);
            IndexNode::Form(converted)
        }
        ReadableNode::Link(link) => {
            let converted = Link::new(&link.text, &link.href);
            IndexNode::Link(converted)
        }

        // Text-like content.
        ReadableNode::Heading { level, text } => {
            let (level, text) = (*level, text.clone());
            IndexNode::Heading { level, text }
        }
        ReadableNode::Paragraph(text) => {
            let text = text.clone();
            IndexNode::Paragraph(text)
        }
        ReadableNode::List { ordered, items } => {
            let (ordered, items) = (*ordered, items.clone());
            IndexNode::List { ordered, items }
        }
        ReadableNode::CodeBlock { language, code } => {
            let (language, code) = (language.clone(), code.clone());
            IndexNode::CodeBlock { language, code }
        }

        // Layout and media.
        ReadableNode::Table { rows } => {
            let rows = rows.clone();
            IndexNode::Table { rows }
        }
        ReadableNode::Spacer { lines } => {
            let lines = *lines;
            IndexNode::Spacer { lines }
        }
        ReadableNode::Image { alt, src } => {
            let (alt, src) = (alt.clone(), src.clone());
            IndexNode::Image { alt, src }
        }
    }
}

/// Converts a parsed HTML form into the document's form representation.
///
/// The form's name, method and action are cloned, and each input and button
/// is copied field by field in its original order.
fn index_form_from_html(form: &index_dom::HtmlForm) -> Form {
    let inputs = form
        .inputs
        .iter()
        .map(|field| Input {
            name: field.name.clone(),
            kind: field.kind.clone(),
            value: field.value.clone(),
            required: field.required,
        })
        .collect();

    let buttons = form
        .buttons
        .iter()
        .map(|control| ButtonAction {
            name: control.name.clone(),
            value: control.value.clone(),
            label: control.label.clone(),
        })
        .collect();

    Form {
        name: form.name.clone(),
        method: form.method.clone(),
        action: form.action.clone(),
        inputs,
        buttons,
    }
}

fn section_role(role: ReadableSectionRole) -> SectionRole {
    match role {
        ReadableSectionRole::Main => SectionRole::Main,
        ReadableSectionRole::Navigation => SectionRole::Navigation,
        ReadableSectionRole::Aside => SectionRole::Aside,
        ReadableSectionRole::Footer => SectionRole::Footer,
        ReadableSectionRole::Comments => SectionRole::Comments,
        ReadableSectionRole::Related => SectionRole::Related,
        ReadableSectionRole::Unknown => SectionRole::Unknown,
    }
}

/// Emits links as stable document nodes.
#[derive(Debug, Clone, Copy)]
pub struct EmitLinks;

impl Instruction for EmitLinks {
    fn execute(&self, page: &ReadablePage, document: &mut IndexDocument) {
        let mut seen = BTreeSet::new();
        collect_existing_links(&document.nodes, &mut seen);
        let mut emitted = 0usize;
        let mut truncated = 0usize;
        let mut ranked = std::collections::BTreeMap::<String, RankedLink>::new();

        for (index, link) in page.links.iter().enumerate() {
            let normalized = normalized_link_key(&link.href);
            if normalized.is_empty() || seen.contains(&normalized) {
                continue;
            }
            let score = link_relevance_score(&link.text, &link.href);
            let candidate = RankedLink {
                text: link.text.clone(),
                href: link.href.clone(),
                score,
                first_seen: index,
            };
            match ranked.entry(normalized) {
                Entry::Vacant(entry) => {
                    entry.insert(candidate);
                }
                Entry::Occupied(mut entry) => {
                    if candidate.score > entry.get().score
                        || (candidate.score == entry.get().score
                            && candidate.text.len() > entry.get().text.len())
                    {
                        entry.insert(candidate);
                    }
                }
            }
        }

        let mut ranked = ranked.into_values().collect::<Vec<_>>();
        ranked.sort_by(|left, right| {
            right
                .score
                .cmp(&left.score)
                .then(left.first_seen.cmp(&right.first_seen))
                .then(left.text.cmp(&right.text))
                .then(left.href.cmp(&right.href))
        });

        for link in ranked {
            if emitted < MAX_EMITTED_LINKS {
                document.push(IndexNode::Link(Link::new(&link.text, &link.href)));
                emitted += 1;
            } else {
                truncated += 1;
            }
        }

        if truncated > 0 {
            document.push(IndexNode::Section {
                role: SectionRole::Unknown,
                title: Some("Diagnostic".to_owned()),
                collapsed: true,
                nodes: vec![
                    IndexNode::Error(format!(
                        "link list truncated: kept first {MAX_EMITTED_LINKS} unique links"
                    )),
                    IndexNode::List {
                        ordered: false,
                        items: vec![
                            format!("{truncated} additional links were omitted"),
                            "use :extract links to inspect bounded output".to_owned(),
                        ],
                    },
                ],
            });
        }
    }
}

/// A link candidate together with the data used to order it for emission.
#[derive(Debug, Clone, PartialEq, Eq)]
struct RankedLink {
    /// Link text.
    text: String,
    /// Link target.
    href: String,
    /// Relevance score of the link.
    score: i32,
    /// Position at which the link was first seen.
    first_seen: usize,
}

/// Records the normalized key of every link already present in `nodes`.
///
/// Sections are searched at any depth; all other node kinds are ignored.
fn collect_existing_links(nodes: &[IndexNode], seen: &mut BTreeSet<String>) {
    // Explicit work list instead of recursion; `seen` is a set, so the order
    // in which levels are visited does not matter.
    let mut pending: Vec<&[IndexNode]> = vec![nodes];

    while let Some(level) = pending.pop() {
        for node in level {
            match node {
                IndexNode::Section {
                    nodes: children, ..
                } => pending.push(children),
                IndexNode::Link(link) => {
                    let key = normalized_link_key(&link.href);
                    let _ = seen.insert(key);
                }
                _ => {}
            }
        }
    }
}

/// Builds the key used to decide whether two hrefs point at the same target.
///
/// Surrounding whitespace, the fragment and trailing slashes are ignored, so
/// `https://example.com/docs`, `https://example.com/docs/` and
/// `https://example.com/docs#top` all produce the same key.
///
/// Returns an empty string when the href is blank or consists only of a
/// fragment (for example `#top`).
fn normalized_link_key(href: &str) -> String {
    let trimmed = href.trim();
    if trimmed.is_empty() {
        return String::new();
    }

    if let Ok(mut url) = Url::parse(trimmed) {
        url.set_fragment(None);
        let mut path = url.path().to_owned();
        // Stop at one character so the root path `/` survives.
        while path.len() > 1 && path.ends_with('/') {
            path.pop();
        }
        url.set_path(&path);
        return url.to_string();
    }

    // Fallback for hrefs `Url::parse` rejects: apply the same rules textually.
    let without_fragment = trimmed.split('#').next().unwrap_or_default();
    let without_trailing_slashes = without_fragment.trim_end_matches('/');
    if without_trailing_slashes.is_empty() && !without_fragment.is_empty() {
        // The href was made of slashes only (typically the site root `/`).
        // Keep a single slash, as the parsed-URL branch above does, instead
        // of collapsing to the empty key.
        return "/".to_owned();
    }
    without_trailing_slashes.to_owned()
}

/// Scores how useful a link is likely to be; higher is better.
///
/// Matching is done on the trimmed, ASCII-lowercased text and href:
///
/// * `-100` outright when the text or href is empty, or the href is a
///   `javascript:` or `mailto:` link.
/// * `+8` for pagination: the text is a paging word (`next`, `prev`, ...) or
///   starts with `page `, or the href contains `page=`, `/page-`, `start=`,
///   or carries a query parameter named exactly `p`.
/// * `+4` when the text mentions content such as results, threads, articles,
///   guides or documentation.
/// * `-5` when the text is exactly a policy or account label (`privacy`,
///   `terms`, `login`, ...).
/// * `+1` for text of at least 18 bytes, `-2` for text of at most 2 bytes.
fn link_relevance_score(text: &str, href: &str) -> i32 {
    let text = text.trim().to_ascii_lowercase();
    let href = href.trim().to_ascii_lowercase();

    if text.is_empty() || href.is_empty() {
        return -100;
    }
    if href.starts_with("javascript:") || href.starts_with("mailto:") {
        return -100;
    }

    let mut score = 0i32;

    // The short `p` key is matched as a whole query-parameter name: the bare
    // substring `p=` also occurs in unrelated keys such as `step=`,
    // `lookup=` or `help=`, which must not receive the pagination boost.
    if matches!(
        text.as_str(),
        "next" | "prev" | "previous" | "older" | "newer" | "continue"
    ) || text.starts_with("page ")
        || href.contains("page=")
        || href.contains("/page-")
        || href.contains("start=")
        || has_query_key(&href, "p")
    {
        score += 8;
    }

    if text.contains("result")
        || text.contains("thread")
        || text.contains("discussion")
        || text.contains("article")
        || text.contains("guide")
        || text.contains("documentation")
        || text.contains("reference")
        || text.contains("docs")
    {
        score += 4;
    }

    if matches!(
        text.as_str(),
        "privacy" | "terms" | "cookie" | "sign in" | "log in" | "register" | "login" | "help"
    ) {
        score -= 5;
    }

    if text.len() >= 18 {
        score += 1;
    } else if text.len() <= 2 {
        score -= 2;
    }

    score
}

/// Reports whether the query string of `href` has a parameter named exactly `key`.
fn has_query_key(href: &str, key: &str) -> bool {
    let query = match href.split_once('?') {
        // Anything after `#` is the fragment, not part of the query.
        Some((_, rest)) => rest.split('#').next().unwrap_or_default(),
        None => return false,
    };

    query
        .split('&')
        .any(|pair| matches!(pair.split_once('='), Some((name, _)) if name == key))
}

/// Unit tests for the transform instructions.
#[cfg(test)]
mod tests {
    use index_core::{IndexDocument, IndexNode, Link, SectionRole};
    use index_dom::{HtmlButton, HtmlForm, HtmlInput, HtmlLink};
    use index_readability::{ReadableMetadata, ReadableNode, ReadablePage, ReadableSectionRole};

    use super::{ApplyMetadata, EmitLinks, EmitReadableNodes, EmitTitle, Instruction};

    /// Builds a page fixture containing one node of every kind except a
    /// heading, a single link, and fully populated metadata.
    fn page() -> ReadablePage {
        ReadablePage {
            title: "Title".to_owned(),
            paragraphs: vec!["Paragraph.".to_owned()],
            nodes: vec![
                ReadableNode::Paragraph("Paragraph.".to_owned()),
                ReadableNode::List {
                    ordered: false,
                    items: vec!["Read".to_owned(), "Search".to_owned()],
                },
                ReadableNode::CodeBlock {
                    language: Some("rust".to_owned()),
                    code: "fn main() {}".to_owned(),
                },
                ReadableNode::Table {
                    rows: vec![vec!["Name".to_owned()], vec!["Index".to_owned()]],
                },
                ReadableNode::Spacer { lines: 2 },
                ReadableNode::Section {
                    role: ReadableSectionRole::Navigation,
                    title: Some("Site".to_owned()),
                    collapsed: true,
                    nodes: vec![ReadableNode::Link(HtmlLink {
                        text: "About".to_owned(),
                        href: "https://example.com/about".to_owned(),
                    })],
                },
                ReadableNode::Image {
                    alt: "Logo".to_owned(),
                    src: Some("https://example.com/logo.png".to_owned()),
                },
                ReadableNode::Form(HtmlForm {
                    name: "search".to_owned(),
                    method: "GET".to_owned(),
                    action: "https://example.com/search".to_owned(),
                    inputs: vec![HtmlInput {
                        name: "q".to_owned(),
                        kind: "search".to_owned(),
                        value: None,
                        required: true,
                    }],
                    buttons: Vec::new(),
                }),
            ],
            links: vec![HtmlLink {
                text: "Docs".to_owned(),
                href: "https://example.com/docs".to_owned(),
            }],
            forms: Vec::new(),
            metadata: ReadableMetadata {
                canonical_url: Some("https://example.com/docs".to_owned()),
                language: Some("en".to_owned()),
                description: Some("Docs".to_owned()),
                open_graph_title: Some("OG".to_owned()),
                open_graph_description: Some("OpenGraph docs".to_owned()),
            },
        }
    }

    /// Every metadata field copied by `ApplyMetadata` must reach the document.
    #[test]
    fn apply_metadata_sets_document_metadata() {
        let mut document = IndexDocument::titled("Title");
        ApplyMetadata.execute(&page(), &mut document);
        assert_eq!(
            document.metadata.canonical_url.as_deref(),
            Some("https://example.com/docs")
        );
        assert_eq!(document.metadata.language.as_deref(), Some("en"));
        assert_eq!(document.metadata.description.as_deref(), Some("Docs"));
        assert_eq!(document.metadata.open_graph_title.as_deref(), Some("OG"));
        assert_eq!(
            document.metadata.open_graph_description.as_deref(),
            Some("OpenGraph docs")
        );
    }

    /// The title is emitted as a level-one heading carrying the page title.
    #[test]
    fn emit_title_adds_heading() {
        let mut document = IndexDocument::titled("Title");
        EmitTitle.execute(&page(), &mut document);
        assert!(matches!(
            document.nodes.first(),
            Some(IndexNode::Heading { level: 1, text }) if text == "Title"
        ));
    }

    /// Each top-level node kind in the fixture is converted and emitted.
    #[test]
    fn emit_readable_nodes_adds_structured_nodes() {
        let mut document = IndexDocument::titled("Title");
        EmitReadableNodes.execute(&page(), &mut document);
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Paragraph(_)))
        );
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::List { ordered: false, items } if items == &vec!["Read".to_owned(), "Search".to_owned()]))
        );
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::CodeBlock { .. }))
        );
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Table { .. }))
        );
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Spacer { lines: 2 }))
        );
        assert!(document.nodes.iter().any(|node| matches!(
            node,
            IndexNode::Section {
                role: SectionRole::Navigation,
                title: Some(title),
                collapsed: true,
                nodes
            } if title == "Site" && matches!(nodes.first(), Some(IndexNode::Link(link)) if link.text == "About")
        )));
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Image { .. }))
        );
        assert!(
            document
                .nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Form(form) if form.name == "search"))
        );
    }

    /// Every node kind nested inside a section is converted as well,
    /// including a section nested in a section.
    #[test]
    fn emit_readable_nodes_maps_nested_section_contents() {
        let mut page = page();
        page.nodes = vec![ReadableNode::Section {
            role: ReadableSectionRole::Aside,
            title: Some("More".to_owned()),
            collapsed: false,
            nodes: vec![
                ReadableNode::Heading {
                    level: 2,
                    text: "Nested".to_owned(),
                },
                ReadableNode::Paragraph("Nested paragraph.".to_owned()),
                ReadableNode::Link(HtmlLink {
                    text: "Nested link".to_owned(),
                    href: "https://example.com/nested".to_owned(),
                }),
                ReadableNode::List {
                    ordered: true,
                    items: vec!["One".to_owned()],
                },
                ReadableNode::CodeBlock {
                    language: None,
                    code: "let value = 1;".to_owned(),
                },
                ReadableNode::Table {
                    rows: vec![vec!["cell".to_owned()]],
                },
                ReadableNode::Spacer { lines: 1 },
                ReadableNode::Section {
                    role: ReadableSectionRole::Comments,
                    title: None,
                    collapsed: true,
                    nodes: vec![ReadableNode::Paragraph("Comment.".to_owned())],
                },
                ReadableNode::Image {
                    alt: "Nested image".to_owned(),
                    src: None,
                },
                ReadableNode::Form(HtmlForm {
                    name: "nested-form".to_owned(),
                    method: "POST".to_owned(),
                    action: "https://example.com/submit".to_owned(),
                    inputs: vec![HtmlInput {
                        name: "email".to_owned(),
                        kind: "email".to_owned(),
                        value: Some("reader@example.com".to_owned()),
                        required: true,
                    }],
                    buttons: vec![HtmlButton {
                        name: Some("save".to_owned()),
                        value: Some("1".to_owned()),
                        label: "Save".to_owned(),
                    }],
                }),
            ],
        }];

        let mut document = IndexDocument::titled("Title");
        EmitReadableNodes.execute(&page, &mut document);

        // Fail loudly if the section is missing; the `if let` below would
        // otherwise skip every assertion silently.
        assert!(matches!(
            document.nodes.first(),
            Some(IndexNode::Section { .. })
        ));

        if let Some(IndexNode::Section {
            role,
            title,
            collapsed,
            nodes,
        }) = document.nodes.first()
        {
            assert_eq!(*role, SectionRole::Aside);
            assert_eq!(title.as_deref(), Some("More"));
            assert!(!collapsed);
            assert!(
                matches!(nodes.first(), Some(IndexNode::Heading { level: 2, text }) if text == "Nested")
            );
            assert!(nodes.iter().any(
                |node| matches!(node, IndexNode::Paragraph(text) if text == "Nested paragraph.")
            ));
            assert!(
                nodes.iter().any(
                    |node| matches!(node, IndexNode::Link(link) if link.text == "Nested link")
                )
            );
            assert!(nodes
                .iter()
                .any(|node| matches!(node, IndexNode::List { ordered: true, items } if items == &vec!["One".to_owned()])));
            assert!(nodes
                .iter()
                .any(|node| matches!(node, IndexNode::CodeBlock { language: None, code } if code == "let value = 1;")));
            assert!(nodes
                .iter()
                .any(|node| matches!(node, IndexNode::Table { rows } if rows == &vec![vec!["cell".to_owned()]])));
            assert!(
                nodes
                    .iter()
                    .any(|node| matches!(node, IndexNode::Spacer { lines: 1 }))
            );
            assert!(nodes.iter().any(|node| matches!(
                node,
                IndexNode::Section {
                    role: SectionRole::Comments,
                    collapsed: true,
                    ..
                }
            )));
            assert!(nodes.iter().any(
                |node| matches!(node, IndexNode::Image { alt, src: None } if alt == "Nested image")
            ));
            assert!(nodes.iter().any(
                |node| matches!(node, IndexNode::Form(form) if form.name == "nested-form" && form.buttons.len() == 1)
            ));
        }
    }

    /// Each readable section role maps onto the document role of the same name.
    #[test]
    fn emit_readable_nodes_maps_all_section_roles() {
        let roles = [
            (ReadableSectionRole::Main, SectionRole::Main),
            (ReadableSectionRole::Navigation, SectionRole::Navigation),
            (ReadableSectionRole::Aside, SectionRole::Aside),
            (ReadableSectionRole::Footer, SectionRole::Footer),
            (ReadableSectionRole::Comments, SectionRole::Comments),
            (ReadableSectionRole::Related, SectionRole::Related),
            (ReadableSectionRole::Unknown, SectionRole::Unknown),
        ];

        for (readable_role, expected_role) in roles {
            let mut page = page();
            page.nodes = vec![ReadableNode::Section {
                role: readable_role,
                title: None,
                collapsed: true,
                nodes: Vec::new(),
            }];

            let mut document = IndexDocument::titled("Title");
            EmitReadableNodes.execute(&page, &mut document);

            assert!(matches!(
                document.nodes.first(),
                Some(IndexNode::Section { role, .. }) if *role == expected_role
            ));
        }
    }

    /// A leading level-one heading that repeats the page title is dropped,
    /// so no such heading appears anywhere in the output.
    #[test]
    fn emit_readable_nodes_skips_duplicate_title_heading() {
        let mut page = page();
        page.nodes.insert(
            0,
            ReadableNode::Heading {
                level: 1,
                text: "Title".to_owned(),
            },
        );
        let mut document = IndexDocument::titled("Title");
        EmitReadableNodes.execute(&page, &mut document);
        assert!(!document.nodes.iter().any(|node| matches!(
            node,
            IndexNode::Heading { level: 1, text } if text == "Title"
        )));
    }

    /// Only the first heading repeating the title is dropped; a second one is
    /// emitted like any other node.
    #[test]
    fn emit_readable_nodes_skips_only_first_duplicate_title_heading() {
        let mut page = page();
        page.nodes = vec![
            ReadableNode::Heading {
                level: 1,
                text: "Title".to_owned(),
            },
            ReadableNode::Heading {
                level: 1,
                text: "Title".to_owned(),
            },
        ];
        let mut document = IndexDocument::titled("Title");
        EmitReadableNodes.execute(&page, &mut document);
        assert_eq!(document.nodes.len(), 1);
        assert!(matches!(
            document.nodes.first(),
            Some(IndexNode::Heading { level: 1, text }) if text == "Title"
        ));
    }

    /// A page link is emitted as a link node.
    #[test]
    fn emit_links_adds_link_node() {
        let mut document = IndexDocument::titled("Title");
        EmitLinks.execute(&page(), &mut document);
        assert!(matches!(document.nodes.first(), Some(IndexNode::Link(_))));
    }

    /// A link that already exists inside a section of the document counts as
    /// present, even when its href differs by a trailing slash.
    #[test]
    fn emit_links_skips_links_already_present_in_sections() {
        let mut document = IndexDocument::titled("Title");
        document.push(IndexNode::Section {
            role: SectionRole::Navigation,
            title: None,
            collapsed: false,
            nodes: vec![IndexNode::Link(Link::new(
                "Docs",
                "https://example.com/docs/",
            ))],
        });

        EmitLinks.execute(&page(), &mut document);

        assert_eq!(document.nodes.len(), 1);
        assert!(matches!(
            document.nodes.first(),
            Some(IndexNode::Section { .. })
        ));
    }

    /// 340 unique links plus one duplicate of an existing link: 300 are
    /// emitted next to the pre-existing one, and a diagnostic section reports
    /// the truncation.
    #[test]
    fn emit_links_deduplicates_and_bounds_large_link_sets() {
        let mut document = IndexDocument::titled("Title");
        document.push(IndexNode::Link(Link::new(
            "Existing",
            "https://example.com/existing",
        )));

        let mut source = page();
        source.links = (0..340)
            .map(|index| HtmlLink {
                text: format!("Link {index}"),
                href: format!("https://example.com/{index}"),
            })
            .collect();
        source.links.push(HtmlLink {
            text: "Existing".to_owned(),
            href: "https://example.com/existing".to_owned(),
        });

        EmitLinks.execute(&source, &mut document);

        let link_count = document
            .nodes
            .iter()
            .filter(|node| matches!(node, IndexNode::Link(_)))
            .count();
        assert_eq!(link_count, 301);
        assert!(document.nodes.iter().any(|node| matches!(
            node,
            IndexNode::Section {
                title: Some(title),
                ..
            } if title == "Diagnostic"
        )));
    }

    /// Pagination links rank first and policy links rank after content links.
    #[test]
    fn emit_links_ranks_navigation_and_result_links_ahead_of_policy_links() {
        let mut document = IndexDocument::titled("Title");
        let mut source = page();
        source.links = vec![
            HtmlLink {
                text: "Privacy".to_owned(),
                href: "https://example.com/privacy".to_owned(),
            },
            HtmlLink {
                text: "Next".to_owned(),
                href: "https://example.com/thread?page=2".to_owned(),
            },
            HtmlLink {
                text: "API reference".to_owned(),
                href: "https://example.com/docs/api".to_owned(),
            },
        ];

        EmitLinks.execute(&source, &mut document);

        let emitted = document
            .nodes
            .iter()
            .filter_map(|node| match node {
                IndexNode::Link(link) => Some(link.text.clone()),
                _ => None,
            })
            .collect::<Vec<_>>();
        assert_eq!(emitted.first().map(String::as_str), Some("Next"));
        assert!(emitted.len() >= 3);
        assert!(
            emitted.iter().position(|text| text == "Privacy")
                > emitted.iter().position(|text| text == "API reference")
        );
    }

    /// Hrefs differing only by fragment or trailing slash collapse into one
    /// link; on equal scores the candidate with the longest text is kept.
    #[test]
    fn emit_links_deduplicates_by_normalized_href_and_prefers_richer_text() {
        let mut document = IndexDocument::titled("Title");
        let mut source = page();
        source.links = vec![
            HtmlLink {
                text: "Docs".to_owned(),
                href: "https://example.com/docs#top".to_owned(),
            },
            HtmlLink {
                text: "Documentation".to_owned(),
                href: "https://example.com/docs".to_owned(),
            },
            HtmlLink {
                text: "Docs mirror".to_owned(),
                href: "https://example.com/docs/".to_owned(),
            },
        ];

        EmitLinks.execute(&source, &mut document);

        let docs_links = document
            .nodes
            .iter()
            .filter_map(|node| match node {
                IndexNode::Link(link) if link.href.starts_with("https://example.com/docs") => {
                    Some(link.text.as_str())
                }
                _ => None,
            })
            .collect::<Vec<_>>();

        assert_eq!(docs_links.len(), 1);
        assert_eq!(docs_links[0], "Documentation");
    }
}