hyperlink 0.2.1

Very fast link checker for CI.
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::Arc;

use crate::html::{Href, Link, UsedLink};
use crate::urls::is_external_link;

pub trait LinkCollector<P>: Send {
    fn new() -> Self;
    fn ingest(&mut self, link: Link<'_, P>);
    fn merge(&mut self, other: Self);
}

#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
pub struct OwnedUsedLink<P> {
    pub href: String,
    pub path: Arc<PathBuf>,
    pub paragraph: Option<P>,
}

/// Collects only used links for match-all-paragraphs command. Discards defined links.
pub struct UsedLinkCollector<P> {
    pub used_links: Vec<OwnedUsedLink<P>>,
}

impl<P: Send> LinkCollector<P> for UsedLinkCollector<P> {
    fn new() -> Self {
        UsedLinkCollector {
            used_links: Vec::new(),
        }
    }

    fn ingest(&mut self, link: Link<'_, P>) {
        if let Link::Uses(used_link) = link {
            self.used_links.push(OwnedUsedLink {
                href: used_link.href.0.to_owned(),
                path: used_link.path.to_owned(),
                paragraph: used_link.paragraph,
            });
        }
    }

    fn merge(&mut self, other: Self) {
        self.used_links.extend(other.used_links);
    }
}

#[derive(Debug)]
enum LinkState<P> {
    /// We have observed a DefinedLink for this href
    Defined,
    /// We have not *yet* observed a DefinedLink and therefore need to keep track of all link
    /// usages for potential error reporting.
    Undefined(Vec<(Arc<PathBuf>, Option<P>)>),
}

impl<P: Copy> LinkState<P> {
    fn add_usage(&mut self, link: &UsedLink<P>) {
        if let LinkState::Undefined(ref mut links) = self {
            links.push((link.path.clone(), link.paragraph));
        }
    }

    fn update(&mut self, other: Self) {
        match self {
            LinkState::Defined => (),
            LinkState::Undefined(links) => match other {
                LinkState::Defined => *self = LinkState::Defined,
                LinkState::Undefined(links2) => links.extend(links2),
            },
        }
    }
}

pub struct LocalLinksOnly<C> {
    pub collector: C,
}

/// Filter out external links. Hrefs are already canonicalized by Document::join during
/// extraction.
pub fn filter_local_link<P>(link: Link<'_, P>) -> Option<Link<'_, P>> {
    if let Link::Uses(ref used_link) = link {
        if is_external_link(used_link.href.0.as_bytes()) {
            return None;
        }
    }

    Some(link)
}

impl<P, C: LinkCollector<P>> LinkCollector<P> for LocalLinksOnly<C> {
    fn new() -> Self {
        LocalLinksOnly {
            collector: C::new(),
        }
    }

    fn ingest(&mut self, link: Link<'_, P>) {
        if let Some(link) = filter_local_link(link) {
            self.collector.ingest(link);
        }
    }

    fn merge(&mut self, other: Self) {
        self.collector.merge(other.collector);
    }
}

/// Link collector used for actual link checking. Keeps track of broken links only.
pub struct BrokenLinkCollector<P> {
    // HashMap provides ~13% speedup over BTreeMap for link collection.
    // Benchmarked with 50k files / 2.5M links: BTreeMap 1.73s -> HashMap 1.50s.
    // Adding reserve() in merge() gives another ~7% (1.50s -> 1.39s).
    // Test data: html-bench --file-count 50000 --max-folder-size 100 --link-density 50 --seed 42
    links: HashMap<String, LinkState<P>>,
    used_link_count: usize,
}

impl<P: Send + Copy> LinkCollector<P> for BrokenLinkCollector<P> {
    fn new() -> Self {
        BrokenLinkCollector {
            links: HashMap::new(),
            used_link_count: 0,
        }
    }

    fn ingest(&mut self, link: Link<'_, P>) {
        match link {
            Link::Uses(used_link) => {
                self.used_link_count += 1;

                self.links
                    .entry(used_link.href.0.to_owned())
                    .and_modify(|state| state.add_usage(&used_link))
                    .or_insert_with(|| {
                        let mut state = LinkState::Undefined(Vec::new());
                        state.add_usage(&used_link);
                        state
                    });
            }
            Link::Defines(defined_link) => {
                self.links
                    .insert(defined_link.href.0.to_owned(), LinkState::Defined);
            }
        }
    }

    fn merge(&mut self, other: Self) {
        self.used_link_count += other.used_link_count;
        self.links.reserve(other.links.len());

        for (href, other_state) in other.links {
            if let Some(state) = self.links.get_mut(&href) {
                state.update(other_state);
            } else {
                self.links.insert(href, other_state);
            }
        }
    }
}

#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd)]
pub struct BrokenLink<P> {
    pub hard_404: bool,
    pub link: OwnedUsedLink<P>,
}

impl<P: Copy + PartialEq + Ord> BrokenLinkCollector<P> {
    pub fn get_broken_links(&self, check_anchors: bool) -> impl Iterator<Item = BrokenLink<P>> {
        let mut broken_links = Vec::new();

        for (href, state) in self.links.iter() {
            if let LinkState::Undefined(links) = state {
                let hard_404 = if check_anchors {
                    !matches!(
                        self.links.get(Href(href).without_anchor().0),
                        Some(&LinkState::Defined)
                    )
                } else {
                    true
                };

                for (path, paragraph) in links.iter() {
                    broken_links.push(BrokenLink {
                        hard_404,
                        link: OwnedUsedLink {
                            path: path.clone(),
                            paragraph: *paragraph,
                            href: href.clone(),
                        },
                    });
                }
            }
        }

        // Sort for deterministic output (HashMap iteration order is arbitrary)
        broken_links.sort();
        broken_links.into_iter()
    }

    pub fn used_links_count(&self) -> usize {
        self.used_link_count
    }
}