gobby-wiki 0.7.0

Gobby wiki CLI shell
use std::path::Path;

use scraper::{ElementRef, Html, Node, Selector};

use crate::ingest::{
    markdown_metadata, markdown_title, path_to_string, single_line, text_from_utf8_lossy,
};
use crate::sources::SourceKind;

use super::UrlSnapshot;

pub(super) fn render_url_markdown(
    snapshot: &UrlSnapshot,
    canonical_url: &str,
    title: &str,
    document: &Html,
    source_hash: &str,
) -> String {
    let mut fields = vec![
        ("source_kind", "url".to_string()),
        ("source_url", snapshot.final_url.clone()),
        ("requested_url", snapshot.requested_url.clone()),
        ("canonical_url", canonical_url.to_string()),
        ("fetched_at", snapshot.fetched_at.clone()),
        ("source_hash", source_hash.to_string()),
    ];
    if let Some(content_type) = &snapshot.content_type {
        fields.push(("content_type", content_type.clone()));
    }
    let mut markdown = markdown_metadata(&fields);
    markdown.push_str("# ");
    markdown.push_str(&markdown_title(title));
    markdown.push_str("\n\n");
    markdown.push_str(&html_to_markdownish_text(document));
    markdown.push('\n');
    markdown
}

pub(super) fn render_non_html_url_markdown(
    snapshot: &UrlSnapshot,
    canonical_url: &str,
    title: &str,
    kind: &SourceKind,
    source_hash: &str,
    asset_path: &Path,
) -> String {
    let mut fields = vec![
        ("source_kind", kind.to_string()),
        ("source_url", snapshot.final_url.clone()),
        ("requested_url", snapshot.requested_url.clone()),
        ("canonical_url", canonical_url.to_string()),
        ("fetched_at", snapshot.fetched_at.clone()),
        ("source_hash", source_hash.to_string()),
        ("source_asset", path_to_string(asset_path)),
        ("media_degradation", "url_non_html_asset".to_string()),
    ];
    if let Some(content_type) = &snapshot.content_type {
        fields.push(("content_type", content_type.clone()));
    }
    let mut markdown = markdown_metadata(&fields);
    markdown.push_str("# ");
    markdown.push_str(&markdown_title(title));
    markdown.push_str("\n\n");
    markdown.push_str("Non-HTML URL response preserved as a source asset.\n");
    markdown
}

pub(super) fn snapshot_is_html(snapshot: &UrlSnapshot) -> bool {
    match content_type_media_type(snapshot.content_type.as_deref()).as_deref() {
        Some("text/html" | "application/xhtml+xml") => true,
        Some(_) => false,
        None => body_looks_like_html(&snapshot.body),
    }
}

pub(super) fn source_kind_for_url_response(content_type: Option<&str>) -> SourceKind {
    match content_type_media_type(content_type).as_deref() {
        Some("application/pdf") => SourceKind::Pdf,
        Some(media_type) if media_type.starts_with("image/") => SourceKind::Image,
        Some(media_type) if media_type.starts_with("audio/") => SourceKind::Audio,
        Some(media_type) if media_type.starts_with("video/") => SourceKind::Video,
        Some("application/json" | "application/xml" | "text/plain" | "text/csv" | "text/xml") => {
            SourceKind::Text
        }
        Some(media_type) if media_type.starts_with("text/") => SourceKind::Text,
        _ => SourceKind::File,
    }
}

fn content_type_media_type(content_type: Option<&str>) -> Option<String> {
    content_type?
        .split(';')
        .next()
        .map(str::trim)
        .filter(|value| !value.is_empty())
        .map(str::to_ascii_lowercase)
}

fn body_looks_like_html(body: &[u8]) -> bool {
    let text = text_from_utf8_lossy(&body[..body.len().min(512)]).to_ascii_lowercase();
    let trimmed = text.trim_start();
    trimmed.starts_with("<!doctype html")
        || trimmed.starts_with("<html")
        || trimmed.contains("<body")
}

pub(super) fn file_name_for_url_response(snapshot: &UrlSnapshot, kind: &SourceKind) -> String {
    let from_url = url::Url::parse(&snapshot.final_url)
        .ok()
        .and_then(|url| {
            url.path_segments()
                .and_then(|mut segments| segments.next_back().map(str::to_string))
        })
        .filter(|value| !value.trim().is_empty());
    from_url.unwrap_or_else(|| match kind {
        SourceKind::Pdf => "download.pdf".to_string(),
        SourceKind::Image => "image".to_string(),
        SourceKind::Audio => "audio".to_string(),
        SourceKind::Video => "video".to_string(),
        SourceKind::Text => "download.txt".to_string(),
        _ => "download".to_string(),
    })
}

pub(super) fn extract_title(document: &Html) -> Option<String> {
    let selector = Selector::parse("title").ok()?;
    let title = document
        .select(&selector)
        .next()?
        .text()
        .collect::<Vec<_>>()
        .join(" ");
    let title = single_line(&title);
    (!title.is_empty()).then_some(title)
}

pub(super) fn html_to_markdownish_text(document: &Html) -> String {
    let mut parts = Vec::new();
    let root = Selector::parse("body")
        .ok()
        .and_then(|selector| document.select(&selector).next())
        .unwrap_or_else(|| document.root_element());
    collect_visible_text(root, &mut parts);
    let text = parts.join("\n");
    normalize_markdown_text(&text)
}

fn collect_visible_text(element: ElementRef<'_>, parts: &mut Vec<String>) {
    if is_hidden_element(element.value().name()) {
        return;
    }
    if is_text_block(element.value().name()) {
        let mut text = String::new();
        collect_inline_text(element, &mut text);
        if !single_line(&text).is_empty() {
            parts.push(text);
        }
        return;
    }

    let mut inline = String::new();
    for child in element.children() {
        match child.value() {
            Node::Text(text) => inline.push_str(&text.text),
            Node::Element(_) => {
                if let Some(child_element) = ElementRef::wrap(child) {
                    if is_hidden_element(child_element.value().name()) {
                        continue;
                    }
                    if is_text_block(child_element.value().name()) {
                        push_inline_part(&mut inline, parts);
                        collect_visible_text(child_element, parts);
                    } else {
                        collect_inline_text(child_element, &mut inline);
                    }
                }
            }
            _ => {}
        }
    }
    push_inline_part(&mut inline, parts);
}

fn collect_inline_text(element: ElementRef<'_>, output: &mut String) {
    if is_hidden_element(element.value().name()) {
        return;
    }
    for child in element.children() {
        match child.value() {
            Node::Text(text) => output.push_str(&text.text),
            Node::Element(_) => {
                if let Some(child_element) = ElementRef::wrap(child) {
                    collect_inline_text(child_element, output);
                }
            }
            _ => {}
        }
    }
}

fn push_inline_part(inline: &mut String, parts: &mut Vec<String>) {
    if !single_line(inline).is_empty() {
        parts.push(std::mem::take(inline));
    } else {
        inline.clear();
    }
}

fn is_hidden_element(name: &str) -> bool {
    matches!(name, "head" | "script" | "style")
}

fn is_text_block(name: &str) -> bool {
    matches!(
        name,
        "address"
            | "blockquote"
            | "dd"
            | "dt"
            | "figcaption"
            | "h1"
            | "h2"
            | "h3"
            | "h4"
            | "h5"
            | "h6"
            | "li"
            | "p"
            | "pre"
            | "td"
            | "th"
    )
}

fn normalize_markdown_text(text: &str) -> String {
    let mut lines = Vec::new();
    for line in text.lines() {
        let line = single_line(line);
        if !line.is_empty() && lines.last().is_none_or(|last: &String| last != &line) {
            lines.push(line);
        }
    }
    lines.join("\n\n")
}