grapheme-stdlib 0.6.0

Standard library operation implementations for Grapheme
Documentation
use serde_json::{json, Value as JsonValue};

pub fn to_md(args: &JsonValue) -> JsonValue {
    let request = match ToMdRequest::from_args(args) {
        Ok(request) => request,
        Err(err) => {
            return json!({
                "error": err,
                "text": "",
                "markdown": "",
            });
        }
    };

    match html_to_markdown_rs::convert(&request.html, request.options.clone()) {
        Ok(result) => {
            let markdown = result.content.as_deref().unwrap_or_default().to_string();
            let result_json = serde_json::to_value(&result).unwrap_or(JsonValue::Null);
            json!({
                "text": markdown.clone(),
                "markdown": markdown,
                "result": result_json,
                "used_options": request.options,
            })
        }
        Err(err) => json!({
            "error": format!("html to markdown conversion failed: {err}"),
            "text": "",
            "markdown": "",
        }),
    }
}

pub fn clean_text(args: &JsonValue) -> JsonValue {
    let request = CleanTextRequest::from_args(args);
    let cleaned = clean_page_text_raw(&request.text, request.max_chars);
    json!({
        "text": cleaned,
        "length": cleaned.chars().count(),
    })
}

/// Strips fenced code, site boilerplate, and inline script/CSS residue from page text.
///
/// The input is processed line by line:
///
/// 1. Every line is trimmed; blank lines are dropped.
/// 2. A line starting with three backticks toggles the "inside fenced code" state.
///    The fence lines and everything between them are dropped.
/// 3. If the line contains an inline script/CSS marker (for example `" const "` or
///    `" @media "`), it is cut at the *earliest* marker occurrence. When that
///    occurrence starts within the first 20 bytes, the whole line is dropped.
/// 4. Lines that look like boilerplate (cookie notices, "sign in", ...) or that
///    start with a script/CSS keyword are dropped.
///
/// The surviving lines are joined with `\n`. When `max_chars` is `Some(limit)`,
/// the result is truncated to at most `limit` `char`s.
pub fn clean_page_text_raw(raw: &str, max_chars: Option<usize>) -> String {
    /// Substrings that indicate script or stylesheet text glued onto a content line.
    const INLINE_MARKERS: [&str; 11] = [
        " const ",
        " window.",
        " document.",
        " function ",
        " @media ",
        " input[type=",
        " { font-family",
        "::-webkit",
        " appearance:",
        " let ",
        " var ",
    ];
    /// A marker at or before this byte offset means the line is dropped entirely.
    const MIN_CONTENT_BYTES: usize = 20;
    /// Boilerplate detected at the start of the lowercased line.
    const BOILERPLATE_PREFIXES: [&str; 6] = [
        "skip to",
        "open menu",
        "sign in",
        "sign up",
        "footer",
        "copyright",
    ];
    /// Boilerplate detected anywhere in the lowercased line.
    const BOILERPLATE_PHRASES: [&str; 4] = [
        "cookie",
        "privacy policy",
        "terms of service",
        "keyboard shortcuts",
    ];
    /// Script/CSS keywords detected (case-sensitively) at the start of the line.
    const CODE_PREFIXES: [&str; 7] = [
        "window.",
        "document.",
        "function ",
        "const ",
        "let ",
        "var ",
        "@media",
    ];

    let mut kept: Vec<&str> = Vec::new();
    let mut in_code = false;

    for original in raw.lines() {
        let line = original.trim();

        if line.starts_with("```") {
            in_code = !in_code;
            continue;
        }
        if in_code || line.is_empty() {
            continue;
        }

        // Cut at the earliest marker in the line, not the first one in list order;
        // otherwise residue that precedes a later-listed marker would survive.
        let earliest_marker = INLINE_MARKERS
            .iter()
            .filter_map(|marker| line.find(*marker))
            .min();

        let candidate = match earliest_marker {
            // `find` returns the byte offset of a match start, which is always a
            // char boundary, so this slice cannot panic.
            Some(idx) if idx > MIN_CONTENT_BYTES => line[..idx].trim(),
            Some(_) => continue,
            None => line,
        };
        if candidate.is_empty() {
            continue;
        }

        let lower = candidate.to_lowercase();
        let noisy = BOILERPLATE_PREFIXES
            .iter()
            .any(|prefix| lower.starts_with(*prefix))
            || BOILERPLATE_PHRASES
                .iter()
                .any(|phrase| lower.contains(*phrase))
            || CODE_PREFIXES
                .iter()
                .any(|prefix| candidate.starts_with(*prefix));
        if noisy {
            continue;
        }

        kept.push(candidate);
    }

    let mut cleaned = kept.join("\n");
    if let Some(limit) = max_chars {
        // The byte offset of the char at position `limit` (if any) is exactly where
        // the first `limit` chars end.
        if let Some((cut, _)) = cleaned.char_indices().nth(limit) {
            cleaned.truncate(cut);
        }
    }
    cleaned
}

/// Parsed arguments for the HTML-to-Markdown operation.
#[derive(Debug, Clone)]
struct ToMdRequest {
    /// HTML source to convert; empty when the arguments supply no string for it.
    html: String,
    /// Converter options from the `"options"` argument; `None` when it is absent.
    options: Option<html_to_markdown_rs::ConversionOptions>,
}

impl ToMdRequest {
    /// Builds a request from JSON arguments.
    ///
    /// The HTML comes from the `"html"` argument (see `arg_text` for the
    /// fallback rules).
    ///
    /// # Errors
    ///
    /// Returns the message produced by `parse_html_to_md_options` when the
    /// `"options"` argument is present but invalid.
    fn from_args(args: &JsonValue) -> Result<Self, String> {
        let html = arg_text(args, "html");
        let options = parse_html_to_md_options(args)?;
        Ok(Self { html, options })
    }
}

/// Parsed arguments for the text-cleaning operation.
#[derive(Debug, Clone)]
struct CleanTextRequest {
    /// Raw text to clean; empty when the arguments supply no string for it.
    text: String,
    /// Optional upper bound, in `char`s, on the cleaned output.
    max_chars: Option<usize>,
}

impl CleanTextRequest {
    /// Builds a request from JSON arguments.
    ///
    /// The text comes from the `"text"` argument (see `arg_text` for the fallback
    /// rules). `"max_chars"` is optional and is read through `arg_u64`; a value
    /// too large for `usize` is clamped to `usize::MAX`.
    fn from_args(args: &JsonValue) -> Self {
        // A plain `as usize` cast would wrap on targets where `usize` is narrower
        // than 64 bits, turning a huge limit into a tiny one. Saturate instead;
        // `usize::MAX as u64` is lossless on every supported pointer width.
        let max_chars =
            arg_u64(args, "max_chars").map(|limit| limit.min(usize::MAX as u64) as usize);

        Self {
            text: arg_text(args, "text"),
            max_chars,
        }
    }
}

/// Reads the optional `"options"` argument as converter options.
///
/// Returns `Ok(None)` when `args` has no `"options"` entry.
///
/// # Errors
///
/// Returns a message when `"options"` is present but is not a JSON object, or
/// when it cannot be deserialized into `ConversionOptions`.
fn parse_html_to_md_options(
    args: &JsonValue,
) -> Result<Option<html_to_markdown_rs::ConversionOptions>, String> {
    match args.get("options") {
        None => Ok(None),
        Some(raw) if !raw.is_object() => Err("html.to_md options must be an object".to_string()),
        Some(raw) => {
            // `from_value` consumes its input, hence the clone of the borrowed value.
            let parsed =
                serde_json::from_value::<html_to_markdown_rs::ConversionOptions>(raw.clone());
            match parsed {
                Ok(options) => Ok(Some(options)),
                Err(err) => Err(format!("invalid html.to_md options: {err}")),
            }
        }
    }
}

/// Returns the string argument stored under `key`.
///
/// When `key` is missing or does not hold a string, the `"__input"` entry is
/// tried next. If neither yields a string, an empty `String` is returned.
fn arg_text(args: &JsonValue, key: &str) -> String {
    [key, "__input"]
        .iter()
        .find_map(|name| args.get(*name).and_then(|value| value.as_str()))
        .map(String::from)
        .unwrap_or_default()
}

/// Returns the argument stored under `key` as a `u64`.
///
/// Accepts either a JSON number representable as `u64` or a string that parses
/// as one. Yields `None` when the key is missing or the value fits neither form.
fn arg_u64(args: &JsonValue, key: &str) -> Option<u64> {
    let value = args.get(key)?;
    match value.as_u64() {
        Some(number) => Some(number),
        None => value.as_str()?.parse::<u64>().ok(),
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Script-like lines and navigation boilerplate are removed; content stays.
    #[test]
    fn clean_text_filters_script_noise() {
        let input = json!({
            "text": "const x = 1;\nKeyboard shortcuts\nReal content line"
        });
        let out = clean_text(&input);
        let text = out
            .get("text")
            .and_then(|value| value.as_str())
            .unwrap_or("");

        assert!(text.contains("Real content line"));
        for removed in ["Keyboard shortcuts", "const x = 1"] {
            assert!(!text.contains(removed));
        }
    }

    /// Valid options are accepted and echoed back under `used_options`.
    #[test]
    fn to_md_options_enable_document_structure() {
        let input = json!({
            "html": "<h1>Title</h1><p>Hello</p>",
            "options": {
                "include_document_structure": true,
                "extract_metadata": true,
                "output_format": "markdown"
            }
        });
        let out = to_md(&input);

        assert!(out.get("error").is_none());
        assert!(out.get("result").is_some());

        let structure_flag = out
            .get("used_options")
            .and_then(|options| options.get("include_document_structure"))
            .and_then(|flag| flag.as_bool());
        assert_eq!(structure_flag, Some(true));
    }

    /// A non-object `options` value produces an error message.
    #[test]
    fn to_md_rejects_non_object_options() {
        let input = json!({
            "html": "<p>hello</p>",
            "options": "strict"
        });
        let out = to_md(&input);

        let error = out.get("error").and_then(|value| value.as_str());
        assert!(error.is_some());
    }
}