// panache-parser 0.14.0
//
// Lossless CST parser and syntax wrappers for Pandoc markdown, Quarto, and RMarkdown.
// Documentation
//! Generates the bare-URI scheme table for the `autolink_bare_uris` extension.
//!
//! The vendored `uri-schemes.csv` is the IANA registry, fetched verbatim by
//! `scripts/update-uri-schemes.sh`. All processing lives here: we extract the
//! scheme names, fold in the nonstandard schemes pandoc recognizes that the
//! registry omits, sort, and emit a `&[&str]` for `binary_search`.

use std::env;
use std::fs;
use std::path::PathBuf;

/// Nonstandard schemes that pandoc autolinks but the IANA registry omits.
///
/// Taken from pandoc's `Text.Pandoc.URI.schemes`. `main` appends these to the
/// names parsed from the registry CSV, then sorts and dedups the combined
/// list, so an entry that later shows up in the registry is harmless.
///
/// Entries are written in lowercase, matching the lowercased names that
/// `parse_iana_schemes` produces.
const NONSTANDARD_SCHEMES: &[&str] = &["doi", "gemini", "isbn", "javascript", "pmid"];

/// Build-script entry point: generates `uri_schemes.rs` inside `OUT_DIR`.
///
/// Reads the vendored registry CSV, merges in `NONSTANDARD_SCHEMES`, and
/// writes a sorted, deduplicated `BARE_URI_SCHEMES` string slice.
///
/// # Panics
///
/// Panics if `CARGO_MANIFEST_DIR` or `OUT_DIR` is unset, if the CSV cannot be
/// read, or if the generated file cannot be written.
fn main() {
    let manifest_dir = PathBuf::from(
        env::var("CARGO_MANIFEST_DIR").expect("CARGO_MANIFEST_DIR must be set for build scripts"),
    );
    let csv_path = manifest_dir.join("src/parser/inlines/uri-schemes.csv");
    let csv = fs::read_to_string(&csv_path)
        .unwrap_or_else(|err| panic!("read {}: {}", csv_path.display(), err));

    let mut schemes = parse_iana_schemes(&csv);
    schemes.extend(NONSTANDARD_SCHEMES.iter().map(|s| s.to_string()));
    // Equal strings are indistinguishable, so stability buys nothing here.
    schemes.sort_unstable();
    schemes.dedup();

    let mut out = String::from("// @generated by build.rs from uri-schemes.csv — do not edit.\n");
    out.push_str("const BARE_URI_SCHEMES: &[&str] = &[\n");
    for scheme in &schemes {
        out.push_str("    \"");
        // Escape rather than splice verbatim: the CSV reader can hand back a
        // field containing `"` or `\`, which would otherwise corrupt the
        // generated source. Plain scheme names pass through unchanged.
        out.extend(scheme.escape_debug());
        out.push_str("\",\n");
    }
    out.push_str("];\n");

    let out_dir =
        PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR must be set for build scripts"));
    let out_path = out_dir.join("uri_schemes.rs");
    fs::write(&out_path, out)
        .unwrap_or_else(|err| panic!("write {}: {}", out_path.display(), err));

    println!("cargo:rerun-if-changed=src/parser/inlines/uri-schemes.csv");
    println!("cargo:rerun-if-changed=build.rs");
}

/// Collects the first column of every data record in the IANA CSV, lowercased.
///
/// The first record is taken to be the header row and is discarded. For each
/// later record only the leading whitespace-delimited word of the first field
/// is kept, so an annotated entry such as `shttp (OBSOLETE)` yields `shttp`.
/// Records whose first field is empty or all whitespace contribute nothing.
///
/// This is a deliberately small RFC 4180 reader: quoted fields are honored
/// (including `""` as an escaped quote) because later columns may embed commas
/// and line breaks, but nothing beyond the first field is retained.
fn parse_iana_schemes(csv: &str) -> Vec<String> {
    type Input<'a> = std::iter::Peekable<std::str::Chars<'a>>;

    /// Reads the first field of the current record, stopping in front of the
    /// unquoted `,`, `\n`, or `\r` that ends it (the delimiter is left unread).
    fn read_first_field(input: &mut Input<'_>) -> String {
        let mut text = String::new();
        let mut quoted = false;
        while let Some(&ch) = input.peek() {
            match (quoted, ch) {
                (true, '"') => {
                    input.next();
                    // A doubled quote inside a quoted field is a literal quote;
                    // a single one closes the quoted section.
                    if input.next_if_eq(&'"').is_some() {
                        text.push('"');
                    } else {
                        quoted = false;
                    }
                }
                (false, '"') => {
                    input.next();
                    quoted = true;
                }
                (false, ',' | '\n' | '\r') => break,
                (_, other) => {
                    input.next();
                    text.push(other);
                }
            }
        }
        text
    }

    /// Discards everything up to and including the `\n` that ends the record,
    /// ignoring line feeds that sit inside a quoted field.
    fn skip_rest_of_record(input: &mut Input<'_>) {
        let mut quoted = false;
        while let Some(ch) = input.next() {
            if ch == '"' {
                if quoted && input.peek() == Some(&'"') {
                    // Escaped quote: swallow its partner, stay inside quotes.
                    input.next();
                } else {
                    quoted = !quoted;
                }
            } else if ch == '\n' && !quoted {
                return;
            }
        }
    }

    let mut input = csv.chars().peekable();
    let mut names = Vec::new();
    let mut header_pending = true;

    while input.peek().is_some() {
        let first = read_first_field(&mut input);
        skip_rest_of_record(&mut input);

        if header_pending {
            header_pending = false;
            continue;
        }

        // Keep only the leading word; trailing annotations are dropped.
        names.extend(first.split_whitespace().next().map(str::to_ascii_lowercase));
    }

    names
}