unpdf 0.4.2

High-performance PDF content extraction to Markdown, text, and JSON
Documentation
use std::env;
use std::fs;
use std::io::Write;
use std::path::Path;

/// Build-script entry point.
///
/// For every Adobe character collection listed below, reads its
/// `cid2code.txt`, converts it with `parse_cid2code`, and emits a
/// `CID_TO_UNICODE_<NAME>` static slice of `(cid, code point)` pairs. All
/// tables are written to `$OUT_DIR/cmap_tables.rs`. A collection whose data
/// file is absent gets an empty table so the generated file always defines
/// every static.
///
/// Panics if `OUT_DIR` is unset, if an existing data file cannot be read, or
/// if the output file cannot be created or written.
fn main() {
    let out_dir = env::var("OUT_DIR").unwrap();

    // (suffix of the generated static's name, path to the cid2code file)
    let collections = [
        ("KOREA1", "cmap-data/Adobe-Korea1/cid2code.txt"),
        ("JAPAN1", "cmap-data/Adobe-Japan1/cid2code.txt"),
        ("CNS1", "cmap-data/Adobe-CNS1/cid2code.txt"),
        ("GB1", "cmap-data/Adobe-GB1/cid2code.txt"),
    ];

    let mut generated =
        String::from("// Auto-generated by build.rs from Adobe cmap-resources — do not edit\n\n");

    for &(suffix, txt_path) in collections.iter() {
        // Ask Cargo to re-run this script whenever the data file changes.
        println!("cargo:rerun-if-changed={}", txt_path);

        let source = Path::new(txt_path);
        if source.exists() {
            let contents = match fs::read_to_string(source) {
                Ok(text) => text,
                Err(e) => panic!("Failed to read {}: {}", txt_path, e),
            };

            // One `(cid, 0xHHHH),` line per mapping, in the order the parser returns them.
            let rows: String = parse_cid2code(&contents)
                .iter()
                .map(|(cid, unicode)| format!("    ({}, 0x{:04X}),\n", cid, unicode))
                .collect();

            generated += &format!(
                "pub(crate) static CID_TO_UNICODE_{}: &[(u32, u32)] = &[\n",
                suffix
            );
            generated += &rows;
            generated += "];\n\n";
        } else {
            // Data file not present: still define the static, just empty.
            generated += &format!(
                "pub(crate) static CID_TO_UNICODE_{}: &[(u32, u32)] = &[];\n\n",
                suffix
            );
        }
    }

    let dest = Path::new(&out_dir).join("cmap_tables.rs");
    let mut output = fs::File::create(&dest).unwrap();
    output.write_all(generated.as_bytes()).unwrap();
}

/// Parses the text of a tab-separated `cid2code.txt` file into a
/// CID → Unicode code point table.
///
/// Parsing rules:
/// - Blank lines and lines starting with `#` are ignored.
/// - The row whose first column is exactly `CID` is the header. It selects the
///   column to read: the last column whose name contains `UTF32`, or, if there
///   is none, the last column whose name contains `UCS2`.
/// - Data rows seen before a usable header, rows too short to contain the
///   selected column, rows whose CID is not a decimal `u32`, and rows whose
///   value is empty or `*` are skipped.
/// - When the value is a comma-separated list, only its first item is used.
///   It must parse as hexadecimal and be a non-zero Unicode scalar value
///   (so surrogate code points and values above `0x10FFFF` are rejected).
///
/// Returns the mappings sorted by CID with at most one entry per CID; when a
/// CID occurs several times, the occurrence that appears first in `data` wins.
fn parse_cid2code(data: &str) -> Vec<(u32, u32)> {
    let mut result = Vec::new();
    let mut unicode_col_idx: Option<usize> = None;

    for line in data.lines() {
        let line = line.trim();
        if line.is_empty() || line.starts_with('#') {
            continue;
        }

        let cols: Vec<&str> = line.split('\t').collect();

        // Header row: choose the column that holds the Unicode values.
        if cols.first() == Some(&"CID") {
            // Prefer the last column whose name contains "UTF32",
            // else the last one whose name contains "UCS2".
            let last_matching = |needle: &str| cols.iter().rposition(|col| col.contains(needle));
            unicode_col_idx = last_matching("UTF32").or_else(|| last_matching("UCS2"));
            continue;
        }

        // No header seen yet, or the row is too short to have the chosen column.
        let val = match unicode_col_idx.and_then(|idx| cols.get(idx)) {
            Some(field) => field.trim(),
            None => continue,
        };

        // Parse CID (`split` always yields at least one column).
        let cid: u32 = match cols[0].trim().parse() {
            Ok(v) => v,
            Err(_) => continue,
        };

        // "*" marks "no mapping" in the data file.
        if val.is_empty() || val == "*" {
            continue;
        }

        // Take the first value if comma-separated.
        let first = val.split(',').next().unwrap_or(val).trim();

        // Keep only non-zero Unicode scalar values: `char::from_u32` rejects
        // both out-of-range values and the surrogate range 0xD800..=0xDFFF.
        if let Ok(cp) = u32::from_str_radix(first, 16) {
            if cp != 0 && char::from_u32(cp).is_some() {
                result.push((cid, cp));
            }
        }
    }

    // Stable sort so that, for duplicate CIDs, the first occurrence in the
    // file stays first and is the one `dedup_by_key` keeps.
    result.sort_by_key(|&(cid, _)| cid);
    result.dedup_by_key(|entry| entry.0);
    result
}