use sha2::Digest;
use crate::kb::model::KbSourceKind;
/// Converts a title into a lowercase, dash-separated slug.
///
/// Characters that are alphanumeric (per [`char::is_alphanumeric`]) or accepted
/// by `is_cjk` are kept and lowercased. Every run of other characters collapses
/// into a single `-`; separators at the start or end are dropped.
///
/// The result holds at most 80 `char`s and never ends in `-`, even when the
/// length cap falls directly after a separator. The cap counts `char`s, not
/// bytes, so a slug made of multi-byte characters can be longer than 80 bytes.
///
/// Returns an empty string when `title` contains no keepable character.
pub fn slugify(title: &str) -> String {
    const MAX_SLUG_CHARS: usize = 80;

    let mut slug = String::with_capacity(title.len());
    let mut prev_dash = false;
    for c in title.chars() {
        if c.is_alphanumeric() || is_cjk(c) {
            // Lowercasing a single `char` may yield more than one `char`.
            slug.extend(c.to_lowercase());
            prev_dash = false;
        } else if !prev_dash && !slug.is_empty() {
            // First separator of a run; leading separators are skipped because
            // nothing has been emitted yet.
            slug.push('-');
            prev_dash = true;
        }
    }

    // Apply the length cap on a char boundary first...
    if let Some((cut, _)) = slug.char_indices().nth(MAX_SLUG_CHARS) {
        slug.truncate(cut);
    }
    // ...and only then strip trailing separators, so that a cut landing right
    // after a `-` cannot leave a dangling dash at the end of the slug.
    let kept = slug.trim_end_matches('-').len();
    slug.truncate(kept);
    slug
}

/// Reports whether `c` falls in one of the code-point ranges that `slugify`
/// always keeps: U+4E00–U+9FFF (CJK Unified Ideographs), U+3040–U+30FF
/// (Hiragana and Katakana) or U+AC00–U+D7AF (Hangul Syllables).
fn is_cjk(c: char) -> bool {
    matches!(
        c,
        '\u{4E00}'..='\u{9FFF}' | '\u{3040}'..='\u{30FF}' | '\u{AC00}'..='\u{D7AF}'
    )
}
/// Builds the relative path of a rendered markdown file.
///
/// The layout is `md/<kind>/<slug>--<lsid8>--<md8>.md`, where `<kind>` is
/// `kind.as_str()`, `<lsid8>` is `lsid_hash8(logical_source_id)` and `<md8>`
/// is the first eight characters of `body_sha256_hex` (fewer if the string is
/// shorter). `slug` is inserted verbatim.
pub fn markdown_rel_path(
    kind: KbSourceKind,
    slug: &str,
    logical_source_id: &str,
    body_sha256_hex: &str,
) -> String {
    let source_tag = lsid_hash8(logical_source_id);
    let body_tag = body_sha256_hex.chars().take(8).collect::<String>();
    let kind_dir = kind.as_str();
    format!("md/{kind_dir}/{slug}--{source_tag}--{body_tag}.md")
}
/// Returns the first four bytes of the SHA-256 digest of `logical_source_id`
/// as eight lowercase hexadecimal characters.
pub fn lsid_hash8(logical_source_id: &str) -> String {
    use std::fmt::Write as _;

    let digest = sha2::Sha256::digest(logical_source_id.as_bytes());
    digest
        .iter()
        .take(4)
        .fold(String::with_capacity(8), |mut hex, byte| {
            // The result of writing into the `String` is discarded on purpose.
            let _ = write!(hex, "{byte:02x}");
            hex
        })
}
/// Builds the relative path of a raw (unconverted) document.
///
/// Leading dots are stripped from `ext`. When nothing remains the path is
/// `raw/<doc_id>`; otherwise it is `raw/<doc_id>.<ext>`.
pub fn raw_rel_path(doc_id: &str, ext: &str) -> String {
    match ext.trim_start_matches('.') {
        "" => format!("raw/{doc_id}"),
        bare_ext => format!("raw/{doc_id}.{bare_ext}"),
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Markdown path for a `Doc` source whose slug is fixed to `report`.
    fn report_path(logical_source_id: &str, body_sha256_hex: &str) -> String {
        markdown_rel_path(
            KbSourceKind::Doc,
            "report",
            logical_source_id,
            body_sha256_hex,
        )
    }

    /// ASCII words are lowercased and joined with a single dash.
    #[test]
    fn slugify_basic() {
        let slug = slugify("Hello World");
        assert_eq!(slug, "hello-world");
    }

    /// CJK characters are kept; the spaces between them become dashes.
    #[test]
    fn slugify_cjk() {
        let slug = slugify("蒙牛 奶粉 冲泡指南");
        assert_eq!(slug, "蒙牛-奶粉-冲泡指南");
    }

    /// A 200-character title yields a slug of at most 80 characters.
    #[test]
    fn slugify_max_len() {
        let long_title = "x".repeat(200);
        let slug = slugify(&long_title);
        assert!(slug.chars().count() <= 80);
    }

    /// Separator runs at either end of the title do not show up in the slug.
    #[test]
    fn slugify_trims_trailing_dashes() {
        for title in ["hello---", "...hello..."] {
            assert_eq!(slugify(title), "hello");
        }
    }

    /// The path starts with `md/<kind>/<slug>--`, ends with `.md`, and the part
    /// in between is 18 bytes long.
    #[test]
    fn markdown_rel_per_kind_carries_suffixes() {
        let body_sha = "deadbeef00000000000000000000000000000000000000000000000000000000";

        let p = markdown_rel_path(KbSourceKind::Doc, "蒙牛", "file:sha256:abc", body_sha);
        assert!(p.starts_with("md/doc/蒙牛--"), "got {p}");
        assert!(p.ends_with(".md"), "got {p}");

        let without_prefix = p.trim_start_matches("md/doc/蒙牛--");
        let suffix = without_prefix.trim_end_matches(".md");
        assert_eq!(
            suffix.len(),
            18,
            "lsid+md suffix must be 18 chars, got {suffix}"
        );

        let q = markdown_rel_path(
            KbSourceKind::Url,
            "x",
            "url:https://example.com/p",
            body_sha,
        );
        assert!(q.starts_with("md/url/x--") && q.ends_with(".md"), "got {q}");
    }

    /// Identical inputs produce identical paths.
    #[test]
    fn markdown_rel_same_lsid_and_body_same_path_idempotent() {
        let body_sha = "cafef00d00000000000000000000000000000000000000000000000000000000";
        let first = report_path("file:sha256:abc", body_sha);
        let second = report_path("file:sha256:abc", body_sha);
        assert_eq!(first, second);
    }

    /// Changing only the logical source id changes the path.
    #[test]
    fn markdown_rel_different_lsid_different_path_no_collision() {
        let body_sha = "deadbeef00000000000000000000000000000000000000000000000000000000";
        let from_abc = report_path("file:sha256:abc", body_sha);
        let from_def = report_path("file:sha256:def", body_sha);
        assert_ne!(
            from_abc, from_def,
            "same slug + different lsid must map to different paths"
        );
    }

    /// Changing only the body hash changes the path.
    #[test]
    fn markdown_rel_same_lsid_different_body_different_path() {
        let body_a = "aaaaaaaa00000000000000000000000000000000000000000000000000000000";
        let body_b = "bbbbbbbb00000000000000000000000000000000000000000000000000000000";
        let with_a = report_path("file:sha256:abc", body_a);
        let with_b = report_path("file:sha256:abc", body_b);
        assert_ne!(
            with_a, with_b,
            "same lsid + different body must map to different paths"
        );
    }

    /// The hash prefix is exactly eight ASCII hex digits.
    #[test]
    fn lsid_hash8_is_8_hex_chars() {
        let prefix = lsid_hash8("file:sha256:abc");
        assert_eq!(prefix.len(), 8);
        assert!(prefix.bytes().all(|b| b.is_ascii_hexdigit()));
    }

    /// Covers a plain extension, an empty one, and one with a leading dot.
    #[test]
    fn raw_rel_with_ext() {
        let cases = [
            ("pdf", "raw/01HXY.pdf"),
            ("", "raw/01HXY"),
            (".pdf", "raw/01HXY.pdf"),
        ];
        for (ext, expected) in cases {
            assert_eq!(raw_rel_path("01HXY", ext), expected);
        }
    }
}