use std::env;
#[cfg(feature = "embed-models")]
use std::fs;
use std::path::{Path, PathBuf};
/// Base URL that model files are fetched from; the download code appends
/// `/{group}/{file}` to it.
#[cfg(feature = "embed-models")]
const HF_REPO: &str = "https://huggingface.co/meridian-online/finetype-model/resolve/main";
/// Version segment of the on-disk model cache directory (used as
/// `v{CACHE_VERSION}` when the cache path is built).
#[cfg(feature = "embed-models")]
const CACHE_VERSION: &str = "0.6.38";
/// Name of the model group that is downloaded and that the downloaded
/// `models/default` link points at.
#[cfg(feature = "embed-models")]
const DEFAULT_MODEL: &str = "m2v8m-s43";
#[cfg(feature = "embed-models")]
/// Returns the canonical, absolute form of `p` as a string whose only
/// separator is `/`.
///
/// Backslashes are replaced so the result can be placed inside a generated
/// string literal (e.g. `include_bytes!("…")`) without being read as escape
/// sequences.
///
/// # Panics
///
/// Panics if `p` cannot be canonicalized (for example because it does not
/// exist). The message names the offending path and the underlying I/O error
/// so a missing model file is easy to identify from the build output.
fn portable_path(p: &Path) -> String {
    let canonical = p
        .canonicalize()
        .unwrap_or_else(|e| panic!("Cannot canonicalize {}: {}", p.display(), e));
    canonical.to_string_lossy().replace('\\', "/")
}
#[cfg(feature = "embed-models")]
/// Locates the `labels` directory.
///
/// `<manifest_dir>/labels` is tried first, then `<workspace_root>/labels`.
/// A candidate is accepted only if it exists and can be opened for listing.
///
/// # Panics
///
/// Panics, listing both candidate paths, when neither one is usable.
fn find_labels(manifest_dir: &Path, workspace_root: &Path) -> PathBuf {
    let is_listable = |dir: &Path| dir.exists() && fs::read_dir(dir).is_ok();

    let in_manifest = manifest_dir.join("labels");
    let in_workspace = workspace_root.join("labels");

    if is_listable(&in_manifest) {
        in_manifest
    } else if is_listable(&in_workspace) {
        in_workspace
    } else {
        panic!(
            "Cannot find labels directory. Checked:\n {}\n {}",
            in_manifest.display(),
            in_workspace.display()
        )
    }
}
/// Walks upward from `start_dir` looking for a directory that contains
/// `models/default`.
///
/// `start_dir` itself and up to nine of its ancestors are inspected (ten
/// directories in total). `models/default` counts as present when it exists,
/// when it can be read as a symlink, or when it can be read as a text file.
///
/// Returns the first matching directory, or `None` when none of the
/// inspected directories qualifies.
fn find_workspace_with_models(start_dir: &Path) -> Option<PathBuf> {
    const MAX_LEVELS: usize = 10;

    start_dir
        .ancestors()
        .take(MAX_LEVELS)
        .find(|dir| {
            let marker = dir.join("models").join("default");
            marker.exists()
                || std::fs::read_link(&marker).is_ok()
                || std::fs::read_to_string(&marker).is_ok()
        })
        .map(Path::to_path_buf)
}
#[cfg(feature = "embed-models")]
/// Decides which `models` directory the build should use.
///
/// Resolution order:
/// 1. `<workspace_root>/models`, if it contains a `default` entry;
/// 2. the `models` directory of the directory returned by
///    `find_workspace_with_models(manifest_dir)`;
/// 3. otherwise whatever `download_models()` returns.
///
/// Each outcome is announced with a `cargo:warning=` line.
fn find_models(manifest_dir: &Path, workspace_root: &Path) -> PathBuf {
    let direct = workspace_root.join("models");
    if direct.join("default").exists() {
        println!(
            "cargo:warning=Using models from workspace: {}",
            direct.display()
        );
        return direct;
    }

    match find_workspace_with_models(manifest_dir) {
        Some(root) => {
            let discovered = root.join("models");
            println!(
                "cargo:warning=Found workspace models via walk-up: {}",
                discovered.display()
            );
            discovered
        }
        None => {
            println!("cargo:warning=Models not found locally, downloading from HuggingFace...");
            download_models()
        }
    }
}
#[cfg(feature = "embed-models")]
fn download_models() -> PathBuf {
let cache_dir = get_cache_dir();
let models_dir = cache_dir.join("models");
fs::create_dir_all(&models_dir).expect("Failed to create models cache directory");
download_model_group(
&models_dir,
DEFAULT_MODEL,
&["model.safetensors", "label_map.json", "config.json"],
);
let value_subdir = format!("{DEFAULT_MODEL}/value_model2vec");
download_model_group(
&models_dir,
&value_subdir,
&["model.safetensors", "tokenizer.json"],
);
download_model_group(
&models_dir,
"model2vec",
&[
"model.safetensors",
"type_embeddings.safetensors",
"tokenizer.json",
"label_index.json",
],
);
let default_link = models_dir.join("default");
let _ = fs::remove_file(&default_link);
#[cfg(unix)]
{
use std::os::unix::fs::symlink;
symlink(DEFAULT_MODEL, &default_link).expect("Failed to create models/default symlink");
}
#[cfg(windows)]
{
fs::write(&default_link, DEFAULT_MODEL).expect("Failed to create models/default link file");
}
println!(
"cargo:warning=Downloaded models to cache: {}",
models_dir.display()
);
models_dir
}
#[cfg(feature = "embed-models")]
fn get_cache_dir() -> PathBuf {
if let Ok(cargo_home) = env::var("CARGO_HOME") {
return PathBuf::from(cargo_home)
.join("finetype")
.join(format!("v{}", CACHE_VERSION));
}
#[cfg(target_os = "windows")]
{
if let Ok(appdata) = env::var("LOCALAPPDATA") {
return PathBuf::from(appdata)
.join("finetype")
.join(format!("v{}", CACHE_VERSION));
}
}
#[cfg(not(target_os = "windows"))]
{
if let Ok(home) = env::var("HOME") {
return PathBuf::from(home)
.join(".cache")
.join("finetype")
.join(format!("v{}", CACHE_VERSION));
}
}
let out_dir = env::var("OUT_DIR").unwrap_or_else(|_| "/tmp/finetype-models".to_string());
PathBuf::from(out_dir)
}
#[cfg(feature = "embed-models")]
/// Ensures every file in `files` is present under `models_dir/group_name`.
///
/// Files that already exist on disk are left alone; each missing one is
/// fetched from `{HF_REPO}/{group_name}/{file}` via `download_file`. A
/// `cargo:warning=` summary line is printed at the end whether or not any
/// file actually had to be fetched.
///
/// # Panics
///
/// Panics if the group directory cannot be created or a download fails. The
/// underlying error is appended to the message so the build log shows the
/// cause (network failure, HTTP status, permission problem, …).
fn download_model_group(models_dir: &Path, group_name: &str, files: &[&str]) {
    let group_dir = models_dir.join(group_name);
    fs::create_dir_all(&group_dir)
        .unwrap_or_else(|e| panic!("Failed to create {} directory: {}", group_name, e));

    for file in files {
        let file_path = group_dir.join(file);
        if file_path.exists() {
            continue;
        }
        let url = format!("{}/{}/{}", HF_REPO, group_name, file);
        download_file(&url, &file_path).unwrap_or_else(|e| {
            panic!(
                "Failed to download {}/{} from HuggingFace: {}",
                group_name, file, e
            )
        });
    }

    println!(
        "cargo:warning=Downloaded {} ({} files)",
        group_name,
        files.len()
    );
}
#[cfg(feature = "embed-models")]
/// Downloads `url` and stores the response body at `dest`.
///
/// The body is streamed into a sibling file named `<dest file name>.part`
/// and renamed onto `dest` only after the whole transfer has succeeded. An
/// interrupted download therefore never leaves a truncated file at `dest`,
/// where a later existence check would mistake it for a complete download.
///
/// # Errors
///
/// Returns the request error, or the I/O error from creating, writing or
/// renaming the file. On an I/O error the partial file is removed on a
/// best-effort basis.
fn download_file(url: &str, dest: &Path) -> Result<(), Box<dyn std::error::Error>> {
    let response = ureq::get(url).call()?;
    let mut reader = response.into_reader();

    let mut partial_name = dest
        .file_name()
        .ok_or("download destination has no file name")?
        .to_os_string();
    partial_name.push(".part");
    let partial = dest.with_file_name(partial_name);

    // The file handle is dropped at the end of the `copy` closure, so it is
    // closed before the rename happens.
    let outcome = fs::File::create(&partial)
        .and_then(|mut file| std::io::copy(&mut reader, &mut file))
        .and_then(|_| fs::rename(&partial, dest));

    if let Err(e) = outcome {
        let _ = fs::remove_file(&partial);
        return Err(e.into());
    }
    Ok(())
}
fn main() {
let manifest_dir = env::var("CARGO_MANIFEST_DIR").unwrap();
let manifest_path = PathBuf::from(&manifest_dir);
let mut workspace_root = manifest_path
.parent()
.and_then(|p| p.parent())
.map(|p| p.to_path_buf());
if let Some(root) = &workspace_root {
if !root.join("Cargo.toml").exists() {
workspace_root = find_workspace_with_models(&manifest_path);
}
} else {
workspace_root = find_workspace_with_models(&manifest_path);
}
let workspace_root = workspace_root.unwrap_or_else(|| {
PathBuf::from(&manifest_dir)
});
#[cfg(feature = "embed-models")]
{
let labels_dir = find_labels(&manifest_path, &workspace_root);
let models_dir = find_models(&manifest_path, &workspace_root);
println!("cargo:rerun-if-changed={}", models_dir.display());
println!("cargo:rerun-if-changed={}", labels_dir.display());
generate_embedded_models(&models_dir, &labels_dir);
}
#[cfg(not(feature = "embed-models"))]
{
let _ = workspace_root;
}
}
#[cfg(feature = "embed-models")]
/// Writes `$OUT_DIR/embedded_models.rs`, a generated source file that embeds
/// the model and taxonomy files with `include_bytes!` / `include_str!`.
///
/// Steps:
/// 1. Resolve `models_base/default`, either as a symlink or as a text file
///    whose trimmed content names the model directory.
/// 2. Require `label_map.json` and `config.json` in that directory and emit
///    `MB_WEIGHTS`, `MB_CONFIG`, `MB_LABELS` and `EMBEDDED_MODEL_TYPE`.
/// 3. Emit the `value_model2vec` constants, or empty placeholders when its
///    `model.safetensors` is absent.
/// 4. Emit `TAXONOMY_YAMLS` from every `definitions_*.yaml` file in
///    `labels_base`, sorted by path.
/// 5. Emit the `model2vec` constants, or empty placeholders when its
///    `model.safetensors` is absent.
///
/// # Panics
///
/// Panics if `models/default` cannot be resolved, the model directory lacks
/// the required files, `OUT_DIR` is unset, the labels directory cannot be
/// read, or the output file cannot be written.
fn generate_embedded_models(models_base: &Path, labels_base: &Path) {
    // Renders `pub const NAME: &[u8] = include_bytes!("<canonical path>");`.
    let embed_bytes = |name: &str, file: PathBuf| {
        format!(
            "pub const {}: &[u8] = include_bytes!(\"{}\");\n",
            name,
            portable_path(&file)
        )
    };
    // Renders the empty placeholder emitted when an optional asset is absent.
    let empty_bytes = |name: &str| format!("pub const {}: &[u8] = &[];\n", name);

    let default_link = models_base.join("default");
    let model_dir = match std::fs::read_link(&default_link) {
        Ok(target) if target.is_relative() => models_base.join(target),
        Ok(target) => target,
        // Not a symlink: fall back to a plain file holding the directory name.
        Err(_) => match std::fs::read_to_string(&default_link) {
            Ok(content) => models_base.join(content.trim()),
            Err(e) => panic!(
                "Cannot resolve models/default at {:?}: {e}. \
                 This should not happen — models were either local or downloaded. Please report this issue.",
                default_link
            ),
        },
    };

    let out_dir = env::var("OUT_DIR").unwrap();
    let dest = PathBuf::from(&out_dir).join("embedded_models.rs");

    let mut code = String::from("// Auto-generated by build.rs — do not edit\n\n");

    if !model_dir.join("label_map.json").exists() || !model_dir.join("config.json").exists() {
        panic!(
            "models/default ({:?}) is not a multi-branch model (missing label_map.json / config.json). \
             The flat-CharCNN and tiered model types were removed in choice 0107.",
            model_dir
        );
    }

    let weights_line = embed_bytes("MB_WEIGHTS", model_dir.join("model.safetensors"));
    let config_line = embed_bytes("MB_CONFIG", model_dir.join("config.json"));
    let labels_line = embed_bytes("MB_LABELS", model_dir.join("label_map.json"));
    code.push('\n');
    code.push_str(&weights_line);
    code.push_str(&config_line);
    code.push_str(&labels_line);
    code.push_str("\npub const EMBEDDED_MODEL_TYPE: &str = \"multi-branch\";\n");

    let value_m2v = model_dir.join("value_model2vec");
    if value_m2v.join("model.safetensors").exists() {
        println!("cargo:rerun-if-changed={}", value_m2v.display());
        let tokenizer_line = embed_bytes("MB_VALUE_TOKENIZER", value_m2v.join("tokenizer.json"));
        let model_line = embed_bytes("MB_VALUE_MODEL", value_m2v.join("model.safetensors"));
        code.push_str("\npub const HAS_MB_VALUE_M2V: bool = true;\n");
        code.push_str(&tokenizer_line);
        code.push_str(&model_line);
        println!(
            "cargo:warning=Embedding dual-encoder value model2vec from {}",
            value_m2v.display()
        );
    } else {
        code.push_str("\npub const HAS_MB_VALUE_M2V: bool = false;\n");
        code.push_str(&empty_bytes("MB_VALUE_TOKENIZER"));
        code.push_str(&empty_bytes("MB_VALUE_MODEL"));
    }

    // Unreadable directory entries are skipped; only `definitions_*.yaml`
    // files are kept.
    let mut yaml_paths: Vec<PathBuf> = Vec::new();
    for entry in fs::read_dir(labels_base)
        .expect("Failed to read labels directory")
        .flatten()
    {
        let candidate = entry.path();
        let is_definition = matches!(
            candidate.file_name().and_then(|n| n.to_str()),
            Some(n) if n.starts_with("definitions_") && n.ends_with(".yaml")
        );
        if is_definition {
            yaml_paths.push(candidate);
        }
    }
    // Sorted so the generated array has a stable order.
    yaml_paths.sort();

    code.push_str("\npub const TAXONOMY_YAMLS: &[&str] = &[\n");
    for yaml in &yaml_paths {
        code.push_str(&format!(" include_str!(\"{}\"),\n", portable_path(yaml)));
    }
    code.push_str("];\n");

    let model2vec_dir = models_base.join("model2vec");
    println!("cargo:rerun-if-changed={}", model2vec_dir.display());
    if model2vec_dir.join("model.safetensors").exists() {
        let tokenizer_line = embed_bytes("M2V_TOKENIZER", model2vec_dir.join("tokenizer.json"));
        let model_line = embed_bytes("M2V_MODEL", model2vec_dir.join("model.safetensors"));
        code.push_str("\n// Model2Vec header encoder (multi-branch header branch)\n");
        code.push_str("pub const HAS_MODEL2VEC: bool = true;\n");
        code.push_str(&tokenizer_line);
        code.push_str(&model_line);
        println!(
            "cargo:warning=Embedding Model2Vec from {}",
            model2vec_dir.display()
        );
    } else {
        code.push_str("\n// Model2Vec not available — header encoder disabled\n");
        code.push_str("pub const HAS_MODEL2VEC: bool = false;\n");
        code.push_str(&empty_bytes("M2V_TOKENIZER"));
        code.push_str(&empty_bytes("M2V_MODEL"));
    }

    fs::write(&dest, code).unwrap_or_else(|e| panic!("Failed to write {}: {}", dest.display(), e));
}