use ahash::{AHashMap, AHashSet};
use alef_core::config::ResolvedCrateConfig;
use alef_core::ir::{ApiSurface, TypeDef, TypeRef};
use anyhow::Context as _;
use std::collections::HashMap;
use std::path::Path;
use tracing::{debug, info};
use crate::cache;
use super::version::read_version;
/// Makes sure the `.gitignore` in `base_dir` lists the artifacts produced for this project.
///
/// The wanted patterns are `.alef/` plus a fixed set per entry in `config.languages`.
/// Patterns that already appear as a (whitespace-trimmed) line of the existing file are
/// skipped; the missing ones are appended at the end, creating the file when it is absent.
///
/// This is best-effort: nothing is returned and failures are only logged at debug level.
/// An existing file that cannot be read (permissions, non-UTF-8 content, ...) is left
/// untouched, because treating it as empty would make the write below replace the
/// user's content with just the new patterns.
pub fn ensure_gitignore(base_dir: &Path, config: &ResolvedCrateConfig) {
    use alef_core::config::Language;

    let gitignore_path = base_dir.join(".gitignore");
    let existing = match std::fs::read_to_string(&gitignore_path) {
        Ok(content) => content,
        // A missing file is the only error that may safely be treated as "empty".
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => String::new(),
        Err(e) => {
            debug!("Could not read .gitignore, leaving it untouched: {e}");
            return;
        }
    };
    let existing_lines: AHashSet<&str> = existing.lines().map(str::trim).collect();

    let mut entries: Vec<&str> = vec![".alef/"];
    for lang in &config.languages {
        match lang {
            Language::Python => {
                entries.extend_from_slice(&["__pycache__/", "*.so", "*.pyd", ".venv/", "*.egg-info/", "dist/"])
            }
            Language::Node => entries.extend_from_slice(&["node_modules/", "*.node"]),
            Language::Ruby => entries.extend_from_slice(&[".gems/", "vendor/bundle/"]),
            Language::Php => entries.extend_from_slice(&["vendor/"]),
            Language::Ffi => entries.push("*.h.bak"),
            Language::Go => entries.push("*.test"),
            Language::Java => entries.extend_from_slice(&["target/", "*.class"]),
            Language::Csharp => entries.extend_from_slice(&["bin/", "obj/", "*.nupkg"]),
            Language::Wasm => {}
            // Any other language contributes no patterns.
            _ => {}
        }
    }

    // Keep only patterns that are neither in the file nor already queued, so a
    // language listed twice cannot append the same line twice.
    let mut to_add: Vec<&str> = Vec::new();
    for entry in &entries {
        if !existing_lines.contains(entry) && !to_add.contains(entry) {
            to_add.push(*entry);
        }
    }
    if to_add.is_empty() {
        return;
    }

    // Start the additions on their own line when the file lacks a trailing newline.
    let separator = if existing.is_empty() || existing.ends_with('\n') {
        ""
    } else {
        "\n"
    };
    let additions = to_add.join("\n");
    let new_content = format!("{existing}{separator}{additions}\n");
    if let Err(e) = std::fs::write(&gitignore_path, new_content) {
        debug!("Could not update .gitignore: {e}");
    } else {
        debug!("Updated .gitignore with {} entries", to_add.len());
    }
}
/// Produces the API surface for `config`, reusing the cached IR when possible.
///
/// Steps, in order:
/// 1. best-effort `.gitignore` maintenance next to `config_path`;
/// 2. validation of the crate name used for the cache and hashing of `config.sources`;
/// 3. unless `clean` is set, return of the cached IR when the cache reports a hit for
///    that hash;
/// 4. otherwise raw extraction followed by filtering, opaque-type injection, cfg-field
///    stripping, unknown-type sanitizing, path mapping and de-duplication, after which
///    the result is written back to the cache.
///
/// # Errors
///
/// Fails when the crate name is rejected, the sources cannot be hashed, the cached IR
/// cannot be read, extraction fails, or the cache cannot be written.
pub fn extract(config: &ResolvedCrateConfig, config_path: &Path, clean: bool) -> anyhow::Result<ApiSurface> {
    if let Some(project_dir) = config_path.parent() {
        ensure_gitignore(project_dir, config);
    }

    cache::validate_cache_crate_name(&config.name).context("invalid crate name for cache")?;
    let source_hash = cache::sources_hash(&config.sources).context("failed to compute sources hash")?;

    // `clean` forces a fresh extraction even when a cache entry exists.
    let cache_hit = !clean && cache::is_ir_cached(&config.name, &source_hash);
    if cache_hit {
        info!("Using cached IR");
        return cache::read_cached_ir(&config.name).context("failed to read cached IR");
    }

    // The order of these passes is significant; each one works on the previous result.
    let mut api = apply_filters(extract_raw(config, config_path)?, config);
    inject_declared_opaque_types(&mut api, config);
    strip_cfg_fields(&mut api, &config.features);
    sanitize_unknown_types(&mut api);
    apply_path_mappings(&mut api, config);
    dedup_api_surface(&mut api);

    cache::write_ir_cache(&config.name, &api, &source_hash).context("failed to write IR cache")?;

    let (type_count, function_count, enum_count) = (api.types.len(), api.functions.len(), api.enums.len());
    info!(
        "Extracted {} types, {} functions, {} enums",
        type_count, function_count, enum_count
    );
    Ok(api)
}
/// Runs the extractor over every source file and merges the per-crate results.
///
/// Sources are grouped by crate name: from `config.source_crates` when that list is
/// non-empty (hyphens in the names replaced by underscores), otherwise by deriving a
/// name from each path in `config.sources`. Groups are processed in sorted crate-name
/// order. The merged surface carries `config.name` as its crate name and the version
/// read from `config.version_from`. `_config_path` is currently unused.
///
/// # Errors
///
/// Fails when the version cannot be read or when extraction of any crate fails.
fn extract_raw(config: &ResolvedCrateConfig, _config_path: &Path) -> anyhow::Result<ApiSurface> {
    use std::collections::BTreeMap;

    info!("Extracting API surface from Rust source...");
    let version = read_version(&config.version_from)?;
    let workspace_root = config.workspace_root.as_deref();
    let default_name = &config.name;

    // A BTreeMap keeps the crate iteration order stable between runs.
    let mut groups: BTreeMap<String, Vec<&Path>> = BTreeMap::new();
    if config.source_crates.is_empty() {
        for source in &config.sources {
            let crate_name = derive_crate_name_from_path(source, default_name);
            groups.entry(crate_name).or_default().push(source.as_path());
        }
    } else {
        for source_crate in &config.source_crates {
            let crate_name = source_crate.name.replace('-', "_");
            // Entries are created per source, so a crate without sources adds no group.
            for source in &source_crate.sources {
                groups.entry(crate_name.clone()).or_default().push(source.as_path());
            }
        }
    }

    let mut merged = ApiSurface {
        crate_name: default_name.to_string(),
        version: version.clone(),
        types: Vec::new(),
        functions: Vec::new(),
        enums: Vec::new(),
        errors: Vec::new(),
    };
    for (crate_name, sources) in &groups {
        let extracted = alef_extract::extractor::extract(sources, crate_name, &version, workspace_root)
            .with_context(|| format!("failed to extract API surface from crate {crate_name}"))?;
        let ApiSurface {
            types,
            functions,
            enums,
            errors,
            ..
        } = extracted;
        merged.types.extend(types);
        merged.functions.extend(functions);
        merged.enums.extend(enums);
        merged.errors.extend(errors);
    }
    Ok(merged)
}
/// Derives a crate name from a source path shaped like `.../crates/<name>/src/...`.
///
/// Takes the path component that follows the first `crates/` and accepts it only when
/// the path also contains `crates/<name>/src/`. Hyphens in the name are replaced by
/// underscores. Returns `default` when the path does not have that shape, and also
/// when the component is empty (e.g. `crates//src/lib.rs`), so that an empty crate
/// name is never produced.
fn derive_crate_name_from_path(path: &Path, default: &str) -> String {
    let path_str = path.to_string_lossy();
    let candidate = path_str
        .split("crates/")
        .nth(1)
        .and_then(|after_crates| after_crates.split('/').next())
        // An empty component would otherwise match `crates//src/` and yield "".
        .filter(|name| !name.is_empty());

    match candidate {
        Some(name) if path_str.contains(&format!("crates/{name}/src/")) => name.replace('-', "_"),
        _ => default.to_string(),
    }
}
fn inject_declared_opaque_types(api: &mut ApiSurface, config: &ResolvedCrateConfig) {
let mut sorted_opaques: Vec<_> = config.opaque_types.iter().collect();
sorted_opaques.sort_by_key(|(name, _)| (*name).clone());
for (name, rust_path) in sorted_opaques {
if !api.types.iter().any(|t| t.name == *name) && !api.enums.iter().any(|e| e.name == *name) {
api.types.push(alef_core::ir::TypeDef {
name: name.clone(),
rust_path: rust_path.clone(),
original_rust_path: rust_path.clone(),
fields: vec![],
methods: vec![],
is_opaque: true,
is_clone: false,
is_copy: false,
is_trait: false,
has_default: false,
has_stripped_cfg_fields: false,
is_return_type: false,
doc: String::new(),
cfg: None,
serde_rename_all: None,
has_serde: false,
super_traits: vec![],
});
debug!("Injected declared opaque type: {name} -> {rust_path}");
}
}
}
fn sanitize_unknown_types(api: &mut ApiSurface) {
let known_types: AHashSet<String> = api.types.iter().map(|t| t.name.clone()).collect();
let known_enums: AHashSet<String> = api.enums.iter().map(|e| e.name.clone()).collect();
let known_type_paths: AHashSet<String> = api.types.iter().map(|t| t.rust_path.replace('-', "_")).collect();
let known_enum_paths: AHashSet<String> = api.enums.iter().map(|e| e.rust_path.replace('-', "_")).collect();
for typ in &mut api.types {
for field in &mut typ.fields {
if sanitize_type_ref(&mut field.ty, &known_types, &known_enums) {
field.sanitized = true;
}
if !field.sanitized {
if let Some(ref path) = field.type_rust_path {
let normalized_path = path.replace('-', "_");
if let TypeRef::Named(ref name) = field.ty {
if known_types.contains(name.as_str()) || known_enums.contains(name.as_str()) {
let path_type_name = normalized_path.rsplit("::").next().unwrap_or("");
let path_matches = known_type_paths
.iter()
.chain(known_enum_paths.iter())
.any(|kp| kp.rsplit("::").next().unwrap_or("") == path_type_name);
if !path_matches {
field.ty = TypeRef::String;
field.sanitized = true;
}
}
}
if let TypeRef::Vec(ref inner) = field.ty {
if let TypeRef::Named(ref name) = **inner {
let vec_path_type = normalized_path.rsplit("::").next().unwrap_or("");
let vec_matches = known_type_paths
.iter()
.chain(known_enum_paths.iter())
.any(|kp| kp.rsplit("::").next().unwrap_or("") == vec_path_type);
if (known_types.contains(name.as_str()) || known_enums.contains(name.as_str()))
&& !vec_matches
{
field.ty = TypeRef::String;
field.sanitized = true;
}
}
}
}
}
}
let type_name = typ.name.clone();
for method in &mut typ.methods {
let mut method_sanitized = false;
for param in &mut method.params {
if sanitize_type_ref(&mut param.ty, &known_types, &known_enums) {
param.sanitized = true;
method_sanitized = true;
}
}
let is_self_return = matches!(&method.return_type, TypeRef::Named(n) if n == &type_name);
if !is_self_return && sanitize_type_ref(&mut method.return_type, &known_types, &known_enums) {
method_sanitized = true;
}
if method_sanitized {
method.sanitized = true;
}
}
}
for func in &mut api.functions {
let mut func_sanitized = false;
for param in &mut func.params {
if sanitize_type_ref(&mut param.ty, &known_types, &known_enums) {
param.sanitized = true;
func_sanitized = true;
}
}
if sanitize_type_ref(&mut func.return_type, &known_types, &known_enums) {
func_sanitized = true;
func.return_sanitized = true;
}
if func_sanitized {
func.sanitized = true;
}
}
for enum_def in &mut api.enums {
for variant in &mut enum_def.variants {
for field in &mut variant.fields {
if sanitize_type_ref(&mut field.ty, &known_types, &known_enums) {
field.sanitized = true;
}
}
}
}
for error_def in &mut api.errors {
for variant in &mut error_def.variants {
for field in &mut variant.fields {
if sanitize_type_ref(&mut field.ty, &known_types, &known_enums) {
field.sanitized = true;
}
}
}
}
}
/// Rewrites `ty` in place so it no longer names a type outside `known_types` and
/// `known_enums`, and reports whether the caller should treat it as sanitized.
///
/// * An unknown `Named` type becomes a `Vec` of a primitive when its name parses as a
///   homogeneous primitive tuple, otherwise `String`; the result is `true`.
/// * `Optional` and `Vec` sanitize their inner type and forward its result.
/// * `Map` sanitizes both key and value but always reports `false`, so a map whose
///   key or value was rewritten is not flagged as sanitized.
/// * Everything else is untouched and reports `false`.
fn sanitize_type_ref(ty: &mut TypeRef, known_types: &AHashSet<String>, known_enums: &AHashSet<String>) -> bool {
    match ty {
        TypeRef::Named(name)
            if !(known_types.contains(name.as_str()) || known_enums.contains(name.as_str())) =>
        {
            let replacement = match parse_homogeneous_tuple(name) {
                Some(elem_ty) => TypeRef::Vec(Box::new(elem_ty)),
                None => TypeRef::String,
            };
            *ty = replacement;
            true
        }
        TypeRef::Optional(inner) | TypeRef::Vec(inner) => sanitize_type_ref(inner, known_types, known_enums),
        TypeRef::Map(key, value) => {
            // Results are deliberately dropped: maps never report as sanitized.
            let _ = sanitize_type_ref(key, known_types, known_enums);
            let _ = sanitize_type_ref(value, known_types, known_enums);
            false
        }
        _ => false,
    }
}
/// Recognises a type name written as a parenthesised list of one repeated primitive,
/// such as `(u32, u32)`, and returns that primitive as a `TypeRef`.
///
/// Returns `None` when the name is not wrapped in parentheses, when the
/// comma-separated elements are not all identical after trimming, or when the element
/// is not one of the integer or float primitives listed below.
fn parse_homogeneous_tuple(name: &str) -> Option<TypeRef> {
    use alef_core::ir::PrimitiveType;

    let inner = name.trim().strip_prefix('(')?.strip_suffix(')')?;

    // `split` always yields at least one item, so `first` exists even for `()`.
    let mut elements = inner.split(',').map(str::trim);
    let first = elements.next()?;
    if elements.any(|other| other != first) {
        return None;
    }

    let prim = match first {
        "u8" => PrimitiveType::U8,
        "i8" => PrimitiveType::I8,
        "u16" => PrimitiveType::U16,
        "i16" => PrimitiveType::I16,
        "u32" => PrimitiveType::U32,
        "i32" => PrimitiveType::I32,
        "u64" => PrimitiveType::U64,
        "i64" => PrimitiveType::I64,
        "usize" => PrimitiveType::Usize,
        "isize" => PrimitiveType::Isize,
        "f32" => PrimitiveType::F32,
        "f64" => PrimitiveType::F64,
        _ => return None,
    };
    Some(TypeRef::Primitive(prim))
}
/// Drops struct fields whose `cfg` condition is not satisfied by `enabled_features`.
///
/// Fields without a condition are always kept. The `cfg` of every surviving field is
/// cleared, and a type that lost at least one field gets `has_stripped_cfg_fields`
/// set (the flag is never reset to `false` here).
fn strip_cfg_fields(api: &mut ApiSurface, enabled_features: &[String]) {
    for typ in api.types.iter_mut() {
        let fields_before = typ.fields.len();

        typ.fields.retain(|field| {
            field
                .cfg
                .as_ref()
                .map_or(true, |condition| cfg_condition_enabled(condition, enabled_features))
        });
        typ.fields.iter_mut().for_each(|field| field.cfg = None);

        // Only cfg-gated fields can be removed above, so a shorter list means some
        // gated field was stripped.
        if typ.fields.len() < fields_before {
            typ.has_stripped_cfg_fields = true;
        }
    }
}
/// Evaluates a `cfg` condition string against the enabled feature names.
///
/// Understands `feature = "name"` and the `any(...)`, `all(...)` and `not(...)`
/// combinators, nested arbitrarily. The input is trimmed and a space before an
/// opening parenthesis is removed first, so `any (..)` is treated like `any(..)`.
/// Any other condition evaluates to `false`.
fn cfg_condition_enabled(cfg_str: &str, enabled_features: &[String]) -> bool {
    let normalized = cfg_str.trim().replace(" (", "(");
    let condition = normalized.as_str();

    // The feature form is checked first so a feature name containing `(` is not
    // mistaken for a combinator.
    let feature = condition
        .strip_prefix("feature = \"")
        .and_then(|rest| rest.strip_suffix('"'));
    if let Some(feature) = feature {
        return enabled_features.iter().any(|enabled| enabled.as_str() == feature);
    }

    // `op(args)`: split at the first `(` and require a closing `)` at the very end.
    let combinator = condition
        .split_once('(')
        .and_then(|(op, rest)| rest.strip_suffix(')').map(|args| (op, args)));
    match combinator {
        Some(("any", args)) => parse_cfg_list(args)
            .iter()
            .any(|cond| cfg_condition_enabled(cond, enabled_features)),
        Some(("all", args)) => parse_cfg_list(args)
            .iter()
            .all(|cond| cfg_condition_enabled(cond, enabled_features)),
        Some(("not", args)) => !cfg_condition_enabled(args.trim(), enabled_features),
        _ => false,
    }
}
/// Splits a comma-separated list of `cfg` conditions at top-level commas only.
///
/// Commas nested inside parentheses do not split. Every item is trimmed and empty
/// items are dropped. An unmatched `)` does not push the nesting depth below zero.
fn parse_cfg_list(s: &str) -> Vec<String> {
    let mut segments: Vec<&str> = Vec::new();
    let mut nesting = 0usize;
    let mut segment_start = 0usize;

    for (pos, ch) in s.char_indices() {
        match ch {
            '(' => nesting += 1,
            ')' => nesting = nesting.saturating_sub(1),
            ',' if nesting == 0 => {
                segments.push(&s[segment_start..pos]);
                // `,` is one byte wide, so the next segment starts right after it.
                segment_start = pos + 1;
            }
            _ => {}
        }
    }
    segments.push(&s[segment_start..]);

    segments
        .into_iter()
        .map(str::trim)
        .filter(|segment| !segment.is_empty())
        .map(str::to_string)
        .collect()
}
fn dedup_api_surface(api: &mut ApiSurface) {
let enum_names: AHashSet<String> = api.enums.iter().map(|e| e.name.clone()).collect();
api.types.retain(|t| !enum_names.contains(&t.name));
let error_names: AHashSet<String> = api.errors.iter().map(|e| e.name.clone()).collect();
api.types.retain(|t| !error_names.contains(&t.name));
{
let mut best: AHashMap<String, usize> = AHashMap::new();
for (i, t) in api.types.iter().enumerate() {
best.entry(t.name.clone())
.and_modify(|prev_i| {
if api.types[i].rust_path.len() < api.types[*prev_i].rust_path.len() {
*prev_i = i;
}
})
.or_insert(i);
}
let keep: AHashSet<usize> = best.values().copied().collect();
let mut idx = 0;
api.types.retain(|_| {
let k = keep.contains(&idx);
idx += 1;
k
});
}
{
let mut best: AHashMap<String, usize> = AHashMap::new();
for (i, e) in api.enums.iter().enumerate() {
best.entry(e.name.clone())
.and_modify(|prev_i| {
if api.enums[i].rust_path.len() < api.enums[*prev_i].rust_path.len() {
*prev_i = i;
}
})
.or_insert(i);
}
let keep: AHashSet<usize> = best.values().copied().collect();
let mut idx = 0;
api.enums.retain(|_| {
let k = keep.contains(&idx);
idx += 1;
k
});
}
{
let mut best: AHashMap<String, usize> = AHashMap::new();
for (i, f) in api.functions.iter().enumerate() {
best.entry(f.name.clone())
.and_modify(|prev_i| {
if api.functions[i].rust_path.len() < api.functions[*prev_i].rust_path.len() {
*prev_i = i;
}
})
.or_insert(i);
}
let keep: AHashSet<usize> = best.values().copied().collect();
let mut idx = 0;
api.functions.retain(|_| {
let k = keep.contains(&idx);
idx += 1;
k
});
}
let mut seen_errors: AHashSet<String> = AHashSet::new();
api.errors.retain(|e| seen_errors.insert(e.name.clone()));
}
/// Tells whether a type is matched by any entry of `exclude_list`.
///
/// An entry containing `::` is a qualified path and must equal `rust_path`; hyphens
/// and underscores are treated as the same character on both sides, so a crate may be
/// written as `my-crate` or `my_crate` in either place. Any other entry is a plain
/// name and must equal `name` exactly.
fn is_type_excluded(name: &str, rust_path: &str, exclude_list: &[String]) -> bool {
    // Byte-wise comparison is safe here: `-` and `_` are both single-byte ASCII.
    let normalise = |byte: u8| if byte == b'-' { b'_' } else { byte };
    let same_path = |entry: &str| entry.bytes().map(normalise).eq(rust_path.bytes().map(normalise));

    exclude_list.iter().any(|entry| {
        if entry.contains("::") {
            same_path(entry.as_str())
        } else {
            entry.as_str() == name
        }
    })
}
/// Applies the include and exclude lists of `config` to the extracted surface.
///
/// * A non-empty `include.types` keeps only the listed types and enums plus whatever
///   they reference, as computed by `expand_include_list`.
/// * A non-empty `include.functions` keeps only the listed functions.
/// * `exclude.types` removes matching types, enums and errors; `exclude.functions`
///   removes functions by name.
/// * `exclude.methods` removes methods, addressed as `TypeName.method_name`.
fn apply_filters(mut api: ApiSurface, config: &ResolvedCrateConfig) -> ApiSurface {
    let (include, exclude) = (&config.include, &config.exclude);

    // Allow-lists first; an empty list means "no restriction".
    if !include.types.is_empty() {
        let wanted = expand_include_list(&api, &include.types);
        api.types.retain(|t| wanted.contains(&t.name));
        api.enums.retain(|e| wanted.contains(&e.name));
    }
    if !include.functions.is_empty() {
        api.functions.retain(|f| include.functions.contains(&f.name));
    }

    // Deny-lists are always applied.
    api.types
        .retain(|t| !is_type_excluded(&t.name, &t.rust_path, &exclude.types));
    api.enums
        .retain(|e| !is_type_excluded(&e.name, &e.rust_path, &exclude.types));
    api.errors
        .retain(|e| !is_type_excluded(&e.name, &e.rust_path, &exclude.types));
    api.functions.retain(|f| !exclude.functions.contains(&f.name));

    if exclude.methods.is_empty() {
        return api;
    }
    for typ in &mut api.types {
        let owner = &typ.name;
        typ.methods
            .retain(|m| !exclude.methods.contains(&format!("{}.{}", owner, m.name)));
    }
    api
}
/// Expands an include list to every type and enum it transitively references.
///
/// Starts from `include_types` and repeatedly adds the named types found in the
/// fields, method return types and method parameters of the types collected so far,
/// until a pass adds nothing new. Only names that exist in `api.types` or `api.enums`
/// are added; the originally listed names are returned whether or not they exist.
fn expand_include_list(api: &ApiSurface, include_types: &[String]) -> AHashSet<String> {
    let all_types: AHashMap<String, &TypeDef> = api.types.iter().map(|t| (t.name.clone(), t)).collect();
    let all_enums: AHashSet<String> = api.enums.iter().map(|e| e.name.clone()).collect();

    let mut needed: AHashSet<String> = include_types.iter().cloned().collect();
    let mut changed = true;
    while changed {
        changed = false;
        // Snapshot the set: `needed` grows while the types are being walked.
        let current: Vec<String> = needed.iter().cloned().collect();
        for type_name in &current {
            if let Some(typ) = all_types.get(type_name) {
                for field in &typ.fields {
                    collect_named_types(&field.ty, &mut needed, &all_types, &all_enums, &mut changed);
                }
                for method in &typ.methods {
                    collect_named_types(&method.return_type, &mut needed, &all_types, &all_enums, &mut changed);
                    for param in &method.params {
                        collect_named_types(&param.ty, &mut needed, &all_types, &all_enums, &mut changed);
                    }
                }
            }
        }
    }
    needed
}
/// Adds every known named type reachable inside `ty` to `needed`.
///
/// Looks through `Optional`, `Vec` and `Map` wrappers. A name is added only when it
/// exists in `all_types` or `all_enums`; `changed` is set to `true` whenever a name
/// that was not yet in `needed` gets added, and is never reset here.
fn collect_named_types(
    ty: &TypeRef,
    needed: &mut AHashSet<String>,
    all_types: &AHashMap<String, &TypeDef>,
    all_enums: &AHashSet<String>,
    changed: &mut bool,
) {
    // Explicit work list instead of recursion over the nested type.
    let mut pending: Vec<&TypeRef> = vec![ty];
    while let Some(current) = pending.pop() {
        match current {
            TypeRef::Named(name) => {
                let is_local = all_types.contains_key(name) || all_enums.contains(name);
                if is_local && needed.insert(name.clone()) {
                    *changed = true;
                }
            }
            TypeRef::Optional(inner) | TypeRef::Vec(inner) => pending.push(&**inner),
            TypeRef::Map(key, value) => {
                pending.push(&**value);
                pending.push(&**key);
            }
            _ => {}
        }
    }
}
/// Rewrites the start of `path` according to `mappings` (prefix -> replacement).
///
/// When several prefixes match, the longest one is used. The remainder of the path is
/// appended to the replacement unchanged. A path matched by no prefix is returned
/// as-is.
fn rewrite_path(path: &str, mappings: &HashMap<String, String>) -> String {
    mappings
        .iter()
        .filter_map(|(from, to)| path.strip_prefix(from.as_str()).map(|rest| (from.len(), to, rest)))
        // Two different prefixes of the same path never share a length, so the
        // maximum is unique.
        .max_by_key(|&(prefix_len, _, _)| prefix_len)
        .map_or_else(|| path.to_string(), |(_, to, rest)| format!("{}{}", to, rest))
}
/// Rewrites the `rust_path` of every type, function, enum and error through the
/// effective path mappings of `config`.
///
/// Does nothing when there are no mappings. Before an item's path is rewritten, its
/// current path is stored in `original_rust_path` unless that field is already
/// filled. The `type_rust_path` of struct fields is rewritten as well.
fn apply_path_mappings(api: &mut ApiSurface, config: &ResolvedCrateConfig) {
    let mappings = config.effective_path_mappings();
    if mappings.is_empty() {
        return;
    }

    for typ in api.types.iter_mut() {
        if typ.original_rust_path.is_empty() {
            typ.original_rust_path.clone_from(&typ.rust_path);
        }
        typ.rust_path = rewrite_path(&typ.rust_path, &mappings);

        let field_paths = typ.fields.iter_mut().filter_map(|field| field.type_rust_path.as_mut());
        for path in field_paths {
            *path = rewrite_path(path, &mappings);
        }
    }

    for func in api.functions.iter_mut() {
        if func.original_rust_path.is_empty() {
            func.original_rust_path.clone_from(&func.rust_path);
        }
        func.rust_path = rewrite_path(&func.rust_path, &mappings);
    }

    for enum_def in api.enums.iter_mut() {
        if enum_def.original_rust_path.is_empty() {
            enum_def.original_rust_path.clone_from(&enum_def.rust_path);
        }
        enum_def.rust_path = rewrite_path(&enum_def.rust_path, &mappings);
    }

    for error_def in api.errors.iter_mut() {
        if error_def.original_rust_path.is_empty() {
            error_def.original_rust_path.clone_from(&error_def.rust_path);
        }
        error_def.rust_path = rewrite_path(&error_def.rust_path, &mappings);
    }
}
// Unit tests for `sanitize_type_ref` and `is_type_excluded`.
#[cfg(test)]
mod tests {
use super::*;
// A map whose key is an unknown named type ("str") must keep its map shape: the key
// is rewritten to `String`, the value is untouched, and the call reports `false`.
#[test]
fn sanitize_map_with_cow_key_preserves_map_structure_and_returns_not_sanitized() {
let known_types = AHashSet::default();
let known_enums = AHashSet::default();
let mut ty = TypeRef::Map(Box::new(TypeRef::Named("str".into())), Box::new(TypeRef::Json));
let sanitized = sanitize_type_ref(&mut ty, &known_types, &known_enums);
assert!(
matches!(&ty, TypeRef::Map(k, v)
if matches!(k.as_ref(), TypeRef::String)
&& matches!(v.as_ref(), TypeRef::Json)),
"expected Map(String, Json) but got {ty:?}"
);
assert!(
!sanitized,
"sanitize_type_ref returned sanitized=true for Map — this triggers the Debug-format fallback"
);
// Explicitly discards the set; this has no effect on the assertions.
let _ = known_types;
// Second case: a map that is already clean must be left exactly as it is.
let mut ty2 = TypeRef::Map(Box::new(TypeRef::String), Box::new(TypeRef::Json));
let sanitized2 = sanitize_type_ref(&mut ty2, &AHashSet::default(), &AHashSet::default());
assert!(!sanitized2, "Map(String, Json) should not be sanitized");
assert!(
matches!(&ty2, TypeRef::Map(k, v)
if matches!(k.as_ref(), TypeRef::String)
&& matches!(v.as_ref(), TypeRef::Json)),
"Map(String, Json) must not be mutated when already clean"
);
}
// A map of two `String`s is unchanged and reports `false`.
#[test]
fn sanitize_map_with_both_string_types_returns_not_sanitized() {
let mut ty = TypeRef::Map(Box::new(TypeRef::String), Box::new(TypeRef::String));
let sanitized = sanitize_type_ref(&mut ty, &AHashSet::default(), &AHashSet::default());
assert!(!sanitized);
assert!(matches!(
&ty,
TypeRef::Map(k, v)
if matches!(k.as_ref(), TypeRef::String) && matches!(v.as_ref(), TypeRef::String)
));
}
// An unknown named type is replaced by `String` and reports `true`.
#[test]
fn sanitize_named_unknown_type_returns_sanitized_true() {
let mut ty = TypeRef::Named("UnknownForeignType".into());
let sanitized = sanitize_type_ref(&mut ty, &AHashSet::default(), &AHashSet::default());
assert!(sanitized);
assert!(matches!(ty, TypeRef::String));
}
// Unlike `Map`, a `Vec` forwards the result of its element: the element becomes
// `String` and the call reports `true`.
#[test]
fn sanitize_vec_with_unknown_named_returns_sanitized_true() {
let mut ty = TypeRef::Vec(Box::new(TypeRef::Named("MyForeignStruct".into())));
let sanitized = sanitize_type_ref(&mut ty, &AHashSet::default(), &AHashSet::default());
assert!(sanitized);
assert!(matches!(
&ty,
TypeRef::Vec(inner) if matches!(inner.as_ref(), TypeRef::String)
));
}
// An exclude entry without `::` is compared against the type's short name.
#[test]
fn is_type_excluded_plain_entry_matches_by_name() {
let exclude = vec!["OutputFormat".to_string()];
assert!(
is_type_excluded("OutputFormat", "kreuzberg::types::OutputFormat", &exclude),
"plain entry must match when name matches"
);
assert!(
!is_type_excluded("SomethingElse", "kreuzberg::types::SomethingElse", &exclude),
"plain entry must not match when name differs"
);
}
// An exclude entry containing `::` is compared against the full `rust_path`, so two
// types with the same short name in different modules can be told apart.
#[test]
fn is_type_excluded_qualified_entry_matches_rust_path_not_name() {
let exclude = vec!["kreuzberg::core::config::formats::OutputFormat".to_string()];
assert!(
is_type_excluded(
"OutputFormat",
"kreuzberg::core::config::formats::OutputFormat",
&exclude
),
"qualified entry must match the exact rust_path"
);
assert!(
!is_type_excluded("OutputFormat", "kreuzberg::types::OutputFormat", &exclude),
"qualified entry must NOT match a different rust_path with the same short name"
);
}
// A hyphenated crate name in `rust_path` matches an entry written with underscores.
#[test]
fn is_type_excluded_normalises_hyphens_in_rust_path() {
let exclude = vec!["my_crate::some_module::Foo".to_string()];
assert!(
is_type_excluded("Foo", "my-crate::some_module::Foo", &exclude),
"hyphens in rust_path should be normalised to underscores"
);
}
}