floe-core 0.3.6

Core library for Floe, a YAML-driven technical ingestion tool.
Documentation
use std::path::{Component, Path, PathBuf};

/// Byte budget for a single generated archive file name; `archive_filename_for_run`
/// truncates the stem so that stem + run/source suffix + extension fit within it.
// NOTE(review): 255 presumably mirrors the common per-component filesystem limit — confirm.
const MAX_FILENAME_COMPONENT_BYTES: usize = 255;
/// Longest sanitized run id (in bytes) that is embedded verbatim in an archive file
/// name; longer ids are truncated to this length and given a short hash suffix.
const MAX_ARCHIVE_RUN_COMPONENT_BYTES: usize = 48;

/// Assembles an output file name as `<stem><suffix>.<extension>`.
///
/// Any leading dots on `extension` are stripped first, so `"csv"` and `".csv"`
/// give the same result. An empty `suffix` simply contributes nothing.
pub fn build_output_filename(stem: &str, suffix: &str, extension: &str) -> String {
    let bare_extension = extension.trim_start_matches('.');
    [stem, suffix, ".", bare_extension].concat()
}

/// Returns the stem for the `index`-th output part: `part-` followed by the
/// index zero-padded to at least five digits (wider indexes are not truncated).
pub fn build_part_stem(index: usize) -> String {
    format!("part-{:05}", index)
}

/// Resolves where an output file should be written on the local filesystem.
///
/// * If `base_path` has a file extension it is treated as the target file itself
///   and returned unchanged (`filename` is ignored).
/// * If `base_path` is empty, `filename` alone is returned.
/// * Otherwise `base_path` is treated as a directory and `filename` is joined onto it.
pub fn resolve_output_path(base_path: &str, filename: &str) -> PathBuf {
    let target = Path::new(base_path);
    match target.extension() {
        Some(_) => target.to_path_buf(),
        None if target.as_os_str().is_empty() => PathBuf::from(filename),
        None => target.join(filename),
    }
}

/// Joins `filename` onto `base_path`, always treating `base_path` as a directory
/// (even when it contains a dot). An empty `base_path` yields `filename` alone.
pub fn resolve_output_dir_path(base_path: &str, filename: &str) -> PathBuf {
    if base_path.is_empty() {
        return PathBuf::from(filename);
    }
    Path::new(base_path).join(filename)
}

/// Places `filename` next to the location described by `base_path`.
///
/// When `base_path` has a file extension it is treated as a file and `filename`
/// is joined onto its parent directory; otherwise `base_path` itself is used as
/// the directory. An empty `base_path` yields `filename` alone.
pub fn resolve_sibling_path(base_path: &str, filename: &str) -> PathBuf {
    let target = Path::new(base_path);
    let folder = match target.extension() {
        // File-like base: the sibling lives in the containing directory.
        Some(_) => target.parent().unwrap_or(target),
        // Directory-like (or empty) base: joining onto it is already correct.
        None => target,
    };
    folder.join(filename)
}

/// Normalizes a local path component by component, without canonicalizing it.
///
/// * `.` components are dropped.
/// * `..` removes the preceding component when that component is a normal name
///   and the path built so far is not a symlink on disk (checked through
///   `last_component_is_symlink`, so this function does touch the filesystem).
/// * A `..` that cannot be resolved that way is kept, except when `path` is
///   absolute and only the root has been accumulated so far, in which case it
///   is discarded.
///
/// If nothing remains, the result is the platform separator for an absolute
/// input and `"."` otherwise.
pub fn normalize_local_path(path: &Path) -> PathBuf {
    let mut normalized = PathBuf::new();

    for component in path.components() {
        match component {
            Component::Prefix(prefix) => normalized.push(prefix.as_os_str()),
            Component::RootDir => normalized.push(component.as_os_str()),
            // `.` never changes the location, so it is simply skipped.
            Component::CurDir => {}
            Component::ParentDir => {
                // Only a trailing normal name can be cancelled by `..`; a trailing
                // root, prefix or earlier `..` cannot.
                let can_pop = normalized
                    .components()
                    .next_back()
                    .is_some_and(|tail| matches!(tail, Component::Normal(_)));
                // Popping through a symlink would change which directory `..`
                // refers to, so in that case the `..` is kept literally.
                if can_pop && !last_component_is_symlink(&normalized) {
                    normalized.pop();
                } else if !path.is_absolute() || !normalized_has_root_only(&normalized) {
                    normalized.push("..");
                }
                // Otherwise: absolute path already at the root — `..` is dropped.
            }
            Component::Normal(segment) => normalized.push(segment),
        }
    }

    if normalized.as_os_str().is_empty() {
        // NOTE(review): the absolute branch looks like a defensive fallback, since an
        // absolute path pushes its root above — confirm whether it is reachable.
        if path.is_absolute() {
            PathBuf::from(std::path::MAIN_SEPARATOR.to_string())
        } else {
            PathBuf::from(".")
        }
    } else {
        normalized
    }
}

/// Reports whether `path` contains a root directory component and nothing
/// beyond it: no normal names and no `..`. Prefix and `.` components are
/// tolerated and ignored.
fn normalized_has_root_only(path: &Path) -> bool {
    let nothing_below_root = path.components().all(|part| {
        matches!(
            part,
            Component::Prefix(_) | Component::RootDir | Component::CurDir
        )
    });
    nothing_below_root
        && path
            .components()
            .any(|part| matches!(part, Component::RootDir))
}

/// Returns `true` when `path` itself (not its target) is a symbolic link on disk.
///
/// Any failure to read the metadata — for example because the path does not
/// exist — is treated as "not a symlink".
fn last_component_is_symlink(path: &Path) -> bool {
    match std::fs::symlink_metadata(path) {
        Ok(info) => info.file_type().is_symlink(),
        Err(_) => false,
    }
}

/// Object-key counterpart of `resolve_output_path`.
///
/// `base_key` is first stripped of leading/trailing `/`. If the result has a
/// file extension it is returned as the target key; if it is empty, `filename`
/// alone is returned; otherwise the two are combined as `<base>/<filename>`.
pub fn resolve_output_key(base_key: &str, filename: &str) -> String {
    let prefix = normalize_key(base_key);
    let names_a_file = Path::new(&prefix).extension().is_some();
    if names_a_file {
        return prefix;
    }
    match prefix.as_str() {
        "" => filename.to_owned(),
        folder => format!("{folder}/{filename}"),
    }
}

/// Builds `<base>/<filename>` where `<base>` is `base_key` stripped of
/// leading/trailing `/`; the base is always treated as a prefix, never as a
/// file. An empty base yields `filename` alone.
pub fn resolve_output_dir_key(base_key: &str, filename: &str) -> String {
    let prefix = normalize_key(base_key);
    match prefix.as_str() {
        "" => filename.to_owned(),
        folder => format!("{folder}/{filename}"),
    }
}

/// Object-key counterpart of `resolve_sibling_path`.
///
/// After stripping leading/trailing `/` from `base_key`, a key with a file
/// extension is replaced by the result of `parent_key`; a key without one is
/// used as the prefix directly. `filename` is then appended after a `/`, or
/// returned alone when the prefix is empty.
pub fn resolve_sibling_key(base_key: &str, filename: &str) -> String {
    let key = normalize_key(base_key);
    let names_a_file = Path::new(&key).extension().is_some();
    let folder = if names_a_file { parent_key(&key) } else { key };
    match folder.as_str() {
        "" => filename.to_owned(),
        prefix => format!("{prefix}/{filename}"),
    }
}

/// Returns the archive-relative location `<entity>/<name>` for a source file.
///
/// `<name>` is the final path component of `filename` (falling back to
/// `filename` as given when no such component can be extracted), and `entity`
/// has leading/trailing `/` removed. With an empty entity only `<name>` is returned.
pub fn archive_relative_path(entity: &str, filename: &str) -> String {
    let leaf = match Path::new(filename).file_name().and_then(|os| os.to_str()) {
        Some(leaf) => leaf,
        None => filename,
    };
    match entity.trim_matches('/') {
        "" => leaf.to_owned(),
        folder => format!("{folder}/{leaf}"),
    }
}

/// Like `archive_relative_path`, but the file name is the run-specific name
/// produced by `archive_filename_for_run` from `filename`, `run_id` and
/// `source_uri`. `entity` has leading/trailing `/` removed; when it is empty
/// only the file name is returned.
pub fn archive_relative_path_for_run(
    entity: &str,
    filename: &str,
    run_id: &str,
    source_uri: &str,
) -> String {
    let run_name = archive_filename_for_run(filename, run_id, source_uri);
    match entity.trim_matches('/') {
        "" => run_name,
        folder => format!("{folder}/{run_name}"),
    }
}

/// Builds a run-specific archive file name of the form
/// `<stem>__run-<run>__src-<hash><.ext>`.
///
/// * The final path component of `filename` is used as the base name (or
///   `filename` as given when none can be extracted).
/// * `<run>` comes from `compact_archive_run_component(run_id)` and `<hash>`
///   from `short_stable_hash_hex(source_uri)`.
/// * A non-empty extension is preserved at the end; when there is no (or an
///   empty) extension the whole base name acts as the stem.
/// * The stem is truncated on a character boundary so that the result fits in
///   `MAX_FILENAME_COMPONENT_BYTES` whenever the suffix and extension alone
///   leave room for it.
pub fn archive_filename_for_run(filename: &str, run_id: &str, source_uri: &str) -> String {
    let base_name = Path::new(filename)
        .file_name()
        .and_then(|os| os.to_str())
        .unwrap_or(filename);
    let base_path = Path::new(base_name);

    let run_tag = format!(
        "__run-{}__src-{}",
        compact_archive_run_component(run_id),
        short_stable_hash_hex(source_uri)
    );

    // Split into (untruncated stem, ".ext"); without a usable extension the
    // whole base name is the stem and nothing is appended after the tag.
    let (full_stem, dotted_ext) = match base_path.extension().and_then(|os| os.to_str()) {
        Some(ext) if !ext.is_empty() => {
            let stem_only = base_path
                .file_stem()
                .and_then(|os| os.to_str())
                .unwrap_or(base_name);
            (stem_only, format!(".{ext}"))
        }
        _ => (base_name, String::new()),
    };

    // Whatever the tag and extension do not consume is the stem's byte budget.
    let stem_budget =
        MAX_FILENAME_COMPONENT_BYTES.saturating_sub(run_tag.len() + dotted_ext.len());
    let short_stem = truncate_utf8_to_bytes(full_stem, stem_budget);

    format!("{short_stem}{run_tag}{dotted_ext}")
}

/// Resolves the local archive path for `filename` under `base_path`: the
/// `archive_relative_path` for (`entity`, `filename`) joined onto `base_path`
/// treated as a directory.
pub fn resolve_archive_path(base_path: &str, entity: &str, filename: &str) -> PathBuf {
    resolve_output_dir_path(base_path, &archive_relative_path(entity, filename))
}

/// Resolves the local, run-specific archive path for `filename` under
/// `base_path`: the `archive_relative_path_for_run` result joined onto
/// `base_path` treated as a directory.
pub fn resolve_archive_path_for_run(
    base_path: &str,
    entity: &str,
    filename: &str,
    run_id: &str,
    source_uri: &str,
) -> PathBuf {
    resolve_output_dir_path(
        base_path,
        &archive_relative_path_for_run(entity, filename, run_id, source_uri),
    )
}

/// Resolves the archive object key for `filename` under `base_key`: the
/// `archive_relative_path` for (`entity`, `filename`) appended to `base_key`
/// treated as a prefix.
pub fn resolve_archive_key(base_key: &str, entity: &str, filename: &str) -> String {
    resolve_output_dir_key(base_key, &archive_relative_path(entity, filename))
}

/// Resolves the run-specific archive object key for `filename` under
/// `base_key`: the `archive_relative_path_for_run` result appended to
/// `base_key` treated as a prefix.
pub fn resolve_archive_key_for_run(
    base_key: &str,
    entity: &str,
    filename: &str,
    run_id: &str,
    source_uri: &str,
) -> String {
    resolve_output_dir_key(
        base_key,
        &archive_relative_path_for_run(entity, filename, run_id, source_uri),
    )
}

/// Strips every leading and trailing `/` from `base_key`; interior slashes are
/// left untouched.
fn normalize_key(base_key: &str) -> String {
    let trimmed = base_key.trim_matches('/');
    String::from(trimmed)
}

/// Returns the parent prefix of an object key: everything before the last `/`.
///
/// A key with no `/` sits at the top level, so its parent is the empty prefix.
/// Returning the key itself in that case would make a sibling of `file.csv`
/// resolve to `file.csv/<name>` instead of `<name>`, which disagrees with how
/// `resolve_sibling_path` treats the same input on the local filesystem.
fn parent_key(base: &str) -> String {
    base.rsplit_once('/')
        .map(|(parent, _leaf)| parent.to_owned())
        .unwrap_or_default()
}

/// Makes `value` safe to embed in an archive file name.
///
/// Every character other than an ASCII letter, ASCII digit, `-`, `_` or `.` is
/// replaced with `_`, then leading and trailing underscores are removed. If
/// nothing is left, the placeholder `"run"` is returned.
fn sanitize_archive_component(value: &str) -> String {
    let replaced: String = value
        .chars()
        .map(|c| match c {
            'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.' => c,
            _ => '_',
        })
        .collect();
    match replaced.trim_matches('_') {
        "" => String::from("run"),
        kept => kept.to_owned(),
    }
}

/// Produces the run-id fragment used inside archive file names.
///
/// The id is sanitized with `sanitize_archive_component`. If the sanitized form
/// fits within `MAX_ARCHIVE_RUN_COMPONENT_BYTES` it is returned as is; otherwise
/// it is truncated to that many bytes and followed by `_` plus the first eight
/// hex digits of `short_stable_hash_hex` of the original `run_id`, so distinct
/// long ids remain distinguishable.
fn compact_archive_run_component(run_id: &str) -> String {
    let clean = sanitize_archive_component(run_id);
    if clean.len() > MAX_ARCHIVE_RUN_COMPONENT_BYTES {
        let head = truncate_utf8_to_bytes(&clean, MAX_ARCHIVE_RUN_COMPONENT_BYTES);
        let digest = short_stable_hash_hex(run_id);
        let tag = &digest[..8];
        format!("{head}_{tag}")
    } else {
        clean
    }
}

/// Returns the longest prefix of `value` that is at most `max_bytes` bytes long
/// and does not split a multi-byte character. Strings that already fit are
/// returned unchanged; the result may be empty.
fn truncate_utf8_to_bytes(value: &str, max_bytes: usize) -> String {
    if max_bytes >= value.len() {
        return value.to_owned();
    }
    // Walk back from the byte limit to the nearest character boundary; index 0
    // is always a boundary, so the search cannot fail.
    let cut = (0..=max_bytes)
        .rev()
        .find(|&idx| value.is_char_boundary(idx))
        .unwrap_or(0);
    value[..cut].to_owned()
}

/// Hashes `value` with 64-bit FNV-1a (XOR each byte in, then multiply by the
/// FNV prime with wrapping arithmetic) and returns the result as 16 lowercase,
/// zero-padded hex digits. The output depends only on the input bytes.
fn short_stable_hash_hex(value: &str) -> String {
    const OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const PRIME: u64 = 0x0100_0000_01b3;

    let digest = value
        .bytes()
        .fold(OFFSET_BASIS, |acc, byte| (acc ^ u64::from(byte)).wrapping_mul(PRIME));
    format!("{digest:016x}")
}