fn extract_max_data_end(header_json: &serde_json::Value) -> u64 {
let Some(obj) = header_json.as_object() else {
return 0;
};
obj.iter()
.filter(|(k, _)| *k != "__metadata__")
.filter_map(|(_, v)| {
v.get("data_offsets")
.and_then(|d| d.as_array())
.and_then(|arr| arr.get(1))
.and_then(|e| e.as_u64())
})
.max()
.unwrap_or(0)
}
impl FormatType {
    /// Infer the format from the file extension (case-insensitive).
    ///
    /// # Errors
    ///
    /// Returns [`AprenderError::FormatError`] when the path has no usable
    /// extension (with a directory-specific hint when `path` is a directory
    /// that contains a conventionally named model file) or when the extension
    /// is not one of `gguf`, `safetensors`, or `apr`.
    pub fn from_extension(path: &Path) -> Result<Self> {
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .ok_or_else(|| {
                // No extension at all: build the most helpful diagnostic we can.
                let message = if path.is_dir() {
                    // The user pointed at a directory; see whether it contains
                    // a well-known model filename we can suggest instead.
                    let candidates = ["model.gguf", "model.apr", "model.safetensors"];
                    let found: Vec<_> = candidates
                        .iter()
                        .filter(|f| path.join(f).exists())
                        .collect();
                    if found.is_empty() {
                        format!(
                            "Path '{}' is a directory, not a model file. \
                            Expected a file with .gguf, .apr, or .safetensors extension.",
                            path.display()
                        )
                    } else {
                        format!(
                            "Path '{}' is a directory. Did you mean '{}'?",
                            path.display(),
                            path.join(found[0]).display()
                        )
                    }
                } else {
                    format!(
                        "No file extension found in '{}'. Expected .gguf, .apr, or .safetensors",
                        path.display()
                    )
                };
                AprenderError::FormatError { message }
            })?
            .to_lowercase();
        match ext.as_str() {
            "gguf" => Ok(Self::Gguf),
            "safetensors" => Ok(Self::SafeTensors),
            "apr" => Ok(Self::Apr),
            _ => Err(AprenderError::FormatError {
                message: format!("Unknown format extension: .{ext}"),
            }),
        }
    }

    /// Detect the format by sniffing the file's leading bytes.
    ///
    /// Detection order:
    /// 1. GGUF: ASCII `GGUF` magic in the first 4 bytes.
    /// 2. SafeTensors: the first 8 bytes read as a little-endian `u64` header
    ///    length (sanity-capped below 100 MB), immediately followed by `{"`.
    ///    The JSON header is then parsed (best effort) to verify the file is
    ///    large enough for the tensor data it declares.
    /// 3. APR: ASCII `APR` magic in the first 3 bytes.
    ///
    /// # Errors
    ///
    /// Returns [`AprenderError::FormatError`] when the file cannot be opened
    /// or read, when a SafeTensors file is truncated, or when no known magic
    /// matches.
    pub fn from_magic(path: &Path) -> Result<Self> {
        use std::fs::File;
        use std::io::{Read, Seek, SeekFrom};
        let mut file = File::open(path).map_err(|e| AprenderError::FormatError {
            message: format!("Cannot open file: {e}"),
        })?;
        let mut magic = [0u8; 8];
        file.read_exact(&mut magic)
            .map_err(|e| AprenderError::FormatError {
                message: format!("Cannot read magic bytes: {e}"),
            })?;
        if magic.get(0..4) == Some(b"GGUF") {
            return Ok(Self::Gguf);
        }
        // SafeTensors layout: [8-byte LE header length][JSON header][tensor data].
        let header_len = u64::from_le_bytes(magic);
        // Reject absurd header lengths so random leading bytes are not misread
        // as a multi-gigabyte header; the cap also keeps the `as usize` cast
        // below lossless on 32-bit targets.
        if header_len < 100_000_000 {
            let mut json_start = [0u8; 2];
            if file.read_exact(&mut json_start).is_ok() && &json_start == b"{\"" {
                let file_size = std::fs::metadata(path).map(|m| m.len()).unwrap_or(0);
                // Minimum plausible size: the length prefix plus the full JSON
                // header. This is also the offset where tensor data begins.
                let min_size = 8 + header_len;
                if file_size < min_size {
                    return Err(AprenderError::FormatError {
                        message: format!(
                            "Truncated SafeTensors file: header declares {min_size} bytes but file is only {file_size} bytes"
                        ),
                    });
                }
                file.seek(SeekFrom::Start(8))
                    .map_err(|e| AprenderError::FormatError {
                        message: format!("Cannot seek in SafeTensors file: {e}"),
                    })?;
                let mut header_buf = vec![0u8; header_len as usize];
                file.read_exact(&mut header_buf)
                    .map_err(|e| AprenderError::FormatError {
                        message: format!("Cannot read SafeTensors header: {e}"),
                    })?;
                // Best effort: when the header parses, cross-check the declared
                // tensor extents against the real file size. A header that does
                // not parse still counts as SafeTensors; downstream loaders
                // will surface the parse error with more context.
                if let Ok(header_json) = serde_json::from_slice::<serde_json::Value>(&header_buf) {
                    let max_data_end = extract_max_data_end(&header_json);
                    let required_size = min_size + max_data_end;
                    if file_size < required_size {
                        let msg = format!(
                            "Truncated SafeTensors file: needs {required_size}B, got {file_size}B"
                        );
                        return Err(AprenderError::FormatError { message: msg });
                    }
                }
                return Ok(Self::SafeTensors);
            }
        }
        if magic.get(0..3) == Some(b"APR") {
            return Ok(Self::Apr);
        }
        Err(AprenderError::FormatError {
            message: "Unknown file format - magic bytes not recognized".to_string(),
        })
    }

    /// Canonical lowercase file extension for this format (no leading dot).
    #[must_use]
    pub const fn extension(&self) -> &'static str {
        match self {
            Self::Gguf => "gguf",
            Self::SafeTensors => "safetensors",
            Self::Apr => "apr",
        }
    }

    /// True for every pair of distinct formats; identity "conversions" are
    /// rejected. Written with `matches!` because `==` on enums is not callable
    /// inside a `const fn` on stable Rust.
    #[must_use]
    pub const fn can_convert_to(&self, target: Self) -> bool {
        !matches!(
            (self, target),
            (Self::Gguf, Self::Gguf)
                | (Self::SafeTensors, Self::SafeTensors)
                | (Self::Apr, Self::Apr)
        )
    }
}
impl fmt::Display for FormatType {
    /// Writes the human-readable format name (used in reports and CLI output).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let name = match self {
            Self::Gguf => "GGUF",
            Self::SafeTensors => "SafeTensors",
            Self::Apr => "APR",
        };
        f.write_str(name)
    }
}
/// A conversion route from `source` to `target`, optionally passing through
/// intermediate formats (e.g. GGUF → APR → SafeTensors).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ConversionPath {
    /// Starting format.
    pub source: FormatType,
    /// Final format. May equal `source` for round-trip paths.
    pub target: FormatType,
    /// Formats visited between source and target, in order; empty for a
    /// direct (single-hop) conversion.
    pub intermediates: Vec<FormatType>,
}
impl ConversionPath {
    /// A single-hop conversion with no intermediate formats.
    #[must_use]
    pub const fn direct(source: FormatType, target: FormatType) -> Self {
        Self {
            source,
            target,
            intermediates: Vec::new(),
        }
    }

    /// A multi-hop conversion passing through `intermediates` in order.
    #[must_use]
    pub fn chain(source: FormatType, intermediates: Vec<FormatType>, target: FormatType) -> Self {
        Self {
            source,
            target,
            intermediates,
        }
    }

    /// All formats visited, in order: source, intermediates, then target.
    /// Always contains at least two entries.
    #[must_use]
    pub fn steps(&self) -> Vec<FormatType> {
        // Preallocate: source + intermediates + target.
        let mut steps = Vec::with_capacity(self.intermediates.len() + 2);
        steps.push(self.source);
        steps.extend_from_slice(&self.intermediates);
        steps.push(self.target);
        steps
    }

    /// True when the path starts and ends at the same format via at least one
    /// intermediate hop (e.g. GGUF → APR → GGUF).
    #[must_use]
    pub fn is_roundtrip(&self) -> bool {
        self.source == self.target && !self.intermediates.is_empty()
    }

    /// True when any intermediate format appears more than once. The endpoints
    /// are deliberately excluded so a round-trip does not count as a cycle.
    /// (The interior of `steps()` is exactly `intermediates`, so we scan that
    /// directly instead of materializing the full step list.)
    #[must_use]
    pub fn has_cycle(&self) -> bool {
        let mut seen = std::collections::HashSet::new();
        self.intermediates.iter().any(|fmt| !seen.insert(fmt))
    }
}
impl fmt::Display for ConversionPath {
    /// Renders the path as an arrow-separated chain, e.g. `GGUF → APR → GGUF`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for (i, step) in self.steps().iter().enumerate() {
            if i > 0 {
                write!(f, " → ")?;
            }
            write!(f, "{step}")?;
        }
        Ok(())
    }
}
/// Description of a single tensor as reported by an inspection.
#[derive(Debug, Clone)]
pub struct TensorInfo {
    /// Tensor name as stored in the model file.
    pub name: String,
    /// Data type label as a format-specific string.
    pub dtype: String,
    /// Tensor dimensions.
    pub shape: Vec<usize>,
    /// Size of the tensor payload in bytes.
    pub size_bytes: usize,
    /// Summary statistics; `None` when stats were not computed.
    pub stats: Option<TensorStats>,
}
/// Summary statistics over a tensor's element values.
#[derive(Debug, Clone, Copy)]
pub struct TensorStats {
    /// Minimum element value.
    pub min: f32,
    /// Maximum element value.
    pub max: f32,
    /// Arithmetic mean of the elements.
    pub mean: f32,
    /// Standard deviation of the elements.
    pub std: f32,
}
/// Result of inspecting a model file; rendered for humans by its `Display` impl.
#[derive(Debug, Clone)]
pub struct InspectionReport {
    /// Detected container format.
    pub format: FormatType,
    /// Total file size in bytes.
    pub file_size: usize,
    /// Flat key/value metadata; `BTreeMap` keeps display order deterministic.
    pub metadata: BTreeMap<String, String>,
    /// Per-tensor descriptions.
    pub tensors: Vec<TensorInfo>,
    /// Total parameter count across all tensors.
    pub total_params: usize,
    /// Quantization scheme, when one was detected.
    pub quantization: Option<String>,
    /// Model architecture, when one was detected.
    pub architecture: Option<String>,
}
impl fmt::Display for InspectionReport {
    /// Renders a human-readable inspection summary: header fields, all
    /// metadata (values truncated to 60 bytes), and a windowed tensor listing
    /// (first 10 and last 2, with an elision marker in between).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "=== Rosetta Stone Inspection ===")?;
        writeln!(f, "Format: {}", self.format)?;
        writeln!(f, "File Size: {} bytes", self.file_size)?;
        writeln!(f, "Total Parameters: {}", self.total_params)?;
        if let Some(ref arch) = self.architecture {
            writeln!(f, "Architecture: {arch}")?;
        }
        if let Some(ref quant) = self.quantization {
            writeln!(f, "Quantization: {quant}")?;
        }
        writeln!(f, "\n--- Metadata ({} keys) ---", self.metadata.len())?;
        for (k, v) in &self.metadata {
            // Truncate long values at 60 bytes. `get(..60)` returns None when
            // byte 60 is not a UTF-8 char boundary; in that case we fall back
            // to printing the full value rather than slicing mid-character.
            let display_v = if v.len() > 60 {
                let truncated = match v.get(..60) {
                    Some(s) => s,
                    None => v.as_str(),
                };
                format!("{truncated}...")
            } else {
                v.clone()
            };
            writeln!(f, " {k}: {display_v}")?;
        }
        writeln!(f, "\n--- Tensors ({} total) ---", self.tensors.len())?;
        for (i, t) in self.tensors.iter().enumerate() {
            // Print the first 10 and the last 2 tensors. The `len - 2`
            // subtraction cannot underflow here: when len < 2 every index
            // satisfies `i < 10`, so `||` short-circuits first.
            if i < 10 || i >= self.tensors.len() - 2 {
                writeln!(
                    f,
                    " {}: {} {:?} ({} bytes)",
                    t.name, t.dtype, t.shape, t.size_bytes
                )?;
            } else if i == 10 {
                // 12 tensors are shown in total (10 head + 2 tail), hence the
                // `len - 12` elided count; this branch only runs when len > 12.
                writeln!(f, " ... ({} more tensors) ...", self.tensors.len() - 12)?;
            }
        }
        Ok(())
    }
}
/// Result of running a conversion, including before/after inspections.
#[derive(Debug, Clone)]
pub struct ConversionReport {
    /// The conversion route that was executed.
    pub path: ConversionPath,
    /// Inspection of the input file (before conversion).
    pub source_inspection: InspectionReport,
    /// Inspection of the output file (after conversion).
    pub target_inspection: InspectionReport,
    /// Non-fatal issues encountered along the way.
    pub warnings: Vec<String>,
    /// Wall-clock duration of the conversion, in milliseconds.
    pub duration_ms: u64,
    /// Names of tensors that were modified during conversion.
    pub modified_tensors: Vec<String>,
    /// Names of tensors that were dropped during conversion.
    pub dropped_tensors: Vec<String>,
}
/// Running accumulator for single-pass tensor statistics.
///
/// NOTE(review): consumers live in the `include!`d files below and are not
/// visible here. There is no element-count field, so the mean is presumably
/// derived from `sum` and a count tracked by the caller — confirm there.
struct TensorAccum {
    /// Smallest value observed so far.
    min: f32,
    /// Largest value observed so far.
    max: f32,
    /// Running sum, widened to f64 to limit accumulation error.
    sum: f64,
    /// Number of NaN values encountered.
    nan_count: usize,
    /// Number of infinite values encountered.
    inf_count: usize,
    /// Number of exact zeros encountered.
    zero_count: usize,
}
include!("check.rs");
include!("mod_include_02.rs");
include!("default_impl.rs");