/// Assembles a tiny in-memory APR image and parses it with `AprModel::from_bytes`.
///
/// The image holds one tensor named `weight` (dtype `F32`, shape `[4]`, all
/// ones) preceded by a JSON metadata blob and a JSON tensor index. The tensor
/// index and the tensor data each start on a 64-byte boundary.
///
/// `_input_dim` is accepted but not used.
///
/// # Errors
///
/// Returns whatever error `AprModel::from_bytes` reports for the assembled bytes.
///
/// # Panics
///
/// `copy_from_slice` panics if `MAGIC` is not exactly four bytes long.
pub(crate) fn create_demo_apr_model(_input_dim: usize) -> Result<AprModel, RealizarError> {
    use crate::apr::TensorEntry;

    // Rounds a byte position up to the next multiple of 64.
    let align_64 = |pos: usize| -> u64 { (pos.div_ceil(64) * 64) as u64 };

    // --- Section contents -------------------------------------------------
    let metadata_json = r#"{"model_type":"demo","name":"demo-model"}"#;
    let meta_bytes = metadata_json.as_bytes();

    let weights = [1.0f32; 4];
    let payload: Vec<u8> = weights.iter().flat_map(|v| v.to_le_bytes()).collect();

    let entries: Vec<TensorEntry> = vec![TensorEntry {
        name: "weight".to_string(),
        dtype: "F32".to_string(),
        shape: vec![4],
        offset: 0,
        size: 16,
    }];
    // A serialization failure silently yields an empty index instead of an error.
    let index_json = serde_json::to_vec(&entries).unwrap_or_default();

    // --- Section placement ------------------------------------------------
    let meta_off = HEADER_SIZE as u64;
    let meta_len = meta_bytes.len() as u32;
    let meta_start = meta_off as usize;
    let meta_end = meta_start + meta_bytes.len();

    let index_off = align_64(meta_end);
    let index_start = index_off as usize;
    let index_end = index_start + index_json.len();

    let data_off = align_64(index_end);
    let data_start = data_off as usize;
    let data_end = data_start + payload.len();

    // --- Header -----------------------------------------------------------
    // NOTE(review): bytes 4..12 look like a 2.0 format version, a zero flags
    // word and a tensor count of 1 — confirm against the APR header definition.
    let mut image = vec![0u8; data_end];
    image[0..4].copy_from_slice(&MAGIC);
    image[4] = 2;
    image[5] = 0;
    image[6..8].copy_from_slice(&0u16.to_le_bytes());
    image[8..12].copy_from_slice(&1u32.to_le_bytes());
    image[12..20].copy_from_slice(&meta_off.to_le_bytes());
    image[20..24].copy_from_slice(&meta_len.to_le_bytes());
    image[24..32].copy_from_slice(&index_off.to_le_bytes());
    image[32..40].copy_from_slice(&data_off.to_le_bytes());

    // --- Body sections ----------------------------------------------------
    image[meta_start..meta_end].copy_from_slice(meta_bytes);
    image[index_start..index_end].copy_from_slice(&index_json);
    image[data_start..data_end].copy_from_slice(&payload);

    AprModel::from_bytes(image)
}
/// Request payload for a chat completion call.
///
/// NOTE(review): the field set looks modelled on the OpenAI chat-completions
/// request body — confirm against the handler that consumes this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionRequest {
/// Model identifier; required in the payload.
pub model: String,
/// Conversation messages; required in the payload.
pub messages: Vec<ChatMessage>,
/// Optional token limit; `None` when the key is absent.
#[serde(default)]
pub max_tokens: Option<usize>,
/// Optional temperature value; `None` when the key is absent.
#[serde(default)]
pub temperature: Option<f32>,
/// Optional `top_p` value; `None` when the key is absent.
#[serde(default)]
pub top_p: Option<f32>,
/// Falls back to `default_n()` (1) when the key is absent.
/// Presumably the number of completions requested — verify against the handler.
#[serde(default = "default_n")]
pub n: usize,
/// Streaming flag; `false` when the key is absent.
#[serde(default)]
pub stream: bool,
/// Optional list of stop strings; `None` when the key is absent.
#[serde(default)]
pub stop: Option<Vec<String>>,
/// Optional caller-supplied user string; `None` when the key is absent.
#[serde(default)]
pub user: Option<String>,
}
/// Serde fallback for `ChatCompletionRequest::n` when the payload omits it.
fn default_n() -> usize {
    const DEFAULT_N: usize = 1;
    DEFAULT_N
}
/// One message in a chat conversation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatMessage {
/// Role string of the message author; required. The streaming constructor in
/// this file emits `"assistant"`; other accepted values are not visible here.
pub role: String,
/// Message text; required.
pub content: String,
/// Optional name attached to the message; `None` when the key is absent.
#[serde(default)]
pub name: Option<String>,
}
/// Response payload for a non-streaming chat completion.
///
/// The three trace fields are left out of the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionResponse {
/// Identifier of this completion.
pub id: String,
/// Object-type tag string; the value is chosen by the code that builds the response.
pub object: String,
/// Creation timestamp. NOTE(review): presumably Unix seconds, as in
/// `ChatCompletionChunk` — confirm at the construction site.
pub created: i64,
/// Model identifier string.
pub model: String,
/// Generated choices.
pub choices: Vec<ChatChoice>,
/// Token accounting for the request.
pub usage: Usage,
/// Brick-level trace; omitted from the output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub brick_trace: Option<TraceData>,
/// Step-level trace; omitted from the output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub step_trace: Option<TraceData>,
/// Layer-level trace; omitted from the output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub layer_trace: Option<TraceData>,
}
/// Tag describing where the timing numbers in a `TraceData` came from.
///
/// Serialized in snake case: `"measured"`, `"wall_clock_total"`, `"estimated"`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum TraceProvenance {
/// NOTE(review): presumably per-operation timings that were actually instrumented — confirm.
Measured,
/// Used by `build_trace_data` for traces that carry only the whole-request latency.
WallClockTotal,
/// Default variant; also what a `TraceData` payload without a `provenance` key deserializes to.
#[default]
Estimated,
}
/// Trace payload attached to a completion response.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceData {
/// Trace granularity label; `build_trace_data` emits `"brick"`, `"step"` or `"layer"`.
pub level: String,
/// Operation count; `build_trace_data` stores the completion-token count
/// (brick/step) or the layer count (layer) here.
pub operations: usize,
/// Total time in microseconds.
pub total_time_us: u64,
/// Per-operation entries.
pub breakdown: Vec<TraceOperation>,
/// Origin of the timings; `TraceProvenance::Estimated` when the key is absent.
#[serde(default)]
pub provenance: TraceProvenance,
}
/// One entry in a `TraceData` breakdown.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TraceOperation {
/// Operation name.
pub name: String,
/// Time attributed to the operation, in microseconds.
pub time_us: u64,
/// Optional free-form description; omitted from the output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub details: Option<String>,
}
/// Builds the optional trace payloads for a finished inference request.
///
/// Returns a `(brick, step, layer)` tuple in which at most one slot is `Some`,
/// selected by `trace_level`:
///
/// * `Some("brick")` fills the first slot, with `operations = completion_tokens`.
/// * `Some("step")` fills the second slot, with `operations = completion_tokens`.
/// * `Some("layer")` fills the third slot, with `operations = num_layers`.
/// * Anything else (including `None`) yields `(None, None, None)`.
///
/// No per-operation timing is available here, so every trace carries a single
/// `total_inference` entry covering `latency_us` and is tagged
/// `TraceProvenance::WallClockTotal`. The `details` text points the reader at
/// `apr profile` for a real breakdown.
#[must_use]
pub fn build_trace_data(
    trace_level: Option<&str>,
    latency_us: u64,
    prompt_tokens: usize,
    completion_tokens: usize,
    num_layers: usize,
) -> (Option<TraceData>, Option<TraceData>, Option<TraceData>) {
    match trace_level {
        Some("brick") => {
            let details = format!(
                "{} prompt + {} completion tokens, {} layers. \
                 Per-op breakdown not available — use `apr profile` for real brick-level telemetry",
                prompt_tokens, completion_tokens, num_layers
            );
            let trace = wall_clock_trace("brick", completion_tokens, latency_us, details);
            (Some(trace), None, None)
        }
        Some("step") => {
            let details = format!(
                "{} prompt + {} completion tokens, {} layers. \
                 Step-level breakdown not instrumented — use `apr profile` for real timing",
                prompt_tokens, completion_tokens, num_layers
            );
            let trace = wall_clock_trace("step", completion_tokens, latency_us, details);
            (None, Some(trace), None)
        }
        Some("layer") => {
            let details = format!(
                "{} layers, {} tokens. \
                 Per-layer breakdown not instrumented — use `apr profile --granular` for real per-layer timing",
                num_layers,
                prompt_tokens + completion_tokens
            );
            let trace = wall_clock_trace("layer", num_layers, latency_us, details);
            (None, None, Some(trace))
        }
        _ => (None, None, None),
    }
}

/// Builds a `TraceData` whose only breakdown entry is the whole-request latency.
fn wall_clock_trace(level: &str, operations: usize, latency_us: u64, details: String) -> TraceData {
    TraceData {
        level: level.to_string(),
        operations,
        total_time_us: latency_us,
        breakdown: vec![TraceOperation {
            name: "total_inference".to_string(),
            time_us: latency_us,
            details: Some(details),
        }],
        provenance: TraceProvenance::WallClockTotal,
    }
}
/// One generated choice inside a `ChatCompletionResponse`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatChoice {
/// Position of this choice in the response's `choices` list.
pub index: usize,
/// The generated message.
pub message: ChatMessage,
/// Reason generation ended, as a string; the set of values is decided by the producer.
pub finish_reason: String,
}
/// Token accounting reported with a completion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Usage {
/// Number of tokens in the prompt.
pub prompt_tokens: usize,
/// Number of tokens generated.
pub completion_tokens: usize,
/// Total token count. NOTE(review): presumably prompt + completion; the type
/// does not enforce it — confirm at the construction site.
pub total_tokens: usize,
}
/// Response payload listing available models.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OpenAIModelsResponse {
/// Object-type tag string; the value is chosen by the code that builds the response.
pub object: String,
/// The model entries.
pub data: Vec<OpenAIModel>,
}
/// One model entry inside an `OpenAIModelsResponse`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct OpenAIModel {
/// Model identifier.
pub id: String,
/// Object-type tag string; the value is chosen by the code that builds the entry.
pub object: String,
/// Creation timestamp. NOTE(review): presumably Unix seconds — confirm at the construction site.
pub created: i64,
/// Owner label for the model.
pub owned_by: String,
}
/// One chunk of a streamed chat completion.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatCompletionChunk {
/// Identifier of the completion this chunk belongs to.
pub id: String,
/// Object-type tag; the constructors in this file set it to `"chat.completion.chunk"`.
pub object: String,
/// Creation time; the constructors in this file store seconds since the Unix epoch.
pub created: i64,
/// Model identifier string.
pub model: String,
/// Choice deltas; the constructors in this file always emit exactly one, at index 0.
pub choices: Vec<ChatChunkChoice>,
}
/// One choice inside a `ChatCompletionChunk`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatChunkChoice {
/// Position of this choice in the chunk's `choices` list.
pub index: usize,
/// Incremental content carried by this chunk.
pub delta: ChatDelta,
/// `None` while streaming; `ChatCompletionChunk::done` sets it to `"stop"`.
/// Serialized even when `None` (there is no `skip_serializing_if` here).
pub finish_reason: Option<String>,
}
/// Incremental payload of a streamed choice. Both fields are omitted from the
/// serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatDelta {
/// Role string; the chunk constructor sets `"assistant"` only on a chunk that
/// has neither content nor a finish reason.
#[serde(skip_serializing_if = "Option::is_none")]
pub role: Option<String>,
/// Text fragment carried by this chunk, if any.
#[serde(skip_serializing_if = "Option::is_none")]
pub content: Option<String>,
}
impl ChatCompletionChunk {
    /// Assembles a single-choice chunk stamped with the current time.
    ///
    /// The chunk's one choice has index 0. Its delta carries `content` as
    /// given, and carries the role `"assistant"` only when both `content` and
    /// `finish_reason` are `None`. `created` is the number of whole seconds
    /// since the Unix epoch, or 0 if the system clock reads earlier than that.
    fn new(id: &str, model: &str, content: Option<String>, finish_reason: Option<String>) -> Self {
        // Only a chunk with no text and no finish reason announces the role.
        let announces_role = content.is_none() && finish_reason.is_none();
        let role = announces_role.then(|| "assistant".to_string());

        let created = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .map_or(0, |elapsed| elapsed.as_secs() as i64);

        let choice = ChatChunkChoice {
            index: 0,
            delta: ChatDelta { role, content },
            finish_reason,
        };

        Self {
            id: id.to_owned(),
            object: String::from("chat.completion.chunk"),
            created,
            model: model.to_owned(),
            choices: vec![choice],
        }
    }

    /// Chunk with no content and no finish reason; its delta carries the `"assistant"` role.
    fn initial(id: &str, model: &str) -> Self {
        Self::new(id, model, None, None)
    }

    /// Chunk whose delta carries `text` as content.
    fn content(id: &str, model: &str, text: &str) -> Self {
        let fragment = Some(text.to_owned());
        Self::new(id, model, fragment, None)
    }

    /// Chunk with no content and a finish reason of `"stop"`.
    fn done(id: &str, model: &str) -> Self {
        let reason = Some(String::from("stop"));
        Self::new(id, model, None, reason)
    }
}
/// Request payload for a prediction call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PredictRequest {
/// Optional model identifier; `None` when the key is absent.
#[serde(default)]
pub model: Option<String>,
/// Input feature values; required in the payload.
pub features: Vec<f32>,
/// Optional names for the features; `None` when the key is absent.
/// NOTE(review): presumably parallel to `features` — the type does not enforce it.
#[serde(default)]
pub feature_names: Option<Vec<String>>,
/// Optional `top_k` value; `None` when the key is absent.
#[serde(default)]
pub top_k: Option<usize>,
/// Falls back to `default_true()` (`true`) when the key is absent.
#[serde(default = "default_true")]
pub include_confidence: bool,
}
/// Serde fallback that yields `true`; referenced by
/// `PredictRequest::include_confidence` for payloads that omit the key.
pub(crate) fn default_true() -> bool {
    true
}
/// Response payload for a prediction call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PredictResponse {
/// Identifier of the request this response answers.
pub request_id: String,
/// Model identifier string.
pub model: String,
/// The prediction as an arbitrary JSON value.
pub prediction: serde_json::Value,
/// Optional confidence value; omitted from the output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub confidence: Option<f32>,
/// Optional list of labelled scores; omitted from the output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub top_k_predictions: Option<Vec<PredictionWithScore>>,
/// Latency in milliseconds.
pub latency_ms: f64,
}
/// A label paired with its score, as listed in `PredictResponse::top_k_predictions`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PredictionWithScore {
/// Label of the prediction.
pub label: String,
/// Score attached to the label; the scale is decided by the producer.
pub score: f32,
}
/// Request payload for an explanation call.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExplainRequest {
/// Optional model identifier; `None` when the key is absent.
#[serde(default)]
pub model: Option<String>,
/// Input feature values; required in the payload.
pub features: Vec<f32>,
/// Names for the features; required in the payload (unlike `PredictRequest`).
/// NOTE(review): presumably parallel to `features` — the type does not enforce it.
pub feature_names: Vec<String>,
/// Falls back to `default_top_k_features()` (5) when the key is absent.
#[serde(default = "default_top_k_features")]
pub top_k_features: usize,
/// Explanation method name; falls back to `default_explain_method()` (`"shap"`)
/// when the key is absent.
#[serde(default = "default_explain_method")]
pub method: String,
}
/// Serde fallback for `ExplainRequest::top_k_features` when the payload omits it.
pub(crate) fn default_top_k_features() -> usize {
    const DEFAULT_TOP_K_FEATURES: usize = 5;
    DEFAULT_TOP_K_FEATURES
}
/// Serde fallback for `ExplainRequest::method` when the payload omits it.
pub(crate) fn default_explain_method() -> String {
    String::from("shap")
}