#![allow(
unsafe_code,
clippy::borrow_as_ptr,
clippy::type_complexity,
clippy::must_use_candidate,
clippy::default_trait_access,
clippy::unused_self,
clippy::needless_pass_by_value,
clippy::bind_instead_of_map,
clippy::map_unwrap_or,
clippy::needless_return,
clippy::unnecessary_debug_formatting,
clippy::used_underscore_binding,
clippy::no_effect_underscore_binding,
clippy::needless_raw_string_hashes
)]
use std::collections::HashMap;
use std::net::IpAddr;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::{Duration, Instant};
use async_trait::async_trait;
use futures_util::StreamExt;
use oci_client::manifest::OciImageManifest;
use tokio::sync::{Mutex, RwLock};
use tokio::time::timeout as tokio_timeout;
use tracing::instrument;
use windows::core::GUID;
use zlayer_observability::logs::{LogEntry, LogSource, LogStream};
use zlayer_overlay::ipnet;
use zlayer_spec::{PullPolicy, RegistryAuth as SpecRegistryAuth, ServiceSpec};
use zlayer_overlayd::OverlaydClient;
use zlayer_types::overlayd::{AttachHandle, OverlaydRequest, OverlaydResponse};
use crate::cgroups_stats::ContainerStats;
use crate::error::{AgentError, Result};
use crate::runtime::{
ContainerId, ContainerInspectDetails, ContainerState, ExecEvent, ExecEventStream, ImageInfo,
PruneResult, Runtime, WaitOutcome, WaitReason,
};
use crate::windows::uvm::Uvm;
use crate::windows::{scratch, unpacker};
use zlayer_hcs::enumerate;
use zlayer_hcs::events::{self, HcsEventKind};
use zlayer_hcs::schema::{
Chipset, ComputeSystem as HcsDoc, Container as HcsContainer, ContainerMemory,
ContainerProcessor, DebugOptions, Devices, GpuAssignment, GpuAssignmentMode,
GpuAssignmentRequest, GuestCrashReporting, GuestOs as HcsGuestOs, HvSocket2,
HvSocketServiceConfig, HvSocketSystemConfig, ProcessParameters, RegistryChanges, RegistryHive,
RegistryKey, RegistryValue, RegistryValueType, SchemaVersion, ScsiAttachment, ScsiController,
Statistics, Storage as HcsStorage, Topology, TopologyMemory, TopologyProcessor, Uefi,
UefiBootEntry, VirtualMachine, VirtualSmb, VirtualSmbShare, VirtualSmbShareOptions,
};
use zlayer_hcs::system::ComputeSystem;
/// Returns the HCS owner tag used for compute systems created by the daemon
/// named `daemon_name`.
///
/// The tag is the daemon name itself, so the default daemon name `"zlayer"`
/// maps to the tag `"zlayer"`.
#[must_use]
pub fn owner_tag(daemon_name: &str) -> String {
    String::from(daemon_name)
}
/// Returns the overlay network name for the daemon named `daemon_name`.
///
/// The name is always `<daemon_name>-overlay`; for the default daemon name
/// `"zlayer"` that is `"zlayer-overlay"`.
#[must_use]
pub fn overlay_network_name(daemon_name: &str) -> String {
    let mut name = String::with_capacity(daemon_name.len() + "-overlay".len());
    name.push_str(daemon_name);
    name.push_str("-overlay");
    name
}
/// Renders `id` through its `Debug` formatting, strips any leading/trailing
/// `{` / `}` characters and lower-cases the ASCII letters of the result.
fn format_guid_bare(id: GUID) -> String {
    let debug_text = format!("{id:?}");
    let unbraced = debug_text.trim_matches(|c: char| matches!(c, '{' | '}'));
    let mut bare = unbraced.to_owned();
    bare.make_ascii_lowercase();
    bare
}
/// Sanitises `raw` into a host name of at most 15 ASCII characters.
///
/// * ASCII letters, digits and `-` are kept as-is.
/// * `_`, `.` and space are replaced with `-`.
/// * Every other character is dropped.
/// * If the first surviving character is not an ASCII letter, a `z` is
///   prepended.
/// * The result is truncated to 15 characters (the NetBIOS name limit).
///
/// When no character survives the filtering (empty input, or input made only
/// of dropped characters) the fallback name `"zlayer"` is returned. The
/// emptiness check runs *before* the `z` prefix is added; checking afterwards
/// could never trigger, because the prefix makes the string non-empty.
fn netbios_hostname(raw: &str) -> String {
    let mut cleaned: String = raw
        .chars()
        .filter_map(|c| match c {
            'A'..='Z' | 'a'..='z' | '0'..='9' | '-' => Some(c),
            '_' | '.' | ' ' => Some('-'),
            _ => None,
        })
        .collect();
    if cleaned.is_empty() {
        return "zlayer".to_string();
    }
    if !cleaned.starts_with(|c: char| c.is_ascii_alphabetic()) {
        cleaned.insert(0, 'z');
    }
    // `cleaned` is pure ASCII at this point, so byte 15 is always a char boundary.
    cleaned.truncate(15);
    cleaned
}
/// Isolation flavour the runtime uses for a Windows container.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum IsolationMode {
/// Process isolation (the `Default` variant). `build_compute_system_doc`
/// emits a container document and no virtual-machine document for this mode.
#[default]
Process,
/// Hyper-V isolation. `build_compute_system_doc` emits a virtual-machine
/// document (and requires a UVM) for this mode.
Hyperv,
}
/// Process-wide cache for the host's `(major, minor, build)` version triple;
/// filled on the first call to [`host_windows_build`].
static HOST_WIN_BUILD: std::sync::OnceLock<Option<(u32, u32, u32)>> = std::sync::OnceLock::new();

/// Returns the host's `(major, minor, build)` version as reported by
/// `ntdll!RtlGetVersion`, or `None` when that call reports failure.
///
/// The query runs at most once per process; later calls return the cached
/// answer (including a cached `None`).
fn host_windows_build() -> Option<(u32, u32, u32)> {
    fn query_once() -> Option<(u32, u32, u32)> {
        use windows::Win32::System::SystemInformation::OSVERSIONINFOW;
        windows::core::link!(
            "ntdll.dll" "system" fn RtlGetVersion(
                version_info: *mut OSVERSIONINFOW,
            ) -> windows::core::HRESULT
        );
        let struct_size = u32::try_from(std::mem::size_of::<OSVERSIONINFOW>()).unwrap_or(0);
        let mut version = OSVERSIONINFOW {
            dwOSVersionInfoSize: struct_size,
            ..Default::default()
        };
        // SAFETY: `version` is a live, properly aligned `OSVERSIONINFOW` whose
        // size field has been initialised, and the pointer is only used for
        // the duration of this call.
        let status = unsafe { RtlGetVersion(&mut version) };
        if status.is_err() {
            return None;
        }
        Some((
            version.dwMajorVersion,
            version.dwMinorVersion,
            version.dwBuildNumber,
        ))
    }
    *HOST_WIN_BUILD.get_or_init(query_once)
}
/// Parses the first three dot-separated fields of `s` as
/// `(major, minor, build)`.
///
/// Returns `None` when fewer than three fields are present or when any of the
/// first three fields is not a valid `u32`. Fields after the third are ignored.
fn parse_os_version(s: &str) -> Option<(u32, u32, u32)> {
    let mut fields = s.split('.');
    let mut next_number = || fields.next().and_then(|field| field.parse::<u32>().ok());
    let major = next_number()?;
    let minor = next_number()?;
    let build = next_number()?;
    Some((major, minor, build))
}
/// Chooses the isolation mode from the spec's request and the image/host
/// build triples.
///
/// * An explicit `Process` or `Hyperv` request is honoured as-is.
/// * `Auto` or no request: an unknown image build yields `Process`; a known
///   image build yields `Process` only when the host build is known and equal
///   to it, and `Hyperv` otherwise (mismatch or unknown host build).
fn decide_isolation(
    spec: Option<zlayer_spec::IsolationMode>,
    image_build: Option<(u32, u32, u32)>,
    host_build: Option<(u32, u32, u32)>,
) -> IsolationMode {
    use zlayer_spec::IsolationMode as Spec;
    let explicit = match spec {
        Some(Spec::Process) => Some(IsolationMode::Process),
        Some(Spec::Hyperv) => Some(IsolationMode::Hyperv),
        Some(Spec::Auto) | None => None,
    };
    if let Some(mode) = explicit {
        return mode;
    }
    let Some(img) = image_build else {
        return IsolationMode::Process;
    };
    if host_build == Some(img) {
        IsolationMode::Process
    } else {
        IsolationMode::Hyperv
    }
}
fn resolve_isolation_for_image(
spec: Option<zlayer_spec::IsolationMode>,
image_os_version: Option<&str>,
) -> IsolationMode {
decide_isolation(
spec,
image_os_version.and_then(parse_os_version),
host_windows_build(),
)
}
/// Configuration for [`HcsRuntime`].
#[derive(Debug, Clone)]
pub struct HcsConfig {
/// Root directory for runtime state; `new_with_registry` creates it along
/// with its `images` and `scratch` subdirectories.
pub storage_root: PathBuf,
/// Default scratch size in GB (`20` in `Default`).
/// NOTE(review): the consumer of this value is not visible in this part of the file.
pub default_scratch_size_gb: u64,
/// Cluster CIDR as text (`"10.200.0.0/16"` in `Default`).
/// NOTE(review): the consumer of this value is not visible in this part of the file.
pub cluster_cidr: String,
/// Optional CIDR slice; when set, `new_with_registry` builds the runtime's
/// IP allocator from it.
pub slice_cidr: Option<ipnet::IpNet>,
/// Daemon name; fed to `owner_tag` when tagging and enumerating compute systems.
pub daemon_name: String,
/// Data directory; `HcsRuntime::new` derives the overlayd socket path from it.
pub data_dir: PathBuf,
}
impl Default for HcsConfig {
    /// Builds the default configuration from the system-default ZLayer
    /// directories.
    ///
    /// `storage_root` comes from the `ZLAYER_HCS_STORAGE_ROOT` environment
    /// variable when it can be read, and otherwise is `<containers dir>/hcs`.
    fn default() -> Self {
        let dirs = zlayer_paths::ZLayerDirs::system_default();
        let storage_root = match std::env::var("ZLAYER_HCS_STORAGE_ROOT") {
            Ok(overridden) => PathBuf::from(overridden),
            Err(_) => dirs.containers().join("hcs"),
        };
        let data_dir = dirs.data_dir().to_path_buf();
        Self {
            storage_root,
            default_scratch_size_gb: 20,
            cluster_cidr: String::from("10.200.0.0/16"),
            slice_cidr: None,
            daemon_name: String::from("zlayer"),
            data_dir,
        }
    }
}
/// An image that has been pulled and unpacked, as stored in the runtime's
/// in-memory image cache (keyed by image reference).
#[derive(Debug)]
struct CachedImage {
/// Result of `unpacker::unpack_windows_image` for this image.
unpacked: unpacker::UnpackedImage,
/// The image's `os.version` as reported by the registry; `None` when the
/// registry returned none or the lookup failed during pull.
os_version: Option<String>,
}
/// Per-container bookkeeping held in `HcsRuntime::containers`.
///
/// NOTE(review): the code that creates and consumes these entries is outside
/// this part of the file; the field notes below are inferred from the types
/// and should be confirmed against it.
#[derive(Debug)]
struct ContainerEntry {
/// HCS compute-system handle for the container.
system: ComputeSystem,
/// Writable scratch layer, when one is attached to this entry.
scratch_layer: Option<scratch::WritableLayer>,
/// HCS id string of the container.
hcs_id: String,
/// Shared exit-code slot; same type as the `sink` written by
/// `spawn_exit_watcher` — presumably that is the writer, confirm.
last_exit_code: Arc<RwLock<Option<i32>>>,
/// Overlay attachment (namespace GUID + IP), when the container is attached.
overlay_attach: Option<WindowsOverlayAttach>,
/// Utility VM, when one is associated with this container.
uvm: Option<Uvm>,
/// Paths of parent layers recorded for this container — presumably the
/// output of `activate_parent_layers`, confirm.
activated_parent_layers: Vec<PathBuf>,
// GCS bridge; only compiled with the `hcs-runtime` feature and allowed to be unread.
#[cfg(feature = "hcs-runtime")]
#[allow(dead_code)] gcs: Option<zlayer_gcs::bridge::GcsBridge>,
/// Shared buffer of log entries for this container.
log_buffer: Arc<RwLock<Vec<LogEntry>>>,
}
/// Result of attaching a Windows container to the overlay via overlayd.
#[derive(Debug, Clone)]
struct WindowsOverlayAttach {
/// Namespace GUID returned by overlayd; `overlayd_attach_windows` stores an
/// empty string when the response carried none.
namespace_guid: String,
/// IP address reported by overlayd for the attachment.
ip: IpAddr,
}
/// Container runtime backed by the Windows Host Compute Service (HCS).
pub struct HcsRuntime {
/// Runtime configuration supplied at construction.
config: HcsConfig,
/// Tracked containers, keyed by a `String` id (the keys are compared against
/// HCS system ids in `reconcile_orphan_systems`).
containers: RwLock<HashMap<String, ContainerEntry>>,
/// Pulled-and-unpacked images, keyed by image reference.
images: RwLock<HashMap<String, CachedImage>>,
/// Registry client used to pull manifests and layers.
registry: Arc<zlayer_registry::ImagePuller>,
/// Resolves registry credentials for an image reference.
auth_resolver: zlayer_core::AuthResolver,
/// overlayd client; `None` until `new` connects successfully.
overlayd: Option<Arc<Mutex<OverlaydClient>>>,
/// Slot written by `set_next_container_ip`.
/// NOTE(review): the reader is not visible in this part of the file.
next_container_ip: Arc<Mutex<Option<IpAddr>>>,
/// Slot written by `set_next_container_dns` (DNS server, DNS domain).
/// NOTE(review): the reader is not visible in this part of the file.
next_container_dns: Arc<Mutex<Option<(Option<IpAddr>, Option<String>)>>>,
/// IP allocator built from `config.slice_cidr`; `None` when no slice was
/// configured or the allocator could not be built.
ip_allocator: Mutex<Option<zlayer_overlay::IpAllocator>>,
}
impl std::fmt::Debug for HcsRuntime {
    /// Prints only the `config` field and marks the struct as non-exhaustive.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("HcsRuntime");
        out.field("config", &self.config);
        out.finish_non_exhaustive()
    }
}
impl HcsRuntime {
/// Builds a runtime from `config`.
///
/// Steps, in order:
/// 1. configure and open the blob cache from the environment and wrap it in
///    an image puller;
/// 2. construct the runtime via `new_with_registry`;
/// 3. try to connect to overlayd at the socket derived from
///    `config.data_dir` (failure is logged, not fatal);
/// 4. run both startup reconcile passes (failures are logged, not fatal).
///
/// # Errors
/// Returns `AgentError::Configuration` when the blob cache cannot be
/// configured or opened, and propagates any error from `new_with_registry`.
pub async fn new(config: HcsConfig) -> Result<Self> {
    let blob_cache = zlayer_registry::CacheType::from_env()
        .map_err(|e| {
            AgentError::Configuration(format!("failed to configure HCS blob cache from env: {e}"))
        })?
        .build()
        .await
        .map_err(|e| AgentError::Configuration(format!("failed to open HCS blob cache: {e}")))?;
    let puller = Arc::new(zlayer_registry::ImagePuller::with_cache(blob_cache));

    // `config` is moved into the runtime below, so keep a copy of the data dir.
    let data_dir = config.data_dir.clone();
    let mut runtime = Self::new_with_registry(config, puller)?;

    let socket_path = zlayer_paths::ZLayerDirs::default_overlayd_socket_path_for(&data_dir);
    let connected =
        OverlaydClient::connect_with_backoff(std::path::Path::new(&socket_path)).await;
    match connected {
        Err(err) => {
            tracing::warn!(
                socket = %socket_path,
                error = %err,
                "could not connect to overlayd; Windows containers will fail to attach overlay networking until it is reachable"
            );
        }
        Ok(client) => {
            runtime.overlayd = Some(Arc::new(Mutex::new(client)));
        }
    }

    if let Err(err) = runtime.reconcile_orphan_systems().await {
        tracing::warn!(
            error = %err,
            "startup reconcile of orphan HCS compute systems failed; continuing without it"
        );
    }
    if let Err(err) = runtime.reconcile_orphans().await {
        tracing::warn!(
            error = %err,
            "startup reconcile of orphan HCN endpoints failed; continuing without it"
        );
    }
    Ok(runtime)
}
/// Builds a runtime around an existing image puller, without connecting to
/// overlayd and without running any reconcile pass.
///
/// Creates `config.storage_root` and its `images` / `scratch`
/// subdirectories. When `config.slice_cidr` is set, an IP allocator is built
/// from it and its first address is allocated up front (the outcome of that
/// first allocation is ignored); if the allocator cannot be built a warning
/// is logged and the runtime continues without one.
///
/// # Errors
/// Returns `AgentError::Configuration` when any of the directories cannot be
/// created.
pub fn new_with_registry(
    config: HcsConfig,
    registry: Arc<zlayer_registry::ImagePuller>,
) -> Result<Self> {
    let root = &config.storage_root;
    if let Err(e) = std::fs::create_dir_all(root) {
        return Err(AgentError::Configuration(format!(
            "failed to create HCS storage root {:?}: {e}",
            config.storage_root
        )));
    }
    if let Err(e) = std::fs::create_dir_all(root.join("images")) {
        return Err(AgentError::Configuration(format!(
            "failed to create HCS image cache dir: {e}"
        )));
    }
    if let Err(e) = std::fs::create_dir_all(root.join("scratch")) {
        return Err(AgentError::Configuration(format!(
            "failed to create HCS scratch dir: {e}"
        )));
    }

    let ip_allocator = match config.slice_cidr {
        None => None,
        Some(slice) => match zlayer_overlay::IpAllocator::new(&slice.to_string()) {
            Ok(mut allocator) => {
                // Take the first address out of the pool; the result is deliberately ignored.
                let _ = allocator.allocate_first();
                Some(allocator)
            }
            Err(e) => {
                tracing::warn!(
                    slice = %slice,
                    error = %e,
                    "failed to build IP allocator from slice_cidr; Windows \
                     containers will start without overlay networking"
                );
                None
            }
        },
    };

    let auth_resolver = zlayer_core::AuthResolver::new(zlayer_core::AuthConfig::default());
    Ok(Self {
        config,
        containers: RwLock::new(HashMap::new()),
        images: RwLock::new(HashMap::new()),
        registry,
        auth_resolver,
        overlayd: None,
        next_container_ip: Arc::new(Mutex::new(None)),
        next_container_dns: Arc::new(Mutex::new(None)),
        ip_allocator: Mutex::new(ip_allocator),
    })
}
/// Derives the HCS id string for `id`: its `to_string()` rendering.
fn hcs_id(id: &ContainerId) -> String {
id.to_string()
}
/// Returns `<storage_root>/images/<name>` for `image`, where `<name>` is the
/// image reference with every `/`, `:` and `@` replaced by `_`.
fn image_layer_dir(&self, image: &str) -> PathBuf {
    let dir_name: String = image
        .chars()
        .map(|c| match c {
            '/' | ':' | '@' => '_',
            other => other,
        })
        .collect();
    let mut dir = self.config.storage_root.join("images");
    dir.push(dir_name);
    dir
}
/// Returns `<storage_root>/scratch/<hcs_id>`.
fn scratch_dir(&self, hcs_id: &str) -> PathBuf {
    let mut dir = self.config.storage_root.join("scratch");
    dir.push(hcs_id);
    dir
}
/// Stores `ip` in the "next container IP" slot, replacing any previous value.
pub async fn set_next_container_ip(&self, ip: IpAddr) {
    let mut pending = self.next_container_ip.lock().await;
    *pending = Some(ip);
}
/// Stores the `(dns_server, dns_domain)` pair in the "next container DNS"
/// slot, replacing any previous value.
pub async fn set_next_container_dns(
    &self,
    dns_server: Option<IpAddr>,
    dns_domain: Option<String>,
) {
    let mut pending = self.next_container_dns.lock().await;
    *pending = Some((dns_server, dns_domain));
}
/// Asks overlayd to attach the Windows container `container_id` (belonging
/// to `service`) to the overlay with the requested `ip` and optional DNS
/// settings. The request always sets `join_global: false`.
///
/// On success returns the namespace GUID (empty string when overlayd
/// supplied none) and the IP reported by overlayd.
///
/// # Errors
/// Returns `AgentError::Network` when overlayd is not connected, when the
/// call fails, or when overlayd answers with anything other than `Attached`.
async fn overlayd_attach_windows(
    &self,
    container_id: &str,
    service: &str,
    ip: IpAddr,
    dns_server: Option<IpAddr>,
    dns_domain: Option<String>,
) -> Result<WindowsOverlayAttach> {
    let Some(client) = self.overlayd.as_ref() else {
        return Err(AgentError::Network(
            "overlayd is not connected; cannot attach overlay".to_string(),
        ));
    };
    let request = OverlaydRequest::AttachContainer {
        handle: AttachHandle::WindowsContainer {
            container_id: container_id.to_string(),
            ip: Some(ip),
        },
        service: service.to_string(),
        join_global: false,
        dns_server,
        dns_domain,
    };
    // Hold the client lock only for the duration of the call.
    let resp = {
        let mut conn = client.lock().await;
        conn.call(request)
            .await
            .map_err(|e| AgentError::Network(format!("overlayd AttachContainer failed: {e}")))?
    };
    match resp {
        OverlaydResponse::Attached(result) => {
            let namespace_guid = result.namespace_guid.unwrap_or_default();
            Ok(WindowsOverlayAttach {
                namespace_guid,
                ip: result.ip,
            })
        }
        other => Err(AgentError::Network(format!(
            "overlayd AttachContainer returned unexpected response: {other:?}"
        ))),
    }
}
/// Best-effort detach: asks overlayd to detach the attachment identified by
/// `namespace_guid`. Does nothing when overlayd is not connected; a failed
/// call is logged as a warning and otherwise ignored.
///
/// NOTE(review): the namespace GUID is sent in the handle's `container_id`
/// field, whereas attach sends the container id there — confirm that this is
/// the key overlayd expects for detach.
async fn overlayd_detach_windows(&self, namespace_guid: &str) {
    if let Some(client) = self.overlayd.as_ref() {
        let request = OverlaydRequest::DetachContainer {
            handle: AttachHandle::WindowsContainer {
                container_id: namespace_guid.to_string(),
                ip: None,
            },
        };
        let mut conn = client.lock().await;
        let outcome = conn.call(request).await;
        if let Err(e) = outcome {
            tracing::warn!(ns = %namespace_guid, error = %e, "overlayd DetachContainer failed");
        }
    }
}
/// Terminates HCS compute systems that carry this daemon's owner tag but are
/// not tracked in `self.containers`.
///
/// The tracked-id set is snapshotted once up front. Systems that cannot be
/// opened or terminated are logged and skipped; they do not fail the pass.
///
/// # Errors
/// Returns `AgentError::Internal` when enumerating compute systems fails.
pub async fn reconcile_orphan_systems(&self) -> Result<()> {
    let live = {
        let tracked = self.containers.read().await;
        let ids: std::collections::HashSet<String> = tracked.keys().cloned().collect();
        ids
    };
    let tag = owner_tag(&self.config.daemon_name);
    let systems = match enumerate::list_by_owner(&tag).await {
        Ok(found) => found,
        Err(e) => {
            return Err(AgentError::Internal(format!(
                "HcsEnumerateComputeSystems: {e}"
            )))
        }
    };
    if systems.is_empty() {
        tracing::debug!(owner = %tag, "reconcile: no HCS compute systems found for owner");
        return Ok(());
    }
    for sys in systems {
        if live.contains(&sys.id) {
            continue;
        }
        let id = &sys.id;
        let system = match ComputeSystem::open(id, 0) {
            Ok(handle) => handle,
            Err(e) => {
                tracing::warn!(
                    hcs_id = %id,
                    error = %e,
                    "reconcile: HcsOpenComputeSystem failed for enumerated orphan; \
                     skipping (may have just exited)"
                );
                continue;
            }
        };
        if let Err(e) = system.terminate("").await {
            tracing::warn!(
                hcs_id = %id,
                error = %e,
                "reconcile: HcsTerminateComputeSystem failed for orphan; \
                 system may need manual cleanup"
            );
        } else {
            tracing::info!(
                hcs_id = %id,
                state = %sys.state,
                "reconcile: terminated orphan HCS compute system"
            );
        }
        // `system` goes out of scope here, at the end of the iteration.
    }
    Ok(())
}
/// Startup reconcile hook for orphan HCN endpoints (that is how `new` labels
/// it when logging a failure). Currently a no-op that always returns `Ok(())`;
/// it is `async` only to keep the call site uniform, hence the lint allowance.
#[allow(clippy::unused_async)]
pub async fn reconcile_orphans(&self) -> Result<()> {
Ok(())
}
/// Converts every layer of `manifest` into an
/// `unpacker::ResolvedLayerDescriptor`, preserving manifest order. A layer
/// without `urls` gets an empty URL list.
fn manifest_to_descriptors(
    manifest: &OciImageManifest,
) -> Vec<unpacker::ResolvedLayerDescriptor> {
    let mut descriptors = Vec::with_capacity(manifest.layers.len());
    for layer in &manifest.layers {
        descriptors.push(unpacker::ResolvedLayerDescriptor {
            digest: layer.digest.clone(),
            media_type: layer.media_type.clone(),
            size: layer.size,
            urls: layer.urls.clone().unwrap_or_default(),
        });
    }
    descriptors
}
/// Pulls `image`, unpacks its layers under the image cache directory and
/// records the result in the in-memory image cache.
///
/// Behaviour:
/// * A cache hit returns immediately; `_policy` and `_auth` are currently
///   unused (credentials come from `self.auth_resolver`).
/// * The image OS is checked before unpacking: a known non-Windows OS is
///   rejected, while a failed OS lookup is logged and the pull proceeds.
/// * A failed `os.version` lookup is logged and cached as `None`. With an
///   unknown image build, `decide_isolation` picks process isolation in
///   auto mode, which is what the warning below reports.
///
/// # Errors
/// * `AgentError::PullFailed` when the manifest pull or the unpack fails.
/// * `AgentError::WrongPlatform` when the image is not a Windows image.
async fn do_pull(
    &self,
    image: &str,
    _policy: PullPolicy,
    _auth: Option<&SpecRegistryAuth>,
) -> Result<()> {
    {
        // Scope the read guard so it is released before any network work.
        let cache = self.images.read().await;
        if cache.contains_key(image) {
            tracing::debug!(image = %image, "HCS image cache hit, skipping pull");
            return Ok(());
        }
    }
    let auth = self.auth_resolver.resolve(image);
    let (manifest, _digest) = self
        .registry
        .pull_manifest(image, &auth)
        .await
        .map_err(|e| AgentError::PullFailed {
            image: image.to_string(),
            reason: format!("manifest pull: {e}"),
        })?;
    match self.registry.image_os(image, &auth).await {
        Ok(Some(os)) if os != zlayer_spec::OsKind::Windows => {
            tracing::debug!(
                image,
                image_os = os.as_oci_str(),
                "HCS runtime skipping unpack: image is not a Windows image"
            );
            return Err(AgentError::WrongPlatform {
                runtime: "hcs".to_string(),
                expected: zlayer_spec::OsKind::Windows.as_oci_str().to_string(),
                actual: os.as_oci_str().to_string(),
                image: image.to_string(),
            });
        }
        Ok(_) => {}
        Err(e) => {
            tracing::warn!(
                image,
                error = %e,
                "failed to inspect image OS before HCS unpack; proceeding optimistically",
            );
        }
    }
    let descriptors = Self::manifest_to_descriptors(&manifest);
    let dest_root = self.image_layer_dir(image);
    let unpacked = unpacker::unpack_windows_image(
        self.registry.as_ref(),
        image,
        &auth,
        &descriptors,
        &dest_root,
    )
    .await
    .map_err(|e| AgentError::PullFailed {
        image: image.to_string(),
        reason: format!("unpack: {e}"),
    })?;
    let os_version = match self.registry.image_os_version(image, &auth).await {
        Ok(v) => v,
        Err(e) => {
            // The message previously claimed a Hyper-V fallback, but
            // `decide_isolation` maps an unknown image build to `Process`.
            tracing::warn!(
                image,
                error = %e,
                "failed to fetch image os.version; isolation auto-resolution will treat the image build as unknown and default to process isolation",
            );
            None
        }
    };
    let mut cache = self.images.write().await;
    cache.insert(
        image.to_string(),
        CachedImage {
            unpacked,
            os_version,
        },
    );
    Ok(())
}
/// Builds the HCS compute-system document for container `hcs_id`.
///
/// For `IsolationMode::Process` the document carries a `container` section
/// (storage, networking, hostname, processor, memory) and no virtual machine.
/// For `IsolationMode::Hyperv` it carries only the virtual-machine section
/// produced by `build_virtual_machine_doc`, which requires `uvm`.
///
/// # Errors
/// * `AgentError::Unsupported` — a GPU was requested with process isolation.
/// * `AgentError::GpuSharingUnavailable` — MPS sharing was requested with
///   Hyper-V isolation.
/// * `AgentError::CreateFailed` — host GPU enumeration failed.
/// * `AgentError::Internal` — Hyper-V isolation was requested without a UVM.
#[allow(clippy::too_many_arguments, clippy::too_many_lines)]
fn build_compute_system_doc(
&self,
hcs_id: &str,
spec: &ServiceSpec,
scratch_layer: &scratch::WritableLayer,
parent_layers: Vec<zlayer_hcs::schema::Layer>,
namespace_ids: Vec<String>,
isolation: IsolationMode,
uvm: Option<&Uvm>,
) -> Result<HcsDoc> {
// Resolve the host GPU adapters to expose; empty when the spec requests no GPU.
let gpu_adapters: Vec<HostGpuAdapter> = if let Some(gpu_spec) = spec.resources.gpu.as_ref()
{
if matches!(isolation, IsolationMode::Process) {
return Err(AgentError::Unsupported(
"GPU passthrough with `isolation: process` is not yet wired; switch to \
`isolation: hyperv` (DirectX device-sharing for Process isolation requires \
dxgkrnl device paths that drift between Windows builds and would need a \
stable hcsshim binding to be safe)"
.to_string(),
));
}
// Process isolation already returned above, so the `Hyperv` test below
// always holds at this point.
if matches!(gpu_spec.sharing, Some(zlayer_spec::GpuSharingMode::Mps))
&& matches!(isolation, IsolationMode::Hyperv)
{
return Err(AgentError::GpuSharingUnavailable {
mode: "mps".to_string(),
reason: "MPS is not supported with Hyper-V isolation; use Process isolation \
or remove the sharing config"
.to_string(),
});
}
let all_adapters =
enumerate_host_gpu_adapters().map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("DXGI host GPU enumeration: {e}"),
})?;
filter_adapters_by_gpu_spec(&all_adapters, gpu_spec)
} else {
Vec::new()
};
// The virtual-machine section exists only for Hyper-V isolation and needs the UVM.
let virtual_machine = match isolation {
IsolationMode::Process => None,
IsolationMode::Hyperv => {
let uvm = uvm.ok_or_else(|| {
AgentError::Internal(
"build_compute_system_doc called with Hyperv isolation but no UVM provided"
.to_string(),
)
})?;
Some(build_virtual_machine_doc(
uvm,
&parent_layers,
spec,
&gpu_adapters,
))
}
};
// CPU request is rounded up to a whole processor count; a missing,
// non-finite or < 1 value falls back to the default `ContainerProcessor`.
let processor = Some(
spec.resources
.cpu
.and_then(|cpu| {
let count = cpu.ceil();
#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
let count_u32 = if count.is_finite() && count >= 1.0 {
count as u32
} else {
return None;
};
Some(ContainerProcessor {
count: Some(count_u32),
maximum: None,
weight: None,
})
})
.unwrap_or_default(),
);
// Memory limit in MiB, rounded up; an unparseable memory string is
// dropped silently and yields no memory section.
let memory = spec.resources.memory.as_ref().and_then(|mem_str| {
crate::bundle::parse_memory_string(mem_str)
.ok()
.map(|bytes| {
let mib = bytes.div_ceil(1024 * 1024);
ContainerMemory {
size_in_mb: Some(mib),
}
})
});
// Storage root is the scratch mount path, forced to end with a backslash
// when it is non-empty.
let mut root_path = scratch_layer.vhd_mount_path().to_string();
if !root_path.is_empty() && !root_path.ends_with('\\') {
root_path.push('\\');
}
let storage = HcsStorage {
layers: parent_layers,
path: Some(root_path),
};
// Only the first namespace id is used; any further ids are ignored.
let networking =
namespace_ids
.into_iter()
.next()
.map(|ns| zlayer_hcs::schema::ContainerNetworking {
allow_unqualified_dns_query: Some(true),
dns_search_list: Vec::new(),
namespace: Some(ns),
network_shared_container_name: None,
});
// Guest host name: the spec's hostname when set, otherwise the HCS id,
// passed through `netbios_hostname`.
let hostname_source = spec.hostname.as_deref().unwrap_or(hcs_id);
let guest_os = Some(HcsGuestOs {
host_name: Some(netbios_hostname(hostname_source)),
});
let container = HcsContainer {
guest_os,
storage: Some(storage),
networking,
mapped_directories: Vec::new(),
mapped_pipes: Vec::new(),
processor,
memory,
};
// Exactly one of the two sections is emitted; under Hyper-V the
// `container` value built above is not included in the document.
let (container_doc, vm_doc) = match isolation {
IsolationMode::Process => (Some(container), None),
IsolationMode::Hyperv => (None, virtual_machine),
};
let doc = HcsDoc {
owner: owner_tag(&self.config.daemon_name),
schema_version: SchemaVersion::default(),
hosting_system_id: String::new(),
container: container_doc,
virtual_machine: vm_doc,
should_terminate_on_last_handle_closed: Some(true),
}
.apply_service_id(hcs_id);
Ok(doc)
}
/// Returns a copy of the parent-layer chain recorded for `image` in the
/// image cache.
///
/// # Errors
/// Returns `AgentError::CreateFailed` when `image` is not in the cache.
async fn resolve_parent_chain(&self, image: &str) -> Result<Vec<zlayer_hcs::schema::Layer>> {
    let images = self.images.read().await;
    match images.get(image) {
        Some(cached) => Ok(cached.unpacked.chain.0.clone()),
        None => Err(AgentError::CreateFailed {
            id: image.to_string(),
            reason: format!("image '{image}' not pulled before create_container"),
        }),
    }
}
/// Returns the cached `os.version` for `image`, or `None` when the image is
/// not cached or has no recorded version.
async fn resolve_image_os_version(&self, image: &str) -> Option<String> {
    let images = self.images.read().await;
    let cached = images.get(image)?;
    cached.os_version.clone()
}
/// Activates every layer in `parent_layers`, walking the slice from its last
/// element to its first, and returns the activated paths in the same order
/// as `parent_layers`.
///
/// # Errors
/// On the first activation failure the layers activated so far are handed to
/// `rollback_parent_activations` and an `std::io::Error` naming the failing
/// layer is returned.
fn activate_parent_layers(
    parent_layers: &[zlayer_hcs::schema::Layer],
) -> std::io::Result<Vec<PathBuf>> {
    let mut activated: Vec<PathBuf> = Vec::with_capacity(parent_layers.len());
    for layer in parent_layers.iter().rev() {
        let layer_path = PathBuf::from(&layer.path);
        match crate::windows::wclayer::activate_layer(&layer_path) {
            Ok(_) => activated.push(layer_path),
            Err(e) => {
                rollback_parent_activations(&activated);
                return Err(std::io::Error::other(format!(
                    "ActivateLayer({}) failed: {e}",
                    layer_path.display()
                )));
            }
        }
    }
    // Paths were collected back-to-front; restore the caller's ordering.
    activated.reverse();
    Ok(activated)
}
/// Subscribes to lifecycle events of `system_raw` and spawns a task that
/// records the container's exit code in `sink`.
///
/// * `SystemExited` stores the code extracted from the event detail, or `0`
///   when none can be extracted.
/// * `ServiceDisconnect` stores `-1`.
///
/// The task stops after the first of those events, or when the event stream
/// ends. If the subscription itself fails, a warning is logged and no task is
/// spawned.
fn spawn_exit_watcher(
    &self,
    hcs_id: String,
    system_raw: windows::Win32::System::HostComputeSystem::HCS_SYSTEM,
    sink: Arc<RwLock<Option<i32>>>,
) {
    let subscribed = events::subscribe(system_raw);
    let (subscription, mut stream) = match subscribed {
        Err(e) => {
            tracing::warn!(
                hcs_id = %hcs_id,
                error = %e,
                "failed to subscribe to HCS lifecycle events; exit code will be unknown"
            );
            return;
        }
        Ok(pair) => pair,
    };
    tokio::spawn(async move {
        // Move the subscription into the task so it stays alive while we read events.
        let _keepalive = subscription;
        loop {
            let Some(evt) = stream.next().await else {
                break;
            };
            let recorded = match &evt.kind {
                HcsEventKind::SystemExited => {
                    Some(extract_exit_code(&evt.detail_json).unwrap_or(0))
                }
                HcsEventKind::ServiceDisconnect => Some(-1),
                _ => None,
            };
            if let Some(code) = recorded {
                *sink.write().await = Some(code);
                break;
            }
        }
    });
}
#[cfg(all(target_os = "windows", feature = "hcs-runtime"))]
#[allow(clippy::too_many_lines, clippy::too_many_arguments)]
async fn hyperv_create_via_gcs(
&self,
hcs_id: &str,
spec: &ServiceSpec,
scratch_layer: &scratch::WritableLayer,
parent_layers: &[zlayer_hcs::schema::Layer],
namespace_strs: &[String],
uvm: &Uvm,
network_attachment: Option<&WindowsOverlayAttach>,
) -> Result<(ComputeSystem, HyperVGcsState)> {
use uuid::Uuid;
use zlayer_gcs::bridge::GcsBridge;
use zlayer_gcs::diagnostics::ts_us;
use zlayer_gcs::frame::RpcMessageType;
use zlayer_gcs::protocol::{
CreateRequest, CreateResponse, ModifySettingsRequest, ModifySettingsResponse,
RequestBase, StartRequest, StartResponse,
};
macro_rules! step_log {
($($arg:tt)*) => {{
eprintln!(
"[t=+{}us] hcs_id={hcs_id} Hyper-V step: {}",
ts_us(),
format_args!($($arg)*),
);
}};
}
let gpu_adapters: Vec<HostGpuAdapter> = if let Some(gpu_spec) = spec.resources.gpu.as_ref()
{
if matches!(gpu_spec.sharing, Some(zlayer_spec::GpuSharingMode::Mps)) {
return Err(AgentError::GpuSharingUnavailable {
mode: "mps".to_string(),
reason: "MPS is not supported with Hyper-V isolation; use Process \
isolation or remove the sharing config"
.to_string(),
});
}
let all_adapters =
enumerate_host_gpu_adapters().map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("DXGI host GPU enumeration: {e}"),
})?;
filter_adapters_by_gpu_spec(&all_adapters, gpu_spec)
} else {
Vec::new()
};
let owner = owner_tag(&self.config.daemon_name);
let uvm_doc = build_uvm_only_doc(&owner, hcs_id, uvm, parent_layers, spec, &gpu_adapters);
let uvm_doc_json =
serde_json::to_string(&uvm_doc).map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 1: serialize UVM doc: {e}"),
})?;
tracing::error!(
target: "zlayer_agent::hcs::diag",
hcs_id = %hcs_id,
doc = %uvm_doc_json,
"HCS_CREATE_UVM_DOC"
);
if let Ok(dir) = std::env::var("ZLAYER_HCS_DOC_DUMP_DIR") {
let path = std::path::PathBuf::from(&dir).join(format!("{hcs_id}.uvm.json"));
let _ = std::fs::create_dir_all(&dir);
let _ = std::fs::write(&path, &uvm_doc_json);
}
tracing::info!(
hcs_id = %hcs_id,
parents = parent_layers.len(),
"Hyper-V step 1b: HcsGrantVmAccess on parent layer dirs",
);
step_log!(
"1b: HcsGrantVmAccess on parent layer dirs ({} parents)",
parent_layers.len()
);
for layer in parent_layers {
crate::windows::wclayer::grant_vm_access(
uvm.runtime_id(),
std::path::Path::new(&layer.path),
)
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 1b: HcsGrantVmAccess(parent layer {}): {e}",
layer.path
),
})?;
}
let uvm_system_id = format_guid_bare(uvm.runtime_id());
tracing::info!(
hcs_id = %hcs_id,
uvm_system_id = %uvm_system_id,
"Hyper-V step 2: HcsCreateComputeSystem (UVM)"
);
step_log!("2: HcsCreateComputeSystem (UVM) uvm_system_id={uvm_system_id}");
#[cfg(feature = "windows-debug")]
{
let bcd = uvm.os_files_dir().join(r"EFI\Microsoft\Boot\BCD");
let set_bcd = |opt: &str, val: &str| match std::process::Command::new("bcdedit")
.args([
"/store",
bcd.to_string_lossy().as_ref(),
"/set",
"{default}",
opt,
val,
])
.output()
{
Ok(o) => step_log!(
"2-bcd (windows-debug): {opt} {val} on {} status={} out={} err={}",
bcd.display(),
o.status,
String::from_utf8_lossy(&o.stdout).trim(),
String::from_utf8_lossy(&o.stderr).trim()
),
Err(e) => step_log!("2-bcd (windows-debug): bcdedit {opt} spawn failed: {e}"),
};
if std::env::var("ZLAYER_GCS_BOOTLOG").as_deref() == Ok("1") {
set_bcd("bootlog", "Yes");
}
if std::env::var("ZLAYER_GCS_KD").as_deref() == Ok("1") {
set_bcd("debug", "Yes");
}
if std::env::var("ZLAYER_GCS_SVCDUMP").as_deref() == Ok("1") {
let host_sc = std::path::Path::new(
&std::env::var("SystemRoot").unwrap_or_else(|_| r"C:\Windows".to_string()),
)
.join(r"System32\sc.exe");
let dst_sc = uvm.os_files_dir().join(r"Windows\System32\sc.exe");
match std::fs::copy(&host_sc, &dst_sc) {
Ok(n) => step_log!(
"2-svcdump (windows-debug): injected sc.exe {} -> {} ({n} bytes)",
host_sc.display(),
dst_sc.display(),
),
Err(e) => step_log!(
"2-svcdump (windows-debug): sc.exe inject {} -> {} FAILED: {e}",
host_sc.display(),
dst_sc.display(),
),
}
}
}
let uvm_system = ComputeSystem::create(&uvm_system_id, &uvm_doc_json)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 2: HcsCreateComputeSystem (UVM): {e}"),
})?;
tracing::info!(
hcs_id = %hcs_id,
runtime_id = %format_guid_bare(uvm.runtime_id()),
"Hyper-V step 3a: GcsBridge::listen (bind host hvsock listener)"
);
step_log!(
"3a: GcsBridge::listen (bind host hvsock listener) runtime_id={}",
format_guid_bare(uvm.runtime_id())
);
let pending_bridge =
GcsBridge::listen(uvm.runtime_id())
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 3a: GcsBridge::listen: {e}"),
})?;
#[cfg(feature = "windows-debug")]
let log_fwd = {
tracing::info!(
hcs_id = %hcs_id,
runtime_id = %format_guid_bare(uvm.runtime_id()),
"Hyper-V step 3a' (windows-debug): bind host log-forward hvsock listener"
);
step_log!("3a' (windows-debug): bind host log-forward hvsock listener");
zlayer_gcs::log_forward::LogForwardListener::bind(uvm.runtime_id(), uvm.debug_dir())
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 3a': LogForwardListener::bind: {e}"),
})?
};
tracing::info!(hcs_id = %hcs_id, "Hyper-V step 3: HcsStartComputeSystem (UVM)");
step_log!("3: HcsStartComputeSystem (UVM)");
uvm_system
.start("")
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 3: HcsStartComputeSystem (UVM): {e}"),
})?;
#[cfg(feature = "windows-debug")]
log_fwd.spawn();
let diag_base_props = uvm_system
.properties("{}")
.await
.unwrap_or_else(|e| format!("<base-props query failed: {e}>"));
let diag_guest_conn = uvm_system
.properties(r#"{"PropertyTypes":["GuestConnection"]}"#)
.await
.unwrap_or_else(|e| format!("<guest-conn query failed: {e}>"));
tracing::info!(
hcs_id = %hcs_id,
runtime_id = %format_guid_bare(uvm.runtime_id()),
"Hyper-V step 4: PendingGcsBridge::accept (await guest GCS dial-out)"
);
step_log!("4: PendingGcsBridge::accept (await guest GCS dial-out, 120s timeout)");
let host_tz = crate::windows::timezone::host_timezone_information();
if host_tz.is_none() {
tracing::warn!(
hcs_id = %hcs_id,
"Hyper-V step 4: host timezone query returned TIME_ZONE_ID_INVALID; \
cold-start Create will fall back to the UTC TimeZoneInformation constant"
);
}
let accept_secs = std::env::var("ZLAYER_GCS_ACCEPT_TIMEOUT_SECS")
.ok()
.and_then(|s| s.parse::<u64>().ok())
.unwrap_or(120);
let accept_fut =
pending_bridge.accept(std::time::Duration::from_secs(accept_secs), host_tz);
tokio::pin!(accept_fut);
let mut state_poll = tokio::time::interval(std::time::Duration::from_secs(5));
let bridge = loop {
tokio::select! {
res = &mut accept_fut => {
match res {
Ok(bridge) => break bridge,
Err(e) => {
let dump = read_uvm_debug_dump(uvm.debug_dir());
#[allow(unused_mut)]
let mut svcdump_capture = String::new();
if std::env::var("ZLAYER_KEEP_UVM_ON_FAILURE").as_deref()
== Ok("1")
{
if let Err(te) = uvm_system.terminate("").await {
tracing::warn!(
hcs_id = %hcs_id,
error = %te,
"ZLAYER_KEEP_UVM_ON_FAILURE=1 — best-effort UVM terminate failed; scratch VHDX may remain locked until the VM is stopped manually",
);
}
let vhdx = uvm.scratch_vhdx().display().to_string();
tracing::warn!(
hcs_id = %hcs_id,
runtime_id = %format_guid_bare(uvm.runtime_id()),
scratch_vhdx = %vhdx,
"ZLAYER_KEEP_UVM_ON_FAILURE=1 — UVM TERMINATED and scratch VHDX PRESERVED for offline inspection",
);
let debug_root =
std::path::Path::new(r"C:\zlayer-uvm-debug");
let already_have_vhdx = std::fs::read_dir(debug_root)
.map(|rd| {
rd.filter_map(Result::ok).any(|e| {
e.path()
.extension()
.is_some_and(|x| x.eq_ignore_ascii_case("vhdx"))
})
})
.unwrap_or(false);
if already_have_vhdx {
step_log!(
"ZLAYER_KEEP_UVM_ON_FAILURE=1 — {} already contains a captured *.vhdx; skipping copy of {vhdx}",
debug_root.display(),
);
} else if let Err(ce) = std::fs::create_dir_all(debug_root) {
eprintln!(
"ZLAYER_KEEP_UVM_ON_FAILURE=1 — failed to create {}: {ce}; scratch VHDX NOT copied out (in-place copy at {vhdx} will be reaped on panic)",
debug_root.display(),
);
} else {
let dest = debug_root
.join(format!("{hcs_id}-scratch.vhdx"));
let dest_str = dest.display().to_string();
match std::fs::copy(uvm.scratch_vhdx(), &dest) {
Ok(_) => {
step_log!(
"ZLAYER_KEEP_UVM_ON_FAILURE=1 — scratch VHDX COPIED to {dest_str} (survives temp-dir reap). \
Read the guest event log + any WER dump OFFLINE from the COPY:\n\
\x20 Mount-VHD -Path '{dest_str}' -ReadOnly # note the drive letter, e.g. X:\n\
\x20 Get-WinEvent -Path 'X:\\Windows\\System32\\winevt\\Logs\\Application.evtx' | Select-Object -First 100 | Format-List\n\
\x20 Get-WinEvent -Path 'X:\\Windows\\System32\\winevt\\Logs\\System.evtx' | Select-Object -First 100 | Format-List\n\
\x20 Get-ChildItem 'X:\\zlayer-dbg\\*.dmp'\n\
\x20 Dismount-VHD -Path '{dest_str}'",
);
#[cfg(feature = "windows-debug")]
if std::env::var("ZLAYER_GCS_SVCDUMP").as_deref()
== Ok("1")
{
let cap = mount_and_read_svcdump(&dest);
step_log!(
"ZLAYER_GCS_SVCDUMP=1 — on-box svcdump capture from {dest_str}:\n{cap}",
);
svcdump_capture = cap;
}
}
Err(ce) => eprintln!(
"ZLAYER_KEEP_UVM_ON_FAILURE=1 — failed to copy scratch VHDX {vhdx} -> {dest_str}: {ce}; \
fall back to the in-place copy at {vhdx} if it survives",
),
}
}
}
let svcdump_fold = if svcdump_capture.is_empty() {
String::new()
} else {
format!(" || svcdump={svcdump_capture}")
};
return Err(AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 4: GcsBridge accept: {e} || create_id={uvm_system_id} \
|| base_props={diag_base_props} || guest_conn={diag_guest_conn} \
|| guest_debug_dump={dump}{svcdump_fold}"
),
});
}
}
}
_ = state_poll.tick() => {
match uvm_system.properties("{}").await {
Ok(props) => tracing::info!(
hcs_id = %hcs_id,
uvm_props = %props,
"Hyper-V step 4: UVM state poll (awaiting guest GCS dial-out)"
),
Err(e) => tracing::warn!(
hcs_id = %hcs_id,
error = %e,
"Hyper-V step 4: UVM state poll failed"
),
}
}
}
};
tracing::info!(hcs_id = %hcs_id, "Hyper-V step 4b: configureHvSocketForGCS");
step_log!("4b: configureHvSocketForGCS");
let hvsocket_setup_req = ModifySettingsRequest {
base: RequestBase {
activity_id: Uuid::new_v4(),
container_id: NULL_GUID_STR.to_string(),
},
request: serde_json::json!({
"ResourceType": "HvSocket",
"RequestType": "Update",
"Settings": {
"LocalAddress": format_guid_bare(uvm.runtime_id()),
"ParentAddress": format_guid_bare(
zlayer_gcs::transport::WINDOWS_GCS_HV_HOST_ID
),
}
}),
};
let hvsocket_setup_resp: ModifySettingsResponse = bridge
.send_rpc_json(RpcMessageType::ModifySettings, &hvsocket_setup_req)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 4b: GCS HvSocket setup: {e}"),
})?;
if hvsocket_setup_resp.result != 0 {
let hresult_u32 = u32::from_ne_bytes(hvsocket_setup_resp.result.to_ne_bytes());
return Err(AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 4b: GCS HvSocket setup returned HRESULT 0x{hresult_u32:08x}: {}",
hvsocket_setup_resp.error_message,
),
});
}
tracing::info!(
hcs_id = %hcs_id,
layer_count = parent_layers.len(),
"Hyper-V step 5: hot-attach per-layer VSMB shares"
);
step_log!(
"5: hot-attach per-layer VSMB shares ({} layers)",
parent_layers.len()
);
for (i, layer) in parent_layers.iter().enumerate() {
let share = VirtualSmbShare {
name: format!("s{i}"),
path: layer.path.clone(),
options: Some(VirtualSmbShareOptions {
read_only: true,
share_read: true,
cache_io: true,
pseudo_oplocks: true,
..Default::default()
}),
..Default::default()
};
uvm_system
.add_vsmb(&share)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 5: add_vsmb(s{i} -> {}): {e}", layer.id),
})?;
}
tracing::info!(
hcs_id = %hcs_id,
"Hyper-V step 6: hot-attach scratch VHDX on SCSI LUN 1"
);
step_log!("6: hot-attach scratch VHDX on SCSI LUN 1");
let scratch_vhd = scratch_layer
.layer_path()
.join("sandbox.vhdx")
.to_string_lossy()
.into_owned();
let scratch_attachment = ScsiAttachment {
path: scratch_vhd.clone(),
r#type: "VirtualDisk".to_string(),
read_only: Some(false),
};
uvm_system
.add_scsi(PRIMARY_SCSI_CTRL_GUID, 1, &scratch_attachment)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 6: add_scsi({PRIMARY_SCSI_CTRL_GUID}, 1, {scratch_vhd}): {e}"
),
})?;
let vsmb_controller_guid = "{dcc079ae-60ba-4d07-847c-3493609c0870}";
let guest_vsmb_paths: Vec<String> = (0..parent_layers.len())
.map(|n| format!(r"\\?\VMSMB\VSMB-{vsmb_controller_guid}\s{n}"))
.collect();
let guest_root_path = r"c:\mounts\scsi\m0".to_string();
tracing::info!(hcs_id = %hcs_id, scratch_mount = %guest_root_path, "Hyper-V step 6.5: GCS MappedVirtualDisk (mount scratch in guest)");
step_log!("6.5: GCS MappedVirtualDisk (mount scratch in guest) at {guest_root_path}");
let mount_scratch_req = ModifySettingsRequest {
base: RequestBase {
activity_id: Uuid::new_v4(),
container_id: NULL_GUID_STR.to_string(),
},
request: serde_json::json!({
"ResourceType": "MappedVirtualDisk",
"RequestType": "Add",
"Settings": { "ContainerPath": guest_root_path, "Lun": 1 },
}),
};
let mount_scratch_resp: ModifySettingsResponse = bridge
.send_rpc_json(RpcMessageType::ModifySettings, &mount_scratch_req)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 6.5: GCS MappedVirtualDisk (mount scratch): {e}"),
})?;
if mount_scratch_resp.result != 0 {
let hresult_u32 = u32::from_ne_bytes(mount_scratch_resp.result.to_ne_bytes());
return Err(AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 6.5: MappedVirtualDisk (scratch) returned HRESULT 0x{hresult_u32:08x}: {}",
mount_scratch_resp.error_message,
),
});
}
tracing::info!(
hcs_id = %hcs_id,
guest_root = %guest_root_path,
"Hyper-V step 7: CombineLayersWCOW via GCS"
);
step_log!("7: CombineLayersWCOW via GCS guest_root={guest_root_path}");
let combine_req = ModifySettingsRequest {
base: RequestBase {
activity_id: Uuid::new_v4(),
container_id: NULL_GUID_STR.to_string(),
},
request: serde_json::json!({
"ResourceType": "CombinedLayers",
"RequestType": "Add",
"Settings": {
"ContainerRootPath": guest_root_path,
"Layers": parent_layers
.iter()
.zip(guest_vsmb_paths.iter())
.map(|(orig, gp)| serde_json::json!({ "Id": orig.id, "Path": gp }))
.collect::<Vec<_>>(),
},
}),
};
let combine_resp: ModifySettingsResponse = bridge
.send_rpc_json(RpcMessageType::ModifySettings, &combine_req)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 7: GCS ModifySettings (CombineLayersWCOW): {e}"),
})?;
if combine_resp.result != 0 {
let hresult_u32 = u32::from_ne_bytes(combine_resp.result.to_ne_bytes());
return Err(AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 7: CombineLayersWCOW returned HRESULT 0x{hresult_u32:08x}: {}",
combine_resp.error_message,
),
});
}
if let Some(network_attachment) = network_attachment {
let namespace_guid = network_attachment.namespace_guid.clone();
let ns_id =
GUID::try_from(namespace_guid.as_str()).map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 7.5: overlayd namespace GUID {namespace_guid} unparseable: {e}"
),
})?;
let endpoint_guid = {
let ns_for_lookup = ns_id;
let endpoints = tokio::task::spawn_blocking(move || {
zlayer_hns::namespace::Namespace::open(ns_for_lookup)
.and_then(|ns| ns.list_endpoints())
})
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 7.5: spawn_blocking join failed: {e}"),
})?
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 7.5: failed to list endpoints for namespace {namespace_guid}: {e}"
),
})?;
endpoints
.into_iter()
.next()
.ok_or_else(|| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 7.5: overlayd namespace {namespace_guid} has no endpoints"
),
})?
};
let endpoint_id_bare = endpoint_guid
.trim_matches(|c: char| c == '{' || c == '}')
.to_ascii_lowercase();
tracing::info!(
hcs_id = %hcs_id,
endpoint_id = %endpoint_id_bare,
namespace_id = %namespace_guid,
"Hyper-V step 7.5: wiring HCN endpoint into UVM network compartment"
);
step_log!(
"7.5: wiring HCN endpoint into UVM network compartment endpoint_id={} namespace_id={}",
endpoint_id_bare,
namespace_guid,
);
let add_endpoint_req = ModifySettingsRequest {
base: RequestBase {
activity_id: Uuid::new_v4(),
container_id: hcs_id.to_string(),
},
request: serde_json::json!({
"ResourcePath": "Container/Networks",
"RequestType": "Add",
"Settings": {
"NamespaceId": namespace_guid,
"EndpointId": endpoint_id_bare,
"AllocatedIPAddress": network_attachment.ip.to_string(),
},
}),
};
let add_endpoint_resp: ModifySettingsResponse = bridge
.send_rpc_json(RpcMessageType::ModifySettings, &add_endpoint_req)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 7.5 (network compartment attach): {e}"),
})?;
if add_endpoint_resp.result != 0 {
let hresult_u32 = u32::from_ne_bytes(add_endpoint_resp.result.to_ne_bytes());
return Err(AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 7.5 (network compartment attach): guest GCS returned \
HRESULT 0x{:08x}: {}",
hresult_u32, add_endpoint_resp.error_message,
),
});
}
} else {
step_log!(
"7.5: skipped network compartment attach — no overlay attachment \
(degraded; container has no overlay NIC)"
);
}
let guest_layers: Vec<zlayer_hcs::schema::Layer> = parent_layers
.iter()
.zip(guest_vsmb_paths.iter())
.map(|(orig, guest_path)| zlayer_hcs::schema::Layer {
id: orig.id.clone(),
path: guest_path.clone(),
})
.collect();
let namespace_id = namespace_strs.first().cloned();
let hosted_doc =
build_hosted_container_doc(spec, guest_layers, guest_root_path, namespace_id, hcs_id);
tracing::info!(hcs_id = %hcs_id, "Hyper-V step 9: GCS RpcCreate (hosted container)");
step_log!("9: GCS RpcCreate (hosted container)");
let container_value =
serde_json::to_value(&hosted_doc).map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 9: serialize hosted-container body: {e}"),
})?;
let create_settings = serde_json::json!({
"SchemaVersion": { "Major": 2, "Minor": 1 },
"Container": container_value,
});
let create_req = CreateRequest {
base: RequestBase {
activity_id: Uuid::new_v4(),
container_id: hcs_id.to_string(),
},
container_config: zlayer_gcs::protocol::AnyInString::new(create_settings),
};
let create_resp: CreateResponse = bridge
.send_rpc_json(RpcMessageType::Create, &create_req)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 9: GCS RpcCreate: {e}"),
})?;
if create_resp.result != 0 {
let hresult_u32 = u32::from_ne_bytes(create_resp.result.to_ne_bytes());
return Err(AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 9: GCS RpcCreate returned HRESULT 0x{hresult_u32:08x}: {}",
create_resp.error_message,
),
});
}
tracing::info!(hcs_id = %hcs_id, "Hyper-V step 10: GCS RpcStart (hosted container)");
step_log!("10: GCS RpcStart (hosted container)");
let start_req = StartRequest {
base: RequestBase {
activity_id: Uuid::new_v4(),
container_id: hcs_id.to_string(),
},
};
let start_resp: StartResponse = bridge
.send_rpc_json(RpcMessageType::Start, &start_req)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!("Hyper-V step 10: GCS RpcStart: {e}"),
})?;
if start_resp.result != 0 {
let hresult_u32 = u32::from_ne_bytes(start_resp.result.to_ne_bytes());
return Err(AgentError::CreateFailed {
id: hcs_id.to_string(),
reason: format!(
"Hyper-V step 10: GCS RpcStart returned HRESULT 0x{hresult_u32:08x}: {}",
start_resp.error_message,
),
});
}
Ok((
uvm_system,
HyperVGcsState {
bridge: Some(bridge),
},
))
}
}
/// Stand-in used on Windows builds compiled without the `hcs-runtime` feature.
#[cfg(all(target_os = "windows", not(feature = "hcs-runtime")))]
impl HcsRuntime {
    /// Always fails with [`AgentError::Unsupported`]; every argument except
    /// `hcs_id` (which is echoed in the error text) is ignored.
    #[allow(clippy::too_many_arguments, clippy::unused_async)]
    async fn hyperv_create_via_gcs(
        &self,
        hcs_id: &str,
        _spec: &ServiceSpec,
        _scratch_layer: &scratch::WritableLayer,
        _parent_layers: &[zlayer_hcs::schema::Layer],
        _namespace_strs: &[String],
        _uvm: &Uvm,
        _network_attachment: Option<&WindowsOverlayAttach>,
    ) -> Result<(ComputeSystem, HyperVGcsState)> {
        let message = format!(
            "Hyper-V isolation requires the `hcs-runtime` cargo feature \
             (zlayer-gcs); rebuild zlayer-agent with --features hcs-runtime \
             or set `isolation: process` for {hcs_id}",
        );
        Err(AgentError::Unsupported(message))
    }
}
/// All-zero GUID in bare (brace-less) string form. The Hyper-V create path
/// passes it as the `container_id` of the GCS `ModifySettings` requests it
/// sends before the hosted container exists (HvSocket, MappedVirtualDisk,
/// CombinedLayers).
#[cfg(all(target_os = "windows", feature = "hcs-runtime"))]
const NULL_GUID_STR: &str = "00000000-0000-0000-0000-000000000000";
/// Extra state returned alongside the compute system by
/// `hyperv_create_via_gcs`. Without the `hcs-runtime` feature the struct has
/// no fields.
#[derive(Default)]
struct HyperVGcsState {
/// GCS bridge to the utility VM; the Hyper-V create path stores
/// `Some(bridge)` here on success.
#[cfg(feature = "hcs-runtime")]
bridge: Option<zlayer_gcs::bridge::GcsBridge>,
}
/// Virtual processor count written into the UVM compute topology.
const UVM_DEFAULT_VCPUS: u32 = 2;
/// Memory size, in MiB, written into the UVM compute topology.
const UVM_DEFAULT_MEMORY_MB: u64 = 1024;
/// Key of the single SCSI controller in the UVM document; also the controller
/// targeted when the container scratch VHDX is hot-attached.
const PRIMARY_SCSI_CTRL_GUID: &str = "df6d0690-79e5-55b6-a5ec-c1e2f77f580a";
/// HvSocket service id registered in the UVM's HvSocket service table.
// NOTE(review): presumably the guest logging channel, judging by the name — confirm.
const WINDOWS_LOGGING_HVSOCK_SERVICE_ID: &str = "172dad59-976d-45f2-8b6c-6d1b13f2ac4d";
/// Encodes `strs` as a multi-string blob and returns it base64-encoded.
///
/// Layout: every string as UTF-16LE code units followed by a 16-bit NUL, then
/// one extra 16-bit NUL closing the list. The bytes are encoded with the
/// standard base64 alphabet.
fn encode_multi_sz_utf16le(strs: &[&str]) -> String {
    use base64::Engine;

    let payload: Vec<u8> = strs
        .iter()
        // Each entry is NUL-terminated...
        .flat_map(|s| s.encode_utf16().chain(std::iter::once(0u16)))
        // ...and the whole list ends with one more NUL.
        .chain(std::iter::once(0u16))
        .flat_map(u16::to_le_bytes)
        .collect();
    base64::engine::general_purpose::STANDARD.encode(&payload)
}
/// Summarises the regular files directly inside `debug_dir` as one line of
/// text: `name=body` entries, sorted by path and joined with `" || "`.
///
/// * `.vmrs` / `.dmp` files (extension compared case-insensitively) are
///   reported by size only.
/// * Every other file contributes at most 2 KiB, lossily decoded as UTF-8 with
///   CR/LF replaced by spaces and trimmed; longer files get a truncation marker.
///
/// Returns `"(empty)"` when the directory cannot be listed and `"(no-files)"`
/// when it contains no regular files.
#[cfg(feature = "hcs-runtime")]
fn read_uvm_debug_dump(debug_dir: &std::path::Path) -> String {
    const PER_FILE_CAP: usize = 2 * 1024;

    let mut files: Vec<std::path::PathBuf> = match std::fs::read_dir(debug_dir) {
        Ok(entries) => entries
            .filter_map(Result::ok)
            .map(|entry| entry.path())
            .filter(|candidate| candidate.is_file())
            .collect(),
        Err(_) => return "(empty)".to_string(),
    };
    if files.is_empty() {
        return "(no-files)".to_string();
    }
    files.sort();

    // Renders the body of a single file.
    let describe = |path: &std::path::Path| -> String {
        let binary = path
            .extension()
            .and_then(|ext| ext.to_str())
            .is_some_and(|ext| ext.eq_ignore_ascii_case("vmrs") || ext.eq_ignore_ascii_case("dmp"));
        if binary {
            return match std::fs::metadata(path) {
                Ok(meta) => format!("<binary crash artifact, {} bytes>", meta.len()),
                Err(e) => format!("<binary crash artifact, stat error: {e}>"),
            };
        }
        // Read one byte beyond the cap so truncation can be detected.
        match read_prefix(path, PER_FILE_CAP + 1) {
            Err(e) => format!("<read error: {e}>"),
            Ok(bytes) => {
                let kept = &bytes[..bytes.len().min(PER_FILE_CAP)];
                let mut text = String::from_utf8_lossy(kept)
                    .replace(['\n', '\r'], " ")
                    .trim()
                    .to_string();
                if bytes.len() > PER_FILE_CAP {
                    text.push_str("…<truncated>");
                }
                text
            }
        }
    };

    files
        .iter()
        .map(|path| {
            let name = path
                .file_name()
                .map_or_else(|| "?".to_string(), |n| n.to_string_lossy().into_owned());
            format!("{name}={}", describe(path))
        })
        .collect::<Vec<_>>()
        .join(" || ")
}
/// Reads up to `max` bytes from the start of the file at `path`.
///
/// Keeps reading until `max` bytes were collected or end-of-file is reached,
/// so short reads do not cut the result short.
///
/// # Errors
/// Returns any error from opening or reading the file.
#[cfg(feature = "hcs-runtime")]
fn read_prefix(path: &std::path::Path, max: usize) -> std::io::Result<Vec<u8>> {
    use std::io::Read;

    let mut file = std::fs::File::open(path)?;
    let mut data = vec![0u8; max];
    let mut len = 0usize;
    loop {
        let window = &mut data[len..];
        if window.is_empty() {
            break;
        }
        let got = file.read(window)?;
        if got == 0 {
            // End of file before the cap was reached.
            break;
        }
        len += got;
    }
    data.truncate(len);
    Ok(data)
}
/// Mounts `vhdx` read-only through PowerShell, looks for
/// `\zlayer-dbg\svcdump.txt` on each lettered partition, and returns its
/// contents (capped at 16 KiB). The image is dismounted again in a `finally`
/// block, ignoring dismount errors.
///
/// Never fails: problems are reported in-band as a parenthesised marker
/// (`(mount-spawn-failed: …)`, `(mount-failed: …)`, `(mounted-no-svcdump)`,
/// `(empty)`).
#[cfg(all(feature = "hcs-runtime", feature = "windows-debug"))]
fn mount_and_read_svcdump(vhdx: &std::path::Path) -> String {
    const SVCDUMP_CAP: usize = 16 * 1024;
    // The path is embedded in a single-quoted PowerShell literal; inside such
    // a literal the only special character is `'`, escaped by doubling it.
    let vhdx_str = vhdx.display().to_string().replace('\'', "''");
    let script = format!(
        r#"$ErrorActionPreference='Stop'
$p='{vhdx_str}'
try {{
$img = Mount-DiskImage -ImagePath $p -Access ReadOnly -PassThru
$disk = $img | Get-DiskImage | Get-Disk
$letters = ($disk | Get-Partition | Where-Object {{ $_.DriveLetter }} | ForEach-Object {{ $_.DriveLetter }})
$found=$null
foreach ($dl in $letters) {{
$f = ($dl + ':\zlayer-dbg\svcdump.txt')
if (Test-Path $f) {{ $found=$f; break }}
}}
if ($found) {{ Get-Content -Raw -LiteralPath $found }}
else {{ Write-Output '(mounted-no-svcdump)' }}
}} finally {{
try {{ Dismount-DiskImage -ImagePath $p | Out-Null }} catch {{}}
}}"#,
    );
    let out = match std::process::Command::new("powershell")
        .args(["-NoProfile", "-NonInteractive", "-Command", &script])
        .output()
    {
        Ok(o) => o,
        Err(e) => return format!("(mount-spawn-failed: {e})"),
    };
    if !out.status.success() {
        return format!(
            "(mount-failed: status={} err={})",
            out.status,
            String::from_utf8_lossy(&out.stderr)
                .replace(['\n', '\r'], " ")
                .trim(),
        );
    }
    let body = String::from_utf8_lossy(&out.stdout);
    let trimmed = body.trim();
    if trimmed.is_empty() {
        return "(empty)".to_string();
    }
    if trimmed.len() > SVCDUMP_CAP {
        // Slicing a `str` at an arbitrary byte offset panics when the offset
        // lands inside a multi-byte character, so back up to the nearest
        // boundary at or below the cap (offset 0 is always a boundary).
        let mut end = SVCDUMP_CAP;
        while !trimmed.is_char_boundary(end) {
            end -= 1;
        }
        format!("{}…<truncated>", &trimmed[..end])
    } else {
        trimmed.to_string()
    }
}
/// Assembles the registry values injected into the utility VM.
///
/// Always present: `EnableCompartmentNamespace = 1` under the `gns` service key.
///
/// Optional, each enabled by an environment variable set to exactly `"1"`:
/// * `ZLAYER_GCS_STRIP_DEPS` — overrides the `gcs` service's `DependOnService`
///   with `condrv` + `hvsocketcontrol` (a warning is logged).
/// * `ZLAYER_GCS_KD` (`windows-debug` builds) — sets the `DEFAULT` debug print
///   filter to `0x7FFF_FFFF`.
/// * `ZLAYER_GCS_SVCDUMP` (`windows-debug` builds) — registers a
///   `zlayer-svcdump` service whose command line repeatedly appends `sc`
///   query output to `C:\zlayer-dbg\svcdump.txt`.
///
/// `windows-debug` builds additionally append the values returned by
/// `build_uvm_debug_registry_changes`.
fn build_uvm_registry_changes() -> Vec<RegistryValue> {
    let env_flag_set = |var: &str| std::env::var(var).as_deref() == Ok("1");
    let system_key = |name: &str| {
        Some(RegistryKey {
            hive: Some(RegistryHive::System),
            name: name.to_string(),
        })
    };

    let mut values = vec![RegistryValue {
        key: system_key(r"CurrentControlSet\Services\gns"),
        name: "EnableCompartmentNamespace".to_string(),
        r#type: Some(RegistryValueType::DWord),
        d_word_value: Some(1),
        ..Default::default()
    }];

    if env_flag_set("ZLAYER_GCS_STRIP_DEPS") {
        tracing::warn!(
            "ZLAYER_GCS_STRIP_DEPS=1 — stripping gcs DependOnService to \
             condrv+hvsocketcontrol (debug A/B; not the normal path)"
        );
        values.push(RegistryValue {
            key: system_key(r"CurrentControlSet\Services\gcs"),
            name: "DependOnService".to_string(),
            r#type: Some(RegistryValueType::MultiString),
            binary_value: encode_multi_sz_utf16le(&["condrv", "hvsocketcontrol"]),
            ..Default::default()
        });
    }

    #[cfg(feature = "windows-debug")]
    if env_flag_set("ZLAYER_GCS_KD") {
        values.push(RegistryValue {
            key: system_key(r"CurrentControlSet\Control\Session Manager\Debug Print Filter"),
            name: "DEFAULT".to_string(),
            r#type: Some(RegistryValueType::DWord),
            d_word_value: Some(0x7FFF_FFFF),
            ..Default::default()
        });
    }

    #[cfg(feature = "windows-debug")]
    if env_flag_set("ZLAYER_GCS_SVCDUMP") {
        let svc = r"CurrentControlSet\Services\zlayer-svcdump";
        // Service command line: 12 iterations of `sc` queries appended to
        // svcdump.txt, with a `ping` between iterations as a delay.
        let image_path = concat!(
            r#"%comspec% /c "md C:\zlayer-dbg 2>nul & for /l %i in (1,1,12) do ("#,
            r#"echo ===== iter %i ===== >> C:\zlayer-dbg\svcdump.txt & "#,
            r#"sc queryex mpssvc >> C:\zlayer-dbg\svcdump.txt 2>&1 & "#,
            r#"sc queryex netsetupsvc >> C:\zlayer-dbg\svcdump.txt 2>&1 & "#,
            r#"sc queryex BFE >> C:\zlayer-dbg\svcdump.txt 2>&1 & "#,
            r#"sc queryex RpcSs >> C:\zlayer-dbg\svcdump.txt 2>&1 & "#,
            r#"sc queryex nsi >> C:\zlayer-dbg\svcdump.txt 2>&1 & "#,
            r#"sc queryex gcs >> C:\zlayer-dbg\svcdump.txt 2>&1 & "#,
            r#"sc qc mpssvc >> C:\zlayer-dbg\svcdump.txt 2>&1 & "#,
            r#"sc qc netsetupsvc >> C:\zlayer-dbg\svcdump.txt 2>&1 & "#,
            r#"sc query >> C:\zlayer-dbg\svcdump.txt 2>&1 & "#,
            r#"ping -n 6 127.0.0.1 >nul )""#,
        );
        values.push(RegistryValue {
            key: system_key(svc),
            name: "Start".to_string(),
            r#type: Some(RegistryValueType::DWord),
            d_word_value: Some(2),
            ..Default::default()
        });
        values.push(RegistryValue {
            key: system_key(svc),
            name: "Type".to_string(),
            r#type: Some(RegistryValueType::DWord),
            d_word_value: Some(0x10),
            ..Default::default()
        });
        values.push(RegistryValue {
            key: system_key(svc),
            name: "ErrorControl".to_string(),
            r#type: Some(RegistryValueType::DWord),
            d_word_value: Some(0),
            ..Default::default()
        });
        values.push(RegistryValue {
            key: system_key(svc),
            name: "ImagePath".to_string(),
            r#type: Some(RegistryValueType::ExpandedString),
            string_value: image_path.to_string(),
            ..Default::default()
        });
    }

    #[cfg(feature = "windows-debug")]
    values.extend(build_uvm_debug_registry_changes());

    values
}
/// Registry values that configure Windows Error Reporting `LocalDumps` for
/// `vmcomputeagent.exe` inside the utility VM: dump folder `C:\zlayer-dbg`,
/// `DumpType = 2`, `DumpCount = 5`. Only built with the `windows-debug` feature.
#[cfg(feature = "windows-debug")]
fn build_uvm_debug_registry_changes() -> Vec<RegistryValue> {
    const LOCAL_DUMPS_KEY: &str =
        r"Microsoft\Windows\Windows Error Reporting\LocalDumps\vmcomputeagent.exe";

    // All three values live under the same Software-hive key.
    let local_dumps_key = || {
        Some(RegistryKey {
            hive: Some(RegistryHive::Software),
            name: LOCAL_DUMPS_KEY.to_string(),
        })
    };

    let dump_folder = RegistryValue {
        key: local_dumps_key(),
        name: "DumpFolder".to_string(),
        r#type: Some(RegistryValueType::ExpandedString),
        string_value: r"C:\zlayer-dbg".to_string(),
        ..Default::default()
    };
    let dump_type = RegistryValue {
        key: local_dumps_key(),
        name: "DumpType".to_string(),
        r#type: Some(RegistryValueType::DWord),
        d_word_value: Some(2),
        ..Default::default()
    };
    let dump_count = RegistryValue {
        key: local_dumps_key(),
        name: "DumpCount".to_string(),
        r#type: Some(RegistryValueType::DWord),
        d_word_value: Some(5),
        ..Default::default()
    };

    vec![dump_folder, dump_type, dump_count]
}
#[allow(clippy::too_many_lines)] fn build_virtual_machine_doc(
uvm: &Uvm,
_parent_layers: &[zlayer_hcs::schema::Layer],
spec: &ServiceSpec,
gpu_adapters: &[HostGpuAdapter],
) -> VirtualMachine {
use std::collections::BTreeMap;
let mut scsi_attachments: BTreeMap<String, ScsiAttachment> = BTreeMap::new();
scsi_attachments.insert(
"0".to_string(),
ScsiAttachment {
path: uvm.scratch_vhdx().to_string_lossy().into_owned(),
r#type: "VirtualDisk".to_string(),
read_only: Some(false),
},
);
let mut scsi: BTreeMap<String, ScsiController> = BTreeMap::new();
scsi.insert(
PRIMARY_SCSI_CTRL_GUID.to_string(),
ScsiController {
attachments: scsi_attachments,
},
);
let shares: Vec<VirtualSmbShare> = vec![VirtualSmbShare {
name: "os".to_string(),
path: uvm.os_files_dir().to_string_lossy().into_owned(),
options: Some(VirtualSmbShareOptions {
read_only: true,
share_read: true,
cache_io: true,
pseudo_oplocks: true,
take_backup_privilege: true,
..Default::default()
}),
..Default::default()
}];
let virtual_smb = Some(VirtualSmb {
shares,
direct_file_mapping_in_mb: Some(1024),
});
let debug_options: Option<DebugOptions> = None;
let guest_crash_reporting: Option<GuestCrashReporting> = None;
let gpu = spec.resources.gpu.as_ref().map(|_| {
let requests: Vec<GpuAssignmentRequest> = gpu_adapters
.iter()
.map(|a| GpuAssignmentRequest {
#[allow(clippy::cast_sign_loss)]
virtual_machine_id_string: format!(
"0x{:08x}:0x{:08x}",
a.luid_high, a.luid_low as u32,
),
adapter_luid_high_part: a.luid_high,
adapter_luid_low_part: a.luid_low,
})
.collect();
let mode = if requests.is_empty() {
GpuAssignmentMode::Default
} else {
GpuAssignmentMode::List
};
GpuAssignment {
assignment_mode: mode,
assignment_request: requests,
allow_vendor_extension: Some(true),
}
});
VirtualMachine {
stop_on_reset: true,
registry_changes: Some(RegistryChanges {
add_values: build_uvm_registry_changes(),
}),
debug_options,
chipset: Some(Chipset {
uefi: Some(Uefi {
boot_this: Some(UefiBootEntry {
device_type: "VmbFs".to_string(),
device_path: r"\EFI\Microsoft\Boot\bootmgfw.efi".to_string(),
disk_number: None,
}),
..Default::default()
}),
}),
compute_topology: Some(Topology {
memory: Some(TopologyMemory {
size_in_mb: UVM_DEFAULT_MEMORY_MB,
allow_overcommit: Some(true),
enable_hot_hint: Some(true),
}),
processor: Some(TopologyProcessor {
count: UVM_DEFAULT_VCPUS,
}),
}),
devices: Some(Devices {
com_ports: {
let mut m = BTreeMap::new();
m.insert(
"0".to_string(),
zlayer_hcs::schema::ComPort {
named_pipe: format!(
r"\\.\pipe\zlayer-uvm-{}-com1",
format_guid_bare(uvm.runtime_id())
),
optimize_for_debugger: std::env::var("ZLAYER_GCS_KD").as_deref() == Ok("1"),
},
);
m
},
scsi,
virtual_smb,
gpu,
hv_socket: Some(HvSocket2 {
hv_socket_config: Some(HvSocketSystemConfig {
default_bind_security_descriptor: "D:P(A;;FA;;;SY)(A;;FA;;;BA)".to_string(),
service_table: {
let mut table = BTreeMap::new();
table.insert(
WINDOWS_LOGGING_HVSOCK_SERVICE_ID.to_string(),
HvSocketServiceConfig {
bind_security_descriptor: "D:P(A;;FA;;;SY)(A;;FA;;;BA)".to_string(),
connect_security_descriptor: "D:P(A;;FA;;;SY)(A;;FA;;;BA)"
.to_string(),
allow_wildcard_binds: true,
..Default::default()
},
);
table
},
..Default::default()
}),
}),
guest_crash_reporting,
}),
guest_state: None,
runtime_state_file_path: None,
}
}
#[allow(clippy::unnecessary_wraps, dead_code)]
fn build_container_processor(spec: &ServiceSpec) -> Option<ContainerProcessor> {
Some(
spec.resources
.cpu
.and_then(|cpu| {
let count = cpu.ceil();
#[allow(clippy::cast_sign_loss, clippy::cast_possible_truncation)]
let count_u32 = if count.is_finite() && count >= 1.0 {
count as u32
} else {
return None;
};
Some(ContainerProcessor {
count: Some(count_u32),
maximum: None,
weight: None,
})
})
.unwrap_or_default(),
)
}
/// Translates the spec's memory string into a `ContainerMemory` limit in MiB,
/// rounding up to the next whole MiB.
///
/// Returns `None` when no memory limit is set or when the string cannot be
/// parsed (the parse error is discarded).
#[allow(dead_code)]
fn build_container_memory(spec: &ServiceSpec) -> Option<ContainerMemory> {
    let requested = spec.resources.memory.as_ref()?;
    let bytes = crate::bundle::parse_memory_string(requested).ok()?;
    Some(ContainerMemory {
        size_in_mb: Some(bytes.div_ceil(1024 * 1024)),
    })
}
/// Builds a compute-system document that describes only a utility VM: no
/// `container` section, an empty `hosting_system_id`, the default schema
/// version, and termination when the last handle closes.
///
/// `uvm_id` is forwarded to `apply_service_id`; the remaining arguments feed
/// `build_virtual_machine_doc`.
#[allow(dead_code)]
fn build_uvm_only_doc(
    owner_tag: &str,
    uvm_id: &str,
    uvm: &Uvm,
    parent_layers: &[zlayer_hcs::schema::Layer],
    spec: &ServiceSpec,
    gpu_adapters: &[HostGpuAdapter],
) -> HcsDoc {
    let virtual_machine = Some(build_virtual_machine_doc(
        uvm,
        parent_layers,
        spec,
        gpu_adapters,
    ));
    let doc = HcsDoc {
        owner: owner_tag.to_string(),
        schema_version: SchemaVersion::default(),
        hosting_system_id: String::new(),
        container: None,
        virtual_machine,
        should_terminate_on_last_handle_closed: Some(true),
    };
    doc.apply_service_id(uvm_id)
}
/// Builds the `Container` document for a container hosted inside a utility VM.
///
/// Storage points at `guest_root_volume` with `guest_layers` as its layer
/// list. A networking section (unqualified DNS queries allowed, empty search
/// list) is emitted only when `namespace_id` is given. No guest OS, mapped
/// directories/pipes, processor or memory settings are set. `_spec` and
/// `_hcs_id` are unused.
#[allow(dead_code)]
fn build_hosted_container_doc(
    _spec: &ServiceSpec,
    guest_layers: Vec<zlayer_hcs::schema::Layer>,
    guest_root_volume: String,
    namespace_id: Option<String>,
    _hcs_id: &str,
) -> HcsContainer {
    let networking = match namespace_id {
        Some(namespace) => Some(zlayer_hcs::schema::ContainerNetworking {
            allow_unqualified_dns_query: Some(true),
            dns_search_list: Vec::new(),
            namespace: Some(namespace),
            network_shared_container_name: None,
        }),
        None => None,
    };

    HcsContainer {
        guest_os: None,
        storage: Some(HcsStorage {
            layers: guest_layers,
            path: Some(guest_root_volume),
        }),
        networking,
        mapped_directories: Vec::new(),
        mapped_pipes: Vec::new(),
        processor: None,
        memory: None,
    }
}
/// One host GPU adapter as reported by DXGI enumeration
/// (`enumerate_host_gpu_adapters`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub(crate) struct HostGpuAdapter {
/// High part of the adapter LUID, cast to `u32` from the DXGI descriptor.
pub luid_high: u32,
/// Low part of the adapter LUID, cast to `i32` from the DXGI descriptor.
pub luid_low: i32,
/// Adapter description string, cut at the first NUL and lossily decoded from UTF-16.
pub description: String,
/// Vendor id from the adapter descriptor; used for vendor filtering.
pub vendor_id: u32,
/// Device id from the adapter descriptor.
pub device_id: u32,
}
/// Vendor id of adapters that host GPU enumeration skips.
// NOTE(review): 0x1414 is Microsoft's vendor id (e.g. the Basic Render Driver) — confirm intent.
const VENDOR_ID_MICROSOFT_BASIC: u32 = 0x1414;
/// Lists the host's GPU adapters through DXGI, in enumeration order.
///
/// Adapters whose vendor id equals `VENDOR_ID_MICROSOFT_BASIC` are left out.
/// Enumeration stops when `EnumAdapters` reports HRESULT `0x887A0002`
/// (`DXGI_ERROR_NOT_FOUND`).
///
/// # Errors
/// Returns an `io::Error` if the DXGI factory cannot be created, if
/// `EnumAdapters` fails with any other HRESULT, or if `GetDesc` fails.
#[cfg(target_os = "windows")]
fn enumerate_host_gpu_adapters() -> std::io::Result<Vec<HostGpuAdapter>> {
    use windows::Win32::Graphics::Dxgi::{CreateDXGIFactory1, IDXGIAdapter, IDXGIFactory1};

    // HRESULT that `EnumAdapters` returns once `index` is past the last adapter.
    const END_OF_ADAPTERS: u32 = 0x887A_0002;

    // SAFETY: plain DXGI factory creation; no pointers are passed in.
    let factory: IDXGIFactory1 = unsafe { CreateDXGIFactory1() }
        .map_err(|e| std::io::Error::other(format!("CreateDXGIFactory1 failed: {e}")))?;

    let mut found = Vec::new();
    let mut index: u32 = 0;
    loop {
        // SAFETY: `factory` is the live interface created above.
        let adapter: IDXGIAdapter = match unsafe { factory.EnumAdapters(index) } {
            Ok(adapter) => adapter,
            Err(e) => {
                #[allow(clippy::cast_sign_loss)]
                let code = e.code().0 as u32;
                if code == END_OF_ADAPTERS {
                    break;
                }
                return Err(std::io::Error::other(format!(
                    "IDXGIFactory1::EnumAdapters({index}) failed: {e}",
                )));
            }
        };
        // SAFETY: `adapter` was just returned by `EnumAdapters`.
        let desc = unsafe { adapter.GetDesc() }.map_err(|e| {
            std::io::Error::other(format!("IDXGIAdapter::GetDesc({index}) failed: {e}"))
        })?;

        if desc.VendorId != VENDOR_ID_MICROSOFT_BASIC {
            // The description is a fixed-size, NUL-padded UTF-16 buffer.
            let name_len = desc.Description.iter().take_while(|&&unit| unit != 0).count();
            let description = String::from_utf16_lossy(&desc.Description[..name_len]);
            #[allow(clippy::cast_sign_loss)]
            let luid_high = desc.AdapterLuid.HighPart as u32;
            #[allow(clippy::cast_possible_wrap)]
            let luid_low = desc.AdapterLuid.LowPart as i32;
            found.push(HostGpuAdapter {
                luid_high,
                luid_low,
                description,
                vendor_id: desc.VendorId,
                device_id: desc.DeviceId,
            });
        }
        index += 1;
    }
    Ok(found)
}
/// Non-Windows stand-in: always fails with `ErrorKind::Unsupported`.
#[cfg(not(target_os = "windows"))]
fn enumerate_host_gpu_adapters() -> std::io::Result<Vec<HostGpuAdapter>> {
    let unsupported = std::io::Error::new(
        std::io::ErrorKind::Unsupported,
        "DXGI host GPU enumeration is Windows-only",
    );
    Err(unsupported)
}
/// Maps a vendor name from a GPU spec to its vendor id.
///
/// Matching ignores ASCII case. Recognised names are `nvidia`, `amd` / `ati`
/// and `intel`; anything else yields `None`.
fn vendor_id_for_spec(vendor: &str) -> Option<u32> {
    const KNOWN_VENDORS: [(&str, u32); 4] = [
        ("nvidia", 0x10de),
        ("amd", 0x1002),
        ("ati", 0x1002),
        ("intel", 0x8086),
    ];
    KNOWN_VENDORS
        .iter()
        .find(|(name, _)| vendor.eq_ignore_ascii_case(name))
        .map(|&(_, id)| id)
}
/// Selects the adapters that satisfy `spec`, preserving input order.
///
/// * Vendor: an empty vendor or `"all"` (ASCII case-insensitive) disables the
///   vendor filter; otherwise the vendor name is mapped through
///   `vendor_id_for_spec` and compared with the adapter's vendor id.
/// * Model: when set, the adapter description must contain it (ASCII
///   case-insensitive substring match).
/// * Count: at most `max(spec.count, 1)` adapters are returned.
///
/// NOTE(review): a vendor name that `vendor_id_for_spec` does not recognise
/// also disables the vendor filter, so it matches every adapter — confirm
/// this is intended.
fn filter_adapters_by_gpu_spec(
    adapters: &[HostGpuAdapter],
    spec: &zlayer_spec::GpuSpec,
) -> Vec<HostGpuAdapter> {
    let any_vendor = spec.vendor.is_empty() || spec.vendor.eq_ignore_ascii_case("all");
    let wanted_vendor = if any_vendor {
        None
    } else {
        vendor_id_for_spec(&spec.vendor)
    };
    let model_needle = spec.model.as_deref().map(str::to_ascii_lowercase);
    let limit = spec.count.max(1) as usize;

    adapters
        .iter()
        .filter(|adapter| {
            let vendor_ok = wanted_vendor.map_or(true, |vid| adapter.vendor_id == vid);
            vendor_ok
                && model_needle.as_deref().map_or(true, |needle| {
                    adapter.description.to_ascii_lowercase().contains(needle)
                })
        })
        // Stop at the requested count before cloning anything.
        .take(limit)
        .cloned()
        .collect()
}
/// Drop guard over a set of activated parent layers: while armed, dropping it
/// calls `deactivate_layer` on every stored path.
struct ParentLayerActivationGuard {
/// Paths of the layers this guard is responsible for deactivating.
layers: Vec<PathBuf>,
/// When `false`, `Drop` does nothing.
armed: bool,
}
impl ParentLayerActivationGuard {
    /// Creates an armed guard over `layers`.
    fn new(layers: Vec<PathBuf>) -> Self {
        Self {
            armed: true,
            layers,
        }
    }

    /// Disarms the guard and hands the layer paths back to the caller, so
    /// dropping the guard no longer deactivates anything.
    fn disarm(mut self) -> Vec<PathBuf> {
        let released = std::mem::take(&mut self.layers);
        self.armed = false;
        released
    }
}
impl Drop for ParentLayerActivationGuard {
    /// Deactivates every guarded layer unless the guard was disarmed.
    ///
    /// Failures are logged as warnings and do not stop the remaining layers
    /// from being deactivated.
    fn drop(&mut self) {
        if !self.armed {
            return;
        }
        // Unwind last-activated-first, the same order
        // `rollback_parent_activations` uses, so both rollback paths tear
        // layers down consistently.
        for path in self.layers.iter().rev() {
            if let Err(e) = crate::windows::wclayer::deactivate_layer(path) {
                tracing::warn!(
                    layer = %path.display(),
                    error = %e,
                    "DeactivateLayer failed during create_container rollback",
                );
            }
        }
    }
}
/// Deactivates the given layers in reverse order (last activated first).
///
/// Best effort: a failure is logged as a warning and the remaining layers are
/// still processed.
fn rollback_parent_activations(activated: &[PathBuf]) {
    activated.iter().rev().for_each(|layer_path| {
        let Err(err) = crate::windows::wclayer::deactivate_layer(layer_path) else {
            return;
        };
        tracing::warn!(
            layer = %layer_path.display(),
            error = %err,
            "DeactivateLayer failed during parent-activation rollback",
        );
    });
}
/// Pulls the integer `ExitCode` field out of a JSON detail string.
///
/// Returns `None` when the input is blank, is not valid JSON, or has no
/// `ExitCode` representable as an `i64`. The value is narrowed to `i32` with
/// a wrapping cast.
fn extract_exit_code(detail_json: &str) -> Option<i32> {
    if detail_json.trim().is_empty() {
        return None;
    }
    let parsed = serde_json::from_str::<serde_json::Value>(detail_json).ok()?;
    let wide = parsed.get("ExitCode")?.as_i64()?;
    #[allow(clippy::cast_possible_truncation)]
    let narrowed = wide as i32;
    Some(narrowed)
}
/// Extension hook for post-processing a document with the id of the compute
/// system it describes.
trait ApplyServiceId {
/// Consumes `self` and returns the (possibly adjusted) document for `hcs_id`.
fn apply_service_id(self, hcs_id: &str) -> Self;
}
impl ApplyServiceId for HcsDoc {
    /// Currently a no-op: the document is returned unchanged and the id is ignored.
    fn apply_service_id(self, _hcs_id: &str) -> Self {
        self
    }
}
#[cfg(feature = "hcs-runtime")]
impl HcsRuntime {
/// Runs `cmd` inside the hosted container `hcs_id` through the GCS bridge and
/// returns `(exit_code, stdout, stderr)`.
///
/// Sequence:
/// 1. Bind two HvSocket listeners (stdout, stderr) on fresh random service ids.
/// 2. Send `ExecuteProcess`, pointing the stdio relay at those ids.
/// 3. Drain both streams concurrently until each one ends.
/// 4. Send `WaitForProcess` to collect the exit code.
/// 5. Append every captured output line to `log_buffer`, tagged with `id`.
///
/// # Errors
/// Returns `AgentError::Internal` when a listener cannot be bound, the process
/// parameters cannot be serialized, either RPC fails, or `ExecuteProcess`
/// reports a non-zero result. Stream accept/read failures are not errors:
/// the affected stream simply yields whatever was captured so far.
#[allow(clippy::too_many_lines)]
async fn exec_hyperv(
hcs_id: &str,
id: &ContainerId,
cmd: &[String],
bridge: &zlayer_gcs::bridge::GcsBridge,
uvm_runtime_id: windows::core::GUID,
log_buffer: &Arc<RwLock<Vec<LogEntry>>>,
) -> Result<(i32, String, String)> {
use zlayer_gcs::frame::RpcMessageType;
use zlayer_gcs::protocol::{
ExecuteProcessRequest, ExecuteProcessResponse, ExecuteProcessSettings,
RequestBase as GcsRequestBase, StdioRelaySettings, WaitForProcessRequest,
WaitForProcessResponse,
};
use zlayer_gcs::transport::HvSockListener;
// The listeners are bound before the RPC is sent, so they already exist
// when the request naming their service ids goes out.
let stdout_uuid = uuid::Uuid::new_v4();
let stderr_uuid = uuid::Uuid::new_v4();
let stdout_listener = HvSockListener::bind(
uvm_runtime_id,
windows::core::GUID::from_u128(stdout_uuid.as_u128()),
)
.await
.map_err(|e| AgentError::Internal(format!("Hyper-V exec: bind stdout hvsock: {e}")))?;
let stderr_listener = HvSockListener::bind(
uvm_runtime_id,
windows::core::GUID::from_u128(stderr_uuid.as_u128()),
)
.await
.map_err(|e| AgentError::Internal(format!("Hyper-V exec: bind stderr hvsock: {e}")))?;
// NOTE(review): arguments are joined with single spaces and no quoting, so
// an argument containing whitespace is not preserved as one argument —
// confirm callers pre-quote where needed.
let pp = serde_json::json!({
"CommandLine": cmd.join(" "),
"CreateStdInPipe": false,
"CreateStdOutPipe": true,
"CreateStdErrPipe": true,
"EmulateConsole": false,
});
// The parameters travel as a JSON *string* holding the serialized object,
// not as a nested object.
let process_parameters =
serde_json::Value::String(serde_json::to_string(&pp).map_err(|e| {
AgentError::Internal(format!("Hyper-V exec: serialize ProcessParameters: {e}"))
})?);
let exec_req = ExecuteProcessRequest {
base: GcsRequestBase {
activity_id: uuid::Uuid::new_v4(),
container_id: hcs_id.to_string(),
},
settings: ExecuteProcessSettings {
process_parameters,
stdio_relay_settings: Some(StdioRelaySettings {
stdin_pipe: None,
stdout_pipe: Some(stdout_uuid),
stderr_pipe: Some(stderr_uuid),
}),
},
};
let exec_resp: ExecuteProcessResponse = bridge
.send_rpc_json(RpcMessageType::ExecuteProcess, &exec_req)
.await
.map_err(|e| AgentError::Internal(format!("Hyper-V exec ExecuteProcess: {e}")))?;
// A non-zero result is reported as an unsigned HRESULT in the error text.
if exec_resp.base.result != 0 {
let hr = u32::from_ne_bytes(exec_resp.base.result.to_ne_bytes());
return Err(AgentError::Internal(format!(
"Hyper-V exec ExecuteProcess returned HRESULT 0x{hr:08x}: {}",
exec_resp.base.error_message
)));
}
let pid = exec_resp.process_id;
// Accepts one connection (waiting at most 120 s) and reads it until an
// empty chunk or a read error. A timeout or accept failure yields no bytes.
let drain = |listener: HvSockListener| async move {
let Ok(Ok(stream)) =
tokio::time::timeout(Duration::from_secs(120), listener.accept()).await
else {
return Vec::<u8>::new();
};
let mut bytes = Vec::new();
loop {
match stream.read_some(64 * 1024).await {
Ok(chunk) if chunk.is_empty() => break,
Ok(chunk) => bytes.extend(chunk),
Err(_) => break,
}
}
bytes
};
// Both streams are drained concurrently.
let (stdout_bytes, stderr_bytes) =
tokio::join!(drain(stdout_listener), drain(stderr_listener));
let stdout = String::from_utf8_lossy(&stdout_bytes).into_owned();
let stderr = String::from_utf8_lossy(&stderr_bytes).into_owned();
// The exit code is requested only after both streams have finished.
let wait_req = WaitForProcessRequest {
base: GcsRequestBase {
activity_id: uuid::Uuid::new_v4(),
container_id: hcs_id.to_string(),
},
process_id: pid,
timeout_in_ms: u32::MAX,
};
// NOTE(review): unlike ExecuteProcess, no result code of the wait response
// is inspected here — confirm whether it carries one.
let wait_resp: WaitForProcessResponse = bridge
.send_rpc_json(RpcMessageType::WaitForProcess, &wait_req)
.await
.map_err(|e| AgentError::Internal(format!("Hyper-V exec WaitForProcess: {e}")))?;
#[allow(clippy::cast_possible_wrap)]
let exit_code = wait_resp.exit_code as i32;
// Record the output in the container log buffer; every line shares one
// timestamp taken after the process finished.
{
let now = chrono::Utc::now();
let source = LogSource::Container(id.to_string());
let mut buf = log_buffer.write().await;
for line in stdout.lines() {
buf.push(LogEntry {
timestamp: now,
stream: LogStream::Stdout,
message: line.to_string(),
source: source.clone(),
service: None,
deployment: None,
});
}
for line in stderr.lines() {
buf.push(LogEntry {
timestamp: now,
stream: LogStream::Stderr,
message: line.to_string(),
source: source.clone(),
service: None,
deployment: None,
});
}
}
Ok((exit_code, stdout, stderr))
}
}
#[async_trait]
impl Runtime for HcsRuntime {
/// Pulls `image` with the `IfNotPresent` policy and no registry credentials.
#[instrument(skip(self), fields(otel.name = "image.pull", container.image.name = %image))]
async fn pull_image(&self, image: &str) -> Result<()> {
self.do_pull(image, PullPolicy::IfNotPresent, None).await
}
/// Pulls `image` according to `policy`, using `auth` when provided.
///
/// With `PullPolicy::Never` nothing is pulled: the call succeeds only if the
/// image is already in the local image cache and otherwise fails with
/// `AgentError::PullFailed`. Every other policy is delegated to `do_pull`.
/// `_source` is ignored.
#[instrument(
    skip(self, auth, _source),
    fields(otel.name = "image.pull", container.image.name = %image, pull_policy = ?policy)
)]
async fn pull_image_with_policy(
    &self,
    image: &str,
    policy: PullPolicy,
    auth: Option<&SpecRegistryAuth>,
    _source: zlayer_spec::SourcePolicy,
) -> Result<()> {
    if !matches!(policy, PullPolicy::Never) {
        return self.do_pull(image, policy, auth).await;
    }
    if self.images.read().await.contains_key(image) {
        return Ok(());
    }
    Err(AgentError::PullFailed {
        image: image.to_string(),
        reason: "pull_policy=never and image not cached locally".to_string(),
    })
}
/// Creates (but does not register as started) the HCS compute system for `id`.
///
/// Steps, in order:
/// 1. make sure the image is in the cache, pulling it if missing;
/// 2. resolve and activate the parent layer chain;
/// 3. decide the isolation mode and create the scratch layer;
/// 4. pick an overlay IP and, when possible, attach overlay networking;
/// 5. for Hyper-V isolation, provision a UVM;
/// 6. create the compute system (directly for process isolation, through
///    `hyperv_create_via_gcs` for Hyper-V);
/// 7. spawn the exit watcher and, when a GCS bridge exists, launch the
///    workload init process over it;
/// 8. store the resulting `ContainerEntry` in `self.containers`.
///
/// # Errors
/// Most failures surface as `AgentError::CreateFailed`; a missing UVM in the
/// Hyper-V branch surfaces as `AgentError::Internal`.
#[allow(clippy::too_many_lines)]
#[instrument(
skip(self, spec),
fields(
otel.name = "container.create",
container.id = %id,
service.name = %id.service,
service.replica = %id.replica,
container.image.name = %spec.image.name,
)
)]
async fn create_container(&self, id: &ContainerId, spec: &ServiceSpec) -> Result<()> {
let hcs_id = Self::hcs_id(id);
let image_name = spec.image.name.to_string();
// Pull on a cache miss. The read guard is dropped before pulling.
// NOTE(review): the pull is issued with `None` for registry auth — confirm
// that private registries are handled by an earlier explicit pull.
{
let cache = self.images.read().await;
if !cache.contains_key(&image_name) {
drop(cache);
self.do_pull(&image_name, spec.image.pull_policy, None)
.await?;
}
}
let parent_layers = self.resolve_parent_chain(&image_name).await?;
// Layer activation is run on the blocking pool; the result is wrapped in a
// guard that is disarmed only once the container entry is about to be stored.
// NOTE(review): presumably the guard deactivates the layers when dropped
// un-disarmed — verify against `ParentLayerActivationGuard`.
let parent_activation_guard = {
let parents = parent_layers.clone();
let activated =
tokio::task::spawn_blocking(move || Self::activate_parent_layers(&parents))
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!("spawn_blocking join for parent layer activation: {e}"),
})?
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!("parent layer activation: {e}"),
})?;
ParentLayerActivationGuard::new(activated)
};
let image_os_version = self.resolve_image_os_version(&image_name).await;
let isolation = resolve_isolation_for_image(spec.isolation, image_os_version.as_deref());
let scratch_dir = self.scratch_dir(&hcs_id);
let chain = crate::windows::wclayer::LayerChain::new(parent_layers.clone());
// Hyper-V uses the un-activated scratch variant; process isolation uses the
// regular one.
let scratch_layer = match isolation {
IsolationMode::Hyperv => scratch::create_unactivated(&scratch_dir, &chain),
IsolationMode::Process => scratch::create(&scratch_dir, &chain),
}
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!("scratch layer create: {e}"),
})?;
let slice_cidr = self.config.slice_cidr;
// A one-shot pre-assigned IP (`next_container_ip`) wins; otherwise fall back
// to the allocator, if one is configured.
let allocated_ip = match self.next_container_ip.lock().await.take() {
Some(ip) => Some(ip),
None => self
.ip_allocator
.lock()
.await
.as_mut()
.and_then(zlayer_overlay::IpAllocator::allocate),
};
let dns_config = self.next_container_dns.lock().await.take();
// Overlay networking is attached only when a slice, an IP and an overlayd
// connection are all available; every other combination logs a warning and
// continues without networking.
// NOTE(review): in the non-attached arms an IP taken from the allocator does
// not appear to be released — confirm.
let network_attachment: Option<WindowsOverlayAttach> = match (slice_cidr, allocated_ip) {
(Some(_slice), Some(ip)) if self.overlayd.is_some() => {
let (dns_server, dns_domain) = dns_config.unwrap_or((None, None));
match self
.overlayd_attach_windows(&hcs_id, &id.service, ip, dns_server, dns_domain)
.await
{
Ok(att) => Some(att),
Err(e) => {
return Err(AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!("overlayd overlay attach failed: {e}"),
});
}
}
}
(Some(_), Some(_)) => {
tracing::warn!(
hcs_id = %hcs_id,
"overlayd not connected; starting container without overlay networking"
);
None
}
(None, _) => {
tracing::warn!(
hcs_id = %hcs_id,
"node has no assigned slice yet; starting container without overlay networking"
);
None
}
(Some(_), None) => {
tracing::warn!(
hcs_id = %hcs_id,
"no overlay IP could be allocated; starting container without overlay networking"
);
None
}
};
// Zero or one namespace GUID, depending on whether an attachment exists.
let namespace_strs: Vec<String> = network_attachment
.as_ref()
.map(|a| vec![a.namespace_guid.clone()])
.unwrap_or_default();
// Only Hyper-V isolation needs a utility VM.
// NOTE(review): the `?` exits in this section return after the overlay
// attachment was made, without the detach/release performed further down for
// `create_result` failures — looks like a leak; confirm.
let uvm =
match isolation {
IsolationMode::Hyperv => {
let boot_files = crate::windows::unpacker::locate_uvm_boot_files(&chain)
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!(
"Hyper-V isolation requires a Windows base image with UVM payload: {e}"
),
})?;
Some(
Uvm::create(&hcs_id, &self.config.storage_root, &boot_files).map_err(
|e| AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!("UVM provisioning failed: {e}"),
},
)?,
)
}
IsolationMode::Process => None,
};
let create_result: Result<(ComputeSystem, HyperVGcsState)> = match isolation {
IsolationMode::Process => {
let doc = self.build_compute_system_doc(
&hcs_id,
spec,
&scratch_layer,
parent_layers.clone(),
namespace_strs.clone(),
isolation,
uvm.as_ref(),
)?;
let doc_json =
serde_json::to_string(&doc).map_err(|e| AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!("serialize ComputeSystem doc: {e}"),
})?;
// NOTE(review): the full create document is logged at `error` level on
// every create — confirm this diagnostic is meant to stay.
tracing::error!(
target: "zlayer_agent::hcs::diag",
hcs_id = %hcs_id,
doc = %doc_json,
"HCS_CREATE_DOC"
);
// Optional on-disk dump of the create document; write failures are ignored.
if let Ok(dir) = std::env::var("ZLAYER_HCS_DOC_DUMP_DIR") {
let path = std::path::PathBuf::from(&dir).join(format!("{hcs_id}.json"));
let _ = std::fs::create_dir_all(&dir);
let _ = std::fs::write(&path, &doc_json);
}
ComputeSystem::create(&hcs_id, &doc_json)
.await
.map(|s| (s, HyperVGcsState::default()))
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!("HcsCreateComputeSystem: {e}"),
})
}
IsolationMode::Hyperv => {
let uvm_ref = uvm.as_ref().ok_or_else(|| AgentError::Internal(
"Hyper-V isolation selected but no Uvm was provisioned earlier in create_container".to_string(),
))?;
self.hyperv_create_via_gcs(
&hcs_id,
spec,
&scratch_layer,
&parent_layers,
&namespace_strs,
uvm_ref,
network_attachment.as_ref(),
)
.await
}
};
// On creation failure, undo the overlay attachment and hand the IP back
// before propagating the error.
let (system, hyperv_state) = match create_result {
Ok(v) => v,
Err(e) => {
if let Some(att) = &network_attachment {
self.overlayd_detach_windows(&att.namespace_guid).await;
if let Some(alloc) = self.ip_allocator.lock().await.as_mut() {
alloc.release(att.ip);
}
}
return Err(e);
}
};
// Shared slot that receives the exit code once it is observed.
let sink: Arc<RwLock<Option<i32>>> = Arc::new(RwLock::new(None));
self.spawn_exit_watcher(hcs_id.clone(), *system.raw(), sink.clone());
// When a GCS bridge exists, the workload's init process is launched here via
// an ExecuteProcess RPC.
// NOTE(review): error returns inside this section happen after the compute
// system and overlay attachment exist and do not run the detach/release above
// — confirm whether cleanup is handled elsewhere.
#[cfg(feature = "hcs-runtime")]
if let Some(bridge) = hyperv_state.bridge.clone() {
use zlayer_gcs::frame::RpcMessageType;
use zlayer_gcs::protocol::{
ExecuteProcessRequest, ExecuteProcessResponse, ExecuteProcessSettings,
RequestBase as GcsRequestBase, WaitForProcessRequest, WaitForProcessResponse,
};
// Command line = entrypoint followed by args, joined with single spaces.
let mut argv: Vec<String> = spec.command.entrypoint.clone().unwrap_or_default();
argv.extend(spec.command.args.clone().unwrap_or_default());
let command_line = argv.join(" ");
let mut pp = serde_json::json!({
"CommandLine": command_line.clone(),
"CreateStdInPipe": false,
"CreateStdOutPipe": false,
"CreateStdErrPipe": false,
});
if let Some(wd) = spec.command.workdir.clone().filter(|w| !w.is_empty()) {
pp["WorkingDirectory"] = serde_json::Value::String(wd);
}
// The process parameters travel as a JSON document encoded inside a JSON string.
let process_parameters =
serde_json::Value::String(serde_json::to_string(&pp).map_err(|e| {
AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!("Hyper-V init process: serialize ProcessParameters: {e}"),
}
})?);
tracing::info!(hcs_id = %hcs_id, command_line = %command_line, "Hyper-V: launching workload init process over GCS bridge");
let exec_req = ExecuteProcessRequest {
base: GcsRequestBase {
activity_id: uuid::Uuid::new_v4(),
container_id: hcs_id.clone(),
},
settings: ExecuteProcessSettings {
process_parameters,
stdio_relay_settings: None,
},
};
let exec_resp: ExecuteProcessResponse = bridge
.send_rpc_json(RpcMessageType::ExecuteProcess, &exec_req)
.await
.map_err(|e| AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!("Hyper-V init process ExecuteProcess: {e}"),
})?;
// A non-zero result is reported as an HRESULT, reinterpreted as unsigned for
// hex formatting.
if exec_resp.base.result != 0 {
let hr = u32::from_ne_bytes(exec_resp.base.result.to_ne_bytes());
return Err(AgentError::CreateFailed {
id: hcs_id.clone(),
reason: format!(
"Hyper-V init process ExecuteProcess returned HRESULT 0x{hr:08x}: {}",
exec_resp.base.error_message
),
});
}
let pid = exec_resp.process_id;
let wait_bridge = bridge.clone();
let wait_sink = sink.clone();
let wait_hcs_id = hcs_id.clone();
// Detached task: waits for the init process and records its exit code in the
// shared sink. A failed wait only logs a warning.
tokio::spawn(async move {
let wait_req = WaitForProcessRequest {
base: GcsRequestBase {
activity_id: uuid::Uuid::new_v4(),
container_id: wait_hcs_id.clone(),
},
process_id: pid,
timeout_in_ms: u32::MAX,
};
let res: zlayer_gcs::error::GcsResult<WaitForProcessResponse> = wait_bridge
.send_rpc_json(RpcMessageType::WaitForProcess, &wait_req)
.await;
match res {
Ok(resp) => {
#[allow(clippy::cast_possible_wrap)]
let code = resp.exit_code as i32;
*wait_sink.write().await = Some(code);
}
Err(e) => {
tracing::warn!(hcs_id = %wait_hcs_id, error = %e, "Hyper-V init-process WaitForProcess failed; container exit code unobserved");
}
}
});
}
// From here on the entry owns the activated parent layers.
let activated_parent_layers = parent_activation_guard.disarm();
let entry = ContainerEntry {
system,
scratch_layer: Some(scratch_layer),
hcs_id: hcs_id.clone(),
last_exit_code: sink,
overlay_attach: network_attachment,
uvm,
activated_parent_layers,
#[cfg(feature = "hcs-runtime")]
gcs: hyperv_state.bridge,
log_buffer: Arc::new(RwLock::new(Vec::new())),
};
// Without the `hcs-runtime` feature the Hyper-V state is otherwise unused.
#[cfg(not(feature = "hcs-runtime"))]
let _ = hyperv_state;
self.containers.write().await.insert(hcs_id, entry);
Ok(())
}
/// Starts the compute system registered for `id`.
///
/// Entries that carry a UVM return `Ok(())` without calling into HCS; all
/// other entries are started through `system.start`.
///
/// # Errors
/// `AgentError::NotFound` when no entry exists for the container, and
/// `AgentError::StartFailed` when the start call fails.
#[instrument(skip(self), fields(otel.name = "container.start", container.id = %id))]
async fn start_container(&self, id: &ContainerId) -> Result<()> {
    let hcs_id = Self::hcs_id(id);
    let registry = self.containers.read().await;
    let Some(entry) = registry.get(&hcs_id) else {
        return Err(AgentError::NotFound {
            container: hcs_id,
            reason: "no HCS entry for container".to_string(),
        });
    };
    // UVM-backed entries skip the explicit start call.
    if entry.uvm.is_some() {
        return Ok(());
    }
    let started = entry.system.start("").await;
    started.map_err(|e| AgentError::StartFailed {
        id: hcs_id,
        reason: format!("HcsStartComputeSystem: {e}"),
    })
}
/// Stops the container, gracefully first and forcefully if that does not work.
///
/// A shutdown request carrying `TimeoutSeconds` (at least 1) is issued and
/// bounded by `timeout`. If the shutdown reports an error or the timeout
/// elapses, a warning is logged and the compute system is terminated instead.
///
/// # Errors
/// `AgentError::NotFound` when no entry exists, `AgentError::Internal` when the
/// fallback terminate call fails.
#[instrument(skip(self), fields(otel.name = "container.stop", container.id = %id))]
async fn stop_container(&self, id: &ContainerId, timeout: Duration) -> Result<()> {
    let hcs_id = Self::hcs_id(id);
    let registry = self.containers.read().await;
    let Some(entry) = registry.get(&hcs_id) else {
        return Err(AgentError::NotFound {
            container: hcs_id,
            reason: "no HCS entry for container".to_string(),
        });
    };
    let grace_secs = timeout.as_secs().max(1);
    let opts_json = format!(r#"{{"TimeoutSeconds":{grace_secs}}}"#);
    let graceful = tokio_timeout(timeout, entry.system.shutdown(&opts_json)).await;
    match graceful {
        Ok(Ok(())) => return Ok(()),
        Ok(Err(e)) => {
            tracing::warn!(
                hcs_id = %hcs_id,
                error = %e,
                "graceful shutdown failed; escalating to terminate"
            );
        }
        Err(_elapsed) => {
            tracing::warn!(
                hcs_id = %hcs_id,
                "graceful shutdown timed out; escalating to terminate"
            );
        }
    }
    // Both failure modes escalate to the same forced terminate.
    entry
        .system
        .terminate("")
        .await
        .map_err(|e| AgentError::Internal(format!("HcsTerminateComputeSystem: {e}")))
}
#[instrument(skip(self), fields(otel.name = "container.remove", container.id = %id))]
async fn remove_container(&self, id: &ContainerId) -> Result<()> {
let hcs_id = Self::hcs_id(id);
let mut containers = self.containers.write().await;
let Some(mut entry) = containers.remove(&hcs_id) else {
return Err(AgentError::NotFound {
container: hcs_id,
reason: "no HCS entry for container".to_string(),
});
};
if let Err(e) = entry.system.terminate("").await {
tracing::debug!(
hcs_id = %entry.hcs_id,
error = %e,
"terminate during remove failed (container may already be stopped)"
);
}
if let Some(scratch_layer) = entry.scratch_layer.take() {
scratch_layer
.detach_and_destroy()
.map_err(|e| AgentError::Internal(format!("scratch teardown: {e}")))?;
}
if !entry.activated_parent_layers.is_empty() {
let layers = std::mem::take(&mut entry.activated_parent_layers);
let hcs_id_for_log = entry.hcs_id.clone();
let _ = tokio::task::spawn_blocking(move || {
for path in &layers {
if let Err(e) = crate::windows::wclayer::deactivate_layer(path) {
tracing::warn!(
hcs_id = %hcs_id_for_log,
layer = %path.display(),
error = %e,
"DeactivateLayer failed during remove_container; layer table may leak until reboot",
);
}
}
})
.await;
}
if let Some(uvm) = entry.uvm.take() {
drop(uvm);
}
if let Some(attachment) = entry.overlay_attach.take() {
self.overlayd_detach_windows(&attachment.namespace_guid)
.await;
if let Some(alloc) = self.ip_allocator.lock().await.as_mut() {
alloc.release(attachment.ip);
}
}
drop(entry);
Ok(())
}
/// Reports the lifecycle state of the container.
///
/// A recorded exit code yields `ContainerState::Exited`; otherwise the
/// container is reported as `ContainerState::Running`.
///
/// # Errors
/// `AgentError::NotFound` when no entry exists for the container.
async fn container_state(&self, id: &ContainerId) -> Result<ContainerState> {
    let hcs_id = Self::hcs_id(id);
    let registry = self.containers.read().await;
    let entry = registry
        .get(&hcs_id)
        .ok_or_else(|| AgentError::NotFound {
            container: hcs_id.clone(),
            reason: "no HCS entry for container".to_string(),
        })?;
    let recorded = *entry.last_exit_code.read().await;
    Ok(match recorded {
        Some(code) => ContainerState::Exited { code },
        None => ContainerState::Running,
    })
}
/// Returns a snapshot of the container's buffered log entries.
///
/// `tail == 0` and `tail == usize::MAX` both mean "everything"; any other
/// value keeps only the last `tail` entries.
///
/// # Errors
/// `AgentError::NotFound` when no entry exists for the container.
async fn container_logs(&self, id: &ContainerId, tail: usize) -> Result<Vec<LogEntry>> {
    let hcs_id = Self::hcs_id(id);
    // Clone the handle so the container map lock is released before the
    // buffer is read.
    let shared_buffer = {
        let registry = self.containers.read().await;
        let Some(entry) = registry.get(&hcs_id) else {
            return Err(AgentError::NotFound {
                container: hcs_id,
                reason: "no HCS entry for container".to_string(),
            });
        };
        entry.log_buffer.clone()
    };
    let mut snapshot = shared_buffer.read().await.clone();
    let wants_tail = tail > 0 && tail != usize::MAX;
    if wants_tail && snapshot.len() > tail {
        let cut = snapshot.len() - tail;
        return Ok(snapshot.split_off(cut));
    }
    Ok(snapshot)
}
/// Runs `cmd` inside the container and returns `(exit_code, stdout, stderr)`.
///
/// With the `hcs-runtime` feature, entries that carry a GCS bridge are
/// delegated to `exec_hyperv`. Everything else creates an HCS process with
/// captured stdout/stderr on the blocking pool, appends the captured lines to
/// the container's log buffer, and then polls the process properties every
/// 100 ms (600 attempts) until an exit code is reported.
///
/// NOTE(review): the command line is `cmd` joined with single spaces; no
/// quoting is applied to arguments containing whitespace — confirm callers
/// pre-quote where needed.
///
/// # Errors
/// `AgentError::InvalidSpec` for an empty command, `AgentError::NotFound` when
/// no entry exists, `AgentError::Internal` for HCS failures, and
/// `AgentError::Timeout` when no exit code is seen within the polling window.
async fn exec(&self, id: &ContainerId, cmd: &[String]) -> Result<(i32, String, String)> {
    use zlayer_hcs::process::ComputeProcess;
    if cmd.is_empty() {
        return Err(AgentError::InvalidSpec(
            "exec command must not be empty".to_string(),
        ));
    }
    let hcs_id = Self::hcs_id(id);
    // Copy what is needed out of the entry so the map lock is not held while
    // the process runs.
    let (system_handle, log_buffer, hyperv) = {
        let containers = self.containers.read().await;
        let entry = containers
            .get(&hcs_id)
            .ok_or_else(|| AgentError::NotFound {
                container: hcs_id.clone(),
                reason: "no HCS entry for container".to_string(),
            })?;
        #[cfg(feature = "hcs-runtime")]
        let hyperv = match entry.gcs.clone() {
            Some(bridge) => {
                let uvm_runtime_id =
                    entry.uvm.as_ref().map(Uvm::runtime_id).ok_or_else(|| {
                        AgentError::Internal(
                            "Hyper-V exec: container has a GCS bridge but no UVM runtime id"
                                .to_string(),
                        )
                    })?;
                Some((bridge, uvm_runtime_id))
            }
            None => None,
        };
        #[cfg(not(feature = "hcs-runtime"))]
        let hyperv: Option<()> = None;
        (entry.system.raw(), entry.log_buffer.clone(), hyperv)
    };
    #[cfg(feature = "hcs-runtime")]
    if let Some((bridge, uvm_runtime_id)) = hyperv {
        return Self::exec_hyperv(&hcs_id, id, cmd, &bridge, uvm_runtime_id, &log_buffer).await;
    }
    #[cfg(not(feature = "hcs-runtime"))]
    let _ = &hyperv;
    let command_line = cmd.join(" ");
    let params = ProcessParameters {
        command_line,
        working_directory: String::new(),
        environment: Default::default(),
        emulate_console: Some(false),
        create_std_in_pipe: Some(false),
        create_std_out_pipe: Some(true),
        create_std_err_pipe: Some(true),
        console_size: None,
        user: None,
    };
    // Process creation and pipe draining are blocking, so they run off the
    // async executor. The parameters are passed by reference (`&params`).
    let (process, stdout, stderr) = tokio::task::spawn_blocking(move || {
        let captured = ComputeProcess::create_capturing_blocking(system_handle, &params)?;
        Ok::<_, zlayer_hcs::error::HcsError>(captured.drain_with_process())
    })
    .await
    .map_err(|e| AgentError::Internal(format!("exec spawn_blocking join: {e}")))?
    .map_err(|e| AgentError::Internal(format!("HcsCreateProcess/capture: {e}")))?;
    // Mirror the captured output into the container log buffer, one entry per
    // line, all stamped with the same timestamp.
    {
        let now = chrono::Utc::now();
        let source = LogSource::Container(id.to_string());
        let mut buf = log_buffer.write().await;
        for line in stdout.lines() {
            buf.push(LogEntry {
                timestamp: now,
                stream: LogStream::Stdout,
                message: line.to_string(),
                source: source.clone(),
                service: None,
                deployment: None,
            });
        }
        for line in stderr.lines() {
            buf.push(LogEntry {
                timestamp: now,
                stream: LogStream::Stderr,
                message: line.to_string(),
                source: source.clone(),
                service: None,
                deployment: None,
            });
        }
    }
    // 600 polls x 100 ms matches the 60 s reported in the timeout error below.
    for _ in 0..600 {
        let raw_props = process
            .properties(r#"{"PropertyTypes":["ProcessStatus"]}"#)
            .await
            .map_err(|e| AgentError::Internal(format!("HcsGetProcessProperties: {e}")))?;
        if let Some(code) = extract_process_exit_code(&raw_props) {
            return Ok((code, stdout, stderr));
        }
        tokio::time::sleep(Duration::from_millis(100)).await;
    }
    Err(AgentError::Timeout {
        timeout: Duration::from_secs(60),
    })
}
/// Streaming facade over `exec`.
///
/// The command is run to completion first; the result is then replayed as a
/// finite stream: an optional `Stdout` event, an optional `Stderr` event (each
/// omitted when the captured text is empty) and a final `Exit` event.
async fn exec_stream(&self, id: &ContainerId, cmd: &[String]) -> Result<ExecEventStream> {
    let (exit_code, out, err) = self.exec(id, cmd).await?;
    let has_out = !out.is_empty();
    let has_err = !err.is_empty();
    let events: Vec<ExecEvent> = [
        has_out.then(|| ExecEvent::Stdout(out)),
        has_err.then(|| ExecEvent::Stderr(err)),
        Some(ExecEvent::Exit(exit_code)),
    ]
    .into_iter()
    .flatten()
    .collect();
    Ok(Box::pin(futures_util::stream::iter(events)))
}
/// Reads the compute system's statistics and converts them with
/// `translate_stats`.
///
/// # Errors
/// `AgentError::NotFound` when no entry exists, `AgentError::Internal` when the
/// statistics query fails.
async fn get_container_stats(&self, id: &ContainerId) -> Result<ContainerStats> {
    let hcs_id = Self::hcs_id(id);
    let registry = self.containers.read().await;
    let Some(entry) = registry.get(&hcs_id) else {
        return Err(AgentError::NotFound {
            container: hcs_id,
            reason: "no HCS entry for container".to_string(),
        });
    };
    match entry.system.read_statistics().await {
        Ok(raw) => Ok(translate_stats(&raw)),
        Err(e) => Err(AgentError::Internal(format!(
            "HcsGetComputeSystemProperties: {e}"
        ))),
    }
}
/// Waits until an exit code has been recorded for the container and returns it.
///
/// The shared exit-code slot is polled every 200 ms; the container map lock is
/// held only long enough to clone the slot handle.
///
/// # Errors
/// `AgentError::NotFound` when no entry exists for the container.
async fn wait_container(&self, id: &ContainerId) -> Result<i32> {
    let hcs_id = Self::hcs_id(id);
    let exit_slot = {
        let registry = self.containers.read().await;
        let Some(entry) = registry.get(&hcs_id) else {
            return Err(AgentError::NotFound {
                container: hcs_id,
                reason: "no HCS entry for container".to_string(),
            });
        };
        entry.last_exit_code.clone()
    };
    loop {
        let observed = *exit_slot.read().await;
        match observed {
            Some(code) => return Ok(code),
            None => tokio::time::sleep(Duration::from_millis(200)).await,
        }
    }
}
/// Waits for the container to exit and wraps the exit code in a `WaitOutcome`.
///
/// An exit code of `-1` is classified as `WaitReason::RuntimeError`; every
/// other code is `WaitReason::Exited`. No signal is reported, and the finish
/// time is the moment this call observes the exit.
async fn wait_outcome(&self, id: &ContainerId) -> Result<WaitOutcome> {
    let exit_code = self.wait_container(id).await?;
    let reason = match exit_code {
        -1 => WaitReason::RuntimeError,
        _ => WaitReason::Exited,
    };
    let finished_at = Some(chrono::Utc::now());
    Ok(WaitOutcome {
        exit_code,
        reason,
        signal: None,
        finished_at,
    })
}
/// Returns every buffered log entry for the container (no tail limit).
async fn get_logs(&self, id: &ContainerId) -> Result<Vec<LogEntry>> {
    let no_limit = usize::MAX;
    self.container_logs(id, no_limit).await
}
/// Always reports that no PID is available for the container.
async fn get_container_pid(&self, _id: &ContainerId) -> Result<Option<u32>> {
    let pid: Option<u32> = None;
    Ok(pid)
}
/// Returns the overlay namespace GUID attached to the container, if any.
///
/// `Ok(None)` covers all "nothing to report" cases: unknown container, no
/// overlay attachment, or a stored GUID string that fails to parse.
async fn get_container_namespace_id(
    &self,
    id: &ContainerId,
) -> Result<Option<windows::core::GUID>> {
    let hcs_id = Self::hcs_id(id);
    let registry = self.containers.read().await;
    let attachment = registry
        .get(&hcs_id)
        .and_then(|entry| entry.overlay_attach.as_ref());
    let Some(attachment) = attachment else {
        return Ok(None);
    };
    Ok(GUID::try_from(attachment.namespace_guid.as_str()).ok())
}
/// Returns the overlay IP attached to the container, or `None` when the
/// container has no overlay attachment.
///
/// # Errors
/// `AgentError::NotFound` when no entry exists for the container.
async fn get_container_ip(&self, id: &ContainerId) -> Result<Option<IpAddr>> {
    let hcs_id = Self::hcs_id(id);
    let registry = self.containers.read().await;
    let entry = registry
        .get(&hcs_id)
        .ok_or_else(|| AgentError::NotFound {
            container: hcs_id.clone(),
            reason: "no HCS entry for container".to_string(),
        })?;
    let overlay_ip = entry
        .overlay_attach
        .as_ref()
        .map(|attachment| attachment.ip);
    Ok(overlay_ip)
}
/// Lists the references held in the in-memory image cache.
///
/// Only the reference is populated; digest and size are reported as `None`.
async fn list_images(&self) -> Result<Vec<ImageInfo>> {
    let cache = self.images.read().await;
    let mut listing = Vec::with_capacity(cache.len());
    for reference in cache.keys() {
        listing.push(ImageInfo {
            reference: reference.clone(),
            digest: None,
            size_bytes: None,
        });
    }
    Ok(listing)
}
/// Drops `image` from the cache and destroys the layers it no longer shares.
///
/// `tag_image` creates cache entries that point at the same on-disk layer
/// chain, so a layer is destroyed only when no remaining cache entry still
/// lists its path. Removing an unknown image is a no-op. Layer destruction is
/// best effort: failures are logged and do not fail the call. The `_force`
/// flag is accepted but not consulted.
async fn remove_image(&self, image: &str, _force: bool) -> Result<()> {
    let mut cache = self.images.write().await;
    let Some(removed) = cache.remove(image) else {
        return Ok(());
    };
    for layer in &removed.unpacked.chain.0 {
        // Another tag (or image) may still be backed by this layer directory.
        let still_referenced = cache.values().any(|other| {
            (&other.unpacked.chain.0)
                .into_iter()
                .any(|candidate| candidate.path == layer.path)
        });
        if still_referenced {
            tracing::debug!(
                layer = %layer.path,
                "layer still referenced by another cached image; not destroying"
            );
            continue;
        }
        let path = std::path::Path::new(&layer.path);
        if let Err(e) = crate::windows::wclayer::destroy_layer(path) {
            tracing::warn!(layer = %layer.path, error = %e, "destroy_layer failed");
        }
    }
    Ok(())
}
/// Reports an empty prune result; this method removes nothing.
async fn prune_images(&self) -> Result<PruneResult> {
    let nothing_pruned = PruneResult::default();
    Ok(nothing_pruned)
}
/// Forcefully terminates the container.
///
/// The signal name (default `"SIGKILL"`) is validated, but its value does not
/// change what happens: the compute system is always terminated.
///
/// # Errors
/// Propagates the validation error for an unknown signal,
/// `AgentError::NotFound` when no entry exists, and `AgentError::Internal`
/// when the terminate call fails.
async fn kill_container(&self, id: &ContainerId, signal: Option<&str>) -> Result<()> {
    let requested = signal.unwrap_or("SIGKILL");
    let _ = crate::runtime::validate_signal(requested)?;
    let hcs_id = Self::hcs_id(id);
    let registry = self.containers.read().await;
    let Some(entry) = registry.get(&hcs_id) else {
        return Err(AgentError::NotFound {
            container: hcs_id,
            reason: "no HCS entry for container".to_string(),
        });
    };
    match entry.system.terminate("").await {
        Ok(done) => Ok(done),
        Err(e) => Err(AgentError::Internal(format!(
            "HcsTerminateComputeSystem: {e}"
        ))),
    }
}
/// Registers `target` as an additional cache entry for the image cached under
/// `source`.
///
/// The new entry is a clone of the source's unpacked data and OS version; an
/// existing `target` entry is overwritten.
///
/// # Errors
/// `AgentError::NotFound` when `source` is not in the cache.
async fn tag_image(&self, source: &str, target: &str) -> Result<()> {
    let mut cache = self.images.write().await;
    let alias = match cache.get(source) {
        Some(existing) => CachedImage {
            unpacked: existing.unpacked.clone(),
            os_version: existing.os_version.clone(),
        },
        None => {
            return Err(AgentError::NotFound {
                container: source.to_string(),
                reason: "source image not cached".to_string(),
            });
        }
    };
    cache.insert(target.to_string(), alias);
    Ok(())
}
/// Returns inspect details for the container.
///
/// Only the recorded exit code is filled in; ports and networks are empty, and
/// IPv4 and health are `None`.
///
/// # Errors
/// `AgentError::NotFound` when no entry exists for the container.
async fn inspect_detailed(&self, id: &ContainerId) -> Result<ContainerInspectDetails> {
    let hcs_id = Self::hcs_id(id);
    // Clone the slot handle first so the container map lock is released
    // before the slot itself is read.
    let exit_slot = {
        let registry = self.containers.read().await;
        let Some(entry) = registry.get(&hcs_id) else {
            return Err(AgentError::NotFound {
                container: hcs_id,
                reason: "no HCS entry for container".to_string(),
            });
        };
        Arc::clone(&entry.last_exit_code)
    };
    let exit_code = *exit_slot.read().await;
    let details = ContainerInspectDetails {
        ports: Vec::new(),
        networks: Vec::new(),
        ipv4: None,
        health: None,
        exit_code,
    };
    Ok(details)
}
}
/// Converts HCS statistics into the agent's `ContainerStats`.
///
/// CPU time is converted from 100 ns ticks to microseconds (integer division
/// by 10) and memory is the private working set in bytes. Missing processor or
/// memory sections count as zero. The memory limit is always `u64::MAX`, and
/// the timestamp is taken at call time.
fn translate_stats(raw: &Statistics) -> ContainerStats {
    let cpu_usage_usec = match raw.processor.as_ref() {
        Some(cpu) => cpu.total_runtime_100ns / 10,
        None => 0,
    };
    let memory_bytes = match raw.memory.as_ref() {
        Some(mem) => mem.memory_usage_private_working_set_bytes,
        None => 0,
    };
    ContainerStats {
        cpu_usage_usec,
        memory_bytes,
        memory_limit: u64::MAX,
        timestamp: Instant::now(),
    }
}
/// Extracts a process exit code from a JSON properties payload.
///
/// `ProcessStatus.ExitCode` is preferred; a top-level `ExitCode` is the
/// fallback. Returns `None` when the payload is not valid JSON, when neither
/// field is present, or when the value is not representable as an `i64`. The
/// value is narrowed to `i32` with a truncating cast.
fn extract_process_exit_code(raw_json: &str) -> Option<i32> {
    let parsed = serde_json::from_str::<serde_json::Value>(raw_json).ok()?;
    let nested = parsed
        .get("ProcessStatus")
        .and_then(|status| status.get("ExitCode"));
    let exit_value = match nested {
        Some(value) => value,
        None => parsed.get("ExitCode")?,
    };
    let wide = exit_value.as_i64()?;
    #[allow(clippy::cast_possible_truncation)]
    let narrowed = wide as i32;
    Some(narrowed)
}
/// Lists the ids of the compute systems whose owner matches the tag derived
/// from `daemon_name`.
///
/// # Errors
/// `AgentError::Internal` when the enumeration call fails.
pub async fn list_owned_systems(daemon_name: &str) -> Result<Vec<String>> {
    let owner = owner_tag(daemon_name);
    match enumerate::list_by_owner(&owner).await {
        Ok(systems) => {
            let ids: Vec<String> = systems.into_iter().map(|system| system.id).collect();
            Ok(ids)
        }
        Err(e) => Err(AgentError::Internal(format!(
            "HcsEnumerateComputeSystems: {e}"
        ))),
    }
}
#[cfg(test)]
mod tests {
use super::*;
use zlayer_hcs::schema::{MemoryStats, ProcessorStats};
#[test]
fn translate_stats_converts_100ns_to_usec_and_private_working_set_to_bytes() {
    // 12_345_000 ticks of 100 ns each are 1_234_500 microseconds.
    let processor = ProcessorStats {
        total_runtime_100ns: 12_345_000,
        runtime_user_100ns: 0,
        runtime_kernel_100ns: 0,
    };
    let working_set = 256 * 1024 * 1024;
    let memory = MemoryStats {
        memory_usage_commit_bytes: 0,
        memory_usage_commit_peak_bytes: 0,
        memory_usage_private_working_set_bytes: working_set,
    };
    let raw = Statistics {
        timestamp: None,
        container_start_time: None,
        uptime_100ns: 0,
        processor: Some(processor),
        memory: Some(memory),
        storage: None,
    };
    let translated = translate_stats(&raw);
    assert_eq!(translated.cpu_usage_usec, 1_234_500);
    assert_eq!(translated.memory_bytes, working_set);
    assert_eq!(translated.memory_limit, u64::MAX);
}
#[test]
fn translate_stats_defaults_zero_when_fields_missing() {
    // A default `Statistics` has no processor or memory section.
    let translated = translate_stats(&Statistics::default());
    assert_eq!(translated.cpu_usage_usec, 0);
    assert_eq!(translated.memory_bytes, 0);
    assert_eq!(translated.memory_limit, u64::MAX);
}
#[test]
fn extract_exit_code_reads_json_payload() {
    // (payload, expected) pairs: one hit, then empty, non-JSON and wrong-key misses.
    let cases = [
        (r#"{"ExitCode":42}"#, Some(42)),
        ("", None),
        ("not json", None),
        (r#"{"NoExitCode":1}"#, None),
    ];
    for (payload, expected) in cases {
        assert_eq!(extract_exit_code(payload), expected, "payload: {payload:?}");
    }
}
#[test]
fn extract_process_exit_code_handles_nested_and_flat() {
    // Nested `ProcessStatus.ExitCode`, flat `ExitCode`, and no exit code at all.
    let nested = extract_process_exit_code(r#"{"ProcessStatus":{"ExitCode":7}}"#);
    let flat = extract_process_exit_code(r#"{"ExitCode":9}"#);
    let absent = extract_process_exit_code(r#"{}"#);
    assert_eq!(nested, Some(7));
    assert_eq!(flat, Some(9));
    assert_eq!(absent, None);
}
#[test]
fn hcs_config_default_sets_overlay_networking_fields() {
    let defaults = HcsConfig::default();
    let slice_unset = defaults.slice_cidr.is_none();
    assert_eq!(defaults.cluster_cidr, "10.200.0.0/16");
    assert!(
        slice_unset,
        "slice_cidr must be None until the node joins the cluster and the leader hands out a slice"
    );
}
#[test]
fn hcs_config_default_daemon_name_is_legacy() {
    let defaults = HcsConfig::default();
    assert_eq!(
        defaults.daemon_name, "zlayer",
        "single-instance installs must keep the legacy `zlayer` owner tag"
    );
}
#[test]
fn owner_tag_legacy() {
    // The legacy daemon name maps to itself.
    let tag = owner_tag("zlayer");
    assert_eq!(tag, "zlayer");
}
#[test]
fn owner_tag_dev() {
    // A non-default daemon name is carried through unchanged.
    let tag = owner_tag("zlayer-dev");
    assert_eq!(tag, "zlayer-dev");
}
#[test]
fn overlay_network_legacy() {
    // The network name is the daemon name with an `-overlay` suffix.
    let network = overlay_network_name("zlayer");
    assert_eq!(network, "zlayer-overlay");
}
#[test]
fn overlay_network_dev() {
    // Same suffix rule for a non-default daemon name.
    let network = overlay_network_name("zlayer-dev");
    assert_eq!(network, "zlayer-dev-overlay");
}
#[test]
fn parse_os_version_four_components() {
    // The fourth (revision) component is not part of the parsed triple.
    let parsed = parse_os_version("10.0.20348.2700");
    assert_eq!(parsed, Some((10, 0, 20348)));
}
#[test]
fn parse_os_version_three_components() {
    // Exactly major.minor.build parses into the same triple.
    let parsed = parse_os_version("10.0.26100");
    assert_eq!(parsed, Some((10, 0, 26100)));
}
#[test]
fn parse_os_version_rejects_malformed() {
    // Too few components or non-numeric components must all be rejected.
    let malformed = ["", "10", "10.0", "10.0.x", "not.a.version"];
    for input in malformed {
        assert_eq!(parse_os_version(input), None, "input: {input:?}");
    }
}
#[test]
fn decide_isolation_auto_matched_builds_picks_process() {
    // Image and host report the same build.
    let build = Some((10, 0, 26100));
    let explicit_auto = decide_isolation(Some(zlayer_spec::IsolationMode::Auto), build, build);
    assert_eq!(explicit_auto, IsolationMode::Process);
    // An unspecified mode behaves like `Auto`.
    let unspecified = decide_isolation(None, build, build);
    assert_eq!(unspecified, IsolationMode::Process);
}
#[test]
fn decide_isolation_auto_mismatched_builds_picks_hyperv() {
    let image_build = Some((10, 0, 20348));
    let host_build = Some((10, 0, 26100));
    let chosen = decide_isolation(
        Some(zlayer_spec::IsolationMode::Auto),
        image_build,
        host_build,
    );
    assert_eq!(chosen, IsolationMode::Hyperv);
}
#[test]
fn decide_isolation_auto_known_image_unknown_host_picks_hyperv() {
    // The image build is known but the host build is not.
    let image_build = Some((10, 0, 26100));
    let chosen = decide_isolation(Some(zlayer_spec::IsolationMode::Auto), image_build, None);
    assert_eq!(chosen, IsolationMode::Hyperv);
}
#[test]
fn decide_isolation_auto_unknown_image_picks_process() {
    // Unknown image build, with and without a known host build.
    let both_unknown = decide_isolation(Some(zlayer_spec::IsolationMode::Auto), None, None);
    assert_eq!(both_unknown, IsolationMode::Process);
    let host_only = decide_isolation(None, None, Some((10, 0, 26100)));
    assert_eq!(host_only, IsolationMode::Process);
}
#[test]
fn decide_isolation_explicit_process_overrides_matrix() {
    // Mismatched builds do not change an explicit request for process isolation.
    let mismatched = decide_isolation(
        Some(zlayer_spec::IsolationMode::Process),
        Some((10, 0, 20348)),
        Some((10, 0, 26100)),
    );
    assert_eq!(mismatched, IsolationMode::Process);
    let unknown = decide_isolation(Some(zlayer_spec::IsolationMode::Process), None, None);
    assert_eq!(unknown, IsolationMode::Process);
}
#[test]
fn decide_isolation_explicit_hyperv_overrides_matrix() {
    // Matching builds do not change an explicit request for Hyper-V isolation.
    let build = Some((10, 0, 26100));
    let matched = decide_isolation(Some(zlayer_spec::IsolationMode::Hyperv), build, build);
    assert_eq!(matched, IsolationMode::Hyperv);
    let unknown = decide_isolation(Some(zlayer_spec::IsolationMode::Hyperv), None, None);
    assert_eq!(unknown, IsolationMode::Hyperv);
}
#[test]
fn resolve_isolation_for_image_smoke() {
    // Smoke test only: whichever variant comes back must be a concrete mode.
    let mode = resolve_isolation_for_image(None, None);
    let is_concrete = matches!(mode, IsolationMode::Process | IsolationMode::Hyperv);
    assert!(
        is_concrete,
        "resolve_isolation_for_image returned an unexpected variant: {mode:?}",
    );
}
/// Builds a minimal `ServiceSpec` by parsing a one-service deployment document
/// and taking its `test` service.
///
/// The YAML is written with explicit block-mapping indentation so that `test`
/// is nested under `services`, and `rtype` / `image` are nested under `test`.
/// Without that nesting the keys would all land at the top level and the
/// lookup of service `test` could not succeed.
fn fixture_spec() -> ServiceSpec {
    let yaml = r"
version: v1
deployment: test
services:
  test:
    rtype: service
    image:
      name: mcr.microsoft.com/windows/nanoserver:ltsc2022
";
    serde_yaml::from_str::<zlayer_spec::DeploymentSpec>(yaml)
        .expect("valid fixture yaml")
        .services
        .remove("test")
        .expect("service 'test' present")
}
// Checks the create-time VirtualMachine document for a UVM with a two-layer
// parent chain and no GPU adapters: boot entry, SCSI scratch attachment, the
// single `os` VSMB share and its options, compute topology, and the absence of
// a guest-state path.
#[test]
fn build_virtual_machine_doc_populates_uvm_fields() {
use std::path::PathBuf;
use zlayer_hcs::schema::Layer;
let scratch = PathBuf::from(r"C:\zlayer\uvms\test-container\scratch.vhdx");
let os_files = PathBuf::from(r"C:\zlayer\images\app\UtilityVM\Files");
let uvm = Uvm::for_test("test-container", scratch.clone(), os_files.clone());
let parent_layers = vec![
Layer {
id: "11111111-1111-1111-1111-111111111111".to_string(),
path: r"C:\zlayer\images\base".to_string(),
},
Layer {
id: "22222222-2222-2222-2222-222222222222".to_string(),
path: r"C:\zlayer\images\app".to_string(),
},
];
let spec = fixture_spec();
let vm = build_virtual_machine_doc(&uvm, &parent_layers, &spec, &[]);
// Boot entry: file-based boot of bootmgfw.efi with no disk number.
let chipset = vm.chipset.expect("chipset");
let uefi = chipset.uefi.expect("uefi");
let boot_entry = uefi.boot_this.expect("boot_this");
assert_eq!(boot_entry.device_type, "VmbFs");
assert_eq!(boot_entry.device_path, r"\EFI\Microsoft\Boot\bootmgfw.efi");
assert_eq!(boot_entry.disk_number, None);
// SCSI: the scratch VHDX is attachment "0" on the primary controller, writable.
let devices = vm.devices.expect("devices");
let controller = devices
.scsi
.get(PRIMARY_SCSI_CTRL_GUID)
.expect("scsi controller keyed by primary GUID");
let attachment = controller.attachments.get("0").expect("scsi attachment 0");
assert_eq!(attachment.path, scratch.to_string_lossy());
assert_eq!(attachment.r#type, "VirtualDisk");
assert_eq!(attachment.read_only, Some(false));
// VSMB: exactly one share, named `os`, even though two parent layers were passed.
let vsmb = devices
.virtual_smb
.as_ref()
.expect("VirtualSmb block populated");
assert_eq!(
vsmb.shares.len(),
1,
"create-time shares: `os` only; parent layers hot-attached at Step 5; zlayer-debug dropped per B-4",
);
assert_eq!(
vsmb.shares[0].name, "os",
"`os` boot-files share must be first per hcsshim convention",
);
let os_share = &vsmb.shares[0];
assert_eq!(os_share.path, os_files.to_string_lossy());
// The share is configured through named options, not a raw flags value.
let os_opts = os_share
.options
.as_ref()
.expect("os share carries named options");
assert!(os_opts.read_only, "os share must be read-only");
assert!(os_opts.share_read, "os share must set ShareRead");
assert!(os_opts.cache_io, "os share must set CacheIo");
assert!(os_opts.pseudo_oplocks, "os share must set PseudoOplocks");
assert!(
os_opts.take_backup_privilege,
"os share must set TakeBackupPrivilege per hcsshim DefaultVSMBOptions(true)",
);
assert!(
os_share.flags.is_none(),
"named options replace the legacy raw flags bitmask",
);
assert!(
vsmb.shares.iter().all(|s| s.name == "os"),
"create-time VSMB.Shares must be `os` only; got {:?}",
vsmb.shares
.iter()
.map(|s| s.name.as_str())
.collect::<Vec<_>>(),
);
// Topology values expected for the fixture spec: 2 processors, 1024 MB.
let topology = vm.compute_topology.expect("compute_topology");
assert_eq!(topology.processor.expect("processor").count, 2);
assert_eq!(topology.memory.expect("memory").size_in_mb, 1024);
assert!(
vm.guest_state.is_none(),
"VmbFs-boot UVMs must not carry a host GuestState path",
);
}
#[test]
fn build_virtual_machine_doc_sets_hcsshim_parity_fields() {
    use std::path::PathBuf;
    let scratch = PathBuf::from(r"C:\zlayer\uvms\parity\scratch.vhdx");
    let os_files = PathBuf::from(r"C:\zlayer\images\app\UtilityVM\Files");
    let uvm = Uvm::for_test("parity-container", scratch, os_files);
    let spec = fixture_spec();
    let vm = build_virtual_machine_doc(&uvm, &[], &spec, &[]);

    assert!(vm.stop_on_reset, "UVM must set StopOnReset like hcsshim");

    // VSMB direct file mapping size.
    let devices = vm.devices.as_ref();
    let vsmb = devices
        .and_then(|d| d.virtual_smb.as_ref())
        .expect("VirtualSmb block populated");
    assert_eq!(
        vsmb.direct_file_mapping_in_mb,
        Some(1024),
        "VSMB DirectFileMappingInMB must mirror hcsshim's WCOW default",
    );

    // Registry change enabling compartment namespaces for the gns service.
    let registry = vm
        .registry_changes
        .as_ref()
        .expect("UVM must carry RegistryChanges for hcsshim parity");
    let mut candidates = registry.add_values.iter();
    let gns = candidates
        .find(|value| value.name == "EnableCompartmentNamespace")
        .expect("gns EnableCompartmentNamespace key present");
    assert_eq!(gns.d_word_value, Some(1));
    assert_eq!(gns.r#type, Some(RegistryValueType::DWord));
    let gns_key = gns.key.as_ref().expect("registry value carries a key");
    assert_eq!(gns_key.hive, Some(RegistryHive::System));
    assert_eq!(gns_key.name, r"CurrentControlSet\Services\gns");
}
#[test]
fn build_virtual_machine_doc_handles_empty_parent_chain() {
    use std::path::PathBuf;
    let scratch = PathBuf::from(r"C:\scratch.vhdx");
    let os_files = PathBuf::from(r"C:\os-files");
    let uvm = Uvm::for_test("empty-chain", scratch, os_files);
    let spec = fixture_spec();
    let vm = build_virtual_machine_doc(&uvm, &[], &spec, &[]);

    // With no parent layers there is still exactly one share (`os`) and one
    // SCSI controller (the primary one).
    let devices = vm.devices.expect("devices");
    let vsmb = devices
        .virtual_smb
        .as_ref()
        .expect("VirtualSmb block populated");
    let share_count = vsmb.shares.len();
    assert_eq!(share_count, 1);
    assert_eq!(vsmb.shares[0].name, "os");
    let controller_count = devices.scsi.len();
    assert_eq!(controller_count, 1);
    assert!(devices.scsi.contains_key(PRIMARY_SCSI_CTRL_GUID));
    assert!(
        vm.guest_state.is_none(),
        "VmbFs-boot UVMs must not carry a host GuestState path",
    );
}
#[cfg(not(target_os = "windows"))]
#[test]
fn enumerate_host_gpu_adapters_returns_unsupported_on_non_windows() {
    // Off Windows the probe must fail with `ErrorKind::Unsupported`.
    let outcome = super::enumerate_host_gpu_adapters();
    let err = outcome.expect_err("must be Unsupported off-Windows");
    assert_eq!(err.kind(), std::io::ErrorKind::Unsupported);
}
#[cfg(target_os = "windows")]
#[test]
#[ignore = "requires a real Windows host with at least one GPU adapter"]
fn enumerate_host_gpu_adapters_on_windows_finds_at_least_one() {
    let probe = super::enumerate_host_gpu_adapters();
    let adapters = probe.expect("DXGI probe must succeed");
    let found_any = !adapters.is_empty();
    assert!(
        found_any,
        "expected at least one host GPU adapter (WARP excluded); got {adapters:?}",
    );
}
/// Three fake host adapters: two with vendor id `0x10de` and one with `0x1002`,
/// with LUID low parts 1, 2 and 3 and a zero high part.
fn fixture_adapters() -> Vec<HostGpuAdapter> {
    // (luid_low, description, vendor_id, device_id)
    let rows = [
        (1, "NVIDIA GeForce RTX 4090", 0x10de, 0x2684),
        (2, "NVIDIA RTX A6000", 0x10de, 0x2230),
        (3, "AMD Radeon RX 7900 XTX", 0x1002, 0x744c),
    ];
    rows.into_iter()
        .map(
            |(luid_low, description, vendor_id, device_id)| HostGpuAdapter {
                luid_high: 0,
                luid_low,
                description: description.to_string(),
                vendor_id,
                device_id,
            },
        )
        .collect()
}
#[test]
fn filter_adapters_by_vendor_nvidia() {
    // A count larger than the fixture, so only the vendor filter can drop adapters.
    let nvidia_only = zlayer_spec::GpuSpec {
        count: 99,
        vendor: "nvidia".to_string(),
        mode: None,
        model: None,
        scheduling: None,
        distributed: None,
        sharing: None,
        mps_pipe_dir: None,
        mps_log_dir: None,
        time_slice_index: None,
        time_slicing_config_path: None,
    };
    let host = fixture_adapters();
    let kept = filter_adapters_by_gpu_spec(&host, &nvidia_only);
    assert_eq!(kept.len(), 2);
    let all_nvidia = kept.iter().all(|adapter| adapter.vendor_id == 0x10de);
    assert!(all_nvidia);
}
#[test]
fn filter_adapters_by_count_truncates() {
    // Vendor "all" with a count of one must keep a single adapter.
    let just_one = zlayer_spec::GpuSpec {
        count: 1,
        vendor: "all".to_string(),
        mode: None,
        model: None,
        scheduling: None,
        distributed: None,
        sharing: None,
        mps_pipe_dir: None,
        mps_log_dir: None,
        time_slice_index: None,
        time_slicing_config_path: None,
    };
    let host = fixture_adapters();
    let kept = filter_adapters_by_gpu_spec(&host, &just_one);
    assert_eq!(kept.len(), 1);
}
#[test]
fn filter_adapters_by_model_substring() {
    // The lowercase model "a6000" must select the "NVIDIA RTX A6000" adapter.
    let a6000_only = zlayer_spec::GpuSpec {
        count: 99,
        vendor: "nvidia".to_string(),
        mode: None,
        model: Some("a6000".to_string()),
        scheduling: None,
        distributed: None,
        sharing: None,
        mps_pipe_dir: None,
        mps_log_dir: None,
        time_slice_index: None,
        time_slicing_config_path: None,
    };
    let host = fixture_adapters();
    let kept = filter_adapters_by_gpu_spec(&host, &a6000_only);
    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].description, "NVIDIA RTX A6000");
}
#[test]
fn build_virtual_machine_doc_with_gpu_populates_assignment() {
    use std::path::PathBuf;
    // One host adapter with a recognisable LUID.
    let host_adapters = vec![HostGpuAdapter {
        luid_high: 0xdead_beef,
        luid_low: 0x1234_5678,
        description: "NVIDIA GeForce RTX 4090".to_string(),
        vendor_id: 0x10de,
        device_id: 0x2684,
    }];
    let gpu_request = zlayer_spec::GpuSpec {
        count: 1,
        vendor: "nvidia".to_string(),
        mode: None,
        model: None,
        scheduling: None,
        distributed: None,
        sharing: None,
        mps_pipe_dir: None,
        mps_log_dir: None,
        time_slice_index: None,
        time_slicing_config_path: None,
    };
    let mut spec = fixture_spec();
    spec.resources.gpu = Some(gpu_request);
    let uvm = Uvm::for_test(
        "gpu-list",
        PathBuf::from(r"C:\scratch.vhdx"),
        PathBuf::from(r"C:\boot"),
    );

    let vm = build_virtual_machine_doc(&uvm, &[], &spec, &host_adapters);

    // The adapter must show up as a single explicit assignment request.
    let devices = vm.devices.expect("devices");
    let gpu = devices.gpu.expect("gpu assignment present");
    assert_eq!(gpu.assignment_mode, GpuAssignmentMode::List);
    assert_eq!(gpu.assignment_request.len(), 1);
    let request = &gpu.assignment_request[0];
    assert_eq!(request.adapter_luid_high_part, 0xdead_beef);
    assert_eq!(request.adapter_luid_low_part, 0x1234_5678);
    assert_eq!(
        request.virtual_machine_id_string, "0xdeadbeef:0x12345678",
        "LUID hex string must be `0x<hi>:0x<lo>`",
    );
    assert_eq!(gpu.allow_vendor_extension, Some(true));
}
/// A GPU request with an empty host-adapter list must still emit a GPU
/// assignment, but in `Default` mode and with no per-adapter requests.
#[test]
fn build_virtual_machine_doc_with_gpu_no_adapters_falls_back_to_default() {
    let test_uvm = Uvm::for_test(
        "gpu-default",
        std::path::PathBuf::from(r"C:\scratch.vhdx"),
        std::path::PathBuf::from(r"C:\boot"),
    );

    let mut service_spec = fixture_spec();
    service_spec.resources.gpu = Some(zlayer_spec::GpuSpec {
        count: 1,
        vendor: String::from("all"),
        mode: None,
        model: None,
        scheduling: None,
        distributed: None,
        sharing: None,
        mps_pipe_dir: None,
        mps_log_dir: None,
        time_slice_index: None,
        time_slicing_config_path: None,
    });

    let vm_doc = build_virtual_machine_doc(&test_uvm, &[], &service_spec, &[]);
    let gpu_assignment = vm_doc
        .devices
        .expect("devices")
        .gpu
        .expect("gpu assignment present");

    assert_eq!(gpu_assignment.assignment_mode, GpuAssignmentMode::Default);
    assert!(gpu_assignment.assignment_request.is_empty());
}
/// With the unmodified fixture spec (assumed to carry no GPU request) the
/// document must still contain a devices block, but its GPU entry must be
/// absent.
#[test]
fn build_virtual_machine_doc_without_gpu_omits_assignment() {
    let test_uvm = Uvm::for_test(
        "no-gpu",
        std::path::PathBuf::from(r"C:\scratch.vhdx"),
        std::path::PathBuf::from(r"C:\boot"),
    );
    let plain_spec = fixture_spec();

    let vm_doc = build_virtual_machine_doc(&test_uvm, &[], &plain_spec, &[]);
    let device_block = vm_doc.devices.expect("devices");

    assert!(
        device_block.gpu.is_none(),
        "spec without GpuSpec must produce a Devices block with no GPU field",
    );
}
}