coreos-installer 0.16.1

Installer for Fedora CoreOS and RHEL CoreOS
// Copyright 2019 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use anyhow::{anyhow, bail, Context, Result};
use gptman::{GPTPartitionEntry, GPT};
use nix::sys::stat::{major, minor};
use nix::{errno::Errno, mount, sched};
use regex::Regex;
use std::collections::{HashMap, HashSet};
use std::fs::{
    canonicalize, metadata, read_dir, read_to_string, remove_dir, symlink_metadata, File,
    OpenOptions,
};
use std::io::{Read, Seek, SeekFrom, Write};
use std::num::{NonZeroU32, NonZeroU64};
use std::os::linux::fs::MetadataExt;
use std::os::raw::c_int;
use std::os::unix::fs::FileTypeExt;
use std::os::unix::io::AsRawFd;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::thread::sleep;
use std::time::Duration;
use uuid::Uuid;

use crate::cmdline::PartitionFilter;
use crate::util::*;

use crate::{runcmd, runcmd_output};

/// A block device, identified by the path of its device node.
pub struct Disk {
    // Canonicalized, UTF-8 path to the device, as produced by `Disk::new`.
    path: String,

impl Disk {
    /// Build a `Disk` from a path to a device node.
    ///
    /// The path is canonicalized, and the canonical form must be valid
    /// UTF-8 because it is stored as a `String`.  Either step failing is
    /// reported as an error naming the path.
    pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
        let path = path.as_ref();
        let canon_path = path
            .with_context(|| format!("canonicalizing {}", path.display()))?;

        let canon_path = canon_path
            .with_context(|| {
                    "path {} canonicalized from {} is not UTF-8",

        Ok(Disk { path: canon_path })

    /// Mount the one partition of this disk whose label equals `label`,
    /// passing `flags` through to the mount.
    ///
    /// Fails if the disk has no partitions, if no partition or more than
    /// one partition carries the label, or if the matching partition has
    /// no known filesystem type.
    pub fn mount_partition_by_label(&self, label: &str, flags: mount::MsFlags) -> Result<Mount> {
        // get partition list
        let partitions = self.get_partitions()?;
        if partitions.is_empty() {
            bail!("couldn't find any partitions on {}", self.path);

        // find the partition with the matching label
        // (a partition without a label is compared as the empty string)
        let matching_partitions = partitions
            .filter(|d| d.label.as_ref().unwrap_or(&"".to_string()) == label)
        let part = match matching_partitions.len() {
            0 => bail!("couldn't find {} device for {}", label, self.path),
            1 => matching_partitions[0],
            _ => bail!(
                "found multiple devices on {} with label \"{}\"",

        // mount it
        match &part.fstype {
            Some(fstype) => Mount::try_mount(&part.path, fstype, flags),
            None => bail!(
                "couldn't get filesystem type of {} device for {}",

    /// List the partitions of this disk, built from the `lsblk` output for
    /// the disk and its dependents.
    fn get_partitions(&self) -> Result<Vec<Partition>> {
        // walk each device in the output
        let mut result: Vec<Partition> = Vec::new();
        for devinfo in lsblk(Path::new(&self.path), true)? {
            if let Some(name) = devinfo.get("NAME") {
                // Only return partitions.  Skip the whole-disk device, as well
                // as holders like LVM or RAID devices using one of the partitions.
                if devinfo.get("TYPE").map(|s| s.as_str()) != Some("part") {
                // lsblk reports active swap as the pseudo-mountpoint "[SWAP]";
                // record that as `swap` rather than as a mountpoint.
                let (mountpoint, swap) = match devinfo.get("MOUNTPOINT") {
                    Some(mp) if mp == "[SWAP]" => (None, true),
                    Some(mp) => (Some(mp.to_string()), false),
                    None => (None, false),
                result.push(Partition {
                    path: name.to_owned(),
                    label: devinfo.get("LABEL").map(<_>::to_string),
                    fstype: devinfo.get("FSTYPE").map(<_>::to_string),
                    parent: self.path.to_owned(),

    /// Return an empty list if we have exclusive access to the device, or
    /// a list of partitions preventing us from gaining exclusive access.
    ///
    /// Note that this takes `self` by value, so the `Disk` is consumed.
    pub fn get_busy_partitions(self) -> Result<Vec<Partition>> {
        // Try rereading the partition table.  This is the most complete
        // check, but it only works on partitionable devices.
        let rereadpt_result = {
            let mut f = OpenOptions::new()
                .with_context(|| format!("opening {}", &self.path))?;
            // single attempt, no retries; success maps to "no busy partitions"
            reread_partition_table(&mut f, false).map(|_| Vec::new())
        if rereadpt_result.is_ok() {
            return rereadpt_result;

        // Walk partitions, record the ones that are reported in use,
        // and return the list if any
        let mut busy: Vec<Partition> = Vec::new();
        for d in self.get_partitions()? {
            // in use = mounted, active swap, or held by another device
            if d.mountpoint.is_some() || d.swap || !d.get_holders()?.is_empty() {
        if !busy.is_empty() {
            return Ok(busy);

        // Our investigation found nothing.  If the device is expected to be
        // partitionable but reread failed, we evidently missed something,
        // so error out for safety
        if !self.is_dm_device() {
            return rereadpt_result;


    /// Get a handle to the set of device nodes for individual partitions
    /// of the device.
    pub fn get_partition_table(&self) -> Result<Box<dyn PartTable>> {
        // NOTE(review): both branch bodies are missing from this view;
        // presumably DM devices get a `PartTableKpartx` and everything else
        // a `PartTableKernel` -- confirm.
        if self.is_dm_device() {
        } else {

    // NOTE(review): body not visible here; presumably reports whether the
    // path refers to a device-mapper device -- confirm.
    pub fn is_dm_device(&self) -> bool {

    /// Check whether this device is of type LUKS integrity.
    ///
    /// A device that is not a DM device is never reported as LUKS
    /// integrity.
    pub fn is_luks_integrity(&self) -> Result<bool> {
        if !self.is_dm_device() {
            return Ok(false);
        .with_context(|| format!("checking if device {} is type LUKS integrity", self.path))?

/// A handle to the set of device nodes for individual partitions of a
/// device.  Must be held as long as the device nodes are needed; they might
/// be removed upon drop.
///
/// Implemented in this file by `PartTableKernel` and `PartTableKpartx`.
pub trait PartTable {
    /// Update device nodes for the current state of the partition table
    fn reread(&mut self) -> Result<()>;

/// Device nodes for partitionable kernel devices, managed by the kernel.
pub struct PartTableKernel {
    // Open handle to the device; `reread` issues the partition table
    // reread through it.
    file: File,

impl PartTableKernel {
    /// Open the device at `path` and wrap the resulting file handle.
    ///
    /// Fails with an "opening <path>" error if the device cannot be opened.
    fn new(path: &str) -> Result<Self> {
        let file = OpenOptions::new()
            .with_context(|| format!("opening {}", path))?;
        Ok(Self { file })

impl PartTable for PartTableKernel {
    /// Ask the kernel to reread the partition table, with retries enabled.
    fn reread(&mut self) -> Result<()> {
        reread_partition_table(&mut self.file, true)?;

/// Device nodes for non-partitionable kernel devices, managed by running
/// kpartx to parse the partition table and create device-mapper devices for
/// each partition.
pub struct PartTableKpartx {
    // Path of the device that kpartx is run against.
    path: String,
    // True when the partition devices did not already exist at construction
    // time; only then does `Drop` remove them with `kpartx -d`.
    need_teardown: bool,

impl PartTableKpartx {
    /// Create a handle for the device at `path`, recording whether its
    /// partition devices already existed so that `Drop` knows whether to
    /// remove them.
    fn new(path: &str) -> Result<Self> {
        let mut table = Self {
            path: path.to_string(),
            need_teardown: !Self::already_set_up(path)?,
        // create/sync partition devices if missing

    // We only want to kpartx -d on drop if we're the one initially
    // creating the partition devices.  There's no good way to detect
    // this.
    //
    // Heuristic used here: report true if /dev/mapper contains an entry
    // named after the device's filename followed by "p" and a decimal
    // number.
    fn already_set_up(path: &str) -> Result<bool> {
        let re = Regex::new(r"^p[0-9]+$").expect("compiling RE");
        let expected = Path::new(path)
            .with_context(|| format!("getting filename of {}", path))?
            .map_err(|_| anyhow!("converting filename of {}", path))?;
        for ent in read_dir("/dev/mapper").context("listing /dev/mapper")? {
            let ent = ent.context("reading /dev/mapper entry")?;
            let found = ent.file_name().into_string().map_err(|_| {
                    "converting filename of {}",
            // the suffix after the device name must be exactly p<digits>
            if found.starts_with(&expected) && re.is_match(&found[expected.len()..]) {
                return Ok(true);

    /// Run `kpartx <flag> -n <path>` against this device.
    fn run_kpartx(&self, flag: &str) -> Result<()> {
        // Swallow stderr on success.  Avoids spurious warnings:
        //   GPT:Primary header thinks Alt. header is not at the end of the disk.
        //   GPT:Alternate GPT header not at the end of the disk.
        //   GPT: Use GNU Parted to correct GPT errors.
        // By default, kpartx waits for udev to settle before returning,
        // but this blocks indefinitely inside a container.
        // Use -n to skip blocking on udev, and then manually settle.
        runcmd_output!("kpartx", flag, "-n", &self.path)?;

impl PartTable for PartTableKpartx {
    /// Create or update the partition devices with `kpartx -u`.
    ///
    /// The loop makes up to four attempts, returning as soon as one
    /// succeeds and printing the error and a retry notice to stderr after
    /// each failure.
    fn reread(&mut self) -> Result<()> {
        let delay = 1;
        for _ in 0..4 {
            match self.run_kpartx("-u") {
                Ok(()) => return Ok(()),
                Err(e) => eprintln!("Error: {}", e),
            eprintln!("Retrying in {} second", delay);

impl Drop for PartTableKpartx {
    /// If we created the partition devices (rather than finding them
    /// already existing), delete them afterward so we don't leave DM
    /// devices attached to the specified disk.
    fn drop(&mut self) {
        if self.need_teardown {
            // drop() cannot return an error, so a failed teardown is only
            // reported on stderr
            if let Err(e) = self.run_kpartx("-d") {
                eprintln!("{}", e)

/// One partition of a disk, described by fields taken from `lsblk` output.
pub struct Partition {
    /// Device path of the partition (the lsblk `NAME` value).
    pub path: String,
    /// Filesystem label (lsblk `LABEL`), if any.
    pub label: Option<String>,
    /// Filesystem type (lsblk `FSTYPE`), if any.
    pub fstype: Option<String>,

    /// Path of the disk this partition was listed under.
    pub parent: String,
    /// Where the partition is mounted, if anywhere.  Active swap is
    /// recorded in `swap`, not here.
    pub mountpoint: Option<String>,
    /// Whether lsblk reported the partition as active swap.
    pub swap: bool,

impl Partition {
    /// Return start and end offsets within the disk.
    ///
    /// `path` is the partition's device node.  Both offsets are in bytes,
    /// computed from the `start` and `size` values that sysfs exposes for
    /// the node's major:minor number.  Arithmetic overflow is reported as
    /// an error.
    pub fn get_offsets(path: &str) -> Result<(u64, u64)> {
        let dev = metadata(path)
            .with_context(|| format!("getting metadata for {}", path))?
        let maj: u64 = major(dev);
        let min: u64 = minor(dev);

        let start = read_sysfs_dev_block_value_u64(maj, min, "start")?;
        let size = read_sysfs_dev_block_value_u64(maj, min, "size")?;

        // We multiply by 512 here: the kernel values are always in 512 blocks, regardless of the
        // actual sector size of the block device. We keep the values as bytes to make things
        // easier.
        let start_offset: u64 = start
            .context("start offset mult overflow")?;
        let end_offset: u64 = start_offset
            .checked_add(size.checked_mul(512).context("end offset mult overflow")?)
            .context("end offset add overflow")?;
        Ok((start_offset, end_offset))

    /// List the devices holding this partition, as `/dev/<name>` paths
    /// built from the entries of the `holders` directory inside the
    /// partition's sysfs directory.
    pub fn get_holders(&self) -> Result<Vec<String>> {
        let holders = self.get_sysfs_dir()?.join("holders");
        let mut ret: Vec<String> = Vec::new();
        for ent in read_dir(&holders).with_context(|| format!("reading {}", &holders.display()))? {
            let ent = ent.with_context(|| format!("reading {} entry", &holders.display()))?;
            ret.push(format!("/dev/{}", ent.file_name().to_string_lossy()));

    // Try to locate the device directory in sysfs.  Two layouts are tried
    // in order; an error is returned if neither directory exists.
    fn get_sysfs_dir(&self) -> Result<PathBuf> {
        let basedir = Path::new("/sys/block");

        // First assume we have a regular partition.
        // /sys/block/sda/sda1
        let devdir = basedir
                    .with_context(|| format!("parent {} has no filename", self.parent))?,
                    .with_context(|| format!("path {} has no filename", self.path))?,
        if devdir.exists() {
            return Ok(devdir);

        // Now assume a kpartx "partition", where the path is a symlink to
        // an unpartitioned DM device node.
        // /sys/block/dm-1
        let is_link = symlink_metadata(&self.path)
            .with_context(|| format!("reading metadata for {}", self.path))?
        if is_link {
            // resolve the symlink and look up its target by filename
            let target = canonicalize(&self.path)
                .with_context(|| format!("getting absolute path to {}", self.path))?;
            let devdir = basedir.join(
                    .with_context(|| format!("target {} has no filename", target.display()))?,
            if devdir.exists() {
                return Ok(devdir);

        // Give up
            "couldn't find /sys/block directory for partition {} of {}",

/// A mounted filesystem: either one mounted by `Mount::try_mount`, or a
/// pre-existing mount looked up with `Mount::from_existing`.
pub struct Mount {
    // Device backing the mount.
    device: String,
    // Directory the device is mounted on.
    mountpoint: PathBuf,
    /// Whether we own this mount.  Set for mounts made by `try_mount` and
    /// cleared for mounts found by `from_existing`; `Drop` checks it
    /// before unmounting.
    owned: bool,

impl Mount {
    /// Mount `device` as filesystem type `fstype` with `flags` on a newly
    /// created temporary directory, after unsharing the mount namespace.
    /// The returned `Mount` is owned.
    pub fn try_mount(device: &str, fstype: &str, flags: mount::MsFlags) -> Result<Mount> {
        let tempdir = tempfile::Builder::new()
            .context("creating temporary directory")?;
        // avoid auto-cleanup of tempdir, which could recursively remove
        // the partition contents if umount failed
        let mountpoint = tempdir.into_path();

        // Ensure we're in a private mount namespace so the mount isn't
        // visible to the rest of the system.  Multiple unshare calls
        // should be safe.
        sched::unshare(sched::CloneFlags::CLONE_NEWNS).context("unsharing mount namespace")?;

        mount::mount::<str, Path, str, str>(Some(device), &mountpoint, Some(fstype), flags, None)
            .with_context(|| format!("mounting device {} on {}", device, mountpoint.display()))?;

        Ok(Mount {
            device: device.to_string(),
            owned: true,

    /// Look up an existing mount by mountpoint in `/proc/self/mounts`.
    ///
    /// Returns a non-owned `Mount` for the first line whose mountpoint
    /// field equals `path`.  Fails if no line matches, or if a line
    /// examined before a match does not have exactly six fields.
    pub fn from_existing(path: &str) -> Result<Mount> {
        let mounts = read_to_string("/proc/self/mounts").context("reading mount table")?;
        for line in mounts.lines() {
            let mount: Vec<&str> = line.split_whitespace().collect();
            // each line is expected to have six whitespace-separated fields;
            // field 0 is the device and field 1 the mountpoint
            if mount.len() != 6 {
                bail!("invalid line in /proc/self/mounts: {}", line);
            if mount[1] == path {
                return Ok(Mount {
                    device: mount[0].to_string(),
                    mountpoint: path.into(),
                    owned: false,
        bail!("mountpoint {} not found", path);

    // NOTE(review): body not visible here; presumably returns the `device`
    // field -- confirm.
    pub fn device(&self) -> &str {

    // NOTE(review): body not visible here; presumably returns the
    // `mountpoint` field -- confirm.
    pub fn mountpoint(&self) -> &Path {

    // NOTE(review): body not visible here; presumably delegates to
    // `Partition::get_offsets` for this mount's device -- confirm.
    pub fn get_partition_offsets(&self) -> Result<(u64, u64)> {

    /// Look up the UUID of the mounted filesystem from the `lsblk` data for
    /// the backing device.  Fails if the filesystem has no UUID.
    pub fn get_filesystem_uuid(&self) -> Result<String> {
        let devinfo = lsblk_single(Path::new(&self.device))?;
            .with_context(|| format!("filesystem {} has no UUID", self.device))

impl Drop for Mount {
    /// Unmount the filesystem and remove its mountpoint directory.
    /// Failures are only reported on stderr, since `drop` cannot return an
    /// error.
    fn drop(&mut self) {
        // NOTE(review): the body of this check is not visible here;
        // presumably mounts we don't own are left alone -- confirm.
        if !self.owned {

        // Unmount sometimes fails immediately after closing the last open
        // file on the partition.  Retry several times before giving up.
        for retries in (0..20).rev() {
            match mount::umount(&self.mountpoint) {
                Ok(_) => break,
                Err(err) => {
                    // `retries` counts down, so 0 marks the final attempt
                    if retries == 0 {
                        eprintln!("umounting {}: {}", self.device, err);
                    } else {
        if let Err(err) = remove_dir(&self.mountpoint) {
            eprintln!("removing {}: {}", self.mountpoint.display(), err);

/// Partition entries copied out of an existing GPT so that they can be
/// written back with `overwrite` or `merge`.
pub struct SavedPartitions {
    // Sector size that GPTs are cross-checked against and created with.
    sector_size: u64,
    // Saved entries, each paired with its partition number in the GPT it
    // was read from.
    partitions: Vec<(u32, GPTPartitionEntry)>,

impl SavedPartitions {
    /// Create a SavedPartitions for a block device, using the sector size
    /// reported by the device itself.
    ///
    /// Fails with "specified file is not a block device" when the metadata
    /// check on `disk` does not pass.
    pub fn new_from_disk(disk: &mut File, filters: &[PartitionFilter]) -> Result<Self> {
        if !disk
            .context("getting disk metadata")?
            bail!("specified file is not a block device");
        Self::new(disk, get_sector_size(disk)?.get() as u64, filters)

    /// Create a SavedPartitions for a file with a specified imputed sector
    /// size.  Useful for unit tests, and fails on a real disk.
    ///
    /// `sector_size` must be 512 or 4096; any other value is rejected.
    pub fn new_from_file(
        disk: &mut File,
        sector_size: u64,
        filters: &[PartitionFilter],
    ) -> Result<Self> {
        if disk
            .context("getting disk metadata")?
            bail!("called new_from_file() on a block device");
        match sector_size {
            512 | 4096 => (),
            _ => bail!("specified unreasonable sector size {}", sector_size),
        Self::new(disk, sector_size, filters)

    /// Shared constructor: read the GPT from `disk` and save every entry
    /// accepted by `filters`.
    ///
    /// Nothing is saved when `filters` is empty or when the disk has no
    /// GPT.  If anything was saved, a dry-run restore into a temporary
    /// file is performed, so that an invalid saved table is detected
    /// before the caller modifies the disk.
    fn new(disk: &mut File, sector_size: u64, filters: &[PartitionFilter]) -> Result<Self> {
        // if there are no filters, ignore existing GPT, since we're going to
        // overwrite it
        if filters.is_empty() {
            return Ok(Self {
                partitions: Vec::new(),

        // read GPT
        let gpt = match GPT::find_from(disk) {
            Ok(gpt) => gpt,
            Err(gptman::Error::InvalidSignature) => {
                // ensure no indexes are listed to be saved from a MBR disk
                // we don't need to check for labels since MBR does not support them
                if filters
                    .any(|f| matches!(f, PartitionFilter::Index(_, _)))
                    && disk_has_mbr(disk).context("checking if disk has an MBR")?
                    bail!("saving partitions from an MBR disk is not yet supported");

                // no GPT on this disk, so no partitions to save
                return Ok(Self {
                    partitions: Vec::new(),
            Err(e) => return Err(e).context("reading partition table"),

        // cross-check GPT sector size
        Self::verify_gpt_sector_size(&gpt, sector_size)?;

        // save partitions accepted by filters
        let mut partitions = Vec::new();
        for (i, p) in gpt.iter() {
            if Self::matches_filters(i, p, filters) {
                partitions.push((i, p.clone()));
        let result = Self {

        // Test restoring the saved partitions to a temporary file.  If the
        // resulting partition table contains invalid data (e.g. duplicate
        // partition GUIDs) we need to know now, before the caller
        // overwrites the partition table.  Otherwise we could fail to
        // restore, clear the table, and fail to restore _again_ to the
        // empty table.
        if !result.partitions.is_empty() {
            // NOTE(review): the next line looks truncated in this view
            // (the expression producing `len` is missing).
            let len ="getting disk size")?;
            let mut temp = tempfile::tempfile().context("creating dry run image")?;
                .with_context(|| format!("setting test image size to {}", len))?;
            result.overwrite(&mut temp).context(
                "failed dry run restoring saved partitions; input partition table may be invalid",

    /// Check that the sector size reported by `disk` equals the sector
    /// size recorded in `self`.
    ///
    /// The check is skipped (returning `Ok`) when the metadata test at the
    /// top does not pass.  NOTE(review): presumably that means `disk` is
    /// not a block device -- confirm.
    fn verify_disk_sector_size(&self, disk: &File) -> Result<()> {
        if !disk
            .context("getting disk metadata")?
            return Ok(());
        let disk_sector_size = get_sector_size(disk)?.get() as u64;
        if disk_sector_size != self.sector_size {
                "disk sector size {} doesn't match expected {}",

    /// Check that the sector size recorded in `gpt` equals `sector_size`,
    /// the value the caller expects.
    fn verify_gpt_sector_size(gpt: &GPT, sector_size: u64) -> Result<()> {
        if gpt.sector_size != sector_size {
                "GPT sector size {} doesn't match expected {}",

    /// Decide whether partition number `i` with entry `p` should be saved.
    ///
    /// Unused entries never match.  Otherwise the entry matches if any
    /// filter accepts it:
    /// - `Index(first, last)` accepts `i` within the inclusive range; a
    ///   missing bound leaves that side open.
    /// - `Label(glob)` accepts entries whose partition name matches the
    ///   glob.
    fn matches_filters(i: u32, p: &GPTPartitionEntry, filters: &[PartitionFilter]) -> bool {
        use PartitionFilter::*;
        if !p.is_used() {
            return false;
        filters.iter().any(|f| match f {
            // below the lower bound
            Index(Some(first), _) if first.get() > i => false,
            // above the upper bound
            Index(_, Some(last)) if last.get() < i => false,
            Index(_, _) => true,
            Label(glob) if glob.matches(p.partition_name.as_str()) => true,
            _ => false,

    /// Unconditionally write the saved partitions, and only the saved
    /// partitions, to the disk.  Write a protective MBR and overwrite any
    /// MBR boot code.  Updating the kernel partition table is the caller's
    /// responsibility.
    pub fn overwrite(&self, disk: &mut File) -> Result<()> {
        // create GPT
        let mut gpt = GPT::new_from(disk, self.sector_size, *Uuid::new_v4().as_bytes())
            .context("creating new GPT")?;

        // add partitions
        for (i, p) in &self.partitions {
            gpt[*i] = p.clone();

        // write GPT
        gpt.write_into(disk).context("writing new GPT")?;

        // Overwrite only the parts of the MBR that don't contain the
        // partition table, then write protective MBR.  This ensures that
        // there's no time window without an MBR, during which the kernel
        // would refuse to read the GPT."seeking to MBR")?;
        disk.write(&[0u8; 446])
            .context("overwriting MBR boot code")?;
        if self.sector_size > 512 {
                .context("seeking to end of MBR")?;
            disk.write(&vec![0u8; self.sector_size as usize - 512])
                .context("overwriting end of MBR")?;
        GPT::write_protective_mbr_into(disk, self.sector_size).context("writing protective MBR")?;


    /// If any partitions are saved, merge them into the GPT from source,
    /// which must be valid, and write a protective MBR with the correct
    /// protective partition size.  Updating the kernel partition table is
    /// the caller's responsibility.
    ///
    /// Does nothing when no partitions are saved.  Fails if the GPT in
    /// `source` cannot be read or its sector size differs from the saved
    /// one.
    pub fn merge(&self, source: &mut (impl Read + Seek), disk: &mut File) -> Result<()> {
        if self.partitions.is_empty() {
            return Ok(());

        // read GPT
        let mut gpt =
            GPT::find_from(source).context("couldn't read partition table from source")?;
        Self::verify_gpt_sector_size(&gpt, self.sector_size)?;
        // The GPT thinks the disk is the size of the install image.
        // Update sizing.
            .update_from(disk, self.sector_size)
            .context("updating GPT header")?;

        // merge saved partitions into partition table
        // find partition number one larger than the largest used one
        // (1 if no entry is in use)
        let mut next = gpt
            .fold(1, |prev, (i, e)| if e.is_used() { i + 1 } else { prev });
        for (i, p) in &self.partitions {
            // use the next partition number in the sequence if we have to,
            // or the partition's original number if it's larger
            next = next.max(*i);
                "Saving partition {} (\"{}\") to new partition {}",
                i, p.partition_name, next
            gpt[next] = p.clone();
            next += 1;

        // write
        gpt.write_into(disk).context("writing updated GPT")?;

        // update protective partition size
        GPT::write_protective_mbr_into(disk, self.sector_size).context("writing protective MBR")?;

    /// Get the sector size in use for this partition table.
    // NOTE(review): body not visible here; presumably returns the
    // `sector_size` field -- confirm.
    pub fn get_sector_size(&self) -> u64 {

    /// Get the byte offset of the first byte not to be overwritten, if any,
    /// plus a description of the partition at that offset.
    ///
    /// The partition chosen is the saved one with the lowest starting LBA.
    /// Returns `Ok(None)` when no partitions are saved, and an error if
    /// computing the offset overflows.
    pub fn get_offset(&self) -> Result<Option<(u64, String)>> {
        match self.partitions.iter().min_by_key(|(_, p)| p.starting_lba) {
            None => Ok(None),
            Some((i, p)) => Ok(Some((
                    .context("overflow calculating partition start")?,
                format!("partition {} (\"{}\")", i, p.partition_name.as_str()),

    // NOTE(review): body not visible here; presumably reports whether any
    // partitions were saved -- confirm.
    pub fn is_saved(&self) -> bool {

/// Read the sysfs attribute `field` of block device `maj`:`min` and parse
/// it as a `u64`.  Errors from either step name the device and the field.
fn read_sysfs_dev_block_value_u64(maj: u64, min: u64, field: &str) -> Result<u64> {
    let s = read_sysfs_dev_block_value(maj, min, field).with_context(|| {
            "reading partition {}:{} {} value from sysfs",
            maj, min, field
    s.parse().with_context(|| {
            "parsing partition {}:{} {} value \"{}\" as u64",
            maj, min, field, &s

/// Read the sysfs attribute at `/sys/dev/block/<maj>:<min>/<field>` as a
/// string.
// NOTE(review): the read itself is not visible here; confirm whether the
// returned value is trimmed.
fn read_sysfs_dev_block_value(maj: u64, min: u64, field: &str) -> Result<String> {
    let path = PathBuf::from(format!("/sys/dev/block/{}:{}/{}", maj, min, field));

/// Query `lsblk` for `dev` alone (without its dependents) and return its
/// key-value fields.  Fails if `lsblk` yields no results.
pub fn lsblk_single(dev: &Path) -> Result<HashMap<String, String>> {
    let mut devinfos = lsblk(Path::new(dev), false)?;
    if devinfos.is_empty() {
        // this should never happen because `lsblk` itself would've failed
        bail!("no lsblk results for {}", dev.display());

/// Returns all available filesystems.
/// rereadpt mitigates possible issue with outdated UUIDs on different
/// paths to the same disk: after 'ignition-ostree-firstboot-uuid'
/// '/dev/sdaX' path gets new UUID, but '/dev/sdbX/' path has an old one
///
/// When `rereadpt` is set, each device listed by `lsblk` is opened and a
/// single partition table reread is attempted on it first; failures to
/// open or reread are ignored.
fn get_all_filesystems(rereadpt: bool) -> Result<Vec<HashMap<String, String>>> {
    if rereadpt {
        let mut cmd = Command::new("lsblk");
        let output = cmd_output(&mut cmd)?;
        for dev in output.lines() {
            if let Ok(mut fd) = std::fs::File::open(dev) {
                // best-effort reread of disk that may have busy partitions; don't retry
                let _ = reread_partition_table(&mut fd, false);

/// Returns filesystems with given label.
/// If multiple filesystems with the label have the same UUID, we only return one of them.
///
/// The returned strings are the `NAME` values of the matching
/// filesystems.  Filesystems without a `UUID` entry are always kept.
pub fn get_filesystems_with_label(label: &str, rereadpt: bool) -> Result<Vec<String>> {
    // UUIDs seen so far, for the de-duplication described above
    let mut uuids = HashSet::new();
    let result = get_all_filesystems(rereadpt)?
        .filter(|v| v.get("LABEL").map(|l| l.as_str()) == Some(label))
        .filter(|v| match v.get("UUID") {
            Some(uuid) => {
                if !uuid.is_empty() {
                } else {
            None => true,
        .filter_map(|v| v.get("NAME").map(<_>::to_owned))

/// Run `lsblk` on `dev` and return one key-value map per line of output.
// NOTE(review): the command arguments are not visible here.  Callers pass
// `with_deps = true` to also list dependent devices, and `!with_deps`
// presumably adds an option restricting output to `dev` itself -- confirm.
pub fn lsblk(dev: &Path, with_deps: bool) -> Result<Vec<HashMap<String, String>>> {
    let mut cmd = Command::new("lsblk");
    // Older lsblk, e.g. in CentOS 7.6, doesn't support PATH, but --paths option
    if !with_deps {
    let output = cmd_output(&mut cmd)?;
    let mut result: Vec<HashMap<String, String>> = Vec::new();
    for line in output.lines() {
        // parse key-value pairs

/// Parse key-value pairs from blkid.
///
/// The text before the first `:` is the device name, which is stored under
/// the `NAME` key; the rest of the line is parsed with `split_lsblk_line`.
/// A line with no `:` or an empty name produces an empty map.
fn split_blkid_line(line: &str) -> HashMap<String, String> {
    let (name, data) = match line.find(':') {
        Some(n) => line.split_at(n),
        None => return HashMap::new(),

    // `data` still starts with the ':' separator; skip it
    let (name, data) = (name.trim(), data[1..].trim());
    if name.is_empty() {
        return HashMap::new();

    let mut fields = split_lsblk_line(data);
    fields.insert("NAME".to_string(), name.to_string());

/// Run `blkid` and return one key-value map per device.
// NOTE(review): the loop body is not visible here; presumably each line is
// parsed with `split_blkid_line` -- confirm.
fn blkid() -> Result<Vec<HashMap<String, String>>> {
    let mut cmd = Command::new("blkid");
    let output = cmd_output(&mut cmd)?;

    let mut result: Vec<HashMap<String, String>> = Vec::new();
    for line in output.lines() {

/// This is a bit fuzzy, but... this function will return every block device in the parent
/// hierarchy of `device` capable of containing other partitions. So e.g. parent devices of type
/// "part" don't match, but "disk" and "mpath" do.
///
/// Fails if no such parent is found, or if an entry in the `lsblk` output
/// lacks a `NAME` or `TYPE` field.
pub fn find_parent_devices(device: &str) -> Result<Vec<String>> {
    let mut cmd = Command::new("lsblk");
    // Older lsblk, e.g. in CentOS 7.6, doesn't support PATH, but --paths option
    let output = cmd_output(&mut cmd)?;
    let mut parents = Vec::new();
    // skip first line, which is the device itself
    for line in output.lines().skip(1) {
        let dev = split_lsblk_line(line);
        let name = dev
            .with_context(|| format!("device in hierarchy of {} missing NAME", device))?;
        let kind = dev
            .with_context(|| format!("device in hierarchy of {} missing TYPE", device))?;
        if kind == "disk" {
        } else if kind == "mpath" {
            // we don't need to know what disks back the multipath
    if parents.is_empty() {
        bail!("no parent devices found for {}", device);

/// Find ESP partitions which sit at the same hierarchy level as `device`.
///
/// ESPs are recognized by their partition type GUID (`PARTTYPE` in the
/// `lsblk` output) on each parent device of `device`.
pub fn find_colocated_esps(device: &str) -> Result<Vec<String>> {
    // partition type GUID that marks a partition as an ESP
    const ESP_TYPE_GUID: &str = "c12a7328-f81f-11d2-ba4b-00a0c93ec93b";

    // first, get the parent device
    let parent_devices = find_parent_devices(device)
        .with_context(|| format!("while looking for colocated ESPs of '{}'", device))?;

    // now, look for all ESPs on those devices
    let mut esps = Vec::new();
    for parent_device in parent_devices {
        let mut cmd = Command::new("lsblk");
        // Older lsblk, e.g. in CentOS 7.6, doesn't support PATH, but --paths option
        for line in cmd_output(&mut cmd)?.lines() {
            let dev = split_lsblk_line(line);
            if dev.get("PARTTYPE").map(|t| t.as_str()) == Some(ESP_TYPE_GUID) {
                        .context("ESP device with missing NAME")?,

/// Find the vendor directory on a mounted ESP: the subdirectory of `EFI`
/// that contains a `grub.cfg` file.
///
/// Fails unless exactly one such directory exists.
pub fn find_efi_vendor_dir(efi_mount: &Mount) -> Result<PathBuf> {
    let p = efi_mount.mountpoint().join("EFI");
    let mut vendor_dir: Vec<PathBuf> = Vec::new();
    for ent in p.read_dir()? {
        let ent = ent.with_context(|| format!("reading directory entry in {}", p.display()))?;
        // only directories can be vendor dirs
        if !ent.file_type()?.is_dir() {
        let path = ent.path();
        if path.join("grub.cfg").is_file() {
    if vendor_dir.len() != 1 {
            "Expected one vendor dir on {}, got {} ({:?})",

/// Parse key-value pairs from lsblk --pairs.
/// Newer versions of lsblk support JSON but the one in CentOS 7 doesn't.
///
/// Pairs have the form `KEY="value"`.  The pattern requires at least one
/// character between the quotes, so keys with an empty value are left out
/// of the map.
fn split_lsblk_line(line: &str) -> HashMap<String, String> {
    let re = Regex::new(r#"([A-Z-_]+)="([^"]+)""#).unwrap();
    let mut fields: HashMap<String, String> = HashMap::new();
    for cap in re.captures_iter(line) {
        fields.insert(cap[1].to_string(), cap[2].to_string());

/// List the entries of the sysfs dependency directory for `device`.
///
/// The directory lives under `/sys/block`, keyed by the filename of the
/// canonicalized device path.  A missing directory gives an empty list.
// NOTE(review): the final path component and the loop body are not visible
// here -- confirm which directory is read and how entries become paths.
pub fn get_blkdev_deps(device: &Path) -> Result<Vec<PathBuf>> {
    let deps = {
        let mut p = PathBuf::from("/sys/block");
                .with_context(|| format!("canonicalizing {}", device.display()))?
                .with_context(|| format!("path {} has no filename", device.display()))?,
    let mut ret: Vec<PathBuf> = Vec::new();
    let dir_iter = match read_dir(&deps) {
        // no such directory: the device has no dependencies to report
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(ret),
        Err(e) => return Err(e).with_context(|| format!("reading dir {}", &deps.display())),
        Ok(it) => it,
    for ent in dir_iter {
        let ent = ent.with_context(|| format!("reading {} entry", &deps.display()))?;

/// Collect the dependencies of `device`, starting from `get_blkdev_deps`.
// NOTE(review): the loop body is not visible here; going by the name, it
// presumably recurses into each dependency -- confirm.
pub fn get_blkdev_deps_recursing(device: &Path) -> Result<Vec<PathBuf>> {
    let mut ret: Vec<PathBuf> = Vec::new();
    for dep in get_blkdev_deps(device)? {

/// Ask the kernel to reread the partition table of the device open as
/// `file`.
///
/// With `retry` set, up to 20 attempts are made, sleeping 100 ms after
/// each failure; otherwise a single attempt is made.  The error from the
/// final attempt is returned, with a more specific message for `EINVAL`
/// and `EBUSY`.
fn reread_partition_table(file: &mut File, retry: bool) -> Result<()> {
    let fd = file.as_raw_fd();
    // Reread sometimes fails inexplicably.  Retry several times before
    // giving up.
    let max_tries = if retry { 20 } else { 1 };
    // `retries` counts down, so 0 marks the final attempt
    for retries in (0..max_tries).rev() {
        let result = unsafe { ioctl::blkrrpart(fd) };
        match result {
            Ok(_) => break,
            Err(err) if retries == 0 && err == Errno::EINVAL => {
                return Err(err)
                    .context("couldn't reread partition table: device may not support partitions")
            Err(err) if retries == 0 && err == Errno::EBUSY => {
                return Err(err).context("couldn't reread partition table: device is in use")
            Err(err) if retries == 0 => return Err(err).context("couldn't reread partition table"),
            Err(_) => sleep(Duration::from_millis(100)),

/// Get the sector size of the block device at a given path.
///
/// Fails if the device cannot be opened, its metadata cannot be read, or
/// it is not a block device.
pub fn get_sector_size_for_path(device: &Path) -> Result<NonZeroU32> {
    let dev = OpenOptions::new()
        .with_context(|| format!("opening {:?}", device))?;

    if !dev
        .with_context(|| format!("getting metadata for {:?}", device))?
        bail!("{:?} is not a block device", device);


/// Get the logical sector size of a block device.
///
/// Fails if the ioctl fails, or if the reported size is zero or does not
/// fit in a `u32`.
pub fn get_sector_size(file: &File) -> Result<NonZeroU32> {
    let fd = file.as_raw_fd();
    // the ioctl wrapper reports the size as a C int
    let mut size: c_int = 0;
    match unsafe { ioctl::blksszget(fd, &mut size) } {
        Ok(_) => {
            let size_u32: u32 = size
                .with_context(|| format!("sector size {} doesn't fit in u32", size))?;
            NonZeroU32::new(size_u32).context("found sector size of zero")
        Err(e) => Err(anyhow!(e).context("getting sector size")),

/// Get the size of a block device.
///
/// Fails if the ioctl fails or reports a size of zero.
// NOTE(review): the value is read into a `libc::size_t`; confirm that this
// is wide enough for what the ioctl writes on 32-bit targets.
pub fn get_block_device_size(file: &File) -> Result<NonZeroU64> {
    let fd = file.as_raw_fd();
    let mut size: libc::size_t = 0;
    match unsafe { ioctl::blkgetsize64(fd, &mut size) } {
        // just cast using `as`: there is no platform we care about today where size_t > 64bits
        Ok(_) => NonZeroU64::new(size as u64).context("found block size of zero"),
        Err(e) => Err(anyhow!(e).context("getting block size")),

/// Get the size of the GPT metadata at the start of the disk.
pub fn get_gpt_size(file: &mut (impl Read + Seek)) -> Result<u64> {
    let gpt = GPT::find_from(file).context("reading GPT")?;
    Ok(gpt.header.first_usable_lba * gpt.sector_size)

/// Report whether `file` carries an MBR boot signature, i.e. whether the
/// two bytes read from the signature position are 0x55 0xAA.
fn disk_has_mbr(file: &mut (impl Read + Seek)) -> Result<bool> {
    let mut sig = [0u8; 2];
        .context("seeking to MBR signature")?;
    file.read_exact(&mut sig).context("reading MBR signature")?;
    Ok(sig == [0x55, 0xaa])

/// Wait for udev to finish processing events by running `udevadm settle`.
///
/// Fails early if the udev control socket `/run/udev/control` does not
/// exist.
pub fn udev_settle() -> Result<()> {
    // "udevadm settle" silently no-ops if the udev socket is missing, and
    // then lsblk can't find partition labels.  Catch this early.
    if !Path::new("/run/udev/control").exists() {
        bail!("udevd socket missing; are we running in a container without /run/udev mounted?");

    // There's a potential window after rereading the partition table where
    // udevd hasn't yet received updates from the kernel, settle will return
    // immediately, and lsblk won't pick up partition labels.  Try to sleep
    // our way out of this.

    runcmd!("udevadm", "settle")?;

/// Inspect a buffer from the start of a disk image and return its formatted
/// sector size, if any can be determined.
///
/// The GPT header signature `EFI PART` is looked for at byte offset 512 and
/// then at byte offset 4096; the offset at which it is found is the sector
/// size.  Returns `None` if the buffer is too short to contain the signature
/// at either offset, or if the signature is not present.
pub fn detect_formatted_sector_size(buf: &[u8]) -> Option<NonZeroU32> {
    let gpt_magic: &[u8] = b"EFI PART";
    // `get` yields `None` for a too-short buffer, so no separate length check
    // is needed.
    let has_magic_at =
        |offset: usize| buf.get(offset..offset + gpt_magic.len()) == Some(gpt_magic);

    if has_magic_at(512) {
        // GPT at offset 512
        NonZeroU32::new(512)
    } else if has_magic_at(4096) {
        // GPT at offset 4096
        NonZeroU32::new(4096)
    } else {
        // Unknown
        None
    }
}

/// Checks if underlying device is IBM DASD disk
pub fn is_dasd(device: &str, fd: Option<&mut File>) -> Result<bool> {
    let target =
        canonicalize(device).with_context(|| format!("getting absolute path to {}", device))?;
    if target.to_string_lossy().starts_with("/dev/dasd") {
        return Ok(true);
    let read_magic = |device: &str, disk: &mut File| -> Result<[u8; 4]> {
        let offset = disk
            .with_context(|| format!("saving offset {}", device))?;
            .with_context(|| format!("seeking {}", device))?;
        let mut lbl = [0u8; 4];
        disk.read_exact(&mut lbl)
            .with_context(|| format!("reading label {}", device))?;
            .with_context(|| format!("restoring offset {}", device))?;
    if target.to_string_lossy().starts_with("/dev/vd") {
        let cdl_magic = [0xd3, 0xf1, 0xe5, 0xd6];
        let lbl = if let Some(t) = fd {
            read_magic(device, t)?
        } else {
            let mut disk = File::open(device).with_context(|| format!("opening {}", device))?;
            read_magic(device, &mut disk)?
        return Ok(cdl_magic == lbl);

// create unsafe ioctl wrappers
mod ioctl {
    use super::c_int;
    use nix::{ioctl_none, ioctl_read, ioctl_read_bad, request_code_none};
    ioctl_none!(blkrrpart, 0x12, 95);
    ioctl_read_bad!(blksszget, request_code_none!(0x12, 104), c_int);
    ioctl_read!(blkgetsize64, 0x12, 114, libc::size_t);

mod tests {
    use super::*;
    use maplit::hashmap;
    use std::io::copy;
    use tempfile::tempfile;
    use xz2::read::XzDecoder;

    // Check split_lsblk_line() against sample lsblk key="value" lines.  The
    // expected maps below omit keys whose value is empty.
    #[test]
    fn lsblk_split() {
        assert_eq!(
            split_lsblk_line(r#"NAME="sda" LABEL="" FSTYPE="""#),
            hashmap! {
                String::from("NAME") => String::from("sda"),
            }
        );
        assert_eq!(
            split_lsblk_line(r#"NAME="sda1" LABEL="" FSTYPE="vfat""#),
            hashmap! {
                String::from("NAME") => String::from("sda1"),
                String::from("FSTYPE") => String::from("vfat")
            }
        );
        assert_eq!(
            split_lsblk_line(r#"NAME="sda2" LABEL="boot" FSTYPE="ext4""#),
            hashmap! {
                String::from("NAME") => String::from("sda2"),
                String::from("LABEL") => String::from("boot"),
                String::from("FSTYPE") => String::from("ext4"),
            }
        );
        assert_eq!(
            split_lsblk_line(r#"NAME="sda3" LABEL="foo=\x22bar\x22 baz" FSTYPE="ext4""#),
            hashmap! {
                String::from("NAME") => String::from("sda3"),
                // for now, we don't care about resolving lsblk's hex escapes,
                // so we just pass them through
                String::from("LABEL") => String::from(r#"foo=\x22bar\x22 baz"#),
                String::from("FSTYPE") => String::from("ext4"),
            }
        );
    }

    // Check split_blkid_line() against sample blkid output lines of the form
    // `<device>: KEY="value" ...`.  The expected maps below carry the device
    // path under the "NAME" key.
    //
    // NOTE(review): this function does not parse as it stands.  The
    // `assert_eq!(` openers, their closers, one expected value and one input
    // literal appear to be missing; restore them from the upstream source
    // rather than guessing.
    fn blkid_split() {
        // empty and separator-only lines yield an empty map
        assert_eq!(split_blkid_line(r#""#), std::collections::HashMap::new());
        assert_eq!(split_blkid_line(r#" : "#), std::collections::HashMap::new());

        // NOTE(review): the expected value for this input is missing
            split_blkid_line(r#": UUID="0000""#),

        // NOTE(review): the input for this expectation is missing
            hashmap! {
                String::from("NAME") => String::from("/dev/empty")

                r#"/dev/mapper/luks-f022921b-0100-4d48-9812-cfa6c225060a: UUID="2ff16ac3-103f-41d4-8e02-03686e255270" BLOCK_SIZE="4096" TYPE="ext4""#
            hashmap! {
                String::from("NAME") => String::from("/dev/mapper/luks-f022921b-0100-4d48-9812-cfa6c225060a"),
                String::from("UUID") => String::from("2ff16ac3-103f-41d4-8e02-03686e255270"),
                String::from("TYPE") => String::from("ext4"),
                String::from("BLOCK_SIZE") => String::from("4096")

                r#"/dev/vdb4: UUID="fdc69fb1-d7f3-4696-846e-b2275504f63c" LABEL="crypt_rootfs" TYPE="crypto_LUKS" PARTLABEL="root" PARTUUID="835753cb-d7f0-465e-84db-07860d3da2f6""#
            hashmap! {
                String::from("NAME") => String::from("/dev/vdb4"),
                String::from("LABEL") => String::from("crypt_rootfs"),
                String::from("UUID") => String::from("fdc69fb1-d7f3-4696-846e-b2275504f63c"),
                String::from("TYPE") => String::from("crypto_LUKS"),
                String::from("PARTLABEL") => String::from("root"),
                String::from("PARTUUID") => String::from("835753cb-d7f0-465e-84db-07860d3da2f6"),

    // Check detect_formatted_sector_size() against an empty buffer and three
    // fixture images, decompressing the xz-compressed fixtures first.
    #[test]
    fn disk_sector_size_reader() {
        struct Test {
            name: &'static str,
            data: &'static [u8],
            compressed: bool,
            result: Option<NonZeroU32>,
        }
        let tests = vec![
            Test {
                name: "zero-length",
                data: b"",
                compressed: false,
                result: None,
            },
            Test {
                name: "empty-disk",
                data: include_bytes!("../fixtures/empty.xz"),
                compressed: true,
                result: None,
            },
            Test {
                name: "gpt-512",
                data: include_bytes!("../fixtures/gpt-512.xz"),
                compressed: true,
                result: NonZeroU32::new(512),
            },
            Test {
                name: "gpt-4096",
                data: include_bytes!("../fixtures/gpt-4096.xz"),
                compressed: true,
                result: NonZeroU32::new(4096),
            },
        ];

        for test in tests {
            let data = if test.compressed {
                let mut decoder = XzDecoder::new(test.data);
                let mut data: Vec<u8> = Vec::new();
                decoder.read_to_end(&mut data).expect("decompress failed");
                data
            } else {
                test.data.to_vec()
            };
            assert_eq!(
                detect_formatted_sector_size(&data),
                test.result,
                "{}",
                test.name
            );
        }
    }

    // Exercise SavedPartitions: saving partitions from a base disk by index
    // and label filters, overwriting them onto an unformatted disk, and
    // merging them with an install image's partition table.  Also covers
    // MBR clobbering, unformatted and MBR-only inputs, overlapping
    // partitions, sector size mismatch, invalid partitions and corrupt GPT
    // headers.
    //
    // NOTE(review): this function does not parse as it stands.  Many lines
    // appear to be missing (tuple and vec delimiters, several filter lists,
    // `assert!(`/`assert_eq!(` openers and their closers).  Restore them
    // from the upstream source rather than guessing.
    fn test_saved_partitions() {
        use PartitionFilter::*;

        // Build an (index, entry) pair.  `start` and `end` are scaled by 2048
        // sectors, and `end` is exclusive: the entry's last LBA is
        // `end * 2048 - 1`.
        let make_part = |i: u32, name: &str, start: u64, end: u64| {
                GPTPartitionEntry {
                    partition_type_guid: make_guid("type"),
                    unique_partition_guid: make_guid(&format!("{} {} {}", name, start, end)),
                    starting_lba: start * 2048,
                    ending_lba: end * 2048 - 1,
                    attribute_bits: 0,
                    partition_name: name.into(),

        // Partitions on the disk that partitions are saved from.  Index 6 is
        // unused, and the last two partitions have empty names.
        let base_parts = vec![
            make_part(1, "one", 1, 1024),
            make_part(2, "two", 1024, 2048),
            make_part(3, "three", 2048, 3072),
            make_part(4, "four", 3072, 4096),
            make_part(5, "five", 4096, 5120),
            make_part(7, "seven", 5120, 6144),
            make_part(8, "eight", 6144, 7168),
            make_part(9, "nine", 7168, 8192),
            make_part(10, "", 8192, 8193),
            make_part(11, "", 8193, 8194),
        // Partitions of the image being installed.
        let image_parts = vec![
            make_part(1, "boot", 1, 384),
            make_part(2, "EFI-SYSTEM", 384, 512),
            make_part(4, "root", 1024, 2200),
        // Pre-existing partition on the disk that merges are written to.
        let merge_base_parts = vec![make_part(2, "unused", 500, 3500)];

        // Shorthands for building filters.
        let index = |i| Some(NonZeroU32::new(i).unwrap());
        let label = |l| Label(glob::Pattern::new(l).unwrap());
        // Each case is (filter, expected partitions after overwriting a blank
        // disk, expected partitions after merging with the image), matching
        // the destructuring in the loop below.
        let tests = vec![
            // Partition range
                vec![Index(index(5), None)],
                    make_part(5, "five", 4096, 5120),
                    make_part(7, "seven", 5120, 6144),
                    make_part(8, "eight", 6144, 7168),
                    make_part(9, "nine", 7168, 8192),
                    make_part(10, "", 8192, 8193),
                    make_part(11, "", 8193, 8194),
                    make_part(1, "boot", 1, 384),
                    make_part(2, "EFI-SYSTEM", 384, 512),
                    make_part(4, "root", 1024, 2200),
                    make_part(5, "five", 4096, 5120),
                    make_part(7, "seven", 5120, 6144),
                    make_part(8, "eight", 6144, 7168),
                    make_part(9, "nine", 7168, 8192),
                    make_part(10, "", 8192, 8193),
                    make_part(11, "", 8193, 8194),
            // Glob
                    make_part(5, "five", 4096, 5120),
                    make_part(8, "eight", 6144, 7168),
                    make_part(9, "nine", 7168, 8192),
                    make_part(1, "boot", 1, 384),
                    make_part(2, "EFI-SYSTEM", 384, 512),
                    make_part(4, "root", 1024, 2200),
                    make_part(5, "five", 4096, 5120),
                    make_part(8, "eight", 6144, 7168),
                    make_part(9, "nine", 7168, 8192),
            // Missing label, single partition, irrelevant range
                    Index(index(7), index(7)),
                    Index(index(15), None),
                vec![make_part(7, "seven", 5120, 6144)],
                    make_part(1, "boot", 1, 384),
                    make_part(2, "EFI-SYSTEM", 384, 512),
                    make_part(4, "root", 1024, 2200),
                    make_part(7, "seven", 5120, 6144),
            // Empty label match, multiple results
                vec![make_part(10, "", 8192, 8193), make_part(11, "", 8193, 8194)],
                    make_part(1, "boot", 1, 384),
                    make_part(2, "EFI-SYSTEM", 384, 512),
                    make_part(4, "root", 1024, 2200),
                    make_part(10, "", 8192, 8193),
                    make_part(11, "", 8193, 8194),
            // Partition renumbering
                vec![Index(index(4), None)],
                    make_part(4, "four", 3072, 4096),
                    make_part(5, "five", 4096, 5120),
                    make_part(7, "seven", 5120, 6144),
                    make_part(8, "eight", 6144, 7168),
                    make_part(9, "nine", 7168, 8192),
                    make_part(10, "", 8192, 8193),
                    make_part(11, "", 8193, 8194),
                    make_part(1, "boot", 1, 384),
                    make_part(2, "EFI-SYSTEM", 384, 512),
                    make_part(4, "root", 1024, 2200),
                    make_part(5, "four", 3072, 4096),
                    make_part(6, "five", 4096, 5120),
                    make_part(7, "seven", 5120, 6144),
                    make_part(8, "eight", 6144, 7168),
                    make_part(9, "nine", 7168, 8192),
                    make_part(10, "", 8192, 8193),
                    make_part(11, "", 8193, 8194),
            // No saved partitions
                vec![Index(index(15), None)],
            // No filters
            (vec![], vec![], merge_base_parts.clone()),

        let mut base = make_disk(512, &base_parts);
        let mut image = make_disk(512, &image_parts);
        for (testnum, (filter, expected_blank, expected_image)) in tests.iter().enumerate() {
            // try overwriting on blank disk
            let saved = SavedPartitions::new_from_file(&mut base, 512, filter).unwrap();
            let mut disk = make_unformatted_disk();
            saved.overwrite(&mut disk).unwrap();
            assert!(disk_has_mbr(&mut disk).unwrap(), "test {}", testnum);
            let result = GPT::find_from(&mut disk).unwrap();
                get_gpt_size(&mut disk).unwrap(),
                512 * result.header.first_usable_lba
            assert_partitions_eq(expected_blank, &result, &format!("test {} blank", testnum));

            // try merging with image disk onto merge_base disk
            let mut disk = make_disk(512, &merge_base_parts);
            saved.merge(&mut image, &mut disk).unwrap();
                disk_has_mbr(&mut disk).unwrap() == !expected_blank.is_empty(),
                "test {}",
            let result = GPT::find_from(&mut disk).unwrap();
                get_gpt_size(&mut disk).unwrap(),
                512 * result.header.first_usable_lba
            assert_partitions_eq(expected_image, &result, &format!("test {} image", testnum));
            // NOTE(review): the call whose result is compared against this
            // match is missing
                match expected_blank.is_empty() {
                    true => None,
                    false => {
                        let (i, p) = &expected_blank[0];
                            p.starting_lba * 512,
                            format!("partition {} (\"{}\")", i, p.partition_name.as_str()),
                "test {}",

        // ensure overwrite clobbers every byte of MBR
        for sector_size in [512 as usize, 4096 as usize].iter() {
            let mut disk = make_unformatted_disk();
            // fill the first sector with a marker byte
            disk.write_all(&vec![0xdau8; *sector_size]).unwrap();
            let saved =
                SavedPartitions::new_from_file(&mut disk, *sector_size as u64, &vec![]).unwrap();
            saved.overwrite(&mut disk).unwrap();
            assert!(disk_has_mbr(&mut disk).unwrap(), "{}", *sector_size);
            // one sector plus the first byte of the next
            let mut buf = vec![0u8; *sector_size + 1];
            disk.read_exact(&mut buf).unwrap();
                buf.iter().position(|v| *v == 0xda),
            // verify the first byte of the GPT magic number is intact
            assert_eq!(buf[*sector_size], 0x45u8, "{}", *sector_size);

        // test merging with unformatted initial disk
        let mut disk = make_unformatted_disk();
        let saved = SavedPartitions::new_from_file(&mut disk, 512, &vec![label("z")]).unwrap();
        let mut disk = make_disk(512, &merge_base_parts);
        saved.merge(&mut image, &mut disk).unwrap();
        let result = GPT::find_from(&mut disk).unwrap();
        assert_partitions_eq(&merge_base_parts, &result, "unformatted disk");

        // test overlapping partitions
        let saved =
            SavedPartitions::new_from_file(&mut base, 512, &vec![Index(index(1), index(1))])
        let mut disk = make_disk(512, &merge_base_parts);
        let err = saved.merge(&mut image, &mut disk).unwrap_err();
            format!("{:#}", err).contains(&gptman::Error::InvalidPartitionBoundaries.to_string()),
            "incorrect error: {:#}",

        // test trying to save partitions from a MBR disk
        let mut disk = make_unformatted_disk();
        gptman::GPT::write_protective_mbr_into(&mut disk, 512).unwrap();
        // label only
        SavedPartitions::new(&mut disk, 512, &vec![label("*i*")]).unwrap();
        // index only
            SavedPartitions::new(&mut disk, 512, &vec![Index(index(1), index(1))])
            "saving partitions from an MBR disk is not yet supported"
        // label and index
                &mut disk,
                &vec![Index(index(1), index(1)), label("*i*")]
            "saving partitions from an MBR disk is not yet supported"

        // test sector size mismatch
        let saved = SavedPartitions::new_from_file(&mut base, 512, &vec![label("*i*")]).unwrap();
        let mut image_4096 = make_disk(4096, &image_parts);
            get_gpt_size(&mut image_4096).unwrap(),
            4096 * GPT::find_from(&mut image_4096)
        let mut disk = make_disk(4096, &merge_base_parts);
                .merge(&mut image_4096, &mut disk)
            "GPT sector size 4096 doesn't match expected 512"

        // test copying invalid partitions
        let mut disk = make_unformatted_disk();
        let data = include_bytes!("../fixtures/gpt-512-duplicate-partition-guids.xz");
        copy(&mut XzDecoder::new(&data[..]), &mut disk).unwrap();
            SavedPartitions::new_from_file(&mut disk, 512, &vec![label("*")])
            "failed dry run restoring saved partitions; input partition table may be invalid"

        // test corrupt input partition table
        for sector_size in &[512, 4096] {
            let sector_size: u64 = *sector_size;
            // backup corrupt
            let mut disk = make_damaged_disk(sector_size, &base_parts, false, true);
            let saved = SavedPartitions::new_from_file(&mut disk, sector_size, &vec![]).unwrap();
            let saved = SavedPartitions::new_from_file(&mut disk, sector_size, &vec![label("one")])
            // primary corrupt
            let mut disk = make_damaged_disk(sector_size, &base_parts, true, false);
            let saved = SavedPartitions::new_from_file(&mut disk, sector_size, &vec![]).unwrap();
            let saved = SavedPartitions::new_from_file(&mut disk, sector_size, &vec![label("one")])
            // both corrupt
            let mut disk = make_damaged_disk(sector_size, &base_parts, true, true);
            let saved = SavedPartitions::new_from_file(&mut disk, sector_size, &vec![]).unwrap();
            let err = SavedPartitions::new_from_file(&mut disk, sector_size, &vec![label("one")])
                format!("{:#}", err).contains("could not read primary header"),
                "incorrect error: {:#}",

    // TODO: The partitions array assumes 512-byte sectors and we don't
    // scale the start/end values for 4096.  This doesn't matter right now
    // because the only use of 4096-byte sectors is in an error test.
    fn make_disk(sector_size: u64, partitions: &Vec<(u32, GPTPartitionEntry)>) -> File {
        let mut disk = make_unformatted_disk();
        // Make the disk just large enough for its partitions, then resize
        // it back up afterward.  This tests that we properly handle copying
        // saved partitions from the larger base disk into the smaller
        // install image.
        let len = if partitions.is_empty() {
            1024 * 1024
        } else {
            partitions[partitions.len() - 1].1.ending_lba * sector_size + 1024 * 1024
        let mut gpt = GPT::new_from(&mut disk, sector_size, make_guid("disk")).unwrap();
        for (partnum, entry) in partitions {
            gpt[*partnum] = entry.clone();
        gpt.write_into(&mut disk).unwrap();
        disk.set_len(10 * 1024 * 1024 * 1024).unwrap();

    fn make_unformatted_disk() -> File {
        let disk = tempfile().unwrap();
        disk.set_len(10 * 1024 * 1024 * 1024).unwrap();

    // Create a temporary disk image holding a GPT with the given partitions,
    // then optionally overwrite four bytes in the primary and/or backup GPT
    // header so the header checksum no longer matches.
    //
    // NOTE(review): this function does not parse as it stands.  Block
    // closers, the start of both seek statements and the returned value
    // appear to be missing; restore them from the upstream source.
    fn make_damaged_disk(
        sector_size: u64,
        partitions: &Vec<(u32, GPTPartitionEntry)>,
        damage_primary: bool,
        damage_backup: bool,
    ) -> File {
        let mut disk = make_unformatted_disk();
        // don't use make_disk() because it intentionally misaligns the
        // backup GPT
        let mut gpt = GPT::new_from(&mut disk, sector_size, make_guid("disk")).unwrap();
        for (partnum, entry) in partitions {
            gpt[*partnum] = entry.clone();
            // the partition list is expressed in 512-byte sectors; rescale
            // its LBAs to the requested sector size
            gpt[*partnum].starting_lba /= sector_size / 512;
            gpt[*partnum].ending_lba /= sector_size / 512;
        gpt.write_into(&mut disk).unwrap();
        if damage_primary {
            // write garbage to the HeaderCRC32
            // NOTE(review): truncated statement; presumably a seek to
            // `<primary header LBA> * sector_size + 16` (confirm upstream)
   * sector_size + 16))
            disk.write_all(&[0x15, 0xcd, 0x5b, 0x07]).unwrap();
        if damage_backup {
            // write garbage to the HeaderCRC32
            // NOTE(review): truncated statement; presumably a seek to
            // `<backup header LBA> * sector_size + 16` (confirm upstream)
   * sector_size + 16))
            disk.write_all(&[0xb1, 0x68, 0xde, 0x3a]).unwrap();

    // Derive a deterministic 16-byte GUID from `seed`: start from the bytes
    // 0..=15 and XOR each seed byte into the slot at its position modulo 16.
    // An empty seed yields the unmodified starting bytes.
    fn make_guid(seed: &str) -> [u8; 16] {
        let mut guid = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
        for (i, b) in seed.as_bytes().iter().enumerate() {
            guid[i % guid.len()] ^= *b;
        }
        guid
    }

    fn assert_partitions_eq(expected: &Vec<(u32, GPTPartitionEntry)>, found: &GPT, message: &str) {
                .map(|(i, p)| (*i, p))
                .collect::<Vec<(u32, &GPTPartitionEntry)>>(),
                .filter(|(_, p)| p.is_used())
                .collect::<Vec<(u32, &GPTPartitionEntry)>>(),