From 41e1ca4e4d4ce9261721fa1f23ddcdf569d9a7a3 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 27 Apr 2026 13:00:28 +0200 Subject: [PATCH 01/33] block: add mirror module skeleton Blockdev-mirroring for virtio-blk needs a home for its new types. Add the mirror module with the lifecycle state, ahead of the logic that fills it in. MirrorPhase the lifecycle: Running, Ready, Completing, Completed, Cancelling, Failed MirrorState the phase behind a Mutex, shared via Arc, with a guarded transition_to_phase that applies only the documented edges Follow-up commits add the range lock for copy/write exclusion, the AsyncIo wrapper that fans writes to both backends, the background copy worker, and the virtio-blk and REST integration. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/lib.rs | 1 + block/src/mirror.rs | 101 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) create mode 100644 block/src/mirror.rs diff --git a/block/src/lib.rs b/block/src/lib.rs index 9d688f5ff4..5e58f69cfb 100644 --- a/block/src/lib.rs +++ b/block/src/lib.rs @@ -20,6 +20,7 @@ pub mod fixed_vhd; pub mod fixed_vhd_async; pub mod fixed_vhd_disk; pub mod fixed_vhd_sync; +pub mod mirror; pub mod qcow; #[cfg(feature = "io_uring")] pub(crate) mod qcow_async; diff --git a/block/src/mirror.rs b/block/src/mirror.rs new file mode 100644 index 0000000000..8b8ab8c065 --- /dev/null +++ b/block/src/mirror.rs @@ -0,0 +1,101 @@ +// Copyright © 2026 Cyberus Technology GmbH +// +// SPDX-License-Identifier: Apache-2.0 + +//! Blockdev-mirroring for virtio-blk devices. +//! +//! Mirrors guest writes to a destination disk while a background +//! worker copies existing data from source to destination. Once +//! both sides are in sync the device manager can complete the mirror, +//! switching the device to serve I/O from the destination. + +use std::mem; +use std::sync::{Arc, Mutex}; + +use log::warn; + +/// Phase of a mirror. 
+#[derive(Debug, Clone, PartialEq, Eq)] +pub enum MirrorPhase { + /// Background copy is in progress. + Running, + /// All blocks copied. Source and destination are in sync. + Ready, + /// Switch-over to the destination is in progress. + Completing, + /// All virtqueues switched to the destination. + Completed, + /// Mirror cancellation is in progress. + Cancelling, + /// The mirror has failed. + Failed(String), +} + +/// State shared by the copy worker and the per-queue mirroring +/// `AsyncIo` handles. +/// +/// Held in an `Arc` so all threads see the same phase. +pub struct MirrorState { + /// Current phase of the mirror. + #[allow(dead_code)] + phase: Mutex, +} + +#[allow(dead_code)] +impl MirrorState { + pub fn new() -> Arc { + Arc::new(Self { + phase: Mutex::new(MirrorPhase::Running), + }) + } + + /// Returns a snapshot of the current phase. + pub fn phase(&self) -> MirrorPhase { + self.phase.lock().unwrap().clone() + } + + /// Attempts a phase transition. Only the documented transitions are + /// applied. Any other attempt is ignored and logged. + /// + /// Allowed transitions: + /// ```text + /// Running -> Ready | Cancelling | Failed(_) + /// Ready -> Completing | Cancelling | Failed(_) + /// Completing -> Completed + /// Failed(_) -> Cancelling + /// ``` + /// Plus idempotent self-transitions. `Completed` and `Cancelling` are + /// terminal: the mirror handle is dropped out of them, after which + /// `Block::mirror_status` reports no active mirror. 
+ pub fn transition_to_phase(&self, target: MirrorPhase) { + use MirrorPhase::*; + let mut current = self.phase.lock().unwrap(); + + // Ignore idempotent transitions to the current state + if mem::discriminant(&*current) == mem::discriminant(&target) { + return; + } + + let transition_allowed = matches!( + (&*current, &target), + (Running, Ready) + | (Running, Cancelling) + | (Running, Failed(_)) + | (Ready, Completing) + | (Ready, Cancelling) + | (Ready, Failed(_)) + | (Completing, Completed) + | (Failed(_), Cancelling) + ); + + if !transition_allowed { + warn!( + "Invalid mirror phase transition attempted: {:?} -> {:?}", + *current, target + ); + return; + } + + *current = target; + } +} From d9c604510681735232ceabe9f3f8fe9a2c2557e7 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 27 Apr 2026 16:12:50 +0200 Subject: [PATCH 02/33] block: add MirroringAsyncIo skeleton Mirroring needs a per-queue AsyncIo that the virtio device can install in place of the plain backend. Add the type now so later commits introducing the shard locks and the write fan-out have something to reference. Every method delegates to source. alignment() is the exception and returns max of source and dest. The request handler reads alignment per request to choose bounce-buffer placement, and the same iovec is later submitted to both backends, so the stricter requirement has to win even before fan-out lands. submit_batch_requests is left unimplemented and batch_requests_enabled returns false. Follow-ups add Shard-based mutual exclusion between the copy worker and mirror writes, then rewrite write_vectored, punch_hole, write_zeroes, fsync, and next_completed_request to fan out to destination and pair completions via a synthetic dest-side user_data. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 66 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index 8b8ab8c065..eadba9a85c 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -12,7 +12,12 @@ use std::mem; use std::sync::{Arc, Mutex}; +use libc::{iovec, off_t}; use log::warn; +use vmm_sys_util::eventfd::EventFd; + +use crate::BatchRequest; +use crate::async_io::{AsyncIo, AsyncIoResult}; /// Phase of a mirror. #[derive(Debug, Clone, PartialEq, Eq)] @@ -99,3 +104,64 @@ impl MirrorState { *current = target; } } + +/// Per-queue `AsyncIo` handle for a mirror. +#[allow(dead_code)] +pub struct MirroringAsyncIo { + source: Box, + destination: Box, + state: Arc, +} + +impl AsyncIo for MirroringAsyncIo { + fn notifier(&self) -> &EventFd { + self.source.notifier() + } + + fn read_vectored( + &mut self, + offset: off_t, + iovecs: &[iovec], + user_data: u64, + ) -> AsyncIoResult<()> { + self.source.read_vectored(offset, iovecs, user_data) + } + + fn write_vectored( + &mut self, + offset: off_t, + iovecs: &[iovec], + user_data: u64, + ) -> AsyncIoResult<()> { + self.source.write_vectored(offset, iovecs, user_data) + } + + fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { + self.source.fsync(user_data) + } + + fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + self.source.punch_hole(offset, length, user_data) + } + + fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + self.source.write_zeroes(offset, length, user_data) + } + + fn next_completed_request(&mut self) -> Option<(u64, i32)> { + self.source.next_completed_request() + } + + fn batch_requests_enabled(&self) -> bool { + false + } + + fn submit_batch_requests(&mut self, _batch_request: &[BatchRequest]) -> AsyncIoResult<()> { + unimplemented!("Batch requests are not supported in 
MirroringAsyncIo") + } + + fn alignment(&self) -> u64 { + // Stricter alignment wins. Same iovec goes to both backends. + self.source.alignment().max(self.destination.alignment()) + } +} From 3c2f3f547847e77e481f3c530fcb19c7328da5ef Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 28 Apr 2026 10:45:02 +0200 Subject: [PATCH 03/33] block: add range lock primitive for mirror The copy worker and the virtqueue workers can both target the same destination bytes during a mirror. Without coordination a destination block can mix bytes from both. Add a primitive both will use to serialise overlapping ranges. RangeLockManager wraps a Mutex> and a Condvar. lock_range blocks while any held range overlaps. lock_iovecs locks the contiguous span of iovecs using lock_range Wired into the `AsyncIo` impl of `MirroringAsyncIo` and the copy worker in follow-up commits. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 128 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 126 insertions(+), 2 deletions(-) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index eadba9a85c..fbe7e7c3fa 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -9,8 +9,9 @@ //! both sides are in sync the device manager can complete the mirror, //! switching the device to serve I/O from the destination. -use std::mem; -use std::sync::{Arc, Mutex}; +use std::collections::BTreeMap; +use std::sync::{Arc, Condvar, Mutex}; +use std::{io, mem}; use libc::{iovec, off_t}; use log::warn; @@ -19,6 +20,95 @@ use vmm_sys_util::eventfd::EventFd; use crate::BatchRequest; use crate::async_io::{AsyncIo, AsyncIoResult}; +/// Serializes overlapping byte ranges between the copy worker and the +/// per-queue mirror writes. +/// +/// Each party calls [`Self::lock_range`] before submitting I/O and +/// holds the returned [`RangeGuard`] until completion. A conflicting +/// request blocks on a `Condvar` until the held guard is dropped. 
+/// Lookups are O(log n) on the number of held ranges. +#[allow(dead_code)] +struct RangeLockManager { + /// Held ranges as `start -> end_exclusive`. The mutex makes the + /// overlap check and insert in [`Self::lock_range`] atomic with + /// respect to releases in [`RangeGuard::drop`]. + ranges: Mutex>, + /// Notified on guard drop. Waiters re-check their range. + cv: Condvar, +} + +#[allow(dead_code)] +impl RangeLockManager { + pub fn new() -> Arc { + Arc::new(Self { + ranges: Mutex::new(BTreeMap::new()), + cv: Condvar::new(), + }) + } + + /// Returns true if `[start, end)` overlaps any range in `ranges`. + fn overlaps_any(ranges: &BTreeMap, start: u64, end: u64) -> bool { + ranges + .range(..end) + .next_back() + .is_some_and(|(_, &e)| e > start) + } + + /// Acquires an exclusive lock on `[offset, offset + length)`. + /// Blocks while any held range overlaps. + fn lock_range(self: &Arc, offset: u64, length: u64) -> io::Result { + if length == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "Range length is zero", + )); + } + + let end = offset + .checked_add(length) + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "Range overflow"))?; + // Wait until no held range overlaps, then claim it. + let mut ranges = self + .cv + .wait_while(self.ranges.lock().unwrap(), |ranges| { + RangeLockManager::overlaps_any(ranges, offset, end) + }) + .unwrap(); + ranges.insert(offset, end); + + Ok(RangeGuard { + mgr: Arc::clone(self), + start: offset, + }) + } + + /// Acquires a [`RangeGuard`] covering the contiguous bytes from + /// `offset` through the end of `iovecs`. + fn lock_iovecs(self: &Arc, offset: off_t, iovecs: &[iovec]) -> io::Result { + let total_len = iovecs + .iter() + .try_fold(0u64, |acc, v| acc.checked_add(v.iov_len as u64)) + .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "iovec length overflow"))?; + + self.lock_range(offset as u64, total_len) + } +} + +/// RAII handle for a range held in a [`RangeLockManager`]. 
Drop +/// releases the range and wakes all waiters. +#[allow(dead_code)] +struct RangeGuard { + mgr: Arc, + start: u64, +} +impl Drop for RangeGuard { + fn drop(&mut self) { + let mut ranges = self.mgr.ranges.lock().unwrap(); + ranges.remove(&self.start); + self.mgr.cv.notify_all(); + } +} + /// Phase of a mirror. #[derive(Debug, Clone, PartialEq, Eq)] pub enum MirrorPhase { @@ -165,3 +255,37 @@ impl AsyncIo for MirroringAsyncIo { self.source.alignment().max(self.destination.alignment()) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Overlap is detected whether the held range precedes the query or starts + /// inside it. + #[test] + fn overlaps_detects_overlap() { + let mut preceding = BTreeMap::new(); + preceding.insert(10u64, 25u64); + assert!(RangeLockManager::overlaps_any(&preceding, 20, 30)); + + let mut starts_inside = BTreeMap::new(); + starts_inside.insert(10u64, 20u64); + starts_inside.insert(25u64, 30u64); + assert!(RangeLockManager::overlaps_any(&starts_inside, 21, 26)); + } + + #[test] + fn overlaps_disjoint_returns_false() { + let mut locked = BTreeMap::new(); + locked.insert(10u64, 20u64); + locked.insert(30u64, 40u64); + assert!(!RangeLockManager::overlaps_any(&locked, 22, 28)); + } + + #[test] + fn overlaps_touching_boundary_is_not_overlap() { + let mut locked = BTreeMap::new(); + locked.insert(10u64, 20u64); + assert!(!RangeLockManager::overlaps_any(&locked, 20, 30)); + } +} From 23997879d6b54422659475b581c7c7ab98668ee0 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 29 Apr 2026 12:25:33 +0200 Subject: [PATCH 04/33] block: add EpollWaiter for single-fd waits The copy worker and the per-queue mirror writes need to block until an AsyncIo backend's notifier eventfd has a completion to read. Every backend creates its notifier with EFD_NONBLOCK, so it needs to be polled, and the virtio-block seccomp filter allows epoll_* but not poll/ppoll. 
Add EpollWaiter, a wrapper around vmm_sys_util::epoll::Epoll that registers one fd for readability at construction. wait() blocks until the fd is readable, retrying on EINTR. next_completion() loops over a backend's completions, blocking on wait() and draining the eventfd between rounds. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 50 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index fbe7e7c3fa..ea8cca2add 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -10,11 +10,13 @@ //! switching the device to serve I/O from the destination. use std::collections::BTreeMap; +use std::os::fd::RawFd; use std::sync::{Arc, Condvar, Mutex}; use std::{io, mem}; use libc::{iovec, off_t}; use log::warn; +use vmm_sys_util::epoll; use vmm_sys_util::eventfd::EventFd; use crate::BatchRequest; @@ -256,6 +258,54 @@ impl AsyncIo for MirroringAsyncIo { } } +/// Single-fd `epoll` wrapper. Built once per eventfd and reused for +/// every `wait()` call so the copy worker doesn't pay setup cost per +/// block. +/// +/// `wait()` blocks until the eventfd becomes readable. +#[allow(dead_code)] +struct EpollWaiter { + epoll: epoll::Epoll, +} + +#[allow(dead_code)] +impl EpollWaiter { + /// Creates a reusable `EpollWaiter` for the given eventfd. + fn new(event_fd: RawFd) -> io::Result { + let epoll = epoll::Epoll::new()?; + epoll.ctl( + epoll::ControlOperation::Add, + event_fd, + epoll::EpollEvent::new(epoll::EventSet::IN, 0), + )?; + Ok(Self { epoll }) + } + + /// Blocks until the event fd becomes readable. Retries on EINTR. 
+ fn wait(&self) -> io::Result<()> { + let mut events = [epoll::EpollEvent::default(); 1]; + loop { + match self.epoll.wait(-1, &mut events) { + Ok(_) => return Ok(()), + Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => return Err(e), + } + } + } + + /// Blocks until `io` reports a completion, then returns it. + fn next_completion(&self, io: &mut Box) -> io::Result<(u64, i32)> { + loop { + if let Some(completion) = io.next_completed_request() { + return Ok(completion); + } + self.wait()?; + // Drain the eventfd so the next wait does not fire on a stale signal. + let _ = io.notifier().read()?; + } + } +} + #[cfg(test)] mod tests { use super::*; From 27fa09a90a151445fe9e46a9debe884ebf3d315a Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 28 Apr 2026 15:01:56 +0200 Subject: [PATCH 05/33] block: mirror mutating I/O to destination Mirroring moves a virtio-blk disk to a new backend path while the guest keeps running. Guest writes during the move have to land on both disks. A background copy worker streams the existing data across. The guest write path and the copy worker can target the same byte range at the same time. Mirror mutating guest requests to both backends via virtqueues, and hold a range lock for each request so neither side touches the same bytes at once. Read requests are not mirrored and wired to the source disk. The guest sees source's result, a destination failure moves MirrorPhase to Failed. Follow-ups: the copy worker that takes the same lock, and the coordinator that handles start, cancel, complete, and rollback on Failed. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 164 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 151 insertions(+), 13 deletions(-) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index ea8cca2add..e82a6c9216 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -9,7 +9,7 @@ //! 
both sides are in sync the device manager can complete the mirror, //! switching the device to serve I/O from the destination. -use std::collections::BTreeMap; +use std::collections::{BTreeMap, VecDeque}; use std::os::fd::RawFd; use std::sync::{Arc, Condvar, Mutex}; use std::{io, mem}; @@ -20,7 +20,7 @@ use vmm_sys_util::epoll; use vmm_sys_util::eventfd::EventFd; use crate::BatchRequest; -use crate::async_io::{AsyncIo, AsyncIoResult}; +use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult}; /// Serializes overlapping byte ranges between the copy worker and the /// per-queue mirror writes. @@ -29,7 +29,6 @@ use crate::async_io::{AsyncIo, AsyncIoResult}; /// holds the returned [`RangeGuard`] until completion. A conflicting /// request blocks on a `Condvar` until the held guard is dropped. /// Lookups are O(log n) on the number of held ranges. -#[allow(dead_code)] struct RangeLockManager { /// Held ranges as `start -> end_exclusive`. The mutex makes the /// overlap check and insert in [`Self::lock_range`] atomic with @@ -39,7 +38,6 @@ struct RangeLockManager { cv: Condvar, } -#[allow(dead_code)] impl RangeLockManager { pub fn new() -> Arc { Arc::new(Self { @@ -98,7 +96,6 @@ impl RangeLockManager { /// RAII handle for a range held in a [`RangeLockManager`]. Drop /// releases the range and wakes all waiters. -#[allow(dead_code)] struct RangeGuard { mgr: Arc, start: u64, @@ -134,15 +131,15 @@ pub enum MirrorPhase { /// Held in an `Arc` so all threads see the same phase. pub struct MirrorState { /// Current phase of the mirror. - #[allow(dead_code)] phase: Mutex, + range_locks: Arc, } -#[allow(dead_code)] impl MirrorState { pub fn new() -> Arc { Arc::new(Self { phase: Mutex::new(MirrorPhase::Running), + range_locks: RangeLockManager::new(), }) } @@ -198,11 +195,97 @@ impl MirrorState { } /// Per-queue `AsyncIo` handle for a mirror. 
-#[allow(dead_code)] pub struct MirroringAsyncIo { source: Box, destination: Box, state: Arc, + /// Completions of inflight requests to be popped by `next_completed_request`. + inflight_completions: VecDeque<(u64, i32)>, + /// Reusable waiters parked on the source and destination notifier eventfds + /// while a mirrored write awaits its completions. Built once so each write + /// does not pay the epoll setup cost. + source_waiter: EpollWaiter, + dest_waiter: EpollWaiter, +} +impl MirroringAsyncIo { + /// Flip the mirror to the `Failed` phase. The operator must cancel to + /// clean up the destination and the copy worker. + fn fail(&mut self, reason: String) { + self.state.transition_to_phase(MirrorPhase::Failed(reason)); + } + + /// Helper that submits an `AsyncIo` request to both source and destination. + /// + /// Source error bubbles to the guest. Destination error fails the mirror + /// but is hidden from the guest, since `source` is the disk the guest sees. + fn mirror_request( + &mut self, + request_label: &str, + submit_source: S, + submit_destination: D, + ) -> AsyncIoResult<()> + where + S: FnOnce(&mut Box) -> AsyncIoResult<()>, + D: FnOnce(&mut Box) -> AsyncIoResult<()>, + { + submit_source(&mut self.source)?; + if let Err(e) = submit_destination(&mut self.destination) { + self.fail(format!("destination {request_label} submit failed: {e:?}")); + } + Ok(()) + } + + /// Block until `user_data`'s source and destination completion arrive, then + /// queue the single guest-visible `(user_data, src_result)`. Other + /// completions seen while waiting (e.g. an async read finishing) are stashed + /// for later delivery. 
+ fn wait_for_completions(&mut self, user_data: u64) -> io::Result<()> { + let src_result = Self::await_completion( + &mut self.source, + &self.source_waiter, + &mut self.inflight_completions, + user_data, + )?; + + match Self::await_completion( + &mut self.destination, + &self.dest_waiter, + &mut self.inflight_completions, + user_data, + ) { + // Destination reported an I/O error. + Ok(dest_result) if dest_result < 0 => self.fail(format!( + "destination completion failed: user_data={user_data}" + )), + Ok(_) => {} + // The destination wait itself failed (broken notifier or epoll). + // Hide it from the guest like any other destination failure. + Err(e) => self.fail(format!( + "destination wait failed for user_data={user_data}: {e}" + )), + } + + self.inflight_completions.push_back((user_data, src_result)); + let _ = self.source.notifier().write(1); + Ok(()) + } + + /// Drain `io` until `user_data`'s own completion appears and push + /// additional ones to `inflight_completions`. + fn await_completion( + io: &mut Box, + waiter: &EpollWaiter, + inflight_completions: &mut VecDeque<(u64, i32)>, + user_data: u64, + ) -> io::Result { + loop { + let (id, res) = waiter.next_completion(io)?; + if id == user_data { + return Ok(res); + } + inflight_completions.push_back((id, res)); + } + } } impl AsyncIo for MirroringAsyncIo { @@ -225,23 +308,78 @@ impl AsyncIo for MirroringAsyncIo { iovecs: &[iovec], user_data: u64, ) -> AsyncIoResult<()> { - self.source.write_vectored(offset, iovecs, user_data) + let _guard = self + .state + .range_locks + .lock_iovecs(offset, iovecs) + .map_err(AsyncIoError::WriteVectored)?; + + self.mirror_request( + "write_vectored", + |src| src.write_vectored(offset, iovecs, user_data), + |dst| dst.write_vectored(offset, iovecs, user_data), + )?; + + self.wait_for_completions(user_data) + .map_err(AsyncIoError::WriteVectored)?; + Ok(()) } fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { - self.source.fsync(user_data) + self.mirror_request( 
+ "fsync", + |src| src.fsync(user_data), + |dst| dst.fsync(user_data), + )?; + + // A tracked fsync (Some) waits for its completion. A barrier fsync (None) does not. + if let Some(user_data) = user_data { + self.wait_for_completions(user_data) + .map_err(AsyncIoError::Fsync)?; + } + Ok(()) } fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - self.source.punch_hole(offset, length, user_data) + let _guard = self + .state + .range_locks + .lock_range(offset, length) + .map_err(AsyncIoError::PunchHole)?; + self.mirror_request( + "punch_hole", + |src| src.punch_hole(offset, length, user_data), + |dst| dst.punch_hole(offset, length, user_data), + )?; + + self.wait_for_completions(user_data) + .map_err(AsyncIoError::PunchHole)?; + Ok(()) } fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { - self.source.write_zeroes(offset, length, user_data) + let _guard = self + .state + .range_locks + .lock_range(offset, length) + .map_err(AsyncIoError::WriteZeroes)?; + self.mirror_request( + "write_zeroes", + |src| src.write_zeroes(offset, length, user_data), + |dst| dst.write_zeroes(offset, length, user_data), + )?; + + self.wait_for_completions(user_data) + .map_err(AsyncIoError::WriteZeroes)?; + Ok(()) } fn next_completed_request(&mut self) -> Option<(u64, i32)> { - self.source.next_completed_request() + // Mirrored writes are awaited synchronously. Only async source reads complete here. 
+ while let Some((id, res)) = self.source.next_completed_request() { + self.inflight_completions.push_back((id, res)); + } + self.inflight_completions.pop_front() } fn batch_requests_enabled(&self) -> bool { From 1011388042fccf652056c025f05ca2385fabadae Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 29 Apr 2026 12:34:45 +0200 Subject: [PATCH 06/33] block: add background copy worker for mirror The blockdev-mirror replicates guest writes to the destination, but the destination still misses everything that was on source before the mirror virtqueues started. A background worker has to copy those existing bytes while the guest keeps running. The mirror's per-queue writers and this worker can target the same byte range, so the worker takes the same range lock the writers hold. CopyWorker submits reads and writes through AsyncIo so it works with any disk format the trait supports. Each block is held under a RangeGuard during a sequential read on source and write on destination. Completions are awaited via EpollWaiter on the non-blocking notifier eventfd. On success the phase transitions to Ready. An I/O error or a spawn failure transitions to `Failed` via the internal state machine. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 182 +++++++++++++++++++++++++++++++++++++-- block/src/qcow_common.rs | 3 + 2 files changed, 179 insertions(+), 6 deletions(-) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index e82a6c9216..ca1a4b5d01 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -10,9 +10,11 @@ //! switching the device to serve I/O from the destination.
use std::collections::{BTreeMap, VecDeque}; -use std::os::fd::RawFd; +use std::os::fd::{AsRawFd, RawFd}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex}; -use std::{io, mem}; +use std::thread::JoinHandle; +use std::{io, mem, thread}; use libc::{iovec, off_t}; use log::warn; @@ -21,6 +23,13 @@ use vmm_sys_util::eventfd::EventFd; use crate::BatchRequest; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult}; +use crate::disk_file::AsyncFullDiskFile; +use crate::error::BlockResult; +use crate::qcow_common::AlignedBuf; + +/// Block size for the copy worker, in which it copies data from +/// source to destination and holds the range lock. +pub const MIRROR_BLOCK_SIZE: usize = 512 * 1024; // 512 KiB /// Serializes overlapping byte ranges between the copy worker and the /// per-queue mirror writes. @@ -128,18 +137,23 @@ pub enum MirrorPhase { /// State shared by the copy worker and the per-queue mirroring /// `AsyncIo` handles. /// -/// Held in an `Arc` so all threads see the same phase. +/// Held in an `Arc` so all threads see the same phase and progress +/// counters. pub struct MirrorState { /// Current phase of the mirror. phase: Mutex, range_locks: Arc, + copied_bytes: AtomicU64, + total_bytes: u64, } impl MirrorState { - pub fn new() -> Arc { + pub fn new(logical_disk_size: u64) -> Arc { Arc::new(Self { phase: Mutex::new(MirrorPhase::Running), range_locks: RangeLockManager::new(), + copied_bytes: AtomicU64::new(0), + total_bytes: logical_disk_size, }) } @@ -396,17 +410,173 @@ impl AsyncIo for MirroringAsyncIo { } } +/// Owns the copy worker thread's [`JoinHandle`]. The thread is joined +/// on [`Self::join`] or on drop, which blocks until the worker finishes +/// its current block, since cancellation is only observed between blocks. +pub struct CopyWorkerHandle { + join: Option>, +} + +impl CopyWorkerHandle { + /// Waits for the copy worker thread to finish. 
Idempotent: + /// subsequent calls return `Ok(())` without blocking. + pub fn join(&mut self) -> thread::Result<()> { + if let Some(t) = self.join.take() { + return t.join(); + } + + Ok(()) + } +} + +impl Drop for CopyWorkerHandle { + fn drop(&mut self) { + self.join().ok(); + } +} + +/// Background thread that copies existing source bytes to destination +/// in fixed-size blocks. Holds a [`RangeGuard`] across each block so +/// the virtqueue mirror writes cannot race the copy. +pub struct CopyWorker { + source_io: Box, + dest_io: Box, + state: Arc, + /// Once allocated, the buffer is reused for all blocks to avoid repeated allocations. + buf: AlignedBuf, + block_size_bytes: usize, + /// Tracks the next user_data for request and completion notifications. + next_user_data: u64, + source_waiter: EpollWaiter, + dest_waiter: EpollWaiter, +} +impl CopyWorker { + /// Builds a worker on top of two async I/O handles. Queue depth 1 + /// is enough, as the worker is sequential. The caller must initialize the + /// destination disk. + /// + /// Start the worker thread with [`Self::spawn`]. + pub fn new( + source_disk: &dyn AsyncFullDiskFile, + destination_disk: &dyn AsyncFullDiskFile, + state: Arc, + block_size_bytes: usize, + ) -> BlockResult { + let source_io = source_disk.create_async_io(1)?; + let dest_io = destination_disk.create_async_io(1)?; + let source_waiter = EpollWaiter::new(source_io.notifier().as_raw_fd())?; + let dest_waiter = EpollWaiter::new(dest_io.notifier().as_raw_fd())?; + let alignment = source_io.alignment().max(dest_io.alignment()); + + Ok(Self { + source_io, + dest_io, + state, + buf: AlignedBuf::new(block_size_bytes, alignment as usize)?, + block_size_bytes, + next_user_data: 0, + source_waiter, + dest_waiter, + }) + } + + /// Spawns the worker on a named thread and returns its handle. + /// On error inside the thread, the migration phase transitions + /// to [`MirrorPhase::Failed`]. 
+ pub fn spawn(self) -> io::Result { + let state = self.state.clone(); + let join = thread::Builder::new() + .name("blockdev-mirror-copy-worker".into()) + .spawn(move || { + let mut worker = self; + if let Err(e) = worker.run() { + state.transition_to_phase(MirrorPhase::Failed(format!( + "Copy worker failed: {e:?}" + ))); + } + })?; + + Ok(CopyWorkerHandle { join: Some(join) }) + } + + /// Drives the block-by-block copy for predefined [`MirrorState::total_bytes`], + /// then transitions the migration phase to [`MirrorPhase::Ready`]. + fn run(&mut self) -> io::Result<()> { + let total_size = self.state.total_bytes; + let max_length = self.block_size_bytes as u64; + let mut offset = 0; + + while offset < total_size { + let length = max_length.min(total_size - offset) as usize; + self.copy_block(offset, length)?; + offset += length as u64; + } + + self.state.transition_to_phase(MirrorPhase::Ready); + Ok(()) + } + + /// Copies `length` bytes at `offset` from source to destination. + /// + /// Holds a range lock for the duration so virtqueue mirror writes cannot race + /// the copy. Uses `self.buf` for the copy to avoid repeated allocations. + fn copy_block(&mut self, offset: u64, length: usize) -> io::Result<()> { + let _guard = self.state.range_locks.lock_range(offset, length as u64)?; + + // Create a single iovec for the requested block. + let iovecs = [iovec { + iov_base: self.buf.as_mut_slice(length).as_mut_ptr().cast(), + iov_len: length, + }]; + + // Read from source into buf. + self.buf.as_mut_slice(length).fill(0); + let read_id = self.generate_user_data(); + self.source_io + .read_vectored(offset as off_t, &iovecs, read_id) + .map_err(|e| io::Error::other(format!("async io read_vectored failed: {e}")))?; + let (user_data, result) = self.source_waiter.next_completion(&mut self.source_io)?; + if result < 0 { + return Err(io::Error::from_raw_os_error(-result)); + } + debug_assert_eq!(user_data, read_id); + + // Write buf to destination. 
+ let write_id = self.generate_user_data(); + self.dest_io + .write_vectored(offset as off_t, &iovecs, write_id) + .map_err(|e| io::Error::other(format!("async io write_vectored failed: {e}")))?; + let (user_data, result) = self.dest_waiter.next_completion(&mut self.dest_io)?; + if result < 0 { + return Err(io::Error::from_raw_os_error(-result)); + } + debug_assert_eq!(user_data, write_id); + + self.state + .copied_bytes + .fetch_add(length as u64, Ordering::Relaxed); + + Ok(()) + } + + /// Returns the current [`Self::next_user_data`] and increments it, wrapping on overflow. + fn generate_user_data(&mut self) -> u64 { + let user_data = self.next_user_data; + self.next_user_data = self.next_user_data.wrapping_add(1); + + user_data + } +} + /// Single-fd `epoll` wrapper. Built once per eventfd and reused for /// every `wait()` call so the copy worker doesn't pay setup cost per /// block. /// /// `wait()` blocks until the eventfd becomes readable. -#[allow(dead_code)] struct EpollWaiter { epoll: epoll::Epoll, } -#[allow(dead_code)] impl EpollWaiter { /// Creates a reusable `EpollWaiter` for the given eventfd. fn new(event_fd: RawFd) -> io::Result { diff --git a/block/src/qcow_common.rs b/block/src/qcow_common.rs index 49cae1f79a..4f483444b7 100644 --- a/block/src/qcow_common.rs +++ b/block/src/qcow_common.rs @@ -149,6 +149,9 @@ impl Drop for AlignedBuf { } } +// SAFETY: AlignedBuf solely owns its plain-byte heap allocation (no Clone/Copy). +unsafe impl Send for AlignedBuf {} + /// Read into `buf` via an aligned bounce buffer when O_DIRECT requires it. 
pub fn aligned_pread(fd: RawFd, buf: &mut [u8], offset: u64, alignment: usize) -> io::Result<()> { if alignment == 0 From 8238fbd454394809d7d305c9798809c908311842 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 29 Apr 2026 15:45:45 +0200 Subject: [PATCH 07/33] virtio-devices: swap disk_image via queue commands Blockdev-mirroring needs to install a MirroringAsyncIo on each virtqueue worker without restarting the threads. Restarting would be guest-visible (a device reset) and would need to drain in-flight I/O first. Per-queue worker state cannot be mutated from another thread, so the swap has to happen on the worker's own thread in response to a signal. Add the receiving side. Each virtqueue gets a BlockQueueCommandReceiver holding a single-command slot and an eventfd. The API thread fills the slot (from a future Block::start_mirror) and writes the eventfd. The worker wakes on BLOCK_COMMAND_EVENT, takes the command, and applies it via apply_block_queue_command: swap disk_image and re-register the completion notifier on the worker's epoll set. A BlockQueueCommand carries its kind (InstallMirror, CompleteToDestination, CancelToSource), the replacement AsyncIo, and an acknowledgement channel. The acknowledgement stays unused here and is wired up when Block::start_mirror lands. cmd_receiver is Option on BlockEpollHandler, None at construction, so the non-mirror path is unchanged: no event is registered and no new branches fire. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- virtio-devices/src/block.rs | 169 +++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 2 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 261091e494..05b8ef8f83 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -15,7 +15,8 @@ use std::ops::Deref; use std::os::unix::io::AsRawFd; use std::path::PathBuf; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, AtomicUsize, Ordering}; -use std::sync::{Arc, Barrier}; +use std::sync::mpsc::Sender; +use std::sync::{Arc, Barrier, Mutex}; use std::time::{Duration, Instant}; use std::{io, result, thread}; @@ -63,6 +64,9 @@ const COMPLETION_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2; // New 'wake up' event from the rate limiter const RATE_LIMITER_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3; +// A `BlockQueueCommand` has been queued for this worker to apply (e.g. swap disk_image). +const BLOCK_COMMAND_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 4; + // latency scale, for reduce precision loss in calculate. const LATENCY_SCALE: u64 = 10000; @@ -112,10 +116,108 @@ pub enum Error { ConfigChange(#[source] io::Error), #[error("Disk resize failed")] DiskResize(#[source] BlockError), + #[error("Failed applying mirror command: {0}")] + MirrorSwap(String), } pub type Result = result::Result; +/// Lifecycle command kind for a virtqueue worker. +#[derive(Debug)] +pub enum BlockQueueCommandKind { + /// Replace the plain source backend with a mirroring backend. + InstallMirror, + /// Replace the mirroring backend with a plain destination backend. + CompleteToDestination, + /// Replace the mirroring backend with a plain source backend. + CancelToSource, +} + +/// Acknowledgement sent by one virtqueue worker after handling a command. +pub struct BlockQueueAck { + /// ID of the command that is being acknowledged. + pub op_id: u64, + /// Result of applying the command inside the worker. 
+ pub result: Result<()>, +} + +/// Command sent from `Block` to one virtqueue worker to change the worker's +/// active block I/O backend. +pub struct BlockQueueCommand { + /// Unique id for this lifecycle operation, used to match the succeeding acknowledgement. + pub op_id: u64, + /// Lifecycle action the worker should apply. + pub kind: BlockQueueCommandKind, + /// New async I/O backend that will replace the worker's current + /// `disk_image` after the old backend has drained. + /// + /// For start this is a `MirroringAsyncIo`. For cancel this is a plain + /// source `AsyncIo`. For completion this is a plain destination `AsyncIo`. + pub async_io: Box, + + /// Channel used by the worker to report that the command was applied or + /// failed. + pub ack: Sender, +} + +impl BlockQueueCommand { + pub fn install_mirror( + op_id: u64, + async_io: Box, + ack: Sender, + ) -> Self { + BlockQueueCommand { + op_id, + kind: BlockQueueCommandKind::InstallMirror, + async_io, + ack, + } + } + + pub fn complete_to_destination( + op_id: u64, + async_io: Box, + ack: Sender, + ) -> Self { + BlockQueueCommand { + op_id, + kind: BlockQueueCommandKind::CompleteToDestination, + async_io, + ack, + } + } + + /// Cancel mirroring and revert to the original source backend. + pub fn cancel_to_source( + op_id: u64, + async_io: Box, + ack: Sender, + ) -> Self { + BlockQueueCommand { + op_id, + kind: BlockQueueCommandKind::CancelToSource, + async_io, + ack, + } + } +} + +/// Per-virtqueue plumbing for swapping the worker's `disk_image` at +/// runtime. +/// +/// `cmd` and `evt` are shared with the API thread, which puts a +/// [`BlockQueueCommand`] into `cmd` (from [`Block::start_mirror`], +/// `complete_mirror`, or `cancel_mirror`) and writes to `evt` to wake the +/// worker. The worker takes the command and applies it. +pub struct BlockQueueCommandReceiver { + /// Next [`BlockQueueCommand`] to apply. Written by the API thread, + /// taken by the worker on `BLOCK_COMMAND_EVENT`. 
+ pub cmd: Arc>>, + /// Wakes the worker after `cmd` is filled. Fires `BLOCK_COMMAND_EVENT` + /// on the worker's epoll set. + pub evt: EventFd, +} + // latency will be records as microseconds, average latency // will be save as scaled value. #[derive(Clone)] @@ -193,6 +295,8 @@ struct BlockEpollHandler { host_cpus: Option>, acked_features: u64, disable_sector0_writes: bool, + /// Receives mirror lifecycle commands for this virtqueue worker. + cmd_receiver: Option, } fn has_feature(features: u64, feature_flag: u64) -> bool { @@ -466,6 +570,45 @@ impl BlockEpollHandler { self.try_signal_used_queue() } + fn apply_block_queue_command( + disk_image: &mut Box, + command: BlockQueueCommand, + helper: &mut EpollHelper, + ) -> result::Result<(), Error> { + let BlockQueueCommand { + op_id: _, + kind: _, + async_io: new_disk_image, + ack: _, + } = command; + + let new_disk_fd = new_disk_image.notifier().as_raw_fd(); + let old_disk_fd = disk_image.notifier().as_raw_fd(); + + // Register the new backend's completion eventFd. + helper + .add_event(new_disk_fd, COMPLETION_EVENT) + .map_err(|e| { + Error::MirrorSwap(format!("Failed to register new disk notifier: {e:?}")) + })?; + + // Deregister the old backend's completion eventFd. + if let Err(e) = + helper.del_event_custom(old_disk_fd, COMPLETION_EVENT, epoll::Events::EPOLLIN) + { + // Rollback the new disk_image registration. + let _ = helper.del_event_custom(new_disk_fd, COMPLETION_EVENT, epoll::Events::EPOLLIN); + return Err(Error::MirrorSwap(format!( + "Failed to deregister old disk notifier: {e:?}" + ))); + } + + // Commit the swap. 
+ *disk_image = new_disk_image; + + Ok(()) + } + #[inline] fn find_inflight_request(&mut self, completed_head: u16) -> Result { // This loop neatly handles the fast path where the completions are @@ -682,6 +825,9 @@ impl BlockEpollHandler { if let Some(rate_limiter) = &self.rate_limiter { helper.add_event(rate_limiter.as_raw_fd(), RATE_LIMITER_EVENT)?; } + if let Some(cmd_receiver) = &self.cmd_receiver { + helper.add_event(cmd_receiver.evt.as_raw_fd(), BLOCK_COMMAND_EVENT)?; + } self.set_queue_thread_affinity(); helper.run(paused, paused_sync, self)?; @@ -692,7 +838,7 @@ impl BlockEpollHandler { impl EpollHelperHandler for BlockEpollHandler { fn handle_event( &mut self, - _helper: &mut EpollHelper, + helper: &mut EpollHelper, event: &epoll::Event, ) -> result::Result<(), EpollHelperError> { let ev_type = event.data as u16; @@ -744,6 +890,24 @@ impl EpollHelperHandler for BlockEpollHandler { ))); } } + BLOCK_COMMAND_EVENT => { + // Apply a staged command: swap disk_image and re-register notifiers. 
+ if let Some(m) = self.cmd_receiver.as_mut() { + m.evt.read().map_err(|e| { + EpollHelperError::HandleEvent(anyhow!( + "Failed to read block command event: {e:?}" + )) + })?; + if let Some(command) = m.cmd.lock().unwrap().take() { + Self::apply_block_queue_command(&mut self.disk_image, command, helper) + .map_err(|e| { + EpollHelperError::HandleEvent(anyhow!( + "Failed to apply block queue command: {e}" + )) + })?; + } + } + } _ => { return Err(EpollHelperError::HandleEvent(anyhow!( "Unexpected event: {ev_type}" @@ -1233,6 +1397,7 @@ impl VirtioDevice for Block { disable_sector0_writes: self.disable_sector0_writes, active_request_count: self.active_request_count.clone(), draining_active_requests: self.draining_active_requests.clone(), + cmd_receiver: None, }; let paused = self.common.paused.clone(); From ba3b2391e7e129b4144e713c3c804a2f756fccfe Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 29 Apr 2026 16:25:16 +0200 Subject: [PATCH 08/33] virtio-devices: pre-allocate per-queue command slots The previous commit added the receiving side of the queue command channel to BlockEpollHandler. For Block::start_mirror to fill a slot and write an eventfd, those handles have to exist at activation time and be reachable from both the virtqueue worker and Block itself. Build a BlockQueueCommandReceiver per virtqueue when the device is activated. A clone of the slot Arc and a clone of the eventfd are stored on the new Block.queue_cmd_senders field. The receiver with its eventfd clone is handed to BlockEpollHandler. The slot starts empty and the eventfd is silent, so BLOCK_COMMAND_EVENT does not fire and behaviour is unchanged. queue_cmd_senders is a Vec indexed by virtqueue. It is cleared at the start of every activation and re-populated for that activation's virtqueues. Follow-up: Block::start_mirror that fills each slot with an InstallMirror command carrying a MirroringAsyncIo, writes each evt, and spawns the copy worker. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- virtio-devices/src/block.rs | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 05b8ef8f83..810077519d 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -938,6 +938,10 @@ pub struct Block { device_status: Arc, active_request_count: Arc, draining_active_requests: Arc, + /// Per-virtqueue mirror writer-side handles, populated at + /// activation. `Block::start_mirror` fills each slot with a + /// [`BlockQueueCommand`] and writes the corresponding evt. + queue_cmd_senders: Vec<(Arc>>, EventFd)>, } #[derive(Serialize, Deserialize)] @@ -1104,6 +1108,7 @@ impl Block { device_status: Arc::new(AtomicU8::new(0)), active_request_count: Arc::new(AtomicUsize::new(0)), draining_active_requests: Arc::new(AtomicBool::new(false)), + queue_cmd_senders: Vec::new(), }) } @@ -1354,6 +1359,12 @@ impl VirtioDevice for Block { let mut epoll_threads = Vec::new(); let event_idx = self.common.feature_acked(VIRTIO_RING_F_EVENT_IDX.into()); + // Reset and pre-allocate per-virtqueue mirror handoffs. The + // writer-side (slot + evt) is kept on `Block`. The receiver-side + // is handed to the BlockEpollHandler. 
+ self.queue_cmd_senders.clear(); + self.queue_cmd_senders.reserve(queues.len()); + for i in 0..queues.len() { let (_, mut queue, queue_evt) = queues.remove(0); queue.set_event_idx(event_idx); @@ -1362,6 +1373,22 @@ impl VirtioDevice for Block { let (kill_evt, pause_evt) = self.common.dup_eventfds(); let queue_idx = i as u16; + let queue_command: Arc>> = Arc::new(Mutex::new(None)); + let queue_command_evt = EventFd::new(libc::EFD_NONBLOCK).map_err(|e| { + error!("failed to create mirror eventfd: {e}"); + ActivateError::BadActivate + })?; + let mirror_handler_evt = queue_command_evt.try_clone().map_err(|e| { + error!("failed to clone mirror eventfd: {e}"); + ActivateError::BadActivate + })?; + let cmd_receiver = BlockQueueCommandReceiver { + cmd: Arc::clone(&queue_command), + evt: mirror_handler_evt, + }; + self.queue_cmd_senders + .push((queue_command, queue_command_evt)); + let mut handler = BlockEpollHandler { queue_index: queue_idx, queue, @@ -1397,7 +1424,7 @@ impl VirtioDevice for Block { disable_sector0_writes: self.disable_sector0_writes, active_request_count: self.active_request_count.clone(), draining_active_requests: self.draining_active_requests.clone(), - cmd_receiver: None, + cmd_receiver: Some(cmd_receiver), }; let paused = self.common.paused.clone(); From 740d7fd12cd6231eeffaa5f0aef003fbbfe29a26 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 29 Apr 2026 16:48:06 +0200 Subject: [PATCH 09/33] block: add BlockMirrorHandle The device side needs something to retain while a blockdev-mirror is active: the shared mirror state for status queries and the copy worker handle so the thread is joined on drop. Add BlockMirrorHandle bundling Arc and CopyWorkerHandle. Follow-up: Block::start_mirror that wires these up. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index ca1a4b5d01..5cca70b73a 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -568,6 +568,15 @@ impl CopyWorker { } } +/// Handle returned by `Block::start_mirror`. The owner (typically the +/// device manager) keeps it alive for the duration of the mirror to +/// observe `MirrorState` and to retain the [`CopyWorker`] thread. +#[allow(dead_code)] +pub struct BlockMirrorHandle { + pub state: Arc, + pub copy_worker: CopyWorkerHandle, +} + /// Single-fd `epoll` wrapper. Built once per eventfd and reused for /// every `wait()` call so the copy worker doesn't pay setup cost per /// block. From 573027f581a774d54772badda07858fe40f3b392 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 30 Apr 2026 08:23:57 +0200 Subject: [PATCH 10/33] block: add MirroringAsyncIo::create Building a MirroringAsyncIo for a virtqueue takes more than the AsyncFullDiskFile trait offers: it needs the shared MirrorState to pair with the copy worker, and it sets up its own waiters on the source and destination notifiers so a mirrored write can wait for both completions inside the write call. Add MirroringAsyncIo::create, an associated function on the type it builds. The caller passes source and destination as &dyn AsyncFullDiskFile along with the MirrorState and ring depth, and the function returns the boxed AsyncIo. The destination notifier is read only inside MirroringAsyncIo, so the virtqueue worker keeps watching just the source notifier. BlockMirrorHandle gains a destination field: Block.disk_image stays the source for the lifetime of the mirror, so the destination disk is owned by the handle. Follow-up: Block::start_mirror that wires up the new API. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index 5cca70b73a..d084ecc92f 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -11,6 +11,7 @@ use std::collections::{BTreeMap, VecDeque}; use std::os::fd::{AsRawFd, RawFd}; +use std::path::PathBuf; use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex}; use std::thread::JoinHandle; @@ -222,6 +223,35 @@ pub struct MirroringAsyncIo { dest_waiter: EpollWaiter, } impl MirroringAsyncIo { + #[allow(dead_code)] + /// Builds a [`MirroringAsyncIo`] for one virtqueue, wrapped in + /// `Box`. + /// + /// A mirrored write waits for both the source and destination completions + /// inside the write call, so this struct is the only reader of the + /// destination notifier. The virtqueue worker watches only the source + /// notifier, which it still needs to pick up read completions. + pub fn create( + source_disk: &dyn AsyncFullDiskFile, + destination_disk: &dyn AsyncFullDiskFile, + state: Arc, + ring_depth: u32, + ) -> BlockResult> { + let source = source_disk.create_async_io(ring_depth)?; + let destination = destination_disk.create_async_io(ring_depth)?; + let source_waiter = EpollWaiter::new(source.notifier().as_raw_fd())?; + let dest_waiter = EpollWaiter::new(destination.notifier().as_raw_fd())?; + + Ok(Box::new(MirroringAsyncIo { + source, + destination, + state, + inflight_completions: VecDeque::new(), + source_waiter, + dest_waiter, + })) + } + /// Flip the mirror to the `Failed` phase. The operator must cancel to /// clean up the destination and the copy worker. fn fail(&mut self, reason: String) { @@ -575,6 +605,8 @@ impl CopyWorker { pub struct BlockMirrorHandle { pub state: Arc, pub copy_worker: CopyWorkerHandle, + pub destination: Box, + pub destination_path: PathBuf, } /// Single-fd `epoll` wrapper. 
Built once per eventfd and reused for From 368cfac187fd301cd8663b340d5918d864d94487 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 30 Apr 2026 09:25:35 +0200 Subject: [PATCH 11/33] virtio-devices: add Block::start_mirror Each virtqueue has its own AsyncIo, so mirroring needs one MirroringAsyncIo per virtqueue, installed through the per-queue command channel added earlier. start_mirror drives that handover and returns a BlockMirrorHandle for the device manager to own. It rejects a destination smaller than the source, then builds all per-virtqueue InstallMirror commands before sending any, so a construction failure leaves the device unchanged. It sends the commands, waits for all acknowledgements with a timeout, and spawns the copy worker, so a failure cannot leak a thread whose Drop blocks on join. The worker side that acknowledges each swap after draining its old backend lands in a follow-up commit. On any failure after the first command was sent, the queues are reverted to plain AsyncIo on the source disk via CancelToSource commands. A failed revert is logged and does not mask the install error. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/error.rs | 3 + block/src/mirror.rs | 1 - virtio-devices/src/block.rs | 189 ++++++++++++++++++++++++++++++++++-- 3 files changed, 186 insertions(+), 7 deletions(-) diff --git a/block/src/error.rs b/block/src/error.rs index 645057005e..ebb0e66d34 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -42,6 +42,8 @@ pub enum BlockErrorKind { NotFound, /// An internal counter or limit was exceeded. Overflow, + /// A mirror swap was requested but was unsuccessful. 
+ MirrorSwap, } impl Display for BlockErrorKind { @@ -54,6 +56,7 @@ impl Display for BlockErrorKind { Self::OutOfBounds => write!(f, "Out of bounds"), Self::NotFound => write!(f, "Not found"), Self::Overflow => write!(f, "Overflow"), + Self::MirrorSwap => write!(f, "Failed to swap AsyncIO in virtqueue worker for mirror"), } } } diff --git a/block/src/mirror.rs b/block/src/mirror.rs index d084ecc92f..4a164416f1 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -601,7 +601,6 @@ impl CopyWorker { /// Handle returned by `Block::start_mirror`. The owner (typically the /// device manager) keeps it alive for the duration of the mirror to /// observe `MirrorState` and to retain the [`CopyWorker`] thread. -#[allow(dead_code)] pub struct BlockMirrorHandle { pub state: Arc, pub copy_worker: CopyWorkerHandle, diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 810077519d..477b1d9691 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -15,16 +15,20 @@ use std::ops::Deref; use std::os::unix::io::AsRawFd; use std::path::PathBuf; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, AtomicUsize, Ordering}; -use std::sync::mpsc::Sender; -use std::sync::{Arc, Barrier, Mutex}; +use std::sync::mpsc::{Receiver, Sender}; +use std::sync::{Arc, Barrier, Mutex, mpsc}; use std::time::{Duration, Instant}; use std::{io, result, thread}; use anyhow::anyhow; use block::async_io::{AsyncIo, AsyncIoError}; use block::disk_file::AsyncFullDiskFile; -use block::error::BlockError; +use block::error::{BlockError, BlockErrorKind, BlockResult}; use block::fcntl::{LockError, LockGranularity, LockGranularityChoice, LockType, get_lock_state}; +use block::mirror::{ + BlockMirrorHandle, CopyWorker, CopyWorkerHandle, MIRROR_BLOCK_SIZE, MirrorPhase, MirrorState, + MirroringAsyncIo, +}; use block::{ ExecuteAsync, ExecuteError, MAX_DISCARD_WRITE_ZEROES_SEG, Request, RequestType, VirtioBlockConfig, build_serial, fcntl, @@ -67,6 +71,9 @@ const 
RATE_LIMITER_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3; // A `BlockQueueCommand` has been queued for this worker to apply (e.g. swap disk_image). const BLOCK_COMMAND_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 4; +// Maximum duration to wait for a command to be acknowledged by the virtqueue worker. +const MIRROR_COMMAND_ACK_TIMEOUT: Duration = Duration::from_secs(5); + // latency scale, for reduce precision loss in calculate. const LATENCY_SCALE: u64 = 10000; @@ -218,6 +225,12 @@ pub struct BlockQueueCommandReceiver { pub evt: EventFd, } +struct BlockQueueCommandSender { + cmd: Arc>>, + evt: EventFd, + queue_size: u16, +} + // latency will be records as microseconds, average latency // will be save as scaled value. #[derive(Clone)] @@ -941,7 +954,8 @@ pub struct Block { /// Per-virtqueue mirror writer-side handles, populated at /// activation. `Block::start_mirror` fills each slot with a /// [`BlockQueueCommand`] and writes the corresponding evt. - queue_cmd_senders: Vec<(Arc>>, EventFd)>, + queue_cmd_senders: Vec, + next_queue_cmd_op_id: u64, } #[derive(Serialize, Deserialize)] @@ -1109,6 +1123,7 @@ impl Block { active_request_count: Arc::new(AtomicUsize::new(0)), draining_active_requests: Arc::new(AtomicBool::new(false)), queue_cmd_senders: Vec::new(), + next_queue_cmd_op_id: 1, }) } @@ -1278,6 +1293,165 @@ impl Block { .map_err(Error::ConfigChange) } + /// Start mirroring the device's disk to `destination`. + /// + /// Each virtqueue worker swaps its `disk_image` to a new + /// [`MirroringAsyncIo`] that fans every mutating request out to both + /// backends. A background [`CopyWorker`] copies existing source bytes + /// to destination until all initial bytes are copied. + /// The [`MirroringAsyncIo`] stays in place until completion, keeping the device's + /// disk and `destination` in sync. 
+ /// + /// Returns an error if the destination is smaller than the source, on + /// `logical_size()` failure, [`MirroringAsyncIo`] construction failure, or + /// copy worker spawn failure. + pub fn start_mirror( + &mut self, + destination: Box, + destination_path: PathBuf, + ) -> BlockResult { + let source_size = self.disk_image.logical_size()?; + let dest_size = destination.logical_size()?; + if dest_size < source_size { + return Err(BlockError::new( + BlockErrorKind::Io, + io::Error::other(format!( + "mirror destination ({dest_size} bytes) is smaller than source ({source_size} bytes)" + )), + )); + } + + let state = MirrorState::new(source_size); + let op_id = self.next_mirror_op_id(); + let (ack_tx, ack_rx) = mpsc::channel(); + + let mut commands = Vec::with_capacity(self.queue_cmd_senders.len()); + for sender in &self.queue_cmd_senders { + let async_io = MirroringAsyncIo::create( + self.disk_image.as_ref(), + destination.as_ref(), + state.clone(), + sender.queue_size as u32, + )?; + commands.push(( + sender, + BlockQueueCommand::install_mirror(op_id, async_io, ack_tx.clone()), + )); + } + + drop(ack_tx); + + let install_result: BlockResult = (|| { + Self::send_mirror_queue_commands(commands)?; + Self::wait_for_mirror_queue_command_acks(op_id, &ack_rx, self.queue_cmd_senders.len())?; + CopyWorker::new( + self.disk_image.as_ref(), + destination.as_ref(), + state.clone(), + MIRROR_BLOCK_SIZE, + )? + .spawn() + .map_err(|e| BlockError::new(BlockErrorKind::Io, e)) + })(); + + let copy_worker = match install_result { + Ok(worker) => worker, + Err(e) => { + state.transition_to_phase(MirrorPhase::Failed(format!( + "mirror install failed: {e}" + ))); + + // Don't mask the install error on revert err. 
+ if let Err(revert_err) = self.revert_queues_to_source() { + error!( + "failed to revert virtqueues to source after mirror install failure: {revert_err}" + ); + } + return Err(e); + } + }; + + Ok(BlockMirrorHandle { + state, + copy_worker, + destination, + destination_path, + }) + } + + fn next_mirror_op_id(&mut self) -> u64 { + let op_id = self.next_queue_cmd_op_id; + self.next_queue_cmd_op_id = self.next_queue_cmd_op_id.wrapping_add(1); + op_id + } + + fn mirror_swap_error(msg: impl Into) -> BlockError { + BlockError::new(BlockErrorKind::MirrorSwap, io::Error::other(msg.into())) + } + + fn send_mirror_queue_commands( + commands: Vec<(&BlockQueueCommandSender, BlockQueueCommand)>, + ) -> BlockResult<()> { + for (sender, command) in commands { + let mut slot = sender.cmd.lock().unwrap(); + + if slot.is_some() { + return Err(Self::mirror_swap_error("mirror command slot is occupied")); + } + + *slot = Some(command); + sender.evt.write(1).map_err(|e| { + Self::mirror_swap_error(format!("failed to notify mirror queue worker: {e}")) + })?; + } + + Ok(()) + } + + /// Wait for n acknowledgments of a mirror command with the given op_id on ack_rx, returning + /// an error if a timeout occurs or if any ack reports an error or mismatched op_id. + fn wait_for_mirror_queue_command_acks( + op_id: u64, + ack_rx: &Receiver, + expected_acks: usize, + ) -> BlockResult<()> { + for _ in 0..expected_acks { + let ack = ack_rx + .recv_timeout(MIRROR_COMMAND_ACK_TIMEOUT) + .map_err(|e| Self::mirror_swap_error(format!("mirror command ack timeout: {e}")))?; + + if ack.op_id != op_id { + return Err(Self::mirror_swap_error(format!( + "received mirror command ack for op_id {}, expected {}", + ack.op_id, op_id + ))); + } + + ack.result.map_err(|e| { + Self::mirror_swap_error(format!("mirror command failed in queue worker: {e}")) + })?; + } + + Ok(()) + } + + /// Swap every virtqueue worker back to a plain AsyncIo on the source disk. 
+ fn revert_queues_to_source(&mut self) -> BlockResult<()> { + let op_id = self.next_mirror_op_id(); + let (ack_tx, ack_rx) = mpsc::channel(); + let mut commands = Vec::with_capacity(self.queue_cmd_senders.len()); + for sender in &self.queue_cmd_senders { + let async_io = self.disk_image.create_async_io(sender.queue_size as u32)?; + commands.push(( + sender, + BlockQueueCommand::cancel_to_source(op_id, async_io, ack_tx.clone()), + )); + } + drop(ack_tx); + Self::send_mirror_queue_commands(commands)?; + Self::wait_for_mirror_queue_command_acks(op_id, &ack_rx, self.queue_cmd_senders.len()) + } + #[cfg(fuzzing)] pub fn wait_for_epoll_threads(&mut self) { self.common.wait_for_epoll_threads(); @@ -1386,8 +1560,11 @@ impl VirtioDevice for Block { cmd: Arc::clone(&queue_command), evt: mirror_handler_evt, }; - self.queue_cmd_senders - .push((queue_command, queue_command_evt)); + self.queue_cmd_senders.push(BlockQueueCommandSender { + cmd: queue_command, + evt: queue_command_evt, + queue_size, + }); let mut handler = BlockEpollHandler { queue_index: queue_idx, From 2540dc601eb3723b986826bab089ca7b5aa485b6 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 30 Apr 2026 09:54:20 +0200 Subject: [PATCH 12/33] virtio-devices, block: add mirror status helper After starting the blockdev-mirror, the operator must be able to observe the current progress of the copy worker to decide whether to complete to the destination disk or to react to a failure. Introduce a Block::mirror_handle field and a dedicated MirrorStatus struct, to be used in the upcoming API endpoint. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 14 ++++++++++++++ virtio-devices/src/block.rs | 16 ++++++++++++---- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index 4a164416f1..e5036773ee 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -207,6 +207,20 @@ impl MirrorState { *current = target; } + + pub fn status(&self) -> MirrorStatus { + MirrorStatus { + phase: self.phase(), + copied_bytes: self.copied_bytes.load(Ordering::Relaxed), + total_bytes: self.total_bytes, + } + } +} + +pub struct MirrorStatus { + pub phase: MirrorPhase, + pub copied_bytes: u64, + pub total_bytes: u64, } /// Per-queue `AsyncIo` handle for a mirror. diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 477b1d9691..06f303bfff 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -27,7 +27,7 @@ use block::error::{BlockError, BlockErrorKind, BlockResult}; use block::fcntl::{LockError, LockGranularity, LockGranularityChoice, LockType, get_lock_state}; use block::mirror::{ BlockMirrorHandle, CopyWorker, CopyWorkerHandle, MIRROR_BLOCK_SIZE, MirrorPhase, MirrorState, - MirroringAsyncIo, + MirrorStatus, MirroringAsyncIo, }; use block::{ ExecuteAsync, ExecuteError, MAX_DISCARD_WRITE_ZEROES_SEG, Request, RequestType, @@ -956,6 +956,7 @@ pub struct Block { /// [`BlockQueueCommand`] and writes the corresponding evt. 
queue_cmd_senders: Vec, next_queue_cmd_op_id: u64, + mirror_handle: Option, } #[derive(Serialize, Deserialize)] @@ -1124,6 +1125,7 @@ impl Block { draining_active_requests: Arc::new(AtomicBool::new(false)), queue_cmd_senders: Vec::new(), next_queue_cmd_op_id: 1, + mirror_handle: None, }) } @@ -1309,7 +1311,7 @@ impl Block { &mut self, destination: Box, destination_path: PathBuf, - ) -> BlockResult { + ) -> BlockResult<()> { let source_size = self.disk_image.logical_size()?; let dest_size = destination.logical_size()?; if dest_size < source_size { @@ -1371,12 +1373,13 @@ impl Block { } }; - Ok(BlockMirrorHandle { + self.mirror_handle = Some(BlockMirrorHandle { state, copy_worker, destination, destination_path, - }) + }); + Ok(()) } fn next_mirror_op_id(&mut self) -> u64 { @@ -1452,6 +1455,11 @@ impl Block { Self::wait_for_mirror_queue_command_acks(op_id, &ack_rx, self.queue_cmd_senders.len()) } + /// Returns a snapshot of the current mirror progress. + pub fn mirror_status(&self) -> Option { + self.mirror_handle.as_ref().map(|h| h.state.status()) + } + #[cfg(fuzzing)] pub fn wait_for_epoll_threads(&mut self) { self.common.wait_for_epoll_threads(); From 90b090e02d6164bc70598deb7db6814818887e7e Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 30 Apr 2026 13:11:07 +0200 Subject: [PATCH 13/33] vmm: add device manager block mirror start and status Start exposes the lifecycle entrypoint that the upcoming REST endpoint will route to. A missing destination file is created with the same image format and backend flags as the source disk so the mirror can fan writes out to a backend that behaves identically. An existing destination file is opened as-is. Status surfaces the shared mirror state through the device manager so operators can poll and observe progress. Adds two DeviceManagerError variants for the new failure modes and a BlockErrorKind::AlreadyExists for the create_disk pre-condition. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/error.rs | 3 ++ block/src/factory.rs | 42 +++++++++++++++++ virtio-devices/src/block.rs | 4 ++ vmm/src/device_manager.rs | 94 ++++++++++++++++++++++++++++++++++++- 4 files changed, 142 insertions(+), 1 deletion(-) diff --git a/block/src/error.rs b/block/src/error.rs index ebb0e66d34..98103f91c4 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -42,6 +42,8 @@ pub enum BlockErrorKind { NotFound, /// An internal counter or limit was exceeded. Overflow, + /// The file already exists, when disk creation was requested. + AlreadyExists, /// A mirror swap was requested but was unsuccessful. MirrorSwap, } @@ -56,6 +58,7 @@ impl Display for BlockErrorKind { Self::OutOfBounds => write!(f, "Out of bounds"), Self::NotFound => write!(f, "Not found"), Self::Overflow => write!(f, "Overflow"), + Self::AlreadyExists => write!(f, "Already exists"), Self::MirrorSwap => write!(f, "Failed to swap AsyncIO in virtqueue worker for mirror"), } } diff --git a/block/src/factory.rs b/block/src/factory.rs index ffe65f7d9f..3e1cd18b18 100644 --- a/block/src/factory.rs +++ b/block/src/factory.rs @@ -21,6 +21,7 @@ use crate::block_io_uring_is_supported; use crate::disk_file::AsyncFullDiskFile; use crate::error::{BlockError, BlockErrorKind, BlockResult}; use crate::fixed_vhd_disk::FixedVhdDisk; +use crate::qcow::{QcowFile, RawFile}; use crate::qcow_disk::QcowDisk; use crate::raw_disk::{RawBackend, RawDisk}; use crate::vhdx_sync::VhdxDiskSync; @@ -203,6 +204,47 @@ fn open_qcow2( )) } +/// Create a new disk image at `options.path` of the given image type +/// and logical `size`. The file must not exist yet. 
+pub fn create_disk( + options: &DiskOpenOptions<'_>, + image_type: ImageType, + size: u64, +) -> BlockResult<()> { + if options.path.exists() { + return Err(BlockError::from_kind(BlockErrorKind::AlreadyExists).with_path(options.path)); + } + let file = fs::OpenOptions::new() + .read(true) + .write(true) + .create_new(true) + .open(options.path) + .map_err(|e| { + BlockError::from_kind(BlockErrorKind::Io) + .with_path(options.path) + .with_source(e) + })?; + + match image_type { + ImageType::Raw => { + file.set_len(size) + .map_err(|e| BlockError::from(e).with_path(options.path))?; + } + ImageType::Qcow2 => { + let raw_file = RawFile::new(file.try_clone()?, options.direct); + QcowFile::new(raw_file, 3, size, options.sparse) + .map_err(|e| e.with_path(options.path))?; + } + _ => { + return Err( + BlockError::from_kind(BlockErrorKind::UnsupportedFeature).with_path(options.path) + ); + } + } + + Ok(()) +} + #[cfg(test)] mod unit_tests { use std::io::Write; diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 06f303bfff..829f8b67b9 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -1295,6 +1295,10 @@ impl Block { .map_err(Error::ConfigChange) } + pub fn logical_size(&self) -> BlockResult { + self.disk_image.logical_size() + } + /// Start mirroring the device's disk to `destination`. 
/// /// Each virtqueue worker swaps its `disk_image` to a new diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 60e07f0029..3a923db11f 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -35,7 +35,8 @@ use arch::{DeviceType, MmioDeviceInfo}; use arch::{NumaNodes, layout}; use block::ImageType; use block::error::BlockError; -use block::factory::{DiskOpenOptions, open_disk}; +use block::factory::{DiskOpenOptions, create_disk, open_disk}; +use block::mirror::MirrorStatus; #[cfg(target_arch = "riscv64")] use devices::aia; #[cfg(target_arch = "x86_64")] @@ -679,6 +680,16 @@ pub enum DeviceManagerError { specified: ImageType, detected: ImageType, }, + + /// No block mirroring is active for the current device. + #[error("No block mirroring is active for the current disk with identifier: {0}")] + BlockMirrorNotActive(String), + + /// The block mirroring destination path already exists. + #[error( + "The block mirroring destination path already exists for the disk with identifier: {0} at path: {1}" + )] + BlockMirrorDestAlreadyExists(String, String), } pub type DeviceManagerResult = result::Result; @@ -5320,6 +5331,87 @@ impl DeviceManager { } } + /// Start mirroring the disk identified by `device_id` to the + /// file at `dest_path`. + /// + /// If the destination file does not exist yet, it is created with the + /// same image format and backend flags as the source disk, sized to + /// match the source's logical size. An existing destination is opened + /// as-is. The destination is then handed to the virtio block + /// device which mirrors later guest writes out to both backends + /// while a background worker copies the existing source contents. + /// + /// Returns an error if no disk with the given identifier is attached + /// to the VM, or the destination cannot be created or opened. 
+ pub fn mirror_disk(&self, device_id: &str, dest_path: &Path) -> DeviceManagerResult<()> { + for dev in &self.block_devices { + let mut disk = dev.lock().unwrap(); + if disk.id() != device_id { + continue; + } + + let (options, image_type) = { + let cfg = self.config.lock().unwrap(); + let disks = cfg + .disks + .as_ref() + .ok_or_else(|| DeviceManagerError::UnknownDeviceId(device_id.to_string()))?; + + let src = disks + .iter() + .find(|d| d.pci_common.id.as_deref() == Some(device_id)) + .ok_or_else(|| DeviceManagerError::UnknownDeviceId(device_id.to_string()))?; + + ( + &DiskOpenOptions { + path: dest_path, + readonly: false, // ignore source's readonly, mirroring needs write access. + direct: src.direct, + sparse: src.sparse, + backing_files: src.backing_files, + disable_io_uring: src.disable_io_uring, + disable_aio: src.disable_aio, + }, + src.image_type, + ) + }; + + // TODO: make this configurable via request flags (create_disk, + // use_existing_disk). For now, create the destination only when it + // is missing and open it either way. + if !dest_path.exists() { + let logical_size = disk.logical_size().map_err(DeviceManagerError::Disk)?; + create_disk(options, image_type, logical_size).map_err(DeviceManagerError::Disk)?; + } + let dest_disk = open_disk(options).map_err(DeviceManagerError::Disk)?.disk; + + disk.start_mirror(dest_disk, dest_path.to_path_buf()) + .map_err(DeviceManagerError::Disk)?; + + return Ok(()); + } + + Err(DeviceManagerError::UnknownDeviceId(device_id.to_string())) + } + + /// Return the current state of the active mirror for the disk + /// identified by `device_id`. + /// + /// Returns an error if no disk with the given identifier is + /// attached to the VM, or if the disk has no active mirror. 
+ pub fn mirror_disk_status(&self, device_id: &str) -> DeviceManagerResult { + for dev in &self.block_devices { + let disk = dev.lock().unwrap(); + + if disk.id() == device_id { + return disk.mirror_status().ok_or_else(|| { + DeviceManagerError::BlockMirrorNotActive(device_id.to_string()) + }); + } + } + + Err(DeviceManagerError::UnknownDeviceId(device_id.to_string())) + } + /// Helps the environment converge quickly after a live migration by /// prompting devices to advertise the VM from its new host. /// From 59cbf6e84610ee8c476f512d36bf1aea8aee2d0e Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 30 Apr 2026 15:37:16 +0200 Subject: [PATCH 14/33] vmm: add vm.disk-mirror-start REST endpoint Wire disk mirroring through to the HTTP API so a running VM can be told to start mirroring a disk onto a destination path. Add the /vm.disk-mirror-start endpoint and its request handler, the vm_disk_mirror_start dispatch on the VMM, and the Vm::mirror_disk wrapper that locks the device manager and maps its error into the vmm error type. A new DiskMirrorStart error covers the case where no VM owns the device manager. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- fuzz/fuzz_targets/http_api.rs | 4 +++ vmm/src/api/http/http_endpoint.rs | 38 +++++++++++++++++--- vmm/src/api/http/mod.rs | 11 ++++-- vmm/src/api/mod.rs | 44 +++++++++++++++++++++++ vmm/src/api/openapi/cloud-hypervisor.yaml | 31 ++++++++++++++++ vmm/src/device_manager.rs | 13 +++++++ vmm/src/lib.rs | 15 +++++++- vmm/src/vm.rs | 23 ++++++++++++ 8 files changed, 171 insertions(+), 8 deletions(-) diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index aa3841243d..033aff80f3 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -113,6 +113,10 @@ impl RequestHandler for StubApiRequestHandler { Ok(()) } + fn vm_disk_mirror_start(&mut self, _: String, _: PathBuf) -> Result<(), VmError> { + Ok(()) + } + #[cfg(target_arch = "x86_64")] fn vm_coredump(&mut self, _: &str) -> Result<(), VmError> { Ok(()) diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 57aa6c4469..9069533196 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -48,13 +48,14 @@ use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, - VmCancelMigration, VmConfig, VmCounters, VmDelete, VmMigrationProgress, VmNmi, VmPause, - VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, - VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, - VmShutdown, VmSnapshot, + VmCancelMigration, VmConfig, VmCounters, VmDelete, VmDiskMirrorStart, VmDiskMirrorStartData, + VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, VmPowerButton, VmReboot, + VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, VmResizeDisk, + VmResizeZone, 
VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; +use crate::device_manager::DeviceManagerError; use crate::vm::Error as VmError; /// Helper module for attaching externally opened FDs to config objects. @@ -518,6 +519,35 @@ impl PutHandler for VmSendMigration { impl GetHandler for VmSendMigration {} +impl PutHandler for VmDiskMirrorStart { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + _files: Vec, + ) -> Result, HttpError> { + let body = body.as_ref().ok_or(HttpError::BadRequest)?; + let data: VmDiskMirrorStartData = serde_json::from_slice(body.raw())?; + + self.send(api_notifier, api_sender, data) + .map_err(|e| match &e { + ApiError::VmDiskMirrorStart(VmError::DeviceManager( + DeviceManagerError::UnknownDeviceId(_), + )) => HttpError::NotFound, + ApiError::VmDiskMirrorStart(VmError::DeviceManager( + DeviceManagerError::BlockMirrorDestAlreadyExists(_, _), + )) => HttpError::BadRequest, + ApiError::VmDiskMirrorStart(VmError::DeviceManager( + DeviceManagerError::BlockMirrorAlreadyActive(_), + )) => HttpError::BadRequest, + _ => HttpError::ApiError(e), + }) + } +} + +impl GetHandler for VmDiskMirrorStart {} + impl PutHandler for VmResize { fn handle_request( &'static self, diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index 5464ca87ab..e1ed29ea54 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -30,9 +30,9 @@ use crate::api::VmCoredump; use crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCancelMigration, VmCounters, - VmDelete, VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, VmPowerButton, - VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, - VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmDelete, 
VmDiskMirrorStart, VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, + VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, + VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; use crate::seccomp_filters::{Thread, get_seccomp_filter}; @@ -141,6 +141,7 @@ pub trait EndpointHandler { error_response(e, StatusCode::BadRequest) } Err(e @ HttpError::TooManyRequests) => error_response(e, StatusCode::TooManyRequests), + Err(e @ HttpError::NotFound) => error_response(e, StatusCode::NotFound), Err(e) => error_response(e, StatusCode::InternalServerError), } } @@ -233,6 +234,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { endpoint!("/vm.delete"), Box::new(VmActionHandler::new(&VmDelete)), ); + r.routes.insert( + endpoint!("/vm.disk-mirror-start"), + Box::new(VmActionHandler::new(&VmDiskMirrorStart)), + ); r.routes.insert(endpoint!("/vm.info"), Box::new(VmInfo {})); r.routes.insert( endpoint!("/vm.pause"), diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 6d1f7c6d7e..471cfbb34c 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -149,6 +149,10 @@ pub enum ApiError { #[error("The disk could not be resized")] VmResizeDisk(#[source] VmError), + /// Error starting disk mirror + #[error("Error starting disk mirror")] + VmDiskMirrorStart(#[source] VmError), + /// The memory zone could not be resized. 
#[error("The memory zone could not be resized")] VmResizeZone(#[source] VmError), @@ -235,6 +239,12 @@ pub struct VmInfoResponse { pub device_tree: Option, } +#[derive(Clone, Deserialize, Serialize, Default, Debug)] +pub struct VmDiskMirrorStartData { + pub id: String, + pub destination_path: PathBuf, +} + #[derive(Clone, Deserialize, Serialize)] pub struct VmmPingResponse { pub build_version: String, @@ -752,6 +762,11 @@ pub trait RequestHandler { fn vm_resize_zone(&mut self, id: String, desired_ram: u64) -> Result<(), VmError>; fn vm_resize_disk(&mut self, id: String, desired_size: u64) -> Result<(), VmError>; + fn vm_disk_mirror_start( + &mut self, + id: String, + destination_path: PathBuf, + ) -> Result<(), VmError>; fn vm_add_device(&mut self, device_cfg: DeviceConfig) -> Result>, VmError>; @@ -1379,6 +1394,35 @@ impl ApiAction for VmDelete { get_response_body(self, api_evt, api_sender, data) } } +pub struct VmDiskMirrorStart; +impl ApiAction for VmDiskMirrorStart { + type RequestBody = VmDiskMirrorStartData; + type ResponseBody = Option; + + fn request(&self, data: Self::RequestBody, response_sender: Sender) -> ApiRequest { + Box::new(move |vmm| { + let response = vmm + .vm_disk_mirror_start(data.id, data.destination_path) + .map_err(ApiError::VmDiskMirrorStart) + .map(|_| ApiResponsePayload::Empty); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} pub struct VmInfo; diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index fd5ccec531..5c5dc5192b 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -510,6 +510,26 @@ paths: 500: description: The VM migration could not be sent. 
+ /vm.disk-mirror-start: + put: + summary: Start mirroring a disk to a destination + requestBody: + description: The disk to mirror and the destination path + content: + application/json: + schema: + $ref: "#/components/schemas/VmDiskMirrorStartData" + required: true + responses: + 204: + description: Disk mirroring was successfully started. + 400: + description: A mirror is already active for the disk, or the destination is not usable. + 404: + description: No disk with the given identifier was found. + 500: + description: Disk mirroring could not be started. + components: schemas: VmmPingResponse: @@ -1564,3 +1584,14 @@ components: type: string access: type: string + + VmDiskMirrorStartData: + required: + - id + - destination_path + type: object + properties: + id: + type: string + destination_path: + type: string diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 3a923db11f..2c1299313e 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -685,6 +685,12 @@ pub enum DeviceManagerError { #[error("No block mirroring is active for the current disk with identifier: {0}")] BlockMirrorNotActive(String), + /// Mirroring is already active for the current device. + #[error( + "Failed to start block mirroring for the disk with identifier: {0} as mirroring is already active" + )] + BlockMirrorAlreadyActive(String), + /// The block mirroring destination path already exists. 
#[error( "The block mirroring destination path already exists for the disk with identifier: {0} at path: {1}" @@ -5349,6 +5355,13 @@ impl DeviceManager { continue; } + if let Some(status) = disk.mirror_status() { + return Err(DeviceManagerError::BlockMirrorAlreadyActive(format!( + "{device_id} is in phase {:?}, cancel the mirror before starting a new one", + status.phase + ))); + } + let (options, image_type) = { let cfg = self.config.lock().unwrap(); let disks = cfg diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index c94eee40ea..0625a1bcd7 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -17,7 +17,6 @@ use std::fs::File; use std::io::{Read, Write, stdout}; use std::os::unix::io::{AsRawFd, FromRawFd, RawFd}; use std::panic::AssertUnwindSafe; -#[cfg(feature = "guest_debug")] use std::path::PathBuf; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::mpsc::{Receiver, RecvError, SendError, Sender}; @@ -3508,6 +3507,20 @@ impl RequestHandler for Vmm { let lock = MIGRATION_PROGRESS_SNAPSHOT.lock().unwrap(); lock.clone() } + + fn vm_disk_mirror_start( + &mut self, + id: String, + destination_path: PathBuf, + ) -> result::Result<(), VmError> { + self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.mirror_disk(&id, &destination_path), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::DiskMirrorStart), + } + } } const CPU_MANAGER_SNAPSHOT_ID: &str = "cpu-manager"; diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index d62173e5fa..1999f909c8 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -19,6 +19,7 @@ use std::mem::size_of; use std::num::Wrapping; use std::ops::Deref; use std::os::unix::net::UnixStream; +use std::path::Path; use std::sync::{Arc, Mutex}; #[cfg(not(target_arch = "riscv64"))] use std::time::Instant; @@ -34,6 +35,7 @@ use arch::x86_64::MAX_SUPPORTED_CPUS_LEGACY; #[cfg(feature = "tdx")] use arch::x86_64::tdx::TdvfSection; use 
arch::{EntryPoint, NumaNode, NumaNodes, get_host_cpu_phys_bits}; +use block::mirror::MirrorStatus; use devices::AcpiNotificationFlags; #[cfg(target_arch = "aarch64")] use devices::interrupt_controller; @@ -269,6 +271,9 @@ pub enum Error { #[error("Failed resizing a disk image")] ResizeDisk, + #[error("Failed to start disk mirror")] + DiskMirrorStart, + #[error("Cannot activate virtio devices")] ActivateVirtioDevices(#[source] DeviceManagerError), @@ -3301,6 +3306,24 @@ impl Vm { .map_err(Error::ErrorNmi); } + pub fn mirror_disk(&self, id: &str, dest_path: &Path) -> Result<()> { + self.device_manager + .lock() + .unwrap() + .mirror_disk(id, dest_path) + .map_err(Error::DeviceManager)?; + + Ok(()) + } + + pub fn mirror_disk_status(&self, id: &str) -> Result { + self.device_manager + .lock() + .unwrap() + .mirror_disk_status(id) + .map_err(Error::DeviceManager) + } + /// Calls [`DeviceManager::post_migration_announce`]. pub fn post_migration_announce(&self) { self.device_manager From 780a24f844bd2f40202adacbd612183f1533899e Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 30 Apr 2026 16:08:43 +0200 Subject: [PATCH 15/33] vmm: add vm.disk-mirror-status REST endpoint Expose the disk mirror status operation as a REST entrypoint so operators can poll progress and detect terminal phases. The endpoint returns the current phase, copied bytes, total bytes, and a failure reason when the mirror is in the failed phase. The PutHandler maps unknown disk id and inactive mirror to 404 so management layers can distinguish operator errors from server faults. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- fuzz/fuzz_targets/http_api.rs | 4 ++ vmm/src/api/http/http_endpoint.rs | 33 ++++++++++- vmm/src/api/http/mod.rs | 10 +++- vmm/src/api/mod.rs | 68 +++++++++++++++++++++++ vmm/src/api/openapi/cloud-hypervisor.yaml | 55 ++++++++++++++++++ vmm/src/lib.rs | 19 ++++++- vmm/src/vm.rs | 3 + 7 files changed, 184 insertions(+), 8 deletions(-) diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index 033aff80f3..96129f48fe 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -117,6 +117,10 @@ impl RequestHandler for StubApiRequestHandler { Ok(()) } + fn vm_disk_mirror_status(&mut self, _: String) -> Result>, VmError> { + Ok(None) + } + #[cfg(target_arch = "x86_64")] fn vm_coredump(&mut self, _: &str) -> Result<(), VmError> { Ok(()) diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 9069533196..65574af44f 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -49,9 +49,10 @@ use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCancelMigration, VmConfig, VmCounters, VmDelete, VmDiskMirrorStart, VmDiskMirrorStartData, - VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, VmPowerButton, VmReboot, - VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, VmResizeDisk, - VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmDiskMirrorStatus, VmDiskMirrorStatusData, VmMigrationProgress, VmNmi, VmPause, + VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, + VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, + VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -548,6 +549,32 @@ 
impl PutHandler for VmDiskMirrorStart { impl GetHandler for VmDiskMirrorStart {} +impl PutHandler for VmDiskMirrorStatus { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + _files: Vec, + ) -> Result, HttpError> { + let body = body.as_ref().ok_or(HttpError::BadRequest)?; + let data: VmDiskMirrorStatusData = serde_json::from_slice(body.raw())?; + + self.send(api_notifier, api_sender, data) + .map_err(|e| match &e { + ApiError::VmDiskMirrorStatus(VmError::DeviceManager( + DeviceManagerError::UnknownDeviceId(_), + )) => HttpError::NotFound, + ApiError::VmDiskMirrorStatus(VmError::DeviceManager( + DeviceManagerError::BlockMirrorNotActive(_), + )) => HttpError::NotFound, + _ => HttpError::ApiError(e), + }) + } +} + +impl GetHandler for VmDiskMirrorStatus {} + impl PutHandler for VmResize { fn handle_request( &'static self, diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index e1ed29ea54..e21f630d4c 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -30,9 +30,9 @@ use crate::api::VmCoredump; use crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCancelMigration, VmCounters, - VmDelete, VmDiskMirrorStart, VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, - VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, - VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmDelete, VmDiskMirrorStart, VmDiskMirrorStatus, VmMigrationProgress, VmNmi, VmPause, + VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, + VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; use crate::seccomp_filters::{Thread, get_seccomp_filter}; @@ -238,6 +238,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { 
endpoint!("/vm.disk-mirror-start"), Box::new(VmActionHandler::new(&VmDiskMirrorStart)), ); + r.routes.insert( + endpoint!("/vm.disk-mirror-status"), + Box::new(VmActionHandler::new(&VmDiskMirrorStatus)), + ); r.routes.insert(endpoint!("/vm.info"), Box::new(VmInfo {})); r.routes.insert( endpoint!("/vm.pause"), diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 471cfbb34c..2a87cc25e2 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -40,6 +40,7 @@ use std::str::FromStr; use std::sync::mpsc::{RecvError, SendError, Sender, channel}; use std::time::Duration; +use block::mirror::{MirrorPhase, MirrorStatus}; use log::{info, trace}; use micro_http::Body; use option_parser::{OptionParser, OptionParserError, Toggle}; @@ -153,6 +154,9 @@ pub enum ApiError { #[error("Error starting disk mirror")] VmDiskMirrorStart(#[source] VmError), + #[error("Error reading disk mirror state")] + VmDiskMirrorStatus(#[source] VmError), + /// The memory zone could not be resized. #[error("The memory zone could not be resized")] VmResizeZone(#[source] VmError), @@ -245,6 +249,39 @@ pub struct VmDiskMirrorStartData { pub destination_path: PathBuf, } +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct VmDiskMirrorStatusData { + pub id: String, +} + +#[derive(Clone, Debug, Serialize)] +pub struct VmDiskMirrorStatusResponse { + pub phase: String, // "running" | "ready" | "completing" | "completed" | "cancelling" | "failed" + pub copied_bytes: u64, + pub total_bytes: u64, + #[serde(skip_serializing_if = "Option::is_none")] + pub failure: Option, +} + +impl From for VmDiskMirrorStatusResponse { + fn from(s: MirrorStatus) -> Self { + let (phase, failure) = match s.phase { + MirrorPhase::Running => ("running".to_owned(), None), + MirrorPhase::Ready => ("ready".to_owned(), None), + MirrorPhase::Cancelling => ("cancelling".to_owned(), None), + MirrorPhase::Failed(reason) => ("failed".to_owned(), Some(reason)), + MirrorPhase::Completing => ("completing".to_owned(), None), + 
MirrorPhase::Completed => ("completed".to_owned(), None), + }; + Self { + phase, + copied_bytes: s.copied_bytes, + total_bytes: s.total_bytes, + failure, + } + } +} + #[derive(Clone, Deserialize, Serialize)] pub struct VmmPingResponse { pub build_version: String, @@ -768,6 +805,8 @@ pub trait RequestHandler { destination_path: PathBuf, ) -> Result<(), VmError>; + fn vm_disk_mirror_status(&mut self, id: String) -> Result>, VmError>; + fn vm_add_device(&mut self, device_cfg: DeviceConfig) -> Result>, VmError>; fn vm_add_user_device( @@ -1424,6 +1463,35 @@ impl ApiAction for VmDiskMirrorStart { } } +pub struct VmDiskMirrorStatus; +impl ApiAction for VmDiskMirrorStatus { + type RequestBody = VmDiskMirrorStatusData; + type ResponseBody = Option; + + fn request(&self, data: Self::RequestBody, response_sender: Sender) -> ApiRequest { + Box::new(move |vmm| { + let response = vmm + .vm_disk_mirror_status(data.id) + .map_err(ApiError::VmDiskMirrorStatus) + .map(ApiResponsePayload::VmAction); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmInfo; impl ApiAction for VmInfo { diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 5c5dc5192b..4445ffb082 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -530,6 +530,28 @@ paths: 500: description: Disk mirroring could not be started. + /vm.disk-mirror-status: + put: + summary: Query the status of a disk mirror + requestBody: + description: The identifier of the mirrored disk + content: + application/json: + schema: + $ref: "#/components/schemas/VmDiskMirrorStatusData" + required: true + responses: + 200: + description: The current status of the disk mirror. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/VmDiskMirrorStatusResponse" + 404: + description: No disk with the given identifier was found, or no mirror is active for it. + 500: + description: The disk mirror status could not be retrieved. + components: schemas: VmmPingResponse: @@ -1595,3 +1617,36 @@ components: type: string destination_path: type: string + + VmDiskMirrorStatusData: + required: + - id + type: object + properties: + id: + type: string + + VmDiskMirrorStatusResponse: + required: + - phase + - copied_bytes + - total_bytes + type: object + properties: + phase: + type: string + enum: + - running + - ready + - completing + - completed + - cancelling + - failed + copied_bytes: + type: integer + format: int64 + total_bytes: + type: integer + format: int64 + failure: + type: string diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index 0625a1bcd7..cd8b875ffc 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -62,8 +62,8 @@ use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; use crate::api::{ - ApiRequest, ApiResponse, RequestHandler, TimeoutStrategy, VmInfoResponse, - VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, + ApiRequest, ApiResponse, RequestHandler, TimeoutStrategy, VmDiskMirrorStatusResponse, + VmInfoResponse, VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, }; use crate::config::{MemoryRestoreMode, RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -3521,6 +3521,21 @@ impl RequestHandler for Vmm { MaybeVmOwnership::None => Err(VmError::DiskMirrorStart), } } + + fn vm_disk_mirror_status(&mut self, id: String) -> result::Result>, VmError> { + self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + + match self.vm { + MaybeVmOwnership::Vmm(ref vm) => { + let status = vm.mirror_disk_status(&id)?; + let response: VmDiskMirrorStatusResponse = status.into(); + let json = serde_json::to_vec(&response).map_err(|_| 
VmError::DiskMirrorStatus)?; + Ok(Some(json)) + } + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::DiskMirrorStatus), + } + } } const CPU_MANAGER_SNAPSHOT_ID: &str = "cpu-manager"; diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 1999f909c8..5348c93676 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -274,6 +274,9 @@ pub enum Error { #[error("Failed to start disk mirror")] DiskMirrorStart, + #[error("Failed to read disk mirror state")] + DiskMirrorStatus, + #[error("Cannot activate virtio devices")] ActivateVirtioDevices(#[source] DeviceManagerError), From 0db8190b6614739e5102dec02dbe2f57f988a85e Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 30 Apr 2026 16:52:21 +0200 Subject: [PATCH 16/33] block: implement MirroringAsyncIo::submit_batch_requests The virtio-blk queue worker calls submit_batch_requests unconditionally on the disk image, ignoring batch_requests_enabled. The previous stub panicked, which crashed the VM as soon as a mirrored disk processed a batched read or write. Dispatch In and Out to the existing read_vectored and write_vectored methods, which already fan out to source and destination. Other request types do not reach this path under the current request pipeline. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index e5036773ee..7cd797180b 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -22,11 +22,11 @@ use log::warn; use vmm_sys_util::epoll; use vmm_sys_util::eventfd::EventFd; -use crate::BatchRequest; use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult}; use crate::disk_file::AsyncFullDiskFile; use crate::error::BlockResult; use crate::qcow_common::AlignedBuf; +use crate::{BatchRequest, RequestType}; /// Block size for the copy worker, in which it copies data from /// source to destination and holds the range lock. @@ -441,11 +441,27 @@ impl AsyncIo for MirroringAsyncIo { } fn batch_requests_enabled(&self) -> bool { - false - } - - fn submit_batch_requests(&mut self, _batch_request: &[BatchRequest]) -> AsyncIoResult<()> { - unimplemented!("Batch requests are not supported in MirroringAsyncIo") + true + } + + fn submit_batch_requests(&mut self, batch_request: &[BatchRequest]) -> AsyncIoResult<()> { + for req in batch_request { + let result = match req.request_type { + RequestType::In => self.read_vectored(req.offset, &req.iovecs, req.user_data), + RequestType::Out => self.write_vectored(req.offset, &req.iovecs, req.user_data), + // Only In and Out are batched, see request.rs. + _ => unreachable!("Unexpected batch request type: {:?}", req.request_type), + }; + + // Push partial batch error to completions, vectored op has not + // pushed it to the inflight_completions queue. 
+ if result.is_err() { + self.inflight_completions + .push_back((req.user_data, -libc::EIO)); + let _ = self.source.notifier().write(1); + } + } + Ok(()) } fn alignment(&self) -> u64 { From eed7545848d5b08e69946bfc22d2ec883dbfe224 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Sun, 3 May 2026 13:00:02 +0200 Subject: [PATCH 17/33] block: add AsyncIo::has_inflight_requests Complete and cancel swap a virtqueue worker's disk_image and need to ensure no inflight request and completion pairs are pending. Expose whether the implementation still holds request pairings the worker has to wait on before the swap. This is only relevant for the `MirroringAsyncIo`, as it needs to serialize the requests and completions to two children `AsyncIo`s. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/async_io.rs | 8 ++++++++ block/src/mirror.rs | 4 ++++ 2 files changed, 12 insertions(+) diff --git a/block/src/async_io.rs b/block/src/async_io.rs index bbdd77779d..52323bb32e 100644 --- a/block/src/async_io.rs +++ b/block/src/async_io.rs @@ -109,4 +109,12 @@ pub trait AsyncIo: Send { fn alignment(&self) -> u64 { SECTOR_SIZE } + + /// Returns true when this implementation has request pairings in flight + /// that have not yet been acked to the guest. Only the mirroring + /// implementation tracks such pairings, plain backends always return + /// false. + fn has_inflight_requests(&self) -> bool { + false + } } diff --git a/block/src/mirror.rs b/block/src/mirror.rs index 7cd797180b..91fbe8208f 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -468,6 +468,10 @@ impl AsyncIo for MirroringAsyncIo { // Stricter alignment wins. Same iovec goes to both backends. self.source.alignment().max(self.destination.alignment()) } + + fn has_inflight_requests(&self) -> bool { + !self.inflight_completions.is_empty() + } } /// Owns the copy worker thread's [`JoinHandle`]. 
The thread is joined From 7f84688a84500f57076e3dd3eb893db29c3cf7c1 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Sun, 3 May 2026 14:02:35 +0200 Subject: [PATCH 18/33] virtio-devices: drain mirror wrapper before disk_image swap A virtqueue worker may still hold source and destination write-pairs when complete or cancel arrives. Swapping the wrapper out at that moment would orphan the pending completions, leaving the guest waiting on writes that will never be acked. Stage the incoming BlockQueueCommand in pending_block_queue_command and apply it only once neither the handler nor the backend reports in-flight requests. The submit path is gated for the duration of the drain, otherwise sustained guest writes would keep the in-flight count from ever reaching zero. After applying the command the worker sends the acknowledgement and processes the avail ring directly: while the command was pending, QUEUE_AVAIL_EVENT handling consumed the guest's kicks without submitting, and the guest will not kick again for descriptors it already queued. The same protocol is reused by the upcoming complete and cancel endpoints. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- virtio-devices/src/block.rs | 77 +++++++++++++++++++++++++++++++------ 1 file changed, 66 insertions(+), 11 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 829f8b67b9..35f113bcd1 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -223,6 +223,9 @@ pub struct BlockQueueCommandReceiver { /// Wakes the worker after `cmd` is filled. Fires `BLOCK_COMMAND_EVENT` /// on the worker's epoll set. pub evt: EventFd, + /// Command taken from `cmd` and held until `disk_image` reports no + /// in-flight requests. Owned and accessed only by the worker. 
+ pending_block_queue_command: Option, } struct BlockQueueCommandSender { @@ -366,7 +369,15 @@ impl BlockEpollHandler { if draining_active_requests.load(Ordering::SeqCst) { return Ok(()); } - + // Defer submitting new descriptors while a mirror swap is draining. + // The avail ring is processed directly once the swap has been applied. + if self + .cmd_receiver + .as_ref() + .is_some_and(|m| m.pending_block_queue_command.is_some()) + { + return Ok(()); + } let queue = &mut self.queue; let queue_size = queue.size(); let mut batch_requests = Vec::new(); @@ -622,6 +633,47 @@ impl BlockEpollHandler { Ok(()) } + /// Applies a pending mirror update if one is staged and the current + /// `disk_image` has no in-flight requests. Returns `Ok(())` without + /// changes when either condition is not met. The next completion + /// event will trigger another attempt. + fn try_apply_pending_block_queue_command( + &mut self, + helper: &mut EpollHelper, + ) -> result::Result<(), EpollHelperError> { + // If any disk requests are in flight, we can't apply the pending command. + if !self.inflight_requests.is_empty() || self.disk_image.has_inflight_requests() { + return Ok(()); + } + + let Some(cmd_receiver) = self.cmd_receiver.as_mut() else { + return Ok(()); + }; + + let Some(command) = cmd_receiver.pending_block_queue_command.take() else { + return Ok(()); + }; + + let op_id = command.op_id; + let ack = command.ack.clone(); + + let result = Self::apply_block_queue_command(&mut self.disk_image, command, helper); + + let _ = ack.send(BlockQueueAck { op_id, result }); + + // While the command was pending, QUEUE_AVAIL_EVENT handling consumed the + // guest's kicks without submitting (see the guard in process_queue_submit). + // The guest won't kick again for descriptors it already queued, so process + // the avail ring now, whether the command succeeded or failed, or those + // requests stall until unrelated guest I/O arrives.
+ let rate_limit_reached = self.rate_limiter.as_ref().is_some_and(|r| r.is_blocked()); + if !rate_limit_reached { + self.process_queue_submit_and_signal()?; + } + + Ok(()) + } + #[inline] fn find_inflight_request(&mut self, completed_head: u16) -> Result { // This loop neatly handles the fast path where the completions are @@ -885,6 +937,7 @@ impl EpollHelperHandler for BlockEpollHandler { if !rate_limit_reached { self.process_queue_submit_and_signal()?; } + self.try_apply_pending_block_queue_command(helper)?; } RATE_LIMITER_EVENT => { if let Some(rate_limiter) = &mut self.rate_limiter { @@ -904,22 +957,23 @@ impl EpollHelperHandler for BlockEpollHandler { } } BLOCK_COMMAND_EVENT => { - // Apply a staged command: swap disk_image and re-register notifiers. - if let Some(m) = self.cmd_receiver.as_mut() { - m.evt.read().map_err(|e| { + if let Some(cmd_receiver) = self.cmd_receiver.as_mut() { + cmd_receiver.evt.read().map_err(|e| { EpollHelperError::HandleEvent(anyhow!( "Failed to read block command event: {e:?}" )) })?; - if let Some(command) = m.cmd.lock().unwrap().take() { - Self::apply_block_queue_command(&mut self.disk_image, command, helper) - .map_err(|e| { - EpollHelperError::HandleEvent(anyhow!( - "Failed to apply block queue command: {e}" - )) - })?; + if let Some(update) = cmd_receiver.cmd.lock().unwrap().take() + && let Some(stale) = + cmd_receiver.pending_block_queue_command.replace(update) + { + warn!( + "Replacing pending block queue command {:?} before it was applied", + stale.kind + ); } } + self.try_apply_pending_block_queue_command(helper)?; } _ => { return Err(EpollHelperError::HandleEvent(anyhow!( @@ -1571,6 +1625,7 @@ impl VirtioDevice for Block { let cmd_receiver = BlockQueueCommandReceiver { cmd: Arc::clone(&queue_command), evt: mirror_handler_evt, + pending_block_queue_command: None, }; self.queue_cmd_senders.push(BlockQueueCommandSender { cmd: queue_command, From 0ee00f1835dcc4f03d9f1fd98c51758bfe7ade95 Mon Sep 17 00:00:00 2001 From: Leander 
Kohler Date: Sun, 3 May 2026 14:09:32 +0200 Subject: [PATCH 19/33] virtio-devices, block: add Block::complete_mirror After the copy worker reports the mirror ready, the operator needs to switch the device to the destination disk so the source can be detached. Completion is allowed from MirrorPhase `Ready`, or from `Completing` as a retry. A plain destination AsyncIo per virtqueue is pre-built before any command is sent, so a create_async_io failure leaves the device unchanged and the operator can retry. The state transitions to `Completing` before the first CompleteToDestination command goes out: from that point a queue may already write to the destination only, so the source stops being a safe fallback and cancel is no longer allowed. The drain protocol from the previous commit makes each worker finish its in-flight pairings before swapping. Failures of the command send or the acknowledgement wait panic instead of returning an error. A partial completion splits the queues between destination-only writers and source readers, which can serve stale reads to the guest. There is no revert that does not lose acknowledged writes, so we prefer the panic. After all acknowledgements the state becomes `Completed`, the handle is dropped and the control plane disk_image is replaced by the destination. Two BlockErrorKind variants cover the operator-visible preconditions. Device manager and REST plumbing follow. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/error.rs | 6 +++ virtio-devices/src/block.rs | 81 +++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/block/src/error.rs b/block/src/error.rs index 98103f91c4..865eb126a9 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -44,6 +44,10 @@ pub enum BlockErrorKind { Overflow, /// The file already exists, when disk creation was requested. AlreadyExists, + /// A mirror operation was requested but no mirror is active for the device. 
+ MirrorNotActive, + /// A completion was requested but the mirror has not reached the ready phase. + MirrorNotReady, /// A mirror swap was requested but was unsuccessful. MirrorSwap, } @@ -59,6 +63,8 @@ impl Display for BlockErrorKind { Self::NotFound => write!(f, "Not found"), Self::Overflow => write!(f, "Overflow"), Self::AlreadyExists => write!(f, "Already exists"), + Self::MirrorNotActive => write!(f, "No active mirror for the device"), + Self::MirrorNotReady => write!(f, "Mirror is not yet ready, cannot complete"), Self::MirrorSwap => write!(f, "Failed to swap AsyncIO in virtqueue worker for mirror"), } } diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 35f113bcd1..5f8dd35f10 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -1440,6 +1440,87 @@ impl Block { Ok(()) } + /// Switch the device's mirroring wrapper to the destination disk. + /// + /// Each virtqueue worker swaps its [`MirroringAsyncIo`] for a plain + /// [`AsyncIo`] on the destination through the same slot and eventfd + /// mechanism used to install the mirror. After this call the source + /// disk is no longer used by the VM and the operator can detach or + /// remove it. + /// + /// Returns [`BlockErrorKind::MirrorNotActive`] when no mirror is + /// active for the device, and [`BlockErrorKind::MirrorNotReady`] when + /// the copy worker has not yet reported the ready phase or the mirror + /// has since failed. Both errors return before any queue command is + /// sent, so the mirror handle is left in place and the caller can poll + /// the state and retry. + /// + /// # Panics + /// + /// Panics if a queue command cannot be sent or acknowledged after the + /// switch-over has started. At that point some queues may already write + /// to the destination only, and there is no revert that keeps + /// acknowledged writes, so aborting is preferred over data loss. 
+ pub fn complete_mirror(&mut self) -> BlockResult { + let op_id = self.next_mirror_op_id(); + let (ack_tx, ack_rx) = mpsc::channel(); + + let handle = self + .mirror_handle + .as_ref() + .ok_or_else(|| BlockError::from_kind(BlockErrorKind::MirrorNotActive))?; + + // Only allow completing when the copy worker is in the ready phase or as a retry. + if !matches!( + handle.state.phase(), + MirrorPhase::Ready | MirrorPhase::Completing + ) { + return Err(BlockError::from_kind(BlockErrorKind::MirrorNotReady)); + } + + let mut commands = Vec::with_capacity(self.queue_cmd_senders.len()); + for sender in &self.queue_cmd_senders { + let async_io = handle + .destination + .create_async_io(sender.queue_size as u32)?; + commands.push(( + sender, + BlockQueueCommand::complete_to_destination(op_id, async_io, ack_tx.clone()), + )); + } + + drop(ack_tx); + + // A concurrent destination failure may have moved the mirror to + // Failed since the phase guard above. Confirm Completing took effect + // before sending any command, otherwise we would swap the device + // onto a failed mirror. + handle.state.transition_to_phase(MirrorPhase::Completing); + if handle.state.phase() != MirrorPhase::Completing { + return Err(BlockError::from_kind(BlockErrorKind::MirrorNotReady)); + } + + // Once the first command is sent a queue may write to the destination + // only, so a partial switch-over has no safe revert. We panic rather + // than risk losing acknowledged writes. + Self::send_mirror_queue_commands(commands).expect("mirror queue commands sent"); + Self::wait_for_mirror_queue_command_acks(op_id, &ack_rx, self.queue_cmd_senders.len()) + .expect("mirror queue command acks received"); + handle.state.transition_to_phase(MirrorPhase::Completed); + + // Pre-build succeeded, own the destination now and commit the completion. 
+ let BlockMirrorHandle { + destination, + destination_path, + copy_worker: _, + state: _, + } = self.mirror_handle.take().unwrap(); + + self.disk_image = destination; + self.disk_path = destination_path.clone(); + Ok(destination_path) + } + fn next_mirror_op_id(&mut self) -> u64 { let op_id = self.next_queue_cmd_op_id; self.next_queue_cmd_op_id = self.next_queue_cmd_op_id.wrapping_add(1); From 6b8d9908637e0a76171d005c1d27942cca897eac Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Sun, 3 May 2026 14:35:51 +0200 Subject: [PATCH 20/33] vmm: add vm.disk-mirror-complete REST endpoint Operators trigger the complete stage of blockdev-mirroring through this entrypoint. The endpoint switches the device to the destination disk after the copy worker reports the mirror ready. The PutHandler maps device manager errors to HTTP status codes so management layers can distinguish operator errors (404 for unknown disk or no active mirror, 400 for not-yet-ready) from server faults. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- fuzz/fuzz_targets/http_api.rs | 4 +++ vmm/src/api/http/http_endpoint.rs | 41 ++++++++++++++++++++--- vmm/src/api/http/mod.rs | 11 ++++-- vmm/src/api/mod.rs | 39 +++++++++++++++++++++ vmm/src/api/openapi/cloud-hypervisor.yaml | 28 ++++++++++++++++ vmm/src/device_manager.rs | 37 ++++++++++++++++++++ vmm/src/lib.rs | 10 ++++++ vmm/src/vm.rs | 12 +++++++ 8 files changed, 174 insertions(+), 8 deletions(-) diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index 96129f48fe..cf7728147f 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -121,6 +121,10 @@ impl RequestHandler for StubApiRequestHandler { Ok(None) } + fn vm_disk_mirror_complete(&mut self, _: String) -> Result<(), VmError> { + Ok(()) + } + #[cfg(target_arch = "x86_64")] fn vm_coredump(&mut self, _: &str) -> Result<(), VmError> { Ok(()) diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs 
index 65574af44f..81d6ffad9e 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -37,6 +37,7 @@ use std::fs::File; use std::sync::mpsc::Sender; +use block::error::BlockErrorKind; use log::info; use micro_http::{Body, Method, Request, Response, StatusCode, Version}; use vmm_sys_util::eventfd::EventFd; @@ -48,11 +49,11 @@ use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, - VmCancelMigration, VmConfig, VmCounters, VmDelete, VmDiskMirrorStart, VmDiskMirrorStartData, - VmDiskMirrorStatus, VmDiskMirrorStatusData, VmMigrationProgress, VmNmi, VmPause, - VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, - VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, - VmShutdown, VmSnapshot, + VmCancelMigration, VmConfig, VmCounters, VmDelete, VmDiskMirrorComplete, + VmDiskMirrorCompleteData, VmDiskMirrorStart, VmDiskMirrorStartData, VmDiskMirrorStatus, + VmDiskMirrorStatusData, VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, + VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, + VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -575,6 +576,36 @@ impl PutHandler for VmDiskMirrorStatus { impl GetHandler for VmDiskMirrorStatus {} +impl PutHandler for VmDiskMirrorComplete { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + _files: Vec, + ) -> Result, HttpError> { + let body = body.as_ref().ok_or(HttpError::BadRequest)?; + let data: VmDiskMirrorCompleteData = serde_json::from_slice(body.raw())?; + + self.send(api_notifier, api_sender, data) + 
.map_err(|e| match &e { + ApiError::VmDiskMirrorComplete(VmError::DeviceManager( + DeviceManagerError::UnknownDeviceId(_), + )) => HttpError::NotFound, + ApiError::VmDiskMirrorComplete(VmError::DeviceManager( + DeviceManagerError::BlockMirrorComplete(_, block_err), + )) => match block_err.kind() { + BlockErrorKind::MirrorNotActive => HttpError::NotFound, + BlockErrorKind::MirrorNotReady => HttpError::BadRequest, + _ => HttpError::ApiError(e), + }, + _ => HttpError::ApiError(e), + }) + } +} + +impl GetHandler for VmDiskMirrorComplete {} + impl PutHandler for VmResize { fn handle_request( &'static self, diff --git a/vmm/src/api/http/mod.rs b/vmm/src/api/http/mod.rs index e21f630d4c..ed28b2f599 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -30,9 +30,10 @@ use crate::api::VmCoredump; use crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCancelMigration, VmCounters, - VmDelete, VmDiskMirrorStart, VmDiskMirrorStatus, VmMigrationProgress, VmNmi, VmPause, - VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmRemoveDevice, VmResize, - VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmDelete, VmDiskMirrorComplete, VmDiskMirrorStart, VmDiskMirrorStatus, VmMigrationProgress, + VmNmi, VmPause, VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, + VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, + VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; use crate::seccomp_filters::{Thread, get_seccomp_filter}; @@ -242,6 +243,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { endpoint!("/vm.disk-mirror-status"), Box::new(VmActionHandler::new(&VmDiskMirrorStatus)), ); + r.routes.insert( + endpoint!("/vm.disk-mirror-complete"), + Box::new(VmActionHandler::new(&VmDiskMirrorComplete)), + ); 
r.routes.insert(endpoint!("/vm.info"), Box::new(VmInfo {})); r.routes.insert( endpoint!("/vm.pause"), diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 2a87cc25e2..25aef2cdf3 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -157,6 +157,9 @@ pub enum ApiError { #[error("Error reading disk mirror state")] VmDiskMirrorStatus(#[source] VmError), + #[error("Error completing disk mirror")] + VmDiskMirrorComplete(#[source] VmError), + /// The memory zone could not be resized. #[error("The memory zone could not be resized")] VmResizeZone(#[source] VmError), @@ -254,6 +257,11 @@ pub struct VmDiskMirrorStatusData { pub id: String, } +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct VmDiskMirrorCompleteData { + pub id: String, +} + #[derive(Clone, Debug, Serialize)] pub struct VmDiskMirrorStatusResponse { pub phase: String, // "running" | "ready" | "completing" | "completed" | "cancelling" | "failed" @@ -806,6 +814,7 @@ pub trait RequestHandler { ) -> Result<(), VmError>; fn vm_disk_mirror_status(&mut self, id: String) -> Result>, VmError>; + fn vm_disk_mirror_complete(&mut self, id: String) -> Result<(), VmError>; fn vm_add_device(&mut self, device_cfg: DeviceConfig) -> Result>, VmError>; @@ -1492,6 +1501,36 @@ impl ApiAction for VmDiskMirrorStatus { } } +pub struct VmDiskMirrorComplete; + +impl ApiAction for VmDiskMirrorComplete { + type RequestBody = VmDiskMirrorCompleteData; + type ResponseBody = Option; + + fn request(&self, data: Self::RequestBody, response_sender: Sender) -> ApiRequest { + Box::new(move |vmm| { + let response = vmm + .vm_disk_mirror_complete(data.id) + .map_err(ApiError::VmDiskMirrorComplete) + .map(|_| ApiResponsePayload::Empty); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmInfo; impl 
ApiAction for VmInfo { diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 4445ffb082..ed0574f232 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -552,6 +552,26 @@ paths: 500: description: The disk mirror status could not be retrieved. + /vm.disk-mirror-complete: + put: + summary: Complete a disk mirror and switch to the destination + requestBody: + description: The identifier of the mirrored disk + content: + application/json: + schema: + $ref: "#/components/schemas/VmDiskMirrorCompleteData" + required: true + responses: + 204: + description: The disk mirror was completed and the device now uses the destination. + 400: + description: The mirror is not ready to complete. + 404: + description: No disk with the given identifier was found, or no mirror is active for it. + 500: + description: The disk mirror could not be completed. + components: schemas: VmmPingResponse: @@ -1650,3 +1670,11 @@ components: format: int64 failure: type: string + + VmDiskMirrorCompleteData: + required: + - id + type: object + properties: + id: + type: string diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 2c1299313e..6564164405 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -696,6 +696,9 @@ pub enum DeviceManagerError { "The block mirroring destination path already exists for the disk with identifier: {0} at path: {1}" )] BlockMirrorDestAlreadyExists(String, String), + + #[error("Failed to complete block mirror for disk {0}: {1}")] + BlockMirrorComplete(String, #[source] BlockError), } pub type DeviceManagerResult = result::Result; @@ -5425,6 +5428,40 @@ impl DeviceManager { Err(DeviceManagerError::UnknownDeviceId(device_id.to_string())) } + /// Completes the active block mirror for the disk identified by `device_id`, + /// switching over to the destination disk. 
Errors if no disk with that + /// identifier is attached, if no mirror is active, or if the mirror is not + /// yet ready. + pub fn mirror_disk_complete(&self, device_id: &str) -> DeviceManagerResult<()> { + for dev in &self.block_devices { + let mut disk = dev.lock().unwrap(); + if disk.id() == device_id { + let new_path = disk.complete_mirror().map_err(|e| { + DeviceManagerError::BlockMirrorComplete(device_id.to_string(), e) + })?; + + // Repoint the config entry so a rebuild reopens the destination. + if let Some(cfg) = self + .config + .lock() + .unwrap() + .disks + .as_mut() + .and_then(|disks| { + disks + .iter_mut() + .find(|d| d.pci_common.id.as_deref() == Some(device_id)) + }) + { + cfg.path = Some(new_path); + } + + return Ok(()); + } + } + Err(DeviceManagerError::UnknownDeviceId(device_id.to_string())) + } + /// Helps the environment converge quickly after a live migration by /// prompting devices to advertise the VM from its new host. /// diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index cd8b875ffc..e71953da65 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3536,6 +3536,16 @@ impl RequestHandler for Vmm { MaybeVmOwnership::None => Err(VmError::DiskMirrorStatus), } } + + fn vm_disk_mirror_complete(&mut self, id: String) -> result::Result<(), VmError> { + self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.mirror_disk_complete(&id), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::DiskMirrorComplete), + } + } } const CPU_MANAGER_SNAPSHOT_ID: &str = "cpu-manager"; diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 5348c93676..8ac33bb586 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -277,6 +277,9 @@ pub enum Error { #[error("Failed to read disk mirror state")] DiskMirrorStatus, + #[error("Failed to complete disk mirror")] + DiskMirrorComplete, + #[error("Cannot activate virtio devices")] ActivateVirtioDevices(#[source] 
DeviceManagerError), @@ -3327,6 +3330,15 @@ impl Vm { .map_err(Error::DeviceManager) } + pub fn mirror_disk_complete(&self, id: &str) -> Result<()> { + self.device_manager + .lock() + .unwrap() + .mirror_disk_complete(id) + .map_err(Error::DeviceManager)?; + Ok(()) + } + /// Calls [`DeviceManager::post_migration_announce`]. pub fn post_migration_announce(&self) { self.device_manager From 6af76b53437b6059a072ccfe9b6fd2b73561fc23 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 10 Jun 2026 14:06:55 +0200 Subject: [PATCH 21/33] virtio-devices, block: add Block::cancel_mirror Cancel reverts every virtqueue worker to a plain AsyncIo on the source disk, transitions the mirror to Cancelling and drops the handle, joining the copy worker and releasing the destination. The copy worker now exits before the next block once the phase is terminal instead of copying the remainder. Cancel is rejected once a completion was attempted: a queue may already write to the destination only, so reverting would lose acknowledged guest writes. A guest-initiated device reset drops an active mirror before VirtioCommon::reset tears down the virtqueue workers. The queues are not reverted on that path: they are dropped by the reset and rebuilt on the next activate. The REST plumbing for cancel comes in a follow-up commit. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/error.rs | 3 +++ block/src/mirror.rs | 5 ++++ virtio-devices/src/block.rs | 54 +++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) diff --git a/block/src/error.rs b/block/src/error.rs index 865eb126a9..6fd62a7c5d 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -50,6 +50,8 @@ pub enum BlockErrorKind { MirrorNotReady, /// A mirror swap was requested but was unsuccessful. MirrorSwap, + /// A mirror completion is already in progress.
+ MirrorCompletionInProgress, } impl Display for BlockErrorKind { @@ -66,6 +68,7 @@ impl Display for BlockErrorKind { Self::MirrorNotActive => write!(f, "No active mirror for the device"), Self::MirrorNotReady => write!(f, "Mirror is not yet ready, cannot complete"), Self::MirrorSwap => write!(f, "Failed to swap AsyncIO in virtqueue worker for mirror"), + Self::MirrorCompletionInProgress => write!(f, "Mirror completion already in progress"), } } } diff --git a/block/src/mirror.rs b/block/src/mirror.rs index 91fbe8208f..8c1ce57b70 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -571,6 +571,11 @@ impl CopyWorker { let mut offset = 0; while offset < total_size { + // Return early on cancellation or failure. + if self.state.phase() != MirrorPhase::Running { + return Ok(()); + } + let length = max_length.min(total_size - offset) as usize; self.copy_block(offset, length)?; offset += length as u64; diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 5f8dd35f10..12821c0fb2 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -1594,6 +1594,52 @@ impl Block { Self::wait_for_mirror_queue_command_acks(op_id, &ack_rx, self.queue_cmd_senders.len()) } + /// Cancel an active mirror and revert the device to the source disk. + /// + /// Transitions the mirror to [`MirrorPhase::Cancelling`] to mark that + /// cancellation has started, reverts every virtqueue worker to a plain + /// [`AsyncIo`] on the source, then drops the handle, which joins the + /// copy worker and releases the destination. + /// + /// Returns [`BlockErrorKind::MirrorNotActive`] when no mirror is active, + /// and [`BlockErrorKind::MirrorCompletionInProgress`] once a completion + /// has been attempted, because a queue may already write to the + /// destination only and reverting would lose acknowledged guest writes. 
+ /// + /// If the revert fails the mirror stays in [`MirrorPhase::Cancelling`] + /// with the handle held, so calling this again retries the revert and + /// finishes the cancellation. + /// + /// Blocks until the copy worker finishes its current block and joins, + /// which can stall on a slow or hung destination. + pub fn cancel_mirror(&mut self) -> BlockResult<()> { + let state = self + .mirror_handle + .as_ref() + .ok_or_else(|| BlockError::from_kind(BlockErrorKind::MirrorNotActive))? + .state + .clone(); + + if !matches!( + state.phase(), + MirrorPhase::Running + | MirrorPhase::Ready + | MirrorPhase::Failed(_) + | MirrorPhase::Cancelling + ) { + return Err(BlockError::from_kind( + BlockErrorKind::MirrorCompletionInProgress, + )); + } + + state.transition_to_phase(MirrorPhase::Cancelling); + self.revert_queues_to_source()?; + + drop(self.mirror_handle.take().unwrap()); + + Ok(()) + } + /// Returns a snapshot of the current mirror progress. pub fn mirror_status(&self) -> Option { self.mirror_handle.as_ref().map(|h| h.state.status()) @@ -1774,6 +1820,14 @@ impl VirtioDevice for Block { } fn reset(&mut self) { + // Let the mirror fail without reverting the queues: reverting rebuilds + // an AsyncIo (io_setup) on this vcpu thread, which seccomp blocks. The + // queues are dropped by common.reset() and rebuilt by activate() anyway. 
+ if let Some(handle) = self.mirror_handle.take() { + handle.state.transition_to_phase(MirrorPhase::Cancelling); + drop(handle); + } + self.common.reset(); self.draining_active_requests.store(false, Ordering::SeqCst); self.active_request_count.store(0, Ordering::SeqCst); From b37eb5d01a271594c5731e00f9041643934c37cd Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 10 Jun 2026 14:09:10 +0200 Subject: [PATCH 22/33] vmm: deny conflicting ops while a disk mirror runs Resizing or snapshotting the disk, removing the device, shutting down, rebooting or deleting the VM and starting a live migration all invalidate an active mirror: the destination silently falls behind or the mirror state is lost, since it is not migratable. Reject these operations while a mirror is active so the operator has to complete or cancel first. DeviceManager::active_block_mirrors lists the active mirrors and backs the new Vm::any_active_block_mirrors check. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- virtio-devices/src/block.rs | 12 ++++++++++++ vmm/src/device_manager.rs | 33 +++++++++++++++++++++++++-------- vmm/src/lib.rs | 30 +++++++++++++++++++++++++++--- vmm/src/vm.rs | 11 +++++++++++ 4 files changed, 75 insertions(+), 11 deletions(-) diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 12821c0fb2..36db1f56cd 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -123,6 +123,8 @@ pub enum Error { ConfigChange(#[source] io::Error), #[error("Disk resize failed")] DiskResize(#[source] BlockError), + #[error("Mirror is currently active")] + MirrorActive, #[error("Failed applying mirror command: {0}")] MirrorSwap(String), } @@ -1330,6 +1332,10 @@ impl Block { return Err(Error::InvalidSize); } + if self.mirror_handle.is_some() { + return Err(Error::MirrorActive); + } + self.disk_image .resize(new_size) .map_err(Error::DiskResize)?; @@ -1919,6 +1925,12 @@ impl Snapshottable for Block { } fn snapshot(&mut self) -> 
std::result::Result { + if self.mirror_handle.is_some() { + return Err(MigratableError::Snapshot(anyhow!( + "Cannot snapshot while mirror is active" + ))); + } + Snapshot::new_from_state(&self.state()) } } diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 6564164405..b3822aaf95 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -691,6 +691,12 @@ pub enum DeviceManagerError { )] BlockMirrorAlreadyActive(String), + /// Cannot perform given action, as the device is currently performing a block mirroring operation. + #[error( + "Failed to perform the requested action for the disk with identifier: {0} as it is currently performing a block mirroring operation" + )] + BlockMirrorActive(String), + /// The block mirroring destination path already exists. #[error( "The block mirroring destination path already exists for the disk with identifier: {0} at path: {1}" @@ -4781,16 +4787,20 @@ impl DeviceManager { // Release advisory locks by dropping all references. // Linux automatically releases all locks of that file if the last open FD is closed. { - let maybe_block_device_index = self + if let Some(index) = self .block_devices .iter() - .enumerate() - .find(|(_, dev)| { - let dev = dev.lock().unwrap(); - dev.id() == id - }) - .map(|(i, _)| i); - if let Some(index) = maybe_block_device_index { + .position(|dev| dev.lock().unwrap().id() == id) + { + // Deny removal of active mirroring block device. + if self.block_devices[index] + .lock() + .unwrap() + .mirror_status() + .is_some() + { + return Err(DeviceManagerError::BlockMirrorActive(id.to_string())); + } let _ = self.block_devices.swap_remove(index); } } @@ -5504,6 +5514,13 @@ impl DeviceManager { MAX_DELAY, ); } + + /// Returns true if there is an active mirror in any of the block devices, false otherwise. 
+ pub fn any_active_block_mirrors(&self) -> bool { + self.block_devices + .iter() + .any(|dev| dev.lock().unwrap().mirror_status().is_some()) + } } /// Starts a thread that periodically performs the post-migration announcements. diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index e71953da65..c132719ce4 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -2625,6 +2625,10 @@ impl RequestHandler for Vmm { fn vm_snapshot(&mut self, destination_url: &str) -> result::Result<(), VmError> { match self.vm { MaybeVmOwnership::Vmm(ref mut vm) => { + if vm.any_active_block_mirrors() { + return Err(VmError::ActiveBlockMirror); + } + // Drain console_info so that FDs are not reused let _ = self.console_info.take(); vm.snapshot() @@ -2721,6 +2725,11 @@ impl RequestHandler for Vmm { MaybeVmOwnership::Migration(_) => return Err(VmError::VmMigrating), MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + + if vm.any_active_block_mirrors() { + return Err(VmError::ActiveBlockMirror); + } + // Drain console_info so that the FDs are not reused let _ = self.console_info.take(); let r = vm.shutdown(); @@ -2742,6 +2751,11 @@ impl RequestHandler for Vmm { MaybeVmOwnership::Migration(_) => return Err(VmError::VmMigrating), MaybeVmOwnership::None => return Err(VmError::VmNotRunning), }; + + if vm.any_active_block_mirrors() { + return Err(VmError::ActiveBlockMirror); + } + let config = vm.get_config(); vm.shutdown()?; self.vm = MaybeVmOwnership::None; @@ -2863,7 +2877,11 @@ impl RequestHandler for Vmm { } match &self.vm { - MaybeVmOwnership::Vmm(_vm) => { + MaybeVmOwnership::Vmm(vm) => { + if vm.any_active_block_mirrors() { + return Err(VmError::ActiveBlockMirror); + } + event!("vm", "deleted"); // If a VM is booted, we first try to shut it down. 
@@ -3369,8 +3387,14 @@ impl RequestHandler for Vmm { .context("Invalid send migration configuration") .map_err(MigratableError::MigrateSend)?; - match self.vm { - MaybeVmOwnership::Vmm(_) => (), + match &self.vm { + MaybeVmOwnership::Vmm(vm) => { + if vm.any_active_block_mirrors() { + return Err(MigratableError::MigrateSend(anyhow!( + "Cannot start migration with active disk mirrors" + ))); + } + } MaybeVmOwnership::Migration(_) => { return Err(MigratableError::MigrateSend(anyhow!( "There is already an ongoing migration" diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 8ac33bb586..7cb02b9dcc 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -280,6 +280,9 @@ pub enum Error { #[error("Failed to complete disk mirror")] DiskMirrorComplete, + #[error("At least one disk mirror is active")] + ActiveBlockMirror, + #[error("Cannot activate virtio devices")] ActivateVirtioDevices(#[source] DeviceManagerError), @@ -3339,6 +3342,14 @@ impl Vm { Ok(()) } + /// Returns true if there is an active mirror in any of the block devices, false otherwise. + pub fn any_active_block_mirrors(&self) -> bool { + self.device_manager + .lock() + .unwrap() + .any_active_block_mirrors() + } + /// Calls [`DeviceManager::post_migration_announce`]. pub fn post_migration_announce(&self) { self.device_manager From d9ce5a1b82407e9589c61b5a884b8c6b182a8741 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 11 Jun 2026 14:59:21 +0200 Subject: [PATCH 23/33] vmm: add vm.disk-mirror-cancel REST endpoint Block::cancel_mirror was only reachable through a guest-initiated device reset. Give the operator a way to abort a mirror and keep the guest on the source disk. Wire the call through the layers: DeviceManager::mirror_disk_cancel resolves the device and maps errors, Vm and the RequestHandler forward the call, and a new VmDiskMirrorCancel action backs the PUT /vm.disk-mirror-cancel endpoint. 
Unknown device ids and inactive mirrors map to 404, a cancel after an attempted completion maps to 400, and revert failures surface as internal errors. A failed cancel keeps the mirror handle and leaves the mirror in the Cancelling phase. Cancel accepts that phase as a retry, so the request can simply be retried. CancelToSource commands are idempotent per virtqueue. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 78 +++++++++++++++++------ fuzz/fuzz_targets/http_api.rs | 4 ++ vmm/src/api/http/http_endpoint.rs | 41 ++++++++++-- vmm/src/api/http/mod.rs | 12 ++-- vmm/src/api/mod.rs | 41 ++++++++++++ vmm/src/api/openapi/cloud-hypervisor.yaml | 28 ++++++++ vmm/src/device_manager.rs | 22 +++++++ vmm/src/lib.rs | 10 +++ vmm/src/vm.rs | 11 ++++ 9 files changed, 217 insertions(+), 30 deletions(-) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index 8c1ce57b70..49b6ec8a57 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -235,6 +235,9 @@ pub struct MirroringAsyncIo { /// does not pay the epoll setup cost. source_waiter: EpollWaiter, dest_waiter: EpollWaiter, + /// Set once this virtqueue worker observes a failure. While true, the + /// virtqueue worker forwards only to the source and ignores the destination. + source_passthrough: bool, } impl MirroringAsyncIo { #[allow(dead_code)] @@ -263,13 +266,16 @@ impl MirroringAsyncIo { inflight_completions: VecDeque::new(), source_waiter, dest_waiter, + source_passthrough: false, })) } /// Flip the mirror to the `Failed` phase. The operator must cancel to /// clean up the destination and the copy worker. fn fail(&mut self, reason: String) { + // Phase fails the mirror globally, passthrough is per worker, so other queues fail independently. self.state.transition_to_phase(MirrorPhase::Failed(reason)); + self.source_passthrough = true; } /// Helper that submits an `AsyncIo` request to both source and destination. 
@@ -293,10 +299,10 @@ impl MirroringAsyncIo { Ok(()) } - /// Block until `user_data`'s source and destination completion arrive, then - /// queue the single guest-visible `(user_data, src_result)`. Other - /// completions seen while waiting (e.g. an async read finishing) are stashed - /// for later delivery. + /// Block until `user_data`'s source (and, unless already degraded to + /// passthrough, destination) completion arrives, then queue the single + /// guest-visible `(user_data, src_result)`. Other completions seen while + /// waiting (e.g. an async read finishing) are stashed for later delivery. fn wait_for_completions(&mut self, user_data: u64) -> io::Result<()> { let src_result = Self::await_completion( &mut self.source, @@ -305,22 +311,24 @@ impl MirroringAsyncIo { user_data, )?; - match Self::await_completion( - &mut self.destination, - &self.dest_waiter, - &mut self.inflight_completions, - user_data, - ) { - // Destination reported an I/O error. - Ok(dest_result) if dest_result < 0 => self.fail(format!( - "destination completion failed: user_data={user_data}" - )), - Ok(_) => {} - // The destination wait itself failed (broken notifier or epoll). - // Hide it from the guest like any other destination failure. - Err(e) => self.fail(format!( - "destination wait failed for user_data={user_data}: {e}" - )), + if !self.source_passthrough { + match Self::await_completion( + &mut self.destination, + &self.dest_waiter, + &mut self.inflight_completions, + user_data, + ) { + // Destination reported an I/O error. + Ok(dest_result) if dest_result < 0 => self.fail(format!( + "destination completion failed: user_data={user_data}" + )), + Ok(_) => {} + // The destination wait itself failed (broken notifier or epoll). + // Hide it from the guest like any other destination failure. 
+ Err(e) => self.fail(format!( + "destination wait failed for user_data={user_data}: {e}" + )), + } } self.inflight_completions.push_back((user_data, src_result)); @@ -366,6 +374,10 @@ impl AsyncIo for MirroringAsyncIo { iovecs: &[iovec], user_data: u64, ) -> AsyncIoResult<()> { + if self.source_passthrough { + return self.source.write_vectored(offset, iovecs, user_data); + } + let _guard = self .state .range_locks @@ -384,6 +396,10 @@ impl AsyncIo for MirroringAsyncIo { } fn fsync(&mut self, user_data: Option) -> AsyncIoResult<()> { + if self.source_passthrough { + return self.source.fsync(user_data); + } + self.mirror_request( "fsync", |src| src.fsync(user_data), @@ -399,6 +415,10 @@ impl AsyncIo for MirroringAsyncIo { } fn punch_hole(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + if self.source_passthrough { + return self.source.punch_hole(offset, length, user_data); + } + let _guard = self .state .range_locks @@ -416,6 +436,10 @@ impl AsyncIo for MirroringAsyncIo { } fn write_zeroes(&mut self, offset: u64, length: u64, user_data: u64) -> AsyncIoResult<()> { + if self.source_passthrough { + return self.source.write_zeroes(offset, length, user_data); + } + let _guard = self .state .range_locks @@ -433,7 +457,7 @@ impl AsyncIo for MirroringAsyncIo { } fn next_completed_request(&mut self) -> Option<(u64, i32)> { - // Mirrored writes are awaited synchronously. Only async source reads complete here. + // Mirrored writes are awaited synchronously, only reads and post-failure passthrough writes surface here. 
while let Some((id, res)) = self.source.next_completed_request() { self.inflight_completions.push_back((id, res)); } @@ -441,10 +465,18 @@ impl AsyncIo for MirroringAsyncIo { } fn batch_requests_enabled(&self) -> bool { + if self.source_passthrough { + return self.source.batch_requests_enabled(); + } + true } fn submit_batch_requests(&mut self, batch_request: &[BatchRequest]) -> AsyncIoResult<()> { + if self.source_passthrough { + return self.source.submit_batch_requests(batch_request); + } + for req in batch_request { let result = match req.request_type { RequestType::In => self.read_vectored(req.offset, &req.iovecs, req.user_data), @@ -465,6 +497,10 @@ impl AsyncIo for MirroringAsyncIo { } fn alignment(&self) -> u64 { + if self.source_passthrough { + return self.source.alignment(); + } + // Stricter alignment wins. Same iovec goes to both backends. self.source.alignment().max(self.destination.alignment()) } diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index cf7728147f..c2b2fa023c 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -125,6 +125,10 @@ impl RequestHandler for StubApiRequestHandler { Ok(()) } + fn vm_disk_mirror_cancel(&mut self, _: String) -> Result<(), VmError> { + Ok(()) + } + #[cfg(target_arch = "x86_64")] fn vm_coredump(&mut self, _: &str) -> Result<(), VmError> { Ok(()) diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 81d6ffad9e..6eb7ea94cd 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -49,11 +49,12 @@ use crate::api::http::{EndpointHandler, HttpError, error_response}; use crate::api::{ AddDisk, ApiAction, ApiError, ApiRequest, NetConfig, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, - VmCancelMigration, VmConfig, VmCounters, VmDelete, VmDiskMirrorComplete, - VmDiskMirrorCompleteData, VmDiskMirrorStart, VmDiskMirrorStartData, 
VmDiskMirrorStatus, - VmDiskMirrorStatusData, VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, - VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, VmRemoveDevice, VmResize, - VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, VmShutdown, VmSnapshot, + VmCancelMigration, VmConfig, VmCounters, VmDelete, VmDiskMirrorCancel, VmDiskMirrorCancelData, + VmDiskMirrorComplete, VmDiskMirrorCompleteData, VmDiskMirrorStart, VmDiskMirrorStartData, + VmDiskMirrorStatus, VmDiskMirrorStatusData, VmMigrationProgress, VmNmi, VmPause, + VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, VmReceiveMigrationData, + VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, + VmShutdown, VmSnapshot, }; use crate::config::RestoreConfig; use crate::cpu::Error as CpuError; @@ -606,6 +607,36 @@ impl PutHandler for VmDiskMirrorComplete { impl GetHandler for VmDiskMirrorComplete {} +impl PutHandler for VmDiskMirrorCancel { + fn handle_request( + &'static self, + api_notifier: EventFd, + api_sender: Sender, + body: &Option, + _files: Vec, + ) -> Result, HttpError> { + let body = body.as_ref().ok_or(HttpError::BadRequest)?; + let data: VmDiskMirrorCancelData = serde_json::from_slice(body.raw())?; + + self.send(api_notifier, api_sender, data) + .map_err(|e| match &e { + ApiError::VmDiskMirrorCancel(VmError::DeviceManager( + DeviceManagerError::UnknownDeviceId(_), + )) => HttpError::NotFound, + ApiError::VmDiskMirrorCancel(VmError::DeviceManager( + DeviceManagerError::BlockMirrorCancel(_, block_err), + )) => match block_err.kind() { + BlockErrorKind::MirrorNotActive => HttpError::NotFound, + BlockErrorKind::MirrorCompletionInProgress => HttpError::BadRequest, + _ => HttpError::ApiError(e), + }, + _ => HttpError::ApiError(e), + }) + } +} + +impl GetHandler for VmDiskMirrorCancel {} + impl PutHandler for VmResize { fn handle_request( &'static self, diff --git a/vmm/src/api/http/mod.rs 
b/vmm/src/api/http/mod.rs index ed28b2f599..59097a5820 100644 --- a/vmm/src/api/http/mod.rs +++ b/vmm/src/api/http/mod.rs @@ -30,10 +30,10 @@ use crate::api::VmCoredump; use crate::api::{ AddDisk, ApiError, ApiRequest, VmAddDevice, VmAddFs, VmAddGenericVhostUser, VmAddNet, VmAddPmem, VmAddUserDevice, VmAddVdpa, VmAddVsock, VmBoot, VmCancelMigration, VmCounters, - VmDelete, VmDiskMirrorComplete, VmDiskMirrorStart, VmDiskMirrorStatus, VmMigrationProgress, - VmNmi, VmPause, VmPostMigrationAnnounce, VmPowerButton, VmReboot, VmReceiveMigration, - VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, VmSendMigration, - VmShutdown, VmSnapshot, + VmDelete, VmDiskMirrorCancel, VmDiskMirrorComplete, VmDiskMirrorStart, VmDiskMirrorStatus, + VmMigrationProgress, VmNmi, VmPause, VmPostMigrationAnnounce, VmPowerButton, VmReboot, + VmReceiveMigration, VmRemoveDevice, VmResize, VmResizeDisk, VmResizeZone, VmRestore, VmResume, + VmSendMigration, VmShutdown, VmSnapshot, }; use crate::landlock::Landlock; use crate::seccomp_filters::{Thread, get_seccomp_filter}; @@ -247,6 +247,10 @@ pub static HTTP_ROUTES: LazyLock = LazyLock::new(|| { endpoint!("/vm.disk-mirror-complete"), Box::new(VmActionHandler::new(&VmDiskMirrorComplete)), ); + r.routes.insert( + endpoint!("/vm.disk-mirror-cancel"), + Box::new(VmActionHandler::new(&VmDiskMirrorCancel)), + ); r.routes.insert(endpoint!("/vm.info"), Box::new(VmInfo {})); r.routes.insert( endpoint!("/vm.pause"), diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index 25aef2cdf3..bfdcd01dda 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -160,6 +160,10 @@ pub enum ApiError { #[error("Error completing disk mirror")] VmDiskMirrorComplete(#[source] VmError), + /// Error cancelling disk mirror + #[error("Error cancelling disk mirror")] + VmDiskMirrorCancel(#[source] VmError), + /// The memory zone could not be resized. 
#[error("The memory zone could not be resized")] VmResizeZone(#[source] VmError), @@ -262,6 +266,11 @@ pub struct VmDiskMirrorCompleteData { pub id: String, } +#[derive(Clone, Debug, Deserialize, Serialize)] +pub struct VmDiskMirrorCancelData { + pub id: String, +} + #[derive(Clone, Debug, Serialize)] pub struct VmDiskMirrorStatusResponse { pub phase: String, // "running" | "ready" | "completing" | "completed" | "cancelling" | "failed" @@ -816,6 +825,8 @@ pub trait RequestHandler { fn vm_disk_mirror_status(&mut self, id: String) -> Result>, VmError>; fn vm_disk_mirror_complete(&mut self, id: String) -> Result<(), VmError>; + fn vm_disk_mirror_cancel(&mut self, id: String) -> Result<(), VmError>; + fn vm_add_device(&mut self, device_cfg: DeviceConfig) -> Result>, VmError>; fn vm_add_user_device( @@ -1531,6 +1542,36 @@ impl ApiAction for VmDiskMirrorComplete { } } +pub struct VmDiskMirrorCancel; + +impl ApiAction for VmDiskMirrorCancel { + type RequestBody = VmDiskMirrorCancelData; + type ResponseBody = Option; + + fn request(&self, data: Self::RequestBody, response_sender: Sender) -> ApiRequest { + Box::new(move |vmm| { + let response = vmm + .vm_disk_mirror_cancel(data.id) + .map_err(ApiError::VmDiskMirrorCancel) + .map(|_| ApiResponsePayload::Empty); + + response_sender + .send(response) + .map_err(VmmError::ApiResponseSend)?; + Ok(false) + }) + } + + fn send( + &self, + api_evt: EventFd, + api_sender: Sender, + data: Self::RequestBody, + ) -> ApiResult { + get_response_body(self, api_evt, api_sender, data) + } +} + pub struct VmInfo; impl ApiAction for VmInfo { diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index ed0574f232..3eb0cd01b5 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -572,6 +572,26 @@ paths: 500: description: The disk mirror could not be completed. 
+ /vm.disk-mirror-cancel: + put: + summary: Cancel a disk mirror and keep the source disk + requestBody: + description: The identifier of the mirrored disk + content: + application/json: + schema: + $ref: "#/components/schemas/VmDiskMirrorCancelData" + required: true + responses: + 204: + description: The disk mirror was cancelled and the device keeps using the source. + 400: + description: The mirror cannot be cancelled because completion is already in progress. + 404: + description: No disk with the given identifier was found, or no mirror is active for it. + 500: + description: The disk mirror could not be cancelled. + components: schemas: VmmPingResponse: @@ -1678,3 +1698,11 @@ components: properties: id: type: string + + VmDiskMirrorCancelData: + required: + - id + type: object + properties: + id: + type: string diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index b3822aaf95..3e6c19886f 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -705,6 +705,10 @@ pub enum DeviceManagerError { #[error("Failed to complete block mirror for disk {0}: {1}")] BlockMirrorComplete(String, #[source] BlockError), + + /// Cancelling the block mirror failed. + #[error("Failed to cancel the block mirror for the disk with identifier: {0}")] + BlockMirrorCancel(String, #[source] BlockError), } pub type DeviceManagerResult = result::Result; @@ -5472,6 +5476,24 @@ impl DeviceManager { Err(DeviceManagerError::UnknownDeviceId(device_id.to_string())) } + /// Cancels the active block mirror for the disk identified by + /// `device_id`, reverting all virtqueue workers to the source disk + /// and releasing the destination. Errors if no disk with that + /// identifier is attached, if no mirror is active, if a completion has + /// already been attempted, or if reverting the virtqueue workers + /// fails. 
+ pub fn mirror_disk_cancel(&self, device_id: &str) -> DeviceManagerResult<()> { + for dev in &self.block_devices { + let mut disk = dev.lock().unwrap(); + if disk.id() == device_id { + return disk + .cancel_mirror() + .map_err(|e| DeviceManagerError::BlockMirrorCancel(device_id.to_string(), e)); + } + } + Err(DeviceManagerError::UnknownDeviceId(device_id.to_string())) + } + /// Helps the environment converge quickly after a live migration by /// prompting devices to advertise the VM from its new host. /// diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index c132719ce4..e847c7001a 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -3570,6 +3570,16 @@ impl RequestHandler for Vmm { MaybeVmOwnership::None => Err(VmError::DiskMirrorComplete), } } + + fn vm_disk_mirror_cancel(&mut self, id: String) -> result::Result<(), VmError> { + self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; + + match self.vm { + MaybeVmOwnership::Vmm(ref mut vm) => vm.mirror_disk_cancel(&id), + MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), + MaybeVmOwnership::None => Err(VmError::DiskMirrorCancel), + } + } } const CPU_MANAGER_SNAPSHOT_ID: &str = "cpu-manager"; diff --git a/vmm/src/vm.rs b/vmm/src/vm.rs index 7cb02b9dcc..f9c127ad4b 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -280,6 +280,9 @@ pub enum Error { #[error("Failed to complete disk mirror")] DiskMirrorComplete, + #[error("Failed to cancel disk mirror")] + DiskMirrorCancel, + #[error("At least one disk mirror is active")] ActiveBlockMirror, @@ -3342,6 +3345,14 @@ impl Vm { Ok(()) } + pub fn mirror_disk_cancel(&self, id: &str) -> Result<()> { + self.device_manager + .lock() + .unwrap() + .mirror_disk_cancel(id) + .map_err(Error::DeviceManager) + } + /// Returns true if there is an active mirror in any of the block devices, false otherwise. 
pub fn any_active_block_mirrors(&self) -> bool { self.device_manager From ea69cadf60b9c6309909f989a49b0eeef4ca440d Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 17 Jun 2026 11:12:37 +0200 Subject: [PATCH 24/33] block: preserve sparseness in CopyWorker During block-dev mirroring, the CopyWorker reads each block from the source disk and writes it to the destination. It used to write zero-filled blocks as well, which allocates storage on the destination for regions that hold no data. When the destination supports sparse operations, we now check whether a block is all zeros. If it is, we call punch_hole instead of write_vectored. This keeps the destination as sparse as the source. The check currently looks at the full MIRROR_BLOCK_SIZE block, so only all-zero blocks become holes. A smaller granularity would also punch holes inside partly-zero blocks and save more space, at the cost of more compute per block. We leave this for a later change. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index 49b6ec8a57..afb17d0788 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -541,6 +541,7 @@ impl Drop for CopyWorkerHandle { pub struct CopyWorker { source_io: Box, dest_io: Box, + dest_is_sparse: bool, state: Arc, /// Once allocated, the buffer is reused for all blocks to avoid repeated allocations. buf: AlignedBuf, @@ -571,6 +572,7 @@ impl CopyWorker { Ok(Self { source_io, dest_io, + dest_is_sparse: destination_disk.supports_sparse_operations(), state, buf: AlignedBuf::new(block_size_bytes, alignment as usize)?, block_size_bytes, @@ -646,11 +648,19 @@ impl CopyWorker { } debug_assert_eq!(user_data, read_id); - // Write buf to destination. 
let write_id = self.generate_user_data(); - self.dest_io - .write_vectored(offset as off_t, &iovecs, write_id) - .map_err(|e| io::Error::other(format!("async io write_vectored failed: {e}")))?; + if self.dest_is_sparse && self.buf.as_slice(length).iter().all(|&b| b == 0) { + // Source block is all zeros: punch a hole to keep the destination sparse. + self.dest_io + .punch_hole(offset, length as u64, write_id) + .map_err(|e| io::Error::other(format!("async io punch_hole failed: {e}")))?; + } else { + // Write buf to destination. + self.dest_io + .write_vectored(offset as off_t, &iovecs, write_id) + .map_err(|e| io::Error::other(format!("async io write_vectored failed: {e}")))?; + } + let (user_data, result) = self.dest_waiter.next_completion(&mut self.dest_io)?; if result < 0 { return Err(io::Error::from_raw_os_error(-result)); From 7011d86c00ca38b6463888e0810de9a2eb4051cd Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 22 Jun 2026 11:59:48 +0200 Subject: [PATCH 25/33] block: add mirror unit tests Cover the range-lock and passthrough behaviour of MirroringAsyncIo with a mock AsyncIo, so the synchronization invariants are checked without real disk I/O. The tests cover: - overlapping mirror writes complete in order under the range lock - a copy-worker range hold blocks an overlapping guest write until it is released - reads pass through to the source only - a destination submit failure degrades the mirror to source passthrough A watchdog thread fails a test on timeout, so a locking regression surfaces as a failure rather than a hang. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 241 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 241 insertions(+) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index afb17d0788..6da8f16875 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -771,4 +771,245 @@ mod tests { locked.insert(10u64, 20u64); assert!(!RangeLockManager::overlaps_any(&locked, 20, 30)); } + + use std::collections::VecDeque; + use std::sync::mpsc; + use std::time::Duration; + + /// In-memory [`AsyncIo`] backend for driving [`MirroringAsyncIo`] in a unit + /// test without a real fd, io_uring, or the copy worker. Each submission is + /// recorded as an immediately-available completion and the notifier eventfd + /// is signaled, so a synchronous wait loop on the notifier observes it. + struct MockAsyncIo { + evt: EventFd, + completions: VecDeque<(u64, i32)>, + /// When set, the `write_vectored` submit at this 0-based index returns + /// an error instead of completing. Drives the destination-failure and + /// partial-batch paths. + fail_on_nth_write: Option, + writes_seen: usize, + } + + impl MockAsyncIo { + fn new() -> Self { + Self { + evt: EventFd::new(libc::EFD_NONBLOCK).unwrap(), + completions: VecDeque::new(), + fail_on_nth_write: None, + writes_seen: 0, + } + } + + /// Record a completion and wake any waiter parked on the notifier. 
+ fn complete(&mut self, user_data: u64, result: i32) { + self.completions.push_back((user_data, result)); + self.evt.write(1).unwrap(); + } + } + + impl AsyncIo for MockAsyncIo { + fn notifier(&self) -> &EventFd { + &self.evt + } + fn read_vectored(&mut self, _o: off_t, _i: &[iovec], ud: u64) -> AsyncIoResult<()> { + self.complete(ud, 0); + Ok(()) + } + fn write_vectored(&mut self, _o: off_t, _i: &[iovec], ud: u64) -> AsyncIoResult<()> { + let index = self.writes_seen; + self.writes_seen += 1; + if self.fail_on_nth_write == Some(index) { + return Err(AsyncIoError::WriteVectored(io::Error::other( + "injected write submit failure", + ))); + } + self.complete(ud, 0); + Ok(()) + } + fn fsync(&mut self, ud: Option) -> AsyncIoResult<()> { + if let Some(ud) = ud { + self.complete(ud, 0); + } + Ok(()) + } + fn punch_hole(&mut self, _o: u64, _l: u64, ud: u64) -> AsyncIoResult<()> { + self.complete(ud, 0); + Ok(()) + } + fn write_zeroes(&mut self, _o: u64, _l: u64, ud: u64) -> AsyncIoResult<()> { + self.complete(ud, 0); + Ok(()) + } + fn next_completed_request(&mut self) -> Option<(u64, i32)> { + self.completions.pop_front() + } + } + + fn mirror_with_mocks() -> MirroringAsyncIo { + mirror_from( + MockAsyncIo::new(), + MockAsyncIo::new(), + MirrorState::new(1 << 20), + ) + } + + /// The one place to update when `MirroringAsyncIo`'s fields change. + fn mirror_from( + source: S, + destination: D, + state: Arc, + ) -> MirroringAsyncIo { + let source_waiter = EpollWaiter::new(source.notifier().as_raw_fd()).unwrap(); + let dest_waiter = EpollWaiter::new(destination.notifier().as_raw_fd()).unwrap(); + MirroringAsyncIo { + source: Box::new(source), + destination: Box::new(destination), + state, + inflight_completions: VecDeque::new(), + source_passthrough: false, + source_waiter, + dest_waiter, + } + } + + /// One iovec over `buf`. The mocks never read it, so it only needs to + /// outlive the submit call. 
+ fn iov_of(buf: &[u8]) -> [iovec; 1] { + [iovec { + iov_base: buf.as_ptr() as *mut libc::c_void, + iov_len: buf.len(), + }] + } + + /// Runs `f` on a worker thread and fails the test if it does not finish + /// within `timeout`. Turns a submit-path deadlock into a clean failure + /// instead of a hung suite: the worker stays blocked, but the test thread + /// resumes after the timeout and panics. + fn run_with_watchdog(timeout: Duration, f: impl FnOnce() + Send + 'static) { + let (tx, rx) = mpsc::channel(); + thread::spawn(move || { + f(); + let _ = tx.send(()); + }); + if rx.recv_timeout(timeout).is_err() { + panic!("scenario did not finish within {timeout:?} (deadlock)"); + } + } + + /// Drains completions until `n` have arrived (or the budget is exhausted). + fn drain_n(mirror: &mut MirroringAsyncIo, n: usize) -> Vec { + let mut acked = Vec::new(); + for _ in 0..64 { + while let Some((user_data, result)) = mirror.next_completed_request() { + assert_eq!(result, 0, "unexpected error completion"); + acked.push(user_data); + } + if acked.len() >= n { + break; + } + } + acked + } + + /// Two overlapping guest writes submitted before either is reaped must both + /// complete in submission order without deadlocking. + #[test] + fn overlapping_writes_complete_in_order() { + run_with_watchdog(Duration::from_secs(5), || { + let mut mirror = mirror_with_mocks(); + let buf = [0u8; 4096]; + let iov = iov_of(&buf); + + mirror.write_vectored(0, &iov, 1).unwrap(); + mirror.write_vectored(0, &iov, 2).unwrap(); + + assert_eq!( + drain_n(&mut mirror, 2), + vec![1, 2], + "both overlapping writes complete in submission order" + ); + }); + } + + /// While the copy worker holds a range (simulated by holding a `RangeGuard` + /// on the shared lock manager), an overlapping guest write must block and + /// proceed only once the range is released. 
+ #[test] + fn copy_worker_hold_serializes_overlapping_guest_write() { + let state = MirrorState::new(1 << 20); + // The "copy worker" holds [0, 4096). + let guard = state.range_locks.lock_range(0, 4096).unwrap(); + + let mut mirror = mirror_from(MockAsyncIo::new(), MockAsyncIo::new(), state.clone()); + + let (tx, rx) = mpsc::channel(); + let handle = thread::spawn(move || { + let buf = [0u8; 4096]; + let iov = iov_of(&buf); + mirror.write_vectored(0, &iov, 1).unwrap(); + tx.send(()).unwrap(); + }); + + // The held range must block the overlapping guest write. + assert!( + rx.recv_timeout(Duration::from_millis(200)).is_err(), + "guest write proceeded while the copy worker held the range" + ); + + // Releasing the range lets the write through. + drop(guard); + assert!( + rx.recv_timeout(Duration::from_secs(5)).is_ok(), + "guest write did not proceed after the range was released" + ); + handle.join().unwrap(); + } + + /// Reads are source-only passthrough (no range lock) and still complete. + #[test] + fn read_passes_through_to_source() { + run_with_watchdog(Duration::from_secs(5), || { + let mut mirror = mirror_with_mocks(); + let buf = [0u8; 4096]; + let iov = iov_of(&buf); + + mirror.read_vectored(0, &iov, 7).unwrap(); + + let mut got = None; + for _ in 0..64 { + if let Some(c) = mirror.next_completed_request() { + got = Some(c); + break; + } + } + assert_eq!(got, Some((7, 0)), "read completes via the source"); + }); + } + + /// A destination submit failure degrades the mirror to source passthrough: + /// the phase goes `Failed`, and both the failing write and a subsequent + /// write still complete to the guest off the source alone. 
+ #[test] + fn destination_submit_failure_degrades_to_passthrough() { + run_with_watchdog(Duration::from_secs(5), || { + let mut dest = MockAsyncIo::new(); + dest.fail_on_nth_write = Some(0); + let mut mirror = mirror_from(MockAsyncIo::new(), dest, MirrorState::new(1 << 20)); + let buf = [0u8; 4096]; + let iov = iov_of(&buf); + + mirror.write_vectored(0, &iov, 1).unwrap(); + assert!( + matches!(mirror.state.phase(), MirrorPhase::Failed(_)), + "destination failure transitions the mirror to Failed" + ); + + // Subsequent write goes to the source only. + mirror.write_vectored(0, &iov, 2).unwrap(); + + let mut acked = drain_n(&mut mirror, 2); + acked.sort(); + assert_eq!(acked, vec![1, 2], "both writes complete off the source"); + }); + } } From 7247a847203339f17260aa8898562590358e95ee Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 23 Jun 2026 10:42:45 +0200 Subject: [PATCH 26/33] block: test range guard held across mirror write The synchronous mirrored write holds its range guard from acquisition through both the source and destination completions. Nothing else pins that lifetime, so a regression to dropping the guard early (`let _` instead of `let _guard`) would let an overlapping lock_range acquire while the write is still in flight, the exact race the range lock exists to prevent. Add guard_is_held_across_submit_and_wait and a GatedMockAsyncIo backend whose destination completion is withheld until released from another thread. The write parks in wait_for_completions holding its guard while the test asserts an overlapping lock_range blocks, then acquires only once the completion is released and the write drops the guard. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 135 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index 6da8f16875..fb304f917a 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -1012,4 +1012,139 @@ mod tests { assert_eq!(acked, vec![1, 2], "both writes complete off the source"); }); } + + /// Mock backend whose completions are withheld until [`Gate::release`], so a + /// test can hold a write parked in `wait_for_completions`. + struct GatedMockAsyncIo { + evt: EventFd, + inner: Arc>, + /// Notified on each submit, so a test can wait until the in-flight write + /// has reached this backend (and so already holds its range guard). + on_submit: mpsc::Sender<()>, + } + + struct GatedInner { + /// Submitted, not yet released. + pending: VecDeque<(u64, i32)>, + /// Released, deliverable via `next_completed_request`. + ready: VecDeque<(u64, i32)>, + } + + /// Releases a [`GatedMockAsyncIo`]'s withheld completions from another thread. 
+ struct Gate { + evt: EventFd, + inner: Arc>, + } + + impl Gate { + fn release(&self) { + let mut inner = self.inner.lock().unwrap(); + while let Some(completion) = inner.pending.pop_front() { + inner.ready.push_back(completion); + } + self.evt.write(1).unwrap(); + } + } + + impl GatedMockAsyncIo { + fn new(on_submit: mpsc::Sender<()>) -> Self { + Self { + evt: EventFd::new(libc::EFD_NONBLOCK).unwrap(), + inner: Arc::new(Mutex::new(GatedInner { + pending: VecDeque::new(), + ready: VecDeque::new(), + })), + on_submit, + } + } + + fn gate(&self) -> Gate { + Gate { + evt: self.evt.try_clone().unwrap(), + inner: Arc::clone(&self.inner), + } + } + + fn submit(&self, user_data: u64) { + self.inner.lock().unwrap().pending.push_back((user_data, 0)); + let _ = self.on_submit.send(()); + } + } + + impl AsyncIo for GatedMockAsyncIo { + fn notifier(&self) -> &EventFd { + &self.evt + } + fn read_vectored(&mut self, _o: off_t, _i: &[iovec], ud: u64) -> AsyncIoResult<()> { + self.submit(ud); + Ok(()) + } + fn write_vectored(&mut self, _o: off_t, _i: &[iovec], ud: u64) -> AsyncIoResult<()> { + self.submit(ud); + Ok(()) + } + fn fsync(&mut self, ud: Option) -> AsyncIoResult<()> { + if let Some(ud) = ud { + self.submit(ud); + } + Ok(()) + } + fn punch_hole(&mut self, _o: u64, _l: u64, ud: u64) -> AsyncIoResult<()> { + self.submit(ud); + Ok(()) + } + fn write_zeroes(&mut self, _o: u64, _l: u64, ud: u64) -> AsyncIoResult<()> { + self.submit(ud); + Ok(()) + } + fn next_completed_request(&mut self) -> Option<(u64, i32)> { + self.inner.lock().unwrap().ready.pop_front() + } + } + + /// The range guard must stay held across the whole synchronous submit+wait, + /// not just acquisition. A regression to `let _ =` drops it early and lets + /// an overlapping `lock_range` acquire while the write is still in flight. 
+ #[test] + fn guard_is_held_across_submit_and_wait() { + let state = MirrorState::new(1 << 20); + + // Source completes immediately; destination is gated, so the write parks + // waiting on the destination completion while holding the range lock. + let (submitted_tx, submitted_rx) = mpsc::channel(); + let dest = GatedMockAsyncIo::new(submitted_tx); + let gate = dest.gate(); + let mut mirror = mirror_from(MockAsyncIo::new(), dest, state.clone()); + + let writer = thread::spawn(move || { + let buf = [0u8; 4096]; + let iov = iov_of(&buf); + mirror.write_vectored(0, &iov, 1).unwrap(); + }); + + // The write reached the destination submit, so its range guard is held. + submitted_rx.recv().unwrap(); + + // An overlapping lock_range must block while the in-flight write holds it. + let locker_state = state.clone(); + let (locked_tx, locked_rx) = mpsc::channel(); + let locker = thread::spawn(move || { + let _g = locker_state.range_locks.lock_range(0, 4096).unwrap(); + locked_tx.send(()).unwrap(); + }); + assert!( + locked_rx.recv_timeout(Duration::from_millis(200)).is_err(), + "lock_range acquired while the in-flight write still held the range" + ); + + // Releasing the destination completion lets the write finish and drop its + // guard, which unblocks the overlapping lock_range. + gate.release(); + writer.join().unwrap(); + assert!( + locked_rx.recv_timeout(Duration::from_secs(5)).is_ok(), + "lock_range did not acquire after the write released the range" + ); + locker.join().unwrap(); + } } From 306c1b695c01251119def187648ff9f8dbeba85c Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Tue, 23 Jun 2026 13:11:59 +0200 Subject: [PATCH 27/33] virtio-devices, block: reject mirror ops on a paused device A paused virtqueue worker is parked on its pause barrier and never reaches its epoll loop, so it cannot pick up a staged BlockQueueCommand. 
start_mirror, complete_mirror, and cancel_mirror staged the command anyway and blocked in wait_for_mirror_queue_command_acks until the ack timeout, then returned an error while the command lingered in the slot and was applied late once the VM resumed, leaving the mirror half-installed. complete_mirror additionally panicked on that timeout. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/error.rs | 5 +++++ virtio-devices/src/block.rs | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/block/src/error.rs b/block/src/error.rs index 6fd62a7c5d..5019bfff84 100644 --- a/block/src/error.rs +++ b/block/src/error.rs @@ -52,6 +52,8 @@ pub enum BlockErrorKind { MirrorSwap, /// A mirror completion is already in progress. MirrorCompletionInProgress, + /// A mirror operation was requested while the device is paused. + MirrorDevicePaused, } impl Display for BlockErrorKind { @@ -69,6 +71,9 @@ impl Display for BlockErrorKind { Self::MirrorNotReady => write!(f, "Mirror is not yet ready, cannot complete"), Self::MirrorSwap => write!(f, "Failed to swap AsyncIO in virtqueue worker for mirror"), Self::MirrorCompletionInProgress => write!(f, "Mirror completion already in progress"), + Self::MirrorDevicePaused => { + write!(f, "Mirror operation rejected: the device is paused") + } } } } diff --git a/virtio-devices/src/block.rs b/virtio-devices/src/block.rs index 36db1f56cd..9df953de26 100644 --- a/virtio-devices/src/block.rs +++ b/virtio-devices/src/block.rs @@ -1376,6 +1376,7 @@ impl Block { destination: Box, destination_path: PathBuf, ) -> BlockResult<()> { + self.ensure_not_paused_for_mirror()?; let source_size = self.disk_image.logical_size()?; let dest_size = destination.logical_size()?; if dest_size < source_size { @@ -1468,6 +1469,7 @@ impl Block { /// to the destination only, and there is no revert that keeps /// acknowledged writes, so aborting is preferred over data loss. 
pub fn complete_mirror(&mut self) -> BlockResult { + self.ensure_not_paused_for_mirror()?; let op_id = self.next_mirror_op_id(); let (ack_tx, ack_rx) = mpsc::channel(); @@ -1527,6 +1529,15 @@ impl Block { Ok(destination_path) } + /// Fails with `MirrorDevicePaused` when the device is paused, since a + /// parked worker cannot apply a staged mirror command. + fn ensure_not_paused_for_mirror(&self) -> BlockResult<()> { + if self.common.paused.load(Ordering::SeqCst) { + return Err(BlockError::from_kind(BlockErrorKind::MirrorDevicePaused)); + } + Ok(()) + } + fn next_mirror_op_id(&mut self) -> u64 { let op_id = self.next_queue_cmd_op_id; self.next_queue_cmd_op_id = self.next_queue_cmd_op_id.wrapping_add(1); @@ -1619,6 +1630,7 @@ impl Block { /// Blocks until the copy worker finishes its current block and joins, /// which can stall on a slow or hung destination. pub fn cancel_mirror(&mut self) -> BlockResult<()> { + self.ensure_not_paused_for_mirror()?; let state = self .mirror_handle .as_ref() From f22f06f22e9f4ef1511bc354b0874a55cc9e76cd Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 24 Jun 2026 11:14:01 +0200 Subject: [PATCH 28/33] vmm: reject mirror destination already backing a disk Cloud Hypervisor holds the disk image lock process-wide, so re-opening the destination here would not trip the lock. Compare canonicalized paths instead and refuse a destination that already backs one of the VM's disks. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- vmm/src/device_manager.rs | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 3e6c19886f..094609b6db 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -691,6 +691,10 @@ pub enum DeviceManagerError { )] BlockMirrorAlreadyActive(String), + /// The mirror destination path is already backing one of the VM's disks. 
+ #[error("Cannot mirror to '{0}': it is already in use as a disk image by this VM")] + BlockMirrorDestinationInUse(String), + /// Cannot perform given action, as the device is currently performing a block mirroring operation. #[error( "Failed to perform the requested action for the disk with identifier: {0} as it is currently performing a block mirroring operation" @@ -5379,6 +5383,24 @@ impl DeviceManager { ))); } + // Refuse a destination that already backs one of this VM's disks, comparing canonicalized paths. + let canon = |p: &Path| std::fs::canonicalize(p).unwrap_or_else(|_| p.to_path_buf()); + let dest_canon = canon(dest_path); + let dest_in_use = self + .config + .lock() + .unwrap() + .disks + .iter() + .flatten() + .filter_map(|d| d.path.as_deref()) + .any(|src| canon(src) == dest_canon); + if dest_in_use { + return Err(DeviceManagerError::BlockMirrorDestinationInUse( + dest_path.display().to_string(), + )); + } + let (options, image_type) = { let cfg = self.config.lock().unwrap(); let disks = cfg From e7dd6de66719805f0c48c9adc955dda08594d219 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 24 Jun 2026 11:46:29 +0200 Subject: [PATCH 29/33] vmm: seccomp: allow io_uring and eventfd2 on vcpus A virtio-blk device activated by the guest, and the blockdev mirror rebuilding a backend on a guest-initiated reset, set up io_uring rings and eventfds on the activating vcpu thread, after that thread's seccomp filter is installed. Allow io_uring_setup and io_uring_register plus eventfd2 for the notifier, matching what the vmm thread already permits, so the backend is not killed with SIGSYS. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- vmm/src/seccomp_filters.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vmm/src/seccomp_filters.rs b/vmm/src/seccomp_filters.rs index 69c4a83f3f..4b2192df29 100644 --- a/vmm/src/seccomp_filters.rs +++ b/vmm/src/seccomp_filters.rs @@ -890,6 +890,9 @@ fn vcpu_thread_rules( (libc::SYS_dup, vec![]), (libc::SYS_exit, vec![]), (libc::SYS_epoll_ctl, vec![]), + (libc::SYS_eventfd2, vec![]), + (libc::SYS_io_uring_setup, vec![]), + (libc::SYS_io_uring_register, vec![]), ( libc::SYS_fallocate, or![and![Cond::new( From 81a8efc56258cf3048f81c6e55cd5661b08d23f0 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 24 Jun 2026 13:57:40 +0200 Subject: [PATCH 30/33] block: test batched submit on partial failure submit_batch_requests serializes each entry and queues a completion per write, so a submit failure mid-batch must still return Ok with one completion per entry. Otherwise the virtqueue worker, which records the batch as in-flight only on Ok, strands the completions already queued for earlier entries and dies with MissingEntryRequestList. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index fb304f917a..3b233d1a96 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -1147,4 +1147,57 @@ mod tests { ); locker.join().unwrap(); } + + /// A single-iovec `Out` batch entry backed by `buf`. + fn batch_write(offset: off_t, buf: &[u8], user_data: u64) -> BatchRequest { + BatchRequest { + offset, + iovecs: [iovec { + iov_base: buf.as_ptr() as *mut libc::c_void, + iov_len: buf.len(), + }] + .into_iter() + .collect(), + user_data, + request_type: RequestType::Out, + } + } + + /// A mid-batch submit failure must still return `Ok` with one completion per + /// entry (an error completion for the failed one).
The worker records the + /// batch as in-flight only on `Ok`, so aborting with `Err` strands the + /// completions already queued for earlier entries. + #[test] + fn failed_batch_submit_accounts_every_request() { + // Second write (index 1) fails at submit on the source; the first + // already went through. + let mut source = MockAsyncIo::new(); + source.fail_on_nth_write = Some(1); + let mut mirror = mirror_from(source, MockAsyncIo::new(), MirrorState::new(1 << 20)); + let buf = [0u8; 4096]; + + let batch = [batch_write(0, &buf, 1), batch_write(4096, &buf, 2)]; + + mirror + .submit_batch_requests(&batch) + .expect("a mid-batch submit failure must not fail the whole batch"); + + let mut completions = Vec::new(); + while let Some(c) = mirror.next_completed_request() { + completions.push(c); + } + completions.sort_by_key(|(user_data, _)| *user_data); + + assert_eq!( + completions.len(), + 2, + "every batch entry owes exactly one completion" + ); + assert_eq!(completions[0], (1, 0), "first write completes successfully"); + assert_eq!(completions[1].0, 2, "second entry is still accounted"); + assert!( + completions[1].1 < 0, + "second entry carries an error result (reported IOERR), not an orphan" + ); + } } From 4b89e5ed46dba23f37223faa3a29f1b432787316 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Wed, 24 Jun 2026 14:45:24 +0200 Subject: [PATCH 31/33] block: test mirror phase transitions and op fan-out Test the MirrorState phase state machine (allowed transitions, rejected ones, terminal Completed, and Failed keeping its first reason), the tracked-vs-barrier fsync split, write_zeroes mirroring, and that a degraded mirror passes every op through to the source alone. 
On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- block/src/mirror.rs | 101 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) diff --git a/block/src/mirror.rs b/block/src/mirror.rs index 3b233d1a96..a4d5d89067 100644 --- a/block/src/mirror.rs +++ b/block/src/mirror.rs @@ -1200,4 +1200,105 @@ mod tests { "second entry carries an error result (reported IOERR), not an orphan" ); } + + /// The lifecycle advances Running -> Ready -> Completing -> Completed, each + /// state reached only from its documented predecessor. + #[test] + fn phase_advances_through_the_lifecycle() { + let state = MirrorState::new(1 << 20); + assert_eq!(state.phase(), MirrorPhase::Running); + state.transition_to_phase(MirrorPhase::Ready); + state.transition_to_phase(MirrorPhase::Completing); + state.transition_to_phase(MirrorPhase::Completed); + assert_eq!(state.phase(), MirrorPhase::Completed); + } + + /// A transition not in the table is ignored, leaving the phase unchanged. + #[test] + fn invalid_phase_transition_is_ignored() { + let state = MirrorState::new(1 << 20); + // Running -> Completed skips Ready and Completing, so it is rejected. + state.transition_to_phase(MirrorPhase::Completed); + assert_eq!(state.phase(), MirrorPhase::Running); + } + + /// `Completed` is terminal: no later transition takes effect. + #[test] + fn completed_phase_is_terminal() { + let state = MirrorState::new(1 << 20); + state.transition_to_phase(MirrorPhase::Ready); + state.transition_to_phase(MirrorPhase::Completing); + state.transition_to_phase(MirrorPhase::Completed); + state.transition_to_phase(MirrorPhase::Cancelling); + assert_eq!(state.phase(), MirrorPhase::Completed); + } + + /// A failure keeps its first reason (transitions compare only the variant) + /// and can still move to `Cancelling` for cleanup. 
+ #[test] + fn failed_keeps_first_reason_then_cancels() { + let state = MirrorState::new(1 << 20); + state.transition_to_phase(MirrorPhase::Failed("first".into())); + state.transition_to_phase(MirrorPhase::Failed("second".into())); + assert_eq!(state.phase(), MirrorPhase::Failed("first".into())); + state.transition_to_phase(MirrorPhase::Cancelling); + assert_eq!(state.phase(), MirrorPhase::Cancelling); + } + + /// A tracked fsync (`Some`) flushes both backends and surfaces one guest + /// completion for its user_data. + #[test] + fn tracked_fsync_completes_to_guest() { + run_with_watchdog(Duration::from_secs(5), || { + let mut mirror = mirror_with_mocks(); + mirror.fsync(Some(5)).unwrap(); + assert_eq!(drain_n(&mut mirror, 1), vec![5]); + }); + } + + /// A barrier fsync (`None`) flushes both backends but owes the guest no + /// completion, so nothing surfaces. + #[test] + fn barrier_fsync_surfaces_no_completion() { + let mut mirror = mirror_with_mocks(); + mirror.fsync(None).unwrap(); + assert!(mirror.next_completed_request().is_none()); + } + + /// `write_zeroes` mirrors to both backends under the range lock and + /// surfaces one guest completion, like a write. + #[test] + fn write_zeroes_mirrors_and_completes() { + run_with_watchdog(Duration::from_secs(5), || { + let mut mirror = mirror_with_mocks(); + mirror.write_zeroes(0, 4096, 3).unwrap(); + assert_eq!(drain_n(&mut mirror, 1), vec![3]); + }); + } + + /// Once degraded to passthrough, every mutating op forwards to the source + /// alone and still completes, with no destination and no range lock. + #[test] + fn degraded_mirror_passes_all_ops_through_to_source() { + run_with_watchdog(Duration::from_secs(5), || { + let mut dest = MockAsyncIo::new(); + dest.fail_on_nth_write = Some(0); + let mut mirror = mirror_from(MockAsyncIo::new(), dest, MirrorState::new(1 << 20)); + let buf = [0u8; 4096]; + let iov = iov_of(&buf); + + // The first write fails on the destination and flips to passthrough. 
+ mirror.write_vectored(0, &iov, 1).unwrap(); + assert!(matches!(mirror.state.phase(), MirrorPhase::Failed(_))); + + // Subsequent ops take the source-only passthrough branch. + mirror.fsync(Some(2)).unwrap(); + mirror.punch_hole(0, 4096, 3).unwrap(); + mirror.write_zeroes(0, 4096, 4).unwrap(); + + let mut acked = drain_n(&mut mirror, 4); + acked.sort(); + assert_eq!(acked, vec![1, 2, 3, 4]); + }); + } } From 030f3494266694216373e6cd6a9b60b646d95abb Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Thu, 25 Jun 2026 13:00:24 +0200 Subject: [PATCH 32/33] docs: add disk mirroring guide Document the operator workflow (start, status, complete, cancel, failure handling, unrecoverable errors, and conflicting operations) and the design behind it: the CopyWorker, the MirroringAsyncIo write fan-out, and the range lock. On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- docs/disk_mirroring.md | 175 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 docs/disk_mirroring.md diff --git a/docs/disk_mirroring.md b/docs/disk_mirroring.md new file mode 100644 index 0000000000..edad98db2d --- /dev/null +++ b/docs/disk_mirroring.md @@ -0,0 +1,175 @@ +# Disk Mirroring + +Disk mirroring copies a running VM's disk to another file on the host and +keeps the two in sync, so the disk image can be moved to a different backing +store without stopping the guest. It is the Cloud Hypervisor counterpart of +QEMU's `blockdev-mirror`. + +A typical use is rebalancing storage: when the share backing a disk image +fills up, the operator mirrors that disk onto a file on another share and +switches the VM over to it. + +## Overview + +Mirroring runs as a sequence of phases driven by four API calls: + +- `/vm.disk-mirror-start` begins mirroring a disk onto a destination path. +- `/vm.disk-mirror-status` reports the current phase and copy progress. +- `/vm.disk-mirror-complete` switches the VM over to the destination. 
+- `/vm.disk-mirror-cancel` aborts and keeps the VM on the source. + +```mermaid +stateDiagram-v2 + [*] --> running: disk-mirror-start + running --> ready: background copy finished + ready --> completing: disk-mirror-complete + completing --> completed: all queues switched + completed --> [*] + running --> cancelling: disk-mirror-cancel + ready --> cancelling: disk-mirror-cancel + failed --> cancelling: disk-mirror-cancel + running --> failed: destination I/O error + ready --> failed: destination I/O error + cancelling --> [*] +``` + +While `running`, a background worker copies the existing data block by block. +At the same time every guest write is forwarded to both disks, so once the +copy finishes the two are identical. Reaching `ready` means the two disks are +in sync and stay so until the operator completes or cancels. + +## Operator usage + +The examples use `curl` against the VMM's API socket. Replace the socket path +and the disk identifier with your own. The disk identifier is the device `id` +shown by `vm.info` (the same `id` used when the disk was configured or hot +added). + +### Start a mirror + +```console +curl --unix-socket /tmp/cloud-hypervisor.sock -i \ + -X PUT 'http://localhost/api/v1/vm.disk-mirror-start' \ + -H 'Content-Type: application/json' \ + -d '{"id": "_disk0", "destination_path": "/new/store/disk0.raw"}' +``` + +This switches the disk to a mirroring backend and starts the background copy. +The VM keeps serving I/O throughout. A `204` response means mirroring started. + +### Check progress + +```console +curl --unix-socket /tmp/cloud-hypervisor.sock \ + -X PUT 'http://localhost/api/v1/vm.disk-mirror-status' \ + -H 'Content-Type: application/json' \ + -d '{"id": "_disk0"}' +``` + +The response reports the phase and how far the copy has progressed: + +```json +{"phase": "running", "copied_bytes": 1073741824, "total_bytes": 4294967296} +``` + +`phase` is one of `running`, `ready`, `completing`, `completed`, +`cancelling`, or `failed`. 
A `failed` status also carries a `failure` field +describing what went wrong. Poll this endpoint until the phase becomes +`ready`. + +### Complete the mirror + +Once the phase is `ready`, switch the VM over to the destination: + +```console +curl --unix-socket /tmp/cloud-hypervisor.sock -i \ + -X PUT 'http://localhost/api/v1/vm.disk-mirror-complete' \ + -H 'Content-Type: application/json' \ + -d '{"id": "_disk0"}' +``` + +The call blocks until the switch-over finishes. On success (`204`) the VM +serves all I/O from the destination disk and the source disk can be removed. +Completion is only accepted from the `ready` phase. A `404` or `400` leaves the +mirror active, so you can fix the cause and retry. + +### Cancel the mirror + +At any time before completion the operator can abort and keep the VM on the +source disk: + +```console +curl --unix-socket /tmp/cloud-hypervisor.sock -i \ + -X PUT 'http://localhost/api/v1/vm.disk-mirror-cancel' \ + -H 'Content-Type: application/json' \ + -d '{"id": "_disk0"}' +``` + +The destination disk is released and the VM continues on the source. Cancel is +refused once completion has been requested, because by then a queue may +already be writing only to the destination. + +### Failure handling + +If the destination disk fails (for example its backing store becomes +unreachable), the mirror moves to `failed` and the affected queues fall back +to serving the guest from the source disk, so the guest keeps running on +intact data. The operator then cancels the failed mirror to release the +destination. + +### Unrecoverable errors + +Completing a mirror cannot be undone. Once the switch to the destination +begins, some virtqueues may already be writing only to the destination, so +there is no consistent state to roll back to. If a queue cannot be switched +over during completion, the VMM aborts. The alternative would leave the disk +half on the source and half on the destination and could lose acknowledged +writes. 
This is rare: it needs a queue worker to fail mid-swap (for example an +epoll registration error), or its switch-over command to be lost or +unacknowledged. + +### Conflicting operations + +While a mirror is active, the VMM rejects operations that would disturb it: +snapshotting, live migration, resizing the disk, removing the device, and +rebooting, shutting down, or deleting the VM. Complete or cancel the mirror +first. Pausing the VM is allowed, but a mirror cannot be started, completed, +or cancelled while the device is paused. + +## Implementation details + +Mirroring is built from two cooperating pieces and a range lock that keeps +them from corrupting each other: + +```mermaid +flowchart LR + guest[Guest] -->|read / write| mio[MirroringAsyncIo] + mio -->|reads, all writes| src[(Source disk)] + mio -->|writes only| dst[(Destination disk)] + cw[CopyWorker] -->|read block| src + cw -->|write block| dst + mio -.range lock.- rl((RangeLockManager)) + cw -.range lock.- rl +``` + +**CopyWorker.** A background thread copies the source disk to the destination +in 512 KiB blocks. A block that reads back as all zeros is punched as a hole +on the destination instead of being written, so sparse images stay sparse. The +worker updates the copied-byte counter that `vm.disk-mirror-status` reports, +and stops early once the phase becomes terminal. + +**MirroringAsyncIo.** When a mirror starts, each virtqueue worker's `AsyncIo` +backend is swapped for a `MirroringAsyncIo`. It forwards reads to the source +and forwards every mutating operation (`write_vectored`, `fsync`, +`punch_hole`, `write_zeroes`) to both the source and the destination. The +completions of the two sides are awaited inside the write call, so an error on +the destination can be handled before the guest sees the write as done. On a +destination error that queue degrades to source passthrough and the mirror +fails, rather than letting the guest diverge from intact data. 
+ +**Range lock.** The CopyWorker and the guest writes can target overlapping +byte ranges at the same time. Each side takes an exclusive lock on the range +it is about to touch and holds it until its I/O completes, so a copy and a +guest write to the same region cannot interleave into an inconsistent result. +Lookups are over a small set of held ranges, so the lock is cheap in the +common non-overlapping case. From 403fb4eaef3b80e60e3870847d21d27c947c6ed5 Mon Sep 17 00:00:00 2001 From: Leander Kohler Date: Mon, 29 Jun 2026 16:18:45 +0200 Subject: [PATCH 33/33] vmm: add configurable mirror destination mode vm.disk-mirror-start always created the destination when it was missing. Add a destination_mode field so callers control creation vs reuse: - RequireExisting (default): the destination must already exist - Create: create it, fail if it already exists - CreateIfMissing: create it only when absent On-behalf-of: SAP leander.kohler@sap.com Signed-off-by: Leander Kohler --- fuzz/fuzz_targets/http_api.rs | 11 +++++-- vmm/src/api/http/http_endpoint.rs | 3 ++ vmm/src/api/mod.rs | 17 +++++++++- vmm/src/api/openapi/cloud-hypervisor.yaml | 7 +++++ vmm/src/device_manager.rs | 38 ++++++++++++++++++++--- vmm/src/lib.rs | 10 ++++-- vmm/src/vm.rs | 10 ++++-- 7 files changed, 82 insertions(+), 14 deletions(-) diff --git a/fuzz/fuzz_targets/http_api.rs b/fuzz/fuzz_targets/http_api.rs index c2b2fa023c..1371835694 100644 --- a/fuzz/fuzz_targets/http_api.rs +++ b/fuzz/fuzz_targets/http_api.rs @@ -15,8 +15,8 @@ use vm_migration::progress::MigrationProgress; use vm_migration::MigratableError; use vmm::api::http::*; use vmm::api::{ - ApiRequest, RequestHandler, VmInfoResponse, VmReceiveMigrationData, VmSendMigrationData, - VmmPingResponse, + ApiRequest, MirrorDestinationMode, RequestHandler, VmInfoResponse, VmReceiveMigrationData, + VmSendMigrationData, VmmPingResponse, }; use vmm::config::RestoreConfig; use vmm::vm::{Error as VmError, VmState}; @@ -113,7 +113,12 @@ impl RequestHandler 
for StubApiRequestHandler { Ok(()) } - fn vm_disk_mirror_start(&mut self, _: String, _: PathBuf) -> Result<(), VmError> { + fn vm_disk_mirror_start( + &mut self, + _: String, + _: PathBuf, + _: MirrorDestinationMode, + ) -> Result<(), VmError> { Ok(()) } diff --git a/vmm/src/api/http/http_endpoint.rs b/vmm/src/api/http/http_endpoint.rs index 6eb7ea94cd..de2aef9103 100644 --- a/vmm/src/api/http/http_endpoint.rs +++ b/vmm/src/api/http/http_endpoint.rs @@ -541,6 +541,9 @@ impl PutHandler for VmDiskMirrorStart { ApiError::VmDiskMirrorStart(VmError::DeviceManager( DeviceManagerError::BlockMirrorDestAlreadyExists(_, _), )) => HttpError::BadRequest, + ApiError::VmDiskMirrorStart(VmError::DeviceManager( + DeviceManagerError::BlockMirrorDestMissing(_, _), + )) => HttpError::BadRequest, ApiError::VmDiskMirrorStart(VmError::DeviceManager( DeviceManagerError::BlockMirrorAlreadyActive(_), )) => HttpError::BadRequest, diff --git a/vmm/src/api/mod.rs b/vmm/src/api/mod.rs index bfdcd01dda..832e543a9a 100644 --- a/vmm/src/api/mod.rs +++ b/vmm/src/api/mod.rs @@ -250,10 +250,24 @@ pub struct VmInfoResponse { pub device_tree: Option, } +/// Controls whether mirror start creates the destination image or reuses it. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Deserialize, Serialize, Default)] +pub enum MirrorDestinationMode { + /// Require the destination to already exist. + #[default] + RequireExisting, + /// Create the destination. Fails if it already exists. + Create, + /// Create the destination if missing, otherwise reuse it. 
+ CreateIfMissing, +} + #[derive(Clone, Deserialize, Serialize, Default, Debug)] pub struct VmDiskMirrorStartData { pub id: String, pub destination_path: PathBuf, + #[serde(default)] + pub destination_mode: MirrorDestinationMode, } #[derive(Clone, Debug, Deserialize, Serialize)] @@ -820,6 +834,7 @@ pub trait RequestHandler { &mut self, id: String, destination_path: PathBuf, + destination_mode: MirrorDestinationMode, ) -> Result<(), VmError>; fn vm_disk_mirror_status(&mut self, id: String) -> Result>, VmError>; @@ -1461,7 +1476,7 @@ impl ApiAction for VmDiskMirrorStart { fn request(&self, data: Self::RequestBody, response_sender: Sender) -> ApiRequest { Box::new(move |vmm| { let response = vmm - .vm_disk_mirror_start(data.id, data.destination_path) + .vm_disk_mirror_start(data.id, data.destination_path, data.destination_mode) .map_err(ApiError::VmDiskMirrorStart) .map(|_| ApiResponsePayload::Empty); diff --git a/vmm/src/api/openapi/cloud-hypervisor.yaml b/vmm/src/api/openapi/cloud-hypervisor.yaml index 3eb0cd01b5..bb52249aba 100644 --- a/vmm/src/api/openapi/cloud-hypervisor.yaml +++ b/vmm/src/api/openapi/cloud-hypervisor.yaml @@ -1657,6 +1657,13 @@ components: type: string destination_path: type: string + destination_mode: + type: string + enum: + - RequireExisting + - Create + - CreateIfMissing + default: RequireExisting VmDiskMirrorStatusData: required: diff --git a/vmm/src/device_manager.rs b/vmm/src/device_manager.rs index 094609b6db..de230797e1 100644 --- a/vmm/src/device_manager.rs +++ b/vmm/src/device_manager.rs @@ -113,6 +113,7 @@ use vm_migration::{ use vm_virtio::{AccessPlatform, VirtioDeviceType}; use vmm_sys_util::eventfd::EventFd; +use crate::api::MirrorDestinationMode; use crate::console_devices::{ConsoleDeviceError, ConsoleInfo, ConsoleTransport}; use crate::cpu::{AcpiCpuHotplugController, CPU_MANAGER_ACPI_SIZE, CpuManager}; use crate::device_tree::{DeviceNode, DeviceTree}; @@ -707,6 +708,12 @@ pub enum DeviceManagerError { )] 
BlockMirrorDestAlreadyExists(String, String), + /// The block mirroring destination path does not exist. + #[error( + "The block mirroring destination path does not exist for the disk with identifier: {0} at path: {1}" + )] + BlockMirrorDestMissing(String, String), + #[error("Failed to complete block mirror for disk {0}: {1}")] BlockMirrorComplete(String, #[source] BlockError), @@ -5369,7 +5376,12 @@ impl DeviceManager { /// /// Returns an error if no disk with the given identifier is attached /// to the VM, or the destination cannot be created or opened. - pub fn mirror_disk(&self, device_id: &str, dest_path: &Path) -> DeviceManagerResult<()> { + pub fn mirror_disk( + &self, + device_id: &str, + dest_path: &Path, + dest_mode: MirrorDestinationMode, + ) -> DeviceManagerResult<()> { for dev in &self.block_devices { let mut disk = dev.lock().unwrap(); if disk.id() != device_id { @@ -5427,10 +5439,26 @@ impl DeviceManager { ) }; - // TODO: make this configurable via request flags (create_disk, - // use_existing_disk). For now, create the destination only when it - // is missing and open it either way. - if !dest_path.exists() { + // Decide whether to create the destination based on the requested mode. 
+ let dest_exists = dest_path.exists(); + let id = device_id.to_string(); + let path = dest_path.display().to_string(); + let create = match dest_mode { + MirrorDestinationMode::RequireExisting => { + if !dest_exists { + return Err(DeviceManagerError::BlockMirrorDestMissing(id, path)); + } + false + } + MirrorDestinationMode::Create => { + if dest_exists { + return Err(DeviceManagerError::BlockMirrorDestAlreadyExists(id, path)); + } + true + } + MirrorDestinationMode::CreateIfMissing => !dest_exists, + }; + if create { let logical_size = disk.logical_size().map_err(DeviceManagerError::Disk)?; create_disk(options, image_type, logical_size).map_err(DeviceManagerError::Disk)?; } diff --git a/vmm/src/lib.rs b/vmm/src/lib.rs index e847c7001a..2baa71b95f 100644 --- a/vmm/src/lib.rs +++ b/vmm/src/lib.rs @@ -62,8 +62,9 @@ use vmm_sys_util::signal::unblock_signal; use vmm_sys_util::sock_ctrl_msg::ScmSocket; use crate::api::{ - ApiRequest, ApiResponse, RequestHandler, TimeoutStrategy, VmDiskMirrorStatusResponse, - VmInfoResponse, VmReceiveMigrationData, VmSendMigrationData, VmmPingResponse, + ApiRequest, ApiResponse, MirrorDestinationMode, RequestHandler, TimeoutStrategy, + VmDiskMirrorStatusResponse, VmInfoResponse, VmReceiveMigrationData, VmSendMigrationData, + VmmPingResponse, }; use crate::config::{MemoryRestoreMode, RestoreConfig, add_to_config}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -3536,11 +3537,14 @@ impl RequestHandler for Vmm { &mut self, id: String, destination_path: PathBuf, + destination_mode: MirrorDestinationMode, ) -> result::Result<(), VmError> { self.vm_config.as_ref().ok_or(VmError::VmNotCreated)?; match self.vm { - MaybeVmOwnership::Vmm(ref mut vm) => vm.mirror_disk(&id, &destination_path), + MaybeVmOwnership::Vmm(ref mut vm) => { + vm.mirror_disk(&id, &destination_path, destination_mode) + } MaybeVmOwnership::Migration(_) => Err(VmError::VmMigrating), MaybeVmOwnership::None => Err(VmError::DiskMirrorStart), } diff --git 
a/vmm/src/vm.rs b/vmm/src/vm.rs index f9c127ad4b..cf0da6b58c 100644 --- a/vmm/src/vm.rs +++ b/vmm/src/vm.rs @@ -86,6 +86,7 @@ use vm_migration::{ use vmm_sys_util::eventfd::EventFd; use vmm_sys_util::sock_ctrl_msg::ScmSocket; +use crate::api::MirrorDestinationMode; use crate::config::{MemoryRestoreMode, ValidationError, add_to_config}; use crate::console_devices::{ConsoleDeviceError, ConsoleInfo}; #[cfg(all(target_arch = "x86_64", feature = "guest_debug"))] @@ -3318,11 +3319,16 @@ impl Vm { .map_err(Error::ErrorNmi); } - pub fn mirror_disk(&self, id: &str, dest_path: &Path) -> Result<()> { + pub fn mirror_disk( + &self, + id: &str, + dest_path: &Path, + dest_mode: MirrorDestinationMode, + ) -> Result<()> { self.device_manager .lock() .unwrap() - .mirror_disk(id, dest_path) + .mirror_disk(id, dest_path, dest_mode) .map_err(Error::DeviceManager)?; Ok(())