xref: /cloud-hypervisor/block/src/lib.rs (revision eeae63b4595fbf0cc69f62b6e9d9a79c543c4ac7)
1 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 //
3 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
4 // Use of this source code is governed by a BSD-style license that can be
5 // found in the LICENSE-BSD-3-Clause file.
6 //
7 // Copyright © 2020 Intel Corporation
8 //
9 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
10 
11 #[macro_use]
12 extern crate log;
13 
14 pub mod async_io;
15 pub mod fixed_vhd;
16 #[cfg(feature = "io_uring")]
17 /// Enabled with the `"io_uring"` feature
18 pub mod fixed_vhd_async;
19 pub mod fixed_vhd_sync;
20 pub mod qcow;
21 pub mod qcow_sync;
22 #[cfg(feature = "io_uring")]
23 /// Async primitives based on `io-uring`
24 ///
25 /// Enabled with the `"io_uring"` feature
26 pub mod raw_async;
27 pub mod raw_async_aio;
28 pub mod raw_sync;
29 pub mod vhd;
30 pub mod vhdx;
31 pub mod vhdx_sync;
32 
33 use std::alloc::{alloc_zeroed, dealloc, Layout};
34 use std::collections::VecDeque;
35 use std::fmt::Debug;
36 use std::fs::File;
37 use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write};
38 use std::os::linux::fs::MetadataExt;
39 use std::os::unix::io::AsRawFd;
40 use std::path::Path;
41 use std::sync::{Arc, MutexGuard};
42 use std::time::Instant;
43 use std::{cmp, result};
44 
45 #[cfg(feature = "io_uring")]
46 use io_uring::{opcode, IoUring, Probe};
47 use libc::{ioctl, S_IFBLK, S_IFMT};
48 use serde::{Deserialize, Serialize};
49 use smallvec::SmallVec;
50 use thiserror::Error;
51 use virtio_bindings::virtio_blk::*;
52 use virtio_queue::DescriptorChain;
53 use vm_memory::bitmap::Bitmap;
54 use vm_memory::{
55     ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryLoadGuard,
56 };
57 use vm_virtio::{AccessPlatform, Translatable};
58 use vmm_sys_util::eventfd::EventFd;
59 use vmm_sys_util::{aio, ioctl_io_nr, ioctl_ioc_nr};
60 
61 use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult};
62 use crate::vhdx::VhdxError;
63 
64 const SECTOR_SHIFT: u8 = 9;
65 pub const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT;
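
// A minimal illustrative helper (not part of this crate's API): converting a
// byte length into a 512-byte sector count, rounding up the same way the
// request execution paths below do when bounds-checking data descriptors.
#[allow(dead_code)]
fn sectors_for_len(len: u64) -> u64 {
    // e.g. 4096 bytes -> 8 sectors, 4097 bytes -> 9 sectors
    len.div_ceil(SECTOR_SIZE)
}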
66 
67 #[derive(Error, Debug)]
68 pub enum Error {
69     #[error("Guest gave us bad memory addresses")]
70     GuestMemory(GuestMemoryError),
71     #[error("Guest gave us offsets that would have overflowed a usize")]
72     CheckedOffset(GuestAddress, usize),
73     #[error("Guest gave us a write-only descriptor that the protocol says to read from")]
74     UnexpectedWriteOnlyDescriptor,
75     #[error("Guest gave us a read-only descriptor that the protocol says to write to")]
76     UnexpectedReadOnlyDescriptor,
77     #[error("Guest gave us too few descriptors in a descriptor chain")]
78     DescriptorChainTooShort,
79     #[error("Guest gave us a descriptor that was too short to use")]
80     DescriptorLengthTooSmall,
81     #[error("Failed to detect image type: {0}")]
82     DetectImageType(std::io::Error),
83     #[error("Failure in fixed vhd: {0}")]
84     FixedVhdError(std::io::Error),
85     #[error("Failed to get a block's metadata")]
86     GetFileMetadata,
87     #[error("The requested operation would cause a seek beyond disk end")]
88     InvalidOffset,
89     #[error("Failure in qcow: {0}")]
90     QcowError(qcow::Error),
91     #[error("Failure in raw file: {0}")]
92     RawFileError(std::io::Error),
93     #[error("The requested operation does not support multiple descriptors")]
94     TooManyDescriptors,
95     #[error("Failure in vhdx: {0}")]
96     VhdxError(VhdxError),
97 }
98 
99 fn build_device_id(disk_path: &Path) -> result::Result<String, Error> {
100     let blk_metadata = match disk_path.metadata() {
101         Err(_) => return Err(Error::GetFileMetadata),
102         Ok(m) => m,
103     };
104     // This is how kvmtool does it.
105     let device_id = format!(
106         "{}{}{}",
107         blk_metadata.st_dev(),
108         blk_metadata.st_rdev(),
109         blk_metadata.st_ino()
110     );
111     Ok(device_id)
112 }
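
// For illustration (hypothetical values, not tied to a real file): the id is
// simply the decimal concatenation of st_dev, st_rdev and st_ino, so a file
// with st_dev = 2049, st_rdev = 0 and st_ino = 42 gets the id "2049042".
#[cfg(test)]
mod device_id_format_sketch {
    #[test]
    fn id_is_decimal_concatenation() {
        let (st_dev, st_rdev, st_ino) = (2049u64, 0u64, 42u64);
        assert_eq!(format!("{}{}{}", st_dev, st_rdev, st_ino), "2049042");
    }
}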
113 
114 pub fn build_serial(disk_path: &Path) -> Vec<u8> {
115     let mut default_serial = vec![0; VIRTIO_BLK_ID_BYTES as usize];
116     match build_device_id(disk_path) {
117         Err(_) => {
118             warn!("Could not generate device id. We'll use a default.");
119         }
120         Ok(m) => {
121             // The kernel only reads up to VIRTIO_BLK_ID_BYTES bytes of the serial.
122             // This will also zero out any leftover bytes.
123             let disk_id = m.as_bytes();
124             let bytes_to_copy = cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize);
125             default_serial[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy])
126         }
127     }
128     default_serial
129 }
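
// Usage sketch: build_serial() never fails; when the metadata lookup fails it
// falls back to an all-zero serial, and the result is always exactly
// VIRTIO_BLK_ID_BYTES long (the path below is hypothetical).
#[cfg(test)]
mod build_serial_sketch {
    use std::path::Path;

    use virtio_bindings::virtio_blk::VIRTIO_BLK_ID_BYTES;

    use super::build_serial;

    #[test]
    fn serial_is_always_virtio_blk_id_bytes_long() {
        let serial = build_serial(Path::new("/nonexistent/disk.raw"));
        assert_eq!(serial.len(), VIRTIO_BLK_ID_BYTES as usize);
    }
}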
130 
131 #[derive(Error, Debug)]
132 pub enum ExecuteError {
133     #[error("Bad request: {0}")]
134     BadRequest(Error),
135     #[error("Failed to flush: {0}")]
136     Flush(io::Error),
137     #[error("Failed to read: {0}")]
138     Read(GuestMemoryError),
139     #[error("Failed to read_exact: {0}")]
140     ReadExact(io::Error),
141     #[error("Failed to seek: {0}")]
142     Seek(io::Error),
143     #[error("Failed to write: {0}")]
144     Write(GuestMemoryError),
145     #[error("Failed to write_all: {0}")]
146     WriteAll(io::Error),
147     #[error("Unsupported request: {0}")]
148     Unsupported(u32),
149     #[error("Failed to submit io uring: {0}")]
150     SubmitIoUring(io::Error),
151     #[error("Failed to get host address: {0}")]
152     GetHostAddress(GuestMemoryError),
153     #[error("Failed to async read: {0}")]
154     AsyncRead(AsyncIoError),
155     #[error("Failed to async write: {0}")]
156     AsyncWrite(AsyncIoError),
157     #[error("Failed to async flush: {0}")]
158     AsyncFlush(AsyncIoError),
159     #[error("Failed allocating a temporary buffer: {0}")]
160     TemporaryBufferAllocation(io::Error),
161 }
162 
163 impl ExecuteError {
164     pub fn status(&self) -> u8 {
165         let status = match *self {
166             ExecuteError::BadRequest(_) => VIRTIO_BLK_S_IOERR,
167             ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
168             ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
169             ExecuteError::ReadExact(_) => VIRTIO_BLK_S_IOERR,
170             ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR,
171             ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR,
172             ExecuteError::WriteAll(_) => VIRTIO_BLK_S_IOERR,
173             ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
174             ExecuteError::SubmitIoUring(_) => VIRTIO_BLK_S_IOERR,
175             ExecuteError::GetHostAddress(_) => VIRTIO_BLK_S_IOERR,
176             ExecuteError::AsyncRead(_) => VIRTIO_BLK_S_IOERR,
177             ExecuteError::AsyncWrite(_) => VIRTIO_BLK_S_IOERR,
178             ExecuteError::AsyncFlush(_) => VIRTIO_BLK_S_IOERR,
179             ExecuteError::TemporaryBufferAllocation(_) => VIRTIO_BLK_S_IOERR,
180         };
181         status as u8
182     }
183 }
184 
185 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
186 pub enum RequestType {
187     In,
188     Out,
189     Flush,
190     GetDeviceId,
191     Unsupported(u32),
192 }
193 
194 pub fn request_type<B: Bitmap + 'static>(
195     mem: &vm_memory::GuestMemoryMmap<B>,
196     desc_addr: GuestAddress,
197 ) -> result::Result<RequestType, Error> {
198     let type_ = mem.read_obj(desc_addr).map_err(Error::GuestMemory)?;
199     match type_ {
200         VIRTIO_BLK_T_IN => Ok(RequestType::In),
201         VIRTIO_BLK_T_OUT => Ok(RequestType::Out),
202         VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush),
203         VIRTIO_BLK_T_GET_ID => Ok(RequestType::GetDeviceId),
204         t => Ok(RequestType::Unsupported(t)),
205     }
206 }
207 
208 fn sector<B: Bitmap + 'static>(
209     mem: &vm_memory::GuestMemoryMmap<B>,
210     desc_addr: GuestAddress,
211 ) -> result::Result<u64, Error> {
212     const SECTOR_OFFSET: usize = 8;
213     let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) {
214         Some(v) => v,
215         None => return Err(Error::CheckedOffset(desc_addr, SECTOR_OFFSET)),
216     };
217 
218     mem.read_obj(addr).map_err(Error::GuestMemory)
219 }
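
// For reference, the two helpers above read fields of the virtio-blk request
// header that the guest places at the start of the descriptor chain. A sketch
// of the equivalent layout from the virtio spec (illustrative only, this
// struct is not used by the crate):
#[allow(dead_code)]
#[repr(C)]
struct VirtioBlkReqHeaderSketch {
    request_type: u32, // offset 0: VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, ...
    reserved: u32,     // offset 4
    sector: u64,       // offset 8: matches SECTOR_OFFSET used by sector()
}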
220 
221 const DEFAULT_DESCRIPTOR_VEC_SIZE: usize = 32;
222 
223 #[derive(Debug)]
224 pub struct AlignedOperation {
225     origin_ptr: u64,
226     aligned_ptr: u64,
227     size: usize,
228     layout: Layout,
229 }
230 
231 #[derive(Debug)]
232 pub struct Request {
233     pub request_type: RequestType,
234     pub sector: u64,
235     pub data_descriptors: SmallVec<[(GuestAddress, u32); DEFAULT_DESCRIPTOR_VEC_SIZE]>,
236     pub status_addr: GuestAddress,
237     pub writeback: bool,
238     pub aligned_operations: SmallVec<[AlignedOperation; DEFAULT_DESCRIPTOR_VEC_SIZE]>,
239     pub start: Instant,
240 }
241 
242 impl Request {
243     pub fn parse<B: Bitmap + 'static>(
244         desc_chain: &mut DescriptorChain<GuestMemoryLoadGuard<vm_memory::GuestMemoryMmap<B>>>,
245         access_platform: Option<&Arc<dyn AccessPlatform>>,
246     ) -> result::Result<Request, Error> {
247         let hdr_desc = desc_chain
248             .next()
249             .ok_or(Error::DescriptorChainTooShort)
250             .inspect_err(|_| {
251                 error!("Missing head descriptor");
252             })?;
253 
254         // The head contains the request type which MUST be readable.
255         if hdr_desc.is_write_only() {
256             return Err(Error::UnexpectedWriteOnlyDescriptor);
257         }
258 
259         let hdr_desc_addr = hdr_desc
260             .addr()
261             .translate_gva(access_platform, hdr_desc.len() as usize);
262 
263         let mut req = Request {
264             request_type: request_type(desc_chain.memory(), hdr_desc_addr)?,
265             sector: sector(desc_chain.memory(), hdr_desc_addr)?,
266             data_descriptors: SmallVec::with_capacity(DEFAULT_DESCRIPTOR_VEC_SIZE),
267             status_addr: GuestAddress(0),
268             writeback: true,
269             aligned_operations: SmallVec::with_capacity(DEFAULT_DESCRIPTOR_VEC_SIZE),
270             start: Instant::now(),
271         };
272 
273         let status_desc;
274         let mut desc = desc_chain
275             .next()
276             .ok_or(Error::DescriptorChainTooShort)
277             .inspect_err(|_| {
278                 error!("Only head descriptor present: request = {:?}", req);
279             })?;
280 
281         if !desc.has_next() {
282             status_desc = desc;
283             // Only flush requests are allowed to skip the data descriptor.
284             if req.request_type != RequestType::Flush {
285                 error!("Need a data descriptor: request = {:?}", req);
286                 return Err(Error::DescriptorChainTooShort);
287             }
288         } else {
289             req.data_descriptors.reserve_exact(1);
290             while desc.has_next() {
291                 if desc.is_write_only() && req.request_type == RequestType::Out {
292                     return Err(Error::UnexpectedWriteOnlyDescriptor);
293                 }
294                 if !desc.is_write_only() && req.request_type == RequestType::In {
295                     return Err(Error::UnexpectedReadOnlyDescriptor);
296                 }
297                 if !desc.is_write_only() && req.request_type == RequestType::GetDeviceId {
298                     return Err(Error::UnexpectedReadOnlyDescriptor);
299                 }
300 
301                 req.data_descriptors.push((
302                     desc.addr()
303                         .translate_gva(access_platform, desc.len() as usize),
304                     desc.len(),
305                 ));
306                 desc = desc_chain
307                     .next()
308                     .ok_or(Error::DescriptorChainTooShort)
309                     .inspect_err(|_| {
310                         error!("DescriptorChain corrupted: request = {:?}", req);
311                     })?;
312             }
313             status_desc = desc;
314         }
315 
316         // The status MUST always be writable.
317         if !status_desc.is_write_only() {
318             return Err(Error::UnexpectedReadOnlyDescriptor);
319         }
320 
321         if status_desc.len() < 1 {
322             return Err(Error::DescriptorLengthTooSmall);
323         }
324 
325         req.status_addr = status_desc
326             .addr()
327             .translate_gva(access_platform, status_desc.len() as usize);
328 
329         Ok(req)
330     }
331 
332     pub fn execute<T: Seek + Read + Write, B: Bitmap + 'static>(
333         &self,
334         disk: &mut T,
335         disk_nsectors: u64,
336         mem: &vm_memory::GuestMemoryMmap<B>,
337         serial: &[u8],
338     ) -> result::Result<u32, ExecuteError> {
339         disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT))
340             .map_err(ExecuteError::Seek)?;
341         let mut len = 0;
342         for (data_addr, data_len) in &self.data_descriptors {
343             let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
344             if u64::from(*data_len) % SECTOR_SIZE != 0 {
345                 top += 1;
346             }
347             top = top
348                 .checked_add(self.sector)
349                 .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
350             if top > disk_nsectors {
351                 return Err(ExecuteError::BadRequest(Error::InvalidOffset));
352             }
353 
354             match self.request_type {
355                 RequestType::In => {
356                     let mut buf = vec![0u8; *data_len as usize];
357                     disk.read_exact(&mut buf).map_err(ExecuteError::ReadExact)?;
358                     mem.read_exact_volatile_from(
359                         *data_addr,
360                         &mut buf.as_slice(),
361                         *data_len as usize,
362                     )
363                     .map_err(ExecuteError::Read)?;
364                     len += data_len;
365                 }
366                 RequestType::Out => {
367                     let mut buf: Vec<u8> = Vec::new();
368                     mem.write_all_volatile_to(*data_addr, &mut buf, *data_len as usize)
369                         .map_err(ExecuteError::Write)?;
370                     disk.write_all(&buf).map_err(ExecuteError::WriteAll)?;
371                     if !self.writeback {
372                         disk.flush().map_err(ExecuteError::Flush)?;
373                     }
374                 }
375                 RequestType::Flush => disk.flush().map_err(ExecuteError::Flush)?,
376                 RequestType::GetDeviceId => {
377                     if (*data_len as usize) < serial.len() {
378                         return Err(ExecuteError::BadRequest(Error::InvalidOffset));
379                     }
380                     mem.write_slice(serial, *data_addr)
381                         .map_err(ExecuteError::Write)?;
382                 }
383                 RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
384             };
385         }
386         Ok(len)
387     }
388 
389     pub fn execute_async<B: Bitmap + 'static>(
390         &mut self,
391         mem: &vm_memory::GuestMemoryMmap<B>,
392         disk_nsectors: u64,
393         disk_image: &mut dyn AsyncIo,
394         serial: &[u8],
395         user_data: u64,
396     ) -> result::Result<bool, ExecuteError> {
397         let sector = self.sector;
398         let request_type = self.request_type;
399         let offset = (sector << SECTOR_SHIFT) as libc::off_t;
400 
401         let mut iovecs: SmallVec<[libc::iovec; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
402             SmallVec::with_capacity(self.data_descriptors.len());
403         for (data_addr, data_len) in &self.data_descriptors {
404             if *data_len == 0 {
405                 continue;
406             }
407             let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
408             if u64::from(*data_len) % SECTOR_SIZE != 0 {
409                 top += 1;
410             }
411             top = top
412                 .checked_add(sector)
413                 .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
414             if top > disk_nsectors {
415                 return Err(ExecuteError::BadRequest(Error::InvalidOffset));
416             }
417 
418             let origin_ptr = mem
419                 .get_slice(*data_addr, *data_len as usize)
420                 .map_err(ExecuteError::GetHostAddress)?
421                 .ptr_guard();
422 
423             // Verify the buffer alignment.
424             // In case it's not properly aligned, an intermediate buffer is
425             // created with the correct alignment, and a copy from/to the
426             // origin buffer is performed, depending on the type of operation.
427             let iov_base = if (origin_ptr.as_ptr() as u64) % SECTOR_SIZE != 0 {
428                 let layout =
429                     Layout::from_size_align(*data_len as usize, SECTOR_SIZE as usize).unwrap();
430                 // SAFETY: layout has non-zero size
431                 let aligned_ptr = unsafe { alloc_zeroed(layout) };
432                 if aligned_ptr.is_null() {
433                     return Err(ExecuteError::TemporaryBufferAllocation(
434                         io::Error::last_os_error(),
435                     ));
436                 }
437 
438                 // We need to perform the copy beforehand in case we're writing
439                 // data out.
440                 if request_type == RequestType::Out {
441                     // SAFETY: destination buffer has been allocated with
442                     // the proper size.
443                     unsafe { std::ptr::copy(origin_ptr.as_ptr(), aligned_ptr, *data_len as usize) };
444                 }
445 
446                 // Store both origin and aligned pointers for complete_async()
447                 // to process them.
448                 self.aligned_operations.push(AlignedOperation {
449                     origin_ptr: origin_ptr.as_ptr() as u64,
450                     aligned_ptr: aligned_ptr as u64,
451                     size: *data_len as usize,
452                     layout,
453                 });
454 
455                 aligned_ptr as *mut libc::c_void
456             } else {
457                 origin_ptr.as_ptr() as *mut libc::c_void
458             };
459 
460             let iovec = libc::iovec {
461                 iov_base,
462                 iov_len: *data_len as libc::size_t,
463             };
464             iovecs.push(iovec);
465         }
466 
467         // Queue operations expected to be submitted.
468         match request_type {
469             RequestType::In => {
470                 for (data_addr, data_len) in &self.data_descriptors {
471                     mem.get_slice(*data_addr, *data_len as usize)
472                         .map_err(ExecuteError::GetHostAddress)?
473                         .bitmap()
474                         .mark_dirty(0, *data_len as usize);
475                 }
476                 disk_image
477                     .read_vectored(offset, &iovecs, user_data)
478                     .map_err(ExecuteError::AsyncRead)?;
479             }
480             RequestType::Out => {
481                 disk_image
482                     .write_vectored(offset, &iovecs, user_data)
483                     .map_err(ExecuteError::AsyncWrite)?;
484             }
485             RequestType::Flush => {
486                 disk_image
487                     .fsync(Some(user_data))
488                     .map_err(ExecuteError::AsyncFlush)?;
489             }
490             RequestType::GetDeviceId => {
491                 let (data_addr, data_len) = if self.data_descriptors.len() == 1 {
492                     (self.data_descriptors[0].0, self.data_descriptors[0].1)
493                 } else {
494                     return Err(ExecuteError::BadRequest(Error::TooManyDescriptors));
495                 };
496                 if (data_len as usize) < serial.len() {
497                     return Err(ExecuteError::BadRequest(Error::InvalidOffset));
498                 }
499                 mem.write_slice(serial, data_addr)
500                     .map_err(ExecuteError::Write)?;
501                 return Ok(false);
502             }
503             RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
504         }
505 
506         Ok(true)
507     }
508 
509     pub fn complete_async(&mut self) -> result::Result<(), Error> {
510         for aligned_operation in self.aligned_operations.drain(..) {
511             // We need to perform the copy after the data has been read into
512             // the aligned buffer in case we're reading data in.
513             if self.request_type == RequestType::In {
514                 // SAFETY: origin buffer has been allocated with the
515                 // proper size.
516                 unsafe {
517                     std::ptr::copy(
518                         aligned_operation.aligned_ptr as *const u8,
519                         aligned_operation.origin_ptr as *mut u8,
520                         aligned_operation.size,
521                     )
522                 };
523             }
524 
525             // Free the temporary aligned buffer.
526             // SAFETY: aligned_ptr was allocated by alloc_zeroed with the same
527             // layout
528             unsafe {
529                 dealloc(
530                     aligned_operation.aligned_ptr as *mut u8,
531                     aligned_operation.layout,
532                 )
533             };
534         }
535 
536         Ok(())
537     }
538 
539     pub fn set_writeback(&mut self, writeback: bool) {
540         self.writeback = writeback
541     }
542 }
543 
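// A minimal sketch of how a synchronous backend typically drives a parsed
// request (hypothetical helper, not part of this crate's API): execute the
// request against the disk, then write the virtio status byte back to the
// guest at `status_addr`.
#[allow(dead_code)]
fn execute_and_report_status_sketch<T, B>(
    request: &Request,
    disk: &mut T,
    disk_nsectors: u64,
    mem: &vm_memory::GuestMemoryMmap<B>,
    serial: &[u8],
) -> result::Result<(), GuestMemoryError>
where
    T: Seek + Read + Write,
    B: Bitmap + 'static,
{
    let status = match request.execute(disk, disk_nsectors, mem, serial) {
        Ok(_) => VIRTIO_BLK_S_OK as u8,
        Err(e) => e.status(),
    };
    mem.write_obj(status, request.status_addr)
}
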
544 #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
545 #[repr(C, packed)]
546 pub struct VirtioBlockConfig {
547     pub capacity: u64,
548     pub size_max: u32,
549     pub seg_max: u32,
550     pub geometry: VirtioBlockGeometry,
551     pub blk_size: u32,
552     pub physical_block_exp: u8,
553     pub alignment_offset: u8,
554     pub min_io_size: u16,
555     pub opt_io_size: u32,
556     pub writeback: u8,
557     pub unused: u8,
558     pub num_queues: u16,
559     pub max_discard_sectors: u32,
560     pub max_discard_seg: u32,
561     pub discard_sector_alignment: u32,
562     pub max_write_zeroes_sectors: u32,
563     pub max_write_zeroes_seg: u32,
564     pub write_zeroes_may_unmap: u8,
565     pub unused1: [u8; 3],
566 }
567 #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
568 #[repr(C, packed)]
569 pub struct VirtioBlockGeometry {
570     pub cylinders: u16,
571     pub heads: u8,
572     pub sectors: u8,
573 }
574 
575 // SAFETY: data structure only contains a series of integers
576 unsafe impl ByteValued for VirtioBlockConfig {}
577 // SAFETY: data structure only contains a series of integers
578 unsafe impl ByteValued for VirtioBlockGeometry {}
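
// ByteValued lets the device expose VirtioBlockConfig directly as virtio
// config space bytes. A small layout sanity check (the 60-byte size follows
// from summing the packed fields above):
#[cfg(test)]
mod virtio_block_config_layout_sketch {
    use super::VirtioBlockConfig;

    #[test]
    fn packed_size_is_60_bytes() {
        assert_eq!(std::mem::size_of::<VirtioBlockConfig>(), 60);
    }
}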
579 
580 /// Check if aio can be used on the current system.
581 pub fn block_aio_is_supported() -> bool {
582     aio::IoContext::new(1).is_ok()
583 }
584 
585 /// Check if io_uring can be used for block devices on the current system,
586 /// i.e. whether the kernel supports the io_uring operations this crate relies on.
587 pub fn block_io_uring_is_supported() -> bool {
588     #[cfg(not(feature = "io_uring"))]
589     {
590         info!("io_uring is disabled by crate features");
591         false
592     }
593 
594     #[cfg(feature = "io_uring")]
595     {
596         let error_msg = "io_uring not supported:";
597 
598         // Check we can create an io_uring instance, which effectively verifies
599         // that the io_uring_setup() syscall is supported.
600         let io_uring = match IoUring::new(1) {
601             Ok(io_uring) => io_uring,
602             Err(e) => {
603                 info!("{} failed to create io_uring instance: {}", error_msg, e);
604                 return false;
605             }
606         };
607 
608         let submitter = io_uring.submitter();
609 
610         let mut probe = Probe::new();
611 
612         // Check we can register a probe to validate supported operations.
613         match submitter.register_probe(&mut probe) {
614             Ok(_) => {}
615             Err(e) => {
616                 info!("{} failed to register a probe: {}", error_msg, e);
617                 return false;
618             }
619         }
620 
621         // Check IORING_OP_FSYNC is supported
622         if !probe.is_supported(opcode::Fsync::CODE) {
623             info!("{} IORING_OP_FSYNC operation not supported", error_msg);
624             return false;
625         }
626 
627         // Check IORING_OP_READV is supported
628         if !probe.is_supported(opcode::Readv::CODE) {
629             info!("{} IORING_OP_READV operation not supported", error_msg);
630             return false;
631         }
632 
633         // Check IORING_OP_WRITEV is supported
634         if !probe.is_supported(opcode::Writev::CODE) {
635             info!("{} IORING_OP_WRITEV operation not supported", error_msg);
636             return false;
637         }
638 
639         true
640     }
641 }
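
// Selection sketch (hypothetical logic, not this crate's API): probe once at
// startup and pick the asynchronous I/O engine accordingly.
#[allow(dead_code)]
fn pick_io_engine_sketch() -> &'static str {
    if block_io_uring_is_supported() {
        "io_uring"
    } else if block_aio_is_supported() {
        "aio"
    } else {
        "sync"
    }
}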
642 
643 pub trait AsyncAdaptor<F>
644 where
645     F: Read + Write + Seek,
646 {
647     fn read_vectored_sync(
648         &mut self,
649         offset: libc::off_t,
650         iovecs: &[libc::iovec],
651         user_data: u64,
652         eventfd: &EventFd,
653         completion_list: &mut VecDeque<(u64, i32)>,
654     ) -> AsyncIoResult<()> {
655         // Convert libc::iovec into IoSliceMut
656         let mut slices: SmallVec<[IoSliceMut; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
657             SmallVec::with_capacity(iovecs.len());
658         for iovec in iovecs.iter() {
659             // SAFETY: on Linux IoSliceMut wraps around libc::iovec
660             slices.push(IoSliceMut::new(unsafe {
661                 std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
662             }));
663         }
664 
665         let result = {
666             let mut file = self.file();
667 
668             // Move the cursor to the right offset
669             file.seek(SeekFrom::Start(offset as u64))
670                 .map_err(AsyncIoError::ReadVectored)?;
671 
672             let mut r = 0;
673             for b in slices.iter_mut() {
674                 r += file.read(b).map_err(AsyncIoError::ReadVectored)?;
675             }
676             r
677         };
678 
679         completion_list.push_back((user_data, result as i32));
680         eventfd.write(1).unwrap();
681 
682         Ok(())
683     }
684 
685     fn write_vectored_sync(
686         &mut self,
687         offset: libc::off_t,
688         iovecs: &[libc::iovec],
689         user_data: u64,
690         eventfd: &EventFd,
691         completion_list: &mut VecDeque<(u64, i32)>,
692     ) -> AsyncIoResult<()> {
693         // Convert libc::iovec into IoSlice
694         let mut slices: SmallVec<[IoSlice; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
695             SmallVec::with_capacity(iovecs.len());
696         for iovec in iovecs.iter() {
697             // SAFETY: on Linux IoSlice wraps around libc::iovec
698             slices.push(IoSlice::new(unsafe {
699                 std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
700             }));
701         }
702 
703         let result = {
704             let mut file = self.file();
705 
706             // Move the cursor to the right offset
707             file.seek(SeekFrom::Start(offset as u64))
708                 .map_err(AsyncIoError::WriteVectored)?;
709 
710             let mut r = 0;
711             for b in slices.iter() {
712                 r += file.write(b).map_err(AsyncIoError::WriteVectored)?;
713             }
714             r
715         };
716 
717         completion_list.push_back((user_data, result as i32));
718         eventfd.write(1).unwrap();
719 
720         Ok(())
721     }
722 
723     fn fsync_sync(
724         &mut self,
725         user_data: Option<u64>,
726         eventfd: &EventFd,
727         completion_list: &mut VecDeque<(u64, i32)>,
728     ) -> AsyncIoResult<()> {
729         let result: i32 = {
730             let mut file = self.file();
731 
732             // Flush
733             file.flush().map_err(AsyncIoError::Fsync)?;
734 
735             0
736         };
737 
738         if let Some(user_data) = user_data {
739             completion_list.push_back((user_data, result));
740             eventfd.write(1).unwrap();
741         }
742 
743         Ok(())
744     }
745 
746     fn file(&mut self) -> MutexGuard<F>;
747 }
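
// A minimal sketch of plugging a synchronous backend into AsyncAdaptor
// (hypothetical type, loosely modelled on the *_sync backends in this crate):
// only file() needs to be provided; the vectored read/write/fsync emulation
// comes from the default methods above.
#[allow(dead_code)]
struct SyncFileAdaptorSketch {
    file: std::sync::Mutex<File>,
}

impl AsyncAdaptor<File> for SyncFileAdaptorSketch {
    fn file(&mut self) -> MutexGuard<File> {
        self.file.lock().unwrap()
    }
}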
748 
749 pub enum ImageType {
750     FixedVhd,
751     Qcow2,
752     Raw,
753     Vhdx,
754 }
755 
756 const QCOW_MAGIC: u32 = 0x5146_49fb;
757 const VHDX_SIGN: u64 = 0x656C_6966_7864_6876;
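
// The magic values spelled out: QCOW_MAGIC is the big-endian "QFI\xfb" qcow2
// header and VHDX_SIGN is the little-endian ASCII signature "vhdxfile".
#[cfg(test)]
mod image_magic_sketch {
    use super::{QCOW_MAGIC, VHDX_SIGN};

    #[test]
    fn magic_values_decode_to_expected_ascii() {
        assert_eq!(QCOW_MAGIC.to_be_bytes(), [b'Q', b'F', b'I', 0xfb]);
        assert_eq!(&VHDX_SIGN.to_le_bytes(), b"vhdxfile");
    }
}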
758 
759 /// Read a block into memory aligned to the source block size (needed for O_DIRECT)
760 pub fn read_aligned_block_size(f: &mut File) -> std::io::Result<Vec<u8>> {
761     let blocksize = DiskTopology::probe(f)?.logical_block_size as usize;
762     // SAFETY: We are allocating memory that is naturally aligned (size = alignment) and we meet
763     // requirements for safety from Vec::from_raw_parts() as we are using the global allocator
764     // and transferring ownership of the memory.
765     let mut data = unsafe {
766         Vec::from_raw_parts(
767             alloc_zeroed(Layout::from_size_align_unchecked(blocksize, blocksize)),
768             blocksize,
769             blocksize,
770         )
771     };
772     f.read_exact(&mut data)?;
773     Ok(data)
774 }
775 
776 /// Determine image type through file parsing.
777 pub fn detect_image_type(f: &mut File) -> std::io::Result<ImageType> {
778     let block = read_aligned_block_size(f)?;
779 
780     // Check the first 4 bytes to get the header value and determine the image type
781     let image_type = if u32::from_be_bytes(block[0..4].try_into().unwrap()) == QCOW_MAGIC {
782         ImageType::Qcow2
783     } else if vhd::is_fixed_vhd(f)? {
784         ImageType::FixedVhd
785     } else if u64::from_le_bytes(block[0..8].try_into().unwrap()) == VHDX_SIGN {
786         ImageType::Vhdx
787     } else {
788         ImageType::Raw
789     };
790 
791     Ok(image_type)
792 }
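
// Usage sketch (hypothetical helper): probe an already-opened image and map
// the detected type to a short name, e.g. for logging.
#[allow(dead_code)]
fn image_type_name_sketch(f: &mut File) -> std::io::Result<&'static str> {
    Ok(match detect_image_type(f)? {
        ImageType::FixedVhd => "fixed-vhd",
        ImageType::Qcow2 => "qcow2",
        ImageType::Vhdx => "vhdx",
        ImageType::Raw => "raw",
    })
}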
793 
794 pub trait BlockBackend: Read + Write + Seek + Send + Debug {
795     fn size(&self) -> Result<u64, Error>;
796 }
797 
798 #[derive(Debug)]
799 pub struct DiskTopology {
800     pub logical_block_size: u64,
801     pub physical_block_size: u64,
802     pub minimum_io_size: u64,
803     pub optimal_io_size: u64,
804 }
805 
806 impl Default for DiskTopology {
807     fn default() -> Self {
808         Self {
809             logical_block_size: 512,
810             physical_block_size: 512,
811             minimum_io_size: 512,
812             optimal_io_size: 0,
813         }
814     }
815 }
816 
817 ioctl_io_nr!(BLKSSZGET, 0x12, 104);
818 ioctl_io_nr!(BLKPBSZGET, 0x12, 123);
819 ioctl_io_nr!(BLKIOMIN, 0x12, 120);
820 ioctl_io_nr!(BLKIOOPT, 0x12, 121);
821 
822 enum BlockSize {
823     LogicalBlock,
824     PhysicalBlock,
825     MinimumIo,
826     OptimalIo,
827 }
828 
829 impl DiskTopology {
830     fn is_block_device(f: &File) -> std::io::Result<bool> {
831         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
832         // SAFETY: FFI call with a valid fd and buffer
833         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
834         if ret != 0 {
835             return Err(std::io::Error::last_os_error());
836         }
837 
838         // SAFETY: stat is valid at this point
839         let is_block = unsafe { (*stat.as_ptr()).st_mode & S_IFMT == S_IFBLK };
840         Ok(is_block)
841     }
842 
843     // libc::ioctl() takes different types on different architectures
844     fn query_block_size(f: &File, block_size_type: BlockSize) -> std::io::Result<u64> {
845         let mut block_size = 0;
846         // SAFETY: FFI call with correct arguments
847         let ret = unsafe {
848             ioctl(
849                 f.as_raw_fd(),
850                 match block_size_type {
851                     BlockSize::LogicalBlock => BLKSSZGET(),
852                     BlockSize::PhysicalBlock => BLKPBSZGET(),
853                     BlockSize::MinimumIo => BLKIOMIN(),
854                     BlockSize::OptimalIo => BLKIOOPT(),
855                 } as _,
856                 &mut block_size,
857             )
858         };
859         if ret != 0 {
860             return Err(std::io::Error::last_os_error());
861         };
862 
863         Ok(block_size)
864     }
865 
866     pub fn probe(f: &File) -> std::io::Result<Self> {
867         if !Self::is_block_device(f)? {
868             return Ok(DiskTopology::default());
869         }
870 
871         Ok(DiskTopology {
872             logical_block_size: Self::query_block_size(f, BlockSize::LogicalBlock)?,
873             physical_block_size: Self::query_block_size(f, BlockSize::PhysicalBlock)?,
874             minimum_io_size: Self::query_block_size(f, BlockSize::MinimumIo)?,
875             optimal_io_size: Self::query_block_size(f, BlockSize::OptimalIo)?,
876         })
877     }
878 }
879
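
// Usage sketch (hypothetical helper): probe the topology of an open disk;
// regular files fall back to the 512-byte defaults from Default.
#[allow(dead_code)]
fn log_disk_topology_sketch(f: &File) -> std::io::Result<()> {
    let topology = DiskTopology::probe(f)?;
    info!(
        "disk topology: logical block size {}, physical block size {}",
        topology.logical_block_size, topology.physical_block_size
    );
    Ok(())
}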