xref: /cloud-hypervisor/block/src/lib.rs (revision bc6acb842f1ebb263245cd95fe5a92fe5f350bd3)
1 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 //
3 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
4 // Use of this source code is governed by a BSD-style license that can be
5 // found in the LICENSE-BSD-3-Clause file.
6 //
7 // Copyright © 2020 Intel Corporation
8 //
9 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
10 
11 #[macro_use]
12 extern crate log;
13 
14 pub mod async_io;
15 pub mod fixed_vhd;
16 #[cfg(feature = "io_uring")]
17 /// Enabled with the `"io_uring"` feature
18 pub mod fixed_vhd_async;
19 pub mod fixed_vhd_sync;
20 pub mod qcow;
21 pub mod qcow_sync;
22 #[cfg(feature = "io_uring")]
23 /// Async primitives based on `io-uring`
24 ///
25 /// Enabled with the `"io_uring"` feature
26 pub mod raw_async;
27 pub mod raw_async_aio;
28 pub mod raw_sync;
29 pub mod vhd;
30 pub mod vhdx;
31 pub mod vhdx_sync;
32 
33 use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult};
34 use crate::fixed_vhd::FixedVhd;
35 use crate::qcow::{QcowFile, RawFile};
36 use crate::vhdx::{Vhdx, VhdxError};
37 #[cfg(feature = "io_uring")]
38 use io_uring::{opcode, IoUring, Probe};
39 use libc::{ioctl, S_IFBLK, S_IFMT};
40 use serde::{Deserialize, Serialize};
41 use smallvec::SmallVec;
42 use std::alloc::{alloc_zeroed, dealloc, Layout};
43 use std::cmp;
44 use std::collections::VecDeque;
45 use std::fmt::Debug;
46 use std::fs::File;
47 use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write};
48 use std::os::linux::fs::MetadataExt;
49 use std::os::unix::io::AsRawFd;
50 use std::path::Path;
51 use std::result;
52 use std::sync::Arc;
53 use std::sync::MutexGuard;
54 use std::time::Instant;
55 use thiserror::Error;
56 use virtio_bindings::virtio_blk::*;
57 use virtio_queue::DescriptorChain;
58 use vm_memory::{
59     bitmap::Bitmap, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryError,
60     GuestMemoryLoadGuard,
61 };
62 use vm_virtio::{AccessPlatform, Translatable};
63 use vmm_sys_util::aio;
64 use vmm_sys_util::eventfd::EventFd;
65 use vmm_sys_util::{ioctl_io_nr, ioctl_ioc_nr};
66 
67 const SECTOR_SHIFT: u8 = 9;
68 pub const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT;
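// A virtio-blk sector is always 512 bytes, regardless of the backing device's
// block size: byte offsets are derived as `sector << SECTOR_SHIFT`, and request
// lengths are converted to sectors by dividing by SECTOR_SIZE and rounding up
// (see Request::execute()/execute_async() below).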
69 
70 #[derive(Error, Debug)]
71 pub enum Error {
72     #[error("Guest gave us bad memory addresses")]
73     GuestMemory(GuestMemoryError),
74     #[error("Guest gave us offsets that would have overflowed a usize")]
75     CheckedOffset(GuestAddress, usize),
76     #[error("Guest gave us a write only descriptor that protocol says to read from")]
77     UnexpectedWriteOnlyDescriptor,
78     #[error("Guest gave us a read only descriptor that protocol says to write to")]
79     UnexpectedReadOnlyDescriptor,
80     #[error("Guest gave us too few descriptors in a descriptor chain")]
81     DescriptorChainTooShort,
82     #[error("Guest gave us a descriptor that was too short to use")]
83     DescriptorLengthTooSmall,
84     #[error("Failed to detect image type: {0}")]
85     DetectImageType(std::io::Error),
86     #[error("Failure in fixed vhd: {0}")]
87     FixedVhdError(std::io::Error),
88     #[error("Getting a block's metadata fails for any reason")]
89     GetFileMetadata,
90     #[error("The requested operation would cause a seek beyond disk end")]
91     InvalidOffset,
92     #[error("Failure in qcow: {0}")]
93     QcowError(qcow::Error),
94     #[error("Failure in raw file: {0}")]
95     RawFileError(std::io::Error),
96     #[error("The requested operation does not support multiple descriptors")]
97     TooManyDescriptors,
98     #[error("Failure in vhdx: {0}")]
99     VhdxError(VhdxError),
100 }
101 
102 fn build_device_id(disk_path: &Path) -> result::Result<String, Error> {
103     let blk_metadata = match disk_path.metadata() {
104         Err(_) => return Err(Error::GetFileMetadata),
105         Ok(m) => m,
106     };
107     // This is how kvmtool does it.
108     let device_id = format!(
109         "{}{}{}",
110         blk_metadata.st_dev(),
111         blk_metadata.st_rdev(),
112         blk_metadata.st_ino()
113     );
114     Ok(device_id)
115 }
116 
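/// Build the virtio-blk serial for a disk image from its device and inode
/// numbers, padded or truncated to VIRTIO_BLK_ID_BYTES bytes (a zeroed default
/// is used if the metadata cannot be read).
///
/// Illustrative usage sketch (the path is hypothetical):
///
/// ```ignore
/// use std::path::Path;
///
/// let serial = build_serial(Path::new("/path/to/disk.img"));
/// assert_eq!(serial.len(), 20); // VIRTIO_BLK_ID_BYTES
/// ```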
117 pub fn build_serial(disk_path: &Path) -> Vec<u8> {
118     let mut default_serial = vec![0; VIRTIO_BLK_ID_BYTES as usize];
119     match build_device_id(disk_path) {
120         Err(_) => {
121             warn!("Could not generate device id. We'll use a default.");
122         }
123         Ok(m) => {
124             // The kernel reads at most VIRTIO_BLK_ID_BYTES bytes of the serial.
125             // Any bytes beyond the device id remain zeroed from the initial allocation.
126             let disk_id = m.as_bytes();
127             let bytes_to_copy = cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize);
128             default_serial[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy])
129         }
130     }
131     default_serial
132 }
133 
134 #[derive(Error, Debug)]
135 pub enum ExecuteError {
136     #[error("Bad request: {0}")]
137     BadRequest(Error),
138     #[error("Failed to flush: {0}")]
139     Flush(io::Error),
140     #[error("Failed to read: {0}")]
141     Read(GuestMemoryError),
142     #[error("Failed to read_exact: {0}")]
143     ReadExact(io::Error),
144     #[error("Failed to seek: {0}")]
145     Seek(io::Error),
146     #[error("Failed to write: {0}")]
147     Write(GuestMemoryError),
148     #[error("Failed to write_all: {0}")]
149     WriteAll(io::Error),
150     #[error("Unsupported request: {0}")]
151     Unsupported(u32),
152     #[error("Failed to submit io uring: {0}")]
153     SubmitIoUring(io::Error),
154     #[error("Failed to get guest address: {0}")]
155     GetHostAddress(GuestMemoryError),
156     #[error("Failed to async read: {0}")]
157     AsyncRead(AsyncIoError),
158     #[error("Failed to async write: {0}")]
159     AsyncWrite(AsyncIoError),
160     #[error("failed to async flush: {0}")]
161     AsyncFlush(AsyncIoError),
162     #[error("Failed allocating a temporary buffer: {0}")]
163     TemporaryBufferAllocation(io::Error),
164 }
165 
166 impl ExecuteError {
167     pub fn status(&self) -> u8 {
168         let status = match *self {
169             ExecuteError::BadRequest(_) => VIRTIO_BLK_S_IOERR,
170             ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
171             ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
172             ExecuteError::ReadExact(_) => VIRTIO_BLK_S_IOERR,
173             ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR,
174             ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR,
175             ExecuteError::WriteAll(_) => VIRTIO_BLK_S_IOERR,
176             ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
177             ExecuteError::SubmitIoUring(_) => VIRTIO_BLK_S_IOERR,
178             ExecuteError::GetHostAddress(_) => VIRTIO_BLK_S_IOERR,
179             ExecuteError::AsyncRead(_) => VIRTIO_BLK_S_IOERR,
180             ExecuteError::AsyncWrite(_) => VIRTIO_BLK_S_IOERR,
181             ExecuteError::AsyncFlush(_) => VIRTIO_BLK_S_IOERR,
182             ExecuteError::TemporaryBufferAllocation(_) => VIRTIO_BLK_S_IOERR,
183         };
184         status as u8
185     }
186 }
187 
188 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
189 pub enum RequestType {
190     In,
191     Out,
192     Flush,
193     GetDeviceId,
194     Unsupported(u32),
195 }
196 
197 pub fn request_type<B: Bitmap + 'static>(
198     mem: &vm_memory::GuestMemoryMmap<B>,
199     desc_addr: GuestAddress,
200 ) -> result::Result<RequestType, Error> {
201     let type_ = mem.read_obj(desc_addr).map_err(Error::GuestMemory)?;
202     match type_ {
203         VIRTIO_BLK_T_IN => Ok(RequestType::In),
204         VIRTIO_BLK_T_OUT => Ok(RequestType::Out),
205         VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush),
206         VIRTIO_BLK_T_GET_ID => Ok(RequestType::GetDeviceId),
207         t => Ok(RequestType::Unsupported(t)),
208     }
209 }
210 
211 fn sector<B: Bitmap + 'static>(
212     mem: &vm_memory::GuestMemoryMmap<B>,
213     desc_addr: GuestAddress,
214 ) -> result::Result<u64, Error> {
215     const SECTOR_OFFSET: usize = 8;
216     let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) {
217         Some(v) => v,
218         None => return Err(Error::CheckedOffset(desc_addr, SECTOR_OFFSET)),
219     };
220 
221     mem.read_obj(addr).map_err(Error::GuestMemory)
222 }
223 
224 #[derive(Debug)]
225 pub struct AlignedOperation {
226     origin_ptr: u64,
227     aligned_ptr: u64,
228     size: usize,
229     layout: Layout,
230 }
231 
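/// A parsed virtio-blk request: the descriptor chain is expected to carry the
/// request header first, then zero or more data descriptors, and finally a
/// one-byte status descriptor that the device writes back.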
232 #[derive(Debug)]
233 pub struct Request {
234     pub request_type: RequestType,
235     pub sector: u64,
236     pub data_descriptors: SmallVec<[(GuestAddress, u32); 1]>,
237     pub status_addr: GuestAddress,
238     pub writeback: bool,
239     pub aligned_operations: SmallVec<[AlignedOperation; 1]>,
240     pub start: Instant,
241 }
242 
243 impl Request {
244     pub fn parse<B: Bitmap + 'static>(
245         desc_chain: &mut DescriptorChain<GuestMemoryLoadGuard<vm_memory::GuestMemoryMmap<B>>>,
246         access_platform: Option<&Arc<dyn AccessPlatform>>,
247     ) -> result::Result<Request, Error> {
248         let hdr_desc = desc_chain
249             .next()
250             .ok_or(Error::DescriptorChainTooShort)
251             .map_err(|e| {
252                 error!("Missing head descriptor");
253                 e
254             })?;
255 
256         // The head contains the request type which MUST be readable.
257         if hdr_desc.is_write_only() {
258             return Err(Error::UnexpectedWriteOnlyDescriptor);
259         }
260 
261         let hdr_desc_addr = hdr_desc
262             .addr()
263             .translate_gva(access_platform, hdr_desc.len() as usize);
264 
265         let mut req = Request {
266             request_type: request_type(desc_chain.memory(), hdr_desc_addr)?,
267             sector: sector(desc_chain.memory(), hdr_desc_addr)?,
268             data_descriptors: SmallVec::with_capacity(1),
269             status_addr: GuestAddress(0),
270             writeback: true,
271             aligned_operations: SmallVec::with_capacity(1),
272             start: Instant::now(),
273         };
274 
275         let status_desc;
276         let mut desc = desc_chain
277             .next()
278             .ok_or(Error::DescriptorChainTooShort)
279             .map_err(|e| {
280                 error!("Only head descriptor present: request = {:?}", req);
281                 e
282             })?;
283 
284         if !desc.has_next() {
285             status_desc = desc;
286             // Only flush requests are allowed to skip the data descriptor.
287             if req.request_type != RequestType::Flush {
288                 error!("Need a data descriptor: request = {:?}", req);
289                 return Err(Error::DescriptorChainTooShort);
290             }
291         } else {
292             req.data_descriptors.reserve_exact(1);
293             while desc.has_next() {
294                 if desc.is_write_only() && req.request_type == RequestType::Out {
295                     return Err(Error::UnexpectedWriteOnlyDescriptor);
296                 }
297                 if !desc.is_write_only() && req.request_type == RequestType::In {
298                     return Err(Error::UnexpectedReadOnlyDescriptor);
299                 }
300                 if !desc.is_write_only() && req.request_type == RequestType::GetDeviceId {
301                     return Err(Error::UnexpectedReadOnlyDescriptor);
302                 }
303 
304                 req.data_descriptors.push((
305                     desc.addr()
306                         .translate_gva(access_platform, desc.len() as usize),
307                     desc.len(),
308                 ));
309                 desc = desc_chain
310                     .next()
311                     .ok_or(Error::DescriptorChainTooShort)
312                     .map_err(|e| {
313                         error!("DescriptorChain corrupted: request = {:?}", req);
314                         e
315                     })?;
316             }
317             status_desc = desc;
318         }
319 
320         // The status MUST always be writable.
321         if !status_desc.is_write_only() {
322             return Err(Error::UnexpectedReadOnlyDescriptor);
323         }
324 
325         if status_desc.len() < 1 {
326             return Err(Error::DescriptorLengthTooSmall);
327         }
328 
329         req.status_addr = status_desc
330             .addr()
331             .translate_gva(access_platform, status_desc.len() as usize);
332 
333         Ok(req)
334     }
335 
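    /// Execute the request synchronously against a `Read + Write + Seek` backend.
    ///
    /// Returns the number of bytes transferred into guest memory, which is only
    /// non-zero for `RequestType::In` (read) requests.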
336     pub fn execute<T: Seek + Read + Write, B: Bitmap + 'static>(
337         &self,
338         disk: &mut T,
339         disk_nsectors: u64,
340         mem: &vm_memory::GuestMemoryMmap<B>,
341         serial: &[u8],
342     ) -> result::Result<u32, ExecuteError> {
343         disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT))
344             .map_err(ExecuteError::Seek)?;
345         let mut len = 0;
346         for (data_addr, data_len) in &self.data_descriptors {
347             let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
348             if u64::from(*data_len) % SECTOR_SIZE != 0 {
349                 top += 1;
350             }
351             top = top
352                 .checked_add(self.sector)
353                 .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
354             if top > disk_nsectors {
355                 return Err(ExecuteError::BadRequest(Error::InvalidOffset));
356             }
357 
358             match self.request_type {
359                 RequestType::In => {
360                     let mut buf = vec![0u8; *data_len as usize];
361                     disk.read_exact(&mut buf).map_err(ExecuteError::ReadExact)?;
362                     mem.read_exact_volatile_from(
363                         *data_addr,
364                         &mut buf.as_slice(),
365                         *data_len as usize,
366                     )
367                     .map_err(ExecuteError::Read)?;
368                     len += data_len;
369                 }
370                 RequestType::Out => {
371                     let mut buf: Vec<u8> = Vec::new();
372                     mem.write_all_volatile_to(*data_addr, &mut buf, *data_len as usize)
373                         .map_err(ExecuteError::Write)?;
374                     disk.write_all(&buf).map_err(ExecuteError::WriteAll)?;
375                     if !self.writeback {
376                         disk.flush().map_err(ExecuteError::Flush)?;
377                     }
378                 }
379                 RequestType::Flush => disk.flush().map_err(ExecuteError::Flush)?,
380                 RequestType::GetDeviceId => {
381                     if (*data_len as usize) < serial.len() {
382                         return Err(ExecuteError::BadRequest(Error::InvalidOffset));
383                     }
384                     mem.write_slice(serial, *data_addr)
385                         .map_err(ExecuteError::Write)?;
386                 }
387                 RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
388             };
389         }
390         Ok(len)
391     }
392 
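    /// Queue the request on an asynchronous disk backend.
    ///
    /// Returns `Ok(true)` when an asynchronous operation was submitted and its
    /// completion will be reported through the backend, or `Ok(false)` when the
    /// request was completed inline (e.g. `GetDeviceId`).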
393     pub fn execute_async<B: Bitmap + 'static>(
394         &mut self,
395         mem: &vm_memory::GuestMemoryMmap<B>,
396         disk_nsectors: u64,
397         disk_image: &mut dyn AsyncIo,
398         serial: &[u8],
399         user_data: u64,
400     ) -> result::Result<bool, ExecuteError> {
401         let sector = self.sector;
402         let request_type = self.request_type;
403         let offset = (sector << SECTOR_SHIFT) as libc::off_t;
404 
405         let mut iovecs: SmallVec<[libc::iovec; 1]> =
406             SmallVec::with_capacity(self.data_descriptors.len());
407         for (data_addr, data_len) in &self.data_descriptors {
408             if *data_len == 0 {
409                 continue;
410             }
411             let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
412             if u64::from(*data_len) % SECTOR_SIZE != 0 {
413                 top += 1;
414             }
415             top = top
416                 .checked_add(sector)
417                 .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
418             if top > disk_nsectors {
419                 return Err(ExecuteError::BadRequest(Error::InvalidOffset));
420             }
421 
422             let origin_ptr = mem
423                 .get_slice(*data_addr, *data_len as usize)
424                 .map_err(ExecuteError::GetHostAddress)?
425                 .ptr_guard();
426 
427             // Verify the buffer alignment.
428             // In case it's not properly aligned, an intermediate buffer is
429             // created with the correct alignment, and a copy from/to the
430             // origin buffer is performed, depending on the type of operation.
431             let iov_base = if (origin_ptr.as_ptr() as u64) % SECTOR_SIZE != 0 {
432                 let layout =
433                     Layout::from_size_align(*data_len as usize, SECTOR_SIZE as usize).unwrap();
434                 // SAFETY: layout has non-zero size
435                 let aligned_ptr = unsafe { alloc_zeroed(layout) };
436                 if aligned_ptr.is_null() {
437                     return Err(ExecuteError::TemporaryBufferAllocation(
438                         io::Error::last_os_error(),
439                     ));
440                 }
441 
442                 // We need to perform the copy beforehand in case we're writing
443                 // data out.
444                 if request_type == RequestType::Out {
445                     // SAFETY: destination buffer has been allocated with
446                     // the proper size.
447                     unsafe { std::ptr::copy(origin_ptr.as_ptr(), aligned_ptr, *data_len as usize) };
448                 }
449 
450                 // Store both origin and aligned pointers for complete_async()
451                 // to process them.
452                 self.aligned_operations.push(AlignedOperation {
453                     origin_ptr: origin_ptr.as_ptr() as u64,
454                     aligned_ptr: aligned_ptr as u64,
455                     size: *data_len as usize,
456                     layout,
457                 });
458 
459                 aligned_ptr as *mut libc::c_void
460             } else {
461                 origin_ptr.as_ptr() as *mut libc::c_void
462             };
463 
464             let iovec = libc::iovec {
465                 iov_base,
466                 iov_len: *data_len as libc::size_t,
467             };
468             iovecs.push(iovec);
469         }
470 
471         // Queue the operations to be submitted.
472         match request_type {
473             RequestType::In => {
474                 for (data_addr, data_len) in &self.data_descriptors {
475                     mem.get_slice(*data_addr, *data_len as usize)
476                         .map_err(ExecuteError::GetHostAddress)?
477                         .bitmap()
478                         .mark_dirty(0, *data_len as usize);
479                 }
480                 disk_image
481                     .read_vectored(offset, &iovecs, user_data)
482                     .map_err(ExecuteError::AsyncRead)?;
483             }
484             RequestType::Out => {
485                 disk_image
486                     .write_vectored(offset, &iovecs, user_data)
487                     .map_err(ExecuteError::AsyncWrite)?;
488             }
489             RequestType::Flush => {
490                 disk_image
491                     .fsync(Some(user_data))
492                     .map_err(ExecuteError::AsyncFlush)?;
493             }
494             RequestType::GetDeviceId => {
495                 let (data_addr, data_len) = if self.data_descriptors.len() == 1 {
496                     (self.data_descriptors[0].0, self.data_descriptors[0].1)
497                 } else {
498                     return Err(ExecuteError::BadRequest(Error::TooManyDescriptors));
499                 };
500                 if (data_len as usize) < serial.len() {
501                     return Err(ExecuteError::BadRequest(Error::InvalidOffset));
502                 }
503                 mem.write_slice(serial, data_addr)
504                     .map_err(ExecuteError::Write)?;
505                 return Ok(false);
506             }
507             RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
508         }
509 
510         Ok(true)
511     }
512 
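    /// Finish an asynchronous request: for reads, copy any temporary aligned
    /// (bounce) buffers back into the guest buffers, then free them.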
513     pub fn complete_async(&mut self) -> result::Result<(), Error> {
514         for aligned_operation in self.aligned_operations.drain(..) {
515             // We need to perform the copy after the data has been read inside
516             // the aligned buffer in case we're reading data in.
517             if self.request_type == RequestType::In {
518                 // SAFETY: origin buffer has been allocated with the
519                 // proper size.
520                 unsafe {
521                     std::ptr::copy(
522                         aligned_operation.aligned_ptr as *const u8,
523                         aligned_operation.origin_ptr as *mut u8,
524                         aligned_operation.size,
525                     )
526                 };
527             }
528 
529             // Free the temporary aligned buffer.
530             // SAFETY: aligned_ptr was allocated by alloc_zeroed with the same
531             // layout
532             unsafe {
533                 dealloc(
534                     aligned_operation.aligned_ptr as *mut u8,
535                     aligned_operation.layout,
536                 )
537             };
538         }
539 
540         Ok(())
541     }
542 
543     pub fn set_writeback(&mut self, writeback: bool) {
544         self.writeback = writeback
545     }
546 }
547 
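/// Device configuration space layout, mirroring `struct virtio_blk_config` from
/// the virtio specification (hence `#[repr(C, packed)]`).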
548 #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
549 #[repr(C, packed)]
550 pub struct VirtioBlockConfig {
551     pub capacity: u64,
552     pub size_max: u32,
553     pub seg_max: u32,
554     pub geometry: VirtioBlockGeometry,
555     pub blk_size: u32,
556     pub physical_block_exp: u8,
557     pub alignment_offset: u8,
558     pub min_io_size: u16,
559     pub opt_io_size: u32,
560     pub writeback: u8,
561     pub unused: u8,
562     pub num_queues: u16,
563     pub max_discard_sectors: u32,
564     pub max_discard_seg: u32,
565     pub discard_sector_alignment: u32,
566     pub max_write_zeroes_sectors: u32,
567     pub max_write_zeroes_seg: u32,
568     pub write_zeroes_may_unmap: u8,
569     pub unused1: [u8; 3],
570 }
571 #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
572 #[repr(C, packed)]
573 pub struct VirtioBlockGeometry {
574     pub cylinders: u16,
575     pub heads: u8,
576     pub sectors: u8,
577 }
578 
579 // SAFETY: the data structure only contains a series of integers
580 unsafe impl ByteValued for VirtioBlockConfig {}
581 // SAFETY: the data structure only contains a series of integers
582 unsafe impl ByteValued for VirtioBlockGeometry {}
583 
584 /// Check if aio can be used on the current system.
585 pub fn block_aio_is_supported() -> bool {
586     aio::IoContext::new(1).is_ok()
587 }
588 
589 /// Check if io_uring can be used for block devices on the current system,
590 /// i.e. whether the kernel supports the io_uring features this crate relies on.
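///
/// Illustrative sketch of how a caller might gate the asynchronous I/O path on
/// this check (the surrounding code is hypothetical):
///
/// ```ignore
/// let use_io_uring = block_io_uring_is_supported();
/// if !use_io_uring {
///     // Fall back to a synchronous or AIO-based backend.
/// }
/// ```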
591 pub fn block_io_uring_is_supported() -> bool {
592     #[cfg(not(feature = "io_uring"))]
593     {
594         info!("io_uring is disabled by crate features");
595         false
596     }
597 
598     #[cfg(feature = "io_uring")]
599     {
600         let error_msg = "io_uring not supported:";
601 
602         // Check that we can create an io_uring instance, which effectively verifies
603         // that the io_uring_setup() syscall is supported.
604         let io_uring = match IoUring::new(1) {
605             Ok(io_uring) => io_uring,
606             Err(e) => {
607                 info!("{} failed to create io_uring instance: {}", error_msg, e);
608                 return false;
609             }
610         };
611 
612         let submitter = io_uring.submitter();
613 
614         let mut probe = Probe::new();
615 
616         // Check we can register a probe to validate supported operations.
617         match submitter.register_probe(&mut probe) {
618             Ok(_) => {}
619             Err(e) => {
620                 info!("{} failed to register a probe: {}", error_msg, e);
621                 return false;
622             }
623         }
624 
625         // Check IORING_OP_FSYNC is supported
626         if !probe.is_supported(opcode::Fsync::CODE) {
627             info!("{} IORING_OP_FSYNC operation not supported", error_msg);
628             return false;
629         }
630 
631         // Check IORING_OP_READV is supported
632         if !probe.is_supported(opcode::Readv::CODE) {
633             info!("{} IORING_OP_READV operation not supported", error_msg);
634             return false;
635         }
636 
637         // Check IORING_OP_WRITEV is supported
638         if !probe.is_supported(opcode::Writev::CODE) {
639             info!("{} IORING_OP_WRITEV operation not supported", error_msg);
640             return false;
641         }
642 
643         true
644     }
645 }
646 
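/// Helper trait for synchronous backends exposed through the `AsyncIo`
/// interface: each *_sync() method performs the I/O immediately on the
/// underlying file, pushes a (user_data, result) entry onto the completion
/// list and signals the EventFd, emulating an asynchronous completion.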
647 pub trait AsyncAdaptor<F>
648 where
649     F: Read + Write + Seek,
650 {
651     fn read_vectored_sync(
652         &mut self,
653         offset: libc::off_t,
654         iovecs: &[libc::iovec],
655         user_data: u64,
656         eventfd: &EventFd,
657         completion_list: &mut VecDeque<(u64, i32)>,
658     ) -> AsyncIoResult<()> {
659         // Convert libc::iovec into IoSliceMut
660         let mut slices: SmallVec<[IoSliceMut; 1]> = SmallVec::with_capacity(iovecs.len());
661         for iovec in iovecs.iter() {
662             // SAFETY: on Linux IoSliceMut wraps around libc::iovec
663             slices.push(IoSliceMut::new(unsafe {
664                 std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
665             }));
666         }
667 
668         let result = {
669             let mut file = self.file();
670 
671             // Move the cursor to the right offset
672             file.seek(SeekFrom::Start(offset as u64))
673                 .map_err(AsyncIoError::ReadVectored)?;
674 
675             // Read vectored
676             file.read_vectored(slices.as_mut_slice())
677                 .map_err(AsyncIoError::ReadVectored)?
678         };
679 
680         completion_list.push_back((user_data, result as i32));
681         eventfd.write(1).unwrap();
682 
683         Ok(())
684     }
685 
686     fn write_vectored_sync(
687         &mut self,
688         offset: libc::off_t,
689         iovecs: &[libc::iovec],
690         user_data: u64,
691         eventfd: &EventFd,
692         completion_list: &mut VecDeque<(u64, i32)>,
693     ) -> AsyncIoResult<()> {
694         // Convert libc::iovec into IoSlice
695         let mut slices: SmallVec<[IoSlice; 1]> = SmallVec::with_capacity(iovecs.len());
696         for iovec in iovecs.iter() {
697             // SAFETY: on Linux IoSlice wraps around libc::iovec
698             slices.push(IoSlice::new(unsafe {
699                 std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
700             }));
701         }
702 
703         let result = {
704             let mut file = self.file();
705 
706             // Move the cursor to the right offset
707             file.seek(SeekFrom::Start(offset as u64))
708                 .map_err(AsyncIoError::WriteVectored)?;
709 
710             // Write vectored
711             file.write_vectored(slices.as_slice())
712                 .map_err(AsyncIoError::WriteVectored)?
713         };
714 
715         completion_list.push_back((user_data, result as i32));
716         eventfd.write(1).unwrap();
717 
718         Ok(())
719     }
720 
721     fn fsync_sync(
722         &mut self,
723         user_data: Option<u64>,
724         eventfd: &EventFd,
725         completion_list: &mut VecDeque<(u64, i32)>,
726     ) -> AsyncIoResult<()> {
727         let result: i32 = {
728             let mut file = self.file();
729 
730             // Flush
731             file.flush().map_err(AsyncIoError::Fsync)?;
732 
733             0
734         };
735 
736         if let Some(user_data) = user_data {
737             completion_list.push_back((user_data, result));
738             eventfd.write(1).unwrap();
739         }
740 
741         Ok(())
742     }
743 
744     fn file(&mut self) -> MutexGuard<F>;
745 }
746 
747 pub enum ImageType {
748     FixedVhd,
749     Qcow2,
750     Raw,
751     Vhdx,
752 }
753 
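// QCOW_MAGIC is "QFI\xfb" read as a big-endian u32; VHDX_SIGN is the ASCII
// signature "vhdxfile" read as a little-endian u64 from the start of the image.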
754 const QCOW_MAGIC: u32 = 0x5146_49fb;
755 const VHDX_SIGN: u64 = 0x656C_6966_7864_6876;
756 
757 /// Read one block into memory aligned to the source's logical block size (needed for O_DIRECT)
758 pub fn read_aligned_block_size(f: &mut File) -> std::io::Result<Vec<u8>> {
759     let blocksize = DiskTopology::probe(f)?.logical_block_size as usize;
760     // SAFETY: We are allocating memory that is naturally aligned (size = alignment) and we meet
761     // requirements for safety from Vec::from_raw_parts() as we are using the global allocator
762     // and transferring ownership of the memory.
763     let mut data = unsafe {
764         Vec::from_raw_parts(
765             alloc_zeroed(Layout::from_size_align_unchecked(blocksize, blocksize)),
766             blocksize,
767             blocksize,
768         )
769     };
770     f.read_exact(&mut data)?;
771     Ok(data)
772 }
773 
774 /// Determine image type through file parsing.
775 pub fn detect_image_type(f: &mut File) -> std::io::Result<ImageType> {
776     let block = read_aligned_block_size(f)?;
777 
778     // Check the first 4 bytes for the header magic to determine the image type
779     let image_type = if u32::from_be_bytes(block[0..4].try_into().unwrap()) == QCOW_MAGIC {
780         ImageType::Qcow2
781     } else if vhd::is_fixed_vhd(f)? {
782         ImageType::FixedVhd
783     } else if u64::from_le_bytes(block[0..8].try_into().unwrap()) == VHDX_SIGN {
784         ImageType::Vhdx
785     } else {
786         ImageType::Raw
787     };
788 
789     Ok(image_type)
790 }
791 
792 pub trait BlockBackend: Read + Write + Seek + Send + Debug {
793     fn size(&self) -> Result<u64, Error>;
794 }
795 
796 /// Inspect the image file type and create an appropriate disk file to match it.
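///
/// A minimal usage sketch (error handling elided; the path is hypothetical):
///
/// ```ignore
/// let file = std::fs::File::open("/path/to/disk.img").unwrap();
/// let mut disk = create_disk_file(file, false).unwrap();
/// let nsectors = disk.size().unwrap() / SECTOR_SIZE;
/// ```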
797 pub fn create_disk_file(mut file: File, direct_io: bool) -> Result<Box<dyn BlockBackend>, Error> {
798     let image_type = detect_image_type(&mut file).map_err(Error::DetectImageType)?;
799 
800     Ok(match image_type {
801         ImageType::Qcow2 => {
802             Box::new(QcowFile::from(RawFile::new(file, direct_io)).map_err(Error::QcowError)?)
803                 as Box<dyn BlockBackend>
804         }
805         ImageType::FixedVhd => {
806             Box::new(FixedVhd::new(file).map_err(Error::FixedVhdError)?) as Box<dyn BlockBackend>
807         }
808         ImageType::Vhdx => {
809             Box::new(Vhdx::new(file).map_err(Error::VhdxError)?) as Box<dyn BlockBackend>
810         }
811         ImageType::Raw => Box::new(RawFile::new(file, direct_io)) as Box<dyn BlockBackend>,
812     })
813 }
814 
815 #[derive(Debug)]
816 pub struct DiskTopology {
817     pub logical_block_size: u64,
818     pub physical_block_size: u64,
819     pub minimum_io_size: u64,
820     pub optimal_io_size: u64,
821 }
822 
823 impl Default for DiskTopology {
824     fn default() -> Self {
825         Self {
826             logical_block_size: 512,
827             physical_block_size: 512,
828             minimum_io_size: 512,
829             optimal_io_size: 0,
830         }
831     }
832 }
833 
834 ioctl_io_nr!(BLKSSZGET, 0x12, 104);
835 ioctl_io_nr!(BLKPBSZGET, 0x12, 123);
836 ioctl_io_nr!(BLKIOMIN, 0x12, 120);
837 ioctl_io_nr!(BLKIOOPT, 0x12, 121);
838 
839 enum BlockSize {
840     LogicalBlock,
841     PhysicalBlock,
842     MinimumIo,
843     OptimalIo,
844 }
845 
846 impl DiskTopology {
847     fn is_block_device(f: &File) -> std::io::Result<bool> {
848         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
849         // SAFETY: FFI call with a valid fd and buffer
850         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
851         if ret != 0 {
852             return Err(std::io::Error::last_os_error());
853         }
854 
855         // SAFETY: stat is valid at this point
856         let is_block = unsafe { (*stat.as_ptr()).st_mode & S_IFMT == S_IFBLK };
857         Ok(is_block)
858     }
859 
860     // libc::ioctl() takes a differently typed request argument on different architectures, hence the cast below
861     fn query_block_size(f: &File, block_size_type: BlockSize) -> std::io::Result<u64> {
862         let mut block_size = 0;
863         // SAFETY: FFI call with correct arguments
864         let ret = unsafe {
865             ioctl(
866                 f.as_raw_fd(),
867                 match block_size_type {
868                     BlockSize::LogicalBlock => BLKSSZGET(),
869                     BlockSize::PhysicalBlock => BLKPBSZGET(),
870                     BlockSize::MinimumIo => BLKIOMIN(),
871                     BlockSize::OptimalIo => BLKIOOPT(),
872                 } as _,
873                 &mut block_size,
874             )
875         };
876         if ret != 0 {
877             return Err(std::io::Error::last_os_error());
878         };
879 
880         Ok(block_size)
881     }
882 
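    /// Probe the disk topology of an open file.
    ///
    /// Regular files report the 512-byte defaults; for block devices the sizes
    /// are queried through the BLK* ioctls. Sketch of a hypothetical caller:
    ///
    /// ```ignore
    /// let file = std::fs::File::open("/dev/vda").unwrap();
    /// let topology = DiskTopology::probe(&file).unwrap();
    /// println!("logical block size: {}", topology.logical_block_size);
    /// ```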
883     pub fn probe(f: &File) -> std::io::Result<Self> {
884         if !Self::is_block_device(f)? {
885             return Ok(DiskTopology::default());
886         }
887 
888         Ok(DiskTopology {
889             logical_block_size: Self::query_block_size(f, BlockSize::LogicalBlock)?,
890             physical_block_size: Self::query_block_size(f, BlockSize::PhysicalBlock)?,
891             minimum_io_size: Self::query_block_size(f, BlockSize::MinimumIo)?,
892             optimal_io_size: Self::query_block_size(f, BlockSize::OptimalIo)?,
893         })
894     }
895 }
896