xref: /cloud-hypervisor/block/src/lib.rs (revision 190d90196fff389b60b93b57acf958957b71b249)
1 // Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 //
3 // Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
4 // Use of this source code is governed by a BSD-style license that can be
5 // found in the LICENSE-BSD-3-Clause file.
6 //
7 // Copyright © 2020 Intel Corporation
8 //
9 // SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause
10 
11 #[macro_use]
12 extern crate log;
13 
14 pub mod async_io;
15 pub mod fcntl;
16 pub mod fixed_vhd;
17 #[cfg(feature = "io_uring")]
18 /// Enabled with the `"io_uring"` feature
19 pub mod fixed_vhd_async;
20 pub mod fixed_vhd_sync;
21 pub mod qcow;
22 pub mod qcow_sync;
23 #[cfg(feature = "io_uring")]
24 /// Async primitives based on `io-uring`
25 ///
26 /// Enabled with the `"io_uring"` feature
27 pub mod raw_async;
28 pub mod raw_async_aio;
29 pub mod raw_sync;
30 pub mod vhd;
31 pub mod vhdx;
32 pub mod vhdx_sync;
33 
34 use std::alloc::{alloc_zeroed, dealloc, Layout};
35 use std::collections::VecDeque;
36 use std::fmt::Debug;
37 use std::fs::File;
38 use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write};
39 use std::os::linux::fs::MetadataExt;
40 use std::os::unix::io::AsRawFd;
41 use std::path::Path;
42 use std::sync::{Arc, MutexGuard};
43 use std::time::Instant;
44 use std::{cmp, result};
45 
46 #[cfg(feature = "io_uring")]
47 use io_uring::{opcode, IoUring, Probe};
48 use libc::{ioctl, S_IFBLK, S_IFMT};
49 use serde::{Deserialize, Serialize};
50 use smallvec::SmallVec;
51 use thiserror::Error;
52 use virtio_bindings::virtio_blk::*;
53 use virtio_queue::DescriptorChain;
54 use vm_memory::bitmap::Bitmap;
55 use vm_memory::{
56     ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryLoadGuard,
57 };
58 use vm_virtio::{AccessPlatform, Translatable};
59 use vmm_sys_util::eventfd::EventFd;
60 use vmm_sys_util::{aio, ioctl_io_nr};
61 
62 use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult};
63 use crate::vhdx::VhdxError;
64 
65 const SECTOR_SHIFT: u8 = 9;
66 pub const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT;
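// Illustrative note: byte offsets and sector counts convert through these
// constants, e.g. sector 3 starts at byte offset 3 << SECTOR_SHIFT == 1536,
// and a 4096-byte buffer spans 4096 / SECTOR_SIZE == 8 sectors.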
67 
68 #[derive(Error, Debug)]
69 pub enum Error {
70     #[error("Guest gave us bad memory addresses")]
71     GuestMemory(#[source] GuestMemoryError),
72     #[error("Guest gave us offsets that would have overflowed a usize")]
73     CheckedOffset(GuestAddress, usize /* sector offset */),
74     #[error("Guest gave us a write only descriptor that protocol says to read from")]
75     UnexpectedWriteOnlyDescriptor,
76     #[error("Guest gave us a read only descriptor that protocol says to write to")]
77     UnexpectedReadOnlyDescriptor,
78     #[error("Guest gave us too few descriptors in a descriptor chain")]
79     DescriptorChainTooShort,
80     #[error("Guest gave us a descriptor that was too short to use")]
81     DescriptorLengthTooSmall,
82     #[error("Failed to detect image type")]
83     DetectImageType(#[source] std::io::Error),
84     #[error("Failure in fixed vhd")]
85     FixedVhdError(#[source] std::io::Error),
86     #[error("Getting a block's metadata fails for any reason")]
87     GetFileMetadata,
88     #[error("The requested operation would cause a seek beyond disk end")]
89     InvalidOffset,
90     #[error("Failure in qcow")]
91     QcowError(#[source] qcow::Error),
92     #[error("Failure in raw file")]
93     RawFileError(#[source] std::io::Error),
94     #[error("The requested operation does not support multiple descriptors")]
95     TooManyDescriptors,
96     #[error("Failure in vhdx")]
97     VhdxError(#[source] VhdxError),
98 }
99 
100 fn build_device_id(disk_path: &Path) -> result::Result<String, Error> {
101     let blk_metadata = match disk_path.metadata() {
102         Err(_) => return Err(Error::GetFileMetadata),
103         Ok(m) => m,
104     };
105     // This is how kvmtool does it.
106     let device_id = format!(
107         "{}{}{}",
108         blk_metadata.st_dev(),
109         blk_metadata.st_rdev(),
110         blk_metadata.st_ino()
111     );
112     Ok(device_id)
113 }
114 
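/// Build a virtio-blk serial for the disk at `disk_path`, padded (or truncated)
/// to `VIRTIO_BLK_ID_BYTES`. A minimal usage sketch, assuming a hypothetical
/// image path:
///
/// ```ignore
/// let serial = build_serial(std::path::Path::new("/var/lib/images/disk.raw"));
/// assert_eq!(serial.len(), VIRTIO_BLK_ID_BYTES as usize);
/// ```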
115 pub fn build_serial(disk_path: &Path) -> Vec<u8> {
116     let mut default_serial = vec![0; VIRTIO_BLK_ID_BYTES as usize];
117     match build_device_id(disk_path) {
118         Err(_) => {
119             warn!("Could not generate device id. We'll use a default.");
120         }
121         Ok(m) => {
122             // The kernel only knows to read a maximum of VIRTIO_BLK_ID_BYTES.
123             // This will also zero out any leftover bytes.
124             let disk_id = m.as_bytes();
125             let bytes_to_copy = cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize);
126             default_serial[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy])
127         }
128     }
129     default_serial
130 }
131 
132 #[derive(Error, Debug)]
133 pub enum ExecuteError {
134     #[error("Bad request")]
135     BadRequest(#[source] Error),
136     #[error("Failed to flush")]
137     Flush(#[source] io::Error),
138     #[error("Failed to read")]
139     Read(#[source] GuestMemoryError),
140     #[error("Failed to read_exact")]
141     ReadExact(#[source] io::Error),
142     #[error("Failed to seek")]
143     Seek(#[source] io::Error),
144     #[error("Failed to write")]
145     Write(#[source] GuestMemoryError),
146     #[error("Failed to write_all")]
147     WriteAll(#[source] io::Error),
148     #[error("Unsupported request: {0}")]
149     Unsupported(u32),
150     #[error("Failed to submit io uring")]
151     SubmitIoUring(#[source] io::Error),
152     #[error("Failed to get guest address")]
153     GetHostAddress(#[source] GuestMemoryError),
154     #[error("Failed to async read")]
155     AsyncRead(#[source] AsyncIoError),
156     #[error("Failed to async write")]
157     AsyncWrite(#[source] AsyncIoError),
158     #[error("Failed to async flush")]
159     AsyncFlush(#[source] AsyncIoError),
160     #[error("Failed allocating a temporary buffer")]
161     TemporaryBufferAllocation(#[source] io::Error),
162 }
163 
164 impl ExecuteError {
165     pub fn status(&self) -> u8 {
166         let status = match *self {
167             ExecuteError::BadRequest(_) => VIRTIO_BLK_S_IOERR,
168             ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
169             ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
170             ExecuteError::ReadExact(_) => VIRTIO_BLK_S_IOERR,
171             ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR,
172             ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR,
173             ExecuteError::WriteAll(_) => VIRTIO_BLK_S_IOERR,
174             ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
175             ExecuteError::SubmitIoUring(_) => VIRTIO_BLK_S_IOERR,
176             ExecuteError::GetHostAddress(_) => VIRTIO_BLK_S_IOERR,
177             ExecuteError::AsyncRead(_) => VIRTIO_BLK_S_IOERR,
178             ExecuteError::AsyncWrite(_) => VIRTIO_BLK_S_IOERR,
179             ExecuteError::AsyncFlush(_) => VIRTIO_BLK_S_IOERR,
180             ExecuteError::TemporaryBufferAllocation(_) => VIRTIO_BLK_S_IOERR,
181         };
182         status as u8
183     }
184 }
185 
186 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
187 pub enum RequestType {
188     In,
189     Out,
190     Flush,
191     GetDeviceId,
192     Unsupported(u32),
193 }
194 
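// Both request_type() and sector() below read fields of the request header
// that the guest places at the start of the descriptor chain; the VIRTIO spec
// lays that header out as:
//
//     le32 type;      // offset 0, read by request_type()
//     le32 reserved;  // offset 4
//     le64 sector;    // offset 8, read by sector() via SECTOR_OFFSET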
195 pub fn request_type<B: Bitmap + 'static>(
196     mem: &vm_memory::GuestMemoryMmap<B>,
197     desc_addr: GuestAddress,
198 ) -> result::Result<RequestType, Error> {
199     let type_ = mem.read_obj(desc_addr).map_err(Error::GuestMemory)?;
200     match type_ {
201         VIRTIO_BLK_T_IN => Ok(RequestType::In),
202         VIRTIO_BLK_T_OUT => Ok(RequestType::Out),
203         VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush),
204         VIRTIO_BLK_T_GET_ID => Ok(RequestType::GetDeviceId),
205         t => Ok(RequestType::Unsupported(t)),
206     }
207 }
208 
209 fn sector<B: Bitmap + 'static>(
210     mem: &vm_memory::GuestMemoryMmap<B>,
211     desc_addr: GuestAddress,
212 ) -> result::Result<u64, Error> {
213     const SECTOR_OFFSET: usize = 8;
214     let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) {
215         Some(v) => v,
216         None => return Err(Error::CheckedOffset(desc_addr, SECTOR_OFFSET)),
217     };
218 
219     mem.read_obj(addr).map_err(Error::GuestMemory)
220 }
221 
222 const DEFAULT_DESCRIPTOR_VEC_SIZE: usize = 32; // inline SmallVec capacity; longer descriptor chains spill to the heap
223 
224 #[derive(Debug)]
225 pub struct AlignedOperation {
226     origin_ptr: u64,
227     aligned_ptr: u64,
228     size: usize,
229     layout: Layout,
230 }
231 
232 #[derive(Debug)]
233 pub struct Request {
234     pub request_type: RequestType,
235     pub sector: u64,
236     pub data_descriptors: SmallVec<[(GuestAddress, u32); DEFAULT_DESCRIPTOR_VEC_SIZE]>,
237     pub status_addr: GuestAddress,
238     pub writeback: bool,
239     pub aligned_operations: SmallVec<[AlignedOperation; DEFAULT_DESCRIPTOR_VEC_SIZE]>,
240     pub start: Instant,
241 }
242 
243 impl Request {
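    // A virtio-blk request arrives as a descriptor chain of the form
    //   header (device-readable) -> zero or more data buffers -> status byte (device-writable)
    // and parse() below walks the chain in that order. Only flush requests may
    // omit the data descriptors.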
244     pub fn parse<B: Bitmap + 'static>(
245         desc_chain: &mut DescriptorChain<GuestMemoryLoadGuard<vm_memory::GuestMemoryMmap<B>>>,
246         access_platform: Option<&Arc<dyn AccessPlatform>>,
247     ) -> result::Result<Request, Error> {
248         let hdr_desc = desc_chain
249             .next()
250             .ok_or(Error::DescriptorChainTooShort)
251             .inspect_err(|_| {
252                 error!("Missing head descriptor");
253             })?;
254 
255         // The head contains the request type which MUST be readable.
256         if hdr_desc.is_write_only() {
257             return Err(Error::UnexpectedWriteOnlyDescriptor);
258         }
259 
260         let hdr_desc_addr = hdr_desc
261             .addr()
262             .translate_gva(access_platform, hdr_desc.len() as usize);
263 
264         let mut req = Request {
265             request_type: request_type(desc_chain.memory(), hdr_desc_addr)?,
266             sector: sector(desc_chain.memory(), hdr_desc_addr)?,
267             data_descriptors: SmallVec::with_capacity(DEFAULT_DESCRIPTOR_VEC_SIZE),
268             status_addr: GuestAddress(0),
269             writeback: true,
270             aligned_operations: SmallVec::with_capacity(DEFAULT_DESCRIPTOR_VEC_SIZE),
271             start: Instant::now(),
272         };
273 
274         let status_desc;
275         let mut desc = desc_chain
276             .next()
277             .ok_or(Error::DescriptorChainTooShort)
278             .inspect_err(|_| {
279                 error!("Only head descriptor present: request = {:?}", req);
280             })?;
281 
282         if !desc.has_next() {
283             status_desc = desc;
284             // Only flush requests are allowed to skip the data descriptor.
285             if req.request_type != RequestType::Flush {
286                 error!("Need a data descriptor: request = {:?}", req);
287                 return Err(Error::DescriptorChainTooShort);
288             }
289         } else {
290             req.data_descriptors.reserve_exact(1);
291             while desc.has_next() {
292                 if desc.is_write_only() && req.request_type == RequestType::Out {
293                     return Err(Error::UnexpectedWriteOnlyDescriptor);
294                 }
295                 if !desc.is_write_only() && req.request_type == RequestType::In {
296                     return Err(Error::UnexpectedReadOnlyDescriptor);
297                 }
298                 if !desc.is_write_only() && req.request_type == RequestType::GetDeviceId {
299                     return Err(Error::UnexpectedReadOnlyDescriptor);
300                 }
301 
302                 req.data_descriptors.push((
303                     desc.addr()
304                         .translate_gva(access_platform, desc.len() as usize),
305                     desc.len(),
306                 ));
307                 desc = desc_chain
308                     .next()
309                     .ok_or(Error::DescriptorChainTooShort)
310                     .inspect_err(|_| {
311                         error!("DescriptorChain corrupted: request = {:?}", req);
312                     })?;
313             }
314             status_desc = desc;
315         }
316 
317         // The status MUST always be writable.
318         if !status_desc.is_write_only() {
319             return Err(Error::UnexpectedReadOnlyDescriptor);
320         }
321 
322         if status_desc.len() < 1 {
323             return Err(Error::DescriptorLengthTooSmall);
324         }
325 
326         req.status_addr = status_desc
327             .addr()
328             .translate_gva(access_platform, status_desc.len() as usize);
329 
330         Ok(req)
331     }
332 
333     pub fn execute<T: Seek + Read + Write, B: Bitmap + 'static>(
334         &self,
335         disk: &mut T,
336         disk_nsectors: u64,
337         mem: &vm_memory::GuestMemoryMmap<B>,
338         serial: &[u8],
339     ) -> result::Result<u32, ExecuteError> {
340         disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT))
341             .map_err(ExecuteError::Seek)?;
342         let mut len = 0;
343         for (data_addr, data_len) in &self.data_descriptors {
344             let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
345             if u64::from(*data_len) % SECTOR_SIZE != 0 {
346                 top += 1;
347             }
348             top = top
349                 .checked_add(self.sector)
350                 .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
351             if top > disk_nsectors {
352                 return Err(ExecuteError::BadRequest(Error::InvalidOffset));
353             }
354 
355             match self.request_type {
356                 RequestType::In => {
357                     let mut buf = vec![0u8; *data_len as usize];
358                     disk.read_exact(&mut buf).map_err(ExecuteError::ReadExact)?;
359                     mem.read_exact_volatile_from(
360                         *data_addr,
361                         &mut buf.as_slice(),
362                         *data_len as usize,
363                     )
364                     .map_err(ExecuteError::Read)?;
365                     len += data_len;
366                 }
367                 RequestType::Out => {
368                     let mut buf: Vec<u8> = Vec::new();
369                     mem.write_all_volatile_to(*data_addr, &mut buf, *data_len as usize)
370                         .map_err(ExecuteError::Write)?;
371                     disk.write_all(&buf).map_err(ExecuteError::WriteAll)?;
372                     if !self.writeback {
373                         disk.flush().map_err(ExecuteError::Flush)?;
374                     }
375                 }
376                 RequestType::Flush => disk.flush().map_err(ExecuteError::Flush)?,
377                 RequestType::GetDeviceId => {
378                     if (*data_len as usize) < serial.len() {
379                         return Err(ExecuteError::BadRequest(Error::InvalidOffset));
380                     }
381                     mem.write_slice(serial, *data_addr)
382                         .map_err(ExecuteError::Write)?;
383                 }
384                 RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
385             };
386         }
387         Ok(len)
388     }
389 
390     pub fn execute_async<B: Bitmap + 'static>(
391         &mut self,
392         mem: &vm_memory::GuestMemoryMmap<B>,
393         disk_nsectors: u64,
394         disk_image: &mut dyn AsyncIo,
395         serial: &[u8],
396         user_data: u64,
397     ) -> result::Result<bool, ExecuteError> {
398         let sector = self.sector;
399         let request_type = self.request_type;
400         let offset = (sector << SECTOR_SHIFT) as libc::off_t;
401 
402         let mut iovecs: SmallVec<[libc::iovec; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
403             SmallVec::with_capacity(self.data_descriptors.len());
404         for &(data_addr, data_len) in &self.data_descriptors {
405             let _: u32 = data_len; // compiler-checked documentation
406             const _: () = assert!(
407                 core::mem::size_of::<u32>() <= core::mem::size_of::<usize>(),
408                 "unsupported platform"
409             );
410             if data_len == 0 {
411                 continue;
412             }
413             let mut top: u64 = u64::from(data_len) / SECTOR_SIZE;
414             if u64::from(data_len) % SECTOR_SIZE != 0 {
415                 top += 1;
416             }
417             let data_len = data_len as usize;
418             top = top
419                 .checked_add(sector)
420                 .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
421             if top > disk_nsectors {
422                 return Err(ExecuteError::BadRequest(Error::InvalidOffset));
423             }
424 
425             let origin_ptr = mem
426                 .get_slice(data_addr, data_len)
427                 .map_err(ExecuteError::GetHostAddress)?
428                 .ptr_guard();
429 
430             // Verify the buffer alignment.
431             // In case it's not properly aligned, an intermediate buffer is
432             // created with the correct alignment, and a copy from/to the
433             // origin buffer is performed, depending on the type of operation.
434             let iov_base = if (origin_ptr.as_ptr() as u64) % SECTOR_SIZE != 0 {
435                 let layout = Layout::from_size_align(data_len, SECTOR_SIZE as usize).unwrap();
436                 // SAFETY: layout has non-zero size
437                 let aligned_ptr = unsafe { alloc_zeroed(layout) };
438                 if aligned_ptr.is_null() {
439                     return Err(ExecuteError::TemporaryBufferAllocation(
440                         io::Error::last_os_error(),
441                     ));
442                 }
443 
444                 // We need to perform the copy beforehand in case we're writing
445                 // data out.
446                 if request_type == RequestType::Out {
447                     // SAFETY: destination buffer has been allocated with
448                     // the proper size.
449                     unsafe { std::ptr::copy(origin_ptr.as_ptr(), aligned_ptr, data_len) };
450                 }
451 
452                 // Store both origin and aligned pointers for complete_async()
453                 // to process them.
454                 self.aligned_operations.push(AlignedOperation {
455                     origin_ptr: origin_ptr.as_ptr() as u64,
456                     aligned_ptr: aligned_ptr as u64,
457                     size: data_len,
458                     layout,
459                 });
460 
461                 aligned_ptr as *mut libc::c_void
462             } else {
463                 origin_ptr.as_ptr() as *mut libc::c_void
464             };
465 
466             let iovec = libc::iovec {
467                 iov_base,
468                 iov_len: data_len as libc::size_t,
469             };
470             iovecs.push(iovec);
471         }
472 
473         // Queue operations expected to be submitted.
474         match request_type {
475             RequestType::In => {
476                 for (data_addr, data_len) in &self.data_descriptors {
477                     mem.get_slice(*data_addr, *data_len as usize)
478                         .map_err(ExecuteError::GetHostAddress)?
479                         .bitmap()
480                         .mark_dirty(0, *data_len as usize);
481                 }
482                 disk_image
483                     .read_vectored(offset, &iovecs, user_data)
484                     .map_err(ExecuteError::AsyncRead)?;
485             }
486             RequestType::Out => {
487                 disk_image
488                     .write_vectored(offset, &iovecs, user_data)
489                     .map_err(ExecuteError::AsyncWrite)?;
490             }
491             RequestType::Flush => {
492                 disk_image
493                     .fsync(Some(user_data))
494                     .map_err(ExecuteError::AsyncFlush)?;
495             }
496             RequestType::GetDeviceId => {
497                 let (data_addr, data_len) = if self.data_descriptors.len() == 1 {
498                     (self.data_descriptors[0].0, self.data_descriptors[0].1)
499                 } else {
500                     return Err(ExecuteError::BadRequest(Error::TooManyDescriptors));
501                 };
502                 if (data_len as usize) < serial.len() {
503                     return Err(ExecuteError::BadRequest(Error::InvalidOffset));
504                 }
505                 mem.write_slice(serial, data_addr)
506                     .map_err(ExecuteError::Write)?;
507                 return Ok(false);
508             }
509             RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
510         }
511 
512         Ok(true)
513     }
514 
515     pub fn complete_async(&mut self) -> result::Result<(), Error> {
516         for aligned_operation in self.aligned_operations.drain(..) {
517             // We need to perform the copy after the data has been read inside
518             // the aligned buffer in case we're reading data in.
519             if self.request_type == RequestType::In {
520                 // SAFETY: origin buffer has been allocated with the
521                 // proper size.
522                 unsafe {
523                     std::ptr::copy(
524                         aligned_operation.aligned_ptr as *const u8,
525                         aligned_operation.origin_ptr as *mut u8,
526                         aligned_operation.size,
527                     )
528                 };
529             }
530 
531             // Free the temporary aligned buffer.
532             // SAFETY: aligned_ptr was allocated by alloc_zeroed with the same
533             // layout
534             unsafe {
535                 dealloc(
536                     aligned_operation.aligned_ptr as *mut u8,
537                     aligned_operation.layout,
538                 )
539             };
540         }
541 
542         Ok(())
543     }
544 
545     pub fn set_writeback(&mut self, writeback: bool) {
546         self.writeback = writeback
547     }
548 }
549 
550 #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
551 #[repr(C, packed)]
552 pub struct VirtioBlockConfig {
553     pub capacity: u64,
554     pub size_max: u32,
555     pub seg_max: u32,
556     pub geometry: VirtioBlockGeometry,
557     pub blk_size: u32,
558     pub physical_block_exp: u8,
559     pub alignment_offset: u8,
560     pub min_io_size: u16,
561     pub opt_io_size: u32,
562     pub writeback: u8,
563     pub unused: u8,
564     pub num_queues: u16,
565     pub max_discard_sectors: u32,
566     pub max_discard_seg: u32,
567     pub discard_sector_alignment: u32,
568     pub max_write_zeroes_sectors: u32,
569     pub max_write_zeroes_seg: u32,
570     pub write_zeroes_may_unmap: u8,
571     pub unused1: [u8; 3],
572 }
573 #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
574 #[repr(C, packed)]
575 pub struct VirtioBlockGeometry {
576     pub cylinders: u16,
577     pub heads: u8,
578     pub sectors: u8,
579 }
580 
581 // SAFETY: data structure only contains a series of integers
582 unsafe impl ByteValued for VirtioBlockConfig {}
583 // SAFETY: data structure only contains a series of integers
584 unsafe impl ByteValued for VirtioBlockGeometry {}
585 
586 /// Check if aio can be used on the current system.
587 pub fn block_aio_is_supported() -> bool {
588     aio::IoContext::new(1).is_ok()
589 }
590 
591 /// Check if io_uring can be used for block devices on the current system,
592 /// i.e. whether the kernel supports the io_uring features this crate expects.
593 pub fn block_io_uring_is_supported() -> bool {
594     #[cfg(not(feature = "io_uring"))]
595     {
596         info!("io_uring is disabled by crate features");
597         false
598     }
599 
600     #[cfg(feature = "io_uring")]
601     {
602         let error_msg = "io_uring not supported:";
603 
604         // Check we can create an io_uring instance, which effectively verifies
605         // that io_uring_setup() syscall is supported.
606         let io_uring = match IoUring::new(1) {
607             Ok(io_uring) => io_uring,
608             Err(e) => {
609                 info!("{} failed to create io_uring instance: {}", error_msg, e);
610                 return false;
611             }
612         };
613 
614         let submitter = io_uring.submitter();
615 
616         let mut probe = Probe::new();
617 
618         // Check we can register a probe to validate supported operations.
619         match submitter.register_probe(&mut probe) {
620             Ok(_) => {}
621             Err(e) => {
622                 info!("{} failed to register a probe: {}", error_msg, e);
623                 return false;
624             }
625         }
626 
627         // Check IORING_OP_FSYNC is supported
628         if !probe.is_supported(opcode::Fsync::CODE) {
629             info!("{} IORING_OP_FSYNC operation not supported", error_msg);
630             return false;
631         }
632 
633         // Check IORING_OP_READV is supported
634         if !probe.is_supported(opcode::Readv::CODE) {
635             info!("{} IORING_OP_READV operation not supported", error_msg);
636             return false;
637         }
638 
639         // Check IORING_OP_WRITEV is supported
640         if !probe.is_supported(opcode::Writev::CODE) {
641             info!("{} IORING_OP_WRITEV operation not supported", error_msg);
642             return false;
643         }
644 
645         true
646     }
647 }
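// Illustrative call site (sketch only, not taken verbatim from this crate): a
// VMM would typically probe once at disk-open time,
//
//     let use_io_uring = block_io_uring_is_supported();
//
// and fall back to AIO or fully synchronous I/O when this returns false.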
648 
649 pub trait AsyncAdaptor<F>
650 where
651     F: Read + Write + Seek,
652 {
653     fn read_vectored_sync(
654         &mut self,
655         offset: libc::off_t,
656         iovecs: &[libc::iovec],
657         user_data: u64,
658         eventfd: &EventFd,
659         completion_list: &mut VecDeque<(u64, i32)>,
660     ) -> AsyncIoResult<()> {
661         // Convert libc::iovec into IoSliceMut
662         let mut slices: SmallVec<[IoSliceMut; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
663             SmallVec::with_capacity(iovecs.len());
664         for iovec in iovecs.iter() {
665             // SAFETY: on Linux IoSliceMut wraps around libc::iovec
666             slices.push(IoSliceMut::new(unsafe {
667                 std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
668             }));
669         }
670 
671         let result = {
672             let mut file = self.file();
673 
674             // Move the cursor to the right offset
675             file.seek(SeekFrom::Start(offset as u64))
676                 .map_err(AsyncIoError::ReadVectored)?;
677 
678             let mut r = 0;
679             for b in slices.iter_mut() {
680                 r += file.read(b).map_err(AsyncIoError::ReadVectored)?;
681             }
682             r
683         };
684 
685         completion_list.push_back((user_data, result as i32));
686         eventfd.write(1).unwrap();
687 
688         Ok(())
689     }
690 
691     fn write_vectored_sync(
692         &mut self,
693         offset: libc::off_t,
694         iovecs: &[libc::iovec],
695         user_data: u64,
696         eventfd: &EventFd,
697         completion_list: &mut VecDeque<(u64, i32)>,
698     ) -> AsyncIoResult<()> {
699         // Convert libc::iovec into IoSlice
700         let mut slices: SmallVec<[IoSlice; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
701             SmallVec::with_capacity(iovecs.len());
702         for iovec in iovecs.iter() {
703             // SAFETY: on Linux IoSlice wraps around libc::iovec
704             slices.push(IoSlice::new(unsafe {
705                 std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
706             }));
707         }
708 
709         let result = {
710             let mut file = self.file();
711 
712             // Move the cursor to the right offset
713             file.seek(SeekFrom::Start(offset as u64))
714                 .map_err(AsyncIoError::WriteVectored)?;
715 
716             let mut r = 0;
717             for b in slices.iter() {
718                 r += file.write(b).map_err(AsyncIoError::WriteVectored)?;
719             }
720             r
721         };
722 
723         completion_list.push_back((user_data, result as i32));
724         eventfd.write(1).unwrap();
725 
726         Ok(())
727     }
728 
729     fn fsync_sync(
730         &mut self,
731         user_data: Option<u64>,
732         eventfd: &EventFd,
733         completion_list: &mut VecDeque<(u64, i32)>,
734     ) -> AsyncIoResult<()> {
735         let result: i32 = {
736             let mut file = self.file();
737 
738             // Flush
739             file.flush().map_err(AsyncIoError::Fsync)?;
740 
741             0
742         };
743 
744         if let Some(user_data) = user_data {
745             completion_list.push_back((user_data, result));
746             eventfd.write(1).unwrap();
747         }
748 
749         Ok(())
750     }
751 
752     fn file(&mut self) -> MutexGuard<'_, F>;
753 }
754 
755 pub enum ImageType {
756     FixedVhd,
757     Qcow2,
758     Raw,
759     Vhdx,
760 }
761 
762 const QCOW_MAGIC: u32 = 0x5146_49fb; // "QFI\xfb": QCOW2 header magic, read big-endian
763 const VHDX_SIGN: u64 = 0x656C_6966_7864_6876; // "vhdxfile" read as a little-endian u64
764 
765 /// Read a block into memory aligned by the source block size (needed for O_DIRECT)
766 pub fn read_aligned_block_size(f: &mut File) -> std::io::Result<Vec<u8>> {
767     let blocksize = DiskTopology::probe(f)?.logical_block_size as usize;
768     // SAFETY: We are allocating memory that is naturally aligned (size = alignment) and we meet
769     // requirements for safety from Vec::from_raw_parts() as we are using the global allocator
770     // and transferring ownership of the memory.
771     let mut data = unsafe {
772         Vec::from_raw_parts(
773             alloc_zeroed(Layout::from_size_align_unchecked(blocksize, blocksize)),
774             blocksize,
775             blocksize,
776         )
777     };
778     f.read_exact(&mut data)?;
779     Ok(data)
780 }
781 
782 /// Determine image type through file parsing.
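///
/// A minimal usage sketch (hypothetical image path):
///
/// ```ignore
/// let mut file = std::fs::File::open("/var/lib/images/disk.img")?;
/// let image_type = detect_image_type(&mut file)?;
/// ```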
783 pub fn detect_image_type(f: &mut File) -> std::io::Result<ImageType> {
784     let block = read_aligned_block_size(f)?;
785 
786     // Check the first 4 bytes of the header to determine the image type
787     let image_type = if u32::from_be_bytes(block[0..4].try_into().unwrap()) == QCOW_MAGIC {
788         ImageType::Qcow2
789     } else if vhd::is_fixed_vhd(f)? {
790         ImageType::FixedVhd
791     } else if u64::from_le_bytes(block[0..8].try_into().unwrap()) == VHDX_SIGN {
792         ImageType::Vhdx
793     } else {
794         ImageType::Raw
795     };
796 
797     Ok(image_type)
798 }
799 
800 pub trait BlockBackend: Read + Write + Seek + Send + Debug {
801     fn size(&self) -> Result<u64, Error>;
802 }
803 
804 #[derive(Debug)]
805 pub struct DiskTopology {
806     pub logical_block_size: u64,
807     pub physical_block_size: u64,
808     pub minimum_io_size: u64,
809     pub optimal_io_size: u64,
810 }
811 
812 impl Default for DiskTopology {
813     fn default() -> Self {
814         Self {
815             logical_block_size: 512,
816             physical_block_size: 512,
817             minimum_io_size: 512,
818             optimal_io_size: 0,
819         }
820     }
821 }
822 
823 ioctl_io_nr!(BLKSSZGET, 0x12, 104);
824 ioctl_io_nr!(BLKPBSZGET, 0x12, 123);
825 ioctl_io_nr!(BLKIOMIN, 0x12, 120);
826 ioctl_io_nr!(BLKIOOPT, 0x12, 121);
827 
828 enum BlockSize {
829     LogicalBlock,
830     PhysicalBlock,
831     MinimumIo,
832     OptimalIo,
833 }
834 
835 impl DiskTopology {
836     fn is_block_device(f: &File) -> std::io::Result<bool> {
837         let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
838         // SAFETY: FFI call with a valid fd and buffer
839         let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
840         if ret != 0 {
841             return Err(std::io::Error::last_os_error());
842         }
843 
844         // SAFETY: stat is valid at this point
845         let is_block = unsafe { (*stat.as_ptr()).st_mode & S_IFMT == S_IFBLK };
846         Ok(is_block)
847     }
848 
849     // libc::ioctl() takes different types on different architectures
850     fn query_block_size(f: &File, block_size_type: BlockSize) -> std::io::Result<u64> {
851         let mut block_size = 0;
852         // SAFETY: FFI call with correct arguments
853         let ret = unsafe {
854             ioctl(
855                 f.as_raw_fd(),
856                 match block_size_type {
857                     BlockSize::LogicalBlock => BLKSSZGET(),
858                     BlockSize::PhysicalBlock => BLKPBSZGET(),
859                     BlockSize::MinimumIo => BLKIOMIN(),
860                     BlockSize::OptimalIo => BLKIOOPT(),
861                 } as _,
862                 &mut block_size,
863             )
864         };
865         if ret != 0 {
866             return Err(std::io::Error::last_os_error());
867         };
868 
869         Ok(block_size)
870     }
871 
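    /// Probe the disk's I/O topology: block devices are queried through the
    /// BLK* ioctls above, while regular files fall back to `DiskTopology::default()`.
    ///
    /// A minimal usage sketch (hypothetical device node):
    ///
    /// ```ignore
    /// let file = std::fs::File::open("/dev/vdb")?;
    /// let topology = DiskTopology::probe(&file)?;
    /// println!("logical block size: {}", topology.logical_block_size);
    /// ```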
872     pub fn probe(f: &File) -> std::io::Result<Self> {
873         if !Self::is_block_device(f)? {
874             return Ok(DiskTopology::default());
875         }
876 
877         Ok(DiskTopology {
878             logical_block_size: Self::query_block_size(f, BlockSize::LogicalBlock)?,
879             physical_block_size: Self::query_block_size(f, BlockSize::PhysicalBlock)?,
880             minimum_io_size: Self::query_block_size(f, BlockSize::MinimumIo)?,
881             optimal_io_size: Self::query_block_size(f, BlockSize::OptimalIo)?,
882         })
883     }
884 }
885