// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause

#[macro_use]
extern crate log;

pub mod async_io;
pub mod fcntl;
pub mod fixed_vhd;
#[cfg(feature = "io_uring")]
/// Enabled with the `"io_uring"` feature
pub mod fixed_vhd_async;
pub mod fixed_vhd_sync;
pub mod qcow;
pub mod qcow_sync;
#[cfg(feature = "io_uring")]
/// Async primitives based on `io-uring`
///
/// Enabled with the `"io_uring"` feature
pub mod raw_async;
pub mod raw_async_aio;
pub mod raw_sync;
pub mod vhd;
pub mod vhdx;
pub mod vhdx_sync;

use std::alloc::{alloc_zeroed, dealloc, Layout};
use std::collections::VecDeque;
use std::fmt::Debug;
use std::fs::File;
use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write};
use std::os::linux::fs::MetadataExt;
use std::os::unix::io::AsRawFd;
use std::path::Path;
use std::sync::{Arc, MutexGuard};
use std::time::Instant;
use std::{cmp, result};

#[cfg(feature = "io_uring")]
use io_uring::{opcode, IoUring, Probe};
use libc::{ioctl, S_IFBLK, S_IFMT};
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use thiserror::Error;
use virtio_bindings::virtio_blk::*;
use virtio_queue::DescriptorChain;
use vm_memory::bitmap::Bitmap;
use vm_memory::{
    ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryLoadGuard,
};
use vm_virtio::{AccessPlatform, Translatable};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::{aio, ioctl_io_nr};

use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult};
use crate::vhdx::VhdxError;

const SECTOR_SHIFT: u8 = 9;
pub const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT;

#[derive(Error, Debug)]
pub enum Error {
    #[error("Guest gave us bad memory addresses")]
    GuestMemory(#[source] GuestMemoryError),
    #[error("Guest gave us offsets that would have overflowed a usize")]
    CheckedOffset(GuestAddress, usize /* sector offset */),
    #[error("Guest gave us a write only descriptor that protocol says to read from")]
    UnexpectedWriteOnlyDescriptor,
    #[error("Guest gave us a read only descriptor that protocol says to write to")]
    UnexpectedReadOnlyDescriptor,
    #[error("Guest gave us too few descriptors in a descriptor chain")]
    DescriptorChainTooShort,
    #[error("Guest gave us a descriptor that was too short to use")]
    DescriptorLengthTooSmall,
    #[error("Failed to detect image type")]
    DetectImageType(#[source] std::io::Error),
    #[error("Failure in fixed vhd")]
    FixedVhdError(#[source] std::io::Error),
    #[error("Failed to get a block's metadata")]
    GetFileMetadata,
    #[error("The requested operation would cause a seek beyond disk end")]
    InvalidOffset,
    #[error("Failure in qcow")]
    QcowError(#[source] qcow::Error),
    #[error("Failure in raw file")]
    RawFileError(#[source] std::io::Error),
    #[error("The requested operation does not support multiple descriptors")]
    TooManyDescriptors,
    #[error("Failure in vhdx")]
    VhdxError(#[source] VhdxError),
}

fn build_device_id(disk_path: &Path) -> result::Result<String, Error> {
    let blk_metadata = match disk_path.metadata() {
        Err(_) => return Err(Error::GetFileMetadata),
        Ok(m) => m,
    };
    // This is how kvmtool does it.
    let device_id = format!(
        "{}{}{}",
        blk_metadata.st_dev(),
        blk_metadata.st_rdev(),
        blk_metadata.st_ino()
    );
    Ok(device_id)
}

pub fn build_serial(disk_path: &Path) -> Vec<u8> {
    let mut default_serial = vec![0; VIRTIO_BLK_ID_BYTES as usize];
    match build_device_id(disk_path) {
        Err(_) => {
            warn!("Could not generate device id. We'll use a default.");
        }
        Ok(m) => {
            // The kernel only knows to read a maximum of VIRTIO_BLK_ID_BYTES.
            // This will also zero out any leftover bytes.
            let disk_id = m.as_bytes();
            let bytes_to_copy = cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize);
            default_serial[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy])
        }
    }
    default_serial
}
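
// A hedged usage sketch (not an API provided by this crate): a virtio-blk device
// would typically build the serial once when it is created. The path below is
// illustrative only.
//
//     // let serial = build_serial(Path::new("/path/to/disk.img"));
//     // assert_eq!(serial.len(), VIRTIO_BLK_ID_BYTES as usize);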

#[derive(Error, Debug)]
pub enum ExecuteError {
    #[error("Bad request")]
    BadRequest(#[source] Error),
    #[error("Failed to flush")]
    Flush(#[source] io::Error),
    #[error("Failed to read")]
    Read(#[source] GuestMemoryError),
    #[error("Failed to read_exact")]
    ReadExact(#[source] io::Error),
    #[error("Failed to seek")]
    Seek(#[source] io::Error),
    #[error("Failed to write")]
    Write(#[source] GuestMemoryError),
    #[error("Failed to write_all")]
    WriteAll(#[source] io::Error),
    #[error("Unsupported request: {0}")]
    Unsupported(u32),
    #[error("Failed to submit io uring")]
    SubmitIoUring(#[source] io::Error),
    #[error("Failed to get host address")]
    GetHostAddress(#[source] GuestMemoryError),
    #[error("Failed to async read")]
    AsyncRead(#[source] AsyncIoError),
    #[error("Failed to async write")]
    AsyncWrite(#[source] AsyncIoError),
    #[error("Failed to async flush")]
    AsyncFlush(#[source] AsyncIoError),
    #[error("Failed allocating a temporary buffer")]
    TemporaryBufferAllocation(#[source] io::Error),
}

impl ExecuteError {
    pub fn status(&self) -> u8 {
        let status = match *self {
            ExecuteError::BadRequest(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::ReadExact(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::WriteAll(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
            ExecuteError::SubmitIoUring(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::GetHostAddress(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::AsyncRead(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::AsyncWrite(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::AsyncFlush(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::TemporaryBufferAllocation(_) => VIRTIO_BLK_S_IOERR,
        };
        status as u8
    }
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RequestType {
    In,
    Out,
    Flush,
    GetDeviceId,
    Unsupported(u32),
}

pub fn request_type<B: Bitmap + 'static>(
    mem: &vm_memory::GuestMemoryMmap<B>,
    desc_addr: GuestAddress,
) -> result::Result<RequestType, Error> {
    let type_ = mem.read_obj(desc_addr).map_err(Error::GuestMemory)?;
    match type_ {
        VIRTIO_BLK_T_IN => Ok(RequestType::In),
        VIRTIO_BLK_T_OUT => Ok(RequestType::Out),
        VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush),
        VIRTIO_BLK_T_GET_ID => Ok(RequestType::GetDeviceId),
        t => Ok(RequestType::Unsupported(t)),
    }
}

fn sector<B: Bitmap + 'static>(
    mem: &vm_memory::GuestMemoryMmap<B>,
    desc_addr: GuestAddress,
) -> result::Result<u64, Error> {
    const SECTOR_OFFSET: usize = 8;
    let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) {
        Some(v) => v,
        None => return Err(Error::CheckedOffset(desc_addr, SECTOR_OFFSET)),
    };

    mem.read_obj(addr).map_err(Error::GuestMemory)
}
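
// Layout assumed by request_type() and sector() above, mirroring the virtio-blk
// request header from the virtio specification: a 4-byte type, a 4-byte reserved
// field, then the 8-byte sector, which is why SECTOR_OFFSET is 8. The struct
// below is illustrative only; the code reads the fields directly from guest
// memory rather than through such a type.
//
//     // #[repr(C)]
//     // struct VirtioBlkReqHeader {
//     //     type_: u32,   // one of the VIRTIO_BLK_T_* values
//     //     reserved: u32,
//     //     sector: u64,  // read by sector() at offset 8
//     // }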

const DEFAULT_DESCRIPTOR_VEC_SIZE: usize = 32;

#[derive(Debug)]
pub struct AlignedOperation {
    origin_ptr: u64,
    aligned_ptr: u64,
    size: usize,
    layout: Layout,
}

#[derive(Debug)]
pub struct Request {
    pub request_type: RequestType,
    pub sector: u64,
    pub data_descriptors: SmallVec<[(GuestAddress, u32); DEFAULT_DESCRIPTOR_VEC_SIZE]>,
    pub status_addr: GuestAddress,
    pub writeback: bool,
    pub aligned_operations: SmallVec<[AlignedOperation; DEFAULT_DESCRIPTOR_VEC_SIZE]>,
    pub start: Instant,
}

impl Request {
    pub fn parse<B: Bitmap + 'static>(
        desc_chain: &mut DescriptorChain<GuestMemoryLoadGuard<vm_memory::GuestMemoryMmap<B>>>,
        access_platform: Option<&Arc<dyn AccessPlatform>>,
    ) -> result::Result<Request, Error> {
        let hdr_desc = desc_chain
            .next()
            .ok_or(Error::DescriptorChainTooShort)
            .inspect_err(|_| {
                error!("Missing head descriptor");
            })?;

        // The head contains the request type which MUST be readable.
        if hdr_desc.is_write_only() {
            return Err(Error::UnexpectedWriteOnlyDescriptor);
        }

        let hdr_desc_addr = hdr_desc
            .addr()
            .translate_gva(access_platform, hdr_desc.len() as usize);

        let mut req = Request {
            request_type: request_type(desc_chain.memory(), hdr_desc_addr)?,
            sector: sector(desc_chain.memory(), hdr_desc_addr)?,
            data_descriptors: SmallVec::with_capacity(DEFAULT_DESCRIPTOR_VEC_SIZE),
            status_addr: GuestAddress(0),
            writeback: true,
            aligned_operations: SmallVec::with_capacity(DEFAULT_DESCRIPTOR_VEC_SIZE),
            start: Instant::now(),
        };

        let status_desc;
        let mut desc = desc_chain
            .next()
            .ok_or(Error::DescriptorChainTooShort)
            .inspect_err(|_| {
                error!("Only head descriptor present: request = {:?}", req);
            })?;

        if !desc.has_next() {
            status_desc = desc;
            // Only flush requests are allowed to skip the data descriptor.
            if req.request_type != RequestType::Flush {
                error!("Need a data descriptor: request = {:?}", req);
                return Err(Error::DescriptorChainTooShort);
            }
        } else {
            req.data_descriptors.reserve_exact(1);
            while desc.has_next() {
                if desc.is_write_only() && req.request_type == RequestType::Out {
                    return Err(Error::UnexpectedWriteOnlyDescriptor);
                }
                if !desc.is_write_only() && req.request_type == RequestType::In {
                    return Err(Error::UnexpectedReadOnlyDescriptor);
                }
                if !desc.is_write_only() && req.request_type == RequestType::GetDeviceId {
                    return Err(Error::UnexpectedReadOnlyDescriptor);
                }

                req.data_descriptors.push((
                    desc.addr()
                        .translate_gva(access_platform, desc.len() as usize),
                    desc.len(),
                ));
                desc = desc_chain
                    .next()
                    .ok_or(Error::DescriptorChainTooShort)
                    .inspect_err(|_| {
                        error!("DescriptorChain corrupted: request = {:?}", req);
                    })?;
            }
            status_desc = desc;
        }

        // The status MUST always be writable.
        if !status_desc.is_write_only() {
            return Err(Error::UnexpectedReadOnlyDescriptor);
        }

        if status_desc.len() < 1 {
            return Err(Error::DescriptorLengthTooSmall);
        }

        req.status_addr = status_desc
            .addr()
            .translate_gva(access_platform, status_desc.len() as usize);

        Ok(req)
    }

    pub fn execute<T: Seek + Read + Write, B: Bitmap + 'static>(
        &self,
        disk: &mut T,
        disk_nsectors: u64,
        mem: &vm_memory::GuestMemoryMmap<B>,
        serial: &[u8],
    ) -> result::Result<u32, ExecuteError> {
        disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT))
            .map_err(ExecuteError::Seek)?;
        let mut len = 0;
        for (data_addr, data_len) in &self.data_descriptors {
            let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
            if u64::from(*data_len) % SECTOR_SIZE != 0 {
                top += 1;
            }
            top = top
                .checked_add(self.sector)
                .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
            if top > disk_nsectors {
                return Err(ExecuteError::BadRequest(Error::InvalidOffset));
            }

            match self.request_type {
                RequestType::In => {
                    let mut buf = vec![0u8; *data_len as usize];
                    disk.read_exact(&mut buf).map_err(ExecuteError::ReadExact)?;
                    mem.read_exact_volatile_from(
                        *data_addr,
                        &mut buf.as_slice(),
                        *data_len as usize,
                    )
                    .map_err(ExecuteError::Read)?;
                    len += data_len;
                }
                RequestType::Out => {
                    let mut buf: Vec<u8> = Vec::new();
                    mem.write_all_volatile_to(*data_addr, &mut buf, *data_len as usize)
                        .map_err(ExecuteError::Write)?;
                    disk.write_all(&buf).map_err(ExecuteError::WriteAll)?;
                    if !self.writeback {
                        disk.flush().map_err(ExecuteError::Flush)?;
                    }
                }
                RequestType::Flush => disk.flush().map_err(ExecuteError::Flush)?,
                RequestType::GetDeviceId => {
                    if (*data_len as usize) < serial.len() {
                        return Err(ExecuteError::BadRequest(Error::InvalidOffset));
                    }
                    mem.write_slice(serial, *data_addr)
                        .map_err(ExecuteError::Write)?;
                }
                RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
            };
        }
        Ok(len)
    }

    pub fn execute_async<B: Bitmap + 'static>(
        &mut self,
        mem: &vm_memory::GuestMemoryMmap<B>,
        disk_nsectors: u64,
        disk_image: &mut dyn AsyncIo,
        serial: &[u8],
        user_data: u64,
    ) -> result::Result<bool, ExecuteError> {
        let sector = self.sector;
        let request_type = self.request_type;
        let offset = (sector << SECTOR_SHIFT) as libc::off_t;

        let mut iovecs: SmallVec<[libc::iovec; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
            SmallVec::with_capacity(self.data_descriptors.len());
        for &(data_addr, data_len) in &self.data_descriptors {
            let _: u32 = data_len; // compiler-checked documentation
            const _: () = assert!(
                core::mem::size_of::<u32>() <= core::mem::size_of::<usize>(),
                "unsupported platform"
            );
            if data_len == 0 {
                continue;
            }
            let mut top: u64 = u64::from(data_len) / SECTOR_SIZE;
            if u64::from(data_len) % SECTOR_SIZE != 0 {
                top += 1;
            }
            let data_len = data_len as usize;
            top = top
                .checked_add(sector)
                .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
            if top > disk_nsectors {
                return Err(ExecuteError::BadRequest(Error::InvalidOffset));
            }

            let origin_ptr = mem
                .get_slice(data_addr, data_len)
                .map_err(ExecuteError::GetHostAddress)?
                .ptr_guard();

            // Verify the buffer alignment.
            // In case it's not properly aligned, an intermediate buffer is
            // created with the correct alignment, and a copy from/to the
            // origin buffer is performed, depending on the type of operation.
            let iov_base = if (origin_ptr.as_ptr() as u64) % SECTOR_SIZE != 0 {
                let layout = Layout::from_size_align(data_len, SECTOR_SIZE as usize).unwrap();
                // SAFETY: layout has non-zero size
                let aligned_ptr = unsafe { alloc_zeroed(layout) };
                if aligned_ptr.is_null() {
                    return Err(ExecuteError::TemporaryBufferAllocation(
                        io::Error::last_os_error(),
                    ));
                }

                // We need to perform the copy beforehand in case we're writing
                // data out.
                if request_type == RequestType::Out {
                    // SAFETY: destination buffer has been allocated with
                    // the proper size.
                    unsafe { std::ptr::copy(origin_ptr.as_ptr(), aligned_ptr, data_len) };
                }

                // Store both origin and aligned pointers for complete_async()
                // to process them.
                self.aligned_operations.push(AlignedOperation {
                    origin_ptr: origin_ptr.as_ptr() as u64,
                    aligned_ptr: aligned_ptr as u64,
                    size: data_len,
                    layout,
                });

                aligned_ptr as *mut libc::c_void
            } else {
                origin_ptr.as_ptr() as *mut libc::c_void
            };

            let iovec = libc::iovec {
                iov_base,
                iov_len: data_len as libc::size_t,
            };
            iovecs.push(iovec);
        }

        // Queue operations expected to be submitted.
        match request_type {
            RequestType::In => {
                for (data_addr, data_len) in &self.data_descriptors {
                    mem.get_slice(*data_addr, *data_len as usize)
                        .map_err(ExecuteError::GetHostAddress)?
                        .bitmap()
                        .mark_dirty(0, *data_len as usize);
                }
                disk_image
                    .read_vectored(offset, &iovecs, user_data)
                    .map_err(ExecuteError::AsyncRead)?;
            }
            RequestType::Out => {
                disk_image
                    .write_vectored(offset, &iovecs, user_data)
                    .map_err(ExecuteError::AsyncWrite)?;
            }
            RequestType::Flush => {
                disk_image
                    .fsync(Some(user_data))
                    .map_err(ExecuteError::AsyncFlush)?;
            }
            RequestType::GetDeviceId => {
                let (data_addr, data_len) = if self.data_descriptors.len() == 1 {
                    (self.data_descriptors[0].0, self.data_descriptors[0].1)
                } else {
                    return Err(ExecuteError::BadRequest(Error::TooManyDescriptors));
                };
                if (data_len as usize) < serial.len() {
                    return Err(ExecuteError::BadRequest(Error::InvalidOffset));
                }
                mem.write_slice(serial, data_addr)
                    .map_err(ExecuteError::Write)?;
                return Ok(false);
            }
            RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
        }

        Ok(true)
    }

    pub fn complete_async(&mut self) -> result::Result<(), Error> {
        for aligned_operation in self.aligned_operations.drain(..) {
            // We need to perform the copy after the data has been read inside
            // the aligned buffer in case we're reading data in.
            if self.request_type == RequestType::In {
                // SAFETY: origin buffer has been allocated with the
                // proper size.
                unsafe {
                    std::ptr::copy(
                        aligned_operation.aligned_ptr as *const u8,
                        aligned_operation.origin_ptr as *mut u8,
                        aligned_operation.size,
                    )
                };
            }

            // Free the temporary aligned buffer.
            // SAFETY: aligned_ptr was allocated by alloc_zeroed with the same
            // layout
            unsafe {
                dealloc(
                    aligned_operation.aligned_ptr as *mut u8,
                    aligned_operation.layout,
                )
            };
        }

        Ok(())
    }

    pub fn set_writeback(&mut self, writeback: bool) {
        self.writeback = writeback
    }
}
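
// Hedged sketch of the typical synchronous flow for one descriptor chain. The
// queue plumbing (popping the chain, `disk`, `nsectors`, `serial`, `writeback`)
// is assumed to live in the device that consumes this crate and is illustrative
// only.
//
//     // let mut request = Request::parse(&mut chain, None)?;
//     // request.set_writeback(writeback);
//     // let status = match request.execute(&mut disk, nsectors, chain.memory(), &serial) {
//     //     Ok(_) => VIRTIO_BLK_S_OK as u8,
//     //     Err(e) => e.status(),
//     // };
//     // chain.memory().write_obj(status, request.status_addr)?;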

#[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
#[repr(C, packed)]
pub struct VirtioBlockConfig {
    pub capacity: u64,
    pub size_max: u32,
    pub seg_max: u32,
    pub geometry: VirtioBlockGeometry,
    pub blk_size: u32,
    pub physical_block_exp: u8,
    pub alignment_offset: u8,
    pub min_io_size: u16,
    pub opt_io_size: u32,
    pub writeback: u8,
    pub unused: u8,
    pub num_queues: u16,
    pub max_discard_sectors: u32,
    pub max_discard_seg: u32,
    pub discard_sector_alignment: u32,
    pub max_write_zeroes_sectors: u32,
    pub max_write_zeroes_seg: u32,
    pub write_zeroes_may_unmap: u8,
    pub unused1: [u8; 3],
}

#[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
#[repr(C, packed)]
pub struct VirtioBlockGeometry {
    pub cylinders: u16,
    pub heads: u8,
    pub sectors: u8,
}

// SAFETY: data structure only contains a series of integers
unsafe impl ByteValued for VirtioBlockConfig {}
// SAFETY: data structure only contains a series of integers
unsafe impl ByteValued for VirtioBlockGeometry {}
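
// Hedged sketch of how a device might fill the config space from its backend;
// `disk_size` and `topology` are assumptions standing in for values obtained
// from BlockBackend::size() and DiskTopology::probe().
//
//     // let config = VirtioBlockConfig {
//     //     capacity: disk_size / SECTOR_SIZE, // reported in 512-byte sectors
//     //     blk_size: topology.logical_block_size as u32,
//     //     ..Default::default()
//     // };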

/// Check if aio can be used on the current system.
pub fn block_aio_is_supported() -> bool {
    aio::IoContext::new(1).is_ok()
}

/// Check if io_uring for block device can be used on the current system, by
/// verifying that it supports the expected io_uring features.
pub fn block_io_uring_is_supported() -> bool {
    #[cfg(not(feature = "io_uring"))]
    {
        info!("io_uring is disabled by crate features");
        false
    }

    #[cfg(feature = "io_uring")]
    {
        let error_msg = "io_uring not supported:";

        // Check we can create an io_uring instance, which effectively verifies
        // that the io_uring_setup() syscall is supported.
        let io_uring = match IoUring::new(1) {
            Ok(io_uring) => io_uring,
            Err(e) => {
                info!("{} failed to create io_uring instance: {}", error_msg, e);
                return false;
            }
        };

        let submitter = io_uring.submitter();

        let mut probe = Probe::new();

        // Check we can register a probe to validate supported operations.
        match submitter.register_probe(&mut probe) {
            Ok(_) => {}
            Err(e) => {
                info!("{} failed to register a probe: {}", error_msg, e);
                return false;
            }
        }

        // Check IORING_OP_FSYNC is supported
        if !probe.is_supported(opcode::Fsync::CODE) {
            info!("{} IORING_OP_FSYNC operation not supported", error_msg);
            return false;
        }

        // Check IORING_OP_READV is supported
        if !probe.is_supported(opcode::Readv::CODE) {
            info!("{} IORING_OP_READV operation not supported", error_msg);
            return false;
        }

        // Check IORING_OP_WRITEV is supported
        if !probe.is_supported(opcode::Writev::CODE) {
            info!("{} IORING_OP_WRITEV operation not supported", error_msg);
            return false;
        }

        true
    }
}
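
// Hedged sketch of how a caller might pick an I/O engine with these probes;
// `force_sync` is an assumed configuration flag, not part of this crate.
//
//     // let use_io_uring = !force_sync && block_io_uring_is_supported();
//     // let use_aio = !use_io_uring && block_aio_is_supported();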

pub trait AsyncAdaptor<F>
where
    F: Read + Write + Seek,
{
    fn read_vectored_sync(
        &mut self,
        offset: libc::off_t,
        iovecs: &[libc::iovec],
        user_data: u64,
        eventfd: &EventFd,
        completion_list: &mut VecDeque<(u64, i32)>,
    ) -> AsyncIoResult<()> {
        // Convert libc::iovec into IoSliceMut
        let mut slices: SmallVec<[IoSliceMut; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
            SmallVec::with_capacity(iovecs.len());
        for iovec in iovecs.iter() {
            // SAFETY: on Linux IoSliceMut wraps around libc::iovec
            slices.push(IoSliceMut::new(unsafe {
                std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
            }));
        }

        let result = {
            let mut file = self.file();

            // Move the cursor to the right offset
            file.seek(SeekFrom::Start(offset as u64))
                .map_err(AsyncIoError::ReadVectored)?;

            let mut r = 0;
            for b in slices.iter_mut() {
                r += file.read(b).map_err(AsyncIoError::ReadVectored)?;
            }
            r
        };

        completion_list.push_back((user_data, result as i32));
        eventfd.write(1).unwrap();

        Ok(())
    }

    fn write_vectored_sync(
        &mut self,
        offset: libc::off_t,
        iovecs: &[libc::iovec],
        user_data: u64,
        eventfd: &EventFd,
        completion_list: &mut VecDeque<(u64, i32)>,
    ) -> AsyncIoResult<()> {
        // Convert libc::iovec into IoSlice
        let mut slices: SmallVec<[IoSlice; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
            SmallVec::with_capacity(iovecs.len());
        for iovec in iovecs.iter() {
            // SAFETY: on Linux IoSlice wraps around libc::iovec
            slices.push(IoSlice::new(unsafe {
                std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
            }));
        }

        let result = {
            let mut file = self.file();

            // Move the cursor to the right offset
            file.seek(SeekFrom::Start(offset as u64))
                .map_err(AsyncIoError::WriteVectored)?;

            let mut r = 0;
            for b in slices.iter() {
                r += file.write(b).map_err(AsyncIoError::WriteVectored)?;
            }
            r
        };

        completion_list.push_back((user_data, result as i32));
        eventfd.write(1).unwrap();

        Ok(())
    }

    fn fsync_sync(
        &mut self,
        user_data: Option<u64>,
        eventfd: &EventFd,
        completion_list: &mut VecDeque<(u64, i32)>,
    ) -> AsyncIoResult<()> {
        let result: i32 = {
            let mut file = self.file();

            // Flush
            file.flush().map_err(AsyncIoError::Fsync)?;

            0
        };

        if let Some(user_data) = user_data {
            completion_list.push_back((user_data, result));
            eventfd.write(1).unwrap();
        }

        Ok(())
    }

    fn file(&mut self) -> MutexGuard<'_, F>;
}
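
// Minimal sketch of an AsyncAdaptor implementor, assuming the backing file is
// kept behind an Arc<Mutex<File>> as the synchronous backends in this crate do.
// The names `SyncAdaptor` and `file` are illustrative only.
//
//     // struct SyncAdaptor {
//     //     file: Arc<Mutex<File>>,
//     // }
//     //
//     // impl AsyncAdaptor<File> for SyncAdaptor {
//     //     fn file(&mut self) -> MutexGuard<'_, File> {
//     //         self.file.lock().unwrap()
//     //     }
//     // }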

pub enum ImageType {
    FixedVhd,
    Qcow2,
    Raw,
    Vhdx,
}

const QCOW_MAGIC: u32 = 0x5146_49fb;
const VHDX_SIGN: u64 = 0x656C_6966_7864_6876;

/// Read a block into memory aligned by the source block size (needed for O_DIRECT)
pub fn read_aligned_block_size(f: &mut File) -> std::io::Result<Vec<u8>> {
    let blocksize = DiskTopology::probe(f)?.logical_block_size as usize;
    // SAFETY: We are allocating memory that is naturally aligned (size = alignment) and we meet
    // requirements for safety from Vec::from_raw_parts() as we are using the global allocator
    // and transferring ownership of the memory.
    let mut data = unsafe {
        Vec::from_raw_parts(
            alloc_zeroed(Layout::from_size_align_unchecked(blocksize, blocksize)),
            blocksize,
            blocksize,
        )
    };
    f.read_exact(&mut data)?;
    Ok(data)
}

/// Determine image type through file parsing.
pub fn detect_image_type(f: &mut File) -> std::io::Result<ImageType> {
    let block = read_aligned_block_size(f)?;

    // Check the first 4 bytes of the header for the QCOW magic, then fall back
    // to the other known signatures to determine the image type.
    let image_type = if u32::from_be_bytes(block[0..4].try_into().unwrap()) == QCOW_MAGIC {
        ImageType::Qcow2
    } else if vhd::is_fixed_vhd(f)? {
        ImageType::FixedVhd
    } else if u64::from_le_bytes(block[0..8].try_into().unwrap()) == VHDX_SIGN {
        ImageType::Vhdx
    } else {
        ImageType::Raw
    };

    Ok(image_type)
}
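
// Hedged usage sketch; the path and error handling are illustrative only.
//
//     // let mut file = File::open("/path/to/disk.img")?;
//     // let backend = match detect_image_type(&mut file)? {
//     //     ImageType::Qcow2 => "qcow2",
//     //     ImageType::FixedVhd => "fixed-vhd",
//     //     ImageType::Vhdx => "vhdx",
//     //     ImageType::Raw => "raw",
//     // };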

pub trait BlockBackend: Read + Write + Seek + Send + Debug {
    fn size(&self) -> Result<u64, Error>;
}

#[derive(Debug)]
pub struct DiskTopology {
    pub logical_block_size: u64,
    pub physical_block_size: u64,
    pub minimum_io_size: u64,
    pub optimal_io_size: u64,
}

impl Default for DiskTopology {
    fn default() -> Self {
        Self {
            logical_block_size: 512,
            physical_block_size: 512,
            minimum_io_size: 512,
            optimal_io_size: 0,
        }
    }
}

ioctl_io_nr!(BLKSSZGET, 0x12, 104);
ioctl_io_nr!(BLKPBSZGET, 0x12, 123);
ioctl_io_nr!(BLKIOMIN, 0x12, 120);
ioctl_io_nr!(BLKIOOPT, 0x12, 121);

enum BlockSize {
    LogicalBlock,
    PhysicalBlock,
    MinimumIo,
    OptimalIo,
}

impl DiskTopology {
    fn is_block_device(f: &File) -> std::io::Result<bool> {
        let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
        // SAFETY: FFI call with a valid fd and buffer
        let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
        if ret != 0 {
            return Err(std::io::Error::last_os_error());
        }

        // SAFETY: stat is valid at this point
        let is_block = unsafe { (*stat.as_ptr()).st_mode & S_IFMT == S_IFBLK };
        Ok(is_block)
    }

    // libc::ioctl() takes different types on different architectures
    fn query_block_size(f: &File, block_size_type: BlockSize) -> std::io::Result<u64> {
        let mut block_size = 0;
        // SAFETY: FFI call with correct arguments
        let ret = unsafe {
            ioctl(
                f.as_raw_fd(),
                match block_size_type {
                    BlockSize::LogicalBlock => BLKSSZGET(),
                    BlockSize::PhysicalBlock => BLKPBSZGET(),
                    BlockSize::MinimumIo => BLKIOMIN(),
                    BlockSize::OptimalIo => BLKIOOPT(),
                } as _,
                &mut block_size,
            )
        };
        if ret != 0 {
            return Err(std::io::Error::last_os_error());
        };

        Ok(block_size)
    }

    pub fn probe(f: &File) -> std::io::Result<Self> {
        if !Self::is_block_device(f)? {
            return Ok(DiskTopology::default());
        }

        Ok(DiskTopology {
            logical_block_size: Self::query_block_size(f, BlockSize::LogicalBlock)?,
            physical_block_size: Self::query_block_size(f, BlockSize::PhysicalBlock)?,
            minimum_io_size: Self::query_block_size(f, BlockSize::MinimumIo)?,
            optimal_io_size: Self::query_block_size(f, BlockSize::OptimalIo)?,
        })
    }
}
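
// Hedged usage sketch: probing the topology of an open disk. For regular files,
// probe() falls back to DiskTopology::default() (512-byte blocks), so callers
// such as read_aligned_block_size() get a usable alignment either way.
//
//     // let topology = DiskTopology::probe(&file)?;
//     // info!("logical block size: {}", topology.logical_block_size);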