// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause

#[macro_use]
extern crate log;

pub mod async_io;
pub mod fixed_vhd;
#[cfg(feature = "io_uring")]
/// Enabled with the `"io_uring"` feature
pub mod fixed_vhd_async;
pub mod fixed_vhd_sync;
pub mod qcow;
pub mod qcow_sync;
#[cfg(feature = "io_uring")]
/// Async primitives based on `io-uring`
///
/// Enabled with the `"io_uring"` feature
pub mod raw_async;
pub mod raw_async_aio;
pub mod raw_sync;
pub mod vhd;
pub mod vhdx;
pub mod vhdx_sync;

use std::alloc::{alloc_zeroed, dealloc, Layout};
use std::collections::VecDeque;
use std::fmt::Debug;
use std::fs::File;
use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write};
use std::os::linux::fs::MetadataExt;
use std::os::unix::io::AsRawFd;
use std::path::Path;
use std::sync::{Arc, MutexGuard};
use std::time::Instant;
use std::{cmp, result};

#[cfg(feature = "io_uring")]
use io_uring::{opcode, IoUring, Probe};
use libc::{ioctl, S_IFBLK, S_IFMT};
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use thiserror::Error;
use virtio_bindings::virtio_blk::*;
use virtio_queue::DescriptorChain;
use vm_memory::bitmap::Bitmap;
use vm_memory::{
    ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryLoadGuard,
};
use vm_virtio::{AccessPlatform, Translatable};
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::{aio, ioctl_io_nr, ioctl_ioc_nr};

use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult};
use crate::vhdx::VhdxError;

const SECTOR_SHIFT: u8 = 9;
pub const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT;

#[derive(Error, Debug)]
pub enum Error {
    #[error("Guest gave us bad memory addresses")]
    GuestMemory(GuestMemoryError),
    #[error("Guest gave us offsets that would have overflowed a usize")]
    CheckedOffset(GuestAddress, usize),
    #[error("Guest gave us a write only descriptor that protocol says to read from")]
    UnexpectedWriteOnlyDescriptor,
    #[error("Guest gave us a read only descriptor that protocol says to write to")]
    UnexpectedReadOnlyDescriptor,
    #[error("Guest gave us too few descriptors in a descriptor chain")]
    DescriptorChainTooShort,
    #[error("Guest gave us a descriptor that was too short to use")]
    DescriptorLengthTooSmall,
    #[error("Failed to detect image type: {0}")]
    DetectImageType(std::io::Error),
    #[error("Failure in fixed vhd: {0}")]
    FixedVhdError(std::io::Error),
    #[error("Failed to get block metadata")]
    GetFileMetadata,
    #[error("The requested operation would cause a seek beyond disk end")]
    InvalidOffset,
    #[error("Failure in qcow: {0}")]
    QcowError(qcow::Error),
    #[error("Failure in raw file: {0}")]
    RawFileError(std::io::Error),
    #[error("The requested operation does not support multiple descriptors")]
    TooManyDescriptors,
    #[error("Failure in vhdx: {0}")]
    VhdxError(VhdxError),
}

fn build_device_id(disk_path: &Path) -> result::Result<String, Error> {
    let blk_metadata = match disk_path.metadata() {
        Err(_) => return Err(Error::GetFileMetadata),
        Ok(m) => m,
    };
    // This is how kvmtool does it.
    let device_id = format!(
        "{}{}{}",
        blk_metadata.st_dev(),
        blk_metadata.st_rdev(),
        blk_metadata.st_ino()
    );
    Ok(device_id)
}
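
// Illustrative note (editor's sketch): the id above is just the decimal values
// of st_dev, st_rdev and st_ino concatenated, so a file with st_dev = 2049,
// st_rdev = 0 and st_ino = 131081 would yield "20490131081" (the numbers here
// are hypothetical). build_serial() below truncates or zero-pads that string
// to VIRTIO_BLK_ID_BYTES.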

pub fn build_serial(disk_path: &Path) -> Vec<u8> {
    let mut default_serial = vec![0; VIRTIO_BLK_ID_BYTES as usize];
    match build_device_id(disk_path) {
        Err(_) => {
            warn!("Could not generate device id. We'll use a default.");
        }
        Ok(m) => {
            // The kernel only knows to read a maximum of VIRTIO_BLK_ID_BYTES.
            // This will also zero out any leftover bytes.
            let disk_id = m.as_bytes();
            let bytes_to_copy = cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize);
            default_serial[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy])
        }
    }
    default_serial
}

#[derive(Error, Debug)]
pub enum ExecuteError {
    #[error("Bad request: {0}")]
    BadRequest(Error),
    #[error("Failed to flush: {0}")]
    Flush(io::Error),
    #[error("Failed to read: {0}")]
    Read(GuestMemoryError),
    #[error("Failed to read_exact: {0}")]
    ReadExact(io::Error),
    #[error("Failed to seek: {0}")]
    Seek(io::Error),
    #[error("Failed to write: {0}")]
    Write(GuestMemoryError),
    #[error("Failed to write_all: {0}")]
    WriteAll(io::Error),
    #[error("Unsupported request: {0}")]
    Unsupported(u32),
    #[error("Failed to submit io uring: {0}")]
    SubmitIoUring(io::Error),
    #[error("Failed to get host address: {0}")]
    GetHostAddress(GuestMemoryError),
    #[error("Failed to async read: {0}")]
    AsyncRead(AsyncIoError),
    #[error("Failed to async write: {0}")]
    AsyncWrite(AsyncIoError),
    #[error("Failed to async flush: {0}")]
    AsyncFlush(AsyncIoError),
    #[error("Failed allocating a temporary buffer: {0}")]
    TemporaryBufferAllocation(io::Error),
}

impl ExecuteError {
    pub fn status(&self) -> u8 {
        let status = match *self {
            ExecuteError::BadRequest(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::ReadExact(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::WriteAll(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
            ExecuteError::SubmitIoUring(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::GetHostAddress(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::AsyncRead(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::AsyncWrite(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::AsyncFlush(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::TemporaryBufferAllocation(_) => VIRTIO_BLK_S_IOERR,
        };
        status as u8
    }
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RequestType {
    In,
    Out,
    Flush,
    GetDeviceId,
    Unsupported(u32),
}

pub fn request_type<B: Bitmap + 'static>(
    mem: &vm_memory::GuestMemoryMmap<B>,
    desc_addr: GuestAddress,
) -> result::Result<RequestType, Error> {
    let type_ = mem.read_obj(desc_addr).map_err(Error::GuestMemory)?;
    match type_ {
        VIRTIO_BLK_T_IN => Ok(RequestType::In),
        VIRTIO_BLK_T_OUT => Ok(RequestType::Out),
        VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush),
        VIRTIO_BLK_T_GET_ID => Ok(RequestType::GetDeviceId),
        t => Ok(RequestType::Unsupported(t)),
    }
}

fn sector<B: Bitmap + 'static>(
    mem: &vm_memory::GuestMemoryMmap<B>,
    desc_addr: GuestAddress,
) -> result::Result<u64, Error> {
    const SECTOR_OFFSET: usize = 8;
    let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) {
        Some(v) => v,
        None => return Err(Error::CheckedOffset(desc_addr, SECTOR_OFFSET)),
    };

    mem.read_obj(addr).map_err(Error::GuestMemory)
}
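
// For reference, request_type() and sector() above read fields of the
// virtio-blk request header at offsets 0 and 8 respectively. A minimal sketch
// of that header layout, following the virtio specification (this struct is
// purely illustrative and not defined by this crate):
//
//     #[repr(C)]
//     struct VirtioBlkReqHeader {
//         request_type: u32, // VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, ...
//         reserved: u32,
//         sector: u64,       // disk offset, in 512-byte sectors
//     }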

const DEFAULT_DESCRIPTOR_VEC_SIZE: usize = 32;

#[derive(Debug)]
pub struct AlignedOperation {
    origin_ptr: u64,
    aligned_ptr: u64,
    size: usize,
    layout: Layout,
}

#[derive(Debug)]
pub struct Request {
    pub request_type: RequestType,
    pub sector: u64,
    pub data_descriptors: SmallVec<[(GuestAddress, u32); DEFAULT_DESCRIPTOR_VEC_SIZE]>,
    pub status_addr: GuestAddress,
    pub writeback: bool,
    pub aligned_operations: SmallVec<[AlignedOperation; DEFAULT_DESCRIPTOR_VEC_SIZE]>,
    pub start: Instant,
}

impl Request {
    pub fn parse<B: Bitmap + 'static>(
        desc_chain: &mut DescriptorChain<GuestMemoryLoadGuard<vm_memory::GuestMemoryMmap<B>>>,
        access_platform: Option<&Arc<dyn AccessPlatform>>,
    ) -> result::Result<Request, Error> {
        let hdr_desc = desc_chain
            .next()
            .ok_or(Error::DescriptorChainTooShort)
            .inspect_err(|_| {
                error!("Missing head descriptor");
            })?;

        // The head contains the request type which MUST be readable.
        if hdr_desc.is_write_only() {
            return Err(Error::UnexpectedWriteOnlyDescriptor);
        }

        let hdr_desc_addr = hdr_desc
            .addr()
            .translate_gva(access_platform, hdr_desc.len() as usize);

        let mut req = Request {
            request_type: request_type(desc_chain.memory(), hdr_desc_addr)?,
            sector: sector(desc_chain.memory(), hdr_desc_addr)?,
            data_descriptors: SmallVec::with_capacity(DEFAULT_DESCRIPTOR_VEC_SIZE),
            status_addr: GuestAddress(0),
            writeback: true,
            aligned_operations: SmallVec::with_capacity(DEFAULT_DESCRIPTOR_VEC_SIZE),
            start: Instant::now(),
        };

        let status_desc;
        let mut desc = desc_chain
            .next()
            .ok_or(Error::DescriptorChainTooShort)
            .inspect_err(|_| {
                error!("Only head descriptor present: request = {:?}", req);
            })?;

        if !desc.has_next() {
            status_desc = desc;
            // Only flush requests are allowed to skip the data descriptor.
            if req.request_type != RequestType::Flush {
                error!("Need a data descriptor: request = {:?}", req);
                return Err(Error::DescriptorChainTooShort);
            }
        } else {
            req.data_descriptors.reserve_exact(1);
            while desc.has_next() {
                if desc.is_write_only() && req.request_type == RequestType::Out {
                    return Err(Error::UnexpectedWriteOnlyDescriptor);
                }
                if !desc.is_write_only() && req.request_type == RequestType::In {
                    return Err(Error::UnexpectedReadOnlyDescriptor);
                }
                if !desc.is_write_only() && req.request_type == RequestType::GetDeviceId {
                    return Err(Error::UnexpectedReadOnlyDescriptor);
                }

                req.data_descriptors.push((
                    desc.addr()
                        .translate_gva(access_platform, desc.len() as usize),
                    desc.len(),
                ));
                desc = desc_chain
                    .next()
                    .ok_or(Error::DescriptorChainTooShort)
                    .inspect_err(|_| {
                        error!("DescriptorChain corrupted: request = {:?}", req);
                    })?;
            }
            status_desc = desc;
        }

        // The status MUST always be writable.
        if !status_desc.is_write_only() {
            return Err(Error::UnexpectedReadOnlyDescriptor);
        }

        if status_desc.len() < 1 {
            return Err(Error::DescriptorLengthTooSmall);
        }

        req.status_addr = status_desc
            .addr()
            .translate_gva(access_platform, status_desc.len() as usize);

        Ok(req)
    }
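
    // A sketch of the descriptor chain parse() expects, per the virtio-blk
    // protocol (illustrative only; the exact number of data descriptors
    // varies per request):
    //
    //     [0]     header  - device-readable, virtio_blk_req header (type/sector)
    //     [1..n]  data    - device-writable for IN/GET_ID, device-readable for
    //                       OUT, absent for FLUSH
    //     [n+1]   status  - device-writable, at least 1 byte for the status code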

    pub fn execute<T: Seek + Read + Write, B: Bitmap + 'static>(
        &self,
        disk: &mut T,
        disk_nsectors: u64,
        mem: &vm_memory::GuestMemoryMmap<B>,
        serial: &[u8],
    ) -> result::Result<u32, ExecuteError> {
        disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT))
            .map_err(ExecuteError::Seek)?;
        let mut len = 0;
        for (data_addr, data_len) in &self.data_descriptors {
            let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
            if u64::from(*data_len) % SECTOR_SIZE != 0 {
                top += 1;
            }
            top = top
                .checked_add(self.sector)
                .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
            if top > disk_nsectors {
                return Err(ExecuteError::BadRequest(Error::InvalidOffset));
            }

            match self.request_type {
                RequestType::In => {
                    let mut buf = vec![0u8; *data_len as usize];
                    disk.read_exact(&mut buf).map_err(ExecuteError::ReadExact)?;
                    mem.read_exact_volatile_from(
                        *data_addr,
                        &mut buf.as_slice(),
                        *data_len as usize,
                    )
                    .map_err(ExecuteError::Read)?;
                    len += data_len;
                }
                RequestType::Out => {
                    let mut buf: Vec<u8> = Vec::new();
                    mem.write_all_volatile_to(*data_addr, &mut buf, *data_len as usize)
                        .map_err(ExecuteError::Write)?;
                    disk.write_all(&buf).map_err(ExecuteError::WriteAll)?;
                    if !self.writeback {
                        disk.flush().map_err(ExecuteError::Flush)?;
                    }
                }
                RequestType::Flush => disk.flush().map_err(ExecuteError::Flush)?,
                RequestType::GetDeviceId => {
                    if (*data_len as usize) < serial.len() {
                        return Err(ExecuteError::BadRequest(Error::InvalidOffset));
                    }
                    mem.write_slice(serial, *data_addr)
                        .map_err(ExecuteError::Write)?;
                }
                RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
            };
        }
        Ok(len)
    }
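
    // A minimal sketch of the synchronous path, assuming a raw image backed by
    // a plain std::fs::File (File implements Read + Write + Seek). The
    // desc_chain, mem, nsectors and serial values are assumed to come from the
    // caller and are not defined here:
    //
    //     let request = Request::parse(&mut desc_chain, None)?;
    //     let status = match request.execute(&mut file, nsectors, &mem, &serial) {
    //         Ok(_) => VIRTIO_BLK_S_OK as u8,
    //         Err(e) => e.status(),
    //     };
    //     mem.write_obj(status, request.status_addr)?;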

    pub fn execute_async<B: Bitmap + 'static>(
        &mut self,
        mem: &vm_memory::GuestMemoryMmap<B>,
        disk_nsectors: u64,
        disk_image: &mut dyn AsyncIo,
        serial: &[u8],
        user_data: u64,
    ) -> result::Result<bool, ExecuteError> {
        let sector = self.sector;
        let request_type = self.request_type;
        let offset = (sector << SECTOR_SHIFT) as libc::off_t;

        let mut iovecs: SmallVec<[libc::iovec; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
            SmallVec::with_capacity(self.data_descriptors.len());
        for (data_addr, data_len) in &self.data_descriptors {
            if *data_len == 0 {
                continue;
            }
            let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
            if u64::from(*data_len) % SECTOR_SIZE != 0 {
                top += 1;
            }
            top = top
                .checked_add(sector)
                .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
            if top > disk_nsectors {
                return Err(ExecuteError::BadRequest(Error::InvalidOffset));
            }

            let origin_ptr = mem
                .get_slice(*data_addr, *data_len as usize)
                .map_err(ExecuteError::GetHostAddress)?
                .ptr_guard();

            // Verify the buffer alignment.
            // In case it's not properly aligned, an intermediate buffer is
            // created with the correct alignment, and a copy from/to the
            // origin buffer is performed, depending on the type of operation.
            let iov_base = if (origin_ptr.as_ptr() as u64) % SECTOR_SIZE != 0 {
                let layout =
                    Layout::from_size_align(*data_len as usize, SECTOR_SIZE as usize).unwrap();
                // SAFETY: layout has non-zero size
                let aligned_ptr = unsafe { alloc_zeroed(layout) };
                if aligned_ptr.is_null() {
                    return Err(ExecuteError::TemporaryBufferAllocation(
                        io::Error::last_os_error(),
                    ));
                }

                // We need to perform the copy beforehand in case we're writing
                // data out.
                if request_type == RequestType::Out {
                    // SAFETY: destination buffer has been allocated with
                    // the proper size.
                    unsafe { std::ptr::copy(origin_ptr.as_ptr(), aligned_ptr, *data_len as usize) };
                }

                // Store both origin and aligned pointers for complete_async()
                // to process them.
                self.aligned_operations.push(AlignedOperation {
                    origin_ptr: origin_ptr.as_ptr() as u64,
                    aligned_ptr: aligned_ptr as u64,
                    size: *data_len as usize,
                    layout,
                });

                aligned_ptr as *mut libc::c_void
            } else {
                origin_ptr.as_ptr() as *mut libc::c_void
            };

            let iovec = libc::iovec {
                iov_base,
                iov_len: *data_len as libc::size_t,
            };
            iovecs.push(iovec);
        }

        // Queue operations expected to be submitted.
        match request_type {
            RequestType::In => {
                for (data_addr, data_len) in &self.data_descriptors {
                    mem.get_slice(*data_addr, *data_len as usize)
                        .map_err(ExecuteError::GetHostAddress)?
                        .bitmap()
                        .mark_dirty(0, *data_len as usize);
                }
                disk_image
                    .read_vectored(offset, &iovecs, user_data)
                    .map_err(ExecuteError::AsyncRead)?;
            }
            RequestType::Out => {
                disk_image
                    .write_vectored(offset, &iovecs, user_data)
                    .map_err(ExecuteError::AsyncWrite)?;
            }
            RequestType::Flush => {
                disk_image
                    .fsync(Some(user_data))
                    .map_err(ExecuteError::AsyncFlush)?;
            }
            RequestType::GetDeviceId => {
                let (data_addr, data_len) = if self.data_descriptors.len() == 1 {
                    (self.data_descriptors[0].0, self.data_descriptors[0].1)
                } else {
                    return Err(ExecuteError::BadRequest(Error::TooManyDescriptors));
                };
                if (data_len as usize) < serial.len() {
                    return Err(ExecuteError::BadRequest(Error::InvalidOffset));
                }
                mem.write_slice(serial, data_addr)
                    .map_err(ExecuteError::Write)?;
                return Ok(false);
            }
            RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
        }

        Ok(true)
    }
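
    // A rough sketch of how execute_async() and complete_async() pair up.
    // Everything around them (the AsyncIo backend, the completion tag, and
    // the way completions are drained) belongs to the caller and is only
    // assumed here:
    //
    //     let submitted = request.execute_async(&mem, nsectors, async_disk, &serial, tag)?;
    //     if submitted {
    //         // ... later, once the backend reports completion for `tag` ...
    //         request.complete_async()?; // copy bounce buffers back and free them
    //         // then write the status byte at request.status_addr
    //     } else {
    //         // the request (e.g. GetDeviceId) already completed synchronously
    //     }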

    pub fn complete_async(&mut self) -> result::Result<(), Error> {
        for aligned_operation in self.aligned_operations.drain(..) {
            // We need to perform the copy after the data has been read inside
            // the aligned buffer in case we're reading data in.
            if self.request_type == RequestType::In {
                // SAFETY: origin buffer has been allocated with the
                // proper size.
                unsafe {
                    std::ptr::copy(
                        aligned_operation.aligned_ptr as *const u8,
                        aligned_operation.origin_ptr as *mut u8,
                        aligned_operation.size,
                    )
                };
            }

            // Free the temporary aligned buffer.
            // SAFETY: aligned_ptr was allocated by alloc_zeroed with the same
            // layout
            unsafe {
                dealloc(
                    aligned_operation.aligned_ptr as *mut u8,
                    aligned_operation.layout,
                )
            };
        }

        Ok(())
    }

    pub fn set_writeback(&mut self, writeback: bool) {
        self.writeback = writeback
    }
}

#[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
#[repr(C, packed)]
pub struct VirtioBlockConfig {
    pub capacity: u64,
    pub size_max: u32,
    pub seg_max: u32,
    pub geometry: VirtioBlockGeometry,
    pub blk_size: u32,
    pub physical_block_exp: u8,
    pub alignment_offset: u8,
    pub min_io_size: u16,
    pub opt_io_size: u32,
    pub writeback: u8,
    pub unused: u8,
    pub num_queues: u16,
    pub max_discard_sectors: u32,
    pub max_discard_seg: u32,
    pub discard_sector_alignment: u32,
    pub max_write_zeroes_sectors: u32,
    pub max_write_zeroes_seg: u32,
    pub write_zeroes_may_unmap: u8,
    pub unused1: [u8; 3],
}

#[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
#[repr(C, packed)]
pub struct VirtioBlockGeometry {
    pub cylinders: u16,
    pub heads: u8,
    pub sectors: u8,
}

// SAFETY: data structure only contains a series of integers
unsafe impl ByteValued for VirtioBlockConfig {}
// SAFETY: data structure only contains a series of integers
unsafe impl ByteValued for VirtioBlockGeometry {}
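
// A hedged sketch of how a device implementation might fill in this config
// space for a disk of `disk_size` bytes (the variable names below are
// assumptions, not part of this crate):
//
//     let config = VirtioBlockConfig {
//         capacity: disk_size / SECTOR_SIZE, // always expressed in 512-byte sectors
//         blk_size: topology.logical_block_size as u32,
//         writeback: 1, // cache mode; the guest may toggle it at runtime
//         num_queues: 1,
//         ..Default::default()
//     };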

/// Check if aio can be used on the current system.
pub fn block_aio_is_supported() -> bool {
    aio::IoContext::new(1).is_ok()
}

/// Check if io_uring can be used for block devices on the current system,
/// i.e. whether the kernel supports the io_uring features we rely on.
pub fn block_io_uring_is_supported() -> bool {
    #[cfg(not(feature = "io_uring"))]
    {
        info!("io_uring is disabled by crate features");
        false
    }

    #[cfg(feature = "io_uring")]
    {
        let error_msg = "io_uring not supported:";

        // Check we can create an io_uring instance, which effectively verifies
        // that io_uring_setup() syscall is supported.
        let io_uring = match IoUring::new(1) {
            Ok(io_uring) => io_uring,
            Err(e) => {
                info!("{} failed to create io_uring instance: {}", error_msg, e);
                return false;
            }
        };

        let submitter = io_uring.submitter();

        let mut probe = Probe::new();

        // Check we can register a probe to validate supported operations.
        match submitter.register_probe(&mut probe) {
            Ok(_) => {}
            Err(e) => {
                info!("{} failed to register a probe: {}", error_msg, e);
                return false;
            }
        }

        // Check IORING_OP_FSYNC is supported
        if !probe.is_supported(opcode::Fsync::CODE) {
            info!("{} IORING_OP_FSYNC operation not supported", error_msg);
            return false;
        }

        // Check IORING_OP_READV is supported
        if !probe.is_supported(opcode::Readv::CODE) {
            info!("{} IORING_OP_READV operation not supported", error_msg);
            return false;
        }

        // Check IORING_OP_WRITEV is supported
        if !probe.is_supported(opcode::Writev::CODE) {
            info!("{} IORING_OP_WRITEV operation not supported", error_msg);
            return false;
        }

        true
    }
}

pub trait AsyncAdaptor<F>
where
    F: Read + Write + Seek,
{
    fn read_vectored_sync(
        &mut self,
        offset: libc::off_t,
        iovecs: &[libc::iovec],
        user_data: u64,
        eventfd: &EventFd,
        completion_list: &mut VecDeque<(u64, i32)>,
    ) -> AsyncIoResult<()> {
        // Convert libc::iovec into IoSliceMut
        let mut slices: SmallVec<[IoSliceMut; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
            SmallVec::with_capacity(iovecs.len());
        for iovec in iovecs.iter() {
            // SAFETY: on Linux IoSliceMut wraps around libc::iovec
            slices.push(IoSliceMut::new(unsafe {
                std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
            }));
        }

        let result = {
            let mut file = self.file();

            // Move the cursor to the right offset
            file.seek(SeekFrom::Start(offset as u64))
                .map_err(AsyncIoError::ReadVectored)?;

            let mut r = 0;
            for b in slices.iter_mut() {
                r += file.read(b).map_err(AsyncIoError::ReadVectored)?;
            }
            r
        };

        completion_list.push_back((user_data, result as i32));
        eventfd.write(1).unwrap();

        Ok(())
    }

    fn write_vectored_sync(
        &mut self,
        offset: libc::off_t,
        iovecs: &[libc::iovec],
        user_data: u64,
        eventfd: &EventFd,
        completion_list: &mut VecDeque<(u64, i32)>,
    ) -> AsyncIoResult<()> {
        // Convert libc::iovec into IoSlice
        let mut slices: SmallVec<[IoSlice; DEFAULT_DESCRIPTOR_VEC_SIZE]> =
            SmallVec::with_capacity(iovecs.len());
        for iovec in iovecs.iter() {
            // SAFETY: on Linux IoSlice wraps around libc::iovec
            slices.push(IoSlice::new(unsafe {
                std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
            }));
        }

        let result = {
            let mut file = self.file();

            // Move the cursor to the right offset
            file.seek(SeekFrom::Start(offset as u64))
                .map_err(AsyncIoError::WriteVectored)?;

            let mut r = 0;
            for b in slices.iter() {
                r += file.write(b).map_err(AsyncIoError::WriteVectored)?;
            }
            r
        };

        completion_list.push_back((user_data, result as i32));
        eventfd.write(1).unwrap();

        Ok(())
    }

    fn fsync_sync(
        &mut self,
        user_data: Option<u64>,
        eventfd: &EventFd,
        completion_list: &mut VecDeque<(u64, i32)>,
    ) -> AsyncIoResult<()> {
        let result: i32 = {
            let mut file = self.file();

            // Flush
            file.flush().map_err(AsyncIoError::Fsync)?;

            0
        };

        if let Some(user_data) = user_data {
            completion_list.push_back((user_data, result));
            eventfd.write(1).unwrap();
        }

        Ok(())
    }

    fn file(&mut self) -> MutexGuard<F>;
}
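
// A hedged sketch of implementing AsyncAdaptor for a synchronous, file-backed
// type (the struct below is hypothetical and not part of this crate):
//
//     struct FileAdaptor {
//         file: Arc<Mutex<File>>,
//     }
//
//     impl AsyncAdaptor<File> for FileAdaptor {
//         fn file(&mut self) -> MutexGuard<File> {
//             self.file.lock().unwrap()
//         }
//     }
//
// The provided *_sync methods then emulate async completion by pushing
// (user_data, result) onto the completion list and signalling the eventfd.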

pub enum ImageType {
    FixedVhd,
    Qcow2,
    Raw,
    Vhdx,
}

const QCOW_MAGIC: u32 = 0x5146_49fb;
const VHDX_SIGN: u64 = 0x656C_6966_7864_6876;

/// Read a block into memory aligned by the source block size (needed for O_DIRECT)
pub fn read_aligned_block_size(f: &mut File) -> std::io::Result<Vec<u8>> {
    let blocksize = DiskTopology::probe(f)?.logical_block_size as usize;
    // SAFETY: We are allocating memory that is naturally aligned (size = alignment) and we meet
    // the requirements for safety from Vec::from_raw_parts() as we are using the global allocator
    // and transferring ownership of the memory.
    let mut data = unsafe {
        Vec::from_raw_parts(
            alloc_zeroed(Layout::from_size_align_unchecked(blocksize, blocksize)),
            blocksize,
            blocksize,
        )
    };
    f.read_exact(&mut data)?;
    Ok(data)
}

/// Determine image type through file parsing.
pub fn detect_image_type(f: &mut File) -> std::io::Result<ImageType> {
    let block = read_aligned_block_size(f)?;

    // Check the first 4 bytes to get the header value and determine the image type
    let image_type = if u32::from_be_bytes(block[0..4].try_into().unwrap()) == QCOW_MAGIC {
        ImageType::Qcow2
    } else if vhd::is_fixed_vhd(f)? {
        ImageType::FixedVhd
    } else if u64::from_le_bytes(block[0..8].try_into().unwrap()) == VHDX_SIGN {
        ImageType::Vhdx
    } else {
        ImageType::Raw
    };

    Ok(image_type)
}
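
// A short usage sketch (the path is hypothetical): open the disk image and
// branch on the detected format before wrapping it in the matching backend.
//
//     let mut file = File::open("/var/lib/images/disk.img")?;
//     match detect_image_type(&mut file)? {
//         ImageType::Qcow2 => { /* use the qcow backend */ }
//         ImageType::FixedVhd => { /* use the fixed_vhd backend */ }
//         ImageType::Vhdx => { /* use the vhdx backend */ }
//         ImageType::Raw => { /* use the raw backend */ }
//     }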

pub trait BlockBackend: Read + Write + Seek + Send + Debug {
    fn size(&self) -> Result<u64, Error>;
}

#[derive(Debug)]
pub struct DiskTopology {
    pub logical_block_size: u64,
    pub physical_block_size: u64,
    pub minimum_io_size: u64,
    pub optimal_io_size: u64,
}

impl Default for DiskTopology {
    fn default() -> Self {
        Self {
            logical_block_size: 512,
            physical_block_size: 512,
            minimum_io_size: 512,
            optimal_io_size: 0,
        }
    }
}

ioctl_io_nr!(BLKSSZGET, 0x12, 104);
ioctl_io_nr!(BLKPBSZGET, 0x12, 123);
ioctl_io_nr!(BLKIOMIN, 0x12, 120);
ioctl_io_nr!(BLKIOOPT, 0x12, 121);

enum BlockSize {
    LogicalBlock,
    PhysicalBlock,
    MinimumIo,
    OptimalIo,
}

impl DiskTopology {
    fn is_block_device(f: &File) -> std::io::Result<bool> {
        let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
        // SAFETY: FFI call with a valid fd and buffer
        let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
        if ret != 0 {
            return Err(std::io::Error::last_os_error());
        }

        // SAFETY: stat is valid at this point
        let is_block = unsafe { (*stat.as_ptr()).st_mode & S_IFMT == S_IFBLK };
        Ok(is_block)
    }

    // libc::ioctl() takes different types on different architectures
    fn query_block_size(f: &File, block_size_type: BlockSize) -> std::io::Result<u64> {
        let mut block_size = 0;
        // SAFETY: FFI call with correct arguments
        let ret = unsafe {
            ioctl(
                f.as_raw_fd(),
                match block_size_type {
                    BlockSize::LogicalBlock => BLKSSZGET(),
                    BlockSize::PhysicalBlock => BLKPBSZGET(),
                    BlockSize::MinimumIo => BLKIOMIN(),
                    BlockSize::OptimalIo => BLKIOOPT(),
                } as _,
                &mut block_size,
            )
        };
        if ret != 0 {
            return Err(std::io::Error::last_os_error());
        };

        Ok(block_size)
    }

    pub fn probe(f: &File) -> std::io::Result<Self> {
        if !Self::is_block_device(f)? {
            return Ok(DiskTopology::default());
        }

        Ok(DiskTopology {
            logical_block_size: Self::query_block_size(f, BlockSize::LogicalBlock)?,
            physical_block_size: Self::query_block_size(f, BlockSize::PhysicalBlock)?,
            minimum_io_size: Self::query_block_size(f, BlockSize::MinimumIo)?,
            optimal_io_size: Self::query_block_size(f, BlockSize::OptimalIo)?,
        })
    }
}
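
// Editor's sketch: a couple of minimal checks over the pure helpers in this
// module. They only exercise logic that touches neither guest memory nor the
// disk, so they should hold regardless of the backend in use.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn build_serial_has_fixed_length() {
        // A path that does not exist makes build_device_id() fail, so the
        // zeroed default serial of VIRTIO_BLK_ID_BYTES bytes is returned.
        let serial = build_serial(Path::new("/this/path/should/not/exist"));
        assert_eq!(serial.len(), VIRTIO_BLK_ID_BYTES as usize);
    }

    #[test]
    fn execute_error_statuses() {
        assert_eq!(
            ExecuteError::Unsupported(42).status(),
            VIRTIO_BLK_S_UNSUPP as u8
        );
        assert_eq!(
            ExecuteError::BadRequest(Error::InvalidOffset).status(),
            VIRTIO_BLK_S_IOERR as u8
        );
    }
}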