// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//
// Portions Copyright 2017 The Chromium OS Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE-BSD-3-Clause file.
//
// Copyright © 2020 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause

#[macro_use]
extern crate log;

pub mod async_io;
pub mod fixed_vhd;
#[cfg(feature = "io_uring")]
/// Enabled with the `"io_uring"` feature
pub mod fixed_vhd_async;
pub mod fixed_vhd_sync;
pub mod qcow;
pub mod qcow_sync;
#[cfg(feature = "io_uring")]
/// Async primitives based on `io-uring`
///
/// Enabled with the `"io_uring"` feature
pub mod raw_async;
pub mod raw_async_aio;
pub mod raw_sync;
pub mod vhd;
pub mod vhdx;
pub mod vhdx_sync;

use crate::async_io::{AsyncIo, AsyncIoError, AsyncIoResult};
use crate::fixed_vhd::FixedVhd;
use crate::qcow::{QcowFile, RawFile};
use crate::vhdx::{Vhdx, VhdxError};
#[cfg(feature = "io_uring")]
use io_uring::{opcode, IoUring, Probe};
use libc::{ioctl, S_IFBLK, S_IFMT};
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use std::alloc::{alloc_zeroed, dealloc, Layout};
use std::cmp;
use std::collections::VecDeque;
use std::fmt::Debug;
use std::fs::File;
use std::io::{self, IoSlice, IoSliceMut, Read, Seek, SeekFrom, Write};
use std::os::linux::fs::MetadataExt;
use std::os::unix::io::AsRawFd;
use std::path::Path;
use std::result;
use std::sync::Arc;
use std::sync::MutexGuard;
use std::time::Instant;
use thiserror::Error;
use virtio_bindings::virtio_blk::*;
use virtio_queue::DescriptorChain;
use vm_memory::{
    bitmap::Bitmap, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryError,
    GuestMemoryLoadGuard,
};
use vm_virtio::{AccessPlatform, Translatable};
use vmm_sys_util::aio;
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::{ioctl_io_nr, ioctl_ioc_nr};

const SECTOR_SHIFT: u8 = 9;
pub const SECTOR_SIZE: u64 = 0x01 << SECTOR_SHIFT;

#[derive(Error, Debug)]
pub enum Error {
    #[error("Guest gave us bad memory addresses")]
    GuestMemory(GuestMemoryError),
    #[error("Guest gave us offsets that would have overflowed a usize")]
    CheckedOffset(GuestAddress, usize),
    #[error("Guest gave us a write only descriptor that protocol says to read from")]
    UnexpectedWriteOnlyDescriptor,
    #[error("Guest gave us a read only descriptor that protocol says to write to")]
    UnexpectedReadOnlyDescriptor,
    #[error("Guest gave us too few descriptors in a descriptor chain")]
    DescriptorChainTooShort,
    #[error("Guest gave us a descriptor that was too short to use")]
    DescriptorLengthTooSmall,
    #[error("Failed to detect image type: {0}")]
    DetectImageType(std::io::Error),
    #[error("Failure in fixed vhd: {0}")]
    FixedVhdError(std::io::Error),
    #[error("Failed to get a block's metadata")]
    GetFileMetadata,
    #[error("The requested operation would cause a seek beyond disk end")]
    InvalidOffset,
    #[error("Failure in qcow: {0}")]
    QcowError(qcow::Error),
    #[error("Failure in raw file: {0}")]
    RawFileError(std::io::Error),
    #[error("The requested operation does not support multiple descriptors")]
    TooManyDescriptors,
    #[error("Failure in vhdx: {0}")]
    VhdxError(VhdxError),
}

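// Illustrative note (values below are hypothetical): the device id built here
// is simply the concatenation of st_dev, st_rdev and st_ino, so a disk file
// with st_dev = 2049, st_rdev = 0 and st_ino = 42 would yield "2049042",
// which build_serial() then truncates or zero-pads to VIRTIO_BLK_ID_BYTES.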
fn build_device_id(disk_path: &Path) -> result::Result<String, Error> {
    let blk_metadata = match disk_path.metadata() {
        Err(_) => return Err(Error::GetFileMetadata),
        Ok(m) => m,
    };
    // This is how kvmtool does it.
    let device_id = format!(
        "{}{}{}",
        blk_metadata.st_dev(),
        blk_metadata.st_rdev(),
        blk_metadata.st_ino()
    );
    Ok(device_id)
}

pub fn build_serial(disk_path: &Path) -> Vec<u8> {
    let mut default_serial = vec![0; VIRTIO_BLK_ID_BYTES as usize];
    match build_device_id(disk_path) {
        Err(_) => {
            warn!("Could not generate device id. We'll use a default.");
        }
        Ok(m) => {
            // The kernel only knows to read a maximum of VIRTIO_BLK_ID_BYTES.
            // This will also zero out any leftover bytes.
            let disk_id = m.as_bytes();
            let bytes_to_copy = cmp::min(disk_id.len(), VIRTIO_BLK_ID_BYTES as usize);
            default_serial[..bytes_to_copy].clone_from_slice(&disk_id[..bytes_to_copy])
        }
    }
    default_serial
}

#[derive(Error, Debug)]
pub enum ExecuteError {
    #[error("Bad request: {0}")]
    BadRequest(Error),
    #[error("Failed to flush: {0}")]
    Flush(io::Error),
    #[error("Failed to read: {0}")]
    Read(GuestMemoryError),
    #[error("Failed to read_exact: {0}")]
    ReadExact(io::Error),
    #[error("Failed to seek: {0}")]
    Seek(io::Error),
    #[error("Failed to write: {0}")]
    Write(GuestMemoryError),
    #[error("Failed to write_all: {0}")]
    WriteAll(io::Error),
    #[error("Unsupported request: {0}")]
    Unsupported(u32),
    #[error("Failed to submit io uring: {0}")]
    SubmitIoUring(io::Error),
    #[error("Failed to get guest address: {0}")]
    GetHostAddress(GuestMemoryError),
    #[error("Failed to async read: {0}")]
    AsyncRead(AsyncIoError),
    #[error("Failed to async write: {0}")]
    AsyncWrite(AsyncIoError),
    #[error("Failed to async flush: {0}")]
    AsyncFlush(AsyncIoError),
    #[error("Failed allocating a temporary buffer: {0}")]
    TemporaryBufferAllocation(io::Error),
}

impl ExecuteError {
    pub fn status(&self) -> u8 {
        let status = match *self {
            ExecuteError::BadRequest(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Flush(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Read(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::ReadExact(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Seek(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Write(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::WriteAll(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::Unsupported(_) => VIRTIO_BLK_S_UNSUPP,
            ExecuteError::SubmitIoUring(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::GetHostAddress(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::AsyncRead(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::AsyncWrite(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::AsyncFlush(_) => VIRTIO_BLK_S_IOERR,
            ExecuteError::TemporaryBufferAllocation(_) => VIRTIO_BLK_S_IOERR,
        };
        status as u8
    }
}

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum RequestType {
    In,
    Out,
    Flush,
    GetDeviceId,
    Unsupported(u32),
}

pub fn request_type<B: Bitmap + 'static>(
    mem: &vm_memory::GuestMemoryMmap<B>,
    desc_addr: GuestAddress,
) -> result::Result<RequestType, Error> {
    let type_ = mem.read_obj(desc_addr).map_err(Error::GuestMemory)?;
    match type_ {
        VIRTIO_BLK_T_IN => Ok(RequestType::In),
        VIRTIO_BLK_T_OUT => Ok(RequestType::Out),
        VIRTIO_BLK_T_FLUSH => Ok(RequestType::Flush),
        VIRTIO_BLK_T_GET_ID => Ok(RequestType::GetDeviceId),
        t => Ok(RequestType::Unsupported(t)),
    }
}

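// For reference, the virtio-blk request header that request_type() and
// sector() read from the head descriptor is laid out (per the virtio spec) as:
//
//     struct virtio_blk_req_header {
//         type_:    u32, // offset 0: one of the VIRTIO_BLK_T_* values
//         reserved: u32, // offset 4
//         sector:   u64, // offset 8: offset on disk, in 512-byte sectors
//     }
//
// which is why sector() reads a u64 at SECTOR_OFFSET = 8.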
fn sector<B: Bitmap + 'static>(
    mem: &vm_memory::GuestMemoryMmap<B>,
    desc_addr: GuestAddress,
) -> result::Result<u64, Error> {
    const SECTOR_OFFSET: usize = 8;
    let addr = match mem.checked_offset(desc_addr, SECTOR_OFFSET) {
        Some(v) => v,
        None => return Err(Error::CheckedOffset(desc_addr, SECTOR_OFFSET)),
    };

    mem.read_obj(addr).map_err(Error::GuestMemory)
}

#[derive(Debug)]
pub struct AlignedOperation {
    origin_ptr: u64,
    aligned_ptr: u64,
    size: usize,
    layout: Layout,
}

#[derive(Debug)]
pub struct Request {
    pub request_type: RequestType,
    pub sector: u64,
    pub data_descriptors: SmallVec<[(GuestAddress, u32); 1]>,
    pub status_addr: GuestAddress,
    pub writeback: bool,
    pub aligned_operations: SmallVec<[AlignedOperation; 1]>,
    pub start: Instant,
}

impl Request {
    pub fn parse<B: Bitmap + 'static>(
        desc_chain: &mut DescriptorChain<GuestMemoryLoadGuard<vm_memory::GuestMemoryMmap<B>>>,
        access_platform: Option<&Arc<dyn AccessPlatform>>,
    ) -> result::Result<Request, Error> {
        let hdr_desc = desc_chain
            .next()
            .ok_or(Error::DescriptorChainTooShort)
            .map_err(|e| {
                error!("Missing head descriptor");
                e
            })?;

        // The head contains the request type which MUST be readable.
        if hdr_desc.is_write_only() {
            return Err(Error::UnexpectedWriteOnlyDescriptor);
        }

        let hdr_desc_addr = hdr_desc
            .addr()
            .translate_gva(access_platform, hdr_desc.len() as usize);

        let mut req = Request {
            request_type: request_type(desc_chain.memory(), hdr_desc_addr)?,
            sector: sector(desc_chain.memory(), hdr_desc_addr)?,
            data_descriptors: SmallVec::with_capacity(1),
            status_addr: GuestAddress(0),
            writeback: true,
            aligned_operations: SmallVec::with_capacity(1),
            start: Instant::now(),
        };

        let status_desc;
        let mut desc = desc_chain
            .next()
            .ok_or(Error::DescriptorChainTooShort)
            .map_err(|e| {
                error!("Only head descriptor present: request = {:?}", req);
                e
            })?;

        if !desc.has_next() {
            status_desc = desc;
            // Only flush requests are allowed to skip the data descriptor.
            if req.request_type != RequestType::Flush {
                error!("Need a data descriptor: request = {:?}", req);
                return Err(Error::DescriptorChainTooShort);
            }
        } else {
            req.data_descriptors.reserve_exact(1);
            while desc.has_next() {
                if desc.is_write_only() && req.request_type == RequestType::Out {
                    return Err(Error::UnexpectedWriteOnlyDescriptor);
                }
                if !desc.is_write_only() && req.request_type == RequestType::In {
                    return Err(Error::UnexpectedReadOnlyDescriptor);
                }
                if !desc.is_write_only() && req.request_type == RequestType::GetDeviceId {
                    return Err(Error::UnexpectedReadOnlyDescriptor);
                }

                req.data_descriptors.push((
                    desc.addr()
                        .translate_gva(access_platform, desc.len() as usize),
                    desc.len(),
                ));
                desc = desc_chain
                    .next()
                    .ok_or(Error::DescriptorChainTooShort)
                    .map_err(|e| {
                        error!("DescriptorChain corrupted: request = {:?}", req);
                        e
                    })?;
            }
            status_desc = desc;
        }

        // The status MUST always be writable.
        if !status_desc.is_write_only() {
            return Err(Error::UnexpectedReadOnlyDescriptor);
        }

        if status_desc.len() < 1 {
            return Err(Error::DescriptorLengthTooSmall);
        }

        req.status_addr = status_desc
            .addr()
            .translate_gva(access_platform, status_desc.len() as usize);

        Ok(req)
    }

    pub fn execute<T: Seek + Read + Write, B: Bitmap + 'static>(
        &self,
        disk: &mut T,
        disk_nsectors: u64,
        mem: &vm_memory::GuestMemoryMmap<B>,
        serial: &[u8],
    ) -> result::Result<u32, ExecuteError> {
        disk.seek(SeekFrom::Start(self.sector << SECTOR_SHIFT))
            .map_err(ExecuteError::Seek)?;
        let mut len = 0;
        for (data_addr, data_len) in &self.data_descriptors {
            let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
            if u64::from(*data_len) % SECTOR_SIZE != 0 {
                top += 1;
            }
            top = top
                .checked_add(self.sector)
                .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
            if top > disk_nsectors {
                return Err(ExecuteError::BadRequest(Error::InvalidOffset));
            }

            match self.request_type {
                RequestType::In => {
                    let mut buf = vec![0u8; *data_len as usize];
                    disk.read_exact(&mut buf).map_err(ExecuteError::ReadExact)?;
                    mem.read_exact_volatile_from(
                        *data_addr,
                        &mut buf.as_slice(),
                        *data_len as usize,
                    )
                    .map_err(ExecuteError::Read)?;
                    len += data_len;
                }
                RequestType::Out => {
                    let mut buf: Vec<u8> = Vec::new();
                    mem.write_all_volatile_to(*data_addr, &mut buf, *data_len as usize)
                        .map_err(ExecuteError::Write)?;
                    disk.write_all(&buf).map_err(ExecuteError::WriteAll)?;
                    if !self.writeback {
                        disk.flush().map_err(ExecuteError::Flush)?;
                    }
                }
                RequestType::Flush => disk.flush().map_err(ExecuteError::Flush)?,
                RequestType::GetDeviceId => {
                    if (*data_len as usize) < serial.len() {
                        return Err(ExecuteError::BadRequest(Error::InvalidOffset));
                    }
                    mem.write_slice(serial, *data_addr)
                        .map_err(ExecuteError::Write)?;
                }
                RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
            };
        }
        Ok(len)
    }

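    /// Submits the request to the asynchronous `disk_image` backend.
    ///
    /// Returns `Ok(true)` when an operation was queued and its completion
    /// will be reported later with the given `user_data` tag, and `Ok(false)`
    /// when the request was fully handled inline (currently only
    /// `GetDeviceId`).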
    pub fn execute_async<B: Bitmap + 'static>(
        &mut self,
        mem: &vm_memory::GuestMemoryMmap<B>,
        disk_nsectors: u64,
        disk_image: &mut dyn AsyncIo,
        serial: &[u8],
        user_data: u64,
    ) -> result::Result<bool, ExecuteError> {
        let sector = self.sector;
        let request_type = self.request_type;
        let offset = (sector << SECTOR_SHIFT) as libc::off_t;

        let mut iovecs: SmallVec<[libc::iovec; 1]> =
            SmallVec::with_capacity(self.data_descriptors.len());
        for (data_addr, data_len) in &self.data_descriptors {
            if *data_len == 0 {
                continue;
            }
            let mut top: u64 = u64::from(*data_len) / SECTOR_SIZE;
            if u64::from(*data_len) % SECTOR_SIZE != 0 {
                top += 1;
            }
            top = top
                .checked_add(sector)
                .ok_or(ExecuteError::BadRequest(Error::InvalidOffset))?;
            if top > disk_nsectors {
                return Err(ExecuteError::BadRequest(Error::InvalidOffset));
            }

            let origin_ptr = mem
                .get_slice(*data_addr, *data_len as usize)
                .map_err(ExecuteError::GetHostAddress)?
                .ptr_guard();

            // Verify the buffer alignment.
            // In case it's not properly aligned, an intermediate buffer is
            // created with the correct alignment, and a copy from/to the
            // origin buffer is performed, depending on the type of operation.
            let iov_base = if (origin_ptr.as_ptr() as u64) % SECTOR_SIZE != 0 {
                let layout =
                    Layout::from_size_align(*data_len as usize, SECTOR_SIZE as usize).unwrap();
                // SAFETY: layout has non-zero size
                let aligned_ptr = unsafe { alloc_zeroed(layout) };
                if aligned_ptr.is_null() {
                    return Err(ExecuteError::TemporaryBufferAllocation(
                        io::Error::last_os_error(),
                    ));
                }

                // We need to perform the copy beforehand in case we're writing
                // data out.
                if request_type == RequestType::Out {
                    // SAFETY: destination buffer has been allocated with
                    // the proper size.
                    unsafe { std::ptr::copy(origin_ptr.as_ptr(), aligned_ptr, *data_len as usize) };
                }

                // Store both origin and aligned pointers for complete_async()
                // to process them.
                self.aligned_operations.push(AlignedOperation {
                    origin_ptr: origin_ptr.as_ptr() as u64,
                    aligned_ptr: aligned_ptr as u64,
                    size: *data_len as usize,
                    layout,
                });

                aligned_ptr as *mut libc::c_void
            } else {
                origin_ptr.as_ptr() as *mut libc::c_void
            };

            let iovec = libc::iovec {
                iov_base,
                iov_len: *data_len as libc::size_t,
            };
            iovecs.push(iovec);
        }

        // Queue operations expected to be submitted.
        match request_type {
            RequestType::In => {
                for (data_addr, data_len) in &self.data_descriptors {
                    mem.get_slice(*data_addr, *data_len as usize)
                        .map_err(ExecuteError::GetHostAddress)?
                        .bitmap()
                        .mark_dirty(0, *data_len as usize);
                }
                disk_image
                    .read_vectored(offset, &iovecs, user_data)
                    .map_err(ExecuteError::AsyncRead)?;
            }
            RequestType::Out => {
                disk_image
                    .write_vectored(offset, &iovecs, user_data)
                    .map_err(ExecuteError::AsyncWrite)?;
            }
            RequestType::Flush => {
                disk_image
                    .fsync(Some(user_data))
                    .map_err(ExecuteError::AsyncFlush)?;
            }
            RequestType::GetDeviceId => {
                let (data_addr, data_len) = if self.data_descriptors.len() == 1 {
                    (self.data_descriptors[0].0, self.data_descriptors[0].1)
                } else {
                    return Err(ExecuteError::BadRequest(Error::TooManyDescriptors));
                };
                if (data_len as usize) < serial.len() {
                    return Err(ExecuteError::BadRequest(Error::InvalidOffset));
                }
                mem.write_slice(serial, data_addr)
                    .map_err(ExecuteError::Write)?;
                return Ok(false);
            }
            RequestType::Unsupported(t) => return Err(ExecuteError::Unsupported(t)),
        }

        Ok(true)
    }

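    /// Completes an asynchronous request once the backend has reported it:
    /// for reads, copies the data from any temporary aligned buffer back into
    /// the guest buffer, then frees the temporary allocations recorded in
    /// `aligned_operations`.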
    pub fn complete_async(&mut self) -> result::Result<(), Error> {
        for aligned_operation in self.aligned_operations.drain(..) {
            // We need to perform the copy after the data has been read inside
            // the aligned buffer in case we're reading data in.
            if self.request_type == RequestType::In {
                // SAFETY: origin buffer has been allocated with the
                // proper size.
                unsafe {
                    std::ptr::copy(
                        aligned_operation.aligned_ptr as *const u8,
                        aligned_operation.origin_ptr as *mut u8,
                        aligned_operation.size,
                    )
                };
            }

            // Free the temporary aligned buffer.
            // SAFETY: aligned_ptr was allocated by alloc_zeroed with the same
            // layout
            unsafe {
                dealloc(
                    aligned_operation.aligned_ptr as *mut u8,
                    aligned_operation.layout,
                )
            };
        }

        Ok(())
    }

    pub fn set_writeback(&mut self, writeback: bool) {
        self.writeback = writeback
    }
}

#[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
#[repr(C, packed)]
pub struct VirtioBlockConfig {
    pub capacity: u64,
    pub size_max: u32,
    pub seg_max: u32,
    pub geometry: VirtioBlockGeometry,
    pub blk_size: u32,
    pub physical_block_exp: u8,
    pub alignment_offset: u8,
    pub min_io_size: u16,
    pub opt_io_size: u32,
    pub writeback: u8,
    pub unused: u8,
    pub num_queues: u16,
    pub max_discard_sectors: u32,
    pub max_discard_seg: u32,
    pub discard_sector_alignment: u32,
    pub max_write_zeroes_sectors: u32,
    pub max_write_zeroes_seg: u32,
    pub write_zeroes_may_unmap: u8,
    pub unused1: [u8; 3],
}

#[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)]
#[repr(C, packed)]
pub struct VirtioBlockGeometry {
    pub cylinders: u16,
    pub heads: u8,
    pub sectors: u8,
}

// SAFETY: data structure only contains a series of integers
unsafe impl ByteValued for VirtioBlockConfig {}
// SAFETY: data structure only contains a series of integers
unsafe impl ByteValued for VirtioBlockGeometry {}

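// Illustrative sketch (hypothetical values, not part of this crate): since
// VirtioBlockConfig is ByteValued, a device model can expose its config space
// as raw bytes, with `capacity` expressed in 512-byte sectors:
//
//     let config = VirtioBlockConfig {
//         capacity: disk_size >> SECTOR_SHIFT,
//         ..Default::default()
//     };
//     let config_bytes: &[u8] = config.as_slice();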
/// Check if aio can be used on the current system.
pub fn block_aio_is_supported() -> bool {
    aio::IoContext::new(1).is_ok()
}

/// Check if io_uring can be used for block devices on the current system,
/// i.e. whether it supports the io_uring features we rely on.
pub fn block_io_uring_is_supported() -> bool {
    #[cfg(not(feature = "io_uring"))]
    {
        info!("io_uring is disabled by crate features");
        false
    }

    #[cfg(feature = "io_uring")]
    {
        let error_msg = "io_uring not supported:";

        // Check we can create an io_uring instance, which effectively verifies
        // that io_uring_setup() syscall is supported.
        let io_uring = match IoUring::new(1) {
            Ok(io_uring) => io_uring,
            Err(e) => {
                info!("{} failed to create io_uring instance: {}", error_msg, e);
                return false;
            }
        };

        let submitter = io_uring.submitter();

        let mut probe = Probe::new();

        // Check we can register a probe to validate supported operations.
        match submitter.register_probe(&mut probe) {
            Ok(_) => {}
            Err(e) => {
                info!("{} failed to register a probe: {}", error_msg, e);
                return false;
            }
        }

        // Check IORING_OP_FSYNC is supported
        if !probe.is_supported(opcode::Fsync::CODE) {
            info!("{} IORING_OP_FSYNC operation not supported", error_msg);
            return false;
        }

        // Check IORING_OP_READV is supported
        if !probe.is_supported(opcode::Readv::CODE) {
            info!("{} IORING_OP_READV operation not supported", error_msg);
            return false;
        }

        // Check IORING_OP_WRITEV is supported
        if !probe.is_supported(opcode::Writev::CODE) {
            info!("{} IORING_OP_WRITEV operation not supported", error_msg);
            return false;
        }

        true
    }
}

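/// Adaptor used by the synchronous file-based backends to emulate the
/// asynchronous `AsyncIo` interface: each operation is performed inline on
/// the file behind the mutex, then the result is pushed onto
/// `completion_list` and `eventfd` is signalled, so callers consume
/// completions the same way as with a truly asynchronous backend.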
pub trait AsyncAdaptor<F>
where
    F: Read + Write + Seek,
{
    fn read_vectored_sync(
        &mut self,
        offset: libc::off_t,
        iovecs: &[libc::iovec],
        user_data: u64,
        eventfd: &EventFd,
        completion_list: &mut VecDeque<(u64, i32)>,
    ) -> AsyncIoResult<()> {
        // Convert libc::iovec into IoSliceMut
        let mut slices: SmallVec<[IoSliceMut; 1]> = SmallVec::with_capacity(iovecs.len());
        for iovec in iovecs.iter() {
            // SAFETY: on Linux IoSliceMut wraps around libc::iovec
            slices.push(IoSliceMut::new(unsafe {
                std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
            }));
        }

        let result = {
            let mut file = self.file();

            // Move the cursor to the right offset
            file.seek(SeekFrom::Start(offset as u64))
                .map_err(AsyncIoError::ReadVectored)?;

            // Read vectored
            file.read_vectored(slices.as_mut_slice())
                .map_err(AsyncIoError::ReadVectored)?
        };

        completion_list.push_back((user_data, result as i32));
        eventfd.write(1).unwrap();

        Ok(())
    }

    fn write_vectored_sync(
        &mut self,
        offset: libc::off_t,
        iovecs: &[libc::iovec],
        user_data: u64,
        eventfd: &EventFd,
        completion_list: &mut VecDeque<(u64, i32)>,
    ) -> AsyncIoResult<()> {
        // Convert libc::iovec into IoSlice
        let mut slices: SmallVec<[IoSlice; 1]> = SmallVec::with_capacity(iovecs.len());
        for iovec in iovecs.iter() {
            // SAFETY: on Linux IoSlice wraps around libc::iovec
            slices.push(IoSlice::new(unsafe {
                std::mem::transmute::<libc::iovec, &mut [u8]>(*iovec)
            }));
        }

        let result = {
            let mut file = self.file();

            // Move the cursor to the right offset
            file.seek(SeekFrom::Start(offset as u64))
                .map_err(AsyncIoError::WriteVectored)?;

            // Write vectored
            file.write_vectored(slices.as_slice())
                .map_err(AsyncIoError::WriteVectored)?
        };

        completion_list.push_back((user_data, result as i32));
        eventfd.write(1).unwrap();

        Ok(())
    }

    fn fsync_sync(
        &mut self,
        user_data: Option<u64>,
        eventfd: &EventFd,
        completion_list: &mut VecDeque<(u64, i32)>,
    ) -> AsyncIoResult<()> {
        let result: i32 = {
            let mut file = self.file();

            // Flush
            file.flush().map_err(AsyncIoError::Fsync)?;

            0
        };

        if let Some(user_data) = user_data {
            completion_list.push_back((user_data, result));
            eventfd.write(1).unwrap();
        }

        Ok(())
    }

    fn file(&mut self) -> MutexGuard<F>;
}

pub enum ImageType {
    FixedVhd,
    Qcow2,
    Raw,
    Vhdx,
}

const QCOW_MAGIC: u32 = 0x5146_49fb;
const VHDX_SIGN: u64 = 0x656C_6966_7864_6876;

/// Read a block into memory aligned by the source block size (needed for O_DIRECT)
pub fn read_aligned_block_size(f: &mut File) -> std::io::Result<Vec<u8>> {
    let blocksize = DiskTopology::probe(f)?.logical_block_size as usize;
    // SAFETY: We are allocating memory that is naturally aligned (size = alignment) and we meet
    // requirements for safety from Vec::from_raw_parts() as we are using the global allocator
    // and transferring ownership of the memory.
    let mut data = unsafe {
        Vec::from_raw_parts(
            alloc_zeroed(Layout::from_size_align_unchecked(blocksize, blocksize)),
            blocksize,
            blocksize,
        )
    };
    f.read_exact(&mut data)?;
    Ok(data)
}

/// Determine image type through file parsing.
pub fn detect_image_type(f: &mut File) -> std::io::Result<ImageType> {
    let block = read_aligned_block_size(f)?;

    // Check the first 4 bytes to get the header value and determine the image type
    let image_type = if u32::from_be_bytes(block[0..4].try_into().unwrap()) == QCOW_MAGIC {
        ImageType::Qcow2
    } else if vhd::is_fixed_vhd(f)? {
        ImageType::FixedVhd
    } else if u64::from_le_bytes(block[0..8].try_into().unwrap()) == VHDX_SIGN {
        ImageType::Vhdx
    } else {
        ImageType::Raw
    };

    Ok(image_type)
}

pub trait BlockBackend: Read + Write + Seek + Send + Debug {
    fn size(&self) -> Result<u64, Error>;
}

/// Inspect the image file type and create an appropriate disk file to match it.
pub fn create_disk_file(mut file: File, direct_io: bool) -> Result<Box<dyn BlockBackend>, Error> {
    let image_type = detect_image_type(&mut file).map_err(Error::DetectImageType)?;

    Ok(match image_type {
        ImageType::Qcow2 => {
            Box::new(QcowFile::from(RawFile::new(file, direct_io)).map_err(Error::QcowError)?)
                as Box<dyn BlockBackend>
        }
        ImageType::FixedVhd => {
            Box::new(FixedVhd::new(file).map_err(Error::FixedVhdError)?) as Box<dyn BlockBackend>
        }
        ImageType::Vhdx => {
            Box::new(Vhdx::new(file).map_err(Error::VhdxError)?) as Box<dyn BlockBackend>
        }
        ImageType::Raw => Box::new(RawFile::new(file, direct_io)) as Box<dyn BlockBackend>,
    })
}

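// Illustrative usage sketch (hypothetical caller, not part of this crate):
//
//     let file = std::fs::OpenOptions::new().read(true).write(true).open("disk.img")?;
//     let mut image = create_disk_file(file, /* direct_io = */ false)?;
//     let nsectors = image.size()? >> SECTOR_SHIFT;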
#[derive(Debug)]
pub struct DiskTopology {
    pub logical_block_size: u64,
    pub physical_block_size: u64,
    pub minimum_io_size: u64,
    pub optimal_io_size: u64,
}

impl Default for DiskTopology {
    fn default() -> Self {
        Self {
            logical_block_size: 512,
            physical_block_size: 512,
            minimum_io_size: 512,
            optimal_io_size: 0,
        }
    }
}

ioctl_io_nr!(BLKSSZGET, 0x12, 104);
ioctl_io_nr!(BLKPBSZGET, 0x12, 123);
ioctl_io_nr!(BLKIOMIN, 0x12, 120);
ioctl_io_nr!(BLKIOOPT, 0x12, 121);

enum BlockSize {
    LogicalBlock,
    PhysicalBlock,
    MinimumIo,
    OptimalIo,
}

impl DiskTopology {
    fn is_block_device(f: &File) -> std::io::Result<bool> {
        let mut stat = std::mem::MaybeUninit::<libc::stat>::uninit();
        // SAFETY: FFI call with a valid fd and buffer
        let ret = unsafe { libc::fstat(f.as_raw_fd(), stat.as_mut_ptr()) };
        if ret != 0 {
            return Err(std::io::Error::last_os_error());
        }

        // SAFETY: stat is valid at this point
        let is_block = unsafe { (*stat.as_ptr()).st_mode & S_IFMT == S_IFBLK };
        Ok(is_block)
    }

    // libc::ioctl() takes different types on different architectures
    fn query_block_size(f: &File, block_size_type: BlockSize) -> std::io::Result<u64> {
        let mut block_size = 0;
        // SAFETY: FFI call with correct arguments
        let ret = unsafe {
            ioctl(
                f.as_raw_fd(),
                match block_size_type {
                    BlockSize::LogicalBlock => BLKSSZGET(),
                    BlockSize::PhysicalBlock => BLKPBSZGET(),
                    BlockSize::MinimumIo => BLKIOMIN(),
                    BlockSize::OptimalIo => BLKIOOPT(),
                } as _,
                &mut block_size,
            )
        };
        if ret != 0 {
            return Err(std::io::Error::last_os_error());
        }

        Ok(block_size)
    }

    pub fn probe(f: &File) -> std::io::Result<Self> {
        if !Self::is_block_device(f)? {
            return Ok(DiskTopology::default());
        }

        Ok(DiskTopology {
            logical_block_size: Self::query_block_size(f, BlockSize::LogicalBlock)?,
            physical_block_size: Self::query_block_size(f, BlockSize::PhysicalBlock)?,
            minimum_io_size: Self::query_block_size(f, BlockSize::MinimumIo)?,
            optimal_io_size: Self::query_block_size(f, BlockSize::OptimalIo)?,
        })
    }
}