// Copyright (c) 2020 Ant Financial
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::{
    seccomp_filters::Thread, thread_helper::spawn_virtio_thread, ActivateResult, EpollHelper,
    EpollHelperError, EpollHelperHandler, GuestMemoryMmap, VirtioCommon, VirtioDevice,
    VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, EPOLL_HELPER_EVENT_LAST,
    VIRTIO_F_VERSION_1,
};
use anyhow::anyhow;
use seccompiler::SeccompAction;
use std::io::{self, Write};
use std::mem::size_of;
use std::os::unix::io::AsRawFd;
use std::result;
use std::sync::{atomic::AtomicBool, Arc, Barrier};
use thiserror::Error;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use virtio_queue::{Queue, QueueT};
use vm_allocator::page_size::{align_page_size_down, get_page_size};
use vm_memory::{
    Address, ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryError, GuestMemoryRegion,
};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
};
use vmm_sys_util::eventfd::EventFd;

// Maximum size advertised for each of the mandatory queues (inflate and deflate).
const QUEUE_SIZE: u16 = 128;
// Maximum size advertised for the optional free page reporting queue.
const REPORTING_QUEUE_SIZE: u16 = 32;
// Number of queues the device always exposes: inflate (index 0) and deflate (index 1).
const MIN_NUM_QUEUES: usize = 2;

// Inflate virtio queue event.
const INFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1;
// Deflate virtio queue event.
49 const DEFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2; 50 // Reporting virtio queue event. 51 const REPORTING_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3; 52 53 // Size of a PFN in the balloon interface. 54 const VIRTIO_BALLOON_PFN_SHIFT: u64 = 12; 55 56 // Deflate balloon on OOM 57 const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u64 = 2; 58 // Enable an additional virtqueue to let the guest notify the host about free 59 // pages. 60 const VIRTIO_BALLOON_F_REPORTING: u64 = 5; 61 62 #[derive(Error, Debug)] 63 pub enum Error { 64 #[error("Guest gave us bad memory addresses.: {0}")] 65 GuestMemory(GuestMemoryError), 66 #[error("Guest gave us a write only descriptor that protocol says to read from")] 67 UnexpectedWriteOnlyDescriptor, 68 #[error("Guest sent us invalid request")] 69 InvalidRequest, 70 #[error("Fallocate fail.: {0}")] 71 FallocateFail(std::io::Error), 72 #[error("Madvise fail.: {0}")] 73 MadviseFail(std::io::Error), 74 #[error("Failed to EventFd write.: {0}")] 75 EventFdWriteFail(std::io::Error), 76 #[error("Invalid queue index: {0}")] 77 InvalidQueueIndex(usize), 78 #[error("Fail tp signal: {0}")] 79 FailedSignal(io::Error), 80 #[error("Descriptor chain is too short")] 81 DescriptorChainTooShort, 82 #[error("Failed adding used index: {0}")] 83 QueueAddUsed(virtio_queue::Error), 84 #[error("Failed creating an iterator over the queue: {0}")] 85 QueueIterator(virtio_queue::Error), 86 } 87 88 // Got from include/uapi/linux/virtio_balloon.h 89 #[repr(C)] 90 #[derive(Copy, Clone, Debug, Default, Versionize)] 91 pub struct VirtioBalloonConfig { 92 // Number of pages host wants Guest to give up. 93 num_pages: u32, 94 // Number of pages we've actually got in balloon. 
95 actual: u32, 96 } 97 98 #[derive(Clone, Debug)] 99 struct PartiallyBalloonedPage { 100 addr: u64, 101 bitmap: Vec<u64>, 102 page_size: u64, 103 } 104 105 impl PartiallyBalloonedPage { 106 fn new() -> Self { 107 let page_size = get_page_size(); 108 let len = ((page_size >> VIRTIO_BALLOON_PFN_SHIFT) + 63) / 64; 109 // Initial each padding bit as 1 in bitmap. 110 let mut bitmap = vec![0_u64; len as usize]; 111 let pad_num = len * 64 - (page_size >> VIRTIO_BALLOON_PFN_SHIFT); 112 bitmap[(len - 1) as usize] = !((1 << (64 - pad_num)) - 1); 113 Self { 114 addr: 0, 115 bitmap, 116 page_size, 117 } 118 } 119 120 fn pfn_match(&self, addr: u64) -> bool { 121 self.addr == addr & !(self.page_size - 1) 122 } 123 124 fn bitmap_full(&self) -> bool { 125 self.bitmap.iter().all(|b| *b == u64::MAX) 126 } 127 128 fn set_bit(&mut self, addr: u64) { 129 let addr_offset = (addr % self.page_size) >> VIRTIO_BALLOON_PFN_SHIFT; 130 self.bitmap[(addr_offset / 64) as usize] |= 1 << (addr_offset % 64); 131 } 132 133 fn reset(&mut self) { 134 let len = ((self.page_size >> VIRTIO_BALLOON_PFN_SHIFT) + 63) / 64; 135 self.addr = 0; 136 self.bitmap = vec![0; len as usize]; 137 let pad_num = len * 64 - (self.page_size >> VIRTIO_BALLOON_PFN_SHIFT); 138 self.bitmap[(len - 1) as usize] = !((1 << (64 - pad_num)) - 1); 139 } 140 } 141 142 const CONFIG_ACTUAL_OFFSET: u64 = 4; 143 const CONFIG_ACTUAL_SIZE: usize = 4; 144 145 // SAFETY: it only has data and has no implicit padding. 
146 unsafe impl ByteValued for VirtioBalloonConfig {} 147 148 struct BalloonEpollHandler { 149 mem: GuestMemoryAtomic<GuestMemoryMmap>, 150 queues: Vec<Queue>, 151 interrupt_cb: Arc<dyn VirtioInterrupt>, 152 inflate_queue_evt: EventFd, 153 deflate_queue_evt: EventFd, 154 reporting_queue_evt: Option<EventFd>, 155 kill_evt: EventFd, 156 pause_evt: EventFd, 157 pbp: Option<PartiallyBalloonedPage>, 158 } 159 160 impl BalloonEpollHandler { 161 fn signal(&self, int_type: VirtioInterruptType) -> result::Result<(), Error> { 162 self.interrupt_cb.trigger(int_type).map_err(|e| { 163 error!("Failed to signal used queue: {:?}", e); 164 Error::FailedSignal(e) 165 }) 166 } 167 168 fn advise_memory_range( 169 memory: &GuestMemoryMmap, 170 range_base: GuestAddress, 171 range_len: usize, 172 advice: libc::c_int, 173 ) -> result::Result<(), Error> { 174 let hva = memory 175 .get_host_address(range_base) 176 .map_err(Error::GuestMemory)?; 177 let res = 178 // SAFETY: Need unsafe to do syscall madvise 179 unsafe { libc::madvise(hva as *mut libc::c_void, range_len as libc::size_t, advice) }; 180 if res != 0 { 181 return Err(Error::MadviseFail(io::Error::last_os_error())); 182 } 183 Ok(()) 184 } 185 186 fn release_memory_range( 187 memory: &GuestMemoryMmap, 188 range_base: GuestAddress, 189 range_len: usize, 190 ) -> result::Result<(), Error> { 191 let region = memory.find_region(range_base).ok_or(Error::GuestMemory( 192 GuestMemoryError::InvalidGuestAddress(range_base), 193 ))?; 194 if let Some(f_off) = region.file_offset() { 195 let offset = range_base.0 - region.start_addr().0; 196 // SAFETY: FFI call with valid arguments 197 let res = unsafe { 198 libc::fallocate64( 199 f_off.file().as_raw_fd(), 200 libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, 201 (offset + f_off.start()) as libc::off64_t, 202 range_len as libc::off64_t, 203 ) 204 }; 205 206 if res != 0 { 207 return Err(Error::FallocateFail(io::Error::last_os_error())); 208 } 209 } 210 211 
Self::advise_memory_range(memory, range_base, range_len, libc::MADV_DONTNEED) 212 } 213 214 fn release_memory_range_4k( 215 pbp: &mut Option<PartiallyBalloonedPage>, 216 memory: &GuestMemoryMmap, 217 pfn: u32, 218 ) -> result::Result<(), Error> { 219 let range_base = GuestAddress((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 220 let range_len = 1 << VIRTIO_BALLOON_PFN_SHIFT; 221 222 let page_size: u64 = get_page_size(); 223 if page_size == 1 << VIRTIO_BALLOON_PFN_SHIFT { 224 return Self::release_memory_range(memory, range_base, range_len); 225 } 226 227 if pbp.is_none() { 228 *pbp = Some(PartiallyBalloonedPage::new()); 229 } 230 231 if !pbp.as_ref().unwrap().pfn_match(range_base.0) { 232 // We are trying to free memory region in a different pfn with current pbp. Flush pbp. 233 pbp.as_mut().unwrap().reset(); 234 pbp.as_mut().unwrap().addr = align_page_size_down(range_base.0); 235 } 236 237 pbp.as_mut().unwrap().set_bit(range_base.0); 238 if pbp.as_ref().unwrap().bitmap_full() { 239 Self::release_memory_range( 240 memory, 241 vm_memory::GuestAddress(pbp.as_ref().unwrap().addr), 242 page_size as usize, 243 )?; 244 245 pbp.as_mut().unwrap().reset(); 246 } 247 248 Ok(()) 249 } 250 251 fn process_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 252 let mut used_descs = false; 253 while let Some(mut desc_chain) = 254 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 255 { 256 let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; 257 258 let data_chunk_size = size_of::<u32>(); 259 260 // The head contains the request type which MUST be readable. 
261 if desc.is_write_only() { 262 error!("The head contains the request type is not right"); 263 return Err(Error::UnexpectedWriteOnlyDescriptor); 264 } 265 if desc.len() as usize % data_chunk_size != 0 { 266 error!("the request size {} is not right", desc.len()); 267 return Err(Error::InvalidRequest); 268 } 269 270 let mut offset = 0u64; 271 while offset < desc.len() as u64 { 272 let addr = desc.addr().checked_add(offset).unwrap(); 273 let pfn: u32 = desc_chain 274 .memory() 275 .read_obj(addr) 276 .map_err(Error::GuestMemory)?; 277 offset += data_chunk_size as u64; 278 279 match queue_index { 280 0 => { 281 Self::release_memory_range_4k(&mut self.pbp, desc_chain.memory(), pfn)?; 282 } 283 1 => { 284 let page_size = get_page_size() as usize; 285 let rbase = align_page_size_down((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 286 287 Self::advise_memory_range( 288 desc_chain.memory(), 289 vm_memory::GuestAddress(rbase), 290 page_size, 291 libc::MADV_WILLNEED, 292 )?; 293 } 294 _ => return Err(Error::InvalidQueueIndex(queue_index)), 295 } 296 } 297 298 self.queues[queue_index] 299 .add_used(desc_chain.memory(), desc_chain.head_index(), desc.len()) 300 .map_err(Error::QueueAddUsed)?; 301 used_descs = true; 302 } 303 304 if used_descs { 305 self.signal(VirtioInterruptType::Queue(queue_index as u16)) 306 } else { 307 Ok(()) 308 } 309 } 310 311 fn process_reporting_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 312 let mut used_descs = false; 313 while let Some(mut desc_chain) = 314 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 315 { 316 let mut descs_len = 0; 317 while let Some(desc) = desc_chain.next() { 318 descs_len += desc.len(); 319 Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?; 320 } 321 322 self.queues[queue_index] 323 .add_used(desc_chain.memory(), desc_chain.head_index(), descs_len) 324 .map_err(Error::QueueAddUsed)?; 325 used_descs = true; 326 } 327 328 if used_descs { 329 
self.signal(VirtioInterruptType::Queue(queue_index as u16)) 330 } else { 331 Ok(()) 332 } 333 } 334 335 fn run( 336 &mut self, 337 paused: Arc<AtomicBool>, 338 paused_sync: Arc<Barrier>, 339 ) -> result::Result<(), EpollHelperError> { 340 let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?; 341 helper.add_event(self.inflate_queue_evt.as_raw_fd(), INFLATE_QUEUE_EVENT)?; 342 helper.add_event(self.deflate_queue_evt.as_raw_fd(), DEFLATE_QUEUE_EVENT)?; 343 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 344 helper.add_event(reporting_queue_evt.as_raw_fd(), REPORTING_QUEUE_EVENT)?; 345 } 346 helper.run(paused, paused_sync, self)?; 347 348 Ok(()) 349 } 350 } 351 352 impl EpollHelperHandler for BalloonEpollHandler { 353 fn handle_event( 354 &mut self, 355 _helper: &mut EpollHelper, 356 event: &epoll::Event, 357 ) -> result::Result<(), EpollHelperError> { 358 let ev_type = event.data as u16; 359 match ev_type { 360 INFLATE_QUEUE_EVENT => { 361 self.inflate_queue_evt.read().map_err(|e| { 362 EpollHelperError::HandleEvent(anyhow!( 363 "Failed to get inflate queue event: {:?}", 364 e 365 )) 366 })?; 367 self.process_queue(0).map_err(|e| { 368 EpollHelperError::HandleEvent(anyhow!( 369 "Failed to signal used inflate queue: {:?}", 370 e 371 )) 372 })?; 373 } 374 DEFLATE_QUEUE_EVENT => { 375 self.deflate_queue_evt.read().map_err(|e| { 376 EpollHelperError::HandleEvent(anyhow!( 377 "Failed to get deflate queue event: {:?}", 378 e 379 )) 380 })?; 381 self.process_queue(1).map_err(|e| { 382 EpollHelperError::HandleEvent(anyhow!( 383 "Failed to signal used deflate queue: {:?}", 384 e 385 )) 386 })?; 387 } 388 REPORTING_QUEUE_EVENT => { 389 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 390 reporting_queue_evt.read().map_err(|e| { 391 EpollHelperError::HandleEvent(anyhow!( 392 "Failed to get reporting queue event: {:?}", 393 e 394 )) 395 })?; 396 self.process_reporting_queue(2).map_err(|e| { 397 
EpollHelperError::HandleEvent(anyhow!( 398 "Failed to signal used inflate queue: {:?}", 399 e 400 )) 401 })?; 402 } else { 403 return Err(EpollHelperError::HandleEvent(anyhow!( 404 "Invalid reporting queue event as no eventfd registered" 405 ))); 406 } 407 } 408 _ => { 409 return Err(EpollHelperError::HandleEvent(anyhow!( 410 "Unknown event for virtio-balloon" 411 ))); 412 } 413 } 414 415 Ok(()) 416 } 417 } 418 419 #[derive(Versionize)] 420 pub struct BalloonState { 421 pub avail_features: u64, 422 pub acked_features: u64, 423 pub config: VirtioBalloonConfig, 424 } 425 426 impl VersionMapped for BalloonState {} 427 428 // Virtio device for exposing entropy to the guest OS through virtio. 429 pub struct Balloon { 430 common: VirtioCommon, 431 id: String, 432 config: VirtioBalloonConfig, 433 seccomp_action: SeccompAction, 434 exit_evt: EventFd, 435 interrupt_cb: Option<Arc<dyn VirtioInterrupt>>, 436 } 437 438 impl Balloon { 439 // Create a new virtio-balloon. 440 pub fn new( 441 id: String, 442 size: u64, 443 deflate_on_oom: bool, 444 free_page_reporting: bool, 445 seccomp_action: SeccompAction, 446 exit_evt: EventFd, 447 state: Option<BalloonState>, 448 ) -> io::Result<Self> { 449 let mut queue_sizes = vec![QUEUE_SIZE; MIN_NUM_QUEUES]; 450 451 let (avail_features, acked_features, config, paused) = if let Some(state) = state { 452 info!("Restoring virtio-balloon {}", id); 453 ( 454 state.avail_features, 455 state.acked_features, 456 state.config, 457 true, 458 ) 459 } else { 460 let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; 461 if deflate_on_oom { 462 avail_features |= 1u64 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM; 463 } 464 if free_page_reporting { 465 avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; 466 } 467 468 let config = VirtioBalloonConfig { 469 num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32, 470 ..Default::default() 471 }; 472 473 (avail_features, 0, config, false) 474 }; 475 476 if free_page_reporting { 477 
queue_sizes.push(REPORTING_QUEUE_SIZE); 478 } 479 480 Ok(Balloon { 481 common: VirtioCommon { 482 device_type: VirtioDeviceType::Balloon as u32, 483 avail_features, 484 acked_features, 485 paused_sync: Some(Arc::new(Barrier::new(2))), 486 queue_sizes, 487 min_queues: MIN_NUM_QUEUES as u16, 488 paused: Arc::new(AtomicBool::new(paused)), 489 ..Default::default() 490 }, 491 id, 492 config, 493 seccomp_action, 494 exit_evt, 495 interrupt_cb: None, 496 }) 497 } 498 499 pub fn resize(&mut self, size: u64) -> Result<(), Error> { 500 self.config.num_pages = (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32; 501 502 if let Some(interrupt_cb) = &self.interrupt_cb { 503 interrupt_cb 504 .trigger(VirtioInterruptType::Config) 505 .map_err(Error::FailedSignal) 506 } else { 507 Ok(()) 508 } 509 } 510 511 // Get the actual size of the virtio-balloon. 512 pub fn get_actual(&self) -> u64 { 513 (self.config.actual as u64) << VIRTIO_BALLOON_PFN_SHIFT 514 } 515 516 fn state(&self) -> BalloonState { 517 BalloonState { 518 avail_features: self.common.avail_features, 519 acked_features: self.common.acked_features, 520 config: self.config, 521 } 522 } 523 524 #[cfg(fuzzing)] 525 pub fn wait_for_epoll_threads(&mut self) { 526 self.common.wait_for_epoll_threads(); 527 } 528 } 529 530 impl Drop for Balloon { 531 fn drop(&mut self) { 532 if let Some(kill_evt) = self.common.kill_evt.take() { 533 // Ignore the result because there is nothing we can do about it. 
534 let _ = kill_evt.write(1); 535 } 536 self.common.wait_for_epoll_threads(); 537 } 538 } 539 540 impl VirtioDevice for Balloon { 541 fn device_type(&self) -> u32 { 542 self.common.device_type 543 } 544 545 fn queue_max_sizes(&self) -> &[u16] { 546 &self.common.queue_sizes 547 } 548 549 fn features(&self) -> u64 { 550 self.common.avail_features 551 } 552 553 fn ack_features(&mut self, value: u64) { 554 self.common.ack_features(value) 555 } 556 557 fn read_config(&self, offset: u64, data: &mut [u8]) { 558 self.read_config_from_slice(self.config.as_slice(), offset, data); 559 } 560 561 fn write_config(&mut self, offset: u64, data: &[u8]) { 562 // The "actual" field is the only mutable field 563 if offset != CONFIG_ACTUAL_OFFSET || data.len() != CONFIG_ACTUAL_SIZE { 564 error!( 565 "Attempt to write to read-only field: offset {:x} length {}", 566 offset, 567 data.len() 568 ); 569 return; 570 } 571 572 let config = self.config.as_mut_slice(); 573 let config_len = config.len() as u64; 574 let data_len = data.len() as u64; 575 if offset + data_len > config_len { 576 error!( 577 "Out-of-bound access to configuration: config_len = {} offset = {:x} length = {} for {}", 578 config_len, 579 offset, 580 data_len, 581 self.device_type() 582 ); 583 return; 584 } 585 586 if let Some(end) = offset.checked_add(config.len() as u64) { 587 let mut offset_config = 588 &mut config[offset as usize..std::cmp::min(end, config_len) as usize]; 589 offset_config.write_all(data).unwrap(); 590 } 591 } 592 593 fn activate( 594 &mut self, 595 mem: GuestMemoryAtomic<GuestMemoryMmap>, 596 interrupt_cb: Arc<dyn VirtioInterrupt>, 597 mut queues: Vec<(usize, Queue, EventFd)>, 598 ) -> ActivateResult { 599 self.common.activate(&queues, &interrupt_cb)?; 600 let (kill_evt, pause_evt) = self.common.dup_eventfds(); 601 602 let mut virtqueues = Vec::new(); 603 let (_, queue, queue_evt) = queues.remove(0); 604 virtqueues.push(queue); 605 let inflate_queue_evt = queue_evt; 606 let (_, queue, queue_evt) = 
queues.remove(0); 607 virtqueues.push(queue); 608 let deflate_queue_evt = queue_evt; 609 let reporting_queue_evt = 610 if self.common.feature_acked(VIRTIO_BALLOON_F_REPORTING) && !queues.is_empty() { 611 let (_, queue, queue_evt) = queues.remove(0); 612 virtqueues.push(queue); 613 Some(queue_evt) 614 } else { 615 None 616 }; 617 618 self.interrupt_cb = Some(interrupt_cb.clone()); 619 620 let mut handler = BalloonEpollHandler { 621 mem, 622 queues: virtqueues, 623 interrupt_cb, 624 inflate_queue_evt, 625 deflate_queue_evt, 626 reporting_queue_evt, 627 kill_evt, 628 pause_evt, 629 pbp: None, 630 }; 631 632 let paused = self.common.paused.clone(); 633 let paused_sync = self.common.paused_sync.clone(); 634 let mut epoll_threads = Vec::new(); 635 636 spawn_virtio_thread( 637 &self.id, 638 &self.seccomp_action, 639 Thread::VirtioBalloon, 640 &mut epoll_threads, 641 &self.exit_evt, 642 move || handler.run(paused, paused_sync.unwrap()), 643 )?; 644 self.common.epoll_threads = Some(epoll_threads); 645 646 event!("virtio-device", "activated", "id", &self.id); 647 Ok(()) 648 } 649 650 fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> { 651 let result = self.common.reset(); 652 event!("virtio-device", "reset", "id", &self.id); 653 result 654 } 655 } 656 657 impl Pausable for Balloon { 658 fn pause(&mut self) -> result::Result<(), MigratableError> { 659 self.common.pause() 660 } 661 662 fn resume(&mut self) -> result::Result<(), MigratableError> { 663 self.common.resume() 664 } 665 } 666 667 impl Snapshottable for Balloon { 668 fn id(&self) -> String { 669 self.id.clone() 670 } 671 672 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 673 Snapshot::new_from_versioned_state(&self.state()) 674 } 675 } 676 impl Transportable for Balloon {} 677 impl Migratable for Balloon {} 678