1 // Copyright (c) 2020 Ant Financial 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 // Licensed under the Apache License, Version 2.0 (the "License"); 6 // you may not use this file except in compliance with the License. 7 // You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 use std::io::{self, Write}; 18 use std::mem::size_of; 19 use std::os::unix::io::AsRawFd; 20 use std::result; 21 use std::sync::atomic::AtomicBool; 22 use std::sync::{Arc, Barrier}; 23 24 use anyhow::anyhow; 25 use seccompiler::SeccompAction; 26 use serde::{Deserialize, Serialize}; 27 use thiserror::Error; 28 use virtio_queue::{Queue, QueueT}; 29 use vm_allocator::page_size::{align_page_size_down, get_page_size}; 30 use vm_memory::{ 31 Address, ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, 32 GuestMemoryError, GuestMemoryRegion, 33 }; 34 use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; 35 use vmm_sys_util::eventfd::EventFd; 36 37 use crate::seccomp_filters::Thread; 38 use crate::thread_helper::spawn_virtio_thread; 39 use crate::{ 40 ActivateResult, EpollHelper, EpollHelperError, EpollHelperHandler, GuestMemoryMmap, 41 VirtioCommon, VirtioDevice, VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, 42 EPOLL_HELPER_EVENT_LAST, VIRTIO_F_VERSION_1, 43 }; 44 45 const QUEUE_SIZE: u16 = 128; 46 const REPORTING_QUEUE_SIZE: u16 = 32; 47 const MIN_NUM_QUEUES: usize = 2; 48 49 // Inflate virtio queue event. 50 const INFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1; 51 // Deflate virtio queue event. 52 const DEFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2; 53 // Reporting virtio queue event. 54 const REPORTING_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3; 55 56 // Size of a PFN in the balloon interface. 57 const VIRTIO_BALLOON_PFN_SHIFT: u64 = 12; 58 59 // Deflate balloon on OOM 60 const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u64 = 2; 61 // Enable an additional virtqueue to let the guest notify the host about free 62 // pages. 63 const VIRTIO_BALLOON_F_REPORTING: u64 = 5; 64 65 #[derive(Error, Debug)] 66 pub enum Error { 67 #[error("Guest gave us bad memory addresses.: {0}")] 68 GuestMemory(GuestMemoryError), 69 #[error("Guest gave us a write only descriptor that protocol says to read from")] 70 UnexpectedWriteOnlyDescriptor, 71 #[error("Guest sent us invalid request")] 72 InvalidRequest, 73 #[error("Fallocate fail.: {0}")] 74 FallocateFail(std::io::Error), 75 #[error("Madvise fail.: {0}")] 76 MadviseFail(std::io::Error), 77 #[error("Failed to EventFd write.: {0}")] 78 EventFdWriteFail(std::io::Error), 79 #[error("Invalid queue index: {0}")] 80 InvalidQueueIndex(usize), 81 #[error("Fail tp signal: {0}")] 82 FailedSignal(io::Error), 83 #[error("Descriptor chain is too short")] 84 DescriptorChainTooShort, 85 #[error("Failed adding used index: {0}")] 86 QueueAddUsed(virtio_queue::Error), 87 #[error("Failed creating an iterator over the queue: {0}")] 88 QueueIterator(virtio_queue::Error), 89 } 90 91 // Got from include/uapi/linux/virtio_balloon.h 92 #[repr(C)] 93 #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)] 94 pub struct VirtioBalloonConfig { 95 // Number of pages host wants Guest to give up. 96 num_pages: u32, 97 // Number of pages we've actually got in balloon. 98 actual: u32, 99 } 100 101 #[derive(Clone, Debug)] 102 struct PartiallyBalloonedPage { 103 addr: u64, 104 bitmap: Vec<u64>, 105 page_size: u64, 106 } 107 108 impl PartiallyBalloonedPage { 109 fn new() -> Self { 110 let page_size = get_page_size(); 111 let len = (page_size >> VIRTIO_BALLOON_PFN_SHIFT).div_ceil(64); 112 // Initial each padding bit as 1 in bitmap. 113 let mut bitmap = vec![0_u64; len as usize]; 114 let pad_num = len * 64 - (page_size >> VIRTIO_BALLOON_PFN_SHIFT); 115 bitmap[(len - 1) as usize] = !((1 << (64 - pad_num)) - 1); 116 Self { 117 addr: 0, 118 bitmap, 119 page_size, 120 } 121 } 122 123 fn pfn_match(&self, addr: u64) -> bool { 124 self.addr == addr & !(self.page_size - 1) 125 } 126 127 fn bitmap_full(&self) -> bool { 128 self.bitmap.iter().all(|b| *b == u64::MAX) 129 } 130 131 fn set_bit(&mut self, addr: u64) { 132 let addr_offset = (addr % self.page_size) >> VIRTIO_BALLOON_PFN_SHIFT; 133 self.bitmap[(addr_offset / 64) as usize] |= 1 << (addr_offset % 64); 134 } 135 136 fn reset(&mut self) { 137 let len = (self.page_size >> VIRTIO_BALLOON_PFN_SHIFT).div_ceil(64); 138 self.addr = 0; 139 self.bitmap = vec![0; len as usize]; 140 let pad_num = len * 64 - (self.page_size >> VIRTIO_BALLOON_PFN_SHIFT); 141 self.bitmap[(len - 1) as usize] = !((1 << (64 - pad_num)) - 1); 142 } 143 } 144 145 const CONFIG_ACTUAL_OFFSET: u64 = 4; 146 const CONFIG_ACTUAL_SIZE: usize = 4; 147 148 // SAFETY: it only has data and has no implicit padding. 149 unsafe impl ByteValued for VirtioBalloonConfig {} 150 151 struct BalloonEpollHandler { 152 mem: GuestMemoryAtomic<GuestMemoryMmap>, 153 queues: Vec<Queue>, 154 interrupt_cb: Arc<dyn VirtioInterrupt>, 155 inflate_queue_evt: EventFd, 156 deflate_queue_evt: EventFd, 157 reporting_queue_evt: Option<EventFd>, 158 kill_evt: EventFd, 159 pause_evt: EventFd, 160 pbp: Option<PartiallyBalloonedPage>, 161 } 162 163 impl BalloonEpollHandler { 164 fn signal(&self, int_type: VirtioInterruptType) -> result::Result<(), Error> { 165 self.interrupt_cb.trigger(int_type).map_err(|e| { 166 error!("Failed to signal used queue: {:?}", e); 167 Error::FailedSignal(e) 168 }) 169 } 170 171 fn advise_memory_range( 172 memory: &GuestMemoryMmap, 173 range_base: GuestAddress, 174 range_len: usize, 175 advice: libc::c_int, 176 ) -> result::Result<(), Error> { 177 let hva = memory 178 .get_host_address(range_base) 179 .map_err(Error::GuestMemory)?; 180 let res = 181 // SAFETY: Need unsafe to do syscall madvise 182 unsafe { libc::madvise(hva as *mut libc::c_void, range_len as libc::size_t, advice) }; 183 if res != 0 { 184 return Err(Error::MadviseFail(io::Error::last_os_error())); 185 } 186 Ok(()) 187 } 188 189 fn release_memory_range( 190 memory: &GuestMemoryMmap, 191 range_base: GuestAddress, 192 range_len: usize, 193 ) -> result::Result<(), Error> { 194 let region = memory.find_region(range_base).ok_or(Error::GuestMemory( 195 GuestMemoryError::InvalidGuestAddress(range_base), 196 ))?; 197 if let Some(f_off) = region.file_offset() { 198 let offset = range_base.0 - region.start_addr().0; 199 // SAFETY: FFI call with valid arguments 200 let res = unsafe { 201 libc::fallocate64( 202 f_off.file().as_raw_fd(), 203 libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, 204 (offset + f_off.start()) as libc::off64_t, 205 range_len as libc::off64_t, 206 ) 207 }; 208 209 if res != 0 { 210 return Err(Error::FallocateFail(io::Error::last_os_error())); 211 } 212 } 213 214 Self::advise_memory_range(memory, range_base, range_len, libc::MADV_DONTNEED) 215 } 216 217 fn release_memory_range_4k( 218 pbp: &mut Option<PartiallyBalloonedPage>, 219 memory: &GuestMemoryMmap, 220 pfn: u32, 221 ) -> result::Result<(), Error> { 222 let range_base = GuestAddress((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 223 let range_len = 1 << VIRTIO_BALLOON_PFN_SHIFT; 224 225 let page_size: u64 = get_page_size(); 226 if page_size == 1 << VIRTIO_BALLOON_PFN_SHIFT { 227 return Self::release_memory_range(memory, range_base, range_len); 228 } 229 230 if pbp.is_none() { 231 *pbp = Some(PartiallyBalloonedPage::new()); 232 } 233 234 if !pbp.as_ref().unwrap().pfn_match(range_base.0) { 235 // We are trying to free memory region in a different pfn with current pbp. Flush pbp. 236 pbp.as_mut().unwrap().reset(); 237 pbp.as_mut().unwrap().addr = align_page_size_down(range_base.0); 238 } 239 240 pbp.as_mut().unwrap().set_bit(range_base.0); 241 if pbp.as_ref().unwrap().bitmap_full() { 242 Self::release_memory_range( 243 memory, 244 vm_memory::GuestAddress(pbp.as_ref().unwrap().addr), 245 page_size as usize, 246 )?; 247 248 pbp.as_mut().unwrap().reset(); 249 } 250 251 Ok(()) 252 } 253 254 fn process_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 255 let mut used_descs = false; 256 while let Some(mut desc_chain) = 257 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 258 { 259 let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; 260 261 let data_chunk_size = size_of::<u32>(); 262 263 // The head contains the request type which MUST be readable. 264 if desc.is_write_only() { 265 error!("The head contains the request type is not right"); 266 return Err(Error::UnexpectedWriteOnlyDescriptor); 267 } 268 if desc.len() as usize % data_chunk_size != 0 { 269 error!("the request size {} is not right", desc.len()); 270 return Err(Error::InvalidRequest); 271 } 272 273 let mut offset = 0u64; 274 while offset < desc.len() as u64 { 275 let addr = desc.addr().checked_add(offset).unwrap(); 276 let pfn: u32 = desc_chain 277 .memory() 278 .read_obj(addr) 279 .map_err(Error::GuestMemory)?; 280 offset += data_chunk_size as u64; 281 282 match queue_index { 283 0 => { 284 Self::release_memory_range_4k(&mut self.pbp, desc_chain.memory(), pfn)?; 285 } 286 1 => { 287 let page_size = get_page_size() as usize; 288 let rbase = align_page_size_down((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 289 290 Self::advise_memory_range( 291 desc_chain.memory(), 292 vm_memory::GuestAddress(rbase), 293 page_size, 294 libc::MADV_WILLNEED, 295 )?; 296 } 297 _ => return Err(Error::InvalidQueueIndex(queue_index)), 298 } 299 } 300 301 self.queues[queue_index] 302 .add_used(desc_chain.memory(), desc_chain.head_index(), desc.len()) 303 .map_err(Error::QueueAddUsed)?; 304 used_descs = true; 305 } 306 307 if used_descs { 308 self.signal(VirtioInterruptType::Queue(queue_index as u16)) 309 } else { 310 Ok(()) 311 } 312 } 313 314 fn process_reporting_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 315 let mut used_descs = false; 316 while let Some(mut desc_chain) = 317 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 318 { 319 let mut descs_len = 0; 320 while let Some(desc) = desc_chain.next() { 321 descs_len += desc.len(); 322 Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?; 323 } 324 325 self.queues[queue_index] 326 .add_used(desc_chain.memory(), desc_chain.head_index(), descs_len) 327 .map_err(Error::QueueAddUsed)?; 328 used_descs = true; 329 } 330 331 if used_descs { 332 self.signal(VirtioInterruptType::Queue(queue_index as u16)) 333 } else { 334 Ok(()) 335 } 336 } 337 338 fn run( 339 &mut self, 340 paused: Arc<AtomicBool>, 341 paused_sync: Arc<Barrier>, 342 ) -> result::Result<(), EpollHelperError> { 343 let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?; 344 helper.add_event(self.inflate_queue_evt.as_raw_fd(), INFLATE_QUEUE_EVENT)?; 345 helper.add_event(self.deflate_queue_evt.as_raw_fd(), DEFLATE_QUEUE_EVENT)?; 346 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 347 helper.add_event(reporting_queue_evt.as_raw_fd(), REPORTING_QUEUE_EVENT)?; 348 } 349 helper.run(paused, paused_sync, self)?; 350 351 Ok(()) 352 } 353 } 354 355 impl EpollHelperHandler for BalloonEpollHandler { 356 fn handle_event( 357 &mut self, 358 _helper: &mut EpollHelper, 359 event: &epoll::Event, 360 ) -> result::Result<(), EpollHelperError> { 361 let ev_type = event.data as u16; 362 match ev_type { 363 INFLATE_QUEUE_EVENT => { 364 self.inflate_queue_evt.read().map_err(|e| { 365 EpollHelperError::HandleEvent(anyhow!( 366 "Failed to get inflate queue event: {:?}", 367 e 368 )) 369 })?; 370 self.process_queue(0).map_err(|e| { 371 EpollHelperError::HandleEvent(anyhow!( 372 "Failed to signal used inflate queue: {:?}", 373 e 374 )) 375 })?; 376 } 377 DEFLATE_QUEUE_EVENT => { 378 self.deflate_queue_evt.read().map_err(|e| { 379 EpollHelperError::HandleEvent(anyhow!( 380 "Failed to get deflate queue event: {:?}", 381 e 382 )) 383 })?; 384 self.process_queue(1).map_err(|e| { 385 EpollHelperError::HandleEvent(anyhow!( 386 "Failed to signal used deflate queue: {:?}", 387 e 388 )) 389 })?; 390 } 391 REPORTING_QUEUE_EVENT => { 392 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 393 reporting_queue_evt.read().map_err(|e| { 394 EpollHelperError::HandleEvent(anyhow!( 395 "Failed to get reporting queue event: {:?}", 396 e 397 )) 398 })?; 399 self.process_reporting_queue(2).map_err(|e| { 400 EpollHelperError::HandleEvent(anyhow!( 401 "Failed to signal used inflate queue: {:?}", 402 e 403 )) 404 })?; 405 } else { 406 return Err(EpollHelperError::HandleEvent(anyhow!( 407 "Invalid reporting queue event as no eventfd registered" 408 ))); 409 } 410 } 411 _ => { 412 return Err(EpollHelperError::HandleEvent(anyhow!( 413 "Unknown event for virtio-balloon" 414 ))); 415 } 416 } 417 418 Ok(()) 419 } 420 } 421 422 #[derive(Serialize, Deserialize)] 423 pub struct BalloonState { 424 pub avail_features: u64, 425 pub acked_features: u64, 426 pub config: VirtioBalloonConfig, 427 } 428 429 // Virtio device for exposing entropy to the guest OS through virtio. 430 pub struct Balloon { 431 common: VirtioCommon, 432 id: String, 433 config: VirtioBalloonConfig, 434 seccomp_action: SeccompAction, 435 exit_evt: EventFd, 436 interrupt_cb: Option<Arc<dyn VirtioInterrupt>>, 437 } 438 439 impl Balloon { 440 // Create a new virtio-balloon. 441 pub fn new( 442 id: String, 443 size: u64, 444 deflate_on_oom: bool, 445 free_page_reporting: bool, 446 seccomp_action: SeccompAction, 447 exit_evt: EventFd, 448 state: Option<BalloonState>, 449 ) -> io::Result<Self> { 450 let mut queue_sizes = vec![QUEUE_SIZE; MIN_NUM_QUEUES]; 451 452 let (avail_features, acked_features, config, paused) = if let Some(state) = state { 453 info!("Restoring virtio-balloon {}", id); 454 ( 455 state.avail_features, 456 state.acked_features, 457 state.config, 458 true, 459 ) 460 } else { 461 let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; 462 if deflate_on_oom { 463 avail_features |= 1u64 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM; 464 } 465 if free_page_reporting { 466 avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; 467 } 468 469 let config = VirtioBalloonConfig { 470 num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32, 471 ..Default::default() 472 }; 473 474 (avail_features, 0, config, false) 475 }; 476 477 if free_page_reporting { 478 queue_sizes.push(REPORTING_QUEUE_SIZE); 479 } 480 481 Ok(Balloon { 482 common: VirtioCommon { 483 device_type: VirtioDeviceType::Balloon as u32, 484 avail_features, 485 acked_features, 486 paused_sync: Some(Arc::new(Barrier::new(2))), 487 queue_sizes, 488 min_queues: MIN_NUM_QUEUES as u16, 489 paused: Arc::new(AtomicBool::new(paused)), 490 ..Default::default() 491 }, 492 id, 493 config, 494 seccomp_action, 495 exit_evt, 496 interrupt_cb: None, 497 }) 498 } 499 500 pub fn resize(&mut self, size: u64) -> Result<(), Error> { 501 self.config.num_pages = (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32; 502 503 if let Some(interrupt_cb) = &self.interrupt_cb { 504 interrupt_cb 505 .trigger(VirtioInterruptType::Config) 506 .map_err(Error::FailedSignal) 507 } else { 508 Ok(()) 509 } 510 } 511 512 // Get the actual size of the virtio-balloon. 513 pub fn get_actual(&self) -> u64 { 514 (self.config.actual as u64) << VIRTIO_BALLOON_PFN_SHIFT 515 } 516 517 fn state(&self) -> BalloonState { 518 BalloonState { 519 avail_features: self.common.avail_features, 520 acked_features: self.common.acked_features, 521 config: self.config, 522 } 523 } 524 525 #[cfg(fuzzing)] 526 pub fn wait_for_epoll_threads(&mut self) { 527 self.common.wait_for_epoll_threads(); 528 } 529 } 530 531 impl Drop for Balloon { 532 fn drop(&mut self) { 533 if let Some(kill_evt) = self.common.kill_evt.take() { 534 // Ignore the result because there is nothing we can do about it. 535 let _ = kill_evt.write(1); 536 } 537 self.common.wait_for_epoll_threads(); 538 } 539 } 540 541 impl VirtioDevice for Balloon { 542 fn device_type(&self) -> u32 { 543 self.common.device_type 544 } 545 546 fn queue_max_sizes(&self) -> &[u16] { 547 &self.common.queue_sizes 548 } 549 550 fn features(&self) -> u64 { 551 self.common.avail_features 552 } 553 554 fn ack_features(&mut self, value: u64) { 555 self.common.ack_features(value) 556 } 557 558 fn read_config(&self, offset: u64, data: &mut [u8]) { 559 self.read_config_from_slice(self.config.as_slice(), offset, data); 560 } 561 562 fn write_config(&mut self, offset: u64, data: &[u8]) { 563 // The "actual" field is the only mutable field 564 if offset != CONFIG_ACTUAL_OFFSET || data.len() != CONFIG_ACTUAL_SIZE { 565 error!( 566 "Attempt to write to read-only field: offset {:x} length {}", 567 offset, 568 data.len() 569 ); 570 return; 571 } 572 573 let config = self.config.as_mut_slice(); 574 let config_len = config.len() as u64; 575 let data_len = data.len() as u64; 576 if offset + data_len > config_len { 577 error!( 578 "Out-of-bound access to configuration: config_len = {} offset = {:x} length = {} for {}", 579 config_len, 580 offset, 581 data_len, 582 self.device_type() 583 ); 584 return; 585 } 586 587 if let Some(end) = offset.checked_add(config.len() as u64) { 588 let mut offset_config = 589 &mut config[offset as usize..std::cmp::min(end, config_len) as usize]; 590 offset_config.write_all(data).unwrap(); 591 } 592 } 593 594 fn activate( 595 &mut self, 596 mem: GuestMemoryAtomic<GuestMemoryMmap>, 597 interrupt_cb: Arc<dyn VirtioInterrupt>, 598 mut queues: Vec<(usize, Queue, EventFd)>, 599 ) -> ActivateResult { 600 self.common.activate(&queues, &interrupt_cb)?; 601 let (kill_evt, pause_evt) = self.common.dup_eventfds(); 602 603 let mut virtqueues = Vec::new(); 604 let (_, queue, queue_evt) = queues.remove(0); 605 virtqueues.push(queue); 606 let inflate_queue_evt = queue_evt; 607 let (_, queue, queue_evt) = queues.remove(0); 608 virtqueues.push(queue); 609 let deflate_queue_evt = queue_evt; 610 let reporting_queue_evt = 611 if self.common.feature_acked(VIRTIO_BALLOON_F_REPORTING) && !queues.is_empty() { 612 let (_, queue, queue_evt) = queues.remove(0); 613 virtqueues.push(queue); 614 Some(queue_evt) 615 } else { 616 None 617 }; 618 619 self.interrupt_cb = Some(interrupt_cb.clone()); 620 621 let mut handler = BalloonEpollHandler { 622 mem, 623 queues: virtqueues, 624 interrupt_cb, 625 inflate_queue_evt, 626 deflate_queue_evt, 627 reporting_queue_evt, 628 kill_evt, 629 pause_evt, 630 pbp: None, 631 }; 632 633 let paused = self.common.paused.clone(); 634 let paused_sync = self.common.paused_sync.clone(); 635 let mut epoll_threads = Vec::new(); 636 637 spawn_virtio_thread( 638 &self.id, 639 &self.seccomp_action, 640 Thread::VirtioBalloon, 641 &mut epoll_threads, 642 &self.exit_evt, 643 move || handler.run(paused, paused_sync.unwrap()), 644 )?; 645 self.common.epoll_threads = Some(epoll_threads); 646 647 event!("virtio-device", "activated", "id", &self.id); 648 Ok(()) 649 } 650 651 fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> { 652 let result = self.common.reset(); 653 event!("virtio-device", "reset", "id", &self.id); 654 result 655 } 656 } 657 658 impl Pausable for Balloon { 659 fn pause(&mut self) -> result::Result<(), MigratableError> { 660 self.common.pause() 661 } 662 663 fn resume(&mut self) -> result::Result<(), MigratableError> { 664 self.common.resume() 665 } 666 } 667 668 impl Snapshottable for Balloon { 669 fn id(&self) -> String { 670 self.id.clone() 671 } 672 673 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 674 Snapshot::new_from_state(&self.state()) 675 } 676 } 677 impl Transportable for Balloon {} 678 impl Migratable for Balloon {} 679