// Copyright (c) 2020 Ant Financial
//
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::{
    seccomp_filters::Thread, thread_helper::spawn_virtio_thread, ActivateResult, EpollHelper,
    EpollHelperError, EpollHelperHandler, GuestMemoryMmap, VirtioCommon, VirtioDevice,
    VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, EPOLL_HELPER_EVENT_LAST,
    VIRTIO_F_VERSION_1,
};
use anyhow::anyhow;
use seccompiler::SeccompAction;
use serde::{Deserialize, Serialize};
use std::io::{self, Write};
use std::mem::size_of;
use std::os::unix::io::AsRawFd;
use std::result;
use std::sync::{atomic::AtomicBool, Arc, Barrier};
use thiserror::Error;
use virtio_queue::{Queue, QueueT};
use vm_allocator::page_size::{align_page_size_down, get_page_size};
use vm_memory::{
    Address, ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
    GuestMemoryError, GuestMemoryRegion,
};
use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable};
use vmm_sys_util::eventfd::EventFd;

// Maximum size advertised for each of the two mandatory queues (inflate and deflate).
const QUEUE_SIZE: u16 = 128;
// Maximum size advertised for the optional free page reporting queue.
const REPORTING_QUEUE_SIZE: u16 = 32;
// Number of queues that are always present: inflate and deflate.
const MIN_NUM_QUEUES: usize = 2;

// Epoll event identifier for the inflate virtio queue.
const INFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1;
// Epoll event identifier for the deflate virtio queue.
48 const DEFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2; 49 // Reporting virtio queue event. 50 const REPORTING_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3; 51 52 // Size of a PFN in the balloon interface. 53 const VIRTIO_BALLOON_PFN_SHIFT: u64 = 12; 54 55 // Deflate balloon on OOM 56 const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u64 = 2; 57 // Enable an additional virtqueue to let the guest notify the host about free 58 // pages. 59 const VIRTIO_BALLOON_F_REPORTING: u64 = 5; 60 61 #[derive(Error, Debug)] 62 pub enum Error { 63 #[error("Guest gave us bad memory addresses.: {0}")] 64 GuestMemory(GuestMemoryError), 65 #[error("Guest gave us a write only descriptor that protocol says to read from")] 66 UnexpectedWriteOnlyDescriptor, 67 #[error("Guest sent us invalid request")] 68 InvalidRequest, 69 #[error("Fallocate fail.: {0}")] 70 FallocateFail(std::io::Error), 71 #[error("Madvise fail.: {0}")] 72 MadviseFail(std::io::Error), 73 #[error("Failed to EventFd write.: {0}")] 74 EventFdWriteFail(std::io::Error), 75 #[error("Invalid queue index: {0}")] 76 InvalidQueueIndex(usize), 77 #[error("Fail tp signal: {0}")] 78 FailedSignal(io::Error), 79 #[error("Descriptor chain is too short")] 80 DescriptorChainTooShort, 81 #[error("Failed adding used index: {0}")] 82 QueueAddUsed(virtio_queue::Error), 83 #[error("Failed creating an iterator over the queue: {0}")] 84 QueueIterator(virtio_queue::Error), 85 } 86 87 // Got from include/uapi/linux/virtio_balloon.h 88 #[repr(C)] 89 #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)] 90 pub struct VirtioBalloonConfig { 91 // Number of pages host wants Guest to give up. 92 num_pages: u32, 93 // Number of pages we've actually got in balloon. 
94 actual: u32, 95 } 96 97 #[derive(Clone, Debug)] 98 struct PartiallyBalloonedPage { 99 addr: u64, 100 bitmap: Vec<u64>, 101 page_size: u64, 102 } 103 104 impl PartiallyBalloonedPage { 105 fn new() -> Self { 106 let page_size = get_page_size(); 107 let len = ((page_size >> VIRTIO_BALLOON_PFN_SHIFT) + 63) / 64; 108 // Initial each padding bit as 1 in bitmap. 109 let mut bitmap = vec![0_u64; len as usize]; 110 let pad_num = len * 64 - (page_size >> VIRTIO_BALLOON_PFN_SHIFT); 111 bitmap[(len - 1) as usize] = !((1 << (64 - pad_num)) - 1); 112 Self { 113 addr: 0, 114 bitmap, 115 page_size, 116 } 117 } 118 119 fn pfn_match(&self, addr: u64) -> bool { 120 self.addr == addr & !(self.page_size - 1) 121 } 122 123 fn bitmap_full(&self) -> bool { 124 self.bitmap.iter().all(|b| *b == u64::MAX) 125 } 126 127 fn set_bit(&mut self, addr: u64) { 128 let addr_offset = (addr % self.page_size) >> VIRTIO_BALLOON_PFN_SHIFT; 129 self.bitmap[(addr_offset / 64) as usize] |= 1 << (addr_offset % 64); 130 } 131 132 fn reset(&mut self) { 133 let len = ((self.page_size >> VIRTIO_BALLOON_PFN_SHIFT) + 63) / 64; 134 self.addr = 0; 135 self.bitmap = vec![0; len as usize]; 136 let pad_num = len * 64 - (self.page_size >> VIRTIO_BALLOON_PFN_SHIFT); 137 self.bitmap[(len - 1) as usize] = !((1 << (64 - pad_num)) - 1); 138 } 139 } 140 141 const CONFIG_ACTUAL_OFFSET: u64 = 4; 142 const CONFIG_ACTUAL_SIZE: usize = 4; 143 144 // SAFETY: it only has data and has no implicit padding. 
145 unsafe impl ByteValued for VirtioBalloonConfig {} 146 147 struct BalloonEpollHandler { 148 mem: GuestMemoryAtomic<GuestMemoryMmap>, 149 queues: Vec<Queue>, 150 interrupt_cb: Arc<dyn VirtioInterrupt>, 151 inflate_queue_evt: EventFd, 152 deflate_queue_evt: EventFd, 153 reporting_queue_evt: Option<EventFd>, 154 kill_evt: EventFd, 155 pause_evt: EventFd, 156 pbp: Option<PartiallyBalloonedPage>, 157 } 158 159 impl BalloonEpollHandler { 160 fn signal(&self, int_type: VirtioInterruptType) -> result::Result<(), Error> { 161 self.interrupt_cb.trigger(int_type).map_err(|e| { 162 error!("Failed to signal used queue: {:?}", e); 163 Error::FailedSignal(e) 164 }) 165 } 166 167 fn advise_memory_range( 168 memory: &GuestMemoryMmap, 169 range_base: GuestAddress, 170 range_len: usize, 171 advice: libc::c_int, 172 ) -> result::Result<(), Error> { 173 let hva = memory 174 .get_host_address(range_base) 175 .map_err(Error::GuestMemory)?; 176 let res = 177 // SAFETY: Need unsafe to do syscall madvise 178 unsafe { libc::madvise(hva as *mut libc::c_void, range_len as libc::size_t, advice) }; 179 if res != 0 { 180 return Err(Error::MadviseFail(io::Error::last_os_error())); 181 } 182 Ok(()) 183 } 184 185 fn release_memory_range( 186 memory: &GuestMemoryMmap, 187 range_base: GuestAddress, 188 range_len: usize, 189 ) -> result::Result<(), Error> { 190 let region = memory.find_region(range_base).ok_or(Error::GuestMemory( 191 GuestMemoryError::InvalidGuestAddress(range_base), 192 ))?; 193 if let Some(f_off) = region.file_offset() { 194 let offset = range_base.0 - region.start_addr().0; 195 // SAFETY: FFI call with valid arguments 196 let res = unsafe { 197 libc::fallocate64( 198 f_off.file().as_raw_fd(), 199 libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, 200 (offset + f_off.start()) as libc::off64_t, 201 range_len as libc::off64_t, 202 ) 203 }; 204 205 if res != 0 { 206 return Err(Error::FallocateFail(io::Error::last_os_error())); 207 } 208 } 209 210 
Self::advise_memory_range(memory, range_base, range_len, libc::MADV_DONTNEED) 211 } 212 213 fn release_memory_range_4k( 214 pbp: &mut Option<PartiallyBalloonedPage>, 215 memory: &GuestMemoryMmap, 216 pfn: u32, 217 ) -> result::Result<(), Error> { 218 let range_base = GuestAddress((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 219 let range_len = 1 << VIRTIO_BALLOON_PFN_SHIFT; 220 221 let page_size: u64 = get_page_size(); 222 if page_size == 1 << VIRTIO_BALLOON_PFN_SHIFT { 223 return Self::release_memory_range(memory, range_base, range_len); 224 } 225 226 if pbp.is_none() { 227 *pbp = Some(PartiallyBalloonedPage::new()); 228 } 229 230 if !pbp.as_ref().unwrap().pfn_match(range_base.0) { 231 // We are trying to free memory region in a different pfn with current pbp. Flush pbp. 232 pbp.as_mut().unwrap().reset(); 233 pbp.as_mut().unwrap().addr = align_page_size_down(range_base.0); 234 } 235 236 pbp.as_mut().unwrap().set_bit(range_base.0); 237 if pbp.as_ref().unwrap().bitmap_full() { 238 Self::release_memory_range( 239 memory, 240 vm_memory::GuestAddress(pbp.as_ref().unwrap().addr), 241 page_size as usize, 242 )?; 243 244 pbp.as_mut().unwrap().reset(); 245 } 246 247 Ok(()) 248 } 249 250 fn process_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 251 let mut used_descs = false; 252 while let Some(mut desc_chain) = 253 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 254 { 255 let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; 256 257 let data_chunk_size = size_of::<u32>(); 258 259 // The head contains the request type which MUST be readable. 
260 if desc.is_write_only() { 261 error!("The head contains the request type is not right"); 262 return Err(Error::UnexpectedWriteOnlyDescriptor); 263 } 264 if desc.len() as usize % data_chunk_size != 0 { 265 error!("the request size {} is not right", desc.len()); 266 return Err(Error::InvalidRequest); 267 } 268 269 let mut offset = 0u64; 270 while offset < desc.len() as u64 { 271 let addr = desc.addr().checked_add(offset).unwrap(); 272 let pfn: u32 = desc_chain 273 .memory() 274 .read_obj(addr) 275 .map_err(Error::GuestMemory)?; 276 offset += data_chunk_size as u64; 277 278 match queue_index { 279 0 => { 280 Self::release_memory_range_4k(&mut self.pbp, desc_chain.memory(), pfn)?; 281 } 282 1 => { 283 let page_size = get_page_size() as usize; 284 let rbase = align_page_size_down((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 285 286 Self::advise_memory_range( 287 desc_chain.memory(), 288 vm_memory::GuestAddress(rbase), 289 page_size, 290 libc::MADV_WILLNEED, 291 )?; 292 } 293 _ => return Err(Error::InvalidQueueIndex(queue_index)), 294 } 295 } 296 297 self.queues[queue_index] 298 .add_used(desc_chain.memory(), desc_chain.head_index(), desc.len()) 299 .map_err(Error::QueueAddUsed)?; 300 used_descs = true; 301 } 302 303 if used_descs { 304 self.signal(VirtioInterruptType::Queue(queue_index as u16)) 305 } else { 306 Ok(()) 307 } 308 } 309 310 fn process_reporting_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 311 let mut used_descs = false; 312 while let Some(mut desc_chain) = 313 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 314 { 315 let mut descs_len = 0; 316 while let Some(desc) = desc_chain.next() { 317 descs_len += desc.len(); 318 Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?; 319 } 320 321 self.queues[queue_index] 322 .add_used(desc_chain.memory(), desc_chain.head_index(), descs_len) 323 .map_err(Error::QueueAddUsed)?; 324 used_descs = true; 325 } 326 327 if used_descs { 328 
self.signal(VirtioInterruptType::Queue(queue_index as u16)) 329 } else { 330 Ok(()) 331 } 332 } 333 334 fn run( 335 &mut self, 336 paused: Arc<AtomicBool>, 337 paused_sync: Arc<Barrier>, 338 ) -> result::Result<(), EpollHelperError> { 339 let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?; 340 helper.add_event(self.inflate_queue_evt.as_raw_fd(), INFLATE_QUEUE_EVENT)?; 341 helper.add_event(self.deflate_queue_evt.as_raw_fd(), DEFLATE_QUEUE_EVENT)?; 342 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 343 helper.add_event(reporting_queue_evt.as_raw_fd(), REPORTING_QUEUE_EVENT)?; 344 } 345 helper.run(paused, paused_sync, self)?; 346 347 Ok(()) 348 } 349 } 350 351 impl EpollHelperHandler for BalloonEpollHandler { 352 fn handle_event( 353 &mut self, 354 _helper: &mut EpollHelper, 355 event: &epoll::Event, 356 ) -> result::Result<(), EpollHelperError> { 357 let ev_type = event.data as u16; 358 match ev_type { 359 INFLATE_QUEUE_EVENT => { 360 self.inflate_queue_evt.read().map_err(|e| { 361 EpollHelperError::HandleEvent(anyhow!( 362 "Failed to get inflate queue event: {:?}", 363 e 364 )) 365 })?; 366 self.process_queue(0).map_err(|e| { 367 EpollHelperError::HandleEvent(anyhow!( 368 "Failed to signal used inflate queue: {:?}", 369 e 370 )) 371 })?; 372 } 373 DEFLATE_QUEUE_EVENT => { 374 self.deflate_queue_evt.read().map_err(|e| { 375 EpollHelperError::HandleEvent(anyhow!( 376 "Failed to get deflate queue event: {:?}", 377 e 378 )) 379 })?; 380 self.process_queue(1).map_err(|e| { 381 EpollHelperError::HandleEvent(anyhow!( 382 "Failed to signal used deflate queue: {:?}", 383 e 384 )) 385 })?; 386 } 387 REPORTING_QUEUE_EVENT => { 388 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 389 reporting_queue_evt.read().map_err(|e| { 390 EpollHelperError::HandleEvent(anyhow!( 391 "Failed to get reporting queue event: {:?}", 392 e 393 )) 394 })?; 395 self.process_reporting_queue(2).map_err(|e| { 396 
EpollHelperError::HandleEvent(anyhow!( 397 "Failed to signal used inflate queue: {:?}", 398 e 399 )) 400 })?; 401 } else { 402 return Err(EpollHelperError::HandleEvent(anyhow!( 403 "Invalid reporting queue event as no eventfd registered" 404 ))); 405 } 406 } 407 _ => { 408 return Err(EpollHelperError::HandleEvent(anyhow!( 409 "Unknown event for virtio-balloon" 410 ))); 411 } 412 } 413 414 Ok(()) 415 } 416 } 417 418 #[derive(Serialize, Deserialize)] 419 pub struct BalloonState { 420 pub avail_features: u64, 421 pub acked_features: u64, 422 pub config: VirtioBalloonConfig, 423 } 424 425 // Virtio device for exposing entropy to the guest OS through virtio. 426 pub struct Balloon { 427 common: VirtioCommon, 428 id: String, 429 config: VirtioBalloonConfig, 430 seccomp_action: SeccompAction, 431 exit_evt: EventFd, 432 interrupt_cb: Option<Arc<dyn VirtioInterrupt>>, 433 } 434 435 impl Balloon { 436 // Create a new virtio-balloon. 437 pub fn new( 438 id: String, 439 size: u64, 440 deflate_on_oom: bool, 441 free_page_reporting: bool, 442 seccomp_action: SeccompAction, 443 exit_evt: EventFd, 444 state: Option<BalloonState>, 445 ) -> io::Result<Self> { 446 let mut queue_sizes = vec![QUEUE_SIZE; MIN_NUM_QUEUES]; 447 448 let (avail_features, acked_features, config, paused) = if let Some(state) = state { 449 info!("Restoring virtio-balloon {}", id); 450 ( 451 state.avail_features, 452 state.acked_features, 453 state.config, 454 true, 455 ) 456 } else { 457 let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; 458 if deflate_on_oom { 459 avail_features |= 1u64 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM; 460 } 461 if free_page_reporting { 462 avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; 463 } 464 465 let config = VirtioBalloonConfig { 466 num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32, 467 ..Default::default() 468 }; 469 470 (avail_features, 0, config, false) 471 }; 472 473 if free_page_reporting { 474 queue_sizes.push(REPORTING_QUEUE_SIZE); 475 } 476 477 Ok(Balloon { 478 
common: VirtioCommon { 479 device_type: VirtioDeviceType::Balloon as u32, 480 avail_features, 481 acked_features, 482 paused_sync: Some(Arc::new(Barrier::new(2))), 483 queue_sizes, 484 min_queues: MIN_NUM_QUEUES as u16, 485 paused: Arc::new(AtomicBool::new(paused)), 486 ..Default::default() 487 }, 488 id, 489 config, 490 seccomp_action, 491 exit_evt, 492 interrupt_cb: None, 493 }) 494 } 495 496 pub fn resize(&mut self, size: u64) -> Result<(), Error> { 497 self.config.num_pages = (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32; 498 499 if let Some(interrupt_cb) = &self.interrupt_cb { 500 interrupt_cb 501 .trigger(VirtioInterruptType::Config) 502 .map_err(Error::FailedSignal) 503 } else { 504 Ok(()) 505 } 506 } 507 508 // Get the actual size of the virtio-balloon. 509 pub fn get_actual(&self) -> u64 { 510 (self.config.actual as u64) << VIRTIO_BALLOON_PFN_SHIFT 511 } 512 513 fn state(&self) -> BalloonState { 514 BalloonState { 515 avail_features: self.common.avail_features, 516 acked_features: self.common.acked_features, 517 config: self.config, 518 } 519 } 520 521 #[cfg(fuzzing)] 522 pub fn wait_for_epoll_threads(&mut self) { 523 self.common.wait_for_epoll_threads(); 524 } 525 } 526 527 impl Drop for Balloon { 528 fn drop(&mut self) { 529 if let Some(kill_evt) = self.common.kill_evt.take() { 530 // Ignore the result because there is nothing we can do about it. 
531 let _ = kill_evt.write(1); 532 } 533 self.common.wait_for_epoll_threads(); 534 } 535 } 536 537 impl VirtioDevice for Balloon { 538 fn device_type(&self) -> u32 { 539 self.common.device_type 540 } 541 542 fn queue_max_sizes(&self) -> &[u16] { 543 &self.common.queue_sizes 544 } 545 546 fn features(&self) -> u64 { 547 self.common.avail_features 548 } 549 550 fn ack_features(&mut self, value: u64) { 551 self.common.ack_features(value) 552 } 553 554 fn read_config(&self, offset: u64, data: &mut [u8]) { 555 self.read_config_from_slice(self.config.as_slice(), offset, data); 556 } 557 558 fn write_config(&mut self, offset: u64, data: &[u8]) { 559 // The "actual" field is the only mutable field 560 if offset != CONFIG_ACTUAL_OFFSET || data.len() != CONFIG_ACTUAL_SIZE { 561 error!( 562 "Attempt to write to read-only field: offset {:x} length {}", 563 offset, 564 data.len() 565 ); 566 return; 567 } 568 569 let config = self.config.as_mut_slice(); 570 let config_len = config.len() as u64; 571 let data_len = data.len() as u64; 572 if offset + data_len > config_len { 573 error!( 574 "Out-of-bound access to configuration: config_len = {} offset = {:x} length = {} for {}", 575 config_len, 576 offset, 577 data_len, 578 self.device_type() 579 ); 580 return; 581 } 582 583 if let Some(end) = offset.checked_add(config.len() as u64) { 584 let mut offset_config = 585 &mut config[offset as usize..std::cmp::min(end, config_len) as usize]; 586 offset_config.write_all(data).unwrap(); 587 } 588 } 589 590 fn activate( 591 &mut self, 592 mem: GuestMemoryAtomic<GuestMemoryMmap>, 593 interrupt_cb: Arc<dyn VirtioInterrupt>, 594 mut queues: Vec<(usize, Queue, EventFd)>, 595 ) -> ActivateResult { 596 self.common.activate(&queues, &interrupt_cb)?; 597 let (kill_evt, pause_evt) = self.common.dup_eventfds(); 598 599 let mut virtqueues = Vec::new(); 600 let (_, queue, queue_evt) = queues.remove(0); 601 virtqueues.push(queue); 602 let inflate_queue_evt = queue_evt; 603 let (_, queue, queue_evt) = 
queues.remove(0); 604 virtqueues.push(queue); 605 let deflate_queue_evt = queue_evt; 606 let reporting_queue_evt = 607 if self.common.feature_acked(VIRTIO_BALLOON_F_REPORTING) && !queues.is_empty() { 608 let (_, queue, queue_evt) = queues.remove(0); 609 virtqueues.push(queue); 610 Some(queue_evt) 611 } else { 612 None 613 }; 614 615 self.interrupt_cb = Some(interrupt_cb.clone()); 616 617 let mut handler = BalloonEpollHandler { 618 mem, 619 queues: virtqueues, 620 interrupt_cb, 621 inflate_queue_evt, 622 deflate_queue_evt, 623 reporting_queue_evt, 624 kill_evt, 625 pause_evt, 626 pbp: None, 627 }; 628 629 let paused = self.common.paused.clone(); 630 let paused_sync = self.common.paused_sync.clone(); 631 let mut epoll_threads = Vec::new(); 632 633 spawn_virtio_thread( 634 &self.id, 635 &self.seccomp_action, 636 Thread::VirtioBalloon, 637 &mut epoll_threads, 638 &self.exit_evt, 639 move || handler.run(paused, paused_sync.unwrap()), 640 )?; 641 self.common.epoll_threads = Some(epoll_threads); 642 643 event!("virtio-device", "activated", "id", &self.id); 644 Ok(()) 645 } 646 647 fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> { 648 let result = self.common.reset(); 649 event!("virtio-device", "reset", "id", &self.id); 650 result 651 } 652 } 653 654 impl Pausable for Balloon { 655 fn pause(&mut self) -> result::Result<(), MigratableError> { 656 self.common.pause() 657 } 658 659 fn resume(&mut self) -> result::Result<(), MigratableError> { 660 self.common.resume() 661 } 662 } 663 664 impl Snapshottable for Balloon { 665 fn id(&self) -> String { 666 self.id.clone() 667 } 668 669 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 670 Snapshot::new_from_state(&self.state()) 671 } 672 } 673 impl Transportable for Balloon {} 674 impl Migratable for Balloon {} 675