1 // Copyright (c) 2020 Ant Financial 2 // 3 // SPDX-License-Identifier: Apache-2.0 4 // 5 // Licensed under the Apache License, Version 2.0 (the "License"); 6 // you may not use this file except in compliance with the License. 7 // You may obtain a copy of the License at 8 // 9 // http://www.apache.org/licenses/LICENSE-2.0 10 // 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 17 use std::io::{self, Write}; 18 use std::mem::size_of; 19 use std::os::unix::io::AsRawFd; 20 use std::result; 21 use std::sync::{atomic::AtomicBool, Arc, Barrier}; 22 23 use anyhow::anyhow; 24 use seccompiler::SeccompAction; 25 use serde::{Deserialize, Serialize}; 26 use thiserror::Error; 27 use virtio_queue::{Queue, QueueT}; 28 use vm_allocator::page_size::{align_page_size_down, get_page_size}; 29 use vm_memory::{ 30 Address, ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, 31 GuestMemoryError, GuestMemoryRegion, 32 }; 33 use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable}; 34 use vmm_sys_util::eventfd::EventFd; 35 36 use crate::{ 37 seccomp_filters::Thread, thread_helper::spawn_virtio_thread, ActivateResult, EpollHelper, 38 EpollHelperError, EpollHelperHandler, GuestMemoryMmap, VirtioCommon, VirtioDevice, 39 VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, EPOLL_HELPER_EVENT_LAST, 40 VIRTIO_F_VERSION_1, 41 }; 42 43 const QUEUE_SIZE: u16 = 128; 44 const REPORTING_QUEUE_SIZE: u16 = 32; 45 const MIN_NUM_QUEUES: usize = 2; 46 47 // Inflate virtio queue event. 48 const INFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1; 49 // Deflate virtio queue event. 50 const DEFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2; 51 // Reporting virtio queue event. 52 const REPORTING_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3; 53 54 // Size of a PFN in the balloon interface. 55 const VIRTIO_BALLOON_PFN_SHIFT: u64 = 12; 56 57 // Deflate balloon on OOM 58 const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u64 = 2; 59 // Enable an additional virtqueue to let the guest notify the host about free 60 // pages. 61 const VIRTIO_BALLOON_F_REPORTING: u64 = 5; 62 63 #[derive(Error, Debug)] 64 pub enum Error { 65 #[error("Guest gave us bad memory addresses.: {0}")] 66 GuestMemory(GuestMemoryError), 67 #[error("Guest gave us a write only descriptor that protocol says to read from")] 68 UnexpectedWriteOnlyDescriptor, 69 #[error("Guest sent us invalid request")] 70 InvalidRequest, 71 #[error("Fallocate fail.: {0}")] 72 FallocateFail(std::io::Error), 73 #[error("Madvise fail.: {0}")] 74 MadviseFail(std::io::Error), 75 #[error("Failed to EventFd write.: {0}")] 76 EventFdWriteFail(std::io::Error), 77 #[error("Invalid queue index: {0}")] 78 InvalidQueueIndex(usize), 79 #[error("Fail tp signal: {0}")] 80 FailedSignal(io::Error), 81 #[error("Descriptor chain is too short")] 82 DescriptorChainTooShort, 83 #[error("Failed adding used index: {0}")] 84 QueueAddUsed(virtio_queue::Error), 85 #[error("Failed creating an iterator over the queue: {0}")] 86 QueueIterator(virtio_queue::Error), 87 } 88 89 // Got from include/uapi/linux/virtio_balloon.h 90 #[repr(C)] 91 #[derive(Copy, Clone, Debug, Default, Serialize, Deserialize)] 92 pub struct VirtioBalloonConfig { 93 // Number of pages host wants Guest to give up. 94 num_pages: u32, 95 // Number of pages we've actually got in balloon. 96 actual: u32, 97 } 98 99 #[derive(Clone, Debug)] 100 struct PartiallyBalloonedPage { 101 addr: u64, 102 bitmap: Vec<u64>, 103 page_size: u64, 104 } 105 106 impl PartiallyBalloonedPage { 107 fn new() -> Self { 108 let page_size = get_page_size(); 109 let len = ((page_size >> VIRTIO_BALLOON_PFN_SHIFT) + 63) / 64; 110 // Initial each padding bit as 1 in bitmap. 111 let mut bitmap = vec![0_u64; len as usize]; 112 let pad_num = len * 64 - (page_size >> VIRTIO_BALLOON_PFN_SHIFT); 113 bitmap[(len - 1) as usize] = !((1 << (64 - pad_num)) - 1); 114 Self { 115 addr: 0, 116 bitmap, 117 page_size, 118 } 119 } 120 121 fn pfn_match(&self, addr: u64) -> bool { 122 self.addr == addr & !(self.page_size - 1) 123 } 124 125 fn bitmap_full(&self) -> bool { 126 self.bitmap.iter().all(|b| *b == u64::MAX) 127 } 128 129 fn set_bit(&mut self, addr: u64) { 130 let addr_offset = (addr % self.page_size) >> VIRTIO_BALLOON_PFN_SHIFT; 131 self.bitmap[(addr_offset / 64) as usize] |= 1 << (addr_offset % 64); 132 } 133 134 fn reset(&mut self) { 135 let len = ((self.page_size >> VIRTIO_BALLOON_PFN_SHIFT) + 63) / 64; 136 self.addr = 0; 137 self.bitmap = vec![0; len as usize]; 138 let pad_num = len * 64 - (self.page_size >> VIRTIO_BALLOON_PFN_SHIFT); 139 self.bitmap[(len - 1) as usize] = !((1 << (64 - pad_num)) - 1); 140 } 141 } 142 143 const CONFIG_ACTUAL_OFFSET: u64 = 4; 144 const CONFIG_ACTUAL_SIZE: usize = 4; 145 146 // SAFETY: it only has data and has no implicit padding. 147 unsafe impl ByteValued for VirtioBalloonConfig {} 148 149 struct BalloonEpollHandler { 150 mem: GuestMemoryAtomic<GuestMemoryMmap>, 151 queues: Vec<Queue>, 152 interrupt_cb: Arc<dyn VirtioInterrupt>, 153 inflate_queue_evt: EventFd, 154 deflate_queue_evt: EventFd, 155 reporting_queue_evt: Option<EventFd>, 156 kill_evt: EventFd, 157 pause_evt: EventFd, 158 pbp: Option<PartiallyBalloonedPage>, 159 } 160 161 impl BalloonEpollHandler { 162 fn signal(&self, int_type: VirtioInterruptType) -> result::Result<(), Error> { 163 self.interrupt_cb.trigger(int_type).map_err(|e| { 164 error!("Failed to signal used queue: {:?}", e); 165 Error::FailedSignal(e) 166 }) 167 } 168 169 fn advise_memory_range( 170 memory: &GuestMemoryMmap, 171 range_base: GuestAddress, 172 range_len: usize, 173 advice: libc::c_int, 174 ) -> result::Result<(), Error> { 175 let hva = memory 176 .get_host_address(range_base) 177 .map_err(Error::GuestMemory)?; 178 let res = 179 // SAFETY: Need unsafe to do syscall madvise 180 unsafe { libc::madvise(hva as *mut libc::c_void, range_len as libc::size_t, advice) }; 181 if res != 0 { 182 return Err(Error::MadviseFail(io::Error::last_os_error())); 183 } 184 Ok(()) 185 } 186 187 fn release_memory_range( 188 memory: &GuestMemoryMmap, 189 range_base: GuestAddress, 190 range_len: usize, 191 ) -> result::Result<(), Error> { 192 let region = memory.find_region(range_base).ok_or(Error::GuestMemory( 193 GuestMemoryError::InvalidGuestAddress(range_base), 194 ))?; 195 if let Some(f_off) = region.file_offset() { 196 let offset = range_base.0 - region.start_addr().0; 197 // SAFETY: FFI call with valid arguments 198 let res = unsafe { 199 libc::fallocate64( 200 f_off.file().as_raw_fd(), 201 libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, 202 (offset + f_off.start()) as libc::off64_t, 203 range_len as libc::off64_t, 204 ) 205 }; 206 207 if res != 0 { 208 return Err(Error::FallocateFail(io::Error::last_os_error())); 209 } 210 } 211 212 Self::advise_memory_range(memory, range_base, range_len, libc::MADV_DONTNEED) 213 } 214 215 fn release_memory_range_4k( 216 pbp: &mut Option<PartiallyBalloonedPage>, 217 memory: &GuestMemoryMmap, 218 pfn: u32, 219 ) -> result::Result<(), Error> { 220 let range_base = GuestAddress((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 221 let range_len = 1 << VIRTIO_BALLOON_PFN_SHIFT; 222 223 let page_size: u64 = get_page_size(); 224 if page_size == 1 << VIRTIO_BALLOON_PFN_SHIFT { 225 return Self::release_memory_range(memory, range_base, range_len); 226 } 227 228 if pbp.is_none() { 229 *pbp = Some(PartiallyBalloonedPage::new()); 230 } 231 232 if !pbp.as_ref().unwrap().pfn_match(range_base.0) { 233 // We are trying to free memory region in a different pfn with current pbp. Flush pbp. 234 pbp.as_mut().unwrap().reset(); 235 pbp.as_mut().unwrap().addr = align_page_size_down(range_base.0); 236 } 237 238 pbp.as_mut().unwrap().set_bit(range_base.0); 239 if pbp.as_ref().unwrap().bitmap_full() { 240 Self::release_memory_range( 241 memory, 242 vm_memory::GuestAddress(pbp.as_ref().unwrap().addr), 243 page_size as usize, 244 )?; 245 246 pbp.as_mut().unwrap().reset(); 247 } 248 249 Ok(()) 250 } 251 252 fn process_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 253 let mut used_descs = false; 254 while let Some(mut desc_chain) = 255 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 256 { 257 let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; 258 259 let data_chunk_size = size_of::<u32>(); 260 261 // The head contains the request type which MUST be readable. 262 if desc.is_write_only() { 263 error!("The head contains the request type is not right"); 264 return Err(Error::UnexpectedWriteOnlyDescriptor); 265 } 266 if desc.len() as usize % data_chunk_size != 0 { 267 error!("the request size {} is not right", desc.len()); 268 return Err(Error::InvalidRequest); 269 } 270 271 let mut offset = 0u64; 272 while offset < desc.len() as u64 { 273 let addr = desc.addr().checked_add(offset).unwrap(); 274 let pfn: u32 = desc_chain 275 .memory() 276 .read_obj(addr) 277 .map_err(Error::GuestMemory)?; 278 offset += data_chunk_size as u64; 279 280 match queue_index { 281 0 => { 282 Self::release_memory_range_4k(&mut self.pbp, desc_chain.memory(), pfn)?; 283 } 284 1 => { 285 let page_size = get_page_size() as usize; 286 let rbase = align_page_size_down((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 287 288 Self::advise_memory_range( 289 desc_chain.memory(), 290 vm_memory::GuestAddress(rbase), 291 page_size, 292 libc::MADV_WILLNEED, 293 )?; 294 } 295 _ => return Err(Error::InvalidQueueIndex(queue_index)), 296 } 297 } 298 299 self.queues[queue_index] 300 .add_used(desc_chain.memory(), desc_chain.head_index(), desc.len()) 301 .map_err(Error::QueueAddUsed)?; 302 used_descs = true; 303 } 304 305 if used_descs { 306 self.signal(VirtioInterruptType::Queue(queue_index as u16)) 307 } else { 308 Ok(()) 309 } 310 } 311 312 fn process_reporting_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 313 let mut used_descs = false; 314 while let Some(mut desc_chain) = 315 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 316 { 317 let mut descs_len = 0; 318 while let Some(desc) = desc_chain.next() { 319 descs_len += desc.len(); 320 Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?; 321 } 322 323 self.queues[queue_index] 324 .add_used(desc_chain.memory(), desc_chain.head_index(), descs_len) 325 .map_err(Error::QueueAddUsed)?; 326 used_descs = true; 327 } 328 329 if used_descs { 330 self.signal(VirtioInterruptType::Queue(queue_index as u16)) 331 } else { 332 Ok(()) 333 } 334 } 335 336 fn run( 337 &mut self, 338 paused: Arc<AtomicBool>, 339 paused_sync: Arc<Barrier>, 340 ) -> result::Result<(), EpollHelperError> { 341 let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?; 342 helper.add_event(self.inflate_queue_evt.as_raw_fd(), INFLATE_QUEUE_EVENT)?; 343 helper.add_event(self.deflate_queue_evt.as_raw_fd(), DEFLATE_QUEUE_EVENT)?; 344 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 345 helper.add_event(reporting_queue_evt.as_raw_fd(), REPORTING_QUEUE_EVENT)?; 346 } 347 helper.run(paused, paused_sync, self)?; 348 349 Ok(()) 350 } 351 } 352 353 impl EpollHelperHandler for BalloonEpollHandler { 354 fn handle_event( 355 &mut self, 356 _helper: &mut EpollHelper, 357 event: &epoll::Event, 358 ) -> result::Result<(), EpollHelperError> { 359 let ev_type = event.data as u16; 360 match ev_type { 361 INFLATE_QUEUE_EVENT => { 362 self.inflate_queue_evt.read().map_err(|e| { 363 EpollHelperError::HandleEvent(anyhow!( 364 "Failed to get inflate queue event: {:?}", 365 e 366 )) 367 })?; 368 self.process_queue(0).map_err(|e| { 369 EpollHelperError::HandleEvent(anyhow!( 370 "Failed to signal used inflate queue: {:?}", 371 e 372 )) 373 })?; 374 } 375 DEFLATE_QUEUE_EVENT => { 376 self.deflate_queue_evt.read().map_err(|e| { 377 EpollHelperError::HandleEvent(anyhow!( 378 "Failed to get deflate queue event: {:?}", 379 e 380 )) 381 })?; 382 self.process_queue(1).map_err(|e| { 383 EpollHelperError::HandleEvent(anyhow!( 384 "Failed to signal used deflate queue: {:?}", 385 e 386 )) 387 })?; 388 } 389 REPORTING_QUEUE_EVENT => { 390 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 391 reporting_queue_evt.read().map_err(|e| { 392 EpollHelperError::HandleEvent(anyhow!( 393 "Failed to get reporting queue event: {:?}", 394 e 395 )) 396 })?; 397 self.process_reporting_queue(2).map_err(|e| { 398 EpollHelperError::HandleEvent(anyhow!( 399 "Failed to signal used inflate queue: {:?}", 400 e 401 )) 402 })?; 403 } else { 404 return Err(EpollHelperError::HandleEvent(anyhow!( 405 "Invalid reporting queue event as no eventfd registered" 406 ))); 407 } 408 } 409 _ => { 410 return Err(EpollHelperError::HandleEvent(anyhow!( 411 "Unknown event for virtio-balloon" 412 ))); 413 } 414 } 415 416 Ok(()) 417 } 418 } 419 420 #[derive(Serialize, Deserialize)] 421 pub struct BalloonState { 422 pub avail_features: u64, 423 pub acked_features: u64, 424 pub config: VirtioBalloonConfig, 425 } 426 427 // Virtio device for exposing entropy to the guest OS through virtio. 428 pub struct Balloon { 429 common: VirtioCommon, 430 id: String, 431 config: VirtioBalloonConfig, 432 seccomp_action: SeccompAction, 433 exit_evt: EventFd, 434 interrupt_cb: Option<Arc<dyn VirtioInterrupt>>, 435 } 436 437 impl Balloon { 438 // Create a new virtio-balloon. 439 pub fn new( 440 id: String, 441 size: u64, 442 deflate_on_oom: bool, 443 free_page_reporting: bool, 444 seccomp_action: SeccompAction, 445 exit_evt: EventFd, 446 state: Option<BalloonState>, 447 ) -> io::Result<Self> { 448 let mut queue_sizes = vec![QUEUE_SIZE; MIN_NUM_QUEUES]; 449 450 let (avail_features, acked_features, config, paused) = if let Some(state) = state { 451 info!("Restoring virtio-balloon {}", id); 452 ( 453 state.avail_features, 454 state.acked_features, 455 state.config, 456 true, 457 ) 458 } else { 459 let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; 460 if deflate_on_oom { 461 avail_features |= 1u64 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM; 462 } 463 if free_page_reporting { 464 avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; 465 } 466 467 let config = VirtioBalloonConfig { 468 num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32, 469 ..Default::default() 470 }; 471 472 (avail_features, 0, config, false) 473 }; 474 475 if free_page_reporting { 476 queue_sizes.push(REPORTING_QUEUE_SIZE); 477 } 478 479 Ok(Balloon { 480 common: VirtioCommon { 481 device_type: VirtioDeviceType::Balloon as u32, 482 avail_features, 483 acked_features, 484 paused_sync: Some(Arc::new(Barrier::new(2))), 485 queue_sizes, 486 min_queues: MIN_NUM_QUEUES as u16, 487 paused: Arc::new(AtomicBool::new(paused)), 488 ..Default::default() 489 }, 490 id, 491 config, 492 seccomp_action, 493 exit_evt, 494 interrupt_cb: None, 495 }) 496 } 497 498 pub fn resize(&mut self, size: u64) -> Result<(), Error> { 499 self.config.num_pages = (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32; 500 501 if let Some(interrupt_cb) = &self.interrupt_cb { 502 interrupt_cb 503 .trigger(VirtioInterruptType::Config) 504 .map_err(Error::FailedSignal) 505 } else { 506 Ok(()) 507 } 508 } 509 510 // Get the actual size of the virtio-balloon. 511 pub fn get_actual(&self) -> u64 { 512 (self.config.actual as u64) << VIRTIO_BALLOON_PFN_SHIFT 513 } 514 515 fn state(&self) -> BalloonState { 516 BalloonState { 517 avail_features: self.common.avail_features, 518 acked_features: self.common.acked_features, 519 config: self.config, 520 } 521 } 522 523 #[cfg(fuzzing)] 524 pub fn wait_for_epoll_threads(&mut self) { 525 self.common.wait_for_epoll_threads(); 526 } 527 } 528 529 impl Drop for Balloon { 530 fn drop(&mut self) { 531 if let Some(kill_evt) = self.common.kill_evt.take() { 532 // Ignore the result because there is nothing we can do about it. 533 let _ = kill_evt.write(1); 534 } 535 self.common.wait_for_epoll_threads(); 536 } 537 } 538 539 impl VirtioDevice for Balloon { 540 fn device_type(&self) -> u32 { 541 self.common.device_type 542 } 543 544 fn queue_max_sizes(&self) -> &[u16] { 545 &self.common.queue_sizes 546 } 547 548 fn features(&self) -> u64 { 549 self.common.avail_features 550 } 551 552 fn ack_features(&mut self, value: u64) { 553 self.common.ack_features(value) 554 } 555 556 fn read_config(&self, offset: u64, data: &mut [u8]) { 557 self.read_config_from_slice(self.config.as_slice(), offset, data); 558 } 559 560 fn write_config(&mut self, offset: u64, data: &[u8]) { 561 // The "actual" field is the only mutable field 562 if offset != CONFIG_ACTUAL_OFFSET || data.len() != CONFIG_ACTUAL_SIZE { 563 error!( 564 "Attempt to write to read-only field: offset {:x} length {}", 565 offset, 566 data.len() 567 ); 568 return; 569 } 570 571 let config = self.config.as_mut_slice(); 572 let config_len = config.len() as u64; 573 let data_len = data.len() as u64; 574 if offset + data_len > config_len { 575 error!( 576 "Out-of-bound access to configuration: config_len = {} offset = {:x} length = {} for {}", 577 config_len, 578 offset, 579 data_len, 580 self.device_type() 581 ); 582 return; 583 } 584 585 if let Some(end) = offset.checked_add(config.len() as u64) { 586 let mut offset_config = 587 &mut config[offset as usize..std::cmp::min(end, config_len) as usize]; 588 offset_config.write_all(data).unwrap(); 589 } 590 } 591 592 fn activate( 593 &mut self, 594 mem: GuestMemoryAtomic<GuestMemoryMmap>, 595 interrupt_cb: Arc<dyn VirtioInterrupt>, 596 mut queues: Vec<(usize, Queue, EventFd)>, 597 ) -> ActivateResult { 598 self.common.activate(&queues, &interrupt_cb)?; 599 let (kill_evt, pause_evt) = self.common.dup_eventfds(); 600 601 let mut virtqueues = Vec::new(); 602 let (_, queue, queue_evt) = queues.remove(0); 603 virtqueues.push(queue); 604 let inflate_queue_evt = queue_evt; 605 let (_, queue, queue_evt) = queues.remove(0); 606 virtqueues.push(queue); 607 let deflate_queue_evt = queue_evt; 608 let reporting_queue_evt = 609 if self.common.feature_acked(VIRTIO_BALLOON_F_REPORTING) && !queues.is_empty() { 610 let (_, queue, queue_evt) = queues.remove(0); 611 virtqueues.push(queue); 612 Some(queue_evt) 613 } else { 614 None 615 }; 616 617 self.interrupt_cb = Some(interrupt_cb.clone()); 618 619 let mut handler = BalloonEpollHandler { 620 mem, 621 queues: virtqueues, 622 interrupt_cb, 623 inflate_queue_evt, 624 deflate_queue_evt, 625 reporting_queue_evt, 626 kill_evt, 627 pause_evt, 628 pbp: None, 629 }; 630 631 let paused = self.common.paused.clone(); 632 let paused_sync = self.common.paused_sync.clone(); 633 let mut epoll_threads = Vec::new(); 634 635 spawn_virtio_thread( 636 &self.id, 637 &self.seccomp_action, 638 Thread::VirtioBalloon, 639 &mut epoll_threads, 640 &self.exit_evt, 641 move || handler.run(paused, paused_sync.unwrap()), 642 )?; 643 self.common.epoll_threads = Some(epoll_threads); 644 645 event!("virtio-device", "activated", "id", &self.id); 646 Ok(()) 647 } 648 649 fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> { 650 let result = self.common.reset(); 651 event!("virtio-device", "reset", "id", &self.id); 652 result 653 } 654 } 655 656 impl Pausable for Balloon { 657 fn pause(&mut self) -> result::Result<(), MigratableError> { 658 self.common.pause() 659 } 660 661 fn resume(&mut self) -> result::Result<(), MigratableError> { 662 self.common.resume() 663 } 664 } 665 666 impl Snapshottable for Balloon { 667 fn id(&self) -> String { 668 self.id.clone() 669 } 670 671 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 672 Snapshot::new_from_state(&self.state()) 673 } 674 } 675 impl Transportable for Balloon {} 676 impl Migratable for Balloon {} 677