// Copyright (c) 2020 Ant Financial
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::{
    seccomp_filters::Thread, thread_helper::spawn_virtio_thread, ActivateError, ActivateResult,
    EpollHelper, EpollHelperError, EpollHelperHandler, GuestMemoryMmap, VirtioCommon, VirtioDevice,
    VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, EPOLL_HELPER_EVENT_LAST,
    VIRTIO_F_VERSION_1,
};
use libc::EFD_NONBLOCK;
use seccompiler::SeccompAction;
use std::io;
use std::mem::size_of;
use std::os::unix::io::AsRawFd;
use std::result;
use std::sync::{
    atomic::{AtomicBool, AtomicU64, Ordering},
    mpsc, Arc, Barrier, Mutex,
};
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use virtio_queue::Queue;
use vm_memory::{
    Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryAtomic, GuestMemoryError,
    GuestMemoryRegion,
};
use vm_migration::{
    Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
};
use vmm_sys_util::eventfd::EventFd;

// Maximum size advertised for each of the mandatory (inflate and deflate) virtqueues.
const QUEUE_SIZE: u16 = 128;
// Maximum size advertised for the optional free page reporting virtqueue.
const REPORTING_QUEUE_SIZE: u16 = 32;
// Number of virtqueues the device always exposes (inflate and deflate).
const MIN_NUM_QUEUES: usize = 2;

// The identifiers below tag the file descriptors registered with the epoll
// helper; they are numbered right after EPOLL_HELPER_EVENT_LAST.

// Resize event.
const RESIZE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1;
// Inflate virtio queue event.
const INFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2;
// Deflate virtio queue event.
52 const DEFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3; 53 // Reporting virtio queue event. 54 const REPORTING_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 4; 55 56 // Size of a PFN in the balloon interface. 57 const VIRTIO_BALLOON_PFN_SHIFT: u64 = 12; 58 59 // Deflate balloon on OOM 60 const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u64 = 2; 61 // Enable an additional virtqueue to let the guest notify the host about free 62 // pages. 63 const VIRTIO_BALLOON_F_REPORTING: u64 = 5; 64 65 #[derive(Debug)] 66 pub enum Error { 67 // Guest gave us bad memory addresses. 68 GuestMemory(GuestMemoryError), 69 // Guest gave us a write only descriptor that protocol says to read from. 70 UnexpectedWriteOnlyDescriptor, 71 // Guest sent us invalid request. 72 InvalidRequest, 73 // Fallocate fail. 74 FallocateFail(std::io::Error), 75 // Madvise fail. 76 MadviseFail(std::io::Error), 77 // Failed to EventFd write. 78 EventFdWriteFail(std::io::Error), 79 // Failed to EventFd try_clone. 80 EventFdTryCloneFail(std::io::Error), 81 // Failed to MpscRecv. 82 MpscRecvFail(mpsc::RecvError), 83 // Resize invalid argument 84 ResizeInval(String), 85 // Invalid queue index 86 InvalidQueueIndex(usize), 87 // Fail tp signal 88 FailedSignal(io::Error), 89 /// Descriptor chain is too short 90 DescriptorChainTooShort, 91 /// Failed adding used index 92 QueueAddUsed(virtio_queue::Error), 93 /// Failed creating an iterator over the queue 94 QueueIterator(virtio_queue::Error), 95 } 96 97 // Got from include/uapi/linux/virtio_balloon.h 98 #[repr(C)] 99 #[derive(Copy, Clone, Debug, Default, Versionize)] 100 pub struct VirtioBalloonConfig { 101 // Number of pages host wants Guest to give up. 102 num_pages: u32, 103 // Number of pages we've actually got in balloon. 104 actual: u32, 105 } 106 107 const CONFIG_ACTUAL_OFFSET: u64 = 4; 108 const CONFIG_ACTUAL_SIZE: usize = 4; 109 110 // SAFETY: it only has data and has no implicit padding. 
111 unsafe impl ByteValued for VirtioBalloonConfig {} 112 113 struct VirtioBalloonResizeReceiver { 114 size: Arc<AtomicU64>, 115 tx: mpsc::Sender<Result<(), Error>>, 116 evt: EventFd, 117 } 118 119 impl VirtioBalloonResizeReceiver { 120 fn get_size(&self) -> u64 { 121 self.size.load(Ordering::Acquire) 122 } 123 124 fn send(&self, r: Result<(), Error>) -> Result<(), mpsc::SendError<Result<(), Error>>> { 125 self.tx.send(r) 126 } 127 } 128 129 struct VirtioBalloonResize { 130 size: Arc<AtomicU64>, 131 tx: mpsc::Sender<Result<(), Error>>, 132 rx: mpsc::Receiver<Result<(), Error>>, 133 evt: EventFd, 134 } 135 136 impl VirtioBalloonResize { 137 pub fn new(size: u64) -> io::Result<Self> { 138 let (tx, rx) = mpsc::channel(); 139 140 Ok(Self { 141 size: Arc::new(AtomicU64::new(size)), 142 tx, 143 rx, 144 evt: EventFd::new(EFD_NONBLOCK)?, 145 }) 146 } 147 148 pub fn get_receiver(&self) -> Result<VirtioBalloonResizeReceiver, Error> { 149 Ok(VirtioBalloonResizeReceiver { 150 size: self.size.clone(), 151 tx: self.tx.clone(), 152 evt: self.evt.try_clone().map_err(Error::EventFdTryCloneFail)?, 153 }) 154 } 155 156 pub fn work(&self, size: u64) -> Result<(), Error> { 157 self.size.store(size, Ordering::Release); 158 self.evt.write(1).map_err(Error::EventFdWriteFail)?; 159 self.rx.recv().map_err(Error::MpscRecvFail)? 
160 } 161 } 162 163 struct BalloonEpollHandler { 164 config: Arc<Mutex<VirtioBalloonConfig>>, 165 resize_receiver: VirtioBalloonResizeReceiver, 166 queues: Vec<Queue<GuestMemoryAtomic<GuestMemoryMmap>>>, 167 interrupt_cb: Arc<dyn VirtioInterrupt>, 168 inflate_queue_evt: EventFd, 169 deflate_queue_evt: EventFd, 170 reporting_queue_evt: Option<EventFd>, 171 kill_evt: EventFd, 172 pause_evt: EventFd, 173 } 174 175 impl BalloonEpollHandler { 176 fn signal(&self, int_type: VirtioInterruptType) -> result::Result<(), Error> { 177 self.interrupt_cb.trigger(int_type).map_err(|e| { 178 error!("Failed to signal used queue: {:?}", e); 179 Error::FailedSignal(e) 180 }) 181 } 182 183 fn advise_memory_range( 184 memory: &GuestMemoryMmap, 185 range_base: GuestAddress, 186 range_len: usize, 187 advice: libc::c_int, 188 ) -> result::Result<(), Error> { 189 let hva = memory 190 .get_host_address(range_base) 191 .map_err(Error::GuestMemory)?; 192 // Need unsafe to do syscall madvise 193 let res = 194 unsafe { libc::madvise(hva as *mut libc::c_void, range_len as libc::size_t, advice) }; 195 if res != 0 { 196 return Err(Error::MadviseFail(io::Error::last_os_error())); 197 } 198 Ok(()) 199 } 200 201 fn release_memory_range( 202 memory: &GuestMemoryMmap, 203 range_base: GuestAddress, 204 range_len: usize, 205 ) -> result::Result<(), Error> { 206 let region = memory.find_region(range_base).ok_or(Error::GuestMemory( 207 GuestMemoryError::InvalidGuestAddress(range_base), 208 ))?; 209 if let Some(f_off) = region.file_offset() { 210 let offset = range_base.0 - region.start_addr().0; 211 let res = unsafe { 212 libc::fallocate64( 213 f_off.file().as_raw_fd(), 214 libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, 215 (offset as u64 + f_off.start()) as libc::off64_t, 216 range_len as libc::off64_t, 217 ) 218 }; 219 220 if res != 0 { 221 return Err(Error::FallocateFail(io::Error::last_os_error())); 222 } 223 } 224 225 Self::advise_memory_range(memory, range_base, range_len, 
libc::MADV_DONTNEED) 226 } 227 228 fn notify_queue( 229 &mut self, 230 queue_index: usize, 231 used_descs: Vec<(u16, u32)>, 232 ) -> result::Result<(), Error> { 233 for (desc_index, len) in used_descs.iter() { 234 self.queues[queue_index] 235 .add_used(*desc_index, *len) 236 .map_err(Error::QueueAddUsed)?; 237 } 238 239 if !used_descs.is_empty() { 240 self.signal(VirtioInterruptType::Queue(queue_index as u16))?; 241 } 242 243 Ok(()) 244 } 245 246 fn process_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 247 let mut used_descs = Vec::new(); 248 for mut desc_chain in self.queues[queue_index] 249 .iter() 250 .map_err(Error::QueueIterator)? 251 { 252 let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; 253 254 used_descs.push((desc_chain.head_index(), desc.len())); 255 256 let data_chunk_size = size_of::<u32>(); 257 258 // The head contains the request type which MUST be readable. 259 if desc.is_write_only() { 260 error!("The head contains the request type is not right"); 261 return Err(Error::UnexpectedWriteOnlyDescriptor); 262 } 263 if desc.len() as usize % data_chunk_size != 0 { 264 error!("the request size {} is not right", desc.len()); 265 return Err(Error::InvalidRequest); 266 } 267 268 let mut offset = 0u64; 269 while offset < desc.len() as u64 { 270 let addr = desc.addr().checked_add(offset).unwrap(); 271 let pfn: u32 = desc_chain 272 .memory() 273 .read_obj(addr) 274 .map_err(Error::GuestMemory)?; 275 offset += data_chunk_size as u64; 276 277 let range_base = GuestAddress((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 278 let range_len = 1 << VIRTIO_BALLOON_PFN_SHIFT; 279 280 match queue_index { 281 0 => { 282 Self::release_memory_range(desc_chain.memory(), range_base, range_len)?; 283 } 284 1 => { 285 Self::advise_memory_range( 286 desc_chain.memory(), 287 range_base, 288 range_len, 289 libc::MADV_WILLNEED, 290 )?; 291 } 292 _ => return Err(Error::InvalidQueueIndex(queue_index)), 293 } 294 } 295 } 296 297 
self.notify_queue(queue_index, used_descs) 298 } 299 300 fn process_reporting_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 301 let mut used_descs = Vec::new(); 302 303 for mut desc_chain in self.queues[queue_index] 304 .iter() 305 .map_err(Error::QueueIterator)? 306 { 307 let mut descs_len = 0; 308 while let Some(desc) = desc_chain.next() { 309 descs_len += desc.len(); 310 Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?; 311 } 312 313 used_descs.push((desc_chain.head_index(), descs_len)); 314 } 315 316 self.notify_queue(queue_index, used_descs) 317 } 318 319 fn run( 320 &mut self, 321 paused: Arc<AtomicBool>, 322 paused_sync: Arc<Barrier>, 323 ) -> result::Result<(), EpollHelperError> { 324 let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?; 325 helper.add_event(self.resize_receiver.evt.as_raw_fd(), RESIZE_EVENT)?; 326 helper.add_event(self.inflate_queue_evt.as_raw_fd(), INFLATE_QUEUE_EVENT)?; 327 helper.add_event(self.deflate_queue_evt.as_raw_fd(), DEFLATE_QUEUE_EVENT)?; 328 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 329 helper.add_event(reporting_queue_evt.as_raw_fd(), REPORTING_QUEUE_EVENT)?; 330 } 331 helper.run(paused, paused_sync, self)?; 332 333 Ok(()) 334 } 335 } 336 337 impl EpollHelperHandler for BalloonEpollHandler { 338 fn handle_event(&mut self, _helper: &mut EpollHelper, event: &epoll::Event) -> bool { 339 let ev_type = event.data as u16; 340 match ev_type { 341 RESIZE_EVENT => { 342 if let Err(e) = self.resize_receiver.evt.read() { 343 error!("Failed to get resize event: {:?}", e); 344 return true; 345 } 346 let mut signal_error = false; 347 let r = { 348 let mut config = self.config.lock().unwrap(); 349 config.num_pages = 350 (self.resize_receiver.get_size() >> VIRTIO_BALLOON_PFN_SHIFT) as u32; 351 if let Err(e) = self.signal(VirtioInterruptType::Config) { 352 signal_error = true; 353 Err(e) 354 } else { 355 Ok(()) 356 } 357 }; 358 if let Err(e) = 
&r { 359 // This error will send back to resize caller. 360 error!("Handle resize event get error: {:?}", e); 361 } 362 if let Err(e) = self.resize_receiver.send(r) { 363 error!("Sending \"resize\" generated error: {:?}", e); 364 return true; 365 } 366 if signal_error { 367 return true; 368 } 369 } 370 INFLATE_QUEUE_EVENT => { 371 if let Err(e) = self.inflate_queue_evt.read() { 372 error!("Failed to get inflate queue event: {:?}", e); 373 return true; 374 } else if let Err(e) = self.process_queue(0) { 375 error!("Failed to signal used inflate queue: {:?}", e); 376 return true; 377 } 378 } 379 DEFLATE_QUEUE_EVENT => { 380 if let Err(e) = self.deflate_queue_evt.read() { 381 error!("Failed to get deflate queue event: {:?}", e); 382 return true; 383 } else if let Err(e) = self.process_queue(1) { 384 error!("Failed to signal used deflate queue: {:?}", e); 385 return true; 386 } 387 } 388 REPORTING_QUEUE_EVENT => { 389 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 390 if let Err(e) = reporting_queue_evt.read() { 391 error!("Failed to get reporting queue event: {:?}", e); 392 return true; 393 } else if let Err(e) = self.process_reporting_queue(2) { 394 error!("Failed to signal used inflate queue: {:?}", e); 395 return true; 396 } 397 } else { 398 error!("Invalid reporting queue event as no eventfd registered"); 399 return true; 400 } 401 } 402 _ => { 403 error!("Unknown event for virtio-balloon"); 404 return true; 405 } 406 } 407 408 false 409 } 410 } 411 412 #[derive(Versionize)] 413 pub struct BalloonState { 414 pub avail_features: u64, 415 pub acked_features: u64, 416 pub config: VirtioBalloonConfig, 417 } 418 419 impl VersionMapped for BalloonState {} 420 421 // Virtio device for exposing entropy to the guest OS through virtio. 
422 pub struct Balloon { 423 common: VirtioCommon, 424 id: String, 425 resize: VirtioBalloonResize, 426 config: Arc<Mutex<VirtioBalloonConfig>>, 427 seccomp_action: SeccompAction, 428 exit_evt: EventFd, 429 } 430 431 impl Balloon { 432 // Create a new virtio-balloon. 433 pub fn new( 434 id: String, 435 size: u64, 436 deflate_on_oom: bool, 437 free_page_reporting: bool, 438 seccomp_action: SeccompAction, 439 exit_evt: EventFd, 440 ) -> io::Result<Self> { 441 let mut queue_sizes = vec![QUEUE_SIZE; MIN_NUM_QUEUES]; 442 let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; 443 if deflate_on_oom { 444 avail_features |= 1u64 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM; 445 } 446 if free_page_reporting { 447 avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; 448 queue_sizes.push(REPORTING_QUEUE_SIZE); 449 } 450 451 let config = VirtioBalloonConfig { 452 num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32, 453 ..Default::default() 454 }; 455 456 Ok(Balloon { 457 common: VirtioCommon { 458 device_type: VirtioDeviceType::Balloon as u32, 459 avail_features, 460 paused_sync: Some(Arc::new(Barrier::new(2))), 461 queue_sizes, 462 min_queues: MIN_NUM_QUEUES as u16, 463 ..Default::default() 464 }, 465 id, 466 resize: VirtioBalloonResize::new(size)?, 467 config: Arc::new(Mutex::new(config)), 468 seccomp_action, 469 exit_evt, 470 }) 471 } 472 473 pub fn resize(&self, size: u64) -> Result<(), Error> { 474 self.resize.work(size) 475 } 476 477 // Get the actual size of the virtio-balloon. 
478 pub fn get_actual(&self) -> u64 { 479 (self.config.lock().unwrap().actual as u64) << VIRTIO_BALLOON_PFN_SHIFT 480 } 481 482 fn state(&self) -> BalloonState { 483 BalloonState { 484 avail_features: self.common.avail_features, 485 acked_features: self.common.acked_features, 486 config: *(self.config.lock().unwrap()), 487 } 488 } 489 490 fn set_state(&mut self, state: &BalloonState) { 491 self.common.avail_features = state.avail_features; 492 self.common.acked_features = state.acked_features; 493 *(self.config.lock().unwrap()) = state.config; 494 } 495 } 496 497 impl Drop for Balloon { 498 fn drop(&mut self) { 499 if let Some(kill_evt) = self.common.kill_evt.take() { 500 // Ignore the result because there is nothing we can do about it. 501 let _ = kill_evt.write(1); 502 } 503 } 504 } 505 506 impl VirtioDevice for Balloon { 507 fn device_type(&self) -> u32 { 508 self.common.device_type 509 } 510 511 fn queue_max_sizes(&self) -> &[u16] { 512 &self.common.queue_sizes 513 } 514 515 fn features(&self) -> u64 { 516 self.common.avail_features 517 } 518 519 fn ack_features(&mut self, value: u64) { 520 self.common.ack_features(value) 521 } 522 523 fn read_config(&self, offset: u64, data: &mut [u8]) { 524 self.read_config_from_slice(self.config.lock().unwrap().as_slice(), offset, data); 525 } 526 527 fn write_config(&mut self, offset: u64, data: &[u8]) { 528 // The "actual" field is the only mutable field 529 if offset != CONFIG_ACTUAL_OFFSET || data.len() != CONFIG_ACTUAL_SIZE { 530 error!( 531 "Attempt to write to read-only field: offset {:x} length {}", 532 offset, 533 data.len() 534 ); 535 return; 536 } 537 538 self.write_config_helper(self.config.lock().unwrap().as_mut_slice(), offset, data); 539 } 540 541 fn activate( 542 &mut self, 543 _mem: GuestMemoryAtomic<GuestMemoryMmap>, 544 interrupt_cb: Arc<dyn VirtioInterrupt>, 545 queues: Vec<Queue<GuestMemoryAtomic<GuestMemoryMmap>>>, 546 mut queue_evts: Vec<EventFd>, 547 ) -> ActivateResult { 548 
self.common.activate(&queues, &queue_evts, &interrupt_cb)?; 549 let (kill_evt, pause_evt) = self.common.dup_eventfds(); 550 551 let inflate_queue_evt = queue_evts.remove(0); 552 let deflate_queue_evt = queue_evts.remove(0); 553 let reporting_queue_evt = 554 if self.common.feature_acked(VIRTIO_BALLOON_F_REPORTING) && !queue_evts.is_empty() { 555 Some(queue_evts.remove(0)) 556 } else { 557 None 558 }; 559 560 let mut handler = BalloonEpollHandler { 561 config: self.config.clone(), 562 resize_receiver: self.resize.get_receiver().map_err(|e| { 563 error!("failed to clone resize EventFd: {:?}", e); 564 ActivateError::BadActivate 565 })?, 566 queues, 567 interrupt_cb, 568 inflate_queue_evt, 569 deflate_queue_evt, 570 reporting_queue_evt, 571 kill_evt, 572 pause_evt, 573 }; 574 575 let paused = self.common.paused.clone(); 576 let paused_sync = self.common.paused_sync.clone(); 577 let mut epoll_threads = Vec::new(); 578 579 spawn_virtio_thread( 580 &self.id, 581 &self.seccomp_action, 582 Thread::VirtioBalloon, 583 &mut epoll_threads, 584 &self.exit_evt, 585 move || { 586 if let Err(e) = handler.run(paused, paused_sync.unwrap()) { 587 error!("Error running worker: {:?}", e); 588 } 589 }, 590 )?; 591 self.common.epoll_threads = Some(epoll_threads); 592 593 event!("virtio-device", "activated", "id", &self.id); 594 Ok(()) 595 } 596 597 fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> { 598 let result = self.common.reset(); 599 event!("virtio-device", "reset", "id", &self.id); 600 result 601 } 602 } 603 604 impl Pausable for Balloon { 605 fn pause(&mut self) -> result::Result<(), MigratableError> { 606 self.common.pause() 607 } 608 609 fn resume(&mut self) -> result::Result<(), MigratableError> { 610 self.common.resume() 611 } 612 } 613 614 impl Snapshottable for Balloon { 615 fn id(&self) -> String { 616 self.id.clone() 617 } 618 619 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 620 Snapshot::new_from_versioned_state(&self.id(), 
&self.state()) 621 } 622 623 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 624 self.set_state(&snapshot.to_versioned_state(&self.id)?); 625 Ok(()) 626 } 627 } 628 impl Transportable for Balloon {} 629 impl Migratable for Balloon {} 630