1 // Copyright (c) 2020 Ant Financial 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 use crate::{ 16 seccomp_filters::Thread, thread_helper::spawn_virtio_thread, ActivateResult, EpollHelper, 17 EpollHelperError, EpollHelperHandler, GuestMemoryMmap, VirtioCommon, VirtioDevice, 18 VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, EPOLL_HELPER_EVENT_LAST, 19 VIRTIO_F_VERSION_1, 20 }; 21 use anyhow::anyhow; 22 use seccompiler::SeccompAction; 23 use std::io::{self, Write}; 24 use std::mem::size_of; 25 use std::os::unix::io::AsRawFd; 26 use std::result; 27 use std::sync::{atomic::AtomicBool, Arc, Barrier}; 28 use thiserror::Error; 29 use versionize::{VersionMap, Versionize, VersionizeResult}; 30 use versionize_derive::Versionize; 31 use virtio_queue::{Queue, QueueT}; 32 use vm_memory::{ 33 Address, ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic, 34 GuestMemoryError, GuestMemoryRegion, 35 }; 36 use vm_migration::{ 37 Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped, 38 }; 39 use vmm_sys_util::eventfd::EventFd; 40 41 const QUEUE_SIZE: u16 = 128; 42 const REPORTING_QUEUE_SIZE: u16 = 32; 43 const MIN_NUM_QUEUES: usize = 2; 44 45 // Inflate virtio queue event. 46 const INFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1; 47 // Deflate virtio queue event. 48 const DEFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2; 49 // Reporting virtio queue event. 50 const REPORTING_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3; 51 52 // Size of a PFN in the balloon interface. 53 const VIRTIO_BALLOON_PFN_SHIFT: u64 = 12; 54 55 // Deflate balloon on OOM 56 const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u64 = 2; 57 // Enable an additional virtqueue to let the guest notify the host about free 58 // pages. 59 const VIRTIO_BALLOON_F_REPORTING: u64 = 5; 60 61 #[derive(Error, Debug)] 62 pub enum Error { 63 #[error("Guest gave us bad memory addresses.: {0}")] 64 GuestMemory(GuestMemoryError), 65 #[error("Guest gave us a write only descriptor that protocol says to read from.")] 66 UnexpectedWriteOnlyDescriptor, 67 #[error("Guest sent us invalid request.")] 68 InvalidRequest, 69 #[error("Fallocate fail.: {0}")] 70 FallocateFail(std::io::Error), 71 #[error("Madvise fail.: {0}")] 72 MadviseFail(std::io::Error), 73 #[error("Failed to EventFd write.: {0}")] 74 EventFdWriteFail(std::io::Error), 75 #[error("Invalid queue index: {0}")] 76 InvalidQueueIndex(usize), 77 #[error("Fail tp signal: {0}")] 78 FailedSignal(io::Error), 79 #[error("Descriptor chain is too short")] 80 DescriptorChainTooShort, 81 #[error("Failed adding used index: {0}")] 82 QueueAddUsed(virtio_queue::Error), 83 #[error("Failed creating an iterator over the queue: {0}")] 84 QueueIterator(virtio_queue::Error), 85 } 86 87 // Got from include/uapi/linux/virtio_balloon.h 88 #[repr(C)] 89 #[derive(Copy, Clone, Debug, Default, Versionize)] 90 pub struct VirtioBalloonConfig { 91 // Number of pages host wants Guest to give up. 92 num_pages: u32, 93 // Number of pages we've actually got in balloon. 94 actual: u32, 95 } 96 97 const CONFIG_ACTUAL_OFFSET: u64 = 4; 98 const CONFIG_ACTUAL_SIZE: usize = 4; 99 100 // SAFETY: it only has data and has no implicit padding. 101 unsafe impl ByteValued for VirtioBalloonConfig {} 102 103 struct BalloonEpollHandler { 104 mem: GuestMemoryAtomic<GuestMemoryMmap>, 105 queues: Vec<Queue>, 106 interrupt_cb: Arc<dyn VirtioInterrupt>, 107 inflate_queue_evt: EventFd, 108 deflate_queue_evt: EventFd, 109 reporting_queue_evt: Option<EventFd>, 110 kill_evt: EventFd, 111 pause_evt: EventFd, 112 } 113 114 impl BalloonEpollHandler { 115 fn signal(&self, int_type: VirtioInterruptType) -> result::Result<(), Error> { 116 self.interrupt_cb.trigger(int_type).map_err(|e| { 117 error!("Failed to signal used queue: {:?}", e); 118 Error::FailedSignal(e) 119 }) 120 } 121 122 fn advise_memory_range( 123 memory: &GuestMemoryMmap, 124 range_base: GuestAddress, 125 range_len: usize, 126 advice: libc::c_int, 127 ) -> result::Result<(), Error> { 128 let hva = memory 129 .get_host_address(range_base) 130 .map_err(Error::GuestMemory)?; 131 // Need unsafe to do syscall madvise 132 let res = 133 unsafe { libc::madvise(hva as *mut libc::c_void, range_len as libc::size_t, advice) }; 134 if res != 0 { 135 return Err(Error::MadviseFail(io::Error::last_os_error())); 136 } 137 Ok(()) 138 } 139 140 fn release_memory_range( 141 memory: &GuestMemoryMmap, 142 range_base: GuestAddress, 143 range_len: usize, 144 ) -> result::Result<(), Error> { 145 let region = memory.find_region(range_base).ok_or(Error::GuestMemory( 146 GuestMemoryError::InvalidGuestAddress(range_base), 147 ))?; 148 if let Some(f_off) = region.file_offset() { 149 let offset = range_base.0 - region.start_addr().0; 150 let res = unsafe { 151 libc::fallocate64( 152 f_off.file().as_raw_fd(), 153 libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, 154 (offset as u64 + f_off.start()) as libc::off64_t, 155 range_len as libc::off64_t, 156 ) 157 }; 158 159 if res != 0 { 160 return Err(Error::FallocateFail(io::Error::last_os_error())); 161 } 162 } 163 164 Self::advise_memory_range(memory, range_base, range_len, libc::MADV_DONTNEED) 165 } 166 167 fn process_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 168 let mut used_descs = false; 169 while let Some(mut desc_chain) = 170 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 171 { 172 let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?; 173 174 let data_chunk_size = size_of::<u32>(); 175 176 // The head contains the request type which MUST be readable. 177 if desc.is_write_only() { 178 error!("The head contains the request type is not right"); 179 return Err(Error::UnexpectedWriteOnlyDescriptor); 180 } 181 if desc.len() as usize % data_chunk_size != 0 { 182 error!("the request size {} is not right", desc.len()); 183 return Err(Error::InvalidRequest); 184 } 185 186 let mut offset = 0u64; 187 while offset < desc.len() as u64 { 188 let addr = desc.addr().checked_add(offset).unwrap(); 189 let pfn: u32 = desc_chain 190 .memory() 191 .read_obj(addr) 192 .map_err(Error::GuestMemory)?; 193 offset += data_chunk_size as u64; 194 195 let range_base = GuestAddress((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT); 196 let range_len = 1 << VIRTIO_BALLOON_PFN_SHIFT; 197 198 match queue_index { 199 0 => { 200 Self::release_memory_range(desc_chain.memory(), range_base, range_len)?; 201 } 202 1 => { 203 Self::advise_memory_range( 204 desc_chain.memory(), 205 range_base, 206 range_len, 207 libc::MADV_WILLNEED, 208 )?; 209 } 210 _ => return Err(Error::InvalidQueueIndex(queue_index)), 211 } 212 } 213 214 self.queues[queue_index] 215 .add_used(desc_chain.memory(), desc_chain.head_index(), desc.len()) 216 .map_err(Error::QueueAddUsed)?; 217 used_descs = true; 218 } 219 220 if used_descs { 221 self.signal(VirtioInterruptType::Queue(queue_index as u16)) 222 } else { 223 Ok(()) 224 } 225 } 226 227 fn process_reporting_queue(&mut self, queue_index: usize) -> result::Result<(), Error> { 228 let mut used_descs = false; 229 while let Some(mut desc_chain) = 230 self.queues[queue_index].pop_descriptor_chain(self.mem.memory()) 231 { 232 let mut descs_len = 0; 233 while let Some(desc) = desc_chain.next() { 234 descs_len += desc.len(); 235 Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?; 236 } 237 238 self.queues[queue_index] 239 .add_used(desc_chain.memory(), desc_chain.head_index(), descs_len) 240 .map_err(Error::QueueAddUsed)?; 241 used_descs = true; 242 } 243 244 if used_descs { 245 self.signal(VirtioInterruptType::Queue(queue_index as u16)) 246 } else { 247 Ok(()) 248 } 249 } 250 251 fn run( 252 &mut self, 253 paused: Arc<AtomicBool>, 254 paused_sync: Arc<Barrier>, 255 ) -> result::Result<(), EpollHelperError> { 256 let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?; 257 helper.add_event(self.inflate_queue_evt.as_raw_fd(), INFLATE_QUEUE_EVENT)?; 258 helper.add_event(self.deflate_queue_evt.as_raw_fd(), DEFLATE_QUEUE_EVENT)?; 259 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 260 helper.add_event(reporting_queue_evt.as_raw_fd(), REPORTING_QUEUE_EVENT)?; 261 } 262 helper.run(paused, paused_sync, self)?; 263 264 Ok(()) 265 } 266 } 267 268 impl EpollHelperHandler for BalloonEpollHandler { 269 fn handle_event( 270 &mut self, 271 _helper: &mut EpollHelper, 272 event: &epoll::Event, 273 ) -> result::Result<(), EpollHelperError> { 274 let ev_type = event.data as u16; 275 match ev_type { 276 INFLATE_QUEUE_EVENT => { 277 self.inflate_queue_evt.read().map_err(|e| { 278 EpollHelperError::HandleEvent(anyhow!( 279 "Failed to get inflate queue event: {:?}", 280 e 281 )) 282 })?; 283 self.process_queue(0).map_err(|e| { 284 EpollHelperError::HandleEvent(anyhow!( 285 "Failed to signal used inflate queue: {:?}", 286 e 287 )) 288 })?; 289 } 290 DEFLATE_QUEUE_EVENT => { 291 self.deflate_queue_evt.read().map_err(|e| { 292 EpollHelperError::HandleEvent(anyhow!( 293 "Failed to get deflate queue event: {:?}", 294 e 295 )) 296 })?; 297 self.process_queue(1).map_err(|e| { 298 EpollHelperError::HandleEvent(anyhow!( 299 "Failed to signal used deflate queue: {:?}", 300 e 301 )) 302 })?; 303 } 304 REPORTING_QUEUE_EVENT => { 305 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() { 306 reporting_queue_evt.read().map_err(|e| { 307 EpollHelperError::HandleEvent(anyhow!( 308 "Failed to get reporting queue event: {:?}", 309 e 310 )) 311 })?; 312 self.process_reporting_queue(2).map_err(|e| { 313 EpollHelperError::HandleEvent(anyhow!( 314 "Failed to signal used inflate queue: {:?}", 315 e 316 )) 317 })?; 318 } else { 319 return Err(EpollHelperError::HandleEvent(anyhow!( 320 "Invalid reporting queue event as no eventfd registered" 321 ))); 322 } 323 } 324 _ => { 325 return Err(EpollHelperError::HandleEvent(anyhow!( 326 "Unknown event for virtio-balloon" 327 ))); 328 } 329 } 330 331 Ok(()) 332 } 333 } 334 335 #[derive(Versionize)] 336 pub struct BalloonState { 337 pub avail_features: u64, 338 pub acked_features: u64, 339 pub config: VirtioBalloonConfig, 340 } 341 342 impl VersionMapped for BalloonState {} 343 344 // Virtio device for exposing entropy to the guest OS through virtio. 345 pub struct Balloon { 346 common: VirtioCommon, 347 id: String, 348 config: VirtioBalloonConfig, 349 seccomp_action: SeccompAction, 350 exit_evt: EventFd, 351 interrupt_cb: Option<Arc<dyn VirtioInterrupt>>, 352 } 353 354 impl Balloon { 355 // Create a new virtio-balloon. 356 pub fn new( 357 id: String, 358 size: u64, 359 deflate_on_oom: bool, 360 free_page_reporting: bool, 361 seccomp_action: SeccompAction, 362 exit_evt: EventFd, 363 ) -> io::Result<Self> { 364 let mut queue_sizes = vec![QUEUE_SIZE; MIN_NUM_QUEUES]; 365 let mut avail_features = 1u64 << VIRTIO_F_VERSION_1; 366 if deflate_on_oom { 367 avail_features |= 1u64 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM; 368 } 369 if free_page_reporting { 370 avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; 371 queue_sizes.push(REPORTING_QUEUE_SIZE); 372 } 373 374 let config = VirtioBalloonConfig { 375 num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32, 376 ..Default::default() 377 }; 378 379 Ok(Balloon { 380 common: VirtioCommon { 381 device_type: VirtioDeviceType::Balloon as u32, 382 avail_features, 383 paused_sync: Some(Arc::new(Barrier::new(2))), 384 queue_sizes, 385 min_queues: MIN_NUM_QUEUES as u16, 386 ..Default::default() 387 }, 388 id, 389 config, 390 seccomp_action, 391 exit_evt, 392 interrupt_cb: None, 393 }) 394 } 395 396 pub fn resize(&mut self, size: u64) -> Result<(), Error> { 397 self.config.num_pages = (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32; 398 399 if let Some(interrupt_cb) = &self.interrupt_cb { 400 interrupt_cb 401 .trigger(VirtioInterruptType::Config) 402 .map_err(Error::FailedSignal) 403 } else { 404 Ok(()) 405 } 406 } 407 408 // Get the actual size of the virtio-balloon. 409 pub fn get_actual(&self) -> u64 { 410 (self.config.actual as u64) << VIRTIO_BALLOON_PFN_SHIFT 411 } 412 413 fn state(&self) -> BalloonState { 414 BalloonState { 415 avail_features: self.common.avail_features, 416 acked_features: self.common.acked_features, 417 config: self.config, 418 } 419 } 420 421 fn set_state(&mut self, state: &BalloonState) { 422 self.common.avail_features = state.avail_features; 423 self.common.acked_features = state.acked_features; 424 self.config = state.config; 425 } 426 427 #[cfg(fuzzing)] 428 pub fn wait_for_epoll_threads(&mut self) { 429 self.common.wait_for_epoll_threads(); 430 } 431 } 432 433 impl Drop for Balloon { 434 fn drop(&mut self) { 435 if let Some(kill_evt) = self.common.kill_evt.take() { 436 // Ignore the result because there is nothing we can do about it. 437 let _ = kill_evt.write(1); 438 } 439 } 440 } 441 442 impl VirtioDevice for Balloon { 443 fn device_type(&self) -> u32 { 444 self.common.device_type 445 } 446 447 fn queue_max_sizes(&self) -> &[u16] { 448 &self.common.queue_sizes 449 } 450 451 fn features(&self) -> u64 { 452 self.common.avail_features 453 } 454 455 fn ack_features(&mut self, value: u64) { 456 self.common.ack_features(value) 457 } 458 459 fn read_config(&self, offset: u64, data: &mut [u8]) { 460 self.read_config_from_slice(self.config.as_slice(), offset, data); 461 } 462 463 fn write_config(&mut self, offset: u64, data: &[u8]) { 464 // The "actual" field is the only mutable field 465 if offset != CONFIG_ACTUAL_OFFSET || data.len() != CONFIG_ACTUAL_SIZE { 466 error!( 467 "Attempt to write to read-only field: offset {:x} length {}", 468 offset, 469 data.len() 470 ); 471 return; 472 } 473 474 let config = self.config.as_mut_slice(); 475 let config_len = config.len() as u64; 476 let data_len = data.len() as u64; 477 if offset + data_len > config_len { 478 error!( 479 "Out-of-bound access to configuration: config_len = {} offset = {:x} length = {} for {}", 480 config_len, 481 offset, 482 data_len, 483 self.device_type() 484 ); 485 return; 486 } 487 488 if let Some(end) = offset.checked_add(config.len() as u64) { 489 let mut offset_config = 490 &mut config[offset as usize..std::cmp::min(end, config_len) as usize]; 491 offset_config.write_all(data).unwrap(); 492 } 493 } 494 495 fn activate( 496 &mut self, 497 mem: GuestMemoryAtomic<GuestMemoryMmap>, 498 interrupt_cb: Arc<dyn VirtioInterrupt>, 499 mut queues: Vec<(usize, Queue, EventFd)>, 500 ) -> ActivateResult { 501 self.common.activate(&queues, &interrupt_cb)?; 502 let (kill_evt, pause_evt) = self.common.dup_eventfds(); 503 504 let mut virtqueues = Vec::new(); 505 let (_, queue, queue_evt) = queues.remove(0); 506 virtqueues.push(queue); 507 let inflate_queue_evt = queue_evt; 508 let (_, queue, queue_evt) = queues.remove(0); 509 virtqueues.push(queue); 510 let deflate_queue_evt = queue_evt; 511 let reporting_queue_evt = 512 if self.common.feature_acked(VIRTIO_BALLOON_F_REPORTING) && !queues.is_empty() { 513 let (_, queue, queue_evt) = queues.remove(0); 514 virtqueues.push(queue); 515 Some(queue_evt) 516 } else { 517 None 518 }; 519 520 self.interrupt_cb = Some(interrupt_cb.clone()); 521 522 let mut handler = BalloonEpollHandler { 523 mem, 524 queues: virtqueues, 525 interrupt_cb, 526 inflate_queue_evt, 527 deflate_queue_evt, 528 reporting_queue_evt, 529 kill_evt, 530 pause_evt, 531 }; 532 533 let paused = self.common.paused.clone(); 534 let paused_sync = self.common.paused_sync.clone(); 535 let mut epoll_threads = Vec::new(); 536 537 spawn_virtio_thread( 538 &self.id, 539 &self.seccomp_action, 540 Thread::VirtioBalloon, 541 &mut epoll_threads, 542 &self.exit_evt, 543 move || handler.run(paused, paused_sync.unwrap()), 544 )?; 545 self.common.epoll_threads = Some(epoll_threads); 546 547 event!("virtio-device", "activated", "id", &self.id); 548 Ok(()) 549 } 550 551 fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> { 552 let result = self.common.reset(); 553 event!("virtio-device", "reset", "id", &self.id); 554 result 555 } 556 } 557 558 impl Pausable for Balloon { 559 fn pause(&mut self) -> result::Result<(), MigratableError> { 560 self.common.pause() 561 } 562 563 fn resume(&mut self) -> result::Result<(), MigratableError> { 564 self.common.resume() 565 } 566 } 567 568 impl Snapshottable for Balloon { 569 fn id(&self) -> String { 570 self.id.clone() 571 } 572 573 fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> { 574 Snapshot::new_from_versioned_state(&self.id(), &self.state()) 575 } 576 577 fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> { 578 self.set_state(&snapshot.to_versioned_state(&self.id)?); 579 Ok(()) 580 } 581 } 582 impl Transportable for Balloon {} 583 impl Migratable for Balloon {} 584