xref: /cloud-hypervisor/virtio-devices/src/balloon.rs (revision 7d7bfb2034001d4cb15df2ddc56d2d350c8da30f)
1 // Copyright (c) 2020 Ant Financial
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 use crate::{
16     seccomp_filters::Thread, thread_helper::spawn_virtio_thread, ActivateError, ActivateResult,
17     EpollHelper, EpollHelperError, EpollHelperHandler, GuestMemoryMmap, VirtioCommon, VirtioDevice,
18     VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, EPOLL_HELPER_EVENT_LAST,
19     VIRTIO_F_VERSION_1,
20 };
21 use libc::EFD_NONBLOCK;
22 use seccompiler::SeccompAction;
23 use std::io;
24 use std::mem::size_of;
25 use std::os::unix::io::AsRawFd;
26 use std::result;
27 use std::sync::{
28     atomic::{AtomicBool, AtomicU64, Ordering},
29     mpsc, Arc, Barrier, Mutex,
30 };
31 use versionize::{VersionMap, Versionize, VersionizeResult};
32 use versionize_derive::Versionize;
33 use virtio_queue::Queue;
34 use vm_memory::{
35     Address, ByteValued, Bytes, GuestAddress, GuestMemory, GuestMemoryAtomic, GuestMemoryError,
36     GuestMemoryRegion,
37 };
38 use vm_migration::{
39     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
40 };
41 use vmm_sys_util::eventfd::EventFd;
42 
// Maximum size of the inflate and deflate virtqueues.
const QUEUE_SIZE: u16 = 128;
// Maximum size of the optional free page reporting virtqueue.
const REPORTING_QUEUE_SIZE: u16 = 32;
// Queues that are always present: inflate and deflate.
const MIN_NUM_QUEUES: usize = 2;

// Epoll event identifiers handled by the balloon worker thread. They start
// right after the identifiers reserved by EpollHelper.
// Resize event.
const RESIZE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1;
// Inflate virtio queue event.
const INFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2;
// Deflate virtio queue event.
const DEFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3;
// Reporting virtio queue event.
const REPORTING_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 4;

// Shift converting between a balloon page frame number and a byte address:
// balloon pages are `1 << VIRTIO_BALLOON_PFN_SHIFT` (4096) bytes.
const VIRTIO_BALLOON_PFN_SHIFT: u64 = 12;

// Feature bit positions (used as `1 << BIT` when building the feature mask).
// Deflate balloon on OOM
const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u64 = 2;
// Enable an additional virtqueue to let the guest notify the host about free
// pages.
const VIRTIO_BALLOON_F_REPORTING: u64 = 5;
64 
/// Errors produced by the virtio-balloon device and its worker thread.
#[derive(Debug)]
pub enum Error {
    /// Guest gave us bad memory addresses.
    GuestMemory(GuestMemoryError),
    /// Guest gave us a write-only descriptor where the protocol requires a
    /// readable one.
    UnexpectedWriteOnlyDescriptor,
    /// Guest sent us an invalid request.
    InvalidRequest,
    /// `fallocate` failed.
    FallocateFail(std::io::Error),
    /// `madvise` failed.
    MadviseFail(std::io::Error),
    /// Failed to write to an EventFd.
    EventFdWriteFail(std::io::Error),
    /// Failed to `try_clone` an EventFd.
    EventFdTryCloneFail(std::io::Error),
    /// Failed to receive the resize result over the mpsc channel.
    MpscRecvFail(mpsc::RecvError),
    /// Resize was called with an invalid argument.
    ResizeInval(String),
    /// Invalid queue index.
    InvalidQueueIndex(usize),
    /// Failed to signal the guest.
    FailedSignal(io::Error),
    /// Descriptor chain is too short
    DescriptorChainTooShort,
    /// Failed adding used index
    QueueAddUsed(virtio_queue::Error),
    /// Failed creating an iterator over the queue
    QueueIterator(virtio_queue::Error),
}
96 
// Device configuration space layout.
// Got from include/uapi/linux/virtio_balloon.h
#[repr(C)]
#[derive(Copy, Clone, Debug, Default, Versionize)]
pub struct VirtioBalloonConfig {
    // Number of pages host wants Guest to give up.
    // Counted in units of `1 << VIRTIO_BALLOON_PFN_SHIFT` bytes.
    num_pages: u32,
    // Number of pages we've actually got in balloon.
    // This is the only field the guest is allowed to write (see
    // `write_config`).
    actual: u32,
}
106 
// Byte offset of the `actual` field inside `VirtioBalloonConfig`: it follows
// the 4-byte `num_pages` field of the `repr(C)` struct.
const CONFIG_ACTUAL_OFFSET: u64 = 4;
// Size in bytes of the `actual` field (a `u32`).
const CONFIG_ACTUAL_SIZE: usize = 4;

// SAFETY: it only has data and has no implicit padding.
unsafe impl ByteValued for VirtioBalloonConfig {}
112 
// Worker-thread side of the resize channel, produced by
// `VirtioBalloonResize::get_receiver`.
struct VirtioBalloonResizeReceiver {
    // Requested balloon size in bytes, shared with the `VirtioBalloonResize`
    // that created this receiver.
    size: Arc<AtomicU64>,
    // Channel used to send the outcome of a resize back to the requester.
    tx: mpsc::Sender<Result<(), Error>>,
    // Clone of the eventfd the requester writes to announce a new size.
    evt: EventFd,
}
118 
impl VirtioBalloonResizeReceiver {
    // Returns the most recently requested balloon size, in bytes.
    // The `Acquire` load pairs with the `Release` store in
    // `VirtioBalloonResize::work`.
    fn get_size(&self) -> u64 {
        self.size.load(Ordering::Acquire)
    }

    // Sends the outcome of handling a resize request back to the requester.
    // Fails only if the receiving end of the channel is gone.
    fn send(&self, r: Result<(), Error>) -> Result<(), mpsc::SendError<Result<(), Error>>> {
        self.tx.send(r)
    }
}
128 
// Requester side of the resize machinery: publishes a new target size, wakes
// the worker thread through `evt` and waits for its answer on `rx`.
struct VirtioBalloonResize {
    // Requested balloon size in bytes, shared with the receivers.
    size: Arc<AtomicU64>,
    // Sender kept so that it can be cloned into each receiver.
    tx: mpsc::Sender<Result<(), Error>>,
    // Receives the outcome of a resize from the worker thread.
    rx: mpsc::Receiver<Result<(), Error>>,
    // Eventfd written to notify the worker thread of a new size.
    evt: EventFd,
}
135 
136 impl VirtioBalloonResize {
137     pub fn new(size: u64) -> io::Result<Self> {
138         let (tx, rx) = mpsc::channel();
139 
140         Ok(Self {
141             size: Arc::new(AtomicU64::new(size)),
142             tx,
143             rx,
144             evt: EventFd::new(EFD_NONBLOCK)?,
145         })
146     }
147 
148     pub fn get_receiver(&self) -> Result<VirtioBalloonResizeReceiver, Error> {
149         Ok(VirtioBalloonResizeReceiver {
150             size: self.size.clone(),
151             tx: self.tx.clone(),
152             evt: self.evt.try_clone().map_err(Error::EventFdTryCloneFail)?,
153         })
154     }
155 
156     pub fn work(&self, size: u64) -> Result<(), Error> {
157         self.size.store(size, Ordering::Release);
158         self.evt.write(1).map_err(Error::EventFdWriteFail)?;
159         self.rx.recv().map_err(Error::MpscRecvFail)?
160     }
161 }
162 
// State owned by the balloon worker thread, which services the virtqueues
// and resize requests from an epoll loop.
struct BalloonEpollHandler {
    // Device configuration shared with the `Balloon` front-end.
    config: Arc<Mutex<VirtioBalloonConfig>>,
    // Source of resize requests and sink for their results.
    resize_receiver: VirtioBalloonResizeReceiver,
    // Virtqueues: index 0 is inflate, 1 is deflate and, when present, 2 is
    // free page reporting.
    queues: Vec<Queue<GuestMemoryAtomic<GuestMemoryMmap>>>,
    // Callback used to raise interrupts in the guest.
    interrupt_cb: Arc<dyn VirtioInterrupt>,
    // Notification eventfd of the inflate queue.
    inflate_queue_evt: EventFd,
    // Notification eventfd of the deflate queue.
    deflate_queue_evt: EventFd,
    // Notification eventfd of the reporting queue, if one was set up.
    reporting_queue_evt: Option<EventFd>,
    // Eventfds handed to `EpollHelper::new` in `run`.
    kill_evt: EventFd,
    pause_evt: EventFd,
}
174 
175 impl BalloonEpollHandler {
176     fn signal(&self, int_type: VirtioInterruptType) -> result::Result<(), Error> {
177         self.interrupt_cb.trigger(int_type).map_err(|e| {
178             error!("Failed to signal used queue: {:?}", e);
179             Error::FailedSignal(e)
180         })
181     }
182 
183     fn advise_memory_range(
184         memory: &GuestMemoryMmap,
185         range_base: GuestAddress,
186         range_len: usize,
187         advice: libc::c_int,
188     ) -> result::Result<(), Error> {
189         let hva = memory
190             .get_host_address(range_base)
191             .map_err(Error::GuestMemory)?;
192         // Need unsafe to do syscall madvise
193         let res =
194             unsafe { libc::madvise(hva as *mut libc::c_void, range_len as libc::size_t, advice) };
195         if res != 0 {
196             return Err(Error::MadviseFail(io::Error::last_os_error()));
197         }
198         Ok(())
199     }
200 
201     fn release_memory_range(
202         memory: &GuestMemoryMmap,
203         range_base: GuestAddress,
204         range_len: usize,
205     ) -> result::Result<(), Error> {
206         let region = memory.find_region(range_base).ok_or(Error::GuestMemory(
207             GuestMemoryError::InvalidGuestAddress(range_base),
208         ))?;
209         if let Some(f_off) = region.file_offset() {
210             let offset = range_base.0 - region.start_addr().0;
211             let res = unsafe {
212                 libc::fallocate64(
213                     f_off.file().as_raw_fd(),
214                     libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE,
215                     (offset as u64 + f_off.start()) as libc::off64_t,
216                     range_len as libc::off64_t,
217                 )
218             };
219 
220             if res != 0 {
221                 return Err(Error::FallocateFail(io::Error::last_os_error()));
222             }
223         }
224 
225         Self::advise_memory_range(memory, range_base, range_len, libc::MADV_DONTNEED)
226     }
227 
228     fn notify_queue(
229         &mut self,
230         queue_index: usize,
231         used_descs: Vec<(u16, u32)>,
232     ) -> result::Result<(), Error> {
233         for (desc_index, len) in used_descs.iter() {
234             self.queues[queue_index]
235                 .add_used(*desc_index, *len)
236                 .map_err(Error::QueueAddUsed)?;
237         }
238 
239         if !used_descs.is_empty() {
240             self.signal(VirtioInterruptType::Queue(queue_index as u16))?;
241         }
242 
243         Ok(())
244     }
245 
246     fn process_queue(&mut self, queue_index: usize) -> result::Result<(), Error> {
247         let mut used_descs = Vec::new();
248         for mut desc_chain in self.queues[queue_index]
249             .iter()
250             .map_err(Error::QueueIterator)?
251         {
252             let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?;
253 
254             used_descs.push((desc_chain.head_index(), desc.len()));
255 
256             let data_chunk_size = size_of::<u32>();
257 
258             // The head contains the request type which MUST be readable.
259             if desc.is_write_only() {
260                 error!("The head contains the request type is not right");
261                 return Err(Error::UnexpectedWriteOnlyDescriptor);
262             }
263             if desc.len() as usize % data_chunk_size != 0 {
264                 error!("the request size {} is not right", desc.len());
265                 return Err(Error::InvalidRequest);
266             }
267 
268             let mut offset = 0u64;
269             while offset < desc.len() as u64 {
270                 let addr = desc.addr().checked_add(offset).unwrap();
271                 let pfn: u32 = desc_chain
272                     .memory()
273                     .read_obj(addr)
274                     .map_err(Error::GuestMemory)?;
275                 offset += data_chunk_size as u64;
276 
277                 let range_base = GuestAddress((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT);
278                 let range_len = 1 << VIRTIO_BALLOON_PFN_SHIFT;
279 
280                 match queue_index {
281                     0 => {
282                         Self::release_memory_range(desc_chain.memory(), range_base, range_len)?;
283                     }
284                     1 => {
285                         Self::advise_memory_range(
286                             desc_chain.memory(),
287                             range_base,
288                             range_len,
289                             libc::MADV_WILLNEED,
290                         )?;
291                     }
292                     _ => return Err(Error::InvalidQueueIndex(queue_index)),
293                 }
294             }
295         }
296 
297         self.notify_queue(queue_index, used_descs)
298     }
299 
300     fn process_reporting_queue(&mut self, queue_index: usize) -> result::Result<(), Error> {
301         let mut used_descs = Vec::new();
302 
303         for mut desc_chain in self.queues[queue_index]
304             .iter()
305             .map_err(Error::QueueIterator)?
306         {
307             let mut descs_len = 0;
308             while let Some(desc) = desc_chain.next() {
309                 descs_len += desc.len();
310                 Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?;
311             }
312 
313             used_descs.push((desc_chain.head_index(), descs_len));
314         }
315 
316         self.notify_queue(queue_index, used_descs)
317     }
318 
319     fn run(
320         &mut self,
321         paused: Arc<AtomicBool>,
322         paused_sync: Arc<Barrier>,
323     ) -> result::Result<(), EpollHelperError> {
324         let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?;
325         helper.add_event(self.resize_receiver.evt.as_raw_fd(), RESIZE_EVENT)?;
326         helper.add_event(self.inflate_queue_evt.as_raw_fd(), INFLATE_QUEUE_EVENT)?;
327         helper.add_event(self.deflate_queue_evt.as_raw_fd(), DEFLATE_QUEUE_EVENT)?;
328         if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() {
329             helper.add_event(reporting_queue_evt.as_raw_fd(), REPORTING_QUEUE_EVENT)?;
330         }
331         helper.run(paused, paused_sync, self)?;
332 
333         Ok(())
334     }
335 }
336 
337 impl EpollHelperHandler for BalloonEpollHandler {
338     fn handle_event(&mut self, _helper: &mut EpollHelper, event: &epoll::Event) -> bool {
339         let ev_type = event.data as u16;
340         match ev_type {
341             RESIZE_EVENT => {
342                 if let Err(e) = self.resize_receiver.evt.read() {
343                     error!("Failed to get resize event: {:?}", e);
344                     return true;
345                 }
346                 let mut signal_error = false;
347                 let r = {
348                     let mut config = self.config.lock().unwrap();
349                     config.num_pages =
350                         (self.resize_receiver.get_size() >> VIRTIO_BALLOON_PFN_SHIFT) as u32;
351                     if let Err(e) = self.signal(VirtioInterruptType::Config) {
352                         signal_error = true;
353                         Err(e)
354                     } else {
355                         Ok(())
356                     }
357                 };
358                 if let Err(e) = &r {
359                     // This error will send back to resize caller.
360                     error!("Handle resize event get error: {:?}", e);
361                 }
362                 if let Err(e) = self.resize_receiver.send(r) {
363                     error!("Sending \"resize\" generated error: {:?}", e);
364                     return true;
365                 }
366                 if signal_error {
367                     return true;
368                 }
369             }
370             INFLATE_QUEUE_EVENT => {
371                 if let Err(e) = self.inflate_queue_evt.read() {
372                     error!("Failed to get inflate queue event: {:?}", e);
373                     return true;
374                 } else if let Err(e) = self.process_queue(0) {
375                     error!("Failed to signal used inflate queue: {:?}", e);
376                     return true;
377                 }
378             }
379             DEFLATE_QUEUE_EVENT => {
380                 if let Err(e) = self.deflate_queue_evt.read() {
381                     error!("Failed to get deflate queue event: {:?}", e);
382                     return true;
383                 } else if let Err(e) = self.process_queue(1) {
384                     error!("Failed to signal used deflate queue: {:?}", e);
385                     return true;
386                 }
387             }
388             REPORTING_QUEUE_EVENT => {
389                 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() {
390                     if let Err(e) = reporting_queue_evt.read() {
391                         error!("Failed to get reporting queue event: {:?}", e);
392                         return true;
393                     } else if let Err(e) = self.process_reporting_queue(2) {
394                         error!("Failed to signal used inflate queue: {:?}", e);
395                         return true;
396                     }
397                 } else {
398                     error!("Invalid reporting queue event as no eventfd registered");
399                     return true;
400                 }
401             }
402             _ => {
403                 error!("Unknown event for virtio-balloon");
404                 return true;
405             }
406         }
407 
408         false
409     }
410 }
411 
// Versioned state of the balloon device, as produced by `Balloon::state` and
// consumed by `Balloon::set_state` for snapshot and restore.
#[derive(Versionize)]
pub struct BalloonState {
    // Feature bits offered by the device.
    pub avail_features: u64,
    // Feature bits acknowledged by the guest driver.
    pub acked_features: u64,
    // Copy of the device configuration space.
    pub config: VirtioBalloonConfig,
}

impl VersionMapped for BalloonState {}
420 
// Virtio balloon device, used by the host to ask the guest to give up memory
// pages and to take back the pages the guest released.
pub struct Balloon {
    // State shared by all virtio devices (features, queues, threads, ...).
    common: VirtioCommon,
    // Identifier of this device, also used as its snapshot id.
    id: String,
    // Requester side of the resize machinery.
    resize: VirtioBalloonResize,
    // Device configuration space, shared with the worker thread.
    config: Arc<Mutex<VirtioBalloonConfig>>,
    // Seccomp action passed to `spawn_virtio_thread` for the worker thread.
    seccomp_action: SeccompAction,
    // Eventfd passed to `spawn_virtio_thread` along with the worker thread.
    exit_evt: EventFd,
}
430 
431 impl Balloon {
432     // Create a new virtio-balloon.
433     pub fn new(
434         id: String,
435         size: u64,
436         deflate_on_oom: bool,
437         free_page_reporting: bool,
438         seccomp_action: SeccompAction,
439         exit_evt: EventFd,
440     ) -> io::Result<Self> {
441         let mut queue_sizes = vec![QUEUE_SIZE; MIN_NUM_QUEUES];
442         let mut avail_features = 1u64 << VIRTIO_F_VERSION_1;
443         if deflate_on_oom {
444             avail_features |= 1u64 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM;
445         }
446         if free_page_reporting {
447             avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING;
448             queue_sizes.push(REPORTING_QUEUE_SIZE);
449         }
450 
451         let config = VirtioBalloonConfig {
452             num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32,
453             ..Default::default()
454         };
455 
456         Ok(Balloon {
457             common: VirtioCommon {
458                 device_type: VirtioDeviceType::Balloon as u32,
459                 avail_features,
460                 paused_sync: Some(Arc::new(Barrier::new(2))),
461                 queue_sizes,
462                 min_queues: MIN_NUM_QUEUES as u16,
463                 ..Default::default()
464             },
465             id,
466             resize: VirtioBalloonResize::new(size)?,
467             config: Arc::new(Mutex::new(config)),
468             seccomp_action,
469             exit_evt,
470         })
471     }
472 
473     pub fn resize(&self, size: u64) -> Result<(), Error> {
474         self.resize.work(size)
475     }
476 
477     // Get the actual size of the virtio-balloon.
478     pub fn get_actual(&self) -> u64 {
479         (self.config.lock().unwrap().actual as u64) << VIRTIO_BALLOON_PFN_SHIFT
480     }
481 
482     fn state(&self) -> BalloonState {
483         BalloonState {
484             avail_features: self.common.avail_features,
485             acked_features: self.common.acked_features,
486             config: *(self.config.lock().unwrap()),
487         }
488     }
489 
490     fn set_state(&mut self, state: &BalloonState) {
491         self.common.avail_features = state.avail_features;
492         self.common.acked_features = state.acked_features;
493         *(self.config.lock().unwrap()) = state.config;
494     }
495 }
496 
497 impl Drop for Balloon {
498     fn drop(&mut self) {
499         if let Some(kill_evt) = self.common.kill_evt.take() {
500             // Ignore the result because there is nothing we can do about it.
501             let _ = kill_evt.write(1);
502         }
503     }
504 }
505 
506 impl VirtioDevice for Balloon {
507     fn device_type(&self) -> u32 {
508         self.common.device_type
509     }
510 
511     fn queue_max_sizes(&self) -> &[u16] {
512         &self.common.queue_sizes
513     }
514 
515     fn features(&self) -> u64 {
516         self.common.avail_features
517     }
518 
519     fn ack_features(&mut self, value: u64) {
520         self.common.ack_features(value)
521     }
522 
523     fn read_config(&self, offset: u64, data: &mut [u8]) {
524         self.read_config_from_slice(self.config.lock().unwrap().as_slice(), offset, data);
525     }
526 
527     fn write_config(&mut self, offset: u64, data: &[u8]) {
528         // The "actual" field is the only mutable field
529         if offset != CONFIG_ACTUAL_OFFSET || data.len() != CONFIG_ACTUAL_SIZE {
530             error!(
531                 "Attempt to write to read-only field: offset {:x} length {}",
532                 offset,
533                 data.len()
534             );
535             return;
536         }
537 
538         self.write_config_helper(self.config.lock().unwrap().as_mut_slice(), offset, data);
539     }
540 
541     fn activate(
542         &mut self,
543         _mem: GuestMemoryAtomic<GuestMemoryMmap>,
544         interrupt_cb: Arc<dyn VirtioInterrupt>,
545         queues: Vec<Queue<GuestMemoryAtomic<GuestMemoryMmap>>>,
546         mut queue_evts: Vec<EventFd>,
547     ) -> ActivateResult {
548         self.common.activate(&queues, &queue_evts, &interrupt_cb)?;
549         let (kill_evt, pause_evt) = self.common.dup_eventfds();
550 
551         let inflate_queue_evt = queue_evts.remove(0);
552         let deflate_queue_evt = queue_evts.remove(0);
553         let reporting_queue_evt =
554             if self.common.feature_acked(VIRTIO_BALLOON_F_REPORTING) && !queue_evts.is_empty() {
555                 Some(queue_evts.remove(0))
556             } else {
557                 None
558             };
559 
560         let mut handler = BalloonEpollHandler {
561             config: self.config.clone(),
562             resize_receiver: self.resize.get_receiver().map_err(|e| {
563                 error!("failed to clone resize EventFd: {:?}", e);
564                 ActivateError::BadActivate
565             })?,
566             queues,
567             interrupt_cb,
568             inflate_queue_evt,
569             deflate_queue_evt,
570             reporting_queue_evt,
571             kill_evt,
572             pause_evt,
573         };
574 
575         let paused = self.common.paused.clone();
576         let paused_sync = self.common.paused_sync.clone();
577         let mut epoll_threads = Vec::new();
578 
579         spawn_virtio_thread(
580             &self.id,
581             &self.seccomp_action,
582             Thread::VirtioBalloon,
583             &mut epoll_threads,
584             &self.exit_evt,
585             move || {
586                 if let Err(e) = handler.run(paused, paused_sync.unwrap()) {
587                     error!("Error running worker: {:?}", e);
588                 }
589             },
590         )?;
591         self.common.epoll_threads = Some(epoll_threads);
592 
593         event!("virtio-device", "activated", "id", &self.id);
594         Ok(())
595     }
596 
597     fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> {
598         let result = self.common.reset();
599         event!("virtio-device", "reset", "id", &self.id);
600         result
601     }
602 }
603 
// Pausing and resuming are fully delegated to the common virtio state.
impl Pausable for Balloon {
    // Pauses the device through `VirtioCommon::pause`.
    fn pause(&mut self) -> result::Result<(), MigratableError> {
        self.common.pause()
    }

    // Resumes the device through `VirtioCommon::resume`.
    fn resume(&mut self) -> result::Result<(), MigratableError> {
        self.common.resume()
    }
}
613 
614 impl Snapshottable for Balloon {
615     fn id(&self) -> String {
616         self.id.clone()
617     }
618 
619     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
620         Snapshot::new_from_versioned_state(&self.id(), &self.state())
621     }
622 
623     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
624         self.set_state(&snapshot.to_versioned_state(&self.id)?);
625         Ok(())
626     }
627 }
// Both impls are empty: nothing is overridden for this device.
impl Transportable for Balloon {}
impl Migratable for Balloon {}
630