xref: /cloud-hypervisor/virtio-devices/src/balloon.rs (revision eea9bcea38e0c5649f444c829f3a4f9c22aa486c)
1 // Copyright (c) 2020 Ant Financial
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //     http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 use crate::{
16     seccomp_filters::Thread, thread_helper::spawn_virtio_thread, ActivateResult, EpollHelper,
17     EpollHelperError, EpollHelperHandler, GuestMemoryMmap, VirtioCommon, VirtioDevice,
18     VirtioDeviceType, VirtioInterrupt, VirtioInterruptType, EPOLL_HELPER_EVENT_LAST,
19     VIRTIO_F_VERSION_1,
20 };
21 use anyhow::anyhow;
22 use seccompiler::SeccompAction;
23 use std::io::{self, Write};
24 use std::mem::size_of;
25 use std::os::unix::io::AsRawFd;
26 use std::result;
27 use std::sync::{atomic::AtomicBool, Arc, Barrier};
28 use thiserror::Error;
29 use versionize::{VersionMap, Versionize, VersionizeResult};
30 use versionize_derive::Versionize;
31 use virtio_queue::{Queue, QueueT};
32 use vm_memory::{
33     Address, ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemory, GuestMemoryAtomic,
34     GuestMemoryError, GuestMemoryRegion,
35 };
36 use vm_migration::{
37     Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable, VersionMapped,
38 };
39 use vmm_sys_util::eventfd::EventFd;
40 
/// Maximum size of the inflate and deflate virtqueues.
const QUEUE_SIZE: u16 = 128;
/// Maximum size of the optional free page reporting virtqueue.
const REPORTING_QUEUE_SIZE: u16 = 32;
/// Number of virtqueues the device always exposes (inflate and deflate).
const MIN_NUM_QUEUES: usize = 2;
44 
45 // Inflate virtio queue event.
46 const INFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1;
47 // Deflate virtio queue event.
48 const DEFLATE_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2;
49 // Reporting virtio queue event.
50 const REPORTING_QUEUE_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 3;
51 
/// Shift converting a balloon page frame number into a guest address (and back).
const VIRTIO_BALLOON_PFN_SHIFT: u64 = 12;

/// Feature bit: deflate balloon on OOM.
const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u64 = 2;
/// Feature bit: enable an additional virtqueue to let the guest notify the
/// host about free pages.
const VIRTIO_BALLOON_F_REPORTING: u64 = 5;
60 
61 #[derive(Error, Debug)]
62 pub enum Error {
63     #[error("Guest gave us bad memory addresses.: {0}")]
64     GuestMemory(GuestMemoryError),
65     #[error("Guest gave us a write only descriptor that protocol says to read from.")]
66     UnexpectedWriteOnlyDescriptor,
67     #[error("Guest sent us invalid request.")]
68     InvalidRequest,
69     #[error("Fallocate fail.: {0}")]
70     FallocateFail(std::io::Error),
71     #[error("Madvise fail.: {0}")]
72     MadviseFail(std::io::Error),
73     #[error("Failed to EventFd write.: {0}")]
74     EventFdWriteFail(std::io::Error),
75     #[error("Invalid queue index: {0}")]
76     InvalidQueueIndex(usize),
77     #[error("Fail tp signal: {0}")]
78     FailedSignal(io::Error),
79     #[error("Descriptor chain is too short")]
80     DescriptorChainTooShort,
81     #[error("Failed adding used index: {0}")]
82     QueueAddUsed(virtio_queue::Error),
83     #[error("Failed creating an iterator over the queue: {0}")]
84     QueueIterator(virtio_queue::Error),
85 }
86 
87 // Got from include/uapi/linux/virtio_balloon.h
88 #[repr(C)]
89 #[derive(Copy, Clone, Debug, Default, Versionize)]
90 pub struct VirtioBalloonConfig {
91     // Number of pages host wants Guest to give up.
92     num_pages: u32,
93     // Number of pages we've actually got in balloon.
94     actual: u32,
95 }
96 
97 const CONFIG_ACTUAL_OFFSET: u64 = 4;
98 const CONFIG_ACTUAL_SIZE: usize = 4;
99 
100 // SAFETY: it only has data and has no implicit padding.
101 unsafe impl ByteValued for VirtioBalloonConfig {}
102 
103 struct BalloonEpollHandler {
104     mem: GuestMemoryAtomic<GuestMemoryMmap>,
105     queues: Vec<Queue>,
106     interrupt_cb: Arc<dyn VirtioInterrupt>,
107     inflate_queue_evt: EventFd,
108     deflate_queue_evt: EventFd,
109     reporting_queue_evt: Option<EventFd>,
110     kill_evt: EventFd,
111     pause_evt: EventFd,
112 }
113 
114 impl BalloonEpollHandler {
115     fn signal(&self, int_type: VirtioInterruptType) -> result::Result<(), Error> {
116         self.interrupt_cb.trigger(int_type).map_err(|e| {
117             error!("Failed to signal used queue: {:?}", e);
118             Error::FailedSignal(e)
119         })
120     }
121 
122     fn advise_memory_range(
123         memory: &GuestMemoryMmap,
124         range_base: GuestAddress,
125         range_len: usize,
126         advice: libc::c_int,
127     ) -> result::Result<(), Error> {
128         let hva = memory
129             .get_host_address(range_base)
130             .map_err(Error::GuestMemory)?;
131         // Need unsafe to do syscall madvise
132         let res =
133             unsafe { libc::madvise(hva as *mut libc::c_void, range_len as libc::size_t, advice) };
134         if res != 0 {
135             return Err(Error::MadviseFail(io::Error::last_os_error()));
136         }
137         Ok(())
138     }
139 
140     fn release_memory_range(
141         memory: &GuestMemoryMmap,
142         range_base: GuestAddress,
143         range_len: usize,
144     ) -> result::Result<(), Error> {
145         let region = memory.find_region(range_base).ok_or(Error::GuestMemory(
146             GuestMemoryError::InvalidGuestAddress(range_base),
147         ))?;
148         if let Some(f_off) = region.file_offset() {
149             let offset = range_base.0 - region.start_addr().0;
150             let res = unsafe {
151                 libc::fallocate64(
152                     f_off.file().as_raw_fd(),
153                     libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE,
154                     (offset as u64 + f_off.start()) as libc::off64_t,
155                     range_len as libc::off64_t,
156                 )
157             };
158 
159             if res != 0 {
160                 return Err(Error::FallocateFail(io::Error::last_os_error()));
161             }
162         }
163 
164         Self::advise_memory_range(memory, range_base, range_len, libc::MADV_DONTNEED)
165     }
166 
167     fn process_queue(&mut self, queue_index: usize) -> result::Result<(), Error> {
168         let mut used_descs = false;
169         while let Some(mut desc_chain) =
170             self.queues[queue_index].pop_descriptor_chain(self.mem.memory())
171         {
172             let desc = desc_chain.next().ok_or(Error::DescriptorChainTooShort)?;
173 
174             let data_chunk_size = size_of::<u32>();
175 
176             // The head contains the request type which MUST be readable.
177             if desc.is_write_only() {
178                 error!("The head contains the request type is not right");
179                 return Err(Error::UnexpectedWriteOnlyDescriptor);
180             }
181             if desc.len() as usize % data_chunk_size != 0 {
182                 error!("the request size {} is not right", desc.len());
183                 return Err(Error::InvalidRequest);
184             }
185 
186             let mut offset = 0u64;
187             while offset < desc.len() as u64 {
188                 let addr = desc.addr().checked_add(offset).unwrap();
189                 let pfn: u32 = desc_chain
190                     .memory()
191                     .read_obj(addr)
192                     .map_err(Error::GuestMemory)?;
193                 offset += data_chunk_size as u64;
194 
195                 let range_base = GuestAddress((pfn as u64) << VIRTIO_BALLOON_PFN_SHIFT);
196                 let range_len = 1 << VIRTIO_BALLOON_PFN_SHIFT;
197 
198                 match queue_index {
199                     0 => {
200                         Self::release_memory_range(desc_chain.memory(), range_base, range_len)?;
201                     }
202                     1 => {
203                         Self::advise_memory_range(
204                             desc_chain.memory(),
205                             range_base,
206                             range_len,
207                             libc::MADV_WILLNEED,
208                         )?;
209                     }
210                     _ => return Err(Error::InvalidQueueIndex(queue_index)),
211                 }
212             }
213 
214             self.queues[queue_index]
215                 .add_used(desc_chain.memory(), desc_chain.head_index(), desc.len())
216                 .map_err(Error::QueueAddUsed)?;
217             used_descs = true;
218         }
219 
220         if used_descs {
221             self.signal(VirtioInterruptType::Queue(queue_index as u16))
222         } else {
223             Ok(())
224         }
225     }
226 
227     fn process_reporting_queue(&mut self, queue_index: usize) -> result::Result<(), Error> {
228         let mut used_descs = false;
229         while let Some(mut desc_chain) =
230             self.queues[queue_index].pop_descriptor_chain(self.mem.memory())
231         {
232             let mut descs_len = 0;
233             while let Some(desc) = desc_chain.next() {
234                 descs_len += desc.len();
235                 Self::release_memory_range(desc_chain.memory(), desc.addr(), desc.len() as usize)?;
236             }
237 
238             self.queues[queue_index]
239                 .add_used(desc_chain.memory(), desc_chain.head_index(), descs_len)
240                 .map_err(Error::QueueAddUsed)?;
241             used_descs = true;
242         }
243 
244         if used_descs {
245             self.signal(VirtioInterruptType::Queue(queue_index as u16))
246         } else {
247             Ok(())
248         }
249     }
250 
251     fn run(
252         &mut self,
253         paused: Arc<AtomicBool>,
254         paused_sync: Arc<Barrier>,
255     ) -> result::Result<(), EpollHelperError> {
256         let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?;
257         helper.add_event(self.inflate_queue_evt.as_raw_fd(), INFLATE_QUEUE_EVENT)?;
258         helper.add_event(self.deflate_queue_evt.as_raw_fd(), DEFLATE_QUEUE_EVENT)?;
259         if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() {
260             helper.add_event(reporting_queue_evt.as_raw_fd(), REPORTING_QUEUE_EVENT)?;
261         }
262         helper.run(paused, paused_sync, self)?;
263 
264         Ok(())
265     }
266 }
267 
268 impl EpollHelperHandler for BalloonEpollHandler {
269     fn handle_event(
270         &mut self,
271         _helper: &mut EpollHelper,
272         event: &epoll::Event,
273     ) -> result::Result<(), EpollHelperError> {
274         let ev_type = event.data as u16;
275         match ev_type {
276             INFLATE_QUEUE_EVENT => {
277                 self.inflate_queue_evt.read().map_err(|e| {
278                     EpollHelperError::HandleEvent(anyhow!(
279                         "Failed to get inflate queue event: {:?}",
280                         e
281                     ))
282                 })?;
283                 self.process_queue(0).map_err(|e| {
284                     EpollHelperError::HandleEvent(anyhow!(
285                         "Failed to signal used inflate queue: {:?}",
286                         e
287                     ))
288                 })?;
289             }
290             DEFLATE_QUEUE_EVENT => {
291                 self.deflate_queue_evt.read().map_err(|e| {
292                     EpollHelperError::HandleEvent(anyhow!(
293                         "Failed to get deflate queue event: {:?}",
294                         e
295                     ))
296                 })?;
297                 self.process_queue(1).map_err(|e| {
298                     EpollHelperError::HandleEvent(anyhow!(
299                         "Failed to signal used deflate queue: {:?}",
300                         e
301                     ))
302                 })?;
303             }
304             REPORTING_QUEUE_EVENT => {
305                 if let Some(reporting_queue_evt) = self.reporting_queue_evt.as_ref() {
306                     reporting_queue_evt.read().map_err(|e| {
307                         EpollHelperError::HandleEvent(anyhow!(
308                             "Failed to get reporting queue event: {:?}",
309                             e
310                         ))
311                     })?;
312                     self.process_reporting_queue(2).map_err(|e| {
313                         EpollHelperError::HandleEvent(anyhow!(
314                             "Failed to signal used inflate queue: {:?}",
315                             e
316                         ))
317                     })?;
318                 } else {
319                     return Err(EpollHelperError::HandleEvent(anyhow!(
320                         "Invalid reporting queue event as no eventfd registered"
321                     )));
322                 }
323             }
324             _ => {
325                 return Err(EpollHelperError::HandleEvent(anyhow!(
326                     "Unknown event for virtio-balloon"
327                 )));
328             }
329         }
330 
331         Ok(())
332     }
333 }
334 
335 #[derive(Versionize)]
336 pub struct BalloonState {
337     pub avail_features: u64,
338     pub acked_features: u64,
339     pub config: VirtioBalloonConfig,
340 }
341 
342 impl VersionMapped for BalloonState {}
343 
344 // Virtio device for exposing entropy to the guest OS through virtio.
345 pub struct Balloon {
346     common: VirtioCommon,
347     id: String,
348     config: VirtioBalloonConfig,
349     seccomp_action: SeccompAction,
350     exit_evt: EventFd,
351     interrupt_cb: Option<Arc<dyn VirtioInterrupt>>,
352 }
353 
354 impl Balloon {
355     // Create a new virtio-balloon.
356     pub fn new(
357         id: String,
358         size: u64,
359         deflate_on_oom: bool,
360         free_page_reporting: bool,
361         seccomp_action: SeccompAction,
362         exit_evt: EventFd,
363     ) -> io::Result<Self> {
364         let mut queue_sizes = vec![QUEUE_SIZE; MIN_NUM_QUEUES];
365         let mut avail_features = 1u64 << VIRTIO_F_VERSION_1;
366         if deflate_on_oom {
367             avail_features |= 1u64 << VIRTIO_BALLOON_F_DEFLATE_ON_OOM;
368         }
369         if free_page_reporting {
370             avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING;
371             queue_sizes.push(REPORTING_QUEUE_SIZE);
372         }
373 
374         let config = VirtioBalloonConfig {
375             num_pages: (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32,
376             ..Default::default()
377         };
378 
379         Ok(Balloon {
380             common: VirtioCommon {
381                 device_type: VirtioDeviceType::Balloon as u32,
382                 avail_features,
383                 paused_sync: Some(Arc::new(Barrier::new(2))),
384                 queue_sizes,
385                 min_queues: MIN_NUM_QUEUES as u16,
386                 ..Default::default()
387             },
388             id,
389             config,
390             seccomp_action,
391             exit_evt,
392             interrupt_cb: None,
393         })
394     }
395 
396     pub fn resize(&mut self, size: u64) -> Result<(), Error> {
397         self.config.num_pages = (size >> VIRTIO_BALLOON_PFN_SHIFT) as u32;
398 
399         if let Some(interrupt_cb) = &self.interrupt_cb {
400             interrupt_cb
401                 .trigger(VirtioInterruptType::Config)
402                 .map_err(Error::FailedSignal)
403         } else {
404             Ok(())
405         }
406     }
407 
408     // Get the actual size of the virtio-balloon.
409     pub fn get_actual(&self) -> u64 {
410         (self.config.actual as u64) << VIRTIO_BALLOON_PFN_SHIFT
411     }
412 
413     fn state(&self) -> BalloonState {
414         BalloonState {
415             avail_features: self.common.avail_features,
416             acked_features: self.common.acked_features,
417             config: self.config,
418         }
419     }
420 
421     fn set_state(&mut self, state: &BalloonState) {
422         self.common.avail_features = state.avail_features;
423         self.common.acked_features = state.acked_features;
424         self.config = state.config;
425     }
426 
427     #[cfg(fuzzing)]
428     pub fn wait_for_epoll_threads(&mut self) {
429         self.common.wait_for_epoll_threads();
430     }
431 }
432 
433 impl Drop for Balloon {
434     fn drop(&mut self) {
435         if let Some(kill_evt) = self.common.kill_evt.take() {
436             // Ignore the result because there is nothing we can do about it.
437             let _ = kill_evt.write(1);
438         }
439     }
440 }
441 
442 impl VirtioDevice for Balloon {
443     fn device_type(&self) -> u32 {
444         self.common.device_type
445     }
446 
447     fn queue_max_sizes(&self) -> &[u16] {
448         &self.common.queue_sizes
449     }
450 
451     fn features(&self) -> u64 {
452         self.common.avail_features
453     }
454 
455     fn ack_features(&mut self, value: u64) {
456         self.common.ack_features(value)
457     }
458 
459     fn read_config(&self, offset: u64, data: &mut [u8]) {
460         self.read_config_from_slice(self.config.as_slice(), offset, data);
461     }
462 
463     fn write_config(&mut self, offset: u64, data: &[u8]) {
464         // The "actual" field is the only mutable field
465         if offset != CONFIG_ACTUAL_OFFSET || data.len() != CONFIG_ACTUAL_SIZE {
466             error!(
467                 "Attempt to write to read-only field: offset {:x} length {}",
468                 offset,
469                 data.len()
470             );
471             return;
472         }
473 
474         let config = self.config.as_mut_slice();
475         let config_len = config.len() as u64;
476         let data_len = data.len() as u64;
477         if offset + data_len > config_len {
478             error!(
479                     "Out-of-bound access to configuration: config_len = {} offset = {:x} length = {} for {}",
480                     config_len,
481                     offset,
482                     data_len,
483                     self.device_type()
484                 );
485             return;
486         }
487 
488         if let Some(end) = offset.checked_add(config.len() as u64) {
489             let mut offset_config =
490                 &mut config[offset as usize..std::cmp::min(end, config_len) as usize];
491             offset_config.write_all(data).unwrap();
492         }
493     }
494 
495     fn activate(
496         &mut self,
497         mem: GuestMemoryAtomic<GuestMemoryMmap>,
498         interrupt_cb: Arc<dyn VirtioInterrupt>,
499         mut queues: Vec<(usize, Queue, EventFd)>,
500     ) -> ActivateResult {
501         self.common.activate(&queues, &interrupt_cb)?;
502         let (kill_evt, pause_evt) = self.common.dup_eventfds();
503 
504         let mut virtqueues = Vec::new();
505         let (_, queue, queue_evt) = queues.remove(0);
506         virtqueues.push(queue);
507         let inflate_queue_evt = queue_evt;
508         let (_, queue, queue_evt) = queues.remove(0);
509         virtqueues.push(queue);
510         let deflate_queue_evt = queue_evt;
511         let reporting_queue_evt =
512             if self.common.feature_acked(VIRTIO_BALLOON_F_REPORTING) && !queues.is_empty() {
513                 let (_, queue, queue_evt) = queues.remove(0);
514                 virtqueues.push(queue);
515                 Some(queue_evt)
516             } else {
517                 None
518             };
519 
520         self.interrupt_cb = Some(interrupt_cb.clone());
521 
522         let mut handler = BalloonEpollHandler {
523             mem,
524             queues: virtqueues,
525             interrupt_cb,
526             inflate_queue_evt,
527             deflate_queue_evt,
528             reporting_queue_evt,
529             kill_evt,
530             pause_evt,
531         };
532 
533         let paused = self.common.paused.clone();
534         let paused_sync = self.common.paused_sync.clone();
535         let mut epoll_threads = Vec::new();
536 
537         spawn_virtio_thread(
538             &self.id,
539             &self.seccomp_action,
540             Thread::VirtioBalloon,
541             &mut epoll_threads,
542             &self.exit_evt,
543             move || handler.run(paused, paused_sync.unwrap()),
544         )?;
545         self.common.epoll_threads = Some(epoll_threads);
546 
547         event!("virtio-device", "activated", "id", &self.id);
548         Ok(())
549     }
550 
551     fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> {
552         let result = self.common.reset();
553         event!("virtio-device", "reset", "id", &self.id);
554         result
555     }
556 }
557 
558 impl Pausable for Balloon {
559     fn pause(&mut self) -> result::Result<(), MigratableError> {
560         self.common.pause()
561     }
562 
563     fn resume(&mut self) -> result::Result<(), MigratableError> {
564         self.common.resume()
565     }
566 }
567 
568 impl Snapshottable for Balloon {
569     fn id(&self) -> String {
570         self.id.clone()
571     }
572 
573     fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
574         Snapshot::new_from_versioned_state(&self.id(), &self.state())
575     }
576 
577     fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
578         self.set_state(&snapshot.to_versioned_state(&self.id)?);
579         Ok(())
580     }
581 }
582 impl Transportable for Balloon {}
583 impl Migratable for Balloon {}
584