xref: /cloud-hypervisor/virtio-devices/src/iommu.rs (revision 9af2968a7dc47b89bf07ea9dc5e735084efcfa3a)
// Copyright © 2019 Intel Corporation
//
// SPDX-License-Identifier: Apache-2.0 AND BSD-3-Clause

use super::Error as DeviceError;
use super::{
    ActivateError, ActivateResult, DescriptorChain, EpollHelper, EpollHelperError,
    EpollHelperHandler, Queue, VirtioCommon, VirtioDevice, VirtioDeviceType,
    EPOLL_HELPER_EVENT_LAST, VIRTIO_F_VERSION_1,
};
use crate::seccomp_filters::{get_seccomp_filter, Thread};
use crate::GuestMemoryMmap;
use crate::{DmaRemapping, VirtioInterrupt, VirtioInterruptType};
use seccomp::{SeccompAction, SeccompFilter};
use std::collections::BTreeMap;
use std::fmt::{self, Display};
use std::io;
use std::mem::size_of;
use std::ops::Bound::Included;
use std::os::unix::io::AsRawFd;
use std::result;
use std::sync::atomic::AtomicBool;
use std::sync::{Arc, Barrier, RwLock};
use std::thread;
use versionize::{VersionMap, Versionize, VersionizeResult};
use versionize_derive::Versionize;
use vm_device::dma_mapping::ExternalDmaMapping;
use vm_memory::{
    Address, ByteValued, Bytes, GuestAddress, GuestAddressSpace, GuestMemoryAtomic,
    GuestMemoryError,
};
use vm_migration::VersionMapped;
use vm_migration::{Migratable, MigratableError, Pausable, Snapshot, Snapshottable, Transportable};
use vmm_sys_util::eventfd::EventFd;

/// Queue sizes
const QUEUE_SIZE: u16 = 256;
const NUM_QUEUES: usize = 2;
const QUEUE_SIZES: &[u16] = &[QUEUE_SIZE; NUM_QUEUES];

/// New descriptors are pending on the request queue.
/// The "requestq" is used whenever the guest driver needs the device to
/// perform an action on its behalf.
const REQUEST_Q_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 1;
/// New descriptors are pending on the event queue.
/// The "eventq" lets the device report faults and other asynchronous events
/// to the guest driver.
const EVENT_Q_EVENT: u16 = EPOLL_HELPER_EVENT_LAST + 2;

/// PROBE properties size.
/// This is the minimal size needed to provide at least one RESV_MEM property.
/// Because virtio-iommu expects one MSI reserved region, we must provide it;
/// otherwise the guest driver will fall back to a predefined region between
/// 0x8000000 and 0x80FFFFF, which is only relevant for the ARM architecture
/// but would conflict with x86.
const PROBE_PROP_SIZE: u32 =
    (size_of::<VirtioIommuProbeProperty>() + size_of::<VirtioIommuProbeResvMem>()) as u32;
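// For reference (a sketch based on the packed layouts defined below): the
// property header is 4 bytes (two u16 fields) and the RESV_MEM property body
// is 20 bytes (1 + 3 + 8 + 8), so PROBE_PROP_SIZE evaluates to 24.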
const MSI_IOVA_START: u64 = 0xfee0_0000;
const MSI_IOVA_END: u64 = 0xfeef_ffff;
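// These bounds describe the 1MiB MSI doorbell window used on x86 (the
// 0xfee00000 interrupt address range).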

/// Virtio IOMMU features
#[allow(unused)]
const VIRTIO_IOMMU_F_INPUT_RANGE: u32 = 0;
#[allow(unused)]
const VIRTIO_IOMMU_F_DOMAIN_RANGE: u32 = 1;
#[allow(unused)]
const VIRTIO_IOMMU_F_MAP_UNMAP: u32 = 2;
#[allow(unused)]
const VIRTIO_IOMMU_F_BYPASS: u32 = 3;
const VIRTIO_IOMMU_F_PROBE: u32 = 4;
#[allow(unused)]
const VIRTIO_IOMMU_F_MMIO: u32 = 5;
#[allow(unused)]
const VIRTIO_IOMMU_F_BYPASS_CONFIG: u32 = 6;

// Support 2MiB and 4KiB page sizes.
const VIRTIO_IOMMU_PAGE_SIZE_MASK: u64 = (2 << 20) | (4 << 10);
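// (2 << 20) | (4 << 10) = 0x20_0000 | 0x1000 = 0x20_1000, i.e. exactly the
// 2MiB and 4KiB bits of the page-size mask are set.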

#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuRange32 {
    start: u32,
    end: u32,
}

unsafe impl ByteValued for VirtioIommuRange32 {}

#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuRange64 {
    start: u64,
    end: u64,
}

unsafe impl ByteValued for VirtioIommuRange64 {}

#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuConfig {
    page_size_mask: u64,
    input_range: VirtioIommuRange64,
    domain_range: VirtioIommuRange32,
    probe_size: u32,
    bypass: u8,
    reserved: [u8; 7],
}

unsafe impl ByteValued for VirtioIommuConfig {}
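// With the packed representation above, the config space is 44 bytes:
// 8 (page_size_mask) + 16 (input_range) + 8 (domain_range) + 4 (probe_size)
// + 1 (bypass) + 7 (reserved).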

/// Virtio IOMMU request type
const VIRTIO_IOMMU_T_ATTACH: u8 = 1;
const VIRTIO_IOMMU_T_DETACH: u8 = 2;
const VIRTIO_IOMMU_T_MAP: u8 = 3;
const VIRTIO_IOMMU_T_UNMAP: u8 = 4;
const VIRTIO_IOMMU_T_PROBE: u8 = 5;

#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuReqHead {
    type_: u8,
    reserved: [u8; 3],
}

unsafe impl ByteValued for VirtioIommuReqHead {}

/// Virtio IOMMU request status
const VIRTIO_IOMMU_S_OK: u8 = 0;
#[allow(unused)]
const VIRTIO_IOMMU_S_IOERR: u8 = 1;
#[allow(unused)]
const VIRTIO_IOMMU_S_UNSUPP: u8 = 2;
#[allow(unused)]
const VIRTIO_IOMMU_S_DEVERR: u8 = 3;
#[allow(unused)]
const VIRTIO_IOMMU_S_INVAL: u8 = 4;
#[allow(unused)]
const VIRTIO_IOMMU_S_RANGE: u8 = 5;
#[allow(unused)]
const VIRTIO_IOMMU_S_NOENT: u8 = 6;
#[allow(unused)]
const VIRTIO_IOMMU_S_FAULT: u8 = 7;
#[allow(unused)]
const VIRTIO_IOMMU_S_NOMEM: u8 = 8;

#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuReqTail {
    status: u8,
    reserved: [u8; 3],
}

unsafe impl ByteValued for VirtioIommuReqTail {}

/// ATTACH request
#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuReqAttach {
    domain: u32,
    endpoint: u32,
    reserved: [u8; 8],
}

unsafe impl ByteValued for VirtioIommuReqAttach {}

/// DETACH request
#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuReqDetach {
    domain: u32,
    endpoint: u32,
    reserved: [u8; 8],
}

unsafe impl ByteValued for VirtioIommuReqDetach {}

/// Virtio IOMMU request MAP flags
#[allow(unused)]
const VIRTIO_IOMMU_MAP_F_READ: u32 = 1;
#[allow(unused)]
const VIRTIO_IOMMU_MAP_F_WRITE: u32 = 1 << 1;
#[allow(unused)]
const VIRTIO_IOMMU_MAP_F_MMIO: u32 = 1 << 2;
#[allow(unused)]
const VIRTIO_IOMMU_MAP_F_MASK: u32 =
    VIRTIO_IOMMU_MAP_F_READ | VIRTIO_IOMMU_MAP_F_WRITE | VIRTIO_IOMMU_MAP_F_MMIO;

/// MAP request
#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuReqMap {
    domain: u32,
    virt_start: u64,
    virt_end: u64,
    phys_start: u64,
    flags: u32,
}

unsafe impl ByteValued for VirtioIommuReqMap {}

/// UNMAP request
#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuReqUnmap {
    domain: u32,
    virt_start: u64,
    virt_end: u64,
    reserved: [u8; 4],
}

unsafe impl ByteValued for VirtioIommuReqUnmap {}

/// Virtio IOMMU request PROBE types
#[allow(unused)]
const VIRTIO_IOMMU_PROBE_T_NONE: u16 = 0;
const VIRTIO_IOMMU_PROBE_T_RESV_MEM: u16 = 1;
#[allow(unused)]
const VIRTIO_IOMMU_PROBE_T_MASK: u16 = 0xfff;

/// PROBE request
#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuReqProbe {
    endpoint: u32,
    reserved: [u64; 8],
}

unsafe impl ByteValued for VirtioIommuReqProbe {}

#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuProbeProperty {
    type_: u16,
    length: u16,
}

unsafe impl ByteValued for VirtioIommuProbeProperty {}

/// Virtio IOMMU request PROBE property RESV_MEM subtypes
#[allow(unused)]
const VIRTIO_IOMMU_RESV_MEM_T_RESERVED: u8 = 0;
const VIRTIO_IOMMU_RESV_MEM_T_MSI: u8 = 1;

#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuProbeResvMem {
    subtype: u8,
    reserved: [u8; 3],
    start: u64,
    end: u64,
}

unsafe impl ByteValued for VirtioIommuProbeResvMem {}

/// Virtio IOMMU fault flags
#[allow(unused)]
const VIRTIO_IOMMU_FAULT_F_READ: u32 = 1;
#[allow(unused)]
const VIRTIO_IOMMU_FAULT_F_WRITE: u32 = 1 << 1;
#[allow(unused)]
const VIRTIO_IOMMU_FAULT_F_EXEC: u32 = 1 << 2;
#[allow(unused)]
const VIRTIO_IOMMU_FAULT_F_ADDRESS: u32 = 1 << 8;

/// Virtio IOMMU fault reasons
#[allow(unused)]
const VIRTIO_IOMMU_FAULT_R_UNKNOWN: u32 = 0;
#[allow(unused)]
const VIRTIO_IOMMU_FAULT_R_DOMAIN: u32 = 1;
#[allow(unused)]
const VIRTIO_IOMMU_FAULT_R_MAPPING: u32 = 2;

/// Fault reporting through eventq
#[allow(unused)]
#[derive(Copy, Clone, Debug, Default)]
#[repr(packed)]
struct VirtioIommuFault {
    reason: u8,
    reserved: [u8; 3],
    flags: u32,
    endpoint: u32,
    reserved2: [u8; 4],
    address: u64,
}

unsafe impl ByteValued for VirtioIommuFault {}

#[derive(Debug)]
enum Error {
    /// Guest gave us bad memory addresses.
    GuestMemory(GuestMemoryError),
    /// Guest gave us a write-only descriptor where the protocol says to read from.
    UnexpectedWriteOnlyDescriptor,
    /// Guest gave us a read-only descriptor where the protocol says to write to.
    UnexpectedReadOnlyDescriptor,
    /// Guest gave us too few descriptors in a descriptor chain.
    DescriptorChainTooShort,
    /// Guest gave us a buffer that was too short to use.
    BufferLengthTooSmall,
    /// Guest sent us an invalid request.
    InvalidRequest,
    /// Guest sent us an invalid ATTACH request.
    InvalidAttachRequest,
    /// Guest sent us an invalid DETACH request.
    InvalidDetachRequest,
    /// Guest sent us an invalid MAP request.
    InvalidMapRequest,
    /// Guest sent us an invalid UNMAP request.
    InvalidUnmapRequest,
    /// Guest sent us an invalid PROBE request.
    InvalidProbeRequest,
    /// Failed to perform external mapping.
    ExternalMapping(io::Error),
    /// Failed to perform external unmapping.
    ExternalUnmapping(io::Error),
}

impl Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use self::Error::*;

        match self {
            BufferLengthTooSmall => write!(f, "buffer length too small"),
            DescriptorChainTooShort => write!(f, "descriptor chain too short"),
            GuestMemory(e) => write!(f, "bad guest memory address: {}", e),
            InvalidRequest => write!(f, "invalid request"),
            InvalidAttachRequest => write!(f, "invalid attach request"),
            InvalidDetachRequest => write!(f, "invalid detach request"),
            InvalidMapRequest => write!(f, "invalid map request"),
            InvalidUnmapRequest => write!(f, "invalid unmap request"),
            InvalidProbeRequest => write!(f, "invalid probe request"),
            UnexpectedReadOnlyDescriptor => write!(f, "unexpected read-only descriptor"),
            UnexpectedWriteOnlyDescriptor => write!(f, "unexpected write-only descriptor"),
            ExternalMapping(e) => write!(f, "failed performing external mapping: {}", e),
            ExternalUnmapping(e) => write!(f, "failed performing external unmapping: {}", e),
        }
    }
}

struct Request {}

impl Request {
    // Parse the available vring buffer. Based on the table of external
    // mappings required by devices such as VFIO or vhost-user ones, this
    // function may update the per-domain table of external mappings.
    // The VMM knows the device_id <=> mapping relationship before the VM
    // runs, but at runtime a new domain <=> mapping table is built from the
    // information the guest virtio-iommu driver provides (which links
    // device_id <=> domain).
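    // For example (illustrative values only): if the VMM starts with
    // ext_mapping = { endpoint 3 => VFIO mapping } and the guest sends
    // ATTACH { domain: 5, endpoint: 3 }, this function records
    // ext_domain_mapping = { 5 => VFIO mapping }, so subsequent MAP/UNMAP
    // requests for domain 5 are forwarded to the VFIO backend.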
    fn parse(
        avail_desc: &DescriptorChain,
        mem: &GuestMemoryMmap,
        mapping: &Arc<IommuMapping>,
        ext_mapping: &BTreeMap<u32, Arc<dyn ExternalDmaMapping>>,
        ext_domain_mapping: &mut BTreeMap<u32, Arc<dyn ExternalDmaMapping>>,
    ) -> result::Result<usize, Error> {
        // The head contains the request type which MUST be readable.
        if avail_desc.is_write_only() {
            return Err(Error::UnexpectedWriteOnlyDescriptor);
        }

        if (avail_desc.len as usize) < size_of::<VirtioIommuReqHead>() {
            return Err(Error::InvalidRequest);
        }

        let req_head: VirtioIommuReqHead =
            mem.read_obj(avail_desc.addr).map_err(Error::GuestMemory)?;
        let req_offset = size_of::<VirtioIommuReqHead>();
        let desc_size_left = (avail_desc.len as usize) - req_offset;
        let req_addr = if let Some(addr) = avail_desc.addr.checked_add(req_offset as u64) {
            addr
        } else {
            return Err(Error::InvalidRequest);
        };

        // Create the reply
        let mut reply: Vec<u8> = Vec::new();

        let hdr_len = match req_head.type_ {
            VIRTIO_IOMMU_T_ATTACH => {
                if desc_size_left != size_of::<VirtioIommuReqAttach>() {
                    return Err(Error::InvalidAttachRequest);
                }

                let req: VirtioIommuReqAttach = mem
                    .read_obj(req_addr)
                    .map_err(Error::GuestMemory)?;
                debug!("Attach request {:?}", req);

                // Copy the fields locally to avoid unaligned references into
                // the packed struct.
                let domain = req.domain;
                let endpoint = req.endpoint;

                // Add endpoint associated with specific domain
                mapping.endpoints.write().unwrap().insert(endpoint, domain);

                // If the endpoint is part of the list of devices with an
                // external mapping, insert a new entry for the corresponding
                // domain, with the same reference to the trait.
                if let Some(map) = ext_mapping.get(&endpoint) {
                    ext_domain_mapping.insert(domain, map.clone());
                }

                // Add new domain with no mapping if the entry didn't exist yet
                let mut mappings = mapping.mappings.write().unwrap();
                mappings.entry(domain).or_insert_with(BTreeMap::new);

                0
            }
            VIRTIO_IOMMU_T_DETACH => {
                if desc_size_left != size_of::<VirtioIommuReqDetach>() {
                    return Err(Error::InvalidDetachRequest);
                }

                let req: VirtioIommuReqDetach = mem
                    .read_obj(req_addr)
                    .map_err(Error::GuestMemory)?;
                debug!("Detach request {:?}", req);

                // Copy the fields locally to avoid unaligned references into
                // the packed struct.
                let domain = req.domain;
                let endpoint = req.endpoint;

                // If the endpoint is part of the list of devices with an
                // external mapping, remove the entry for the corresponding
                // domain.
                if ext_mapping.contains_key(&endpoint) {
                    ext_domain_mapping.remove(&domain);
                }

                // Remove endpoint associated with specific domain
                mapping.endpoints.write().unwrap().remove(&endpoint);

                0
            }
            VIRTIO_IOMMU_T_MAP => {
                if desc_size_left != size_of::<VirtioIommuReqMap>() {
                    return Err(Error::InvalidMapRequest);
                }

                let req: VirtioIommuReqMap = mem
                    .read_obj(req_addr)
                    .map_err(Error::GuestMemory)?;
                debug!("Map request {:?}", req);

                // Copy the field locally to avoid unaligned references into
                // the packed struct.
                let domain = req.domain;

                // Trigger external mapping if necessary.
                if let Some(ext_map) = ext_domain_mapping.get(&domain) {
                    let size = req.virt_end - req.virt_start + 1;
                    ext_map
                        .map(req.virt_start, req.phys_start, size)
                        .map_err(Error::ExternalMapping)?;
                }

                // Add new mapping associated with the domain
                if let Some(entry) = mapping.mappings.write().unwrap().get_mut(&domain) {
                    entry.insert(
                        req.virt_start,
                        Mapping {
                            gpa: req.phys_start,
                            size: req.virt_end - req.virt_start + 1,
                        },
                    );
                } else {
                    return Err(Error::InvalidMapRequest);
                }

                0
            }
            VIRTIO_IOMMU_T_UNMAP => {
                if desc_size_left != size_of::<VirtioIommuReqUnmap>() {
                    return Err(Error::InvalidUnmapRequest);
                }

                let req: VirtioIommuReqUnmap = mem
                    .read_obj(req_addr)
                    .map_err(Error::GuestMemory)?;
                debug!("Unmap request {:?}", req);

                // Copy the fields locally to avoid unaligned references into
                // the packed struct.
                let domain = req.domain;
                let virt_start = req.virt_start;

                // Trigger external unmapping if necessary.
                if let Some(ext_map) = ext_domain_mapping.get(&domain) {
                    let size = req.virt_end - virt_start + 1;
                    ext_map
                        .unmap(virt_start, size)
                        .map_err(Error::ExternalUnmapping)?;
                }

                // Remove the mapping associated with the domain
                if let Some(entry) = mapping.mappings.write().unwrap().get_mut(&domain) {
                    entry.remove(&virt_start);
                }

                0
            }
            VIRTIO_IOMMU_T_PROBE => {
                if desc_size_left != size_of::<VirtioIommuReqProbe>() {
                    return Err(Error::InvalidProbeRequest);
                }

                let req: VirtioIommuReqProbe = mem
                    .read_obj(req_addr)
                    .map_err(Error::GuestMemory)?;
                debug!("Probe request {:?}", req);

                let probe_prop = VirtioIommuProbeProperty {
                    type_: VIRTIO_IOMMU_PROBE_T_RESV_MEM,
                    length: size_of::<VirtioIommuProbeResvMem>() as u16,
                };
                reply.extend_from_slice(probe_prop.as_slice());

                let resv_mem = VirtioIommuProbeResvMem {
                    subtype: VIRTIO_IOMMU_RESV_MEM_T_MSI,
                    start: MSI_IOVA_START,
                    end: MSI_IOVA_END,
                    ..Default::default()
                };
                reply.extend_from_slice(resv_mem.as_slice());

                PROBE_PROP_SIZE
            }
            _ => return Err(Error::InvalidRequest),
        };

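        // At this point `reply` holds the 24 bytes of PROBE properties for a
        // PROBE request and is empty for any other request type; the 4-byte
        // tail appended below completes the data written back to the guest.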
        let status_desc = avail_desc
            .next_descriptor()
            .ok_or(Error::DescriptorChainTooShort)?;

        // The status MUST always be writable
        if !status_desc.is_write_only() {
            return Err(Error::UnexpectedReadOnlyDescriptor);
        }

        if status_desc.len < hdr_len + size_of::<VirtioIommuReqTail>() as u32 {
            return Err(Error::BufferLengthTooSmall);
        }

        let tail = VirtioIommuReqTail {
            status: VIRTIO_IOMMU_S_OK,
            ..Default::default()
        };
        reply.extend_from_slice(tail.as_slice());

        mem.write_slice(reply.as_slice(), status_desc.addr)
            .map_err(Error::GuestMemory)?;

        Ok((hdr_len as usize) + size_of::<VirtioIommuReqTail>())
    }
}

struct IommuEpollHandler {
    queues: Vec<Queue>,
    mem: GuestMemoryAtomic<GuestMemoryMmap>,
    interrupt_cb: Arc<dyn VirtioInterrupt>,
    queue_evts: Vec<EventFd>,
    kill_evt: EventFd,
    pause_evt: EventFd,
    mapping: Arc<IommuMapping>,
    ext_mapping: BTreeMap<u32, Arc<dyn ExternalDmaMapping>>,
    ext_domain_mapping: BTreeMap<u32, Arc<dyn ExternalDmaMapping>>,
}

impl IommuEpollHandler {
    fn request_queue(&mut self) -> bool {
        let mut used_desc_heads = [(0, 0); QUEUE_SIZE as usize];
        let mut used_count = 0;
        let mem = self.mem.memory();
        for avail_desc in self.queues[0].iter(&mem) {
            let len = match Request::parse(
                &avail_desc,
                &mem,
                &self.mapping,
                &self.ext_mapping,
                &mut self.ext_domain_mapping,
            ) {
                Ok(len) => len as u32,
                Err(e) => {
                    error!("failed parsing descriptor: {}", e);
                    0
                }
            };

            used_desc_heads[used_count] = (avail_desc.index, len);
            used_count += 1;
        }

        for &(desc_index, len) in &used_desc_heads[..used_count] {
            self.queues[0].add_used(&mem, desc_index, len);
        }
        used_count > 0
    }

    // The event queue is not used by the device so far: no faults are
    // reported back to the guest driver.
    fn event_queue(&mut self) -> bool {
        false
    }

    fn signal_used_queue(&self, queue: &Queue) -> result::Result<(), DeviceError> {
        self.interrupt_cb
            .trigger(&VirtioInterruptType::Queue, Some(queue))
            .map_err(|e| {
                error!("Failed to signal used queue: {:?}", e);
                DeviceError::FailedSignalingUsedQueue(e)
            })
    }

    fn run(
        &mut self,
        paused: Arc<AtomicBool>,
        paused_sync: Arc<Barrier>,
    ) -> result::Result<(), EpollHelperError> {
        let mut helper = EpollHelper::new(&self.kill_evt, &self.pause_evt)?;
        helper.add_event(self.queue_evts[0].as_raw_fd(), REQUEST_Q_EVENT)?;
        helper.add_event(self.queue_evts[1].as_raw_fd(), EVENT_Q_EVENT)?;
        helper.run(paused, paused_sync, self)?;

        Ok(())
    }
}

impl EpollHelperHandler for IommuEpollHandler {
    fn handle_event(&mut self, _helper: &mut EpollHelper, event: &epoll::Event) -> bool {
        let ev_type = event.data as u16;
        match ev_type {
            REQUEST_Q_EVENT => {
                if let Err(e) = self.queue_evts[0].read() {
                    error!("Failed to get queue event: {:?}", e);
                    return true;
                } else if self.request_queue() {
                    if let Err(e) = self.signal_used_queue(&self.queues[0]) {
                        error!("Failed to signal used queue: {:?}", e);
                        return true;
                    }
                }
            }
            EVENT_Q_EVENT => {
                if let Err(e) = self.queue_evts[1].read() {
                    error!("Failed to get queue event: {:?}", e);
                    return true;
                } else if self.event_queue() {
                    if let Err(e) = self.signal_used_queue(&self.queues[1]) {
                        error!("Failed to signal used queue: {:?}", e);
                        return true;
                    }
                }
            }
            _ => {
                error!("Unexpected event: {}", ev_type);
                return true;
            }
        }
        false
    }
}

#[derive(Clone, Copy, Versionize)]
struct Mapping {
    gpa: u64,
    size: u64,
}

pub struct IommuMapping {
    // Domain to which each endpoint is attached.
    endpoints: Arc<RwLock<BTreeMap<u32, u32>>>,
    // List of mappings per domain.
    mappings: Arc<RwLock<BTreeMap<u32, BTreeMap<u64, Mapping>>>>,
}

impl DmaRemapping for IommuMapping {
    fn translate(&self, id: u32, addr: u64) -> std::result::Result<u64, std::io::Error> {
        debug!("Translate addr 0x{:x}", addr);
        if let Some(domain) = self.endpoints.read().unwrap().get(&id) {
            if let Some(mapping) = self.mappings.read().unwrap().get(domain) {
                let range_start = if VIRTIO_IOMMU_PAGE_SIZE_MASK > addr {
                    0
                } else {
                    addr - VIRTIO_IOMMU_PAGE_SIZE_MASK
                };
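                // Only mappings whose virt_start falls within the window
                // [range_start, addr] are scanned below.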
                for (&key, &value) in mapping.range((Included(&range_start), Included(&addr))) {
                    if addr >= key && addr < key + value.size {
                        let new_addr = addr - key + value.gpa;
                        debug!("Into new_addr 0x{:x}", new_addr);
                        return Ok(new_addr);
                    }
                }
            }
        }

        debug!("Into same addr...");
        Ok(addr)
    }
}

pub struct Iommu {
    common: VirtioCommon,
    id: String,
    config: VirtioIommuConfig,
    mapping: Arc<IommuMapping>,
    ext_mapping: BTreeMap<u32, Arc<dyn ExternalDmaMapping>>,
    seccomp_action: SeccompAction,
}

#[derive(Versionize)]
struct IommuState {
    avail_features: u64,
    acked_features: u64,
    endpoints: Vec<(u32, u32)>,
    mappings: Vec<(u32, Vec<(u64, Mapping)>)>,
}

impl VersionMapped for IommuState {}

impl Iommu {
    pub fn new(id: String, seccomp_action: SeccompAction) -> io::Result<(Self, Arc<IommuMapping>)> {
        let config = VirtioIommuConfig {
            page_size_mask: VIRTIO_IOMMU_PAGE_SIZE_MASK,
            probe_size: PROBE_PROP_SIZE,
            ..Default::default()
        };

        let mapping = Arc::new(IommuMapping {
            endpoints: Arc::new(RwLock::new(BTreeMap::new())),
            mappings: Arc::new(RwLock::new(BTreeMap::new())),
        });

        Ok((
            Iommu {
                id,
                common: VirtioCommon {
                    device_type: VirtioDeviceType::Iommu as u32,
                    queue_sizes: QUEUE_SIZES.to_vec(),
                    avail_features: 1u64 << VIRTIO_F_VERSION_1
                        | 1u64 << VIRTIO_IOMMU_F_MAP_UNMAP
                        | 1u64 << VIRTIO_IOMMU_F_PROBE,
                    paused_sync: Some(Arc::new(Barrier::new(2))),
                    ..Default::default()
                },
                config,
                mapping: mapping.clone(),
                ext_mapping: BTreeMap::new(),
                seccomp_action,
            },
            mapping,
        ))
    }

    fn state(&self) -> IommuState {
        IommuState {
            avail_features: self.common.avail_features,
            acked_features: self.common.acked_features,
            endpoints: self
                .mapping
                .endpoints
                .read()
                .unwrap()
                .clone()
                .into_iter()
                .collect(),
            mappings: self
                .mapping
                .mappings
                .read()
                .unwrap()
                .clone()
                .into_iter()
                .map(|(k, v)| (k, v.into_iter().collect()))
                .collect(),
        }
    }

    fn set_state(&mut self, state: &IommuState) {
        self.common.avail_features = state.avail_features;
        self.common.acked_features = state.acked_features;
        *(self.mapping.endpoints.write().unwrap()) = state.endpoints.clone().into_iter().collect();
        *(self.mapping.mappings.write().unwrap()) = state
            .mappings
            .clone()
            .into_iter()
            .map(|(k, v)| (k, v.into_iter().collect()))
            .collect();
    }

    pub fn add_external_mapping(&mut self, device_id: u32, mapping: Arc<dyn ExternalDmaMapping>) {
        self.ext_mapping.insert(device_id, mapping);
    }
}

impl Drop for Iommu {
    fn drop(&mut self) {
        if let Some(kill_evt) = self.common.kill_evt.take() {
            // Ignore the result because there is nothing we can do about it.
            let _ = kill_evt.write(1);
        }
    }
}

impl VirtioDevice for Iommu {
    fn device_type(&self) -> u32 {
        self.common.device_type
    }

    fn queue_max_sizes(&self) -> &[u16] {
        &self.common.queue_sizes
    }

    fn features(&self) -> u64 {
        self.common.avail_features
    }

    fn ack_features(&mut self, value: u64) {
        self.common.ack_features(value)
    }

    fn read_config(&self, offset: u64, data: &mut [u8]) {
        self.read_config_from_slice(self.config.as_slice(), offset, data);
    }

    fn activate(
        &mut self,
        mem: GuestMemoryAtomic<GuestMemoryMmap>,
        interrupt_cb: Arc<dyn VirtioInterrupt>,
        queues: Vec<Queue>,
        queue_evts: Vec<EventFd>,
    ) -> ActivateResult {
        self.common.activate(&queues, &queue_evts, &interrupt_cb)?;
        let (kill_evt, pause_evt) = self.common.dup_eventfds();
        let mut handler = IommuEpollHandler {
            queues,
            mem,
            interrupt_cb,
            queue_evts,
            kill_evt,
            pause_evt,
            mapping: self.mapping.clone(),
            ext_mapping: self.ext_mapping.clone(),
            ext_domain_mapping: BTreeMap::new(),
        };

        let paused = self.common.paused.clone();
        let paused_sync = self.common.paused_sync.clone();
        let mut epoll_threads = Vec::new();
        // Retrieve seccomp filter for the virtio_iommu thread
        let virtio_iommu_seccomp_filter =
            get_seccomp_filter(&self.seccomp_action, Thread::VirtioIommu)
                .map_err(ActivateError::CreateSeccompFilter)?;
        thread::Builder::new()
            .name(self.id.clone())
            .spawn(move || {
                if let Err(e) = SeccompFilter::apply(virtio_iommu_seccomp_filter) {
                    error!("Error applying seccomp filter: {:?}", e);
                } else if let Err(e) = handler.run(paused, paused_sync.unwrap()) {
                    error!("Error running worker: {:?}", e);
                }
            })
            .map(|thread| epoll_threads.push(thread))
            .map_err(|e| {
                error!("failed to spawn the virtio-iommu epoll thread: {}", e);
                ActivateError::BadActivate
            })?;

        self.common.epoll_threads = Some(epoll_threads);

        event!("virtio-device", "activated", "id", &self.id);
        Ok(())
    }

    fn reset(&mut self) -> Option<Arc<dyn VirtioInterrupt>> {
        let result = self.common.reset();
        event!("virtio-device", "reset", "id", &self.id);
        result
    }
}

impl Pausable for Iommu {
    fn pause(&mut self) -> result::Result<(), MigratableError> {
        self.common.pause()
    }

    fn resume(&mut self) -> result::Result<(), MigratableError> {
        self.common.resume()
    }
}

impl Snapshottable for Iommu {
    fn id(&self) -> String {
        self.id.clone()
    }

    fn snapshot(&mut self) -> std::result::Result<Snapshot, MigratableError> {
        Snapshot::new_from_versioned_state(&self.id, &self.state())
    }

    fn restore(&mut self, snapshot: Snapshot) -> std::result::Result<(), MigratableError> {
        self.set_state(&snapshot.to_versioned_state(&self.id)?);
        Ok(())
    }
}
impl Transportable for Iommu {}
impl Migratable for Iommu {}
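
#[cfg(test)]
mod tests {
    use super::*;

    // Minimal sanity check (a sketch added for illustration, not part of the
    // original file) of the address translation path implemented above.
    #[test]
    fn test_iommu_mapping_translate() {
        let mapping = IommuMapping {
            endpoints: Arc::new(RwLock::new(BTreeMap::new())),
            mappings: Arc::new(RwLock::new(BTreeMap::new())),
        };

        // Attach endpoint 1 to domain 2, then map the 4KiB page at
        // virtual 0x1000 to guest-physical 0x8000 within that domain.
        mapping.endpoints.write().unwrap().insert(1, 2);
        let mut domain_mappings = BTreeMap::new();
        domain_mappings.insert(
            0x1000,
            Mapping {
                gpa: 0x8000,
                size: 0x1000,
            },
        );
        mapping.mappings.write().unwrap().insert(2, domain_mappings);

        // 0x1800 falls inside [0x1000, 0x2000) and is remapped:
        // 0x1800 - 0x1000 + 0x8000 = 0x8800.
        assert_eq!(mapping.translate(1, 0x1800).unwrap(), 0x8800);

        // An unknown endpoint is passed through unchanged.
        assert_eq!(mapping.translate(9, 0x1800).unwrap(), 0x1800);
    }
}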