xref: /qemu/hw/riscv/riscv-iommu.c (revision e4a8e093dc74be049f4829831dce76e5edab0003)
1 /*
2  * QEMU emulation of a RISC-V IOMMU
3  *
4  * Copyright (C) 2021-2023, Rivos Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2 or later, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; if not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include "qemu/osdep.h"
20 #include "qom/object.h"
21 #include "hw/pci/pci_bus.h"
22 #include "hw/pci/pci_device.h"
23 #include "hw/qdev-properties.h"
24 #include "hw/riscv/riscv_hart.h"
25 #include "migration/vmstate.h"
26 #include "qapi/error.h"
27 #include "qemu/timer.h"
28 
29 #include "cpu_bits.h"
30 #include "riscv-iommu.h"
31 #include "riscv-iommu-bits.h"
32 #include "trace.h"
33 
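/* Soft limits on the device-context and address-translation cache sizes. */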
34 #define LIMIT_CACHE_CTX               (1U << 7)
35 #define LIMIT_CACHE_IOT               (1U << 20)
36 
37 /* Physical page number conversions */
38 #define PPN_PHYS(ppn)                 ((ppn) << TARGET_PAGE_BITS)
39 #define PPN_DOWN(phy)                 ((phy) >> TARGET_PAGE_BITS)
40 
41 typedef struct RISCVIOMMUContext RISCVIOMMUContext;
42 typedef struct RISCVIOMMUEntry RISCVIOMMUEntry;
43 
44 /* Device assigned I/O address space */
45 struct RISCVIOMMUSpace {
46     IOMMUMemoryRegion iova_mr;  /* IOVA memory region for attached device */
47     AddressSpace iova_as;       /* IOVA address space for attached device */
48     RISCVIOMMUState *iommu;     /* Managing IOMMU device state */
49     uint32_t devid;             /* Requester identifier, AKA device_id */
50     bool notifier;              /* IOMMU unmap notifier enabled */
51     QLIST_ENTRY(RISCVIOMMUSpace) list;
52 };
53 
54 /* Device translation context state. */
55 struct RISCVIOMMUContext {
56     uint64_t devid:24;          /* Requester Id, AKA device_id */
57     uint64_t process_id:20;     /* Process ID. PASID for PCIe */
58     uint64_t tc;                /* Translation Control */
59     uint64_t ta;                /* Translation Attributes */
60     uint64_t satp;              /* S-Stage address translation and protection */
61     uint64_t gatp;              /* G-Stage address translation and protection */
62     uint64_t msi_addr_mask;     /* MSI filtering - address mask */
63     uint64_t msi_addr_pattern;  /* MSI filtering - address pattern */
64     uint64_t msiptp;            /* MSI redirection page table pointer */
65 };
66 
67 /* Address translation cache entry */
68 struct RISCVIOMMUEntry {
69     uint64_t iova:44;           /* IOVA Page Number */
70     uint64_t pscid:20;          /* Process Soft-Context identifier */
71     uint64_t phys:44;           /* Physical Page Number */
72     uint64_t gscid:16;          /* Guest Soft-Context identifier */
73     uint64_t perm:2;            /* IOMMU_RW flags */
74 };
75 
76 /* IOMMU index for transactions without process_id specified. */
77 #define RISCV_IOMMU_NOPROCID 0
78 
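/*
 * Each interrupt source has its own vector index field in ICVEC:
 * CIV at bit 0, FIV at bit 4, PMIV at bit 8 and PIV at bit 12.
 */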
79 static uint8_t riscv_iommu_get_icvec_vector(uint32_t icvec, uint32_t vec_type)
80 {
81     switch (vec_type) {
82     case RISCV_IOMMU_INTR_CQ:
83         return icvec & RISCV_IOMMU_ICVEC_CIV;
84     case RISCV_IOMMU_INTR_FQ:
85         return (icvec & RISCV_IOMMU_ICVEC_FIV) >> 4;
86     case RISCV_IOMMU_INTR_PM:
87         return (icvec & RISCV_IOMMU_ICVEC_PMIV) >> 8;
88     case RISCV_IOMMU_INTR_PQ:
89         return (icvec & RISCV_IOMMU_ICVEC_PIV) >> 12;
90     default:
91         g_assert_not_reached();
92     }
93 }
94 
95 static void riscv_iommu_notify(RISCVIOMMUState *s, int vec_type)
96 {
97     uint32_t ipsr, icvec, vector;
98 
99     if (!s->notify) {
100         return;
101     }
102 
103     icvec = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_ICVEC);
104     ipsr = riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, (1 << vec_type), 0);
105 
106     if (!(ipsr & (1 << vec_type))) {
107         vector = riscv_iommu_get_icvec_vector(icvec, vec_type);
108         s->notify(s, vector);
109         trace_riscv_iommu_notify_int_vector(vec_type, vector);
110     }
111 }
112 
113 static void riscv_iommu_fault(RISCVIOMMUState *s,
114                               struct riscv_iommu_fq_record *ev)
115 {
116     uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
117     uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQH) & s->fq_mask;
118     uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQT) & s->fq_mask;
119     uint32_t next = (tail + 1) & s->fq_mask;
120     uint32_t devid = get_field(ev->hdr, RISCV_IOMMU_FQ_HDR_DID);
121 
122     trace_riscv_iommu_flt(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
123                           PCI_FUNC(devid), ev->hdr, ev->iotval);
124 
125     if (!(ctrl & RISCV_IOMMU_FQCSR_FQON) ||
126         !!(ctrl & (RISCV_IOMMU_FQCSR_FQOF | RISCV_IOMMU_FQCSR_FQMF))) {
127         return;
128     }
129 
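    /* Queue full (head == next): drop the record and raise the overflow flag. */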
130     if (head == next) {
131         riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
132                               RISCV_IOMMU_FQCSR_FQOF, 0);
133     } else {
134         dma_addr_t addr = s->fq_addr + tail * sizeof(*ev);
135         if (dma_memory_write(s->target_as, addr, ev, sizeof(*ev),
136                              MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
137             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR,
138                                   RISCV_IOMMU_FQCSR_FQMF, 0);
139         } else {
140             riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_FQT, next);
141         }
142     }
143 
144     if (ctrl & RISCV_IOMMU_FQCSR_FIE) {
145         riscv_iommu_notify(s, RISCV_IOMMU_INTR_FQ);
146     }
147 }
148 
149 static void riscv_iommu_pri(RISCVIOMMUState *s,
150     struct riscv_iommu_pq_record *pr)
151 {
152     uint32_t ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
153     uint32_t head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQH) & s->pq_mask;
154     uint32_t tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQT) & s->pq_mask;
155     uint32_t next = (tail + 1) & s->pq_mask;
156     uint32_t devid = get_field(pr->hdr, RISCV_IOMMU_PREQ_HDR_DID);
157 
158     trace_riscv_iommu_pri(s->parent_obj.id, PCI_BUS_NUM(devid), PCI_SLOT(devid),
159                           PCI_FUNC(devid), pr->payload);
160 
161     if (!(ctrl & RISCV_IOMMU_PQCSR_PQON) ||
162         !!(ctrl & (RISCV_IOMMU_PQCSR_PQOF | RISCV_IOMMU_PQCSR_PQMF))) {
163         return;
164     }
165 
166     if (head == next) {
167         riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
168                               RISCV_IOMMU_PQCSR_PQOF, 0);
169     } else {
170         dma_addr_t addr = s->pq_addr + tail * sizeof(*pr);
171         if (dma_memory_write(s->target_as, addr, pr, sizeof(*pr),
172                              MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
173             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR,
174                                   RISCV_IOMMU_PQCSR_PQMF, 0);
175         } else {
176             riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_PQT, next);
177         }
178     }
179 
180     if (ctrl & RISCV_IOMMU_PQCSR_PIE) {
181         riscv_iommu_notify(s, RISCV_IOMMU_INTR_PQ);
182     }
183 }
184 
185 /*
186  * Discards all bits from 'val' whose matching bits in the same
187  * positions in the mask 'ext' are zeros, and packs the remaining
188  * bits from 'val' contiguously at the least-significant end of the
189  * result, keeping the same bit order as 'val' and filling any
190  * other bits at the most-significant end of the result with zeros.
191  *
192  * For example, for the following 'val' and 'ext', the return 'ret'
193  * will be:
194  *
195  * val = a b c d e f g h
196  * ext = 1 0 1 0 0 1 1 0
197  * ret = 0 0 0 0 a c f g
198  *
199  * This function, taken from the riscv-iommu 1.0 spec, section 2.3.3
200  * "Process to translate addresses of MSIs", is similar to bit manip
201  * function PEXT (Parallel bits extract) from x86.
202  */
203 static uint64_t riscv_iommu_pext_u64(uint64_t val, uint64_t ext)
204 {
205     uint64_t ret = 0;
206     uint64_t rot = 1;
207 
208     while (ext) {
209         if (ext & 1) {
210             if (val & 1) {
211                 ret |= rot;
212             }
213             rot <<= 1;
214         }
215         val >>= 1;
216         ext >>= 1;
217     }
218 
219     return ret;
220 }
221 
222 /* Check if GPA matches MSI/MRIF pattern. */
223 static bool riscv_iommu_msi_check(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
224     dma_addr_t gpa)
225 {
226     if (!s->enable_msi) {
227         return false;
228     }
229 
230     if (get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE) !=
231         RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
232         return false; /* Invalid MSI/MRIF mode */
233     }
234 
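    /*
     * A GPA is an MSI address when its PPN matches msi_addr_pattern on every
     * bit not set in msi_addr_mask. Illustrative example (not from the spec):
     * mask 0xff with pattern 0x12300 matches PPNs 0x12300..0x123ff.
     */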
235     if ((PPN_DOWN(gpa) ^ ctx->msi_addr_pattern) & ~ctx->msi_addr_mask) {
236         return false; /* GPA not in MSI range defined by AIA IMSIC rules. */
237     }
238 
239     return true;
240 }
241 
242 /*
243  * RISCV IOMMU Address Translation Lookup - Page Table Walk
244  *
245  * Note: Code is based on get_physical_address() from target/riscv/cpu_helper.c
246  * Both implementations could be merged into a single helper function in the
247  * future. Keeping them separate for now, as error reporting and flow
248  * specifics are sufficiently different to warrant separate implementations.
249  *
250  * @s        : IOMMU Device State
251  * @ctx      : Translation context for device id and process address space id.
252  * @iotlb    : translation data: physical address and access mode.
253  * @return   : success or fault cause code.
254  */
255 static int riscv_iommu_spa_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
256     IOMMUTLBEntry *iotlb)
257 {
258     dma_addr_t addr, base;
259     uint64_t satp, gatp, pte;
260     bool en_s, en_g;
261     struct {
262         unsigned char step;
263         unsigned char levels;
264         unsigned char ptidxbits;
265         unsigned char ptesize;
266     } sc[2];
267     /* Translation stage phase */
268     enum {
269         S_STAGE = 0,
270         G_STAGE = 1,
271     } pass;
272     MemTxResult ret;
273 
274     satp = get_field(ctx->satp, RISCV_IOMMU_ATP_MODE_FIELD);
275     gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
276 
277     en_s = satp != RISCV_IOMMU_DC_FSC_MODE_BARE;
278     en_g = gatp != RISCV_IOMMU_DC_IOHGATP_MODE_BARE;
279 
280     /*
281      * Early check for MSI address match when IOVA == GPA.
282      * Note that the (!en_s) condition means that the MSI
283      * page table may only be used when guest pages are
284      * mapped using the g-stage page table, whether single-
285      * or two-stage paging is enabled. It's unavoidable though,
286      * because the spec mandates that we do a first-stage
287      * translation before we check the MSI page table, which
288      * means we can't do an early MSI check unless we have
289      * strictly !en_s.
290      */
291     if (!en_s && (iotlb->perm & IOMMU_WO) &&
292         riscv_iommu_msi_check(s, ctx, iotlb->iova)) {
293         iotlb->target_as = &s->trap_as;
294         iotlb->translated_addr = iotlb->iova;
295         iotlb->addr_mask = ~TARGET_PAGE_MASK;
296         return 0;
297     }
298 
299     /* Exit early for pass-through mode. */
300     if (!(en_s || en_g)) {
301         iotlb->translated_addr = iotlb->iova;
302         iotlb->addr_mask = ~TARGET_PAGE_MASK;
303         /* Allow R/W in pass-through mode */
304         iotlb->perm = IOMMU_RW;
305         return 0;
306     }
307 
308     /* S/G translation parameters. */
309     for (pass = 0; pass < 2; pass++) {
310         uint32_t sv_mode;
311 
312         sc[pass].step = 0;
313         if (pass ? (s->fctl & RISCV_IOMMU_FCTL_GXL) :
314             (ctx->tc & RISCV_IOMMU_DC_TC_SXL)) {
315             /* 32bit mode for GXL/SXL == 1 */
316             switch (pass ? gatp : satp) {
317             case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
318                 sc[pass].levels    = 0;
319                 sc[pass].ptidxbits = 0;
320                 sc[pass].ptesize   = 0;
321                 break;
322             case RISCV_IOMMU_DC_IOHGATP_MODE_SV32X4:
323                 sv_mode = pass ? RISCV_IOMMU_CAP_SV32X4 : RISCV_IOMMU_CAP_SV32;
324                 if (!(s->cap & sv_mode)) {
325                     return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
326                 }
327                 sc[pass].levels    = 2;
328                 sc[pass].ptidxbits = 10;
329                 sc[pass].ptesize   = 4;
330                 break;
331             default:
332                 return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
333             }
334         } else {
335             /* 64bit mode for GXL/SXL == 0 */
336             switch (pass ? gatp : satp) {
337             case RISCV_IOMMU_DC_IOHGATP_MODE_BARE:
338                 sc[pass].levels    = 0;
339                 sc[pass].ptidxbits = 0;
340                 sc[pass].ptesize   = 0;
341                 break;
342             case RISCV_IOMMU_DC_IOHGATP_MODE_SV39X4:
343                 sv_mode = pass ? RISCV_IOMMU_CAP_SV39X4 : RISCV_IOMMU_CAP_SV39;
344                 if (!(s->cap & sv_mode)) {
345                     return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
346                 }
347                 sc[pass].levels    = 3;
348                 sc[pass].ptidxbits = 9;
349                 sc[pass].ptesize   = 8;
350                 break;
351             case RISCV_IOMMU_DC_IOHGATP_MODE_SV48X4:
352                 sv_mode = pass ? RISCV_IOMMU_CAP_SV48X4 : RISCV_IOMMU_CAP_SV48;
353                 if (!(s->cap & sv_mode)) {
354                     return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
355                 }
356                 sc[pass].levels    = 4;
357                 sc[pass].ptidxbits = 9;
358                 sc[pass].ptesize   = 8;
359                 break;
360             case RISCV_IOMMU_DC_IOHGATP_MODE_SV57X4:
361                 sv_mode = pass ? RISCV_IOMMU_CAP_SV57X4 : RISCV_IOMMU_CAP_SV57;
362                 if (!(s->cap & sv_mode)) {
363                     return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
364                 }
365                 sc[pass].levels    = 5;
366                 sc[pass].ptidxbits = 9;
367                 sc[pass].ptesize   = 8;
368                 break;
369             default:
370                 return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
371             }
372         }
373     }
374 
375     /* S/G stages translation tables root pointers */
376     gatp = PPN_PHYS(get_field(ctx->gatp, RISCV_IOMMU_ATP_PPN_FIELD));
377     satp = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_ATP_PPN_FIELD));
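    /*
     * With both stages enabled, start by translating the S-stage root
     * pointer (a GPA) through the G-stage page table.
     */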
378     addr = (en_s && en_g) ? satp : iotlb->iova;
379     base = en_g ? gatp : satp;
380     pass = en_g ? G_STAGE : S_STAGE;
381 
382     do {
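        /*
         * The root G-stage page table is four times larger, so its index
         * field is widened by two bits (Sv39x4/Sv48x4/Sv57x4).
         */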
383         const unsigned widened = (pass && !sc[pass].step) ? 2 : 0;
384         const unsigned va_bits = widened + sc[pass].ptidxbits;
385         const unsigned va_skip = TARGET_PAGE_BITS + sc[pass].ptidxbits *
386                                  (sc[pass].levels - 1 - sc[pass].step);
387         const unsigned idx = (addr >> va_skip) & ((1 << va_bits) - 1);
388         const dma_addr_t pte_addr = base + idx * sc[pass].ptesize;
389         const bool ade =
390             ctx->tc & (pass ? RISCV_IOMMU_DC_TC_GADE : RISCV_IOMMU_DC_TC_SADE);
391 
392         /* Address range check before first level lookup */
393         if (!sc[pass].step) {
394             const uint64_t va_len = va_skip + va_bits;
395             const uint64_t va_mask = (1ULL << va_len) - 1;
396 
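            /*
             * S-stage Sv39/48/57: bits [63:va_len-1] must all equal the top
             * VA bit (sign extension). Otherwise (G-stage or 32-bit modes)
             * the address must fit entirely within va_len bits.
             */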
397             if (pass == S_STAGE && va_len > 32) {
398                 target_ulong mask, masked_msbs;
399 
400                 mask = (1L << (TARGET_LONG_BITS - (va_len - 1))) - 1;
401                 masked_msbs = (addr >> (va_len - 1)) & mask;
402 
403                 if (masked_msbs != 0 && masked_msbs != mask) {
404                     return (iotlb->perm & IOMMU_WO) ?
405                                 RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S :
406                                 RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S;
407                 }
408             } else {
409                 if ((addr & va_mask) != addr) {
410                     return (iotlb->perm & IOMMU_WO) ?
411                                 RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
412                                 RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS;
413                 }
414             }
415         }
416 
417         /* Read page table entry */
418         if (sc[pass].ptesize == 4) {
419             uint32_t pte32 = 0;
420             ret = ldl_le_dma(s->target_as, pte_addr, &pte32,
421                              MEMTXATTRS_UNSPECIFIED);
422             pte = pte32;
423         } else {
424             ret = ldq_le_dma(s->target_as, pte_addr, &pte,
425                              MEMTXATTRS_UNSPECIFIED);
426         }
427         if (ret != MEMTX_OK) {
428             return (iotlb->perm & IOMMU_WO) ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT
429                                             : RISCV_IOMMU_FQ_CAUSE_RD_FAULT;
430         }
431 
432         sc[pass].step++;
433         hwaddr ppn = pte >> PTE_PPN_SHIFT;
434 
435         if (!(pte & PTE_V)) {
436             break;                /* Invalid PTE */
437         } else if (!(pte & (PTE_R | PTE_W | PTE_X))) {
438             base = PPN_PHYS(ppn); /* Inner PTE, continue walking */
439         } else if ((pte & (PTE_R | PTE_W | PTE_X)) == PTE_W) {
440             break;                /* Reserved leaf PTE flags: PTE_W */
441         } else if ((pte & (PTE_R | PTE_W | PTE_X)) == (PTE_W | PTE_X)) {
442             break;                /* Reserved leaf PTE flags: PTE_W + PTE_X */
443         } else if (ppn & ((1ULL << (va_skip - TARGET_PAGE_BITS)) - 1)) {
444             break;                /* Misaligned PPN */
445         } else if ((iotlb->perm & IOMMU_RO) && !(pte & PTE_R)) {
446             break;                /* Read access check failed */
447         } else if ((iotlb->perm & IOMMU_WO) && !(pte & PTE_W)) {
448             break;                /* Write access check failed */
449         } else if ((iotlb->perm & IOMMU_RO) && !ade && !(pte & PTE_A)) {
450             break;                /* Access bit not set */
451         } else if ((iotlb->perm & IOMMU_WO) && !ade && !(pte & PTE_D)) {
452             break;                /* Dirty bit not set */
453         } else {
454             /* Leaf PTE, translation completed. */
455             sc[pass].step = sc[pass].levels;
456             base = PPN_PHYS(ppn) | (addr & ((1ULL << va_skip) - 1));
457             /* Update address mask based on smallest translation granularity */
458             iotlb->addr_mask &= (1ULL << va_skip) - 1;
459             /* Continue with S-Stage translation? */
460             if (pass && sc[0].step != sc[0].levels) {
461                 pass = S_STAGE;
462                 addr = iotlb->iova;
463                 continue;
464             }
465             /* Translation phase completed (GPA or SPA) */
466             iotlb->translated_addr = base;
467             iotlb->perm = (pte & PTE_W) ? ((pte & PTE_R) ? IOMMU_RW : IOMMU_WO)
468                                                          : IOMMU_RO;
469 
470             /* Check MSI GPA address match */
471             if (pass == S_STAGE && (iotlb->perm & IOMMU_WO) &&
472                 riscv_iommu_msi_check(s, ctx, base)) {
473                 /* Trap MSI writes and return GPA address. */
474                 iotlb->target_as = &s->trap_as;
475                 iotlb->addr_mask = ~TARGET_PAGE_MASK;
476                 return 0;
477             }
478 
479             /* Continue with G-Stage translation? */
480             if (!pass && en_g) {
481                 pass = G_STAGE;
482                 addr = base;
483                 base = gatp;
484                 sc[pass].step = 0;
485                 continue;
486             }
487 
488             return 0;
489         }
490 
491         if (sc[pass].step == sc[pass].levels) {
492             break; /* Can't find leaf PTE */
493         }
494 
495         /* Continue with G-Stage translation? */
496         if (!pass && en_g) {
497             pass = G_STAGE;
498             addr = base;
499             base = gatp;
500             sc[pass].step = 0;
501         }
502     } while (1);
503 
504     return (iotlb->perm & IOMMU_WO) ?
505                 (pass ? RISCV_IOMMU_FQ_CAUSE_WR_FAULT_VS :
506                         RISCV_IOMMU_FQ_CAUSE_WR_FAULT_S) :
507                 (pass ? RISCV_IOMMU_FQ_CAUSE_RD_FAULT_VS :
508                         RISCV_IOMMU_FQ_CAUSE_RD_FAULT_S);
509 }
510 
511 static void riscv_iommu_report_fault(RISCVIOMMUState *s,
512                                      RISCVIOMMUContext *ctx,
513                                      uint32_t fault_type, uint32_t cause,
514                                      bool pv,
515                                      uint64_t iotval, uint64_t iotval2)
516 {
517     struct riscv_iommu_fq_record ev = { 0 };
518 
519     if (ctx->tc & RISCV_IOMMU_DC_TC_DTF) {
520         switch (cause) {
521         case RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED:
522         case RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT:
523         case RISCV_IOMMU_FQ_CAUSE_DDT_INVALID:
524         case RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED:
525         case RISCV_IOMMU_FQ_CAUSE_DDT_CORRUPTED:
526         case RISCV_IOMMU_FQ_CAUSE_INTERNAL_DP_ERROR:
527         case RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT:
528             break;
529         default:
530             /* DTF prevents reporting a fault for this given cause */
531             return;
532         }
533     }
534 
535     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_CAUSE, cause);
536     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_TTYPE, fault_type);
537     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_DID, ctx->devid);
538     ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PV, pv);
539 
540     if (pv) {
541         ev.hdr = set_field(ev.hdr, RISCV_IOMMU_FQ_HDR_PID, ctx->process_id);
542     }
543 
544     ev.iotval = iotval;
545     ev.iotval2 = iotval2;
546 
547     riscv_iommu_fault(s, &ev);
548 }
549 
550 /* Redirect MSI write for given GPA. */
551 static MemTxResult riscv_iommu_msi_write(RISCVIOMMUState *s,
552     RISCVIOMMUContext *ctx, uint64_t gpa, uint64_t data,
553     unsigned size, MemTxAttrs attrs)
554 {
555     MemTxResult res;
556     dma_addr_t addr;
557     uint64_t intn;
558     uint32_t n190;
559     uint64_t pte[2];
560     int fault_type = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
561     int cause;
562 
563     /* Interrupt File Number */
564     intn = riscv_iommu_pext_u64(PPN_DOWN(gpa), ctx->msi_addr_mask);
565     if (intn >= 256) {
566         /* Interrupt file number out of range */
567         res = MEMTX_ACCESS_ERROR;
568         cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
569         goto err;
570     }
571 
572     /* fetch MSI PTE */
573     addr = PPN_PHYS(get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_PPN));
574     addr = addr | (intn * sizeof(pte));
575     res = dma_memory_read(s->target_as, addr, &pte, sizeof(pte),
576             MEMTXATTRS_UNSPECIFIED);
577     if (res != MEMTX_OK) {
578         if (res == MEMTX_DECODE_ERROR) {
579             cause = RISCV_IOMMU_FQ_CAUSE_MSI_PT_CORRUPTED;
580         } else {
581             cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
582         }
583         goto err;
584     }
585 
586     le64_to_cpus(&pte[0]);
587     le64_to_cpus(&pte[1]);
588 
589     if (!(pte[0] & RISCV_IOMMU_MSI_PTE_V) || (pte[0] & RISCV_IOMMU_MSI_PTE_C)) {
590         /*
591          * The spec mentions that: "If msipte.C == 1, then further
592          * processing to interpret the PTE is implementation
593          * defined.". We'll abort with cause = 262 for this
594          * case too.
595          */
596         res = MEMTX_ACCESS_ERROR;
597         cause = RISCV_IOMMU_FQ_CAUSE_MSI_INVALID;
598         goto err;
599     }
600 
601     switch (get_field(pte[0], RISCV_IOMMU_MSI_PTE_M)) {
602     case RISCV_IOMMU_MSI_PTE_M_BASIC:
603         /* MSI Pass-through mode */
604         addr = PPN_PHYS(get_field(pte[0], RISCV_IOMMU_MSI_PTE_PPN));
605 
606         trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
607                               PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
608                               gpa, addr);
609 
610         res = dma_memory_write(s->target_as, addr, &data, size, attrs);
611         if (res != MEMTX_OK) {
612             cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
613             goto err;
614         }
615 
616         return MEMTX_OK;
617     case RISCV_IOMMU_MSI_PTE_M_MRIF:
618         /* MRIF mode, continue. */
619         break;
620     default:
621         res = MEMTX_ACCESS_ERROR;
622         cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
623         goto err;
624     }
625 
626     /*
627      * Report an error for interrupt identities exceeding the maximum allowed
628      * for an IMSIC interrupt file (2047) or destination address is not 32-bit
629      * aligned. See IOMMU Specification, Chapter 2.3. MSI page tables.
630      */
631     if ((data > 2047) || (gpa & 3)) {
632         res = MEMTX_ACCESS_ERROR;
633         cause = RISCV_IOMMU_FQ_CAUSE_MSI_MISCONFIGURED;
634         goto err;
635     }
636 
637     /* MSI MRIF mode, non-atomic pending bit update */
638 
639     /* MRIF pending bit address */
640     addr = get_field(pte[0], RISCV_IOMMU_MSI_PTE_MRIF_ADDR) << 9;
641     addr = addr | ((data & 0x7c0) >> 3);
642 
643     trace_riscv_iommu_msi(s->parent_obj.id, PCI_BUS_NUM(ctx->devid),
644                           PCI_SLOT(ctx->devid), PCI_FUNC(ctx->devid),
645                           gpa, addr);
646 
647     /* MRIF pending bit mask */
648     data = 1ULL << (data & 0x03f);
649     res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
650     if (res != MEMTX_OK) {
651         cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
652         goto err;
653     }
654 
655     intn = intn | data;
656     res = dma_memory_write(s->target_as, addr, &intn, sizeof(intn), attrs);
657     if (res != MEMTX_OK) {
658         cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
659         goto err;
660     }
661 
662     /* Get MRIF enable bits */
663     addr = addr + sizeof(intn);
664     res = dma_memory_read(s->target_as, addr, &intn, sizeof(intn), attrs);
665     if (res != MEMTX_OK) {
666         cause = RISCV_IOMMU_FQ_CAUSE_MSI_LOAD_FAULT;
667         goto err;
668     }
669 
670     if (!(intn & data)) {
671         /* notification disabled, MRIF update completed. */
672         return MEMTX_OK;
673     }
674 
675     /* Send notification message */
676     addr = PPN_PHYS(get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NPPN));
677     n190 = get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID) |
678           (get_field(pte[1], RISCV_IOMMU_MSI_MRIF_NID_MSB) << 10);
679 
680     res = dma_memory_write(s->target_as, addr, &n190, sizeof(n190), attrs);
681     if (res != MEMTX_OK) {
682         cause = RISCV_IOMMU_FQ_CAUSE_MSI_WR_FAULT;
683         goto err;
684     }
685 
686     trace_riscv_iommu_mrif_notification(s->parent_obj.id, n190, addr);
687 
688     return MEMTX_OK;
689 
690 err:
691     riscv_iommu_report_fault(s, ctx, fault_type, cause,
692                              !!ctx->process_id, 0, 0);
693     return res;
694 }
695 
696 /*
697  * Check device context configuration as described by the
698  * riscv-iommu spec section "Device-context configuration
699  * checks".
700  */
701 static bool riscv_iommu_validate_device_ctx(RISCVIOMMUState *s,
702                                             RISCVIOMMUContext *ctx)
703 {
704     uint32_t fsc_mode, msi_mode;
705     uint64_t gatp;
706 
707     if (!(s->cap & RISCV_IOMMU_CAP_ATS) &&
708         (ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS ||
709          ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI ||
710          ctx->tc & RISCV_IOMMU_DC_TC_PRPR)) {
711         return false;
712     }
713 
714     if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS) &&
715         (ctx->tc & RISCV_IOMMU_DC_TC_T2GPA ||
716          ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI)) {
717         return false;
718     }
719 
720     if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_PRI) &&
721         ctx->tc & RISCV_IOMMU_DC_TC_PRPR) {
722         return false;
723     }
724 
725     if (!(s->cap & RISCV_IOMMU_CAP_T2GPA) &&
726         ctx->tc & RISCV_IOMMU_DC_TC_T2GPA) {
727         return false;
728     }
729 
730     if (s->cap & RISCV_IOMMU_CAP_MSI_FLAT) {
731         msi_mode = get_field(ctx->msiptp, RISCV_IOMMU_DC_MSIPTP_MODE);
732 
733         if (msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_OFF &&
734             msi_mode != RISCV_IOMMU_DC_MSIPTP_MODE_FLAT) {
735             return false;
736         }
737     }
738 
739     gatp = get_field(ctx->gatp, RISCV_IOMMU_ATP_MODE_FIELD);
740     if (ctx->tc & RISCV_IOMMU_DC_TC_T2GPA &&
741         gatp == RISCV_IOMMU_DC_IOHGATP_MODE_BARE) {
742         return false;
743     }
744 
745     fsc_mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
746 
747     if (ctx->tc & RISCV_IOMMU_DC_TC_PDTV) {
748         switch (fsc_mode) {
749         case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8:
750             if (!(s->cap & RISCV_IOMMU_CAP_PD8)) {
751                 return false;
752             }
753             break;
754         case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD17:
755             if (!(s->cap & RISCV_IOMMU_CAP_PD17)) {
756                 return false;
757             }
758             break;
759         case RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20:
760             if (!(s->cap & RISCV_IOMMU_CAP_PD20)) {
761                 return false;
762             }
763             break;
764         }
765     } else {
766         /* DC.tc.PDTV is 0 */
767         if (ctx->tc & RISCV_IOMMU_DC_TC_DPE) {
768             return false;
769         }
770 
771         if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
772             if (fsc_mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 &&
773                 !(s->cap & RISCV_IOMMU_CAP_SV32)) {
774                 return false;
775             }
776         } else {
777             switch (fsc_mode) {
778             case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
779                 if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
780                     return false;
781                 }
782                 break;
783             case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
784                 if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
785                     return false;
786                 }
787                 break;
788             case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
789                 if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
790                     return false;
791                 }
792                 break;
793             }
794         }
795     }
796 
797     /*
798      * CAP_END is always zero (only one endianess). FCTL_BE is
799      * always zero (little-endian accesses). Thus TC_SBE must
800      * always be LE, i.e. zero.
801      */
802     if (ctx->tc & RISCV_IOMMU_DC_TC_SBE) {
803         return false;
804     }
805 
806     return true;
807 }
808 
809 /*
810  * Validate process context (PC) according to section
811  * "Process-context configuration checks".
812  */
813 static bool riscv_iommu_validate_process_ctx(RISCVIOMMUState *s,
814                                              RISCVIOMMUContext *ctx)
815 {
816     uint32_t mode;
817 
818     if (get_field(ctx->ta, RISCV_IOMMU_PC_TA_RESERVED)) {
819         return false;
820     }
821 
822     if (get_field(ctx->satp, RISCV_IOMMU_PC_FSC_RESERVED)) {
823         return false;
824     }
825 
826     mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
827     switch (mode) {
828     case RISCV_IOMMU_DC_FSC_MODE_BARE:
829     /* sv39 and sv32 modes have the same value (8) */
830     case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
831     case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
832     case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
833         break;
834     default:
835         return false;
836     }
837 
838     if (ctx->tc & RISCV_IOMMU_DC_TC_SXL) {
839         if (mode == RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV32 &&
840             !(s->cap & RISCV_IOMMU_CAP_SV32)) {
841                 return false;
842         }
843     } else {
844         switch (mode) {
845         case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39:
846             if (!(s->cap & RISCV_IOMMU_CAP_SV39)) {
847                 return false;
848             }
849             break;
850         case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV48:
851             if (!(s->cap & RISCV_IOMMU_CAP_SV48)) {
852                 return false;
853             }
854             break;
855         case RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57:
856             if (!(s->cap & RISCV_IOMMU_CAP_SV57)) {
857                 return false;
858             }
859             break;
860         }
861     }
862 
863     return true;
864 }
865 
866 /*
867  * RISC-V IOMMU Device Context Lookup - Device Directory Tree Walk
868  *
869  * @s         : IOMMU Device State
870  * @ctx       : Device Translation Context with devid and process_id set.
871  * @return    : success or fault code.
872  */
873 static int riscv_iommu_ctx_fetch(RISCVIOMMUState *s, RISCVIOMMUContext *ctx)
874 {
875     const uint64_t ddtp = s->ddtp;
876     unsigned mode = get_field(ddtp, RISCV_IOMMU_DDTP_MODE);
877     dma_addr_t addr = PPN_PHYS(get_field(ddtp, RISCV_IOMMU_DDTP_PPN));
878     struct riscv_iommu_dc dc;
879     /* Device Context format: 0: extended (64 bytes) | 1: base (32 bytes) */
880     const int dc_fmt = !s->enable_msi;
881     const size_t dc_len = sizeof(dc) >> dc_fmt;
882     int depth;
883     uint64_t de;
884 
885     switch (mode) {
886     case RISCV_IOMMU_DDTP_MODE_OFF:
887         return RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED;
888 
889     case RISCV_IOMMU_DDTP_MODE_BARE:
890         /* mock up pass-through translation context */
891         ctx->gatp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
892             RISCV_IOMMU_DC_IOHGATP_MODE_BARE);
893         ctx->satp = set_field(0, RISCV_IOMMU_ATP_MODE_FIELD,
894             RISCV_IOMMU_DC_FSC_MODE_BARE);
895 
896         ctx->tc = RISCV_IOMMU_DC_TC_V;
897         if (s->enable_ats) {
898             ctx->tc |= RISCV_IOMMU_DC_TC_EN_ATS;
899         }
900 
901         ctx->ta = 0;
902         ctx->msiptp = 0;
903         return 0;
904 
905     case RISCV_IOMMU_DDTP_MODE_1LVL:
906         depth = 0;
907         break;
908 
909     case RISCV_IOMMU_DDTP_MODE_2LVL:
910         depth = 1;
911         break;
912 
913     case RISCV_IOMMU_DDTP_MODE_3LVL:
914         depth = 2;
915         break;
916 
917     default:
918         return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
919     }
920 
921     /*
922      * Check supported device id width (in bits).
923      * See IOMMU Specification, Chapter 6. Software guidelines.
924      * - if extended device-context format is used:
925      *   1LVL: 6, 2LVL: 15, 3LVL: 24
926      * - if base device-context format is used:
927      *   1LVL: 7, 2LVL: 16, 3LVL: 24
928      */
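    /*
     * e.g. extended format (dc_fmt == 0) with 2LVL (depth == 1):
     * 1 << (9 + 6) = 2^15 supported device ids.
     */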
929     if (ctx->devid >= (1 << (depth * 9 + 6 + (dc_fmt && depth != 2)))) {
930         return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
931     }
932 
933     /* Device directory tree walk */
934     for (; depth-- > 0; ) {
935         /*
936          * Select device id index bits based on device directory tree level
937          * and device context format.
938          * See IOMMU Specification, Chapter 2. Data Structures.
939          * - if extended device-context format is used:
940          *   device index: [23:15][14:6][5:0]
941          * - if base device-context format is used:
942          *   device index: [23:16][15:7][6:0]
943          */
944         const int split = depth * 9 + 6 + dc_fmt;
945         addr |= ((ctx->devid >> split) << 3) & ~TARGET_PAGE_MASK;
946         if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
947                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
948             return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
949         }
950         le64_to_cpus(&de);
951         if (!(de & RISCV_IOMMU_DDTE_VALID)) {
952             /* invalid directory entry */
953             return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
954         }
955         if (de & ~(RISCV_IOMMU_DDTE_PPN | RISCV_IOMMU_DDTE_VALID)) {
956             /* reserved bits set */
957             return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
958         }
959         addr = PPN_PHYS(get_field(de, RISCV_IOMMU_DDTE_PPN));
960     }
961 
962     /* index into device context entry page */
963     addr |= (ctx->devid * dc_len) & ~TARGET_PAGE_MASK;
964 
965     memset(&dc, 0, sizeof(dc));
966     if (dma_memory_read(s->target_as, addr, &dc, dc_len,
967                         MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
968         return RISCV_IOMMU_FQ_CAUSE_DDT_LOAD_FAULT;
969     }
970 
971     /* Set translation context. */
972     ctx->tc = le64_to_cpu(dc.tc);
973     ctx->gatp = le64_to_cpu(dc.iohgatp);
974     ctx->satp = le64_to_cpu(dc.fsc);
975     ctx->ta = le64_to_cpu(dc.ta);
976     ctx->msiptp = le64_to_cpu(dc.msiptp);
977     ctx->msi_addr_mask = le64_to_cpu(dc.msi_addr_mask);
978     ctx->msi_addr_pattern = le64_to_cpu(dc.msi_addr_pattern);
979 
980     if (!(ctx->tc & RISCV_IOMMU_DC_TC_V)) {
981         return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
982     }
983 
984     if (!riscv_iommu_validate_device_ctx(s, ctx)) {
985         return RISCV_IOMMU_FQ_CAUSE_DDT_MISCONFIGURED;
986     }
987 
988     /* FSC field checks */
989     mode = get_field(ctx->satp, RISCV_IOMMU_DC_FSC_MODE);
990     addr = PPN_PHYS(get_field(ctx->satp, RISCV_IOMMU_DC_FSC_PPN));
991 
992     if (!(ctx->tc & RISCV_IOMMU_DC_TC_PDTV)) {
993         if (ctx->process_id != RISCV_IOMMU_NOPROCID) {
994             /* PID is disabled */
995             return RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
996         }
997         if (mode > RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57) {
998             /* Invalid translation mode */
999             return RISCV_IOMMU_FQ_CAUSE_DDT_INVALID;
1000         }
1001         return 0;
1002     }
1003 
1004     if (ctx->process_id == RISCV_IOMMU_NOPROCID) {
1005         if (!(ctx->tc & RISCV_IOMMU_DC_TC_DPE)) {
1006             /* No default process_id enabled, set BARE mode */
1007             ctx->satp = 0ULL;
1008             return 0;
1009         } else {
1010             /* Use default process_id #0 */
1011             ctx->process_id = 0;
1012         }
1013     }
1014 
1015     if (mode == RISCV_IOMMU_DC_FSC_MODE_BARE) {
1016         /* No S-Stage translation, done. */
1017         return 0;
1018     }
1019 
1020     /* DC.tc.PDTV enabled */
1021     if (mode > RISCV_IOMMU_DC_FSC_PDTP_MODE_PD20) {
1022         /* Invalid PDTP.MODE */
1023         return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
1024     }
1025 
1026     for (depth = mode - RISCV_IOMMU_DC_FSC_PDTP_MODE_PD8; depth-- > 0; ) {
1027         /*
1028          * Select process id index bits based on process directory tree
1029          * level. See IOMMU Specification, 2.2. Process-Directory-Table.
1030          */
1031         const int split = depth * 9 + 8;
1032         addr |= ((ctx->process_id >> split) << 3) & ~TARGET_PAGE_MASK;
1033         if (dma_memory_read(s->target_as, addr, &de, sizeof(de),
1034                             MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
1035             return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
1036         }
1037         le64_to_cpus(&de);
1038         if (!(de & RISCV_IOMMU_PC_TA_V)) {
1039             return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
1040         }
1041         addr = PPN_PHYS(get_field(de, RISCV_IOMMU_PC_FSC_PPN));
1042     }
1043 
1044     /* Leaf entry in PDT */
1045     addr |= (ctx->process_id << 4) & ~TARGET_PAGE_MASK;
1046     if (dma_memory_read(s->target_as, addr, &dc.ta, sizeof(uint64_t) * 2,
1047                         MEMTXATTRS_UNSPECIFIED) != MEMTX_OK) {
1048         return RISCV_IOMMU_FQ_CAUSE_PDT_LOAD_FAULT;
1049     }
1050 
1051     /* Use FSC and TA from process directory entry. */
1052     ctx->ta = le64_to_cpu(dc.ta);
1053     ctx->satp = le64_to_cpu(dc.fsc);
1054 
1055     if (!(ctx->ta & RISCV_IOMMU_PC_TA_V)) {
1056         return RISCV_IOMMU_FQ_CAUSE_PDT_INVALID;
1057     }
1058 
1059     if (!riscv_iommu_validate_process_ctx(s, ctx)) {
1060         return RISCV_IOMMU_FQ_CAUSE_PDT_MISCONFIGURED;
1061     }
1062 
1063     return 0;
1064 }
1065 
1066 /* Translation Context cache support */
1067 static gboolean riscv_iommu_ctx_equal(gconstpointer v1, gconstpointer v2)
1068 {
1069     RISCVIOMMUContext *c1 = (RISCVIOMMUContext *) v1;
1070     RISCVIOMMUContext *c2 = (RISCVIOMMUContext *) v2;
1071     return c1->devid == c2->devid &&
1072            c1->process_id == c2->process_id;
1073 }
1074 
1075 static guint riscv_iommu_ctx_hash(gconstpointer v)
1076 {
1077     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) v;
1078     /*
1079      * Generate simple hash of (process_id, devid)
1080      * assuming 24-bit wide devid.
1081      */
1082     return (guint)(ctx->devid) + ((guint)(ctx->process_id) << 24);
1083 }
1084 
1085 static void riscv_iommu_ctx_inval_devid_procid(gpointer key, gpointer value,
1086                                                gpointer data)
1087 {
1088     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
1089     RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
1090     if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
1091         ctx->devid == arg->devid &&
1092         ctx->process_id == arg->process_id) {
1093         ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
1094     }
1095 }
1096 
1097 static void riscv_iommu_ctx_inval_devid(gpointer key, gpointer value,
1098                                         gpointer data)
1099 {
1100     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
1101     RISCVIOMMUContext *arg = (RISCVIOMMUContext *) data;
1102     if (ctx->tc & RISCV_IOMMU_DC_TC_V &&
1103         ctx->devid == arg->devid) {
1104         ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
1105     }
1106 }
1107 
1108 static void riscv_iommu_ctx_inval_all(gpointer key, gpointer value,
1109                                       gpointer data)
1110 {
1111     RISCVIOMMUContext *ctx = (RISCVIOMMUContext *) value;
1112     if (ctx->tc & RISCV_IOMMU_DC_TC_V) {
1113         ctx->tc &= ~RISCV_IOMMU_DC_TC_V;
1114     }
1115 }
1116 
1117 static void riscv_iommu_ctx_inval(RISCVIOMMUState *s, GHFunc func,
1118                                   uint32_t devid, uint32_t process_id)
1119 {
1120     GHashTable *ctx_cache;
1121     RISCVIOMMUContext key = {
1122         .devid = devid,
1123         .process_id = process_id,
1124     };
1125     ctx_cache = g_hash_table_ref(s->ctx_cache);
1126     g_hash_table_foreach(ctx_cache, func, &key);
1127     g_hash_table_unref(ctx_cache);
1128 }
1129 
1130 /* Find or allocate translation context for a given {device_id, process_id} */
1131 static RISCVIOMMUContext *riscv_iommu_ctx(RISCVIOMMUState *s,
1132                                           unsigned devid, unsigned process_id,
1133                                           void **ref)
1134 {
1135     GHashTable *ctx_cache;
1136     RISCVIOMMUContext *ctx;
1137     RISCVIOMMUContext key = {
1138         .devid = devid,
1139         .process_id = process_id,
1140     };
1141 
1142     ctx_cache = g_hash_table_ref(s->ctx_cache);
1143     ctx = g_hash_table_lookup(ctx_cache, &key);
1144 
1145     if (ctx && (ctx->tc & RISCV_IOMMU_DC_TC_V)) {
1146         *ref = ctx_cache;
1147         return ctx;
1148     }
1149 
1150     ctx = g_new0(RISCVIOMMUContext, 1);
1151     ctx->devid = devid;
1152     ctx->process_id = process_id;
1153 
1154     int fault = riscv_iommu_ctx_fetch(s, ctx);
1155     if (!fault) {
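        /*
         * Cache full: replace the whole table with a fresh one instead of
         * evicting individual entries.
         */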
1156         if (g_hash_table_size(ctx_cache) >= LIMIT_CACHE_CTX) {
1157             g_hash_table_unref(ctx_cache);
1158             ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
1159                                               riscv_iommu_ctx_equal,
1160                                               g_free, NULL);
1161             g_hash_table_ref(ctx_cache);
1162             g_hash_table_unref(qatomic_xchg(&s->ctx_cache, ctx_cache));
1163         }
1164         g_hash_table_add(ctx_cache, ctx);
1165         *ref = ctx_cache;
1166         return ctx;
1167     }
1168 
1169     g_hash_table_unref(ctx_cache);
1170     *ref = NULL;
1171 
1172     riscv_iommu_report_fault(s, ctx, RISCV_IOMMU_FQ_TTYPE_UADDR_RD,
1173                              fault, !!process_id, 0, 0);
1174 
1175     g_free(ctx);
1176     return NULL;
1177 }
1178 
1179 static void riscv_iommu_ctx_put(RISCVIOMMUState *s, void *ref)
1180 {
1181     if (ref) {
1182         g_hash_table_unref((GHashTable *)ref);
1183     }
1184 }
1185 
1186 /* Find or allocate address space for a given device */
1187 static AddressSpace *riscv_iommu_space(RISCVIOMMUState *s, uint32_t devid)
1188 {
1189     RISCVIOMMUSpace *as;
1190 
1191     /* FIXME: PCIe bus remapping for attached endpoints. */
1192     devid |= s->bus << 8;
1193 
1194     QLIST_FOREACH(as, &s->spaces, list) {
1195         if (as->devid == devid) {
1196             break;
1197         }
1198     }
1199 
1200     if (as == NULL) {
1201         char name[64];
1202         as = g_new0(RISCVIOMMUSpace, 1);
1203 
1204         as->iommu = s;
1205         as->devid = devid;
1206 
1207         snprintf(name, sizeof(name), "riscv-iommu-%04x:%02x.%d-iova",
1208             PCI_BUS_NUM(as->devid), PCI_SLOT(as->devid), PCI_FUNC(as->devid));
1209 
1210         /* IOVA address space, untranslated addresses */
1211         memory_region_init_iommu(&as->iova_mr, sizeof(as->iova_mr),
1212             TYPE_RISCV_IOMMU_MEMORY_REGION,
1213             OBJECT(as), "riscv_iommu", UINT64_MAX);
1214         address_space_init(&as->iova_as, MEMORY_REGION(&as->iova_mr), name);
1215 
1216         QLIST_INSERT_HEAD(&s->spaces, as, list);
1217 
1218         trace_riscv_iommu_new(s->parent_obj.id, PCI_BUS_NUM(as->devid),
1219                 PCI_SLOT(as->devid), PCI_FUNC(as->devid));
1220     }
1221     return &as->iova_as;
1222 }
1223 
1224 /* Translation Object cache support */
1225 static gboolean riscv_iommu_iot_equal(gconstpointer v1, gconstpointer v2)
1226 {
1227     RISCVIOMMUEntry *t1 = (RISCVIOMMUEntry *) v1;
1228     RISCVIOMMUEntry *t2 = (RISCVIOMMUEntry *) v2;
1229     return t1->gscid == t2->gscid && t1->pscid == t2->pscid &&
1230            t1->iova == t2->iova;
1231 }
1232 
1233 static guint riscv_iommu_iot_hash(gconstpointer v)
1234 {
1235     RISCVIOMMUEntry *t = (RISCVIOMMUEntry *) v;
1236     return (guint)t->iova;
1237 }
1238 
1239 /* GV: 1 PSCV: 1 AV: 1 */
1240 static void riscv_iommu_iot_inval_pscid_iova(gpointer key, gpointer value,
1241                                              gpointer data)
1242 {
1243     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1244     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1245     if (iot->gscid == arg->gscid &&
1246         iot->pscid == arg->pscid &&
1247         iot->iova == arg->iova) {
1248         iot->perm = IOMMU_NONE;
1249     }
1250 }
1251 
1252 /* GV: 1 PSCV: 1 AV: 0 */
1253 static void riscv_iommu_iot_inval_pscid(gpointer key, gpointer value,
1254                                         gpointer data)
1255 {
1256     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1257     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1258     if (iot->gscid == arg->gscid &&
1259         iot->pscid == arg->pscid) {
1260         iot->perm = IOMMU_NONE;
1261     }
1262 }
1263 
1264 /* GV: 1 GVMA: 1 */
1265 static void riscv_iommu_iot_inval_gscid_gpa(gpointer key, gpointer value,
1266                                             gpointer data)
1267 {
1268     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1269     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1270     if (iot->gscid == arg->gscid) {
1271         /* simplified cache, no GPA matching */
1272         iot->perm = IOMMU_NONE;
1273     }
1274 }
1275 
1276 /* GV: 1 GVMA: 0 */
1277 static void riscv_iommu_iot_inval_gscid(gpointer key, gpointer value,
1278                                         gpointer data)
1279 {
1280     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1281     RISCVIOMMUEntry *arg = (RISCVIOMMUEntry *) data;
1282     if (iot->gscid == arg->gscid) {
1283         iot->perm = IOMMU_NONE;
1284     }
1285 }
1286 
1287 /* GV: 0 */
1288 static void riscv_iommu_iot_inval_all(gpointer key, gpointer value,
1289                                       gpointer data)
1290 {
1291     RISCVIOMMUEntry *iot = (RISCVIOMMUEntry *) value;
1292     iot->perm = IOMMU_NONE;
1293 }
1294 
1295 /* caller should keep ref-count for iot_cache object */
1296 static RISCVIOMMUEntry *riscv_iommu_iot_lookup(RISCVIOMMUContext *ctx,
1297     GHashTable *iot_cache, hwaddr iova)
1298 {
1299     RISCVIOMMUEntry key = {
1300         .gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID),
1301         .pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID),
1302         .iova  = PPN_DOWN(iova),
1303     };
1304     return g_hash_table_lookup(iot_cache, &key);
1305 }
1306 
1307 /* caller should keep ref-count for iot_cache object */
1308 static void riscv_iommu_iot_update(RISCVIOMMUState *s,
1309     GHashTable *iot_cache, RISCVIOMMUEntry *iot)
1310 {
1311     if (!s->iot_limit) {
1312         return;
1313     }
1314 
1315     if (g_hash_table_size(s->iot_cache) >= s->iot_limit) {
1316         iot_cache = g_hash_table_new_full(riscv_iommu_iot_hash,
1317                                           riscv_iommu_iot_equal,
1318                                           g_free, NULL);
1319         g_hash_table_unref(qatomic_xchg(&s->iot_cache, iot_cache));
1320     }
1321     g_hash_table_add(iot_cache, iot);
1322 }
1323 
1324 static void riscv_iommu_iot_inval(RISCVIOMMUState *s, GHFunc func,
1325     uint32_t gscid, uint32_t pscid, hwaddr iova)
1326 {
1327     GHashTable *iot_cache;
1328     RISCVIOMMUEntry key = {
1329         .gscid = gscid,
1330         .pscid = pscid,
1331         .iova  = PPN_DOWN(iova),
1332     };
1333 
1334     iot_cache = g_hash_table_ref(s->iot_cache);
1335     g_hash_table_foreach(iot_cache, func, &key);
1336     g_hash_table_unref(iot_cache);
1337 }
1338 
1339 static int riscv_iommu_translate(RISCVIOMMUState *s, RISCVIOMMUContext *ctx,
1340     IOMMUTLBEntry *iotlb, bool enable_cache)
1341 {
1342     RISCVIOMMUEntry *iot;
1343     IOMMUAccessFlags perm;
1344     bool enable_pid;
1345     bool enable_pri;
1346     GHashTable *iot_cache;
1347     int fault;
1348 
1349     iot_cache = g_hash_table_ref(s->iot_cache);
1350     /*
1351      * TC[32] is reserved for custom extensions, used here to temporarily
1352      * enable automatic page-request generation for ATS queries.
1353      */
1354     enable_pri = (iotlb->perm == IOMMU_NONE) && (ctx->tc & BIT_ULL(32));
1355     enable_pid = (ctx->tc & RISCV_IOMMU_DC_TC_PDTV);
1356 
1357     /* Check for ATS request. */
1358     if (iotlb->perm == IOMMU_NONE) {
1359         /* Check if ATS is disabled. */
1360         if (!(ctx->tc & RISCV_IOMMU_DC_TC_EN_ATS)) {
1361             enable_pri = false;
1362             fault = RISCV_IOMMU_FQ_CAUSE_TTYPE_BLOCKED;
1363             goto done;
1364         }
1365     }
1366 
1367     iot = riscv_iommu_iot_lookup(ctx, iot_cache, iotlb->iova);
1368     perm = iot ? iot->perm : IOMMU_NONE;
1369     if (perm != IOMMU_NONE) {
1370         iotlb->translated_addr = PPN_PHYS(iot->phys);
1371         iotlb->addr_mask = ~TARGET_PAGE_MASK;
1372         iotlb->perm = perm;
1373         fault = 0;
1374         goto done;
1375     }
1376 
1377     /* Translate using device directory / page table information. */
1378     fault = riscv_iommu_spa_fetch(s, ctx, iotlb);
1379 
1380     if (!fault && iotlb->target_as == &s->trap_as) {
1381         /* Do not cache trapped MSI translations */
1382         goto done;
1383     }
1384 
1385     /*
1386      * We made an implementation choice to not cache identity-mapped
1387      * translations, as allowed by the specification, to avoid
1388      * translation cache evictions for other devices sharing the
1389      * IOMMU hardware model.
1390      */
1391     if (!fault && iotlb->translated_addr != iotlb->iova && enable_cache) {
1392         iot = g_new0(RISCVIOMMUEntry, 1);
1393         iot->iova = PPN_DOWN(iotlb->iova);
1394         iot->phys = PPN_DOWN(iotlb->translated_addr);
1395         iot->gscid = get_field(ctx->gatp, RISCV_IOMMU_DC_IOHGATP_GSCID);
1396         iot->pscid = get_field(ctx->ta, RISCV_IOMMU_DC_TA_PSCID);
1397         iot->perm = iotlb->perm;
1398         riscv_iommu_iot_update(s, iot_cache, iot);
1399     }
1400 
1401 done:
1402     g_hash_table_unref(iot_cache);
1403 
1404     if (enable_pri && fault) {
1405         struct riscv_iommu_pq_record pr = {0};
1406         if (enable_pid) {
1407             pr.hdr = set_field(RISCV_IOMMU_PREQ_HDR_PV,
1408                                RISCV_IOMMU_PREQ_HDR_PID, ctx->process_id);
1409         }
1410         pr.hdr = set_field(pr.hdr, RISCV_IOMMU_PREQ_HDR_DID, ctx->devid);
1411         pr.payload = (iotlb->iova & TARGET_PAGE_MASK) |
1412                      RISCV_IOMMU_PREQ_PAYLOAD_M;
1413         riscv_iommu_pri(s, &pr);
1414         return fault;
1415     }
1416 
1417     if (fault) {
1418         unsigned ttype = RISCV_IOMMU_FQ_TTYPE_PCIE_ATS_REQ;
1419 
1420         if (iotlb->perm & IOMMU_RW) {
1421             ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_WR;
1422         } else if (iotlb->perm & IOMMU_RO) {
1423             ttype = RISCV_IOMMU_FQ_TTYPE_UADDR_RD;
1424         }
1425 
1426         riscv_iommu_report_fault(s, ctx, ttype, fault, enable_pid,
1427                                  iotlb->iova, iotlb->translated_addr);
1428         return fault;
1429     }
1430 
1431     return 0;
1432 }
1433 
1434 /* IOMMU Command Interface */
1435 static MemTxResult riscv_iommu_iofence(RISCVIOMMUState *s, bool notify,
1436     uint64_t addr, uint32_t data)
1437 {
1438     /*
1439      * ATS processing in this implementation of the IOMMU is synchronous,
1440      * no need to wait for completions here.
1441      */
1442     if (!notify) {
1443         return MEMTX_OK;
1444     }
1445 
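    /* AV set: signal completion by writing the 32-bit fence data to addr. */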
1446     return dma_memory_write(s->target_as, addr, &data, sizeof(data),
1447         MEMTXATTRS_UNSPECIFIED);
1448 }
1449 
1450 static void riscv_iommu_ats(RISCVIOMMUState *s,
1451     struct riscv_iommu_command *cmd, IOMMUNotifierFlag flag,
1452     IOMMUAccessFlags perm,
1453     void (*trace_fn)(const char *id))
1454 {
1455     RISCVIOMMUSpace *as = NULL;
1456     IOMMUNotifier *n;
1457     IOMMUTLBEvent event;
1458     uint32_t pid;
1459     uint32_t devid;
1460     const bool pv = cmd->dword0 & RISCV_IOMMU_CMD_ATS_PV;
1461 
1462     if (cmd->dword0 & RISCV_IOMMU_CMD_ATS_DSV) {
1463         /* Use device segment and requester id */
1464         devid = get_field(cmd->dword0,
1465             RISCV_IOMMU_CMD_ATS_DSEG | RISCV_IOMMU_CMD_ATS_RID);
1466     } else {
1467         devid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_RID);
1468     }
1469 
1470     pid = get_field(cmd->dword0, RISCV_IOMMU_CMD_ATS_PID);
1471 
1472     QLIST_FOREACH(as, &s->spaces, list) {
1473         if (as->devid == devid) {
1474             break;
1475         }
1476     }
1477 
1478     if (!as || !as->notifier) {
1479         return;
1480     }
1481 
1482     event.type = flag;
1483     event.entry.perm = perm;
1484     event.entry.target_as = s->target_as;
1485 
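    /*
     * Replay the event to every registered notifier; with PV set, only
     * notifiers whose IOMMU index matches the requested process id fire.
     */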
1486     IOMMU_NOTIFIER_FOREACH(n, &as->iova_mr) {
1487         if (!pv || n->iommu_idx == pid) {
1488             event.entry.iova = n->start;
1489             event.entry.addr_mask = n->end - n->start;
1490             trace_fn(as->iova_mr.parent_obj.name);
1491             memory_region_notify_iommu_one(n, &event);
1492         }
1493     }
1494 }
1495 
1496 static void riscv_iommu_ats_inval(RISCVIOMMUState *s,
1497     struct riscv_iommu_command *cmd)
1498 {
1499     return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_DEVIOTLB_UNMAP, IOMMU_NONE,
1500                            trace_riscv_iommu_ats_inval);
1501 }
1502 
1503 static void riscv_iommu_ats_prgr(RISCVIOMMUState *s,
1504     struct riscv_iommu_command *cmd)
1505 {
1506     unsigned resp_code = get_field(cmd->dword1,
1507                                    RISCV_IOMMU_CMD_ATS_PRGR_RESP_CODE);
1508 
1509     /* Using the access flag to carry response code information */
1510     IOMMUAccessFlags perm = resp_code ? IOMMU_NONE : IOMMU_RW;
1511     return riscv_iommu_ats(s, cmd, IOMMU_NOTIFIER_MAP, perm,
1512                            trace_riscv_iommu_ats_prgr);
1513 }
1514 
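/*
 * Example of the transition check below: a guest write switching MODE
 * directly from 2LVL to 3LVL is rejected and the previous DDTP value is
 * reported back, while a write from BARE to 3LVL is accepted with the
 * reserved and busy bits cleared.
 */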
1515 static void riscv_iommu_process_ddtp(RISCVIOMMUState *s)
1516 {
1517     uint64_t old_ddtp = s->ddtp;
1518     uint64_t new_ddtp = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_DDTP);
1519     unsigned new_mode = get_field(new_ddtp, RISCV_IOMMU_DDTP_MODE);
1520     unsigned old_mode = get_field(old_ddtp, RISCV_IOMMU_DDTP_MODE);
1521     bool ok = false;
1522 
1523     /*
1524      * Check for allowed DDTP.MODE transitions:
1525      * {OFF, BARE}        -> {OFF, BARE, 1LVL, 2LVL, 3LVL}
1526      * {1LVL, 2LVL, 3LVL} -> {OFF, BARE}
1527      */
1528     if (new_mode == old_mode ||
1529         new_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
1530         new_mode == RISCV_IOMMU_DDTP_MODE_BARE) {
1531         ok = true;
1532     } else if (new_mode == RISCV_IOMMU_DDTP_MODE_1LVL ||
1533                new_mode == RISCV_IOMMU_DDTP_MODE_2LVL ||
1534                new_mode == RISCV_IOMMU_DDTP_MODE_3LVL) {
1535         ok = old_mode == RISCV_IOMMU_DDTP_MODE_OFF ||
1536              old_mode == RISCV_IOMMU_DDTP_MODE_BARE;
1537     }
1538 
1539     if (ok) {
1540         /* clear reserved and busy bits, report back sanitized version */
1541         new_ddtp = set_field(new_ddtp & RISCV_IOMMU_DDTP_PPN,
1542                              RISCV_IOMMU_DDTP_MODE, new_mode);
1543     } else {
1544         new_ddtp = old_ddtp;
1545     }
1546     s->ddtp = new_ddtp;
1547 
1548     riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, new_ddtp);
1549 }
1550 
1551 /* Command function and opcode field. */
1552 #define RISCV_IOMMU_CMD(func, op) (((func) << 7) | (op))
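/*
 * For example, RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
 * RISCV_IOMMU_CMD_IOTINVAL_OPCODE) places the function field above the 7-bit
 * opcode, matching the combined OPCODE|FUNC value decoded from dword0 below.
 */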
1553 
1554 static void riscv_iommu_process_cq_tail(RISCVIOMMUState *s)
1555 {
1556     struct riscv_iommu_command cmd;
1557     MemTxResult res;
1558     dma_addr_t addr;
1559     uint32_t tail, head, ctrl;
1560     uint64_t cmd_opcode;
1561     GHFunc func;
1562 
1563     ctrl = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
1564     tail = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQT) & s->cq_mask;
1565     head = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQH) & s->cq_mask;
1566 
1567     /* Check for pending error or queue processing disabled */
1568     if (!(ctrl & RISCV_IOMMU_CQCSR_CQON) ||
1569         !!(ctrl & (RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CQMF))) {
1570         return;
1571     }
1572 
1573     while (tail != head) {
1574         addr = s->cq_addr + head * sizeof(cmd);
1575         res = dma_memory_read(s->target_as, addr, &cmd, sizeof(cmd),
1576                               MEMTXATTRS_UNSPECIFIED);
1577 
1578         if (res != MEMTX_OK) {
1579             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
1580                                   RISCV_IOMMU_CQCSR_CQMF, 0);
1581             goto fault;
1582         }
1583 
1584         trace_riscv_iommu_cmd(s->parent_obj.id, cmd.dword0, cmd.dword1);
1585 
1586         cmd_opcode = get_field(cmd.dword0,
1587                                RISCV_IOMMU_CMD_OPCODE | RISCV_IOMMU_CMD_FUNC);
1588 
1589         switch (cmd_opcode) {
1590         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOFENCE_FUNC_C,
1591                              RISCV_IOMMU_CMD_IOFENCE_OPCODE):
1592             res = riscv_iommu_iofence(s,
1593                 cmd.dword0 & RISCV_IOMMU_CMD_IOFENCE_AV, cmd.dword1 << 2,
1594                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOFENCE_DATA));
1595 
1596             if (res != MEMTX_OK) {
1597                 riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
1598                                       RISCV_IOMMU_CQCSR_CQMF, 0);
1599                 goto fault;
1600             }
1601             break;
1602 
1603         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_GVMA,
1604                              RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
1605             if (cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV) {
1606                 /* illegal command arguments: IOTINVAL.GVMA with PSCV == 1 */
1607                 goto cmd_ill;
1608             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
1609                 /* invalidate all cache mappings */
1610                 func = riscv_iommu_iot_inval_all;
1611             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
1612                 /* invalidate cache matching GSCID */
1613                 func = riscv_iommu_iot_inval_gscid;
1614             } else {
1615                 /* invalidate cache matching GSCID and ADDR (GPA) */
1616                 func = riscv_iommu_iot_inval_gscid_gpa;
1617             }
1618             riscv_iommu_iot_inval(s, func,
1619                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID), 0,
1620                 cmd.dword1 << 2 & TARGET_PAGE_MASK);
1621             break;
1622 
1623         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IOTINVAL_FUNC_VMA,
1624                              RISCV_IOMMU_CMD_IOTINVAL_OPCODE):
1625             if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_GV)) {
1626                 /* invalidate all cache mappings, simplified model */
1627                 func = riscv_iommu_iot_inval_all;
1628             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_PSCV)) {
1629                 /* invalidate cache matching GSCID, simplified model */
1630                 func = riscv_iommu_iot_inval_gscid;
1631             } else if (!(cmd.dword0 & RISCV_IOMMU_CMD_IOTINVAL_AV)) {
1632                 /* invalidate cache matching GSCID and PSCID */
1633                 func = riscv_iommu_iot_inval_pscid;
1634             } else {
1635                 /* invalidate cache matching GSCID and PSCID and ADDR (IOVA) */
1636                 func = riscv_iommu_iot_inval_pscid_iova;
1637             }
1638             riscv_iommu_iot_inval(s, func,
1639                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_GSCID),
1640                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IOTINVAL_PSCID),
1641                 cmd.dword1 << 2 & TARGET_PAGE_MASK);
1642             break;
1643 
1644         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_DDT,
1645                              RISCV_IOMMU_CMD_IODIR_OPCODE):
1646             if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
1647                 /* invalidate all device context cache mappings */
1648                 func = riscv_iommu_ctx_inval_all;
1649             } else {
1650                 /* invalidate all device context matching DID */
1651                 func = riscv_iommu_ctx_inval_devid;
1652             }
1653             riscv_iommu_ctx_inval(s, func,
1654                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID), 0);
1655             break;
1656 
1657         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_IODIR_FUNC_INVAL_PDT,
1658                              RISCV_IOMMU_CMD_IODIR_OPCODE):
1659             if (!(cmd.dword0 & RISCV_IOMMU_CMD_IODIR_DV)) {
1660                 /* illegal command arguments: IODIR.INVAL_PDT with DV == 0 */
1661                 goto cmd_ill;
1662             } else {
1663                 func = riscv_iommu_ctx_inval_devid_procid;
1664             }
1665             riscv_iommu_ctx_inval(s, func,
1666                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_DID),
1667                 get_field(cmd.dword0, RISCV_IOMMU_CMD_IODIR_PID));
1668             break;
1669 
1670         /* ATS commands */
1671         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_INVAL,
1672                              RISCV_IOMMU_CMD_ATS_OPCODE):
1673             if (!s->enable_ats) {
1674                 goto cmd_ill;
1675             }
1676 
1677             riscv_iommu_ats_inval(s, &cmd);
1678             break;
1679 
1680         case RISCV_IOMMU_CMD(RISCV_IOMMU_CMD_ATS_FUNC_PRGR,
1681                              RISCV_IOMMU_CMD_ATS_OPCODE):
1682             if (!s->enable_ats) {
1683                 goto cmd_ill;
1684             }
1685 
1686             riscv_iommu_ats_prgr(s, &cmd);
1687             break;
1688 
1689         default:
1690         cmd_ill:
1691             /* Invalid command: do not advance the command queue head. */
1692             riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR,
1693                 RISCV_IOMMU_CQCSR_CMD_ILL, 0);
1694             goto fault;
1695         }
1696 
1697         /* Advance and update head pointer after command completes. */
1698         head = (head + 1) & s->cq_mask;
1699         riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_CQH, head);
1700     }
1701     return;
1702 
1703 fault:
1704     if (ctrl & RISCV_IOMMU_CQCSR_CIE) {
1705         riscv_iommu_notify(s, RISCV_IOMMU_INTR_CQ);
1706     }
1707 }
1708 
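/*
 * Command queue enable/disable handling. Queue geometry follows CQB:
 * cq_mask = (2 << LOG2SZ) - 1, i.e. the ring holds 2^(LOG2SZ + 1) entries of
 * sizeof(struct riscv_iommu_command) bytes starting at PPN_PHYS(CQB.PPN).
 * For example, LOG2SZ == 3 yields a mask of 0xf and a 16-entry queue.
 */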
1709 static void riscv_iommu_process_cq_control(RISCVIOMMUState *s)
1710 {
1711     uint64_t base;
1712     uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
1713     uint32_t ctrl_clr;
1714     bool enable = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQEN);
1715     bool active = !!(ctrl_set & RISCV_IOMMU_CQCSR_CQON);
1716 
1717     if (enable && !active) {
1718         base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_CQB);
1719         s->cq_mask = (2ULL << get_field(base, RISCV_IOMMU_CQB_LOG2SZ)) - 1;
1720         s->cq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_CQB_PPN));
1721         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~s->cq_mask);
1722         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQH], 0);
1723         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_CQT], 0);
1724         ctrl_set = RISCV_IOMMU_CQCSR_CQON;
1725         ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQMF |
1726                    RISCV_IOMMU_CQCSR_CMD_ILL | RISCV_IOMMU_CQCSR_CMD_TO |
1727                    RISCV_IOMMU_CQCSR_FENCE_W_IP;
1728     } else if (!enable && active) {
1729         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQT], ~0);
1730         ctrl_set = 0;
1731         ctrl_clr = RISCV_IOMMU_CQCSR_BUSY | RISCV_IOMMU_CQCSR_CQON;
1732     } else {
1733         ctrl_set = 0;
1734         ctrl_clr = RISCV_IOMMU_CQCSR_BUSY;
1735     }
1736 
1737     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, ctrl_set, ctrl_clr);
1738 }
1739 
1740 static void riscv_iommu_process_fq_control(RISCVIOMMUState *s)
1741 {
1742     uint64_t base;
1743     uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
1744     uint32_t ctrl_clr;
1745     bool enable = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQEN);
1746     bool active = !!(ctrl_set & RISCV_IOMMU_FQCSR_FQON);
1747 
1748     if (enable && !active) {
1749         base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_FQB);
1750         s->fq_mask = (2ULL << get_field(base, RISCV_IOMMU_FQB_LOG2SZ)) - 1;
1751         s->fq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_FQB_PPN));
1752         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~s->fq_mask);
1753         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQH], 0);
1754         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_FQT], 0);
1755         ctrl_set = RISCV_IOMMU_FQCSR_FQON;
1756         ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQMF |
1757             RISCV_IOMMU_FQCSR_FQOF;
1758     } else if (!enable && active) {
1759         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQH], ~0);
1760         ctrl_set = 0;
1761         ctrl_clr = RISCV_IOMMU_FQCSR_BUSY | RISCV_IOMMU_FQCSR_FQON;
1762     } else {
1763         ctrl_set = 0;
1764         ctrl_clr = RISCV_IOMMU_FQCSR_BUSY;
1765     }
1766 
1767     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, ctrl_set, ctrl_clr);
1768 }
1769 
1770 static void riscv_iommu_process_pq_control(RISCVIOMMUState *s)
1771 {
1772     uint64_t base;
1773     uint32_t ctrl_set = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
1774     uint32_t ctrl_clr;
1775     bool enable = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQEN);
1776     bool active = !!(ctrl_set & RISCV_IOMMU_PQCSR_PQON);
1777 
1778     if (enable && !active) {
1779         base = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_PQB);
1780         s->pq_mask = (2ULL << get_field(base, RISCV_IOMMU_PQB_LOG2SZ)) - 1;
1781         s->pq_addr = PPN_PHYS(get_field(base, RISCV_IOMMU_PQB_PPN));
1782         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~s->pq_mask);
1783         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQH], 0);
1784         stl_le_p(&s->regs_rw[RISCV_IOMMU_REG_PQT], 0);
1785         ctrl_set = RISCV_IOMMU_PQCSR_PQON;
1786         ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQMF |
1787             RISCV_IOMMU_PQCSR_PQOF;
1788     } else if (!enable && active) {
1789         stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQH], ~0);
1790         ctrl_set = 0;
1791         ctrl_clr = RISCV_IOMMU_PQCSR_BUSY | RISCV_IOMMU_PQCSR_PQON;
1792     } else {
1793         ctrl_set = 0;
1794         ctrl_clr = RISCV_IOMMU_PQCSR_BUSY;
1795     }
1796 
1797     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, ctrl_set, ctrl_clr);
1798 }
1799 
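/*
 * Debug translation interface: software programs TR_REQ_IOVA and TR_REQ_CTL
 * (setting GO_BUSY), the IOMMU performs the translation and reports either
 * the PPN or the fault cause in TR_RESPONSE, then clears GO_BUSY.
 */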
1800 static void riscv_iommu_process_dbg(RISCVIOMMUState *s)
1801 {
1802     uint64_t iova = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_IOVA);
1803     uint64_t ctrl = riscv_iommu_reg_get64(s, RISCV_IOMMU_REG_TR_REQ_CTL);
1804     unsigned devid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_DID);
1805     unsigned pid = get_field(ctrl, RISCV_IOMMU_TR_REQ_CTL_PID);
1806     RISCVIOMMUContext *ctx;
1807     void *ref;
1808 
1809     if (!(ctrl & RISCV_IOMMU_TR_REQ_CTL_GO_BUSY)) {
1810         return;
1811     }
1812 
1813     ctx = riscv_iommu_ctx(s, devid, pid, &ref);
1814     if (ctx == NULL) {
1815         riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE,
1816                                  RISCV_IOMMU_TR_RESPONSE_FAULT |
1817                                  (RISCV_IOMMU_FQ_CAUSE_DMA_DISABLED << 10));
1818     } else {
1819         IOMMUTLBEntry iotlb = {
1820             .iova = iova,
1821             .perm = ctrl & RISCV_IOMMU_TR_REQ_CTL_NW ? IOMMU_RO : IOMMU_RW,
1822             .addr_mask = ~0,
1823             .target_as = NULL,
1824         };
1825         int fault = riscv_iommu_translate(s, ctx, &iotlb, false);
1826         if (fault) {
1827             iova = RISCV_IOMMU_TR_RESPONSE_FAULT | (((uint64_t) fault) << 10);
1828         } else {
1829             iova = iotlb.translated_addr & ~iotlb.addr_mask;
1830             iova >>= TARGET_PAGE_BITS;
1831             iova &= RISCV_IOMMU_TR_RESPONSE_PPN;
1832 
1833             /* We do not support superpages (> 4 KiB) for now */
1834             iova &= ~RISCV_IOMMU_TR_RESPONSE_S;
1835         }
1836         riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_TR_RESPONSE, iova);
1837     }
1838 
1839     riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_TR_REQ_CTL, 0,
1840         RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
1841     riscv_iommu_ctx_put(s, ref);
1842 }
1843 
1844 typedef void riscv_iommu_process_fn(RISCVIOMMUState *s);
1845 
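/*
 * ICVEC writes: each requested vector field (CIV, FIV, PMIV, PIV) is clamped
 * with MIN() against the corresponding field of icvec_avail_vectors, so the
 * stored vector never exceeds what the platform makes available.
 */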
1846 static void riscv_iommu_update_icvec(RISCVIOMMUState *s, uint64_t data)
1847 {
1848     uint64_t icvec = 0;
1849 
1850     icvec |= MIN(data & RISCV_IOMMU_ICVEC_CIV,
1851                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_CIV);
1852 
1853     icvec |= MIN(data & RISCV_IOMMU_ICVEC_FIV,
1854                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_FIV);
1855 
1856     icvec |= MIN(data & RISCV_IOMMU_ICVEC_PMIV,
1857                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PMIV);
1858 
1859     icvec |= MIN(data & RISCV_IOMMU_ICVEC_PIV,
1860                  s->icvec_avail_vectors & RISCV_IOMMU_ICVEC_PIV);
1861 
1862     trace_riscv_iommu_icvec_write(data, icvec);
1863 
1864     riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_ICVEC, icvec);
1865 }
1866 
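/*
 * IPSR update helper: a pending bit (CIP, FIP, PIP) is kept set only while
 * the matching queue CSR has its interrupt enable set together with an
 * outstanding cause (e.g. CQMF, CMD_ILL, FQOF, PQOF); otherwise the bit is
 * cleared.
 */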
1867 static void riscv_iommu_update_ipsr(RISCVIOMMUState *s, uint64_t data)
1868 {
1869     uint32_t cqcsr, fqcsr, pqcsr;
1870     uint32_t ipsr_set = 0;
1871     uint32_t ipsr_clr = 0;
1872 
1873     if (data & RISCV_IOMMU_IPSR_CIP) {
1874         cqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_CQCSR);
1875 
1876         if (cqcsr & RISCV_IOMMU_CQCSR_CIE &&
1877             (cqcsr & RISCV_IOMMU_CQCSR_FENCE_W_IP ||
1878              cqcsr & RISCV_IOMMU_CQCSR_CMD_ILL ||
1879              cqcsr & RISCV_IOMMU_CQCSR_CMD_TO ||
1880              cqcsr & RISCV_IOMMU_CQCSR_CQMF)) {
1881             ipsr_set |= RISCV_IOMMU_IPSR_CIP;
1882         } else {
1883             ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
1884         }
1885     } else {
1886         ipsr_clr |= RISCV_IOMMU_IPSR_CIP;
1887     }
1888 
1889     if (data & RISCV_IOMMU_IPSR_FIP) {
1890         fqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_FQCSR);
1891 
1892         if (fqcsr & RISCV_IOMMU_FQCSR_FIE &&
1893             (fqcsr & RISCV_IOMMU_FQCSR_FQOF ||
1894              fqcsr & RISCV_IOMMU_FQCSR_FQMF)) {
1895             ipsr_set |= RISCV_IOMMU_IPSR_FIP;
1896         } else {
1897             ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
1898         }
1899     } else {
1900         ipsr_clr |= RISCV_IOMMU_IPSR_FIP;
1901     }
1902 
1903     if (data & RISCV_IOMMU_IPSR_PIP) {
1904         pqcsr = riscv_iommu_reg_get32(s, RISCV_IOMMU_REG_PQCSR);
1905 
1906         if (pqcsr & RISCV_IOMMU_PQCSR_PIE &&
1907             (pqcsr & RISCV_IOMMU_PQCSR_PQOF ||
1908              pqcsr & RISCV_IOMMU_PQCSR_PQMF)) {
1909             ipsr_set |= RISCV_IOMMU_IPSR_PIP;
1910         } else {
1911             ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
1912         }
1913     } else {
1914         ipsr_clr |= RISCV_IOMMU_IPSR_PIP;
1915     }
1916 
1917     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_IPSR, ipsr_set, ipsr_clr);
1918 }
1919 
1920 /*
1921  * Write the resulting value of 'data' for the reg specified
1922  * by 'reg_addr', after considering read-only/read-write/write-clear
1923  * bits, in the pointer 'dest'.
1924  *
1925  * The result is written in little-endian.
1926  */
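/*
 * In other words: read-only bits keep their current regs_rw value, writable
 * bits take the newly written data, and finally any write-1-to-clear bit
 * (regs_wc) that was written as 1 is cleared from the result.
 */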
1927 static void riscv_iommu_write_reg_val(RISCVIOMMUState *s,
1928                                       void *dest, hwaddr reg_addr,
1929                                       int size, uint64_t data)
1930 {
1931     uint64_t ro = ldn_le_p(&s->regs_ro[reg_addr], size);
1932     uint64_t wc = ldn_le_p(&s->regs_wc[reg_addr], size);
1933     uint64_t rw = ldn_le_p(&s->regs_rw[reg_addr], size);
1934 
1935     stn_le_p(dest, size, ((rw & ro) | (data & ~ro)) & ~(data & wc));
1936 }
1937 
1938 static MemTxResult riscv_iommu_mmio_write(void *opaque, hwaddr addr,
1939                                           uint64_t data, unsigned size,
1940                                           MemTxAttrs attrs)
1941 {
1942     riscv_iommu_process_fn *process_fn = NULL;
1943     RISCVIOMMUState *s = opaque;
1944     uint32_t regb = addr & ~3;
1945     uint32_t busy = 0;
1946     uint64_t val = 0;
1947 
1948     if ((addr & (size - 1)) != 0) {
1949         /* Unsupported MMIO alignment or access size */
1950         return MEMTX_ERROR;
1951     }
1952 
1953     if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
1954         /* Unsupported MMIO access location. */
1955         return MEMTX_ACCESS_ERROR;
1956     }
1957 
1958     /* Track actionable MMIO write. */
1959     switch (regb) {
1960     case RISCV_IOMMU_REG_DDTP:
1961     case RISCV_IOMMU_REG_DDTP + 4:
1962         process_fn = riscv_iommu_process_ddtp;
1963         regb = RISCV_IOMMU_REG_DDTP;
1964         busy = RISCV_IOMMU_DDTP_BUSY;
1965         break;
1966 
1967     case RISCV_IOMMU_REG_CQT:
1968         process_fn = riscv_iommu_process_cq_tail;
1969         break;
1970 
1971     case RISCV_IOMMU_REG_CQCSR:
1972         process_fn = riscv_iommu_process_cq_control;
1973         busy = RISCV_IOMMU_CQCSR_BUSY;
1974         break;
1975 
1976     case RISCV_IOMMU_REG_FQCSR:
1977         process_fn = riscv_iommu_process_fq_control;
1978         busy = RISCV_IOMMU_FQCSR_BUSY;
1979         break;
1980 
1981     case RISCV_IOMMU_REG_PQCSR:
1982         process_fn = riscv_iommu_process_pq_control;
1983         busy = RISCV_IOMMU_PQCSR_BUSY;
1984         break;
1985 
1986     case RISCV_IOMMU_REG_ICVEC:
1987     case RISCV_IOMMU_REG_IPSR:
1988         /*
1989          * ICVEC and IPSR have special read/write procedures. We'll
1990          * call their respective helpers and exit.
1991          */
1992         riscv_iommu_write_reg_val(s, &val, addr, size, data);
1993 
1994         /*
1995          * 'val' is stored as LE. Switch to host endianness
1996          * before using it.
1997          */
1998         val = le64_to_cpu(val);
1999 
2000         if (regb == RISCV_IOMMU_REG_ICVEC) {
2001             riscv_iommu_update_icvec(s, val);
2002         } else {
2003             riscv_iommu_update_ipsr(s, val);
2004         }
2005 
2006         return MEMTX_OK;
2007 
2008     case RISCV_IOMMU_REG_TR_REQ_CTL:
2009         process_fn = riscv_iommu_process_dbg;
2010         regb = RISCV_IOMMU_REG_TR_REQ_CTL;
2011         busy = RISCV_IOMMU_TR_REQ_CTL_GO_BUSY;
2012         break;
2013 
2014     default:
2015         break;
2016     }
2017 
2018     /*
2019      * Register updates might not be synchronized with the core logic.
2020      * If system software updates a register while the relevant BUSY bit
2021      * is set, the IOMMU behavior for additional writes to that register
2022      * is UNSPECIFIED.
2023      */
2024     riscv_iommu_write_reg_val(s, &s->regs_rw[addr], addr, size, data);
2025 
2026     /* Busy flag update, MSB 4-byte register. */
2027     if (busy) {
2028         uint32_t rw = ldl_le_p(&s->regs_rw[regb]);
2029         stl_le_p(&s->regs_rw[regb], rw | busy);
2030     }
2031 
2032     if (process_fn) {
2033         process_fn(s);
2034     }
2035 
2036     return MEMTX_OK;
2037 }
2038 
2039 static MemTxResult riscv_iommu_mmio_read(void *opaque, hwaddr addr,
2040     uint64_t *data, unsigned size, MemTxAttrs attrs)
2041 {
2042     RISCVIOMMUState *s = opaque;
2043     uint64_t val = -1;
2044     uint8_t *ptr;
2045 
2046     if ((addr & (size - 1)) != 0) {
2047         /* Unsupported MMIO alignment. */
2048         return MEMTX_ERROR;
2049     }
2050 
2051     if (addr + size > RISCV_IOMMU_REG_MSI_CONFIG) {
2052         return MEMTX_ACCESS_ERROR;
2053     }
2054 
2055     ptr = &s->regs_rw[addr];
2056     val = ldn_le_p(ptr, size);
2057 
2058     *data = val;
2059 
2060     return MEMTX_OK;
2061 }
2062 
2063 static const MemoryRegionOps riscv_iommu_mmio_ops = {
2064     .read_with_attrs = riscv_iommu_mmio_read,
2065     .write_with_attrs = riscv_iommu_mmio_write,
2066     .endianness = DEVICE_NATIVE_ENDIAN,
2067     .impl = {
2068         .min_access_size = 4,
2069         .max_access_size = 8,
2070         .unaligned = false,
2071     },
2072     .valid = {
2073         .min_access_size = 4,
2074         .max_access_size = 8,
2075     }
2076 };
2077 
2078 /*
2079  * Translations matching the MSI address pattern check are redirected to the
2080  * "riscv-iommu-trap" memory region as untranslated addresses, for additional
2081  * MSI/MRIF interception by the IOMMU interrupt remapping implementation.
2082  * Note: Device emulation code generating an MSI is expected to provide valid
2083  * memory transaction attributes with requester_id set.
2084  */
2085 static MemTxResult riscv_iommu_trap_write(void *opaque, hwaddr addr,
2086     uint64_t data, unsigned size, MemTxAttrs attrs)
2087 {
2088     RISCVIOMMUState *s = (RISCVIOMMUState *)opaque;
2089     RISCVIOMMUContext *ctx;
2090     MemTxResult res;
2091     void *ref;
2092     uint32_t devid = attrs.requester_id;
2093 
2094     if (attrs.unspecified) {
2095         return MEMTX_ACCESS_ERROR;
2096     }
2097 
2098     /* FIXME: PCIe bus remapping for attached endpoints. */
2099     devid |= s->bus << 8;
2100 
2101     ctx = riscv_iommu_ctx(s, devid, 0, &ref);
2102     if (ctx == NULL) {
2103         res = MEMTX_ACCESS_ERROR;
2104     } else {
2105         res = riscv_iommu_msi_write(s, ctx, addr, data, size, attrs);
2106     }
2107     riscv_iommu_ctx_put(s, ref);
2108     return res;
2109 }
2110 
2111 static MemTxResult riscv_iommu_trap_read(void *opaque, hwaddr addr,
2112     uint64_t *data, unsigned size, MemTxAttrs attrs)
2113 {
2114     return MEMTX_ACCESS_ERROR;
2115 }
2116 
2117 static const MemoryRegionOps riscv_iommu_trap_ops = {
2118     .read_with_attrs = riscv_iommu_trap_read,
2119     .write_with_attrs = riscv_iommu_trap_write,
2120     .endianness = DEVICE_LITTLE_ENDIAN,
2121     .impl = {
2122         .min_access_size = 4,
2123         .max_access_size = 8,
2124         .unaligned = true,
2125     },
2126     .valid = {
2127         .min_access_size = 4,
2128         .max_access_size = 8,
2129     }
2130 };
2131 
2132 void riscv_iommu_set_cap_igs(RISCVIOMMUState *s, riscv_iommu_igs_mode mode)
2133 {
2134     s->cap = set_field(s->cap, RISCV_IOMMU_CAP_IGS, mode);
2135 }
2136 
2137 static void riscv_iommu_instance_init(Object *obj)
2138 {
2139     RISCVIOMMUState *s = RISCV_IOMMU(obj);
2140 
2141     /* Enable translation debug interface */
2142     s->cap = RISCV_IOMMU_CAP_DBG;
2143 
2144     /* Report QEMU target physical address space limits */
2145     s->cap = set_field(s->cap, RISCV_IOMMU_CAP_PAS,
2146                        TARGET_PHYS_ADDR_SPACE_BITS);
2147 
2148     /* TODO: method to report supported PID bits */
2149     s->pid_bits = 8; /* restricted to size of MemTxAttrs.pid */
2150     s->cap |= RISCV_IOMMU_CAP_PD8;
2151 
2152     /* register storage */
2153     s->regs_rw = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
2154     s->regs_ro = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
2155     s->regs_wc = g_new0(uint8_t, RISCV_IOMMU_REG_SIZE);
2156 
2157     /* Mark all registers read-only */
2158     memset(s->regs_ro, 0xff, RISCV_IOMMU_REG_SIZE);
2159 
2160     /* Device translation context cache */
2161     s->ctx_cache = g_hash_table_new_full(riscv_iommu_ctx_hash,
2162                                          riscv_iommu_ctx_equal,
2163                                          g_free, NULL);
2164 
2165     s->iot_cache = g_hash_table_new_full(riscv_iommu_iot_hash,
2166                                          riscv_iommu_iot_equal,
2167                                          g_free, NULL);
2168 
2169     s->iommus.le_next = NULL;
2170     s->iommus.le_prev = NULL;
2171     QLIST_INIT(&s->spaces);
2172 }
2173 
2174 static void riscv_iommu_realize(DeviceState *dev, Error **errp)
2175 {
2176     RISCVIOMMUState *s = RISCV_IOMMU(dev);
2177 
2178     s->cap |= s->version & RISCV_IOMMU_CAP_VERSION;
2179     if (s->enable_msi) {
2180         s->cap |= RISCV_IOMMU_CAP_MSI_FLAT | RISCV_IOMMU_CAP_MSI_MRIF;
2181     }
2182     if (s->enable_ats) {
2183         s->cap |= RISCV_IOMMU_CAP_ATS;
2184     }
2185     if (s->enable_s_stage) {
2186         s->cap |= RISCV_IOMMU_CAP_SV32 | RISCV_IOMMU_CAP_SV39 |
2187                   RISCV_IOMMU_CAP_SV48 | RISCV_IOMMU_CAP_SV57;
2188     }
2189     if (s->enable_g_stage) {
2190         s->cap |= RISCV_IOMMU_CAP_SV32X4 | RISCV_IOMMU_CAP_SV39X4 |
2191                   RISCV_IOMMU_CAP_SV48X4 | RISCV_IOMMU_CAP_SV57X4;
2192     }
2193 
2194     /* Out-of-reset translation mode: OFF (DMA disabled) or BARE (passthrough) */
2195     s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, s->enable_off ?
2196                         RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE);
2197 
2198     /*
2199      * Register complete MMIO space, including MSI/PBA registers.
2200      * Note, PCIDevice implementation will add overlapping MR for MSI/PBA,
2201      * managed directly by the PCIDevice implementation.
2202      */
2203     memory_region_init_io(&s->regs_mr, OBJECT(dev), &riscv_iommu_mmio_ops, s,
2204         "riscv-iommu-regs", RISCV_IOMMU_REG_SIZE);
2205 
2206     /* Set power-on register state */
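    /*
     * regs_ro was initialised to all-ones in instance_init, so storing the
     * complement of a field mask below marks exactly those bits as guest
     * writable; all remaining bits stay read-only.
     */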
2207     stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_CAP], s->cap);
2208     stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_FCTL], 0);
2209     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FCTL],
2210              ~(RISCV_IOMMU_FCTL_BE | RISCV_IOMMU_FCTL_WSI));
2211     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_DDTP],
2212         ~(RISCV_IOMMU_DDTP_PPN | RISCV_IOMMU_DDTP_MODE));
2213     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQB],
2214         ~(RISCV_IOMMU_CQB_LOG2SZ | RISCV_IOMMU_CQB_PPN));
2215     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQB],
2216         ~(RISCV_IOMMU_FQB_LOG2SZ | RISCV_IOMMU_FQB_PPN));
2217     stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQB],
2218         ~(RISCV_IOMMU_PQB_LOG2SZ | RISCV_IOMMU_PQB_PPN));
2219     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQMF |
2220         RISCV_IOMMU_CQCSR_CMD_TO | RISCV_IOMMU_CQCSR_CMD_ILL);
2221     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_CQCSR], RISCV_IOMMU_CQCSR_CQON |
2222         RISCV_IOMMU_CQCSR_BUSY);
2223     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQMF |
2224         RISCV_IOMMU_FQCSR_FQOF);
2225     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_FQCSR], RISCV_IOMMU_FQCSR_FQON |
2226         RISCV_IOMMU_FQCSR_BUSY);
2227     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQMF |
2228         RISCV_IOMMU_PQCSR_PQOF);
2229     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_PQCSR], RISCV_IOMMU_PQCSR_PQON |
2230         RISCV_IOMMU_PQCSR_BUSY);
2231     stl_le_p(&s->regs_wc[RISCV_IOMMU_REG_IPSR], ~0);
2232     stl_le_p(&s->regs_ro[RISCV_IOMMU_REG_ICVEC], 0);
2233     stq_le_p(&s->regs_rw[RISCV_IOMMU_REG_DDTP], s->ddtp);
2234     /* If debug registers enabled. */
2235     if (s->cap & RISCV_IOMMU_CAP_DBG) {
2236         stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_IOVA], 0);
2237         stq_le_p(&s->regs_ro[RISCV_IOMMU_REG_TR_REQ_CTL],
2238             RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
2239     }
2240 
2241     /* Memory region for downstream access, if specified. */
2242     if (s->target_mr) {
2243         s->target_as = g_new0(AddressSpace, 1);
2244         address_space_init(s->target_as, s->target_mr,
2245             "riscv-iommu-downstream");
2246     } else {
2247         /* Fallback to global system memory. */
2248         s->target_as = &address_space_memory;
2249     }
2250 
2251     /* Memory region for untranslated MRIF/MSI writes */
2252     memory_region_init_io(&s->trap_mr, OBJECT(dev), &riscv_iommu_trap_ops, s,
2253             "riscv-iommu-trap", ~0ULL);
2254     address_space_init(&s->trap_as, &s->trap_mr, "riscv-iommu-trap-as");
2255 }
2256 
2257 static void riscv_iommu_unrealize(DeviceState *dev)
2258 {
2259     RISCVIOMMUState *s = RISCV_IOMMU(dev);
2260 
2261     g_hash_table_unref(s->iot_cache);
2262     g_hash_table_unref(s->ctx_cache);
2263 }
2264 
2265 void riscv_iommu_reset(RISCVIOMMUState *s)
2266 {
2267     uint32_t reg_clr;
2268     int ddtp_mode;
2269 
2270     /*
2271      * Clear DDTP while setting DDTP.MODE back to the
2272      * user-configured initial setting.
2273      */
2274     ddtp_mode = s->enable_off ?
2275                 RISCV_IOMMU_DDTP_MODE_OFF : RISCV_IOMMU_DDTP_MODE_BARE;
2276     s->ddtp = set_field(0, RISCV_IOMMU_DDTP_MODE, ddtp_mode);
2277     riscv_iommu_reg_set64(s, RISCV_IOMMU_REG_DDTP, s->ddtp);
2278 
2279     reg_clr = RISCV_IOMMU_CQCSR_CQEN | RISCV_IOMMU_CQCSR_CIE |
2280               RISCV_IOMMU_CQCSR_CQON | RISCV_IOMMU_CQCSR_BUSY;
2281     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_CQCSR, 0, reg_clr);
2282 
2283     reg_clr = RISCV_IOMMU_FQCSR_FQEN | RISCV_IOMMU_FQCSR_FIE |
2284               RISCV_IOMMU_FQCSR_FQON | RISCV_IOMMU_FQCSR_BUSY;
2285     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_FQCSR, 0, reg_clr);
2286 
2287     reg_clr = RISCV_IOMMU_PQCSR_PQEN | RISCV_IOMMU_PQCSR_PIE |
2288               RISCV_IOMMU_PQCSR_PQON | RISCV_IOMMU_PQCSR_BUSY;
2289     riscv_iommu_reg_mod32(s, RISCV_IOMMU_REG_PQCSR, 0, reg_clr);
2290 
2291     riscv_iommu_reg_mod64(s, RISCV_IOMMU_REG_TR_REQ_CTL, 0,
2292                           RISCV_IOMMU_TR_REQ_CTL_GO_BUSY);
2293 
2294     riscv_iommu_reg_set32(s, RISCV_IOMMU_REG_IPSR, 0);
2295 
2296     g_hash_table_remove_all(s->ctx_cache);
2297     g_hash_table_remove_all(s->iot_cache);
2298 }
2299 
2300 static const Property riscv_iommu_properties[] = {
2301     DEFINE_PROP_UINT32("version", RISCVIOMMUState, version,
2302         RISCV_IOMMU_SPEC_DOT_VER),
2303     DEFINE_PROP_UINT32("bus", RISCVIOMMUState, bus, 0x0),
2304     DEFINE_PROP_UINT32("ioatc-limit", RISCVIOMMUState, iot_limit,
2305         LIMIT_CACHE_IOT),
2306     DEFINE_PROP_BOOL("intremap", RISCVIOMMUState, enable_msi, TRUE),
2307     DEFINE_PROP_BOOL("ats", RISCVIOMMUState, enable_ats, TRUE),
2308     DEFINE_PROP_BOOL("off", RISCVIOMMUState, enable_off, TRUE),
2309     DEFINE_PROP_BOOL("s-stage", RISCVIOMMUState, enable_s_stage, TRUE),
2310     DEFINE_PROP_BOOL("g-stage", RISCVIOMMUState, enable_g_stage, TRUE),
2311     DEFINE_PROP_LINK("downstream-mr", RISCVIOMMUState, target_mr,
2312         TYPE_MEMORY_REGION, MemoryRegion *),
2313 };
2314 
2315 static void riscv_iommu_class_init(ObjectClass *klass, void *data)
2316 {
2317     DeviceClass *dc = DEVICE_CLASS(klass);
2318 
2319     /* internal device for riscv-iommu-{pci/sys}, not user-creatable */
2320     dc->user_creatable = false;
2321     dc->realize = riscv_iommu_realize;
2322     dc->unrealize = riscv_iommu_unrealize;
2323     device_class_set_props(dc, riscv_iommu_properties);
2324 }
2325 
2326 static const TypeInfo riscv_iommu_info = {
2327     .name = TYPE_RISCV_IOMMU,
2328     .parent = TYPE_DEVICE,
2329     .instance_size = sizeof(RISCVIOMMUState),
2330     .instance_init = riscv_iommu_instance_init,
2331     .class_init = riscv_iommu_class_init,
2332 };
2333 
2334 static const char *IOMMU_FLAG_STR[] = {
2335     "NA",
2336     "RO",
2337     "WR",
2338     "RW",
2339 };
2340 
2341 /* RISC-V IOMMU Memory Region - Address Translation Space */
2342 static IOMMUTLBEntry riscv_iommu_memory_region_translate(
2343     IOMMUMemoryRegion *iommu_mr, hwaddr addr,
2344     IOMMUAccessFlags flag, int iommu_idx)
2345 {
2346     RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
2347     RISCVIOMMUContext *ctx;
2348     void *ref;
2349     IOMMUTLBEntry iotlb = {
2350         .iova = addr,
2351         .target_as = as->iommu->target_as,
2352         .addr_mask = ~0ULL,
2353         .perm = flag,
2354     };
2355 
2356     ctx = riscv_iommu_ctx(as->iommu, as->devid, iommu_idx, &ref);
2357     if (ctx == NULL) {
2358         /* Translation disabled or invalid. */
2359         iotlb.addr_mask = 0;
2360         iotlb.perm = IOMMU_NONE;
2361     } else if (riscv_iommu_translate(as->iommu, ctx, &iotlb, true)) {
2362         /* Translation disabled or fault reported. */
2363         iotlb.addr_mask = 0;
2364         iotlb.perm = IOMMU_NONE;
2365     }
2366 
2367     /* Trace all dma translations with original access flags. */
2368     trace_riscv_iommu_dma(as->iommu->parent_obj.id, PCI_BUS_NUM(as->devid),
2369                           PCI_SLOT(as->devid), PCI_FUNC(as->devid), iommu_idx,
2370                           IOMMU_FLAG_STR[flag & IOMMU_RW], iotlb.iova,
2371                           iotlb.translated_addr);
2372 
2373     riscv_iommu_ctx_put(as->iommu, ref);
2374 
2375     return iotlb;
2376 }
2377 
2378 static int riscv_iommu_memory_region_notify(
2379     IOMMUMemoryRegion *iommu_mr, IOMMUNotifierFlag old,
2380     IOMMUNotifierFlag new, Error **errp)
2381 {
2382     RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
2383 
2384     if (old == IOMMU_NOTIFIER_NONE) {
2385         as->notifier = true;
2386         trace_riscv_iommu_notifier_add(iommu_mr->parent_obj.name);
2387     } else if (new == IOMMU_NOTIFIER_NONE) {
2388         as->notifier = false;
2389         trace_riscv_iommu_notifier_del(iommu_mr->parent_obj.name);
2390     }
2391 
2392     return 0;
2393 }
2394 
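/*
 * PCI class code 0x0806 identifies an IOMMU (base class 0x08, Base System
 * Peripheral; sub-class 0x06, IOMMU), presumably how the companion
 * riscv-iommu-pci device presents itself on the bus.
 */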
2395 static inline bool pci_is_iommu(PCIDevice *pdev)
2396 {
2397     return pci_get_word(pdev->config + PCI_CLASS_DEVICE) == 0x0806;
2398 }
2399 
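/*
 * PCIIOMMUOps get_address_space callback: the IOMMU device itself bypasses
 * translation; otherwise the registered IOMMU list is walked from its head
 * and the first instance claiming the requester's BDF provides the IOVA
 * address space, with unclaimed devices falling back to system memory.
 */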
2400 static AddressSpace *riscv_iommu_find_as(PCIBus *bus, void *opaque, int devfn)
2401 {
2402     RISCVIOMMUState *s = (RISCVIOMMUState *) opaque;
2403     PCIDevice *pdev = pci_find_device(bus, pci_bus_num(bus), devfn);
2404     AddressSpace *as = NULL;
2405 
2406     if (pdev && pci_is_iommu(pdev)) {
2407         return s->target_as;
2408     }
2409 
2410     /* Find first registered IOMMU device */
2411     while (s->iommus.le_prev) {
2412         s = *(s->iommus.le_prev);
2413     }
2414 
2415     /* Find first matching IOMMU */
2416     while (s != NULL && as == NULL) {
2417         as = riscv_iommu_space(s, PCI_BUILD_BDF(pci_bus_num(bus), devfn));
2418         s = s->iommus.le_next;
2419     }
2420 
2421     return as ? as : &address_space_memory;
2422 }
2423 
2424 static const PCIIOMMUOps riscv_iommu_ops = {
2425     .get_address_space = riscv_iommu_find_as,
2426 };
2427 
2428 void riscv_iommu_pci_setup_iommu(RISCVIOMMUState *iommu, PCIBus *bus,
2429         Error **errp)
2430 {
2431     if (bus->iommu_ops &&
2432         bus->iommu_ops->get_address_space == riscv_iommu_find_as) {
2433         /* Allow multiple IOMMUs on the same PCIe bus, link known devices */
2434         RISCVIOMMUState *last = (RISCVIOMMUState *)bus->iommu_opaque;
2435         QLIST_INSERT_AFTER(last, iommu, iommus);
2436     } else if (!bus->iommu_ops && !bus->iommu_opaque) {
2437         pci_setup_iommu(bus, &riscv_iommu_ops, iommu);
2438     } else {
2439         error_setg(errp, "can't register secondary IOMMU for PCI bus #%d",
2440             pci_bus_num(bus));
2441     }
2442 }
2443 
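/*
 * The IOMMU index encodes the PASID: transactions without one map to
 * RISCV_IOMMU_NOPROCID, and the index space is sized by the supported
 * pid_bits (see riscv_iommu_memory_region_index_len below).
 */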
2444 static int riscv_iommu_memory_region_index(IOMMUMemoryRegion *iommu_mr,
2445     MemTxAttrs attrs)
2446 {
2447     return attrs.unspecified ? RISCV_IOMMU_NOPROCID : (int)attrs.pid;
2448 }
2449 
2450 static int riscv_iommu_memory_region_index_len(IOMMUMemoryRegion *iommu_mr)
2451 {
2452     RISCVIOMMUSpace *as = container_of(iommu_mr, RISCVIOMMUSpace, iova_mr);
2453     return 1 << as->iommu->pid_bits;
2454 }
2455 
2456 static void riscv_iommu_memory_region_init(ObjectClass *klass, void *data)
2457 {
2458     IOMMUMemoryRegionClass *imrc = IOMMU_MEMORY_REGION_CLASS(klass);
2459 
2460     imrc->translate = riscv_iommu_memory_region_translate;
2461     imrc->notify_flag_changed = riscv_iommu_memory_region_notify;
2462     imrc->attrs_to_index = riscv_iommu_memory_region_index;
2463     imrc->num_indexes = riscv_iommu_memory_region_index_len;
2464 }
2465 
2466 static const TypeInfo riscv_iommu_memory_region_info = {
2467     .parent = TYPE_IOMMU_MEMORY_REGION,
2468     .name = TYPE_RISCV_IOMMU_MEMORY_REGION,
2469     .class_init = riscv_iommu_memory_region_init,
2470 };
2471 
2472 static void riscv_iommu_register_mr_types(void)
2473 {
2474     type_register_static(&riscv_iommu_memory_region_info);
2475     type_register_static(&riscv_iommu_info);
2476 }
2477 
2478 type_init(riscv_iommu_register_mr_types);
2479