1 // SPDX-License-Identifier: GPL-2.0-only OR MIT
2 /*
3 * Copyright © 2024 Intel Corporation
4 *
5 * Authors:
6 * Matthew Brost <matthew.brost@intel.com>
7 */
8
9 #include <linux/dma-mapping.h>
10 #include <linux/hmm.h>
11 #include <linux/memremap.h>
12 #include <linux/migrate.h>
13 #include <linux/mm_types.h>
14 #include <linux/pagemap.h>
15 #include <linux/slab.h>
16
17 #include <drm/drm_device.h>
18 #include <drm/drm_gpusvm.h>
19 #include <drm/drm_pagemap.h>
20 #include <drm/drm_print.h>
21
22 /**
23 * DOC: Overview
24 *
25 * GPU Shared Virtual Memory (GPU SVM) layer for the Direct Rendering Manager (DRM)
26 * is a component of the DRM framework designed to manage shared virtual memory
27 * between the CPU and GPU. It enables efficient data exchange and processing
28 * for GPU-accelerated applications by allowing memory sharing and
29 * synchronization between the CPU's and GPU's virtual address spaces.
30 *
31 * Key GPU SVM Components:
32 *
33 * - Notifiers:
34 * Used for tracking memory intervals and notifying the GPU of changes,
35 * notifiers are sized based on a GPU SVM initialization parameter, with a
36 * recommendation of 512M or larger. They maintain a Red-Black tree and a
37 * list of ranges that fall within the notifier interval. Notifiers are
38 * tracked within a GPU SVM Red-Black tree and list and are dynamically
39 * inserted or removed as ranges within the interval are created or
40 * destroyed.
41 * - Ranges:
42 * Represent memory ranges mapped in a DRM device and managed by GPU SVM.
43 * They are sized based on an array of chunk sizes, which is a GPU SVM
44 * initialization parameter, and the CPU address space. Upon GPU fault,
45 * the largest aligned chunk that fits within the faulting CPU address
46 * space is chosen for the range size. Ranges are expected to be
47 * dynamically allocated on GPU fault and removed on an MMU notifier UNMAP
48 * event. As mentioned above, ranges are tracked in a notifier's Red-Black
49 * tree.
50 *
51 * - Operations:
52 * Define the interface for driver-specific GPU SVM operations such as
53 * range allocation, notifier allocation, and invalidations.
54 *
55 * - Device Memory Allocations:
56 * Embedded structure containing enough information for GPU SVM to migrate
57 * to / from device memory.
58 *
59 * - Device Memory Operations:
60 * Define the interface for driver-specific device memory operations, such
61 * as releasing memory, populating pfns, and copying to / from device memory.
62 *
63 * This layer provides interfaces for allocating, mapping, migrating, and
64 * releasing memory ranges between the CPU and GPU. It handles all core memory
65 * management interactions (DMA mapping, HMM, and migration) and provides
66 * driver-specific virtual functions (vfuncs). This infrastructure is sufficient
67 * to build the expected driver components for an SVM implementation as detailed
68 * below.
69 *
70 * Expected Driver Components:
71 *
72 * - GPU page fault handler:
73 * Used to create ranges and notifiers based on the fault address,
74 * optionally migrate the range to device memory, and create GPU bindings.
75 *
76 * - Garbage collector:
77 * Used to unmap and destroy GPU bindings for ranges. Ranges are expected
78 * to be added to the garbage collector upon a MMU_NOTIFY_UNMAP event in
79 * notifier callback.
80 *
81 * - Notifier callback:
82 * Used to invalidate and DMA unmap GPU bindings for ranges.
83 */
84
85 /**
86 * DOC: Locking
87 *
88 * GPU SVM handles locking for core MM interactions, i.e., it locks/unlocks the
89 * mmap lock as needed.
90 *
91 * GPU SVM introduces a global notifier lock, which safeguards the notifier's
92 * range RB tree and list, as well as the range's DMA mappings and sequence
93 * number. GPU SVM manages all necessary locking and unlocking operations,
94 * except for rechecking that the range's pages are still valid
95 * (drm_gpusvm_range_pages_valid) when the driver is committing GPU bindings.
96 * This lock corresponds to the ``driver->update`` lock mentioned in
97 * Documentation/mm/hmm.rst. Future revisions may transition from a GPU SVM
98 * global lock to a per-notifier lock if finer-grained locking is deemed
99 * necessary.
100 *
101 * In addition to the locking mentioned above, the driver should implement a
102 * lock to safeguard core GPU SVM function calls that modify state, such as
103 * drm_gpusvm_range_find_or_insert and drm_gpusvm_range_remove. This lock is
104 * denoted as 'driver_svm_lock' in code examples. Finer-grained driver-side
105 * locking should also be possible for concurrent GPU fault processing within a
106 * single GPU SVM. The 'driver_svm_lock' can be registered with GPU SVM via
107 * drm_gpusvm_driver_set_lock to add lockdep annotations to GPU SVM.
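*
* A minimal sketch of the expected driver-side setup, assuming the
* driver_svm_lock()/driver_svm_unlock() helpers used in the examples below
* wrap a hypothetical driver mutex:
*
* .. code-block:: c
*
*	struct mutex driver_svm_lock;
*
*	// At driver GPU SVM creation time
*	mutex_init(&driver_svm_lock);
*	drm_gpusvm_driver_set_lock(gpusvm, &driver_svm_lock);
*
*	// Held around core GPU SVM calls which modify state, e.g. in the GPU
*	// fault handler and the garbage collector
*	mutex_lock(&driver_svm_lock);
*	range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
*						gpuva_start, gpuva_end, &ctx);
*	mutex_unlock(&driver_svm_lock);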
108 */
109
110 /**
111 * DOC: Migration
112 *
113 * The migration support is quite simple, allowing migration between RAM and
114 * device memory at the range granularity. In particular, GPU SVM currently does
115 * not support mixing RAM and device memory pages within a range. This means
116 * that upon GPU fault, the entire range can be migrated to device memory, and
117 * upon CPU fault, the entire range is migrated to RAM. Mixed RAM and device
118 * memory storage within a range could be added in the future if required.
119 *
120 * The reasoning for only supporting range granularity is as follows: it
121 * simplifies the implementation, and range sizes are driver-defined and should
122 * be relatively small.
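*
* A minimal sketch of both migration directions, using the interfaces shown in
* the examples below (devmem_allocation is a driver-provided allocation):
*
* .. code-block:: c
*
*	// On GPU fault, optionally migrate the whole range to device memory
*	err = drm_gpusvm_migrate_to_devmem(gpusvm, range, devmem_allocation, &ctx);
*
*	// Migrate the whole range back to RAM, e.g. on partial unmap
*	drm_gpusvm_range_evict(gpusvm, range);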
123 */
124
125 /**
126 * DOC: Partial Unmapping of Ranges
127 *
128 * Partial unmapping of ranges (e.g., 1M out of 2M is unmapped by the CPU,
129 * resulting in an MMU_NOTIFY_UNMAP event) presents several challenges, with the main one
130 * being that a subset of the range still has CPU and GPU mappings. If the
131 * backing store for the range is in device memory, a subset of the backing
132 * store has references. One option would be to split the range and device
133 * memory backing store, but the implementation for this would be quite
134 * complicated. Given that partial unmappings are rare and driver-defined range
135 * sizes are relatively small, GPU SVM does not support splitting of ranges.
136 *
137 * With no support for range splitting, upon partial unmapping of a range, the
138 * driver is expected to invalidate and destroy the entire range. If the range
139 * has device memory as its backing, the driver is also expected to migrate any
140 * remaining pages back to RAM.
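*
* A minimal sketch of the expected handling, mirroring the garbage collector
* example below (driver_unbind_range() is a driver-provided helper):
*
* .. code-block:: c
*
*	if (range->flags.partial_unmap)
*		drm_gpusvm_range_evict(gpusvm, range);
*
*	driver_unbind_range(range);
*	drm_gpusvm_range_remove(gpusvm, range);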
141 */
142
143 /**
144 * DOC: Examples
145 *
146 * This section provides three examples of how to build the expected driver
147 * components: the GPU page fault handler, the garbage collector, and the
148 * notifier callback.
149 *
150 * The generic code provided does not include logic for complex migration
151 * policies, optimized invalidations, fine-grained driver locking, or other
152 * potentially required driver locking (e.g., DMA-resv locks).
153 *
154 * 1) GPU page fault handler
155 *
156 * .. code-block:: c
157 *
158 * int driver_bind_range(struct drm_gpusvm *gpusvm, struct drm_gpusvm_range *range)
159 * {
160 * int err = 0;
161 *
162 * driver_alloc_and_setup_memory_for_bind(gpusvm, range);
163 *
164 * drm_gpusvm_notifier_lock(gpusvm);
165 * if (drm_gpusvm_range_pages_valid(range))
166 * driver_commit_bind(gpusvm, range);
167 * else
168 * err = -EAGAIN;
169 * drm_gpusvm_notifier_unlock(gpusvm);
170 *
171 * return err;
172 * }
173 *
174 * int driver_gpu_fault(struct drm_gpusvm *gpusvm, unsigned long fault_addr,
175 * unsigned long gpuva_start, unsigned long gpuva_end)
176 * {
177 * struct drm_gpusvm_ctx ctx = {};
178 * int err;
179 *
180 * driver_svm_lock();
181 * retry:
182 *		// Always process UNMAPs first so the view of GPU SVM ranges is current
183 * driver_garbage_collector(gpusvm);
184 *
185 * range = drm_gpusvm_range_find_or_insert(gpusvm, fault_addr,
186 * gpuva_start, gpuva_end,
187 * &ctx);
188 * if (IS_ERR(range)) {
189 * err = PTR_ERR(range);
190 * goto unlock;
191 * }
192 *
193 * if (driver_migration_policy(range)) {
194 * mmap_read_lock(mm);
195 * devmem = driver_alloc_devmem();
196 * err = drm_gpusvm_migrate_to_devmem(gpusvm, range,
197 *						   devmem,
198 * &ctx);
199 * mmap_read_unlock(mm);
200 * if (err) // CPU mappings may have changed
201 * goto retry;
202 * }
203 *
204 * err = drm_gpusvm_range_get_pages(gpusvm, range, &ctx);
205 * if (err == -EOPNOTSUPP || err == -EFAULT || err == -EPERM) { // CPU mappings changed
206 * if (err == -EOPNOTSUPP)
207 * drm_gpusvm_range_evict(gpusvm, range);
208 * goto retry;
209 * } else if (err) {
210 * goto unlock;
211 * }
212 *
213 * err = driver_bind_range(gpusvm, range);
214 * if (err == -EAGAIN) // CPU mappings changed
215 *			goto retry;
216 *
217 * unlock:
218 * driver_svm_unlock();
219 * return err;
220 * }
221 *
222 * 2) Garbage Collector
223 *
224 * .. code-block:: c
225 *
226 * void __driver_garbage_collector(struct drm_gpusvm *gpusvm,
227 * struct drm_gpusvm_range *range)
228 * {
229 * assert_driver_svm_locked(gpusvm);
230 *
231 * // Partial unmap, migrate any remaining device memory pages back to RAM
232 * if (range->flags.partial_unmap)
233 * drm_gpusvm_range_evict(gpusvm, range);
234 *
235 * driver_unbind_range(range);
236 * drm_gpusvm_range_remove(gpusvm, range);
237 * }
238 *
239 * void driver_garbage_collector(struct drm_gpusvm *gpusvm)
240 * {
241 * assert_driver_svm_locked(gpusvm);
242 *
243 * for_each_range_in_garbage_collector(gpusvm, range)
244 * __driver_garbage_collector(gpusvm, range);
245 * }
246 *
247 * 3) Notifier callback
248 *
249 * .. code-block:: c
250 *
251 * void driver_invalidation(struct drm_gpusvm *gpusvm,
252 * struct drm_gpusvm_notifier *notifier,
253 * const struct mmu_notifier_range *mmu_range)
254 * {
255 * struct drm_gpusvm_ctx ctx = { .in_notifier = true, };
256 * struct drm_gpusvm_range *range = NULL;
257 *
258 * driver_invalidate_device_pages(gpusvm, mmu_range->start, mmu_range->end);
259 *
260 * drm_gpusvm_for_each_range(range, notifier, mmu_range->start,
261 * mmu_range->end) {
262 * drm_gpusvm_range_unmap_pages(gpusvm, range, &ctx);
263 *
264 * if (mmu_range->event != MMU_NOTIFY_UNMAP)
265 * continue;
266 *
267 * drm_gpusvm_range_set_unmapped(range, mmu_range);
268 * driver_garbage_collector_add(gpusvm, range);
269 * }
270 * }
271 */
272
273 /**
274 * npages_in_range() - Calculate the number of pages in a given range
275 * @start: The start address of the range
276 * @end: The end address of the range
277 *
278 * This function calculates the number of pages in a given memory range,
279 * specified by the start and end addresses. It divides the difference
280 * between the end and start addresses by the page size (PAGE_SIZE) to
281 * determine the number of pages in the range.
282 *
283 * Return: The number of pages in the specified range.
284 */
285 static unsigned long
286 npages_in_range(unsigned long start, unsigned long end)
287 {
288 return (end - start) >> PAGE_SHIFT;
289 }
290
291 /**
292 * struct drm_gpusvm_zdd - GPU SVM zone device data
293 *
294 * @refcount: Reference count for the zdd
295 * @devmem_allocation: device memory allocation
296 * @device_private_page_owner: Device private pages owner
297 *
298 * This structure serves as a generic wrapper installed in
299 * page->zone_device_data. It provides infrastructure for looking up a device
300 * memory allocation upon CPU page fault and asynchronously releasing device
301 * memory once the CPU has no page references. Asynchronous release is useful
302 * because CPU page references can be dropped in IRQ contexts, while releasing
303 * device memory likely requires sleeping locks.
304 */
305 struct drm_gpusvm_zdd {
306 struct kref refcount;
307 struct drm_gpusvm_devmem *devmem_allocation;
308 void *device_private_page_owner;
309 };
310
311 /**
312 * drm_gpusvm_zdd_alloc() - Allocate a zdd structure.
313 * @device_private_page_owner: Device private pages owner
314 *
315 * This function allocates and initializes a new zdd structure. It sets up the
316 * reference count and stores the device private page owner.
317 *
318 * Return: Pointer to the allocated zdd on success, NULL on failure.
319 */
320 static struct drm_gpusvm_zdd *
321 drm_gpusvm_zdd_alloc(void *device_private_page_owner)
322 {
323 struct drm_gpusvm_zdd *zdd;
324
325 zdd = kmalloc(sizeof(*zdd), GFP_KERNEL);
326 if (!zdd)
327 return NULL;
328
329 kref_init(&zdd->refcount);
330 zdd->devmem_allocation = NULL;
331 zdd->device_private_page_owner = device_private_page_owner;
332
333 return zdd;
334 }
335
336 /**
337 * drm_gpusvm_zdd_get() - Get a reference to a zdd structure.
338 * @zdd: Pointer to the zdd structure.
339 *
340 * This function increments the reference count of the provided zdd structure.
341 *
342 * Return: Pointer to the zdd structure.
343 */
344 static struct drm_gpusvm_zdd *drm_gpusvm_zdd_get(struct drm_gpusvm_zdd *zdd)
345 {
346 kref_get(&zdd->refcount);
347 return zdd;
348 }
349
350 /**
351 * drm_gpusvm_zdd_destroy() - Destroy a zdd structure.
352 * @ref: Pointer to the reference count structure.
353 *
354 * This function releases the device memory allocation (if any) and frees the zdd.
355 */
356 static void drm_gpusvm_zdd_destroy(struct kref *ref)
357 {
358 struct drm_gpusvm_zdd *zdd =
359 container_of(ref, struct drm_gpusvm_zdd, refcount);
360 struct drm_gpusvm_devmem *devmem = zdd->devmem_allocation;
361
362 if (devmem) {
363 complete_all(&devmem->detached);
364 if (devmem->ops->devmem_release)
365 devmem->ops->devmem_release(devmem);
366 }
367 kfree(zdd);
368 }
369
370 /**
371 * drm_gpusvm_zdd_put() - Put a zdd reference.
372 * @zdd: Pointer to the zdd structure.
373 *
374 * This function decrements the reference count of the provided zdd structure
375 * and destroys it when the count drops to zero.
376 */
377 static void drm_gpusvm_zdd_put(struct drm_gpusvm_zdd *zdd)
378 {
379 kref_put(&zdd->refcount, drm_gpusvm_zdd_destroy);
380 }
381
382 /**
383 * drm_gpusvm_range_find() - Find GPU SVM range from GPU SVM notifier
384 * @notifier: Pointer to the GPU SVM notifier structure.
385 * @start: Start address of the range
386 * @end: End address of the range
387 *
388 * Return: A pointer to the drm_gpusvm_range if found or NULL
389 */
390 struct drm_gpusvm_range *
391 drm_gpusvm_range_find(struct drm_gpusvm_notifier *notifier, unsigned long start,
392 unsigned long end)
393 {
394 struct interval_tree_node *itree;
395
396 itree = interval_tree_iter_first(&notifier->root, start, end - 1);
397
398 if (itree)
399 return container_of(itree, struct drm_gpusvm_range, itree);
400 else
401 return NULL;
402 }
403 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find);
404
405 /**
406 * drm_gpusvm_for_each_range_safe() - Safely iterate over GPU SVM ranges in a notifier
407 * @range__: Iterator variable for the ranges
408 * @next__: Iterator variable for the ranges temporary storage
409 * @notifier__: Pointer to the GPU SVM notifier
410 * @start__: Start address of the range
411 * @end__: End address of the range
412 *
413 * This macro is used to iterate over GPU SVM ranges in a notifier while
414 * removing ranges from it.
415 */
416 #define drm_gpusvm_for_each_range_safe(range__, next__, notifier__, start__, end__) \
417 for ((range__) = drm_gpusvm_range_find((notifier__), (start__), (end__)), \
418 (next__) = __drm_gpusvm_range_next(range__); \
419 (range__) && (drm_gpusvm_range_start(range__) < (end__)); \
420 (range__) = (next__), (next__) = __drm_gpusvm_range_next(range__))
421
422 /**
423 * __drm_gpusvm_notifier_next() - get the next drm_gpusvm_notifier in the list
424 * @notifier: a pointer to the current drm_gpusvm_notifier
425 *
426 * Return: A pointer to the next drm_gpusvm_notifier if available, or NULL if
427 * the current notifier is the last one or if the input notifier is
428 * NULL.
429 */
430 static struct drm_gpusvm_notifier *
431 __drm_gpusvm_notifier_next(struct drm_gpusvm_notifier *notifier)
432 {
433 if (notifier && !list_is_last(&notifier->entry,
434 &notifier->gpusvm->notifier_list))
435 return list_next_entry(notifier, entry);
436
437 return NULL;
438 }
439
440 static struct drm_gpusvm_notifier *
441 notifier_iter_first(struct rb_root_cached *root, unsigned long start,
442 unsigned long last)
443 {
444 struct interval_tree_node *itree;
445
446 itree = interval_tree_iter_first(root, start, last);
447
448 if (itree)
449 return container_of(itree, struct drm_gpusvm_notifier, itree);
450 else
451 return NULL;
452 }
453
454 /**
455 * drm_gpusvm_for_each_notifier() - Iterate over GPU SVM notifiers in a gpusvm
456 * @notifier__: Iterator variable for the notifiers
457 * @gpusvm__: Pointer to the GPU SVM structure
458 * @start__: Start address of the notifier
459 * @end__: End address of the notifier
460 *
461 * This macro is used to iterate over GPU SVM notifiers in a gpusvm.
462 */
463 #define drm_gpusvm_for_each_notifier(notifier__, gpusvm__, start__, end__) \
464 for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1); \
465 (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__)); \
466 (notifier__) = __drm_gpusvm_notifier_next(notifier__))
467
468 /**
469 * drm_gpusvm_for_each_notifier_safe() - Safely iterate over GPU SVM notifiers in a gpusvm
470 * @notifier__: Iterator variable for the notifiers
471 * @next__: Iterator variable for the notifiers temporary storage
472 * @gpusvm__: Pointer to the GPU SVM structure
473 * @start__: Start address of the notifier
474 * @end__: End address of the notifier
475 *
476 * This macro is used to iterate over GPU SVM notifiers in a gpusvm while
477 * removing notifiers from it.
478 */
479 #define drm_gpusvm_for_each_notifier_safe(notifier__, next__, gpusvm__, start__, end__) \
480 for ((notifier__) = notifier_iter_first(&(gpusvm__)->root, (start__), (end__) - 1), \
481 (next__) = __drm_gpusvm_notifier_next(notifier__); \
482 (notifier__) && (drm_gpusvm_notifier_start(notifier__) < (end__)); \
483 (notifier__) = (next__), (next__) = __drm_gpusvm_notifier_next(notifier__))
484
485 /**
486 * drm_gpusvm_notifier_invalidate() - Invalidate a GPU SVM notifier.
487 * @mni: Pointer to the mmu_interval_notifier structure.
488 * @mmu_range: Pointer to the mmu_notifier_range structure.
489 * @cur_seq: Current sequence number.
490 *
491 * This function serves as a generic MMU notifier for GPU SVM. It sets the MMU
492 * notifier sequence number and calls the driver invalidate vfunc under
493 * gpusvm->notifier_lock.
494 *
495 * Return: true if the operation succeeds, false otherwise.
496 */
497 static bool
498 drm_gpusvm_notifier_invalidate(struct mmu_interval_notifier *mni,
499 const struct mmu_notifier_range *mmu_range,
500 unsigned long cur_seq)
501 {
502 struct drm_gpusvm_notifier *notifier =
503 container_of(mni, typeof(*notifier), notifier);
504 struct drm_gpusvm *gpusvm = notifier->gpusvm;
505
506 if (!mmu_notifier_range_blockable(mmu_range))
507 return false;
508
509 down_write(&gpusvm->notifier_lock);
510 mmu_interval_set_seq(mni, cur_seq);
511 gpusvm->ops->invalidate(gpusvm, notifier, mmu_range);
512 up_write(&gpusvm->notifier_lock);
513
514 return true;
515 }
516
517 /*
518 * drm_gpusvm_notifier_ops - MMU interval notifier operations for GPU SVM
519 */
520 static const struct mmu_interval_notifier_ops drm_gpusvm_notifier_ops = {
521 .invalidate = drm_gpusvm_notifier_invalidate,
522 };
523
524 /**
525 * drm_gpusvm_init() - Initialize the GPU SVM.
526 * @gpusvm: Pointer to the GPU SVM structure.
527 * @name: Name of the GPU SVM.
528 * @drm: Pointer to the DRM device structure.
529 * @mm: Pointer to the mm_struct for the address space.
530 * @device_private_page_owner: Device private pages owner.
531 * @mm_start: Start address of GPU SVM.
532 * @mm_range: Range of the GPU SVM.
533 * @notifier_size: Size of individual notifiers.
534 * @ops: Pointer to the operations structure for GPU SVM.
535 * @chunk_sizes: Pointer to the array of chunk sizes used in range allocation.
536 * Entries should be powers of 2 in descending order with last
537 * entry being SZ_4K.
538 * @num_chunks: Number of chunks.
539 *
540 * This function initializes the GPU SVM.
541 *
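* A minimal usage sketch; the gpusvm embedding object (vm->svm), driver_owner,
* driver_gpusvm_ops and the chosen sizes are hypothetical driver choices:
*
* .. code-block:: c
*
*	static const unsigned long driver_chunk_sizes[] = { SZ_2M, SZ_64K, SZ_4K };
*	int err;
*
*	err = drm_gpusvm_init(&vm->svm, "driver-svm", drm, current->mm,
*			      driver_owner, 0, TASK_SIZE, SZ_512M,
*			      &driver_gpusvm_ops, driver_chunk_sizes,
*			      ARRAY_SIZE(driver_chunk_sizes));
*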
542 * Return: 0 on success, a negative error code on failure.
543 */
544 int drm_gpusvm_init(struct drm_gpusvm *gpusvm,
545 const char *name, struct drm_device *drm,
546 struct mm_struct *mm, void *device_private_page_owner,
547 unsigned long mm_start, unsigned long mm_range,
548 unsigned long notifier_size,
549 const struct drm_gpusvm_ops *ops,
550 const unsigned long *chunk_sizes, int num_chunks)
551 {
552 if (!ops->invalidate || !num_chunks)
553 return -EINVAL;
554
555 gpusvm->name = name;
556 gpusvm->drm = drm;
557 gpusvm->mm = mm;
558 gpusvm->device_private_page_owner = device_private_page_owner;
559 gpusvm->mm_start = mm_start;
560 gpusvm->mm_range = mm_range;
561 gpusvm->notifier_size = notifier_size;
562 gpusvm->ops = ops;
563 gpusvm->chunk_sizes = chunk_sizes;
564 gpusvm->num_chunks = num_chunks;
565
566 mmgrab(mm);
567 gpusvm->root = RB_ROOT_CACHED;
568 INIT_LIST_HEAD(&gpusvm->notifier_list);
569
570 init_rwsem(&gpusvm->notifier_lock);
571
572 fs_reclaim_acquire(GFP_KERNEL);
573 might_lock(&gpusvm->notifier_lock);
574 fs_reclaim_release(GFP_KERNEL);
575
576 #ifdef CONFIG_LOCKDEP
577 gpusvm->lock_dep_map = NULL;
578 #endif
579
580 return 0;
581 }
582 EXPORT_SYMBOL_GPL(drm_gpusvm_init);
583
584 /**
585 * drm_gpusvm_notifier_find() - Find GPU SVM notifier
586 * @gpusvm: Pointer to the GPU SVM structure
587 * @fault_addr: Fault address
588 *
589 * This function finds the GPU SVM notifier associated with the fault address.
590 *
591 * Return: Pointer to the GPU SVM notifier on success, NULL otherwise.
592 */
593 static struct drm_gpusvm_notifier *
594 drm_gpusvm_notifier_find(struct drm_gpusvm *gpusvm,
595 unsigned long fault_addr)
596 {
597 return notifier_iter_first(&gpusvm->root, fault_addr, fault_addr + 1);
598 }
599
600 /**
601 * to_drm_gpusvm_notifier() - retrieve the container struct for a given rbtree node
602 * @node: a pointer to the rbtree node embedded within a drm_gpusvm_notifier struct
603 *
604 * Return: A pointer to the containing drm_gpusvm_notifier structure.
605 */
606 static struct drm_gpusvm_notifier *to_drm_gpusvm_notifier(struct rb_node *node)
607 {
608 return container_of(node, struct drm_gpusvm_notifier, itree.rb);
609 }
610
611 /**
612 * drm_gpusvm_notifier_insert() - Insert GPU SVM notifier
613 * @gpusvm: Pointer to the GPU SVM structure
614 * @notifier: Pointer to the GPU SVM notifier structure
615 *
616 * This function inserts the GPU SVM notifier into the GPU SVM RB tree and list.
617 */
618 static void drm_gpusvm_notifier_insert(struct drm_gpusvm *gpusvm,
619 struct drm_gpusvm_notifier *notifier)
620 {
621 struct rb_node *node;
622 struct list_head *head;
623
624 interval_tree_insert(&notifier->itree, &gpusvm->root);
625
626 node = rb_prev(&notifier->itree.rb);
627 if (node)
628 head = &(to_drm_gpusvm_notifier(node))->entry;
629 else
630 head = &gpusvm->notifier_list;
631
632 list_add(&notifier->entry, head);
633 }
634
635 /**
636 * drm_gpusvm_notifier_remove() - Remove GPU SVM notifier
637 * @gpusvm: Pointer to the GPU SVM structure
638 * @notifier: Pointer to the GPU SVM notifier structure
639 *
640 * This function removes the GPU SVM notifier from the GPU SVM RB tree and list.
641 */
642 static void drm_gpusvm_notifier_remove(struct drm_gpusvm *gpusvm,
643 struct drm_gpusvm_notifier *notifier)
644 {
645 interval_tree_remove(&notifier->itree, &gpusvm->root);
646 list_del(&notifier->entry);
647 }
648
649 /**
650 * drm_gpusvm_fini() - Finalize the GPU SVM.
651 * @gpusvm: Pointer to the GPU SVM structure.
652 *
653 * This function finalizes the GPU SVM by cleaning up any remaining ranges and
654 * notifiers, and dropping a reference to struct MM.
655 */
656 void drm_gpusvm_fini(struct drm_gpusvm *gpusvm)
657 {
658 struct drm_gpusvm_notifier *notifier, *next;
659
660 drm_gpusvm_for_each_notifier_safe(notifier, next, gpusvm, 0, LONG_MAX) {
661 struct drm_gpusvm_range *range, *__next;
662
663 /*
664 * Remove notifier first to avoid racing with any invalidation
665 */
666 mmu_interval_notifier_remove(&notifier->notifier);
667 notifier->flags.removed = true;
668
669 drm_gpusvm_for_each_range_safe(range, __next, notifier, 0,
670 LONG_MAX)
671 drm_gpusvm_range_remove(gpusvm, range);
672 }
673
674 mmdrop(gpusvm->mm);
675 WARN_ON(!RB_EMPTY_ROOT(&gpusvm->root.rb_root));
676 }
677 EXPORT_SYMBOL_GPL(drm_gpusvm_fini);
678
679 /**
680 * drm_gpusvm_notifier_alloc() - Allocate GPU SVM notifier
681 * @gpusvm: Pointer to the GPU SVM structure
682 * @fault_addr: Fault address
683 *
684 * This function allocates and initializes the GPU SVM notifier structure.
685 *
686 * Return: Pointer to the allocated GPU SVM notifier on success, ERR_PTR() on failure.
687 */
688 static struct drm_gpusvm_notifier *
689 drm_gpusvm_notifier_alloc(struct drm_gpusvm *gpusvm, unsigned long fault_addr)
690 {
691 struct drm_gpusvm_notifier *notifier;
692
693 if (gpusvm->ops->notifier_alloc)
694 notifier = gpusvm->ops->notifier_alloc();
695 else
696 notifier = kzalloc(sizeof(*notifier), GFP_KERNEL);
697
698 if (!notifier)
699 return ERR_PTR(-ENOMEM);
700
701 notifier->gpusvm = gpusvm;
702 notifier->itree.start = ALIGN_DOWN(fault_addr, gpusvm->notifier_size);
703 notifier->itree.last = ALIGN(fault_addr + 1, gpusvm->notifier_size) - 1;
704 INIT_LIST_HEAD(&notifier->entry);
705 notifier->root = RB_ROOT_CACHED;
706 INIT_LIST_HEAD(&notifier->range_list);
707
708 return notifier;
709 }
710
711 /**
712 * drm_gpusvm_notifier_free() - Free GPU SVM notifier
713 * @gpusvm: Pointer to the GPU SVM structure
714 * @notifier: Pointer to the GPU SVM notifier structure
715 *
716 * This function frees the GPU SVM notifier structure.
717 */
718 static void drm_gpusvm_notifier_free(struct drm_gpusvm *gpusvm,
719 struct drm_gpusvm_notifier *notifier)
720 {
721 WARN_ON(!RB_EMPTY_ROOT(&notifier->root.rb_root));
722
723 if (gpusvm->ops->notifier_free)
724 gpusvm->ops->notifier_free(notifier);
725 else
726 kfree(notifier);
727 }
728
729 /**
730 * to_drm_gpusvm_range() - retrieve the container struct for a given rbtree node
731 * @node: a pointer to the rbtree node embedded within a drm_gpusvm_range struct
732 *
733 * Return: A pointer to the containing drm_gpusvm_range structure.
734 */
735 static struct drm_gpusvm_range *to_drm_gpusvm_range(struct rb_node *node)
736 {
737 return container_of(node, struct drm_gpusvm_range, itree.rb);
738 }
739
740 /**
741 * drm_gpusvm_range_insert() - Insert GPU SVM range
742 * @notifier: Pointer to the GPU SVM notifier structure
743 * @range: Pointer to the GPU SVM range structure
744 *
745 * This function inserts the GPU SVM range into the notifier RB tree and list.
746 */
747 static void drm_gpusvm_range_insert(struct drm_gpusvm_notifier *notifier,
748 struct drm_gpusvm_range *range)
749 {
750 struct rb_node *node;
751 struct list_head *head;
752
753 drm_gpusvm_notifier_lock(notifier->gpusvm);
754 interval_tree_insert(&range->itree, &notifier->root);
755
756 node = rb_prev(&range->itree.rb);
757 if (node)
758 head = &(to_drm_gpusvm_range(node))->entry;
759 else
760 head = &notifier->range_list;
761
762 list_add(&range->entry, head);
763 drm_gpusvm_notifier_unlock(notifier->gpusvm);
764 }
765
766 /**
767 * __drm_gpusvm_range_remove() - Remove GPU SVM range
768 * @notifier: Pointer to the GPU SVM notifier structure
769 * @range: Pointer to the GPU SVM range structure
770 *
771 * This function removes the GPU SVM range from the notifier RB tree and list.
772 */
773 static void __drm_gpusvm_range_remove(struct drm_gpusvm_notifier *notifier,
774 struct drm_gpusvm_range *range)
775 {
776 interval_tree_remove(&range->itree, &notifier->root);
777 list_del(&range->entry);
778 }
779
780 /**
781 * drm_gpusvm_range_alloc() - Allocate GPU SVM range
782 * @gpusvm: Pointer to the GPU SVM structure
783 * @notifier: Pointer to the GPU SVM notifier structure
784 * @fault_addr: Fault address
785 * @chunk_size: Chunk size
786 * @migrate_devmem: Flag indicating whether to migrate device memory
787 *
788 * This function allocates and initializes the GPU SVM range structure.
789 *
790 * Return: Pointer to the allocated GPU SVM range on success, ERR_PTR() on failure.
791 */
792 static struct drm_gpusvm_range *
793 drm_gpusvm_range_alloc(struct drm_gpusvm *gpusvm,
794 struct drm_gpusvm_notifier *notifier,
795 unsigned long fault_addr, unsigned long chunk_size,
796 bool migrate_devmem)
797 {
798 struct drm_gpusvm_range *range;
799
800 if (gpusvm->ops->range_alloc)
801 range = gpusvm->ops->range_alloc(gpusvm);
802 else
803 range = kzalloc(sizeof(*range), GFP_KERNEL);
804
805 if (!range)
806 return ERR_PTR(-ENOMEM);
807
808 kref_init(&range->refcount);
809 range->gpusvm = gpusvm;
810 range->notifier = notifier;
811 range->itree.start = ALIGN_DOWN(fault_addr, chunk_size);
812 range->itree.last = ALIGN(fault_addr + 1, chunk_size) - 1;
813 INIT_LIST_HEAD(&range->entry);
814 range->notifier_seq = LONG_MAX;
815 range->flags.migrate_devmem = migrate_devmem ? 1 : 0;
816
817 return range;
818 }
819
820 /**
821 * drm_gpusvm_check_pages() - Check pages
822 * @gpusvm: Pointer to the GPU SVM structure
823 * @notifier: Pointer to the GPU SVM notifier structure
824 * @start: Start address
825 * @end: End address
826 *
827 * Check if pages between start and end have been faulted in on the CPU. Used to
828 * prevent migration of pages without a CPU backing store.
829 *
830 * Return: True if pages have been faulted in on the CPU, False otherwise
831 */
832 static bool drm_gpusvm_check_pages(struct drm_gpusvm *gpusvm,
833 struct drm_gpusvm_notifier *notifier,
834 unsigned long start, unsigned long end)
835 {
836 struct hmm_range hmm_range = {
837 .default_flags = 0,
838 .notifier = &notifier->notifier,
839 .start = start,
840 .end = end,
841 .dev_private_owner = gpusvm->device_private_page_owner,
842 };
843 unsigned long timeout =
844 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
845 unsigned long *pfns;
846 unsigned long npages = npages_in_range(start, end);
847 int err, i;
848
849 mmap_assert_locked(gpusvm->mm);
850
851 pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
852 if (!pfns)
853 return false;
854
855 hmm_range.notifier_seq = mmu_interval_read_begin(&notifier->notifier);
856 hmm_range.hmm_pfns = pfns;
857
858 while (true) {
859 err = hmm_range_fault(&hmm_range);
860 if (err == -EBUSY) {
861 if (time_after(jiffies, timeout))
862 break;
863
864 hmm_range.notifier_seq =
865 mmu_interval_read_begin(&notifier->notifier);
866 continue;
867 }
868 break;
869 }
870 if (err)
871 goto err_free;
872
873 for (i = 0; i < npages;) {
874 if (!(pfns[i] & HMM_PFN_VALID)) {
875 err = -EFAULT;
876 goto err_free;
877 }
878 i += 0x1 << hmm_pfn_to_map_order(pfns[i]);
879 }
880
881 err_free:
882 kvfree(pfns);
883 return err ? false : true;
884 }
885
886 /**
887 * drm_gpusvm_range_chunk_size() - Determine chunk size for GPU SVM range
888 * @gpusvm: Pointer to the GPU SVM structure
889 * @notifier: Pointer to the GPU SVM notifier structure
890 * @vas: Pointer to the virtual memory area structure
891 * @fault_addr: Fault address
892 * @gpuva_start: Start address of GPUVA which mirrors CPU
893 * @gpuva_end: End address of GPUVA which mirrors CPU
894 * @check_pages_threshold: Check CPU pages for present threshold
895 *
896 * This function determines the chunk size for the GPU SVM range based on the
897 * fault address, GPU SVM chunk sizes, existing GPU SVM ranges, and the virtual
898 * memory area boundaries.
899 *
900 * Return: Chunk size on success, LONG_MAX on failure.
901 */
902 static unsigned long
903 drm_gpusvm_range_chunk_size(struct drm_gpusvm *gpusvm,
904 struct drm_gpusvm_notifier *notifier,
905 struct vm_area_struct *vas,
906 unsigned long fault_addr,
907 unsigned long gpuva_start,
908 unsigned long gpuva_end,
909 unsigned long check_pages_threshold)
910 {
911 unsigned long start, end;
912 int i = 0;
913
914 retry:
915 for (; i < gpusvm->num_chunks; ++i) {
916 start = ALIGN_DOWN(fault_addr, gpusvm->chunk_sizes[i]);
917 end = ALIGN(fault_addr + 1, gpusvm->chunk_sizes[i]);
918
919 if (start >= vas->vm_start && end <= vas->vm_end &&
920 start >= drm_gpusvm_notifier_start(notifier) &&
921 end <= drm_gpusvm_notifier_end(notifier) &&
922 start >= gpuva_start && end <= gpuva_end)
923 break;
924 }
925
926 if (i == gpusvm->num_chunks)
927 return LONG_MAX;
928
929 /*
930 * If allocating more than one page, ensure the range does not overlap
931 * with existing ranges.
932 */
933 if (end - start != SZ_4K) {
934 struct drm_gpusvm_range *range;
935
936 range = drm_gpusvm_range_find(notifier, start, end);
937 if (range) {
938 ++i;
939 goto retry;
940 }
941
942 /*
943 * XXX: Only create range on pages CPU has faulted in. Without
944 * this check, or prefault, on BMG 'xe_exec_system_allocator --r
945 * process-many-malloc' fails. In the failure case, each process
946 * mallocs 16k but the CPU VMA is ~128k which results in 64k SVM
947 * ranges. When migrating the SVM ranges, some processes fail in
948 * drm_gpusvm_migrate_to_devmem with 'migrate.cpages != npages'
949 * and then upon drm_gpusvm_range_get_pages device pages from
950 * other processes are collected + faulted in which creates all
951 * sorts of problems. Unsure exactly how this is happening; the
952 * problem goes away if 'xe_exec_system_allocator --r
953 * process-many-malloc' mallocs at least 64k at a time.
954 */
955 if (end - start <= check_pages_threshold &&
956 !drm_gpusvm_check_pages(gpusvm, notifier, start, end)) {
957 ++i;
958 goto retry;
959 }
960 }
961
962 return end - start;
963 }
964
965 #ifdef CONFIG_LOCKDEP
966 /**
967 * drm_gpusvm_driver_lock_held() - Assert GPU SVM driver lock is held
968 * @gpusvm: Pointer to the GPU SVM structure.
969 *
970 * Ensure driver lock is held.
971 */
972 static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
973 {
974 if ((gpusvm)->lock_dep_map)
975 lockdep_assert(lock_is_held_type((gpusvm)->lock_dep_map, 0));
976 }
977 #else
978 static void drm_gpusvm_driver_lock_held(struct drm_gpusvm *gpusvm)
979 {
980 }
981 #endif
982
983 /**
984 * drm_gpusvm_range_find_or_insert() - Find or insert GPU SVM range
985 * @gpusvm: Pointer to the GPU SVM structure
986 * @fault_addr: Fault address
987 * @gpuva_start: Start address of GPUVA which mirrors CPU
988 * @gpuva_end: End address of GPUVA which mirrors CPU
989 * @ctx: GPU SVM context
990 *
991 * This function finds or inserts a newly allocated GPU SVM range based on the
992 * fault address. Caller must hold a lock to protect range lookup and insertion.
993 *
994 * Return: Pointer to the GPU SVM range on success, ERR_PTR() on failure.
995 */
996 struct drm_gpusvm_range *
997 drm_gpusvm_range_find_or_insert(struct drm_gpusvm *gpusvm,
998 unsigned long fault_addr,
999 unsigned long gpuva_start,
1000 unsigned long gpuva_end,
1001 const struct drm_gpusvm_ctx *ctx)
1002 {
1003 struct drm_gpusvm_notifier *notifier;
1004 struct drm_gpusvm_range *range;
1005 struct mm_struct *mm = gpusvm->mm;
1006 struct vm_area_struct *vas;
1007 bool notifier_alloc = false;
1008 unsigned long chunk_size;
1009 int err;
1010 bool migrate_devmem;
1011
1012 drm_gpusvm_driver_lock_held(gpusvm);
1013
1014 if (fault_addr < gpusvm->mm_start ||
1015 fault_addr > gpusvm->mm_start + gpusvm->mm_range)
1016 return ERR_PTR(-EINVAL);
1017
1018 if (!mmget_not_zero(mm))
1019 return ERR_PTR(-EFAULT);
1020
1021 notifier = drm_gpusvm_notifier_find(gpusvm, fault_addr);
1022 if (!notifier) {
1023 notifier = drm_gpusvm_notifier_alloc(gpusvm, fault_addr);
1024 if (IS_ERR(notifier)) {
1025 err = PTR_ERR(notifier);
1026 goto err_mmunlock;
1027 }
1028 notifier_alloc = true;
1029 err = mmu_interval_notifier_insert(&notifier->notifier,
1030 mm,
1031 drm_gpusvm_notifier_start(notifier),
1032 drm_gpusvm_notifier_size(notifier),
1033 &drm_gpusvm_notifier_ops);
1034 if (err)
1035 goto err_notifier;
1036 }
1037
1038 mmap_read_lock(mm);
1039
1040 vas = vma_lookup(mm, fault_addr);
1041 if (!vas) {
1042 err = -ENOENT;
1043 goto err_notifier_remove;
1044 }
1045
1046 if (!ctx->read_only && !(vas->vm_flags & VM_WRITE)) {
1047 err = -EPERM;
1048 goto err_notifier_remove;
1049 }
1050
1051 range = drm_gpusvm_range_find(notifier, fault_addr, fault_addr + 1);
1052 if (range)
1053 goto out_mmunlock;
1054 /*
1055 * XXX: Short-circuiting migration based on migrate_vma_* current
1056 * limitations. If/when migrate_vma_* add more support, this logic will
1057 * have to change.
1058 */
1059 migrate_devmem = ctx->devmem_possible &&
1060 vma_is_anonymous(vas) && !is_vm_hugetlb_page(vas);
1061
1062 chunk_size = drm_gpusvm_range_chunk_size(gpusvm, notifier, vas,
1063 fault_addr, gpuva_start,
1064 gpuva_end,
1065 ctx->check_pages_threshold);
1066 if (chunk_size == LONG_MAX) {
1067 err = -EINVAL;
1068 goto err_notifier_remove;
1069 }
1070
1071 range = drm_gpusvm_range_alloc(gpusvm, notifier, fault_addr, chunk_size,
1072 migrate_devmem);
1073 if (IS_ERR(range)) {
1074 err = PTR_ERR(range);
1075 goto err_notifier_remove;
1076 }
1077
1078 drm_gpusvm_range_insert(notifier, range);
1079 if (notifier_alloc)
1080 drm_gpusvm_notifier_insert(gpusvm, notifier);
1081
1082 out_mmunlock:
1083 mmap_read_unlock(mm);
1084 mmput(mm);
1085
1086 return range;
1087
1088 err_notifier_remove:
1089 mmap_read_unlock(mm);
1090 if (notifier_alloc)
1091 mmu_interval_notifier_remove(&notifier->notifier);
1092 err_notifier:
1093 if (notifier_alloc)
1094 drm_gpusvm_notifier_free(gpusvm, notifier);
1095 err_mmunlock:
1096 mmput(mm);
1097 return ERR_PTR(err);
1098 }
1099 EXPORT_SYMBOL_GPL(drm_gpusvm_range_find_or_insert);
1100
1101 /**
1102 * __drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range (internal)
1103 * @gpusvm: Pointer to the GPU SVM structure
1104 * @range: Pointer to the GPU SVM range structure
1105 * @npages: Number of pages to unmap
1106 *
1107 * This function unmaps pages associated with a GPU SVM range. It assumes and
1108 * asserts correct locking is in place when called.
1109 */
1110 static void __drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1111 struct drm_gpusvm_range *range,
1112 unsigned long npages)
1113 {
1114 unsigned long i, j;
1115 struct drm_pagemap *dpagemap = range->dpagemap;
1116 struct device *dev = gpusvm->drm->dev;
1117
1118 lockdep_assert_held(&gpusvm->notifier_lock);
1119
1120 if (range->flags.has_dma_mapping) {
1121 struct drm_gpusvm_range_flags flags = {
1122 .__flags = range->flags.__flags,
1123 };
1124
1125 for (i = 0, j = 0; i < npages; j++) {
1126 struct drm_pagemap_device_addr *addr = &range->dma_addr[j];
1127
1128 if (addr->proto == DRM_INTERCONNECT_SYSTEM)
1129 dma_unmap_page(dev,
1130 addr->addr,
1131 PAGE_SIZE << addr->order,
1132 addr->dir);
1133 else if (dpagemap && dpagemap->ops->device_unmap)
1134 dpagemap->ops->device_unmap(dpagemap,
1135 dev, *addr);
1136 i += 1 << addr->order;
1137 }
1138
1139 /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1140 flags.has_devmem_pages = false;
1141 flags.has_dma_mapping = false;
1142 WRITE_ONCE(range->flags.__flags, flags.__flags);
1143
1144 range->dpagemap = NULL;
1145 }
1146 }
1147
1148 /**
1149 * drm_gpusvm_range_free_pages() - Free pages associated with a GPU SVM range
1150 * @gpusvm: Pointer to the GPU SVM structure
1151 * @range: Pointer to the GPU SVM range structure
1152 *
1153 * This function frees the dma address array associated with a GPU SVM range.
1154 */
1155 static void drm_gpusvm_range_free_pages(struct drm_gpusvm *gpusvm,
1156 struct drm_gpusvm_range *range)
1157 {
1158 lockdep_assert_held(&gpusvm->notifier_lock);
1159
1160 if (range->dma_addr) {
1161 kvfree(range->dma_addr);
1162 range->dma_addr = NULL;
1163 }
1164 }
1165
1166 /**
1167 * drm_gpusvm_range_remove() - Remove GPU SVM range
1168 * @gpusvm: Pointer to the GPU SVM structure
1169 * @range: Pointer to the GPU SVM range to be removed
1170 *
1171 * This function removes the specified GPU SVM range and also removes the parent
1172 * GPU SVM notifier if no more ranges remain in the notifier. The caller must
1173 * hold a lock to protect range and notifier removal.
1174 */
1175 void drm_gpusvm_range_remove(struct drm_gpusvm *gpusvm,
1176 struct drm_gpusvm_range *range)
1177 {
1178 unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1179 drm_gpusvm_range_end(range));
1180 struct drm_gpusvm_notifier *notifier;
1181
1182 drm_gpusvm_driver_lock_held(gpusvm);
1183
1184 notifier = drm_gpusvm_notifier_find(gpusvm,
1185 drm_gpusvm_range_start(range));
1186 if (WARN_ON_ONCE(!notifier))
1187 return;
1188
1189 drm_gpusvm_notifier_lock(gpusvm);
1190 __drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1191 drm_gpusvm_range_free_pages(gpusvm, range);
1192 __drm_gpusvm_range_remove(notifier, range);
1193 drm_gpusvm_notifier_unlock(gpusvm);
1194
1195 drm_gpusvm_range_put(range);
1196
1197 if (RB_EMPTY_ROOT(&notifier->root.rb_root)) {
1198 if (!notifier->flags.removed)
1199 mmu_interval_notifier_remove(&notifier->notifier);
1200 drm_gpusvm_notifier_remove(gpusvm, notifier);
1201 drm_gpusvm_notifier_free(gpusvm, notifier);
1202 }
1203 }
1204 EXPORT_SYMBOL_GPL(drm_gpusvm_range_remove);
1205
1206 /**
1207 * drm_gpusvm_range_get() - Get a reference to GPU SVM range
1208 * @range: Pointer to the GPU SVM range
1209 *
1210 * This function increments the reference count of the specified GPU SVM range.
1211 *
1212 * Return: Pointer to the GPU SVM range.
1213 */
1214 struct drm_gpusvm_range *
1215 drm_gpusvm_range_get(struct drm_gpusvm_range *range)
1216 {
1217 kref_get(&range->refcount);
1218
1219 return range;
1220 }
1221 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get);
1222
1223 /**
1224 * drm_gpusvm_range_destroy() - Destroy GPU SVM range
1225 * @refcount: Pointer to the reference counter embedded in the GPU SVM range
1226 *
1227 * This function destroys the specified GPU SVM range when its reference count
1228 * reaches zero. If a custom range-free function is provided, it is invoked to
1229 * free the range; otherwise, the range is deallocated using kfree().
1230 */
1231 static void drm_gpusvm_range_destroy(struct kref *refcount)
1232 {
1233 struct drm_gpusvm_range *range =
1234 container_of(refcount, struct drm_gpusvm_range, refcount);
1235 struct drm_gpusvm *gpusvm = range->gpusvm;
1236
1237 if (gpusvm->ops->range_free)
1238 gpusvm->ops->range_free(range);
1239 else
1240 kfree(range);
1241 }
1242
1243 /**
1244 * drm_gpusvm_range_put() - Put a reference to GPU SVM range
1245 * @range: Pointer to the GPU SVM range
1246 *
1247 * This function decrements the reference count of the specified GPU SVM range
1248 * and frees it when the count reaches zero.
1249 */
1250 void drm_gpusvm_range_put(struct drm_gpusvm_range *range)
1251 {
1252 kref_put(&range->refcount, drm_gpusvm_range_destroy);
1253 }
1254 EXPORT_SYMBOL_GPL(drm_gpusvm_range_put);
1255
1256 /**
1257 * drm_gpusvm_range_pages_valid() - GPU SVM range pages valid
1258 * @gpusvm: Pointer to the GPU SVM structure
1259 * @range: Pointer to the GPU SVM range structure
1260 *
1261 * This function determines if a GPU SVM range's pages are valid. Expected to be
1262 * called holding gpusvm->notifier_lock and as the last step before committing a
1263 * GPU binding. This is akin to a notifier seqno check in the HMM documentation
1264 * but due to wider notifiers (i.e., notifiers which span multiple ranges) this
1265 * function is required for finer grained checking (i.e., per range) if pages
1266 * are valid.
1267 *
1268 * Return: True if GPU SVM range has valid pages, False otherwise
1269 */
1270 bool drm_gpusvm_range_pages_valid(struct drm_gpusvm *gpusvm,
1271 struct drm_gpusvm_range *range)
1272 {
1273 lockdep_assert_held(&gpusvm->notifier_lock);
1274
1275 return range->flags.has_devmem_pages || range->flags.has_dma_mapping;
1276 }
1277 EXPORT_SYMBOL_GPL(drm_gpusvm_range_pages_valid);
1278
1279 /**
1280 * drm_gpusvm_range_pages_valid_unlocked() - GPU SVM range pages valid unlocked
1281 * @gpusvm: Pointer to the GPU SVM structure
1282 * @range: Pointer to the GPU SVM range structure
1283 *
1284 * This function determines if a GPU SVM range's pages are valid. Expected to be
1285 * called without holding gpusvm->notifier_lock.
1286 *
1287 * Return: True if GPU SVM range has valid pages, False otherwise
1288 */
1289 static bool
1290 drm_gpusvm_range_pages_valid_unlocked(struct drm_gpusvm *gpusvm,
1291 struct drm_gpusvm_range *range)
1292 {
1293 bool pages_valid;
1294
1295 if (!range->dma_addr)
1296 return false;
1297
1298 drm_gpusvm_notifier_lock(gpusvm);
1299 pages_valid = drm_gpusvm_range_pages_valid(gpusvm, range);
1300 if (!pages_valid)
1301 drm_gpusvm_range_free_pages(gpusvm, range);
1302 drm_gpusvm_notifier_unlock(gpusvm);
1303
1304 return pages_valid;
1305 }
1306
1307 /**
1308 * drm_gpusvm_range_get_pages() - Get pages for a GPU SVM range
1309 * @gpusvm: Pointer to the GPU SVM structure
1310 * @range: Pointer to the GPU SVM range structure
1311 * @ctx: GPU SVM context
1312 *
1313 * This function gets pages for a GPU SVM range and ensures they are mapped for
1314 * DMA access.
1315 *
1316 * Return: 0 on success, negative error code on failure.
1317 */
1318 int drm_gpusvm_range_get_pages(struct drm_gpusvm *gpusvm,
1319 struct drm_gpusvm_range *range,
1320 const struct drm_gpusvm_ctx *ctx)
1321 {
1322 struct mmu_interval_notifier *notifier = &range->notifier->notifier;
1323 struct hmm_range hmm_range = {
1324 .default_flags = HMM_PFN_REQ_FAULT | (ctx->read_only ? 0 :
1325 HMM_PFN_REQ_WRITE),
1326 .notifier = notifier,
1327 .start = drm_gpusvm_range_start(range),
1328 .end = drm_gpusvm_range_end(range),
1329 .dev_private_owner = gpusvm->device_private_page_owner,
1330 };
1331 struct mm_struct *mm = gpusvm->mm;
1332 struct drm_gpusvm_zdd *zdd;
1333 unsigned long timeout =
1334 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
1335 unsigned long i, j;
1336 unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1337 drm_gpusvm_range_end(range));
1338 unsigned long num_dma_mapped;
1339 unsigned int order = 0;
1340 unsigned long *pfns;
1341 struct page **pages;
1342 int err = 0;
1343 struct dev_pagemap *pagemap;
1344 struct drm_pagemap *dpagemap;
1345 struct drm_gpusvm_range_flags flags;
1346
1347 retry:
1348 hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
1349 if (drm_gpusvm_range_pages_valid_unlocked(gpusvm, range))
1350 goto set_seqno;
1351
1352 pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
1353 if (!pfns)
1354 return -ENOMEM;
1355
1356 if (!mmget_not_zero(mm)) {
1357 err = -EFAULT;
1358 goto err_free;
1359 }
1360
1361 hmm_range.hmm_pfns = pfns;
1362 while (true) {
1363 mmap_read_lock(mm);
1364 err = hmm_range_fault(&hmm_range);
1365 mmap_read_unlock(mm);
1366
1367 if (err == -EBUSY) {
1368 if (time_after(jiffies, timeout))
1369 break;
1370
1371 hmm_range.notifier_seq =
1372 mmu_interval_read_begin(notifier);
1373 continue;
1374 }
1375 break;
1376 }
1377 mmput(mm);
1378 if (err)
1379 goto err_free;
1380
1381 pages = (struct page **)pfns;
1382 map_pages:
1383 /*
1384 * Perform all dma mappings under the notifier lock to not
1385 * access freed pages. A notifier will either block on
1386 * the notifier lock or unmap dma.
1387 */
1388 drm_gpusvm_notifier_lock(gpusvm);
1389
1390 flags.__flags = range->flags.__flags;
1391 if (flags.unmapped) {
1392 drm_gpusvm_notifier_unlock(gpusvm);
1393 err = -EFAULT;
1394 goto err_free;
1395 }
1396
1397 if (mmu_interval_read_retry(notifier, hmm_range.notifier_seq)) {
1398 drm_gpusvm_notifier_unlock(gpusvm);
1399 kvfree(pfns);
1400 goto retry;
1401 }
1402
1403 if (!range->dma_addr) {
1404 /* Unlock and restart mapping to allocate memory. */
1405 drm_gpusvm_notifier_unlock(gpusvm);
1406 range->dma_addr = kvmalloc_array(npages,
1407 sizeof(*range->dma_addr),
1408 GFP_KERNEL);
1409 if (!range->dma_addr) {
1410 err = -ENOMEM;
1411 goto err_free;
1412 }
1413 goto map_pages;
1414 }
1415
1416 zdd = NULL;
1417 num_dma_mapped = 0;
1418 for (i = 0, j = 0; i < npages; ++j) {
1419 struct page *page = hmm_pfn_to_page(pfns[i]);
1420
1421 order = hmm_pfn_to_map_order(pfns[i]);
1422 if (is_device_private_page(page) ||
1423 is_device_coherent_page(page)) {
1424 if (zdd != page->zone_device_data && i > 0) {
1425 err = -EOPNOTSUPP;
1426 goto err_unmap;
1427 }
1428 zdd = page->zone_device_data;
1429 if (pagemap != page_pgmap(page)) {
1430 if (i > 0) {
1431 err = -EOPNOTSUPP;
1432 goto err_unmap;
1433 }
1434
1435 pagemap = page_pgmap(page);
1436 dpagemap = zdd->devmem_allocation->dpagemap;
1437 if (drm_WARN_ON(gpusvm->drm, !dpagemap)) {
1438 /*
1439 * Raced. This is not supposed to happen
1440 * since hmm_range_fault() should've migrated
1441 * this page to system.
1442 */
1443 err = -EAGAIN;
1444 goto err_unmap;
1445 }
1446 }
1447 range->dma_addr[j] =
1448 dpagemap->ops->device_map(dpagemap,
1449 gpusvm->drm->dev,
1450 page, order,
1451 DMA_BIDIRECTIONAL);
1452 if (dma_mapping_error(gpusvm->drm->dev,
1453 range->dma_addr[j].addr)) {
1454 err = -EFAULT;
1455 goto err_unmap;
1456 }
1457
1458 pages[i] = page;
1459 } else {
1460 dma_addr_t addr;
1461
1462 if (is_zone_device_page(page) || zdd) {
1463 err = -EOPNOTSUPP;
1464 goto err_unmap;
1465 }
1466
1467 if (ctx->devmem_only) {
1468 err = -EFAULT;
1469 goto err_unmap;
1470 }
1471
1472 addr = dma_map_page(gpusvm->drm->dev,
1473 page, 0,
1474 PAGE_SIZE << order,
1475 DMA_BIDIRECTIONAL);
1476 if (dma_mapping_error(gpusvm->drm->dev, addr)) {
1477 err = -EFAULT;
1478 goto err_unmap;
1479 }
1480
1481 range->dma_addr[j] = drm_pagemap_device_addr_encode
1482 (addr, DRM_INTERCONNECT_SYSTEM, order,
1483 DMA_BIDIRECTIONAL);
1484 }
1485 i += 1 << order;
1486 num_dma_mapped = i;
1487 flags.has_dma_mapping = true;
1488 }
1489
1490 if (zdd) {
1491 flags.has_devmem_pages = true;
1492 range->dpagemap = dpagemap;
1493 }
1494
1495 /* WRITE_ONCE pairs with READ_ONCE for opportunistic checks */
1496 WRITE_ONCE(range->flags.__flags, flags.__flags);
1497
1498 drm_gpusvm_notifier_unlock(gpusvm);
1499 kvfree(pfns);
1500 set_seqno:
1501 range->notifier_seq = hmm_range.notifier_seq;
1502
1503 return 0;
1504
1505 err_unmap:
1506 __drm_gpusvm_range_unmap_pages(gpusvm, range, num_dma_mapped);
1507 drm_gpusvm_notifier_unlock(gpusvm);
1508 err_free:
1509 kvfree(pfns);
1510 if (err == -EAGAIN)
1511 goto retry;
1512 return err;
1513 }
1514 EXPORT_SYMBOL_GPL(drm_gpusvm_range_get_pages);
1515
1516 /**
1517 * drm_gpusvm_range_unmap_pages() - Unmap pages associated with a GPU SVM range
1518 * @gpusvm: Pointer to the GPU SVM structure
1519 * @range: Pointer to the GPU SVM range structure
1520 * @ctx: GPU SVM context
1521 *
1522 * This function unmaps pages associated with a GPU SVM range. If @in_notifier
1523 * is set, it is assumed that gpusvm->notifier_lock is held in write mode; if it
1524 * is clear, it acquires gpusvm->notifier_lock in read mode. Must be called on
1525 * each GPU SVM range attached to the notifier in gpusvm->ops->invalidate for the
1526 * IOMMU security model.
1527 */
1528 void drm_gpusvm_range_unmap_pages(struct drm_gpusvm *gpusvm,
1529 struct drm_gpusvm_range *range,
1530 const struct drm_gpusvm_ctx *ctx)
1531 {
1532 unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
1533 drm_gpusvm_range_end(range));
1534
1535 if (ctx->in_notifier)
1536 lockdep_assert_held_write(&gpusvm->notifier_lock);
1537 else
1538 drm_gpusvm_notifier_lock(gpusvm);
1539
1540 __drm_gpusvm_range_unmap_pages(gpusvm, range, npages);
1541
1542 if (!ctx->in_notifier)
1543 drm_gpusvm_notifier_unlock(gpusvm);
1544 }
1545 EXPORT_SYMBOL_GPL(drm_gpusvm_range_unmap_pages);
1546
1547 /**
1548 * drm_gpusvm_migration_unlock_put_page() - Put a migration page
1549 * @page: Pointer to the page to put
1550 *
1551 * This function unlocks and puts a page.
1552 */
1553 static void drm_gpusvm_migration_unlock_put_page(struct page *page)
1554 {
1555 unlock_page(page);
1556 put_page(page);
1557 }
1558
1559 /**
1560 * drm_gpusvm_migration_unlock_put_pages() - Put migration pages
1561 * @npages: Number of pages
1562 * @migrate_pfn: Array of migrate page frame numbers
1563 *
1564 * This function unlocks and puts an array of pages.
1565 */
1566 static void drm_gpusvm_migration_unlock_put_pages(unsigned long npages,
1567 unsigned long *migrate_pfn)
1568 {
1569 unsigned long i;
1570
1571 for (i = 0; i < npages; ++i) {
1572 struct page *page;
1573
1574 if (!migrate_pfn[i])
1575 continue;
1576
1577 page = migrate_pfn_to_page(migrate_pfn[i]);
1578 drm_gpusvm_migration_unlock_put_page(page);
1579 migrate_pfn[i] = 0;
1580 }
1581 }
1582
1583 /**
1584 * drm_gpusvm_get_devmem_page() - Get a reference to a device memory page
1585 * @page: Pointer to the page
1586 * @zdd: Pointer to the GPU SVM zone device data
1587 *
1588 * This function associates the given page with the specified GPU SVM zone
1589 * device data and initializes it for zone device usage.
1590 */
1591 static void drm_gpusvm_get_devmem_page(struct page *page,
1592 struct drm_gpusvm_zdd *zdd)
1593 {
1594 page->zone_device_data = drm_gpusvm_zdd_get(zdd);
1595 zone_device_page_init(page);
1596 }
1597
1598 /**
1599 * drm_gpusvm_migrate_map_pages() - Map migration pages for GPU SVM migration
1600 * @dev: The device for which the pages are being mapped
1601 * @dma_addr: Array to store DMA addresses corresponding to mapped pages
1602 * @migrate_pfn: Array of migrate page frame numbers to map
1603 * @npages: Number of pages to map
1604 * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
1605 *
1606 * This function maps pages of memory for migration usage in GPU SVM. It
1607 * iterates over each page frame number provided in @migrate_pfn, maps the
1608 * corresponding page, and stores the DMA address in the provided @dma_addr
1609 * array.
1610 *
1611 * Return: 0 on success, -EFAULT if an error occurs during mapping.
1612 */
1613 static int drm_gpusvm_migrate_map_pages(struct device *dev,
1614 dma_addr_t *dma_addr,
1615 unsigned long *migrate_pfn,
1616 unsigned long npages,
1617 enum dma_data_direction dir)
1618 {
1619 unsigned long i;
1620
1621 for (i = 0; i < npages; ++i) {
1622 struct page *page = migrate_pfn_to_page(migrate_pfn[i]);
1623
1624 if (!page)
1625 continue;
1626
1627 if (WARN_ON_ONCE(is_zone_device_page(page)))
1628 return -EFAULT;
1629
1630 dma_addr[i] = dma_map_page(dev, page, 0, PAGE_SIZE, dir);
1631 if (dma_mapping_error(dev, dma_addr[i]))
1632 return -EFAULT;
1633 }
1634
1635 return 0;
1636 }
1637
1638 /**
1639 * drm_gpusvm_migrate_unmap_pages() - Unmap pages previously mapped for GPU SVM migration
1640 * @dev: The device for which the pages were mapped
1641 * @dma_addr: Array of DMA addresses corresponding to mapped pages
1642 * @npages: Number of pages to unmap
1643 * @dir: Direction of data transfer (e.g., DMA_BIDIRECTIONAL)
1644 *
1645 * This function unmaps previously mapped pages of memory for GPU Shared Virtual
1646 * Memory (SVM). It iterates over each DMA address provided in @dma_addr, skips
1647 * entries that are zero or flagged as mapping errors, and unmaps each remaining page.
1648 */
1649 static void drm_gpusvm_migrate_unmap_pages(struct device *dev,
1650 dma_addr_t *dma_addr,
1651 unsigned long npages,
1652 enum dma_data_direction dir)
1653 {
1654 unsigned long i;
1655
1656 for (i = 0; i < npages; ++i) {
1657 if (!dma_addr[i] || dma_mapping_error(dev, dma_addr[i]))
1658 continue;
1659
1660 dma_unmap_page(dev, dma_addr[i], PAGE_SIZE, dir);
1661 }
1662 }
1663
1664 /**
1665 * drm_gpusvm_migrate_to_devmem() - Migrate GPU SVM range to device memory
1666 * @gpusvm: Pointer to the GPU SVM structure
1667 * @range: Pointer to the GPU SVM range structure
1668 * @devmem_allocation: Pointer to the device memory allocation. The caller
1669 * should hold a reference to the device memory allocation,
1670 * which should be dropped via ops->devmem_release or upon
1671 * the failure of this function.
1672 * @ctx: GPU SVM context
1673 *
1674 * This function migrates the specified GPU SVM range to device memory. It
1675 * performs the necessary setup and invokes the driver-specific operations for
1676 * migration to device memory. Upon successful return, and only then,
1677 * @devmem_allocation can safely reference @range until ops->devmem_release is
1678 * called. This function is expected to be called while holding the mmap lock
1679 * in read mode.
1680 *
1681 * Return: 0 on success, negative error code on failure.
1682 */
1683 int drm_gpusvm_migrate_to_devmem(struct drm_gpusvm *gpusvm,
1684 struct drm_gpusvm_range *range,
1685 struct drm_gpusvm_devmem *devmem_allocation,
1686 const struct drm_gpusvm_ctx *ctx)
1687 {
1688 const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
1689 unsigned long start = drm_gpusvm_range_start(range),
1690 end = drm_gpusvm_range_end(range);
1691 struct migrate_vma migrate = {
1692 .start = start,
1693 .end = end,
1694 .pgmap_owner = gpusvm->device_private_page_owner,
1695 .flags = MIGRATE_VMA_SELECT_SYSTEM,
1696 };
1697 struct mm_struct *mm = gpusvm->mm;
1698 unsigned long i, npages = npages_in_range(start, end);
1699 struct vm_area_struct *vas;
1700 struct drm_gpusvm_zdd *zdd = NULL;
1701 struct page **pages;
1702 dma_addr_t *dma_addr;
1703 void *buf;
1704 int err;
1705
1706 mmap_assert_locked(gpusvm->mm);
1707
1708 if (!range->flags.migrate_devmem)
1709 return -EINVAL;
1710
1711 if (!ops->populate_devmem_pfn || !ops->copy_to_devmem ||
1712 !ops->copy_to_ram)
1713 return -EOPNOTSUPP;
1714
1715 vas = vma_lookup(mm, start);
1716 if (!vas) {
1717 err = -ENOENT;
1718 goto err_out;
1719 }
1720
1721 if (end > vas->vm_end || start < vas->vm_start) {
1722 err = -EINVAL;
1723 goto err_out;
1724 }
1725
1726 if (!vma_is_anonymous(vas)) {
1727 err = -EBUSY;
1728 goto err_out;
1729 }
1730
1731 buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
1732 sizeof(*pages), GFP_KERNEL);
1733 if (!buf) {
1734 err = -ENOMEM;
1735 goto err_out;
1736 }
1737 dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
1738 pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
1739
1740 zdd = drm_gpusvm_zdd_alloc(gpusvm->device_private_page_owner);
1741 if (!zdd) {
1742 err = -ENOMEM;
1743 goto err_free;
1744 }
1745
1746 migrate.vma = vas;
1747 migrate.src = buf;
1748 migrate.dst = migrate.src + npages;
1749
1750 err = migrate_vma_setup(&migrate);
1751 if (err)
1752 goto err_free;
1753
1754 if (!migrate.cpages) {
1755 err = -EFAULT;
1756 goto err_free;
1757 }
1758
1759 if (migrate.cpages != npages) {
1760 err = -EBUSY;
1761 goto err_finalize;
1762 }
1763
1764 err = ops->populate_devmem_pfn(devmem_allocation, npages, migrate.dst);
1765 if (err)
1766 goto err_finalize;
1767
1768 err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
1769 migrate.src, npages, DMA_TO_DEVICE);
1770 if (err)
1771 goto err_finalize;
1772
1773 for (i = 0; i < npages; ++i) {
1774 struct page *page = pfn_to_page(migrate.dst[i]);
1775
1776 pages[i] = page;
1777 migrate.dst[i] = migrate_pfn(migrate.dst[i]);
1778 drm_gpusvm_get_devmem_page(page, zdd);
1779 }
1780
1781 err = ops->copy_to_devmem(pages, dma_addr, npages);
1782 if (err)
1783 goto err_finalize;
1784
1785 /* Upon success, bind the devmem allocation to the range and zdd */
1786 devmem_allocation->timeslice_expiration = get_jiffies_64() +
1787 msecs_to_jiffies(ctx->timeslice_ms);
1788 zdd->devmem_allocation = devmem_allocation; /* Owns ref */
1789
1790 err_finalize:
1791 if (err)
1792 drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
1793 migrate_vma_pages(&migrate);
1794 migrate_vma_finalize(&migrate);
1795 drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
1796 DMA_TO_DEVICE);
1797 err_free:
1798 if (zdd)
1799 drm_gpusvm_zdd_put(zdd);
1800 kvfree(buf);
1801 err_out:
1802 return err;
1803 }
1804 EXPORT_SYMBOL_GPL(drm_gpusvm_migrate_to_devmem);
1805
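/*
 * Example usage (illustrative sketch only): migrating a faulting range to
 * device memory from a driver's GPU page fault handler, with the mmap lock
 * held in read mode. driver_devmem_alloc() and driver_devmem_put() are
 * hypothetical driver helpers; on success the reference handed to GPU SVM is
 * dropped later via ops->devmem_release, on failure the caller drops it.
 *
 *	static int driver_fault_migrate_to_devmem(struct drm_gpusvm *gpusvm,
 *						  struct drm_gpusvm_range *range,
 *						  const struct drm_gpusvm_ctx *ctx)
 *	{
 *		struct drm_gpusvm_devmem *devmem;
 *		int err;
 *
 *		devmem = driver_devmem_alloc(gpusvm, range);
 *		if (IS_ERR(devmem))
 *			return PTR_ERR(devmem);
 *
 *		err = drm_gpusvm_migrate_to_devmem(gpusvm, range, devmem, ctx);
 *		if (err)
 *			driver_devmem_put(devmem);
 *
 *		return err;
 *	}
 */
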
1806 /**
1807 * drm_gpusvm_migrate_populate_ram_pfn() - Populate RAM PFNs for a VM area
1808 * @vas: Pointer to the VM area structure, can be NULL
1809 * @fault_page: Fault page
1810 * @npages: Number of pages to populate
1811 * @mpages: Number of pages to migrate
1812 * @src_mpfn: Source array of migrate PFNs
1813 * @mpfn: Array of migrate PFNs to populate
1814 * @addr: Start address for PFN allocation
1815 *
1816 * This function populates the RAM migrate page frame numbers (PFNs) for the
1817 * specified VM area structure. It allocates and locks pages in the VM area for
1818 * RAM usage. If @vas is non-NULL, alloc_page_vma() is used for allocation;
1819 * otherwise alloc_page() is used.
1820 *
1821 * Return: 0 on success, negative error code on failure.
1822 */
1823 static int drm_gpusvm_migrate_populate_ram_pfn(struct vm_area_struct *vas,
1824 struct page *fault_page,
1825 unsigned long npages,
1826 unsigned long *mpages,
1827 unsigned long *src_mpfn,
1828 unsigned long *mpfn,
1829 unsigned long addr)
1830 {
1831 unsigned long i;
1832
1833 for (i = 0; i < npages; ++i, addr += PAGE_SIZE) {
1834 struct page *page, *src_page;
1835
1836 if (!(src_mpfn[i] & MIGRATE_PFN_MIGRATE))
1837 continue;
1838
1839 src_page = migrate_pfn_to_page(src_mpfn[i]);
1840 if (!src_page)
1841 continue;
1842
1843 if (fault_page) {
1844 if (src_page->zone_device_data !=
1845 fault_page->zone_device_data)
1846 continue;
1847 }
1848
1849 if (vas)
1850 page = alloc_page_vma(GFP_HIGHUSER, vas, addr);
1851 else
1852 page = alloc_page(GFP_HIGHUSER);
1853
1854 if (!page)
1855 goto free_pages;
1856
1857 mpfn[i] = migrate_pfn(page_to_pfn(page));
1858 }
1859
1860 for (i = 0; i < npages; ++i) {
1861 struct page *page = migrate_pfn_to_page(mpfn[i]);
1862
1863 if (!page)
1864 continue;
1865
1866 WARN_ON_ONCE(!trylock_page(page));
1867 ++*mpages;
1868 }
1869
1870 return 0;
1871
1872 free_pages:
1873 for (i = 0; i < npages; ++i) {
1874 struct page *page = migrate_pfn_to_page(mpfn[i]);
1875
1876 if (!page)
1877 continue;
1878
1879 put_page(page);
1880 mpfn[i] = 0;
1881 }
1882 return -ENOMEM;
1883 }
1884
1885 /**
1886 * drm_gpusvm_evict_to_ram() - Evict GPU SVM device memory allocation to RAM
1887 * @devmem_allocation: Pointer to the device memory allocation
1888 *
1889 * Similar to __drm_gpusvm_migrate_to_ram(), but this function does not require
1890 * the mmap lock; migration is done via the migrate_device_* functions.
1891 *
1892 * Return: 0 on success, negative error code on failure.
1893 */
1894 int drm_gpusvm_evict_to_ram(struct drm_gpusvm_devmem *devmem_allocation)
1895 {
1896 const struct drm_gpusvm_devmem_ops *ops = devmem_allocation->ops;
1897 unsigned long npages, mpages = 0;
1898 struct page **pages;
1899 unsigned long *src, *dst;
1900 dma_addr_t *dma_addr;
1901 void *buf;
1902 int i, err = 0;
1903 unsigned int retry_count = 2;
1904
1905 npages = devmem_allocation->size >> PAGE_SHIFT;
1906
1907 retry:
1908 if (!mmget_not_zero(devmem_allocation->mm))
1909 return -EFAULT;
1910
1911 buf = kvcalloc(npages, 2 * sizeof(*src) + sizeof(*dma_addr) +
1912 sizeof(*pages), GFP_KERNEL);
1913 if (!buf) {
1914 err = -ENOMEM;
1915 goto err_out;
1916 }
1917 src = buf;
1918 dst = buf + (sizeof(*src) * npages);
1919 dma_addr = buf + (2 * sizeof(*src) * npages);
1920 pages = buf + (2 * sizeof(*src) + sizeof(*dma_addr)) * npages;
1921
1922 err = ops->populate_devmem_pfn(devmem_allocation, npages, src);
1923 if (err)
1924 goto err_free;
1925
1926 err = migrate_device_pfns(src, npages);
1927 if (err)
1928 goto err_free;
1929
1930 err = drm_gpusvm_migrate_populate_ram_pfn(NULL, NULL, npages, &mpages,
1931 src, dst, 0);
1932 if (err || !mpages)
1933 goto err_finalize;
1934
1935 err = drm_gpusvm_migrate_map_pages(devmem_allocation->dev, dma_addr,
1936 dst, npages, DMA_FROM_DEVICE);
1937 if (err)
1938 goto err_finalize;
1939
1940 for (i = 0; i < npages; ++i)
1941 pages[i] = migrate_pfn_to_page(src[i]);
1942
1943 err = ops->copy_to_ram(pages, dma_addr, npages);
1944 if (err)
1945 goto err_finalize;
1946
1947 err_finalize:
1948 if (err)
1949 drm_gpusvm_migration_unlock_put_pages(npages, dst);
1950 migrate_device_pages(src, dst, npages);
1951 migrate_device_finalize(src, dst, npages);
1952 drm_gpusvm_migrate_unmap_pages(devmem_allocation->dev, dma_addr, npages,
1953 DMA_FROM_DEVICE);
1954 err_free:
1955 kvfree(buf);
1956 err_out:
1957 mmput_async(devmem_allocation->mm);
1958
1959 if (completion_done(&devmem_allocation->detached))
1960 return 0;
1961
1962 if (retry_count--) {
1963 cond_resched();
1964 goto retry;
1965 }
1966
1967 return err ?: -EBUSY;
1968 }
1969 EXPORT_SYMBOL_GPL(drm_gpusvm_evict_to_ram);
1970
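/*
 * Example usage (illustrative sketch only): evicting a device memory
 * allocation back to RAM from a driver memory-pressure path. No mmap lock is
 * required here; the function retries internally and returns -EBUSY if the
 * allocation could not be fully detached.
 *
 *	static int driver_devmem_evict(struct drm_gpusvm_devmem *devmem)
 *	{
 *		int err;
 *
 *		err = drm_gpusvm_evict_to_ram(devmem);
 *		if (err)
 *			return err;
 *
 *		// All device pages are detached; the backing device memory can
 *		// typically be reclaimed once the driver's devmem_release
 *		// callback has run (driver-specific).
 *		return 0;
 *	}
 */
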
1971 /**
1972 * __drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (internal)
1973 * @vas: Pointer to the VM area structure
1974 * @device_private_page_owner: Device private pages owner
1975 * @page: Pointer to the page for fault handling (can be NULL)
1976 * @fault_addr: Fault address
1977 * @size: Size of migration
1978 *
1979 * This internal function performs the migration of the specified GPU SVM range
1980 * to RAM. It sets up the migration, populates and DMA-maps the RAM PFNs, and
1981 * invokes the driver-specific operations for migration to RAM.
1982 *
1983 * Return: 0 on success, negative error code on failure.
1984 */
1985 static int __drm_gpusvm_migrate_to_ram(struct vm_area_struct *vas,
1986 void *device_private_page_owner,
1987 struct page *page,
1988 unsigned long fault_addr,
1989 unsigned long size)
1990 {
1991 struct migrate_vma migrate = {
1992 .vma = vas,
1993 .pgmap_owner = device_private_page_owner,
1994 .flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE |
1995 MIGRATE_VMA_SELECT_DEVICE_COHERENT,
1996 .fault_page = page,
1997 };
1998 struct drm_gpusvm_zdd *zdd;
1999 const struct drm_gpusvm_devmem_ops *ops;
2000 struct device *dev = NULL;
2001 unsigned long npages, mpages = 0;
2002 struct page **pages;
2003 dma_addr_t *dma_addr;
2004 unsigned long start, end;
2005 void *buf;
2006 int i, err = 0;
2007
2008 if (page) {
2009 zdd = page->zone_device_data;
2010 if (time_before64(get_jiffies_64(),
2011 zdd->devmem_allocation->timeslice_expiration))
2012 return 0;
2013 }
2014
2015 start = ALIGN_DOWN(fault_addr, size);
2016 end = ALIGN(fault_addr + 1, size);
2017
2018 /* Corner case where the VMA has been partially unmapped */
2019 if (start < vas->vm_start)
2020 start = vas->vm_start;
2021 if (end > vas->vm_end)
2022 end = vas->vm_end;
2023
2024 migrate.start = start;
2025 migrate.end = end;
2026 npages = npages_in_range(start, end);
2027
2028 buf = kvcalloc(npages, 2 * sizeof(*migrate.src) + sizeof(*dma_addr) +
2029 sizeof(*pages), GFP_KERNEL);
2030 if (!buf) {
2031 err = -ENOMEM;
2032 goto err_out;
2033 }
2034 dma_addr = buf + (2 * sizeof(*migrate.src) * npages);
2035 pages = buf + (2 * sizeof(*migrate.src) + sizeof(*dma_addr)) * npages;
2036
2037 migrate.vma = vas;
2038 migrate.src = buf;
2039 migrate.dst = migrate.src + npages;
2040
2041 err = migrate_vma_setup(&migrate);
2042 if (err)
2043 goto err_free;
2044
2045 /* Raced with another CPU fault, nothing to do */
2046 if (!migrate.cpages)
2047 goto err_free;
2048
2049 if (!page) {
2050 for (i = 0; i < npages; ++i) {
2051 if (!(migrate.src[i] & MIGRATE_PFN_MIGRATE))
2052 continue;
2053
2054 page = migrate_pfn_to_page(migrate.src[i]);
2055 break;
2056 }
2057
2058 if (!page)
2059 goto err_finalize;
2060 }
2061 zdd = page->zone_device_data;
2062 ops = zdd->devmem_allocation->ops;
2063 dev = zdd->devmem_allocation->dev;
2064
2065 err = drm_gpusvm_migrate_populate_ram_pfn(vas, page, npages, &mpages,
2066 migrate.src, migrate.dst,
2067 start);
2068 if (err)
2069 goto err_finalize;
2070
2071 err = drm_gpusvm_migrate_map_pages(dev, dma_addr, migrate.dst, npages,
2072 DMA_FROM_DEVICE);
2073 if (err)
2074 goto err_finalize;
2075
2076 for (i = 0; i < npages; ++i)
2077 pages[i] = migrate_pfn_to_page(migrate.src[i]);
2078
2079 err = ops->copy_to_ram(pages, dma_addr, npages);
2080 if (err)
2081 goto err_finalize;
2082
2083 err_finalize:
2084 if (err)
2085 drm_gpusvm_migration_unlock_put_pages(npages, migrate.dst);
2086 migrate_vma_pages(&migrate);
2087 migrate_vma_finalize(&migrate);
2088 if (dev)
2089 drm_gpusvm_migrate_unmap_pages(dev, dma_addr, npages,
2090 DMA_FROM_DEVICE);
2091 err_free:
2092 kvfree(buf);
2093 err_out:
2094
2095 return err;
2096 }
2097
2098 /**
2099 * drm_gpusvm_range_evict() - Evict GPU SVM range
2100 * @gpusvm: Pointer to the GPU SVM structure
2101 * @range: Pointer to the GPU SVM range to be evicted
2102 *
2103 * This function evicts the specified GPU SVM range. It will not evict coherent pages.
2104 *
2105 * Return: 0 on success, a negative error code on failure.
2106 */
2107 int drm_gpusvm_range_evict(struct drm_gpusvm *gpusvm,
2108 struct drm_gpusvm_range *range)
2109 {
2110 struct mmu_interval_notifier *notifier = &range->notifier->notifier;
2111 struct hmm_range hmm_range = {
2112 .default_flags = HMM_PFN_REQ_FAULT,
2113 .notifier = notifier,
2114 .start = drm_gpusvm_range_start(range),
2115 .end = drm_gpusvm_range_end(range),
2116 .dev_private_owner = NULL,
2117 };
2118 unsigned long timeout =
2119 jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);
2120 unsigned long *pfns;
2121 unsigned long npages = npages_in_range(drm_gpusvm_range_start(range),
2122 drm_gpusvm_range_end(range));
2123 int err = 0;
2124 struct mm_struct *mm = gpusvm->mm;
2125
2126 if (!mmget_not_zero(mm))
2127 return -EFAULT;
2128
2129 pfns = kvmalloc_array(npages, sizeof(*pfns), GFP_KERNEL);
2130 if (!pfns)
2131 return -ENOMEM;
2132
2133 hmm_range.hmm_pfns = pfns;
2134 while (!time_after(jiffies, timeout)) {
2135 hmm_range.notifier_seq = mmu_interval_read_begin(notifier);
2136 if (time_after(jiffies, timeout)) {
2137 err = -ETIME;
2138 break;
2139 }
2140
2141 mmap_read_lock(mm);
2142 err = hmm_range_fault(&hmm_range);
2143 mmap_read_unlock(mm);
2144 if (err != -EBUSY)
2145 break;
2146 }
2147
2148 kvfree(pfns);
2149 mmput(mm);
2150
2151 return err;
2152 }
2153 EXPORT_SYMBOL_GPL(drm_gpusvm_range_evict);
2154
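/*
 * Example usage (illustrative sketch only): forcing a range that is backed by
 * device memory back into system RAM before an operation that cannot tolerate
 * device-private pages. driver_requires_ram() is a hypothetical predicate.
 *
 *	if (driver_requires_ram(range)) {
 *		err = drm_gpusvm_range_evict(gpusvm, range);
 *		if (err)
 *			return err;
 *	}
 */
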
2155 /**
2156 * drm_gpusvm_page_free() - Put GPU SVM zone device data associated with a page
2157 * @page: Pointer to the page
2158 *
2159 * This function is a callback used to put the GPU SVM zone device data
2160 * associated with a page when it is being released.
2161 */
2162 static void drm_gpusvm_page_free(struct page *page)
2163 {
2164 drm_gpusvm_zdd_put(page->zone_device_data);
2165 }
2166
2167 /**
2168 * drm_gpusvm_migrate_to_ram() - Migrate GPU SVM range to RAM (page fault handler)
2169 * @vmf: Pointer to the fault information structure
2170 *
2171 * This function is a page fault handler used to migrate a GPU SVM range to RAM.
2172 * It retrieves the GPU SVM range information from the faulting page and invokes
2173 * the internal migration function to migrate the range back to RAM.
2174 *
2175 * Return: VM_FAULT_SIGBUS on failure, 0 on success.
2176 */
2177 static vm_fault_t drm_gpusvm_migrate_to_ram(struct vm_fault *vmf)
2178 {
2179 struct drm_gpusvm_zdd *zdd = vmf->page->zone_device_data;
2180 int err;
2181
2182 err = __drm_gpusvm_migrate_to_ram(vmf->vma,
2183 zdd->device_private_page_owner,
2184 vmf->page, vmf->address,
2185 zdd->devmem_allocation->size);
2186
2187 return err ? VM_FAULT_SIGBUS : 0;
2188 }
2189
2190 /*
2191 * drm_gpusvm_pagemap_ops - Device page map operations for GPU SVM
2192 */
2193 static const struct dev_pagemap_ops drm_gpusvm_pagemap_ops = {
2194 .page_free = drm_gpusvm_page_free,
2195 .migrate_to_ram = drm_gpusvm_migrate_to_ram,
2196 };
2197
2198 /**
2199 * drm_gpusvm_pagemap_ops_get() - Retrieve GPU SVM device page map operations
2200 *
2201 * Return: Pointer to the GPU SVM device page map operations structure.
2202 */
2203 const struct dev_pagemap_ops *drm_gpusvm_pagemap_ops_get(void)
2204 {
2205 return &drm_gpusvm_pagemap_ops;
2206 }
2207 EXPORT_SYMBOL_GPL(drm_gpusvm_pagemap_ops_get);
2208
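/*
 * Example usage (illustrative sketch only): wiring the GPU SVM page map
 * operations into a device-private struct dev_pagemap when onlining device
 * memory. The resource setup is driver-specific, and owner is assumed to
 * match the device_private_page_owner used by the GPU SVM instance.
 *
 *	static int driver_devmem_online(struct device *dev,
 *					struct dev_pagemap *pagemap,
 *					struct resource *res, void *owner)
 *	{
 *		void *addr;
 *
 *		pagemap->type = MEMORY_DEVICE_PRIVATE;
 *		pagemap->range.start = res->start;
 *		pagemap->range.end = res->end;
 *		pagemap->nr_range = 1;
 *		pagemap->owner = owner;
 *		pagemap->ops = drm_gpusvm_pagemap_ops_get();
 *
 *		addr = devm_memremap_pages(dev, pagemap);
 *		return PTR_ERR_OR_ZERO(addr);
 *	}
 */
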
2209 /**
2210 * drm_gpusvm_has_mapping() - Check if GPU SVM has mapping for the given address range
2211 * @gpusvm: Pointer to the GPU SVM structure.
2212 * @start: Start address
2213 * @end: End address
2214 *
2215 * Return: true if GPU SVM has a mapping within the given range, false otherwise
2216 */
2217 bool drm_gpusvm_has_mapping(struct drm_gpusvm *gpusvm, unsigned long start,
2218 unsigned long end)
2219 {
2220 struct drm_gpusvm_notifier *notifier;
2221
2222 drm_gpusvm_for_each_notifier(notifier, gpusvm, start, end) {
2223 struct drm_gpusvm_range *range = NULL;
2224
2225 drm_gpusvm_for_each_range(range, notifier, start, end)
2226 return true;
2227 }
2228
2229 return false;
2230 }
2231 EXPORT_SYMBOL_GPL(drm_gpusvm_has_mapping);
2232
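/*
 * Example usage (illustrative sketch only): rejecting a driver operation that
 * would conflict with existing GPU SVM mappings over an address range.
 *
 *	if (drm_gpusvm_has_mapping(gpusvm, start, end))
 *		return -EBUSY;
 */
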
2233 /**
2234 * drm_gpusvm_range_set_unmapped() - Mark a GPU SVM range as unmapped
2235 * @range: Pointer to the GPU SVM range structure.
2236 * @mmu_range: Pointer to the MMU notifier range structure.
2237 *
2238 * This function marks a GPU SVM range as unmapped and sets the partial_unmap
2239 * flag if the range is only partially covered by the provided MMU notifier range.
2240 */
2241 void drm_gpusvm_range_set_unmapped(struct drm_gpusvm_range *range,
2242 const struct mmu_notifier_range *mmu_range)
2243 {
2244 lockdep_assert_held_write(&range->gpusvm->notifier_lock);
2245
2246 range->flags.unmapped = true;
2247 if (drm_gpusvm_range_start(range) < mmu_range->start ||
2248 drm_gpusvm_range_end(range) > mmu_range->end)
2249 range->flags.partial_unmap = true;
2250 }
2251 EXPORT_SYMBOL_GPL(drm_gpusvm_range_set_unmapped);
2252
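/*
 * Example usage (illustrative sketch only): handling an MMU_NOTIFY_UNMAP event
 * in the driver's invalidate callback, with gpusvm->notifier_lock held in
 * write mode. driver_garbage_collector_add() is a hypothetical helper that
 * queues the range for later unbind and destruction.
 *
 *	if (mmu_range->event == MMU_NOTIFY_UNMAP) {
 *		drm_gpusvm_range_set_unmapped(range, mmu_range);
 *		driver_garbage_collector_add(gpusvm, range);
 *	}
 */
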
2253 /**
2254 * drm_gpusvm_devmem_init() - Initialize a GPU SVM device memory allocation
2255 * @devmem_allocation: Pointer to the device memory allocation to initialize
2256 * @dev: Pointer to the device to which the device memory allocation belongs
2257 * @mm: Pointer to the mm_struct for the address space
2258 * @ops: Pointer to the operations structure for GPU SVM device memory
2259 * @dpagemap: The struct drm_pagemap we're allocating from.
2260 * @size: Size of device memory allocation
2261 */
2262 void drm_gpusvm_devmem_init(struct drm_gpusvm_devmem *devmem_allocation,
2263 struct device *dev, struct mm_struct *mm,
2264 const struct drm_gpusvm_devmem_ops *ops,
2265 struct drm_pagemap *dpagemap, size_t size)
2266 {
2267 init_completion(&devmem_allocation->detached);
2268 devmem_allocation->dev = dev;
2269 devmem_allocation->mm = mm;
2270 devmem_allocation->ops = ops;
2271 devmem_allocation->dpagemap = dpagemap;
2272 devmem_allocation->size = size;
2273 }
2274 EXPORT_SYMBOL_GPL(drm_gpusvm_devmem_init);
2275
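/*
 * Example usage (illustrative sketch only): embedding struct drm_gpusvm_devmem
 * in a driver-side allocation object and initializing it before migration.
 * struct driver_devmem and driver_devmem_ops are hypothetical.
 *
 *	struct driver_devmem {
 *		struct drm_gpusvm_devmem base;
 *		// driver-specific backing store, e.g. a buffer object
 *	};
 *
 *	static struct driver_devmem *
 *	driver_devmem_create(struct drm_gpusvm *gpusvm, struct device *dev,
 *			     struct drm_pagemap *dpagemap, size_t size)
 *	{
 *		struct driver_devmem *devmem;
 *
 *		devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
 *		if (!devmem)
 *			return ERR_PTR(-ENOMEM);
 *
 *		drm_gpusvm_devmem_init(&devmem->base, dev, gpusvm->mm,
 *				       &driver_devmem_ops, dpagemap, size);
 *
 *		return devmem;
 *	}
 */
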
2276 MODULE_DESCRIPTION("DRM GPUSVM");
2277 MODULE_LICENSE("GPL");
2278