1 // SPDX-License-Identifier: MIT 2 /* 3 * Copyright © 2014-2016 Intel Corporation 4 */ 5 6 #include <linux/pagevec.h> 7 #include <linux/shmem_fs.h> 8 #include <linux/swap.h> 9 10 #include <drm/drm_cache.h> 11 12 #include "gem/i915_gem_region.h" 13 #include "i915_drv.h" 14 #include "i915_gem_object.h" 15 #include "i915_gem_tiling.h" 16 #include "i915_gemfs.h" 17 #include "i915_scatterlist.h" 18 #include "i915_trace.h" 19 20 /* 21 * Move folios to appropriate lru and release the batch, decrementing the 22 * ref count of those folios. 23 */ 24 static void check_release_folio_batch(struct folio_batch *fbatch) 25 { 26 check_move_unevictable_folios(fbatch); 27 __folio_batch_release(fbatch); 28 cond_resched(); 29 } 30 31 void shmem_sg_free_table(struct sg_table *st, struct address_space *mapping, 32 bool dirty, bool backup) 33 { 34 struct sgt_iter sgt_iter; 35 struct folio_batch fbatch; 36 struct folio *last = NULL; 37 struct page *page; 38 39 mapping_clear_unevictable(mapping); 40 41 folio_batch_init(&fbatch); 42 for_each_sgt_page(page, sgt_iter, st) { 43 struct folio *folio = page_folio(page); 44 45 if (folio == last) 46 continue; 47 last = folio; 48 if (dirty) 49 folio_mark_dirty(folio); 50 if (backup) 51 folio_mark_accessed(folio); 52 53 if (!folio_batch_add(&fbatch, folio)) 54 check_release_folio_batch(&fbatch); 55 } 56 if (fbatch.nr) 57 check_release_folio_batch(&fbatch); 58 59 sg_free_table(st); 60 } 61 62 int shmem_sg_alloc_table(struct drm_i915_private *i915, struct sg_table *st, 63 size_t size, struct intel_memory_region *mr, 64 struct address_space *mapping, 65 unsigned int max_segment) 66 { 67 unsigned int page_count; /* restricted by sg_alloc_table */ 68 unsigned long i; 69 struct scatterlist *sg; 70 unsigned long next_pfn = 0; /* suppress gcc warning */ 71 gfp_t noreclaim; 72 int ret; 73 74 if (overflows_type(size / PAGE_SIZE, page_count)) 75 return -E2BIG; 76 77 page_count = size / PAGE_SIZE; 78 /* 79 * If there's no chance of allocating enough pages for the whole 80 * object, bail early. 81 */ 82 if (size > resource_size(&mr->region)) 83 return -ENOMEM; 84 85 if (sg_alloc_table(st, page_count, GFP_KERNEL | __GFP_NOWARN)) 86 return -ENOMEM; 87 88 /* 89 * Get the list of pages out of our struct file. They'll be pinned 90 * at this point until we release them. 91 * 92 * Fail silently without starting the shrinker 93 */ 94 mapping_set_unevictable(mapping); 95 noreclaim = mapping_gfp_constraint(mapping, ~__GFP_RECLAIM); 96 noreclaim |= __GFP_NORETRY | __GFP_NOWARN; 97 98 sg = st->sgl; 99 st->nents = 0; 100 for (i = 0; i < page_count; i++) { 101 struct folio *folio; 102 unsigned long nr_pages; 103 const unsigned int shrink[] = { 104 I915_SHRINK_BOUND | I915_SHRINK_UNBOUND, 105 0, 106 }, *s = shrink; 107 gfp_t gfp = noreclaim; 108 109 do { 110 cond_resched(); 111 folio = shmem_read_folio_gfp(mapping, i, gfp); 112 if (!IS_ERR(folio)) 113 break; 114 115 if (!*s) { 116 ret = PTR_ERR(folio); 117 goto err_sg; 118 } 119 120 i915_gem_shrink(NULL, i915, 2 * page_count, NULL, *s++); 121 122 /* 123 * We've tried hard to allocate the memory by reaping 124 * our own buffer, now let the real VM do its job and 125 * go down in flames if truly OOM. 126 * 127 * However, since graphics tend to be disposable, 128 * defer the oom here by reporting the ENOMEM back 129 * to userspace. 130 */ 131 if (!*s) { 132 /* reclaim and warn, but no oom */ 133 gfp = mapping_gfp_mask(mapping); 134 135 /* 136 * Our bo are always dirty and so we require 137 * kswapd to reclaim our pages (direct reclaim 138 * does not effectively begin pageout of our 139 * buffers on its own). However, direct reclaim 140 * only waits for kswapd when under allocation 141 * congestion. So as a result __GFP_RECLAIM is 142 * unreliable and fails to actually reclaim our 143 * dirty pages -- unless you try over and over 144 * again with !__GFP_NORETRY. However, we still 145 * want to fail this allocation rather than 146 * trigger the out-of-memory killer and for 147 * this we want __GFP_RETRY_MAYFAIL. 148 */ 149 gfp |= __GFP_RETRY_MAYFAIL | __GFP_NOWARN; 150 } 151 } while (1); 152 153 nr_pages = min_t(unsigned long, 154 folio_nr_pages(folio), page_count - i); 155 if (!i || 156 sg->length >= max_segment || 157 folio_pfn(folio) != next_pfn) { 158 if (i) 159 sg = sg_next(sg); 160 161 st->nents++; 162 sg_set_folio(sg, folio, nr_pages * PAGE_SIZE, 0); 163 } else { 164 /* XXX: could overflow? */ 165 sg->length += nr_pages * PAGE_SIZE; 166 } 167 next_pfn = folio_pfn(folio) + nr_pages; 168 i += nr_pages - 1; 169 170 /* Check that the i965g/gm workaround works. */ 171 GEM_BUG_ON(gfp & __GFP_DMA32 && next_pfn >= 0x00100000UL); 172 } 173 if (sg) /* loop terminated early; short sg table */ 174 sg_mark_end(sg); 175 176 /* Trim unused sg entries to avoid wasting memory. */ 177 i915_sg_trim(st); 178 179 return 0; 180 err_sg: 181 sg_mark_end(sg); 182 if (sg != st->sgl) { 183 shmem_sg_free_table(st, mapping, false, false); 184 } else { 185 mapping_clear_unevictable(mapping); 186 sg_free_table(st); 187 } 188 189 /* 190 * shmemfs first checks if there is enough memory to allocate the page 191 * and reports ENOSPC should there be insufficient, along with the usual 192 * ENOMEM for a genuine allocation failure. 193 * 194 * We use ENOSPC in our driver to mean that we have run out of aperture 195 * space and so want to translate the error from shmemfs back to our 196 * usual understanding of ENOMEM. 197 */ 198 if (ret == -ENOSPC) 199 ret = -ENOMEM; 200 201 return ret; 202 } 203 204 static int shmem_get_pages(struct drm_i915_gem_object *obj) 205 { 206 struct drm_i915_private *i915 = to_i915(obj->base.dev); 207 struct intel_memory_region *mem = obj->mm.region; 208 struct address_space *mapping = obj->base.filp->f_mapping; 209 unsigned int max_segment = i915_sg_segment_size(i915->drm.dev); 210 struct sg_table *st; 211 int ret; 212 213 /* 214 * Assert that the object is not currently in any GPU domain. As it 215 * wasn't in the GTT, there shouldn't be any way it could have been in 216 * a GPU cache 217 */ 218 GEM_BUG_ON(obj->read_domains & I915_GEM_GPU_DOMAINS); 219 GEM_BUG_ON(obj->write_domain & I915_GEM_GPU_DOMAINS); 220 221 rebuild_st: 222 st = kmalloc(sizeof(*st), GFP_KERNEL | __GFP_NOWARN); 223 if (!st) 224 return -ENOMEM; 225 226 ret = shmem_sg_alloc_table(i915, st, obj->base.size, mem, mapping, 227 max_segment); 228 if (ret) 229 goto err_st; 230 231 ret = i915_gem_gtt_prepare_pages(obj, st); 232 if (ret) { 233 /* 234 * DMA remapping failed? One possible cause is that 235 * it could not reserve enough large entries, asking 236 * for PAGE_SIZE chunks instead may be helpful. 237 */ 238 if (max_segment > PAGE_SIZE) { 239 shmem_sg_free_table(st, mapping, false, false); 240 kfree(st); 241 242 max_segment = PAGE_SIZE; 243 goto rebuild_st; 244 } else { 245 dev_warn(i915->drm.dev, 246 "Failed to DMA remap %zu pages\n", 247 obj->base.size >> PAGE_SHIFT); 248 goto err_pages; 249 } 250 } 251 252 if (i915_gem_object_needs_bit17_swizzle(obj)) 253 i915_gem_object_do_bit_17_swizzle(obj, st); 254 255 if (i915_gem_object_can_bypass_llc(obj)) 256 obj->cache_dirty = true; 257 258 __i915_gem_object_set_pages(obj, st); 259 260 return 0; 261 262 err_pages: 263 shmem_sg_free_table(st, mapping, false, false); 264 /* 265 * shmemfs first checks if there is enough memory to allocate the page 266 * and reports ENOSPC should there be insufficient, along with the usual 267 * ENOMEM for a genuine allocation failure. 268 * 269 * We use ENOSPC in our driver to mean that we have run out of aperture 270 * space and so want to translate the error from shmemfs back to our 271 * usual understanding of ENOMEM. 272 */ 273 err_st: 274 if (ret == -ENOSPC) 275 ret = -ENOMEM; 276 277 kfree(st); 278 279 return ret; 280 } 281 282 static int 283 shmem_truncate(struct drm_i915_gem_object *obj) 284 { 285 /* 286 * Our goal here is to return as much of the memory as 287 * is possible back to the system as we are called from OOM. 288 * To do this we must instruct the shmfs to drop all of its 289 * backing pages, *now*. 290 */ 291 shmem_truncate_range(file_inode(obj->base.filp), 0, (loff_t)-1); 292 obj->mm.madv = __I915_MADV_PURGED; 293 obj->mm.pages = ERR_PTR(-EFAULT); 294 295 return 0; 296 } 297 298 void __shmem_writeback(size_t size, struct address_space *mapping) 299 { 300 struct writeback_control wbc = { 301 .sync_mode = WB_SYNC_NONE, 302 .nr_to_write = SWAP_CLUSTER_MAX, 303 .range_start = 0, 304 .range_end = LLONG_MAX, 305 .for_reclaim = 1, 306 }; 307 struct folio *folio = NULL; 308 int error = 0; 309 310 /* 311 * Leave mmapings intact (GTT will have been revoked on unbinding, 312 * leaving only CPU mmapings around) and add those folios to the LRU 313 * instead of invoking writeback so they are aged and paged out 314 * as normal. 315 */ 316 while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { 317 if (folio_mapped(folio)) 318 folio_redirty_for_writepage(&wbc, folio); 319 else 320 error = shmem_writeout(folio, &wbc); 321 } 322 } 323 324 static void 325 shmem_writeback(struct drm_i915_gem_object *obj) 326 { 327 __shmem_writeback(obj->base.size, obj->base.filp->f_mapping); 328 } 329 330 static int shmem_shrink(struct drm_i915_gem_object *obj, unsigned int flags) 331 { 332 switch (obj->mm.madv) { 333 case I915_MADV_DONTNEED: 334 return i915_gem_object_truncate(obj); 335 case __I915_MADV_PURGED: 336 return 0; 337 } 338 339 if (flags & I915_GEM_OBJECT_SHRINK_WRITEBACK) 340 shmem_writeback(obj); 341 342 return 0; 343 } 344 345 void 346 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj, 347 struct sg_table *pages, 348 bool needs_clflush) 349 { 350 struct drm_i915_private *i915 = to_i915(obj->base.dev); 351 352 GEM_BUG_ON(obj->mm.madv == __I915_MADV_PURGED); 353 354 if (obj->mm.madv == I915_MADV_DONTNEED) 355 obj->mm.dirty = false; 356 357 if (needs_clflush && 358 (obj->read_domains & I915_GEM_DOMAIN_CPU) == 0 && 359 !(obj->cache_coherent & I915_BO_CACHE_COHERENT_FOR_READ)) 360 drm_clflush_sg(pages); 361 362 __start_cpu_write(obj); 363 /* 364 * On non-LLC igfx platforms, force the flush-on-acquire if this is ever 365 * swapped-in. Our async flush path is not trust worthy enough yet(and 366 * happens in the wrong order), and with some tricks it's conceivable 367 * for userspace to change the cache-level to I915_CACHE_NONE after the 368 * pages are swapped-in, and since execbuf binds the object before doing 369 * the async flush, we have a race window. 370 */ 371 if (!HAS_LLC(i915) && !IS_DGFX(i915)) 372 obj->cache_dirty = true; 373 } 374 375 void i915_gem_object_put_pages_shmem(struct drm_i915_gem_object *obj, struct sg_table *pages) 376 { 377 __i915_gem_object_release_shmem(obj, pages, true); 378 379 i915_gem_gtt_finish_pages(obj, pages); 380 381 if (i915_gem_object_needs_bit17_swizzle(obj)) 382 i915_gem_object_save_bit_17_swizzle(obj, pages); 383 384 shmem_sg_free_table(pages, file_inode(obj->base.filp)->i_mapping, 385 obj->mm.dirty, obj->mm.madv == I915_MADV_WILLNEED); 386 kfree(pages); 387 obj->mm.dirty = false; 388 } 389 390 static void 391 shmem_put_pages(struct drm_i915_gem_object *obj, struct sg_table *pages) 392 { 393 if (likely(i915_gem_object_has_struct_page(obj))) 394 i915_gem_object_put_pages_shmem(obj, pages); 395 else 396 i915_gem_object_put_pages_phys(obj, pages); 397 } 398 399 static int 400 shmem_pwrite(struct drm_i915_gem_object *obj, 401 const struct drm_i915_gem_pwrite *arg) 402 { 403 struct address_space *mapping = obj->base.filp->f_mapping; 404 const struct address_space_operations *aops = mapping->a_ops; 405 char __user *user_data = u64_to_user_ptr(arg->data_ptr); 406 u64 remain; 407 loff_t pos; 408 unsigned int pg; 409 410 /* Caller already validated user args */ 411 GEM_BUG_ON(!access_ok(user_data, arg->size)); 412 413 if (!i915_gem_object_has_struct_page(obj)) 414 return i915_gem_object_pwrite_phys(obj, arg); 415 416 /* 417 * Before we instantiate/pin the backing store for our use, we 418 * can prepopulate the shmemfs filp efficiently using a write into 419 * the pagecache. We avoid the penalty of instantiating all the 420 * pages, important if the user is just writing to a few and never 421 * uses the object on the GPU, and using a direct write into shmemfs 422 * allows it to avoid the cost of retrieving a page (either swapin 423 * or clearing-before-use) before it is overwritten. 424 */ 425 if (i915_gem_object_has_pages(obj)) 426 return -ENODEV; 427 428 if (obj->mm.madv != I915_MADV_WILLNEED) 429 return -EFAULT; 430 431 /* 432 * Before the pages are instantiated the object is treated as being 433 * in the CPU domain. The pages will be clflushed as required before 434 * use, and we can freely write into the pages directly. If userspace 435 * races pwrite with any other operation; corruption will ensue - 436 * that is userspace's prerogative! 437 */ 438 439 remain = arg->size; 440 pos = arg->offset; 441 pg = offset_in_page(pos); 442 443 do { 444 unsigned int len, unwritten; 445 struct folio *folio; 446 void *data, *vaddr; 447 int err; 448 char __maybe_unused c; 449 450 len = PAGE_SIZE - pg; 451 if (len > remain) 452 len = remain; 453 454 /* Prefault the user page to reduce potential recursion */ 455 err = __get_user(c, user_data); 456 if (err) 457 return err; 458 459 err = __get_user(c, user_data + len - 1); 460 if (err) 461 return err; 462 463 err = aops->write_begin(obj->base.filp, mapping, pos, len, 464 &folio, &data); 465 if (err < 0) 466 return err; 467 468 vaddr = kmap_local_folio(folio, offset_in_folio(folio, pos)); 469 pagefault_disable(); 470 unwritten = __copy_from_user_inatomic(vaddr, user_data, len); 471 pagefault_enable(); 472 kunmap_local(vaddr); 473 474 err = aops->write_end(obj->base.filp, mapping, pos, len, 475 len - unwritten, folio, data); 476 if (err < 0) 477 return err; 478 479 /* We don't handle -EFAULT, leave it to the caller to check */ 480 if (unwritten) 481 return -ENODEV; 482 483 remain -= len; 484 user_data += len; 485 pos += len; 486 pg = 0; 487 } while (remain); 488 489 return 0; 490 } 491 492 static int 493 shmem_pread(struct drm_i915_gem_object *obj, 494 const struct drm_i915_gem_pread *arg) 495 { 496 if (!i915_gem_object_has_struct_page(obj)) 497 return i915_gem_object_pread_phys(obj, arg); 498 499 return -ENODEV; 500 } 501 502 static void shmem_release(struct drm_i915_gem_object *obj) 503 { 504 if (i915_gem_object_has_struct_page(obj)) 505 i915_gem_object_release_memory_region(obj); 506 507 fput(obj->base.filp); 508 } 509 510 const struct drm_i915_gem_object_ops i915_gem_shmem_ops = { 511 .name = "i915_gem_object_shmem", 512 .flags = I915_GEM_OBJECT_IS_SHRINKABLE, 513 514 .get_pages = shmem_get_pages, 515 .put_pages = shmem_put_pages, 516 .truncate = shmem_truncate, 517 .shrink = shmem_shrink, 518 519 .pwrite = shmem_pwrite, 520 .pread = shmem_pread, 521 522 .release = shmem_release, 523 }; 524 525 static int __create_shmem(struct drm_i915_private *i915, 526 struct drm_gem_object *obj, 527 resource_size_t size) 528 { 529 unsigned long flags = VM_NORESERVE; 530 struct file *filp; 531 532 drm_gem_private_object_init(&i915->drm, obj, size); 533 534 /* XXX: The __shmem_file_setup() function returns -EINVAL if size is 535 * greater than MAX_LFS_FILESIZE. 536 * To handle the same error as other code that returns -E2BIG when 537 * the size is too large, we add a code that returns -E2BIG when the 538 * size is larger than the size that can be handled. 539 * If BITS_PER_LONG is 32, size > MAX_LFS_FILESIZE is always false, 540 * so we only needs to check when BITS_PER_LONG is 64. 541 * If BITS_PER_LONG is 32, E2BIG checks are processed when 542 * i915_gem_object_size_2big() is called before init_object() callback 543 * is called. 544 */ 545 if (BITS_PER_LONG == 64 && size > MAX_LFS_FILESIZE) 546 return -E2BIG; 547 548 if (i915->mm.gemfs) 549 filp = shmem_file_setup_with_mnt(i915->mm.gemfs, "i915", size, 550 flags); 551 else 552 filp = shmem_file_setup("i915", size, flags); 553 if (IS_ERR(filp)) 554 return PTR_ERR(filp); 555 556 obj->filp = filp; 557 return 0; 558 } 559 560 static int shmem_object_init(struct intel_memory_region *mem, 561 struct drm_i915_gem_object *obj, 562 resource_size_t offset, 563 resource_size_t size, 564 resource_size_t page_size, 565 unsigned int flags) 566 { 567 static struct lock_class_key lock_class; 568 struct drm_i915_private *i915 = mem->i915; 569 struct address_space *mapping; 570 unsigned int cache_level; 571 gfp_t mask; 572 int ret; 573 574 ret = __create_shmem(i915, &obj->base, size); 575 if (ret) 576 return ret; 577 578 mask = GFP_HIGHUSER | __GFP_RECLAIMABLE; 579 if (IS_I965GM(i915) || IS_I965G(i915)) { 580 /* 965gm cannot relocate objects above 4GiB. */ 581 mask &= ~__GFP_HIGHMEM; 582 mask |= __GFP_DMA32; 583 } 584 585 mapping = obj->base.filp->f_mapping; 586 mapping_set_gfp_mask(mapping, mask); 587 GEM_BUG_ON(!(mapping_gfp_mask(mapping) & __GFP_RECLAIM)); 588 589 i915_gem_object_init(obj, &i915_gem_shmem_ops, &lock_class, flags); 590 obj->mem_flags |= I915_BO_FLAG_STRUCT_PAGE; 591 obj->write_domain = I915_GEM_DOMAIN_CPU; 592 obj->read_domains = I915_GEM_DOMAIN_CPU; 593 594 /* 595 * MTL doesn't snoop CPU cache by default for GPU access (namely 596 * 1-way coherency). However some UMD's are currently depending on 597 * that. Make 1-way coherent the default setting for MTL. A follow 598 * up patch will extend the GEM_CREATE uAPI to allow UMD's specify 599 * caching mode at BO creation time 600 */ 601 if (HAS_LLC(i915) || (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70))) 602 /* On some devices, we can have the GPU use the LLC (the CPU 603 * cache) for about a 10% performance improvement 604 * compared to uncached. Graphics requests other than 605 * display scanout are coherent with the CPU in 606 * accessing this cache. This means in this mode we 607 * don't need to clflush on the CPU side, and on the 608 * GPU side we only need to flush internal caches to 609 * get data visible to the CPU. 610 * 611 * However, we maintain the display planes as UC, and so 612 * need to rebind when first used as such. 613 */ 614 cache_level = I915_CACHE_LLC; 615 else 616 cache_level = I915_CACHE_NONE; 617 618 i915_gem_object_set_cache_coherency(obj, cache_level); 619 620 i915_gem_object_init_memory_region(obj, mem); 621 622 return 0; 623 } 624 625 struct drm_i915_gem_object * 626 i915_gem_object_create_shmem(struct drm_i915_private *i915, 627 resource_size_t size) 628 { 629 return i915_gem_object_create_region(i915->mm.regions[INTEL_REGION_SMEM], 630 size, 0, 0); 631 } 632 633 /* Allocate a new GEM object and fill it with the supplied data */ 634 struct drm_i915_gem_object * 635 i915_gem_object_create_shmem_from_data(struct drm_i915_private *i915, 636 const void *data, resource_size_t size) 637 { 638 struct drm_i915_gem_object *obj; 639 struct file *file; 640 const struct address_space_operations *aops; 641 loff_t pos; 642 int err; 643 644 GEM_WARN_ON(IS_DGFX(i915)); 645 obj = i915_gem_object_create_shmem(i915, round_up(size, PAGE_SIZE)); 646 if (IS_ERR(obj)) 647 return obj; 648 649 GEM_BUG_ON(obj->write_domain != I915_GEM_DOMAIN_CPU); 650 651 file = obj->base.filp; 652 aops = file->f_mapping->a_ops; 653 pos = 0; 654 do { 655 unsigned int len = min_t(typeof(size), size, PAGE_SIZE); 656 struct folio *folio; 657 void *fsdata; 658 659 err = aops->write_begin(file, file->f_mapping, pos, len, 660 &folio, &fsdata); 661 if (err < 0) 662 goto fail; 663 664 memcpy_to_folio(folio, offset_in_folio(folio, pos), data, len); 665 666 err = aops->write_end(file, file->f_mapping, pos, len, len, 667 folio, fsdata); 668 if (err < 0) 669 goto fail; 670 671 size -= len; 672 data += len; 673 pos += len; 674 } while (size); 675 676 return obj; 677 678 fail: 679 i915_gem_object_put(obj); 680 return ERR_PTR(err); 681 } 682 683 static int init_shmem(struct intel_memory_region *mem) 684 { 685 i915_gemfs_init(mem->i915); 686 intel_memory_region_set_name(mem, "system"); 687 688 return 0; /* We have fallback to the kernel mnt if gemfs init failed. */ 689 } 690 691 static int release_shmem(struct intel_memory_region *mem) 692 { 693 i915_gemfs_fini(mem->i915); 694 return 0; 695 } 696 697 static const struct intel_memory_region_ops shmem_region_ops = { 698 .init = init_shmem, 699 .release = release_shmem, 700 .init_object = shmem_object_init, 701 }; 702 703 struct intel_memory_region *i915_gem_shmem_setup(struct drm_i915_private *i915, 704 u16 type, u16 instance) 705 { 706 return intel_memory_region_create(i915, 0, 707 totalram_pages() << PAGE_SHIFT, 708 PAGE_SIZE, 0, 0, 709 type, instance, 710 &shmem_region_ops); 711 } 712 713 bool i915_gem_object_is_shmem(const struct drm_i915_gem_object *obj) 714 { 715 return obj->ops == &i915_gem_shmem_ops; 716 } 717