xref: /linux/drivers/android/binder/page_range.rs (revision 1791c390149f56313c425e8add1fd15baf40afb8)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 // Copyright (C) 2025 Google LLC.
4 
5 //! This module has utilities for managing a page range where unused pages may be reclaimed by a
6 //! vma shrinker.
7 
8 // To avoid deadlocks, locks are taken in the order:
9 //
10 //  1. mmap lock
11 //  2. spinlock
12 //  3. lru spinlock
13 //
14 // The shrinker will use trylock methods because it locks them in a different order.
15 
16 use crate::AssertSync;
17 
18 use core::{
19     marker::PhantomPinned,
20     mem::{size_of, size_of_val, MaybeUninit},
21     ptr,
22 };
23 
24 use kernel::{
25     bindings,
26     error::Result,
27     ffi::{c_ulong, c_void},
28     mm::{virt, Mm, MmWithUser},
29     new_mutex, new_spinlock,
30     page::{Page, PAGE_SHIFT, PAGE_SIZE},
31     prelude::*,
32     str::CStr,
33     sync::{aref::ARef, Mutex, SpinLock},
34     task::Pid,
35     transmute::FromBytes,
36     types::Opaque,
37     uaccess::UserSliceReader,
38 };
39 
/// Represents a shrinker that can be registered with the kernel.
///
/// Each shrinker can be used by many `ShrinkablePageRange` objects.
#[repr(C)]
pub(crate) struct Shrinker {
    /// Pointer to the kernel `shrinker` object. Null until `register` succeeds.
    inner: Opaque<*mut bindings::shrinker>,
    /// The lru list holding pages that are currently available for reclaim.
    /// Initialized by `register` via `__list_lru_init`.
    list_lru: Opaque<bindings::list_lru>,
}

// SAFETY: The shrinker and list_lru are thread safe.
unsafe impl Send for Shrinker {}
// SAFETY: The shrinker and list_lru are thread safe.
unsafe impl Sync for Shrinker {}
53 
54 impl Shrinker {
55     /// Create a new shrinker.
56     ///
57     /// # Safety
58     ///
59     /// Before using this shrinker with a `ShrinkablePageRange`, the `register` method must have
60     /// been called exactly once, and it must not have returned an error.
new() -> Self61     pub(crate) const unsafe fn new() -> Self {
62         Self {
63             inner: Opaque::uninit(),
64             list_lru: Opaque::uninit(),
65         }
66     }
67 
68     /// Register this shrinker with the kernel.
register(&'static self, name: &CStr) -> Result<()>69     pub(crate) fn register(&'static self, name: &CStr) -> Result<()> {
70         // SAFETY: These fields are not yet used, so it's okay to zero them.
71         unsafe {
72             self.inner.get().write(ptr::null_mut());
73             self.list_lru.get().write_bytes(0, 1);
74         }
75 
76         // SAFETY: The field is not yet used, so we can initialize it.
77         let ret = unsafe { bindings::__list_lru_init(self.list_lru.get(), false, ptr::null_mut()) };
78         if ret != 0 {
79             return Err(Error::from_errno(ret));
80         }
81 
82         // SAFETY: The `name` points at a valid c string.
83         let shrinker = unsafe { bindings::shrinker_alloc(0, name.as_char_ptr()) };
84         if shrinker.is_null() {
85             // SAFETY: We initialized it, so its okay to destroy it.
86             unsafe { bindings::list_lru_destroy(self.list_lru.get()) };
87             return Err(Error::from_errno(ret));
88         }
89 
90         // SAFETY: We're about to register the shrinker, and these are the fields we need to
91         // initialize. (All other fields are already zeroed.)
92         unsafe {
93             (&raw mut (*shrinker).count_objects).write(Some(rust_shrink_count));
94             (&raw mut (*shrinker).scan_objects).write(Some(rust_shrink_scan));
95             (&raw mut (*shrinker).private_data).write(self.list_lru.get().cast());
96         }
97 
98         // SAFETY: The new shrinker has been fully initialized, so we can register it.
99         unsafe { bindings::shrinker_register(shrinker) };
100 
101         // SAFETY: This initializes the pointer to the shrinker so that we can use it.
102         unsafe { self.inner.get().write(shrinker) };
103 
104         Ok(())
105     }
106 }
107 
/// A container that manages a page range in a vma.
///
/// The pages can be thought of as an array of booleans of whether the pages are usable. The
/// methods `use_range` and `stop_using_range` set all booleans in a range to true or false
/// respectively. Initially, no pages are allocated. When a page is not used, it is not freed
/// immediately. Instead, it is made available to the memory shrinker to free it if the device is
/// under memory pressure.
///
/// It's okay for `use_range` and `stop_using_range` to race with each other, although there's no
/// way to know whether an index ends up with true or false if a call to `use_range` races with
/// another call to `stop_using_range` on a given index.
///
/// It's also okay for the two methods to race with themselves, e.g. if two threads call
/// `use_range` on the same index, then that's fine and neither call will return until the page is
/// allocated and mapped.
///
/// The methods that read or write to a range require that the page is marked as in use. So it is
/// _not_ okay to call `stop_using_range` on a page that is in use by the methods that read or
/// write to the page.
#[pin_data(PinnedDrop)]
pub(crate) struct ShrinkablePageRange {
    /// Shrinker object registered with the kernel.
    shrinker: &'static Shrinker,
    /// Pid using this page range. Only used as debugging information.
    pid: Pid,
    /// The mm for the relevant process.
    mm: ARef<Mm>,
    /// Used to synchronize calls to `vm_insert_page` and `zap_page_range_single`.
    ///
    /// Always taken before `lock` (see the lock-order comment at the top of this file).
    #[pin]
    mm_lock: Mutex<()>,
    /// Spinlock protecting changes to pages.
    #[pin]
    lock: SpinLock<Inner>,

    /// Must not move, since page info has pointers back.
    #[pin]
    _pin: PhantomPinned,
}
146 
// We do not define any ops. For now, used only to check identity of vmas.
//
// `check_vma` compares a vma's `vm_ops` pointer against the address of this static, so the
// zeroed contents are never invoked — only the address matters.
static BINDER_VM_OPS: AssertSync<bindings::vm_operations_struct> = AssertSync(pin_init::zeroed());
149 
// To ensure that we do not accidentally install pages into or zap pages from the wrong vma, we
// check its vm_ops and private data before using it.
//
// Returns the vma as a mixed-map vma on success, or `None` if the vma is not owned by `owner`.
fn check_vma(vma: &virt::VmaRef, owner: *const ShrinkablePageRange) -> Option<&virt::VmaMixedMap> {
    // SAFETY: Just reading the vm_ops pointer of any active vma is safe.
    let vm_ops = unsafe { (*vma.as_ptr()).vm_ops };
    // A binder-owned vma always has its vm_ops set to `BINDER_VM_OPS` (see `register_with_vma`).
    if !ptr::eq(vm_ops, &BINDER_VM_OPS.0) {
        return None;
    }

    // SAFETY: Reading the vm_private_data pointer of a binder-owned vma is safe.
    let vm_private_data = unsafe { (*vma.as_ptr()).vm_private_data };
    // The ShrinkablePageRange is only dropped when the Process is dropped, which only happens once
    // the file's ->release handler is invoked, which means the ShrinkablePageRange outlives any
    // VMA associated with it, so there can't be any false positives due to pointer reuse here.
    if !ptr::eq(vm_private_data, owner.cast()) {
        return None;
    }

    vma.as_mixedmap_vma()
}
170 
/// State behind `ShrinkablePageRange::lock`.
struct Inner {
    /// Array of pages.
    ///
    /// Since this is also accessed by the shrinker, we can't use a `Box`, which asserts exclusive
    /// ownership. To deal with that, we manage it using raw pointers.
    pages: *mut PageInfo,
    /// Length of the `pages` array. Zero until `register_with_vma` succeeds.
    size: usize,
    /// The address of the vma to insert the pages into.
    vma_addr: usize,
}

// SAFETY: proper locking is in place for `Inner`
unsafe impl Send for Inner {}
185 
/// Guard for `ShrinkablePageRange::mm_lock` whose lifetime has been extended to `'static`.
///
/// See `ShrinkablePageRange::stable_trylock_mm` for why the lifetime extension is sound.
type StableMmGuard =
    kernel::sync::lock::Guard<'static, (), kernel::sync::lock::mutex::MutexBackend>;
188 
/// An array element that describes the current state of a page.
///
/// There are three states:
///
///  * Free. The page is None. The `lru` element is not queued.
///  * Available. The page is Some. The `lru` element is queued to the shrinker's lru.
///  * Used. The page is Some. The `lru` element is not queued.
///
/// When an element is available, the shrinker is able to free the page.
#[repr(C)]
struct PageInfo {
    // Must be the first field: `rust_shrink_free_page` casts a `list_head` pointer back to
    // `PageInfo` (hence also the `#[repr(C)]`).
    lru: bindings::list_head,
    // The page, if allocated. See the state table above.
    page: Option<Page>,
    // Back-pointer to the owning range; immutable after initialization.
    range: *const ShrinkablePageRange,
}
204 
impl PageInfo {
    /// Set the page of this `PageInfo`, dropping any page that was unexpectedly present.
    ///
    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok, and that the page is not currently set.
    unsafe fn set_page(me: *mut PageInfo, page: Page) {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { &raw mut (*me).page };

        // SAFETY: The pointer is valid for writing, so also valid for reading.
        if unsafe { (*ptr).is_some() } {
            // The caller promised this can't happen; log and recover by dropping the old page.
            pr_err!("set_page called when there is already a page");
            // SAFETY: We will initialize the page again below.
            unsafe { ptr::drop_in_place(ptr) };
        }

        // SAFETY: The pointer is valid for writing.
        unsafe { ptr::write(ptr, Some(page)) };
    }

    /// Borrow the page of this `PageInfo`, if any.
    ///
    /// # Safety
    ///
    /// The caller ensures that reading from `me.page` is ok for the duration of 'a.
    unsafe fn get_page<'a>(me: *const PageInfo) -> Option<&'a Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { &raw const (*me).page };

        // SAFETY: The pointer is valid for reading.
        unsafe { (*ptr).as_ref() }
    }

    /// Take the page out of this `PageInfo`, leaving `None` behind.
    ///
    /// # Safety
    ///
    /// The caller ensures that writing to `me.page` is ok for the duration of 'a.
    unsafe fn take_page(me: *mut PageInfo) -> Option<Page> {
        // SAFETY: This pointer offset is in bounds.
        let ptr = unsafe { &raw mut (*me).page };

        // SAFETY: The pointer is valid for reading and writing (`take` replaces it with `None`).
        unsafe { (*ptr).take() }
    }

    /// Add this page to the lru list, if not already in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker and nid.
    unsafe fn list_lru_add(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { &raw mut (*me).lru };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_add(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) };
    }

    /// Remove this page from the lru list, if it is in the list.
    ///
    /// # Safety
    ///
    /// The pointer must be valid, and it must be the right shrinker and nid.
    unsafe fn list_lru_del(me: *mut PageInfo, nid: i32, shrinker: &'static Shrinker) {
        // SAFETY: This pointer offset is in bounds.
        let lru_ptr = unsafe { &raw mut (*me).lru };
        // SAFETY: The lru pointer is valid, and we're not using it with any other lru list.
        unsafe { bindings::list_lru_del(shrinker.list_lru.get(), lru_ptr, nid, ptr::null_mut()) };
    }
}
270 
impl ShrinkablePageRange {
    /// Create a new `ShrinkablePageRange` using the given shrinker.
    ///
    /// Captures the current task's pid and mm; fails with `ESRCH` if the current task has no mm.
    pub(crate) fn new(shrinker: &'static Shrinker) -> impl PinInit<Self, Error> {
        try_pin_init!(Self {
            shrinker,
            pid: kernel::current!().pid(),
            mm: ARef::from(&**kernel::current!().mm().ok_or(ESRCH)?),
            mm_lock <- new_mutex!((), "ShrinkablePageRange::mm"),
            lock <- new_spinlock!(Inner {
                pages: ptr::null_mut(),
                size: 0,
                vma_addr: 0,
            }, "ShrinkablePageRange"),
            _pin: PhantomPinned,
        })
    }

    /// Try to take `mm_lock`, returning a guard whose lifetime is not tied to `&self`.
    ///
    /// Used by the shrinker callback, which must not block and cannot hold a borrow of `self`
    /// after the lru lock is released.
    pub(crate) fn stable_trylock_mm(&self) -> Option<StableMmGuard> {
        // SAFETY: This extends the duration of the reference. Since this call happens before
        // `mm_lock` is taken in the destructor of `ShrinkablePageRange`, the destructor will block
        // until the returned guard is dropped. This ensures that the guard is valid until dropped.
        let mm_lock = unsafe { &*ptr::from_ref(&self.mm_lock) };

        mm_lock.try_lock()
    }

    /// Register a vma with this page range. Returns the size of the region.
    ///
    /// Allocates the `PageInfo` array (one entry per page, capped at 4 MiB worth of pages) and
    /// marks the vma as binder-owned by setting its `vm_private_data` and `vm_ops`.
    pub(crate) fn register_with_vma(&self, vma: &virt::VmaNew) -> Result<usize> {
        // Cap the mapping at 4 MiB, matching the maximum binder buffer size.
        let num_bytes = usize::min(vma.end() - vma.start(), bindings::SZ_4M as usize);
        let num_pages = num_bytes >> PAGE_SHIFT;

        if !ptr::eq::<Mm>(&*self.mm, &**vma.mm()) {
            pr_debug!("Failed to register with vma: invalid vma->vm_mm");
            return Err(EINVAL);
        }
        if num_pages == 0 {
            pr_debug!("Failed to register with vma: size zero");
            return Err(EINVAL);
        }

        let mut pages = KVVec::<PageInfo>::with_capacity(num_pages, GFP_KERNEL)?;

        // SAFETY: This just initializes the pages array.
        unsafe {
            let self_ptr = self as *const ShrinkablePageRange;
            for i in 0..num_pages {
                let info = pages.as_mut_ptr().add(i);
                (&raw mut (*info).range).write(self_ptr);
                (&raw mut (*info).page).write(None);
                // Self-linked list_head, i.e. "not queued on any lru list".
                let lru = &raw mut (*info).lru;
                (&raw mut (*lru).next).write(lru);
                (&raw mut (*lru).prev).write(lru);
            }
        }

        let mut inner = self.lock.lock();
        if inner.size > 0 {
            pr_debug!("Failed to register with vma: already registered");
            drop(inner);
            return Err(EBUSY);
        }

        // Transfer ownership of the array to `inner`; it is reconstructed and freed in `drop`.
        inner.pages = pages.into_raw_parts().0;
        inner.size = num_pages;
        inner.vma_addr = vma.start();

        // This pointer is only used for comparison - it's not dereferenced.
        //
        // SAFETY: We own the vma, and we don't use any methods on VmaNew that rely on
        // `vm_private_data`.
        unsafe {
            (*vma.as_ptr()).vm_private_data = ptr::from_ref(self).cast_mut().cast::<c_void>()
        };

        // SAFETY: We own the vma, and we don't use any methods on VmaNew that rely on
        // `vm_ops`.
        unsafe { (*vma.as_ptr()).vm_ops = &BINDER_VM_OPS.0 };

        Ok(num_pages)
    }

    /// Make sure that the given pages are allocated and mapped.
    ///
    /// Must not be called from an atomic context.
    pub(crate) fn use_range(&self, start: usize, end: usize) -> Result<()> {
        if start >= end {
            return Ok(());
        }
        let mut inner = self.lock.lock();
        assert!(end <= inner.size);

        for i in start..end {
            // SAFETY: This pointer offset is in bounds.
            let page_info = unsafe { inner.pages.add(i) };

            // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
            if let Some(page) = unsafe { PageInfo::get_page(page_info) } {
                // Since we're going to use the page, we should remove it from the lru list so that
                // the shrinker will not free it.
                //
                // SAFETY: The pointer is valid, and this is the right shrinker.
                //
                // The shrinker can't free the page between the check and this call to
                // `list_lru_del` because we hold the lock.
                unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) };
            } else {
                // We have to allocate a new page. Use the slow path.
                // The spinlock must be released because `use_page_slow` sleeps.
                drop(inner);
                // SAFETY: `i < end <= inner.size` so `i` is in bounds.
                match unsafe { self.use_page_slow(i) } {
                    Ok(()) => {}
                    Err(err) => {
                        pr_warn!("Error in use_page_slow: {:?}", err);
                        return Err(err);
                    }
                }
                inner = self.lock.lock();
            }
        }
        Ok(())
    }

    /// Mark the given page as in use, slow path.
    ///
    /// Allocates a fresh zeroed page and inserts it into the vma at index `i`, unless another
    /// thread beat us to it while the spinlock was released.
    ///
    /// Must not be called from an atomic context.
    ///
    /// # Safety
    ///
    /// Assumes that `i` is in bounds.
    #[cold]
    unsafe fn use_page_slow(&self, i: usize) -> Result<()> {
        let new_page = Page::alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO)?;

        // Lock order: mm_lock before the spinlock (see top-of-file comment).
        let mm_mutex = self.mm_lock.lock();
        let inner = self.lock.lock();

        // SAFETY: This pointer offset is in bounds.
        let page_info = unsafe { inner.pages.add(i) };

        // SAFETY: The pointer is valid, and we hold the lock so reading from the page is okay.
        if let Some(page) = unsafe { PageInfo::get_page(page_info) } {
            // The page was already there, or someone else added the page while we didn't hold the
            // spinlock.
            //
            // SAFETY: The pointer is valid, and this is the right shrinker.
            //
            // The shrinker can't free the page between the check and this call to
            // `list_lru_del` because we hold the lock.
            unsafe { PageInfo::list_lru_del(page_info, page.nid(), self.shrinker) };
            // `new_page` is simply dropped; the existing page stays in place.
            return Ok(());
        }

        let vma_addr = inner.vma_addr;
        // Release the spinlock while we insert the page into the vma.
        drop(inner);

        // No overflow since we stay in bounds of the vma.
        let user_page_addr = vma_addr + (i << PAGE_SHIFT);

        // We use `mmput_async` when dropping the `mm` because `use_page_slow` is usually used from
        // a remote process. If the call to `mmput` races with the process shutting down, then the
        // caller of `use_page_slow` becomes responsible for cleaning up the `mm`, which doesn't
        // happen until it returns to userspace. However, the caller might instead go to sleep and
        // wait for the owner of the `mm` to wake it up, which doesn't happen because it's in the
        // middle of a shutdown process that won't complete until the `mm` is dropped. This can
        // amount to a deadlock.
        //
        // Using `mmput_async` avoids this, because then the `mm` cleanup is instead queued to a
        // workqueue.
        let mm = MmWithUser::into_mmput_async(self.mm.mmget_not_zero().ok_or(ESRCH)?);
        {
            let vma_read;
            let mmap_read;
            // Fast path: per-vma lock under rcu; fall back to taking the mmap read lock.
            let vma = if let Some(ret) = mm.lock_vma_under_rcu(vma_addr) {
                vma_read = ret;
                check_vma(&vma_read, self)
            } else {
                mmap_read = mm.mmap_read_lock();
                mmap_read
                    .vma_lookup(vma_addr)
                    .and_then(|vma| check_vma(vma, self))
            };

            match vma {
                Some(vma) => vma.vm_insert_page(user_page_addr, &new_page)?,
                None => return Err(ESRCH),
            }
        }

        let inner = self.lock.lock();

        // SAFETY: The `page_info` pointer is valid and currently does not have a page. The page
        // can be written to since we hold the lock.
        //
        // We released and reacquired the spinlock since we checked that the page is null, but we
        // always hold the mm_lock mutex when setting the page to a non-null value, so it's not
        // possible for someone else to have changed it since our check.
        unsafe { PageInfo::set_page(page_info, new_page) };

        drop(inner);
        drop(mm_mutex);

        Ok(())
    }

    /// If the given page is in use, then mark it as available so that the shrinker can free it.
    ///
    /// May be called from an atomic context.
    pub(crate) fn stop_using_range(&self, start: usize, end: usize) {
        if start >= end {
            return;
        }
        let inner = self.lock.lock();
        assert!(end <= inner.size);

        for i in (start..end).rev() {
            // SAFETY: The pointer is in bounds.
            let page_info = unsafe { inner.pages.add(i) };

            // SAFETY: Okay for reading since we have the lock.
            if let Some(page) = unsafe { PageInfo::get_page(page_info) } {
                // SAFETY: The pointer is valid, and it's the right shrinker.
                unsafe { PageInfo::list_lru_add(page_info, page.nid(), self.shrinker) };
            }
        }
    }

    /// Helper for reading or writing to a range of bytes that may overlap with several pages.
    ///
    /// Invokes `cb(page, offset_in_page, len)` once per touched page, in order.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    unsafe fn iterate<T>(&self, mut offset: usize, mut size: usize, mut cb: T) -> Result
    where
        T: FnMut(&Page, usize, usize) -> Result,
    {
        if size == 0 {
            return Ok(());
        }

        let (pages, num_pages) = {
            let inner = self.lock.lock();
            (inner.pages, inner.size)
        };
        let num_bytes = num_pages << PAGE_SHIFT;

        // Check that the request is within the buffer.
        if offset.checked_add(size).ok_or(EFAULT)? > num_bytes {
            return Err(EFAULT);
        }

        let mut page_index = offset >> PAGE_SHIFT;
        offset &= PAGE_SIZE - 1;
        while size > 0 {
            let available = usize::min(size, PAGE_SIZE - offset);
            // SAFETY: The pointer is in bounds.
            let page_info = unsafe { pages.add(page_index) };
            // SAFETY: The caller guarantees that this page is in the "in use" state for the
            // duration of this call to `iterate`, so nobody will change the page.
            let page = unsafe { PageInfo::get_page(page_info) };
            if page.is_none() {
                // An in-use page should always be allocated; this indicates a caller bug.
                pr_warn!("Page is null!");
            }
            let page = page.ok_or(EFAULT)?;
            cb(page, offset, available)?;
            size -= available;
            page_index += 1;
            // Only the first page can start at a non-zero offset.
            offset = 0;
        }
        Ok(())
    }

    /// Copy from userspace into this page range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn copy_from_user_slice(
        &self,
        reader: &mut UserSliceReader,
        offset: usize,
        size: usize,
    ) -> Result {
        // SAFETY: `self.iterate` has the same safety requirements as `copy_from_user_slice`.
        unsafe {
            self.iterate(offset, size, |page, offset, to_copy| {
                page.copy_from_user_slice_raw(reader, offset, to_copy)
            })
        }
    }

    /// Copy from this page range into kernel space.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn read<T: FromBytes>(&self, offset: usize) -> Result<T> {
        let mut out = MaybeUninit::<T>::uninit();
        let mut out_offset = 0;
        // SAFETY: `self.iterate` has the same safety requirements as `read`.
        unsafe {
            self.iterate(offset, size_of::<T>(), |page, offset, to_copy| {
                // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
                let obj_ptr = (out.as_mut_ptr() as *mut u8).add(out_offset);
                // SAFETY: The pointer points is in-bounds of the `out` variable, so it is valid.
                page.read_raw(obj_ptr, offset, to_copy)?;
                out_offset += to_copy;
                Ok(())
            })?;
        }
        // SAFETY: We just initialised the data. `iterate` either fills all `size_of::<T>()` bytes
        // or returns an error, and `T: FromBytes` so any bit pattern is valid.
        Ok(unsafe { out.assume_init() })
    }

    /// Copy from kernel space into this page range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn write<T: ?Sized>(&self, offset: usize, obj: &T) -> Result {
        let mut obj_offset = 0;
        // SAFETY: `self.iterate` has the same safety requirements as `write`.
        unsafe {
            self.iterate(offset, size_of_val(obj), |page, offset, to_copy| {
                // SAFETY: The sum of `offset` and `to_copy` is bounded by the size of T.
                let obj_ptr = (obj as *const T as *const u8).add(obj_offset);
                // SAFETY: We have a reference to the object, so the pointer is valid.
                page.write_raw(obj_ptr, offset, to_copy)?;
                obj_offset += to_copy;
                Ok(())
            })
        }
    }

    /// Write zeroes to the given range.
    ///
    /// # Safety
    ///
    /// All pages touched by this operation must be in use for the duration of this call.
    pub(crate) unsafe fn fill_zero(&self, offset: usize, size: usize) -> Result {
        // SAFETY: `self.iterate` has the same safety requirements as `fill_zero`.
        unsafe {
            self.iterate(offset, size, |page, offset, len| {
                page.fill_zero_raw(offset, len)
            })
        }
    }
}
619 
#[pinned_drop]
impl PinnedDrop for ShrinkablePageRange {
    fn drop(self: Pin<&mut Self>) {
        let (pages, size) = {
            let lock = self.lock.lock();
            (lock.pages, lock.size)
        };

        // `size == 0` means `register_with_vma` never succeeded, so there is nothing to free.
        if size == 0 {
            return;
        }

        // Note: This call is also necessary for the safety of `stable_trylock_mm`.
        let mm_lock = self.mm_lock.lock();

        // This is the destructor, so unlike the other methods, we only need to worry about races
        // with the shrinker here. Since we hold the `mm_lock`, we also can't race with the
        // shrinker, and after this loop, the shrinker will not access any of our pages since we
        // removed them from the lru list.
        for i in 0..size {
            // SAFETY: Loop is in-bounds of the size.
            let p_ptr = unsafe { pages.add(i) };
            // SAFETY: No other readers, so we can read.
            if let Some(p) = unsafe { PageInfo::get_page(p_ptr) } {
                // SAFETY: The pointer is valid and it's the right shrinker.
                unsafe { PageInfo::list_lru_del(p_ptr, p.nid(), self.shrinker) };
            }
        }

        drop(mm_lock);

        // SAFETY: `pages` was allocated as an `KVVec<PageInfo>` with capacity `size`. Furthermore,
        // all `size` elements are initialized. Also, the array is no longer shared with the
        // shrinker due to the above loop.
        drop(unsafe { KVVec::from_raw_parts(pages, size, size) });
    }
}
657 
/// Shrinker `count_objects` callback: returns the number of pages on the lru list, i.e. the
/// number of pages that could be reclaimed.
///
/// # Safety
/// Called by the shrinker.
#[no_mangle]
unsafe extern "C" fn rust_shrink_count(
    shrink: *mut bindings::shrinker,
    _sc: *mut bindings::shrink_control,
) -> c_ulong {
    // SAFETY: We can access our own private data, which `Shrinker::register` set to point at the
    // `list_lru`.
    let list_lru = unsafe { (*shrink).private_data.cast::<bindings::list_lru>() };
    // SAFETY: Accessing the lru list is okay. Just an FFI call.
    unsafe { bindings::list_lru_count(list_lru) }
}
670 
/// Shrinker `scan_objects` callback: walks the lru list and frees up to `nr_to_scan` pages via
/// `rust_shrink_free_page` (called through the C wrapper).
///
/// # Safety
/// Called by the shrinker.
#[no_mangle]
unsafe extern "C" fn rust_shrink_scan(
    shrink: *mut bindings::shrinker,
    sc: *mut bindings::shrink_control,
) -> c_ulong {
    // SAFETY: We can access our own private data, which `Shrinker::register` set to point at the
    // `list_lru`.
    let list_lru = unsafe { (*shrink).private_data.cast::<bindings::list_lru>() };
    // SAFETY: Caller guarantees that it is safe to read this field.
    let nr_to_scan = unsafe { (*sc).nr_to_scan };
    // SAFETY: Accessing the lru list is okay. Just an FFI call.
    unsafe {
        bindings::list_lru_walk(
            list_lru,
            Some(bindings::rust_shrink_free_page_wrap),
            ptr::null_mut(),
            nr_to_scan,
        )
    }
}
692 
// Return values for `rust_shrink_free_page`. `LRU_REMOVED_ENTRY` maps to `LRU_REMOVED_RETRY`
// because the callback drops the lru lock, so the walk must restart rather than continue from
// the removed entry.
const LRU_SKIP: bindings::lru_status = bindings::lru_status_LRU_SKIP;
const LRU_REMOVED_ENTRY: bindings::lru_status = bindings::lru_status_LRU_REMOVED_RETRY;
695 
/// Per-item lru walk callback: tries to reclaim one available page.
///
/// Uses trylock for every lock because the lru spinlock is already held here and the normal lock
/// order is mmap lock -> spinlock -> lru lock (see top-of-file comment); returns `LRU_SKIP` if
/// any lock is contended.
///
/// # Safety
/// Called by the shrinker.
#[no_mangle]
unsafe extern "C" fn rust_shrink_free_page(
    item: *mut bindings::list_head,
    lru: *mut bindings::list_lru_one,
    _cb_arg: *mut c_void,
) -> bindings::lru_status {
    // Fields that should survive after unlocking the lru lock.
    let page;
    let page_index;
    let mm;
    let mmap_read;
    let mm_mutex;
    let vma_addr;
    let range_ptr;

    {
        // CAST: The `list_head` field is first in `PageInfo`.
        let info = item as *mut PageInfo;
        // SAFETY: The `range` field of `PageInfo` is immutable.
        range_ptr = unsafe { (*info).range };
        // SAFETY: The `range` outlives its `PageInfo` values.
        let range = unsafe { &*range_ptr };

        mm = match range.mm.mmget_not_zero() {
            Some(mm) => MmWithUser::into_mmput_async(mm),
            None => return LRU_SKIP,
        };

        mm_mutex = match range.stable_trylock_mm() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };

        mmap_read = match mm.mmap_read_trylock() {
            Some(guard) => guard,
            None => return LRU_SKIP,
        };

        // We can't lock it normally here, since we hold the lru lock.
        let inner = match range.lock.try_lock() {
            Some(inner) => inner,
            None => return LRU_SKIP,
        };

        // SAFETY: The item is in this lru list, so it's okay to remove it.
        unsafe { bindings::list_lru_isolate(lru, item) };

        // SAFETY: Both pointers are in bounds of the same allocation.
        page_index = unsafe { info.offset_from(inner.pages) } as usize;

        // SAFETY: We hold the spinlock, so we can take the page.
        //
        // This sets the page pointer to zero before we unmap it from the vma. However, we call
        // `zap_page_range` before we release the mmap lock, so `use_page_slow` will not be able to
        // insert a new page until after our call to `zap_page_range`.
        page = unsafe { PageInfo::take_page(info) };
        vma_addr = inner.vma_addr;

        // From this point on, we don't access this PageInfo or ShrinkablePageRange again, because
        // they can be freed at any point after we unlock `lru_lock`. This is with the exception of
        // `mm_mutex` which is kept alive by holding the lock.
    }

    // SAFETY: The lru lock is locked when this method is called.
    unsafe { bindings::spin_unlock(&raw mut (*lru).lock) };

    // Unmap the page from userspace, but only if the vma is still the one we registered with.
    if let Some(unchecked_vma) = mmap_read.vma_lookup(vma_addr) {
        if let Some(vma) = check_vma(unchecked_vma, range_ptr) {
            let user_page_addr = vma_addr + (page_index << PAGE_SHIFT);
            vma.zap_page_range_single(user_page_addr, PAGE_SIZE);
        }
    }

    drop(mmap_read);
    drop(mm_mutex);
    drop(mm);
    // Dropping `page` releases the page back to the allocator — the actual reclaim.
    drop(page);

    LRU_REMOVED_ENTRY
}
778