/*
 * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/types.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/pid.h>
#include <linux/slab.h>
#include <linux/export.h>
#include <linux/vmalloc.h>
#include <linux/hugetlb.h>
#include <linux/interval_tree.h>
#include <linux/hmm.h>
#include <linux/hmm-dma.h>
#include <linux/pagemap.h>

#include <rdma/ib_umem_odp.h>

#include "uverbs.h"

static void ib_init_umem_implicit_odp(struct ib_umem_odp *umem_odp)
{
	umem_odp->is_implicit_odp = 1;
	umem_odp->umem.is_odp = 1;
	mutex_init(&umem_odp->umem_mutex);
}

static int ib_init_umem_odp(struct ib_umem_odp *umem_odp,
			    const struct mmu_interval_notifier_ops *ops)
{
	struct ib_device *dev = umem_odp->umem.ibdev;
	size_t page_size = 1UL << umem_odp->page_shift;
	struct hmm_dma_map *map;
	unsigned long start;
	unsigned long end;
	size_t nr_entries;
	int ret = 0;

	umem_odp->umem.is_odp = 1;
	mutex_init(&umem_odp->umem_mutex);

	start = ALIGN_DOWN(umem_odp->umem.address, page_size);
	if (check_add_overflow(umem_odp->umem.address,
			       (unsigned long)umem_odp->umem.length, &end))
		return -EOVERFLOW;
	end = ALIGN(end, page_size);
	if (unlikely(end < page_size))
		return -EOVERFLOW;

	nr_entries = (end - start) >> PAGE_SHIFT;
	if (!(nr_entries * PAGE_SIZE / page_size))
		return -EINVAL;

	map = &umem_odp->map;
	if (ib_uses_virt_dma(dev)) {
		map->pfn_list = kvcalloc(nr_entries, sizeof(*map->pfn_list),
					 GFP_KERNEL | __GFP_NOWARN);
		if (!map->pfn_list)
			ret = -ENOMEM;
	} else {
		ret = hmm_dma_map_alloc(dev->dma_device, map,
					(end - start) >> PAGE_SHIFT,
					1 << umem_odp->page_shift);
	}
	if (ret)
		return ret;

	ret = mmu_interval_notifier_insert(&umem_odp->notifier,
					   umem_odp->umem.owning_mm, start,
					   end - start, ops);
	if (ret)
		goto out_free_map;

	return 0;

out_free_map:
	if (ib_uses_virt_dma(dev))
		kvfree(map->pfn_list);
	else
		hmm_dma_map_free(dev->dma_device, map);
	return ret;
}
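/*
 * Worked example (illustrative only; the concrete numbers are assumptions,
 * with PAGE_SHIFT == 12): for an MR registered with a 2 MiB page size
 * (page_shift == 21), umem.address == 0x201000 and umem.length == 0x3000,
 * ib_init_umem_odp() above computes start == 0x200000, end == 0x400000 and
 * nr_entries == 512. The pfn_list and DMA map are always sized in units of
 * the system PAGE_SIZE, even when the MR itself uses a larger page size.
 */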
/**
 * ib_umem_odp_alloc_implicit - Allocate a parent implicit ODP umem
 *
 * Implicit ODP umems do not have a VA range and do not have any page lists.
 * They exist only to hold the per_mm reference to help the driver create
 * child umems.
 *
 * @device: IB device to create the UMEM on
 * @access: ib_reg_mr access flags
 */
struct ib_umem_odp *ib_umem_odp_alloc_implicit(struct ib_device *device,
					       int access)
{
	struct ib_umem *umem;
	struct ib_umem_odp *umem_odp;

	if (access & IB_ACCESS_HUGETLB)
		return ERR_PTR(-EINVAL);

	umem_odp = kzalloc(sizeof(*umem_odp), GFP_KERNEL);
	if (!umem_odp)
		return ERR_PTR(-ENOMEM);
	umem = &umem_odp->umem;
	umem->ibdev = device;
	umem->writable = ib_access_writable(access);
	umem->owning_mm = current->mm;
	umem_odp->page_shift = PAGE_SHIFT;

	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ib_init_umem_implicit_odp(umem_odp);
	return umem_odp;
}
EXPORT_SYMBOL(ib_umem_odp_alloc_implicit);

/**
 * ib_umem_odp_alloc_child - Allocate a child ODP umem under an implicit
 *                           parent ODP umem
 *
 * @root: The parent umem enclosing the child. This must be allocated using
 *        ib_umem_odp_alloc_implicit()
 * @addr: The starting userspace VA
 * @size: The length of the userspace VA
 * @ops: MMU interval ops, currently only @invalidate
 */
struct ib_umem_odp *
ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
			size_t size,
			const struct mmu_interval_notifier_ops *ops)
{
	/*
	 * Caller must ensure that root cannot be freed during the call to
	 * ib_umem_odp_alloc_child().
	 */
	struct ib_umem_odp *odp_data;
	struct ib_umem *umem;
	int ret;

	if (WARN_ON(!root->is_implicit_odp))
		return ERR_PTR(-EINVAL);

	odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL);
	if (!odp_data)
		return ERR_PTR(-ENOMEM);
	umem = &odp_data->umem;
	umem->ibdev = root->umem.ibdev;
	umem->length = size;
	umem->address = addr;
	umem->writable = root->umem.writable;
	umem->owning_mm = root->umem.owning_mm;
	odp_data->page_shift = PAGE_SHIFT;
	odp_data->notifier.ops = ops;

	/*
	 * An mmget must be held when registering a notifier; the owning_mm
	 * only has an mmgrab at this point.
	 */
	if (!mmget_not_zero(umem->owning_mm)) {
		ret = -EFAULT;
		goto out_free;
	}

	odp_data->tgid = get_pid(root->tgid);
	ret = ib_init_umem_odp(odp_data, ops);
	if (ret)
		goto out_tgid;
	mmput(umem->owning_mm);
	return odp_data;

out_tgid:
	put_pid(odp_data->tgid);
	mmput(umem->owning_mm);
out_free:
	kfree(odp_data);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_alloc_child);

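/*
 * Example (illustrative sketch only; "my_dev", "my_mmu_ops", "access_flags",
 * "fault_addr" and "len" are hypothetical driver-side names, not part of this
 * API): a driver implementing implicit ODP typically allocates the parent at
 * registration time and allocates children lazily, per faulting range:
 *
 *	struct ib_umem_odp *parent, *child;
 *
 *	// Registration of the whole address space: no VA range yet.
 *	parent = ib_umem_odp_alloc_implicit(my_dev, access_flags);
 *	if (IS_ERR(parent))
 *		return PTR_ERR(parent);
 *
 *	// Later, on a page fault covering [fault_addr, fault_addr + len):
 *	child = ib_umem_odp_alloc_child(parent, fault_addr, len, &my_mmu_ops);
 *	if (IS_ERR(child))
 *		return PTR_ERR(child);
 *
 * Both objects are eventually returned through ib_umem_odp_release().
 */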
/**
 * ib_umem_odp_get - Create a umem_odp for a userspace va
 *
 * @device: IB device struct to get the UMEM for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 * @ops: MMU interval ops, currently only @invalidate
 *
 * The driver should use this when the access flags indicate ODP memory. It
 * avoids pinning; instead it stores the mm for future page fault handling in
 * conjunction with MMU notifiers.
 */
struct ib_umem_odp *ib_umem_odp_get(struct ib_device *device,
				    unsigned long addr, size_t size, int access,
				    const struct mmu_interval_notifier_ops *ops)
{
	struct ib_umem_odp *umem_odp;
	int ret;

	if (WARN_ON_ONCE(!(access & IB_ACCESS_ON_DEMAND)))
		return ERR_PTR(-EINVAL);

	umem_odp = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL);
	if (!umem_odp)
		return ERR_PTR(-ENOMEM);

	umem_odp->umem.ibdev = device;
	umem_odp->umem.length = size;
	umem_odp->umem.address = addr;
	umem_odp->umem.writable = ib_access_writable(access);
	umem_odp->umem.owning_mm = current->mm;
	umem_odp->notifier.ops = ops;

	umem_odp->page_shift = PAGE_SHIFT;
#ifdef CONFIG_HUGETLB_PAGE
	if (access & IB_ACCESS_HUGETLB)
		umem_odp->page_shift = HPAGE_SHIFT;
#endif

	umem_odp->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
	ret = ib_init_umem_odp(umem_odp, ops);
	if (ret)
		goto err_put_pid;
	return umem_odp;

err_put_pid:
	put_pid(umem_odp->tgid);
	kfree(umem_odp);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_umem_odp_get);

static void ib_umem_odp_free(struct ib_umem_odp *umem_odp)
{
	struct ib_device *dev = umem_odp->umem.ibdev;

	/*
	 * Ensure that no more pages are mapped in the umem.
	 *
	 * It is the driver's responsibility to ensure, before calling us,
	 * that the hardware will not attempt to access the MR any more.
	 */
	mutex_lock(&umem_odp->umem_mutex);
	ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp),
				    ib_umem_end(umem_odp));
	mutex_unlock(&umem_odp->umem_mutex);
	mmu_interval_notifier_remove(&umem_odp->notifier);
	if (ib_uses_virt_dma(dev))
		kvfree(umem_odp->map.pfn_list);
	else
		hmm_dma_map_free(dev->dma_device, &umem_odp->map);
}

void ib_umem_odp_release(struct ib_umem_odp *umem_odp)
{
	if (!umem_odp->is_implicit_odp)
		ib_umem_odp_free(umem_odp);

	put_pid(umem_odp->tgid);
	kfree(umem_odp);
}
EXPORT_SYMBOL(ib_umem_odp_release);

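/*
 * Example (illustrative sketch only; "my_dev", "my_mmu_ops", "start" and
 * "length" are hypothetical): an explicit ODP registration pairs
 * ib_umem_odp_get() with ib_umem_odp_release():
 *
 *	struct ib_umem_odp *odp;
 *
 *	odp = ib_umem_odp_get(my_dev, start, length,
 *			      IB_ACCESS_LOCAL_WRITE | IB_ACCESS_ON_DEMAND,
 *			      &my_mmu_ops);
 *	if (IS_ERR(odp))
 *		return PTR_ERR(odp);
 *	...
 *	// Once the HW can no longer access the MR:
 *	ib_umem_odp_release(odp);
 */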
/**
 * ib_umem_odp_map_dma_and_lock - DMA map userspace memory in an ODP MR and lock it.
 *
 * Maps the range passed in the argument to DMA addresses.
 * Upon success the ODP MR will be locked to let the caller complete its device
 * page table update.
 *
 * Returns the number of pages mapped on success, or a negative error code
 * on failure.
 * @umem_odp: the umem to map and pin
 * @user_virt: the address from which we need to map.
 * @bcnt: the minimal number of bytes to pin and map. The mapping might be
 *        bigger due to alignment, and may also be smaller in case of an error
 *        pinning or mapping a page. The actual number of pages mapped is
 *        returned in the return value.
 * @access_mask: bit mask of the requested access permissions for the given
 *               range.
 * @fault: is faulting required for the given range
 */
int ib_umem_odp_map_dma_and_lock(struct ib_umem_odp *umem_odp, u64 user_virt,
				 u64 bcnt, u64 access_mask, bool fault)
			__acquires(&umem_odp->umem_mutex)
{
	struct task_struct *owning_process = NULL;
	struct mm_struct *owning_mm = umem_odp->umem.owning_mm;
	int pfn_index, dma_index, ret = 0, start_idx;
	unsigned int page_shift, hmm_order, pfn_start_idx;
	unsigned long num_pfns, current_seq;
	struct hmm_range range = {};
	unsigned long timeout;

	if (user_virt < ib_umem_start(umem_odp) ||
	    user_virt + bcnt > ib_umem_end(umem_odp))
		return -EFAULT;

	page_shift = umem_odp->page_shift;

	/*
	 * owning_process is allowed to be NULL; this means the mm is somehow
	 * living beyond the lifetime of the originating process. Presumably
	 * mmget_not_zero() will fail in this case.
	 */
	owning_process = get_pid_task(umem_odp->tgid, PIDTYPE_PID);
	if (!owning_process || !mmget_not_zero(owning_mm)) {
		ret = -EINVAL;
		goto out_put_task;
	}

	range.notifier = &umem_odp->notifier;
	range.start = ALIGN_DOWN(user_virt, 1UL << page_shift);
	range.end = ALIGN(user_virt + bcnt, 1UL << page_shift);
	pfn_start_idx = (range.start - ib_umem_start(umem_odp)) >> PAGE_SHIFT;
	num_pfns = (range.end - range.start) >> PAGE_SHIFT;
	if (fault) {
		range.default_flags = HMM_PFN_REQ_FAULT;

		if (access_mask & HMM_PFN_WRITE)
			range.default_flags |= HMM_PFN_REQ_WRITE;
	}

	range.hmm_pfns = &(umem_odp->map.pfn_list[pfn_start_idx]);
	timeout = jiffies + msecs_to_jiffies(HMM_RANGE_DEFAULT_TIMEOUT);

retry:
	current_seq = range.notifier_seq =
		mmu_interval_read_begin(&umem_odp->notifier);

	mmap_read_lock(owning_mm);
	ret = hmm_range_fault(&range);
	mmap_read_unlock(owning_mm);
	if (unlikely(ret)) {
		if (ret == -EBUSY && !time_after(jiffies, timeout))
			goto retry;
		goto out_put_mm;
	}

	start_idx = (range.start - ib_umem_start(umem_odp)) >> page_shift;
	dma_index = start_idx;

	mutex_lock(&umem_odp->umem_mutex);
	if (mmu_interval_read_retry(&umem_odp->notifier, current_seq)) {
		mutex_unlock(&umem_odp->umem_mutex);
		goto retry;
	}

	for (pfn_index = 0; pfn_index < num_pfns;
	     pfn_index += 1 << (page_shift - PAGE_SHIFT), dma_index++) {

		/*
		 * Since we asked hmm_range_fault() to populate the pages, it
		 * shouldn't return an error entry on success.
		 */
		WARN_ON(fault && range.hmm_pfns[pfn_index] & HMM_PFN_ERROR);
		WARN_ON(fault && !(range.hmm_pfns[pfn_index] & HMM_PFN_VALID));
		if (!(range.hmm_pfns[pfn_index] & HMM_PFN_VALID))
			continue;

		if (range.hmm_pfns[pfn_index] & HMM_PFN_DMA_MAPPED)
			continue;

		hmm_order = hmm_pfn_to_map_order(range.hmm_pfns[pfn_index]);
		/*
		 * If a hugepage was detected but ODP wasn't set up for it,
		 * the umem page_shift will simply be used; the opposite case
		 * is an error.
		 */
		if (hmm_order + PAGE_SHIFT < page_shift) {
			ret = -EINVAL;
			ibdev_dbg(umem_odp->umem.ibdev,
				  "%s: unexpected hmm_order %u, page_shift %u\n",
				  __func__, hmm_order, page_shift);
			break;
		}
	}
	/* Upon success the lock stays held for the caller. */
	if (!ret)
		ret = dma_index - start_idx;
	else
		mutex_unlock(&umem_odp->umem_mutex);

out_put_mm:
	mmput_async(owning_mm);
out_put_task:
	if (owning_process)
		put_task_struct(owning_process);
	return ret;
}
EXPORT_SYMBOL(ib_umem_odp_map_dma_and_lock);

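/*
 * Example (illustrative sketch only; "struct my_odp_mr" and
 * "my_update_device_ptes()" are hypothetical driver pieces): a page-fault
 * handler maps the range, programs the HW page table while umem_mutex is
 * still held, and only then drops the lock:
 *
 *	static int my_fault_mr(struct my_odp_mr *mr, u64 va, u64 bcnt,
 *			       bool writable)
 *	{
 *		u64 access_mask = writable ? HMM_PFN_WRITE : 0;
 *		int npages;
 *
 *		npages = ib_umem_odp_map_dma_and_lock(mr->odp, va, bcnt,
 *						      access_mask, true);
 *		if (npages < 0)
 *			return npages;
 *
 *		// umem_mutex is held here; update the device page table
 *		// from mr->odp->map before releasing it.
 *		my_update_device_ptes(mr, va, npages);
 *		mutex_unlock(&mr->odp->umem_mutex);
 *		return npages;
 *	}
 */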
void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
				 u64 bound)
{
	struct ib_device *dev = umem_odp->umem.ibdev;
	u64 addr;

	lockdep_assert_held(&umem_odp->umem_mutex);

	virt = max_t(u64, virt, ib_umem_start(umem_odp));
	bound = min_t(u64, bound, ib_umem_end(umem_odp));
	for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) {
		u64 offset = addr - ib_umem_start(umem_odp);
		size_t idx = offset >> umem_odp->page_shift;
		unsigned long pfn = umem_odp->map.pfn_list[idx];

		if (!hmm_dma_unmap_pfn(dev->dma_device, &umem_odp->map, idx))
			goto clear;

		if (pfn & HMM_PFN_WRITE) {
			struct page *page = hmm_pfn_to_page(pfn);
			struct page *head_page = compound_head(page);
			/*
			 * set_page_dirty prefers being called with
			 * the page lock. However, MMU notifiers are
			 * called sometimes with and sometimes without
			 * the lock. We rely on the umem_mutex instead
			 * to prevent other mmu notifiers from
			 * continuing and allowing the page mapping to
			 * be removed.
			 */
			set_page_dirty(head_page);
		}
		umem_odp->npages--;
clear:
		umem_odp->map.pfn_list[idx] &= ~HMM_PFN_FLAGS;
	}
}
EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
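/*
 * Example (illustrative sketch only; the callback name is hypothetical and
 * the decision to block unconditionally is an assumption - real drivers also
 * tear down their HW mappings here): a typical mmu_interval_notifier
 * invalidate callback advances the sequence count and unmaps under
 * umem_mutex, which is the lock ib_umem_odp_unmap_dma_pages() asserts:
 *
 *	static bool my_invalidate(struct mmu_interval_notifier *mni,
 *				  const struct mmu_notifier_range *range,
 *				  unsigned long cur_seq)
 *	{
 *		struct ib_umem_odp *odp =
 *			container_of(mni, struct ib_umem_odp, notifier);
 *
 *		if (!mmu_notifier_range_blockable(range))
 *			return false;
 *
 *		mutex_lock(&odp->umem_mutex);
 *		mmu_interval_set_seq(mni, cur_seq);
 *		// ... invalidate the device page table for the range ...
 *		ib_umem_odp_unmap_dma_pages(odp, range->start, range->end);
 *		mutex_unlock(&odp->umem_mutex);
 *		return true;
 *	}
 */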