/*
 * Copyright © 2012-2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */
#include "drmP.h"
#include "i915_drm.h"
#include "i915_drv.h"
#include "i915_trace.h"
#include "intel_drv.h"
#include <linux/mmu_context.h>
#include <linux/mmu_notifier.h>
#include <linux/mempolicy.h>
#include <linux/swap.h>
#if defined(CONFIG_MMU_NOTIFIER)
#include <linux/interval_tree.h>

struct i915_mmu_notifier {
	spinlock_t lock;
	struct hlist_node node;
	struct mmu_notifier mn;
	struct rb_root objects;
	struct drm_device *dev;
	struct mm_struct *mm;
	struct work_struct work;
	unsigned long count;
	unsigned long serial;
};
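
/* Each userptr object is tracked in the notifier's interval tree by an
 * i915_mmu_object covering the inclusive page range [it.start, it.last],
 * i.e. [userptr.ptr, userptr.ptr + size - 1].
 */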
struct i915_mmu_object {
	struct i915_mmu_notifier *mmu;
	struct interval_tree_node it;
	struct drm_i915_gem_object *obj;
};
static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn,
						       struct mm_struct *mm,
						       unsigned long start,
						       unsigned long end)
{
	struct i915_mmu_notifier *mn = container_of(_mn, struct i915_mmu_notifier, mn);
	struct interval_tree_node *it = NULL;
	unsigned long serial = 0;

	end--; /* interval ranges are inclusive, but invalidate range is exclusive */
	while (start < end) {
		struct drm_i915_gem_object *obj;

		obj = NULL;
		spin_lock(&mn->lock);
		if (serial == mn->serial)
			it = interval_tree_iter_next(it, start, end);
		else
			it = interval_tree_iter_first(&mn->objects, start, end);
		if (it != NULL) {
			obj = container_of(it, struct i915_mmu_object, it)->obj;
			drm_gem_object_reference(&obj->base);
			serial = mn->serial;
		}
		spin_unlock(&mn->lock);
		if (obj == NULL)
			return;

		mutex_lock(&mn->dev->struct_mutex);
		/* Cancel any active worker and force us to re-evaluate gup */
		obj->userptr.work = NULL;

		if (obj->pages != NULL) {
			struct drm_i915_private *dev_priv = to_i915(mn->dev);
			struct i915_vma *vma, *tmp;
			bool was_interruptible;

			was_interruptible = dev_priv->mm.interruptible;
			dev_priv->mm.interruptible = false;

			list_for_each_entry_safe(vma, tmp, &obj->vma_list, vma_link) {
				int ret = i915_vma_unbind(vma);
				WARN_ON(ret && ret != -EIO);
			}

			WARN_ON(i915_gem_object_put_pages(obj));

			dev_priv->mm.interruptible = was_interruptible;
		}

		start = obj->userptr.ptr + obj->base.size;

		drm_gem_object_unreference(&obj->base);
		mutex_unlock(&mn->dev->struct_mutex);
	}
}
static const struct mmu_notifier_ops i915_gem_userptr_notifier = {
	.invalidate_range_start = i915_gem_userptr_mn_invalidate_range_start,
};
static struct i915_mmu_notifier *
__i915_mmu_notifier_lookup(struct drm_device *dev, struct mm_struct *mm)
{
	struct drm_i915_private *dev_priv = to_i915(dev);
	struct i915_mmu_notifier *mmu;

	/* Protected by dev->struct_mutex */
	hash_for_each_possible(dev_priv->mmu_notifiers, mmu, node, (unsigned long)mm)
		if (mmu->mm == mm)
			return mmu;

	return NULL;
}
static struct i915_mmu_notifier *
i915_mmu_notifier_get(struct drm_device *dev, struct mm_struct *mm)
{
	struct drm_i915_private *dev_priv = to_i915(dev);
	struct i915_mmu_notifier *mmu;
	int ret;

	lockdep_assert_held(&dev->struct_mutex);

	mmu = __i915_mmu_notifier_lookup(dev, mm);
	if (mmu)
		return mmu;

	mmu = kmalloc(sizeof(*mmu), GFP_KERNEL);
	if (mmu == NULL)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&mmu->lock);
	mmu->dev = dev;
	mmu->mn.ops = &i915_gem_userptr_notifier;
	mmu->mm = mm;
	mmu->objects = RB_ROOT;
	mmu->count = 0;
	mmu->serial = 1;

	/* Protected by mmap_sem (write-lock) */
	ret = __mmu_notifier_register(&mmu->mn, mm);
	if (ret) {
		kfree(mmu);
		return ERR_PTR(ret);
	}

	/* Protected by dev->struct_mutex */
	hash_add(dev_priv->mmu_notifiers, &mmu->node, (unsigned long)mm);

	return mmu;
}
static void
__i915_mmu_notifier_destroy_worker(struct work_struct *work)
{
	struct i915_mmu_notifier *mmu = container_of(work, typeof(*mmu), work);
	mmu_notifier_unregister(&mmu->mn, mmu->mm);
	kfree(mmu);
}
static void
__i915_mmu_notifier_destroy(struct i915_mmu_notifier *mmu)
{
	lockdep_assert_held(&mmu->dev->struct_mutex);

	/* Protected by dev->struct_mutex */
	hash_del(&mmu->node);

	/* Our lock ordering is: mmap_sem, mmu_notifier_srcu, struct_mutex.
	 * We enter the function holding struct_mutex, therefore we need
	 * to drop our mutex prior to calling mmu_notifier_unregister in
	 * order to prevent lock inversion (and system-wide deadlock)
	 * between the mmap_sem and struct_mutex. Hence we defer the
	 * unregistration to a workqueue where we hold no locks.
	 */
	INIT_WORK(&mmu->work, __i915_mmu_notifier_destroy_worker);
	schedule_work(&mmu->work);
}
static void __i915_mmu_notifier_update_serial(struct i915_mmu_notifier *mmu)
{
	if (++mmu->serial == 0)
		mmu->serial = 1;
}
static void
i915_mmu_notifier_del(struct i915_mmu_notifier *mmu,
		      struct i915_mmu_object *mn)
{
	lockdep_assert_held(&mmu->dev->struct_mutex);

	spin_lock(&mmu->lock);
	interval_tree_remove(&mn->it, &mmu->objects);
	__i915_mmu_notifier_update_serial(mmu);
	spin_unlock(&mmu->lock);

	/* Protected against _add() by dev->struct_mutex */
	if (--mmu->count == 0)
		__i915_mmu_notifier_destroy(mmu);
}
static int
i915_mmu_notifier_add(struct i915_mmu_notifier *mmu,
		      struct i915_mmu_object *mn)
{
	struct interval_tree_node *it;
	int ret;

	ret = i915_mutex_lock_interruptible(mmu->dev);
	if (ret)
		return ret;

	/* Make sure we drop the final active reference (and thereby
	 * remove the objects from the interval tree) before we do
	 * the check for overlapping objects.
	 */
	i915_gem_retire_requests(mmu->dev);

	/* Disallow overlapping userptr objects */
	spin_lock(&mmu->lock);
	it = interval_tree_iter_first(&mmu->objects,
				      mn->it.start, mn->it.last);
	if (it) {
		struct drm_i915_gem_object *obj;

		/* We only need to check the first object in the range as it
		 * either has cancelled gup work queued and we need to
		 * return back to the user to give time for the gup-workers
		 * to flush their object references upon which the object will
		 * be removed from the interval-tree, or the range is
		 * still in use by another client and the overlap is invalid.
		 */

		obj = container_of(it, struct i915_mmu_object, it)->obj;
		ret = obj->userptr.workers ? -EAGAIN : -EINVAL;
	} else {
		interval_tree_insert(&mn->it, &mmu->objects);
		__i915_mmu_notifier_update_serial(mmu);
		ret = 0;
	}
	spin_unlock(&mmu->lock);
	mutex_unlock(&mmu->dev->struct_mutex);

	return ret;
}
static void
i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
{
	struct i915_mmu_object *mn;

	mn = obj->userptr.mn;
	if (mn == NULL)
		return;

	i915_mmu_notifier_del(mn->mmu, mn);
	obj->userptr.mn = NULL;
}
static int
i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
				    unsigned flags)
{
	struct i915_mmu_notifier *mmu;
	struct i915_mmu_object *mn;
	int ret;

	if (flags & I915_USERPTR_UNSYNCHRONIZED)
		return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;

	down_write(&obj->userptr.mm->mmap_sem);
	ret = i915_mutex_lock_interruptible(obj->base.dev);
	if (ret == 0) {
		mmu = i915_mmu_notifier_get(obj->base.dev, obj->userptr.mm);
		if (!IS_ERR(mmu))
			mmu->count++; /* preemptive add to act as a refcount */
		else
			ret = PTR_ERR(mmu);
		mutex_unlock(&obj->base.dev->struct_mutex);
	}
	up_write(&obj->userptr.mm->mmap_sem);
	if (ret)
		return ret;

	mn = kzalloc(sizeof(*mn), GFP_KERNEL);
	if (mn == NULL) {
		ret = -ENOMEM;
		goto destroy_mmu;
	}

	mn->mmu = mmu;
	mn->it.start = obj->userptr.ptr;
	mn->it.last = mn->it.start + obj->base.size - 1;
	mn->obj = obj;

	ret = i915_mmu_notifier_add(mmu, mn);
	if (ret)
		goto free_mn;

	obj->userptr.mn = mn;
	return 0;

free_mn:
	kfree(mn);
destroy_mmu:
	mutex_lock(&obj->base.dev->struct_mutex);
	if (--mmu->count == 0)
		__i915_mmu_notifier_destroy(mmu);
	mutex_unlock(&obj->base.dev->struct_mutex);
	return ret;
}
#else

static void
i915_gem_userptr_release__mmu_notifier(struct drm_i915_gem_object *obj)
{
}

static int
i915_gem_userptr_init__mmu_notifier(struct drm_i915_gem_object *obj,
				    unsigned flags)
{
	if ((flags & I915_USERPTR_UNSYNCHRONIZED) == 0)
		return -ENODEV;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	return 0;
}
#endif
struct get_pages_work {
	struct work_struct work;
	struct drm_i915_gem_object *obj;
	struct task_struct *task;
};
#if IS_ENABLED(CONFIG_SWIOTLB)
#define swiotlb_active() swiotlb_nr_tbl()
#else
#define swiotlb_active() 0
#endif
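
/* Note: with swiotlb active we build the sg_table below with one page per
 * entry instead of coalescing contiguous pages; large coalesced segments
 * can exceed what the swiotlb bounce buffer is able to map.
 */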
static int
st_set_pages(struct sg_table **st, struct page **pvec, int num_pages)
{
	struct scatterlist *sg;
	int ret, n;

	*st = kmalloc(sizeof(**st), GFP_KERNEL);
	if (*st == NULL)
		return -ENOMEM;

	if (swiotlb_active()) {
		ret = sg_alloc_table(*st, num_pages, GFP_KERNEL);
		if (ret)
			goto err;

		for_each_sg((*st)->sgl, sg, num_pages, n)
			sg_set_page(sg, pvec[n], PAGE_SIZE, 0);
	} else {
		ret = sg_alloc_table_from_pages(*st, pvec, num_pages,
						0, num_pages << PAGE_SHIFT,
						GFP_KERNEL);
		if (ret)
			goto err;
	}

	return 0;

err:
	kfree(*st);
	*st = NULL;
	return ret;
}
static void
__i915_gem_userptr_get_pages_worker(struct work_struct *_work)
{
	struct get_pages_work *work = container_of(_work, typeof(*work), work);
	struct drm_i915_gem_object *obj = work->obj;
	struct drm_device *dev = obj->base.dev;
	const int num_pages = obj->base.size >> PAGE_SHIFT;
	struct page **pvec;
	int pinned, ret;

	ret = -ENOMEM;
	pinned = 0;

	pvec = kmalloc(num_pages*sizeof(struct page *),
		       GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
	if (pvec == NULL)
		pvec = drm_malloc_ab(num_pages, sizeof(struct page *));
	if (pvec != NULL) {
		struct mm_struct *mm = obj->userptr.mm;

		down_read(&mm->mmap_sem);
		while (pinned < num_pages) {
			ret = get_user_pages(work->task, mm,
					     obj->userptr.ptr + pinned * PAGE_SIZE,
					     num_pages - pinned,
					     !obj->userptr.read_only, 0,
					     pvec + pinned, NULL);
			if (ret < 0)
				break;

			pinned += ret;
		}
		up_read(&mm->mmap_sem);
	}

	mutex_lock(&dev->struct_mutex);
	if (obj->userptr.work != &work->work) {
		ret = 0;
	} else if (pinned == num_pages) {
		ret = st_set_pages(&obj->pages, pvec, num_pages);
		if (ret == 0) {
			list_add_tail(&obj->global_list, &to_i915(dev)->mm.unbound_list);
			pinned = 0;
		}
	}

	obj->userptr.work = ERR_PTR(ret);
	obj->userptr.workers--;
	drm_gem_object_unreference(&obj->base);
	mutex_unlock(&dev->struct_mutex);

	release_pages(pvec, pinned, 0);
	drm_free_large(pvec);

	put_task_struct(work->task);
	kfree(work);
}
#define I915_GEM_USERPTR_MAX_WORKERS 15

static int
i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj)
{
	const int num_pages = obj->base.size >> PAGE_SHIFT;
	struct page **pvec;
	int pinned, ret;

	/* If userspace should engineer that these pages are replaced in
	 * the vma between us binding this page into the GTT and completion
	 * of rendering... Their loss. If they change the mapping of their
	 * pages they need to create a new bo to point to the new vma.
	 *
	 * However, that still leaves open the possibility of the vma
	 * being copied upon fork. Which falls under the same userspace
	 * synchronisation issue as a regular bo, except that this time
	 * the process may not be expecting that a particular piece of
	 * memory is tied to the GPU.
	 *
	 * Fortunately, we can hook into the mmu_notifier in order to
	 * discard the page references prior to anything nasty happening
	 * to the vma (discard or cloning) which should prevent the more
	 * egregious cases from causing harm.
	 */

	pvec = NULL;
	pinned = 0;
	if (obj->userptr.mm == current->mm) {
		pvec = kmalloc(num_pages*sizeof(struct page *),
			       GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
		if (pvec == NULL) {
			pvec = drm_malloc_ab(num_pages, sizeof(struct page *));
			if (pvec == NULL)
				return -ENOMEM;
		}

		pinned = __get_user_pages_fast(obj->userptr.ptr, num_pages,
					       !obj->userptr.read_only, pvec);
	}
	if (pinned < num_pages) {
		if (pinned < 0) {
			ret = pinned;
			pinned = 0;
		} else {
			/* Spawn a worker so that we can acquire the
			 * user pages without holding our mutex. Access
			 * to the user pages requires mmap_sem, and we have
			 * a strict lock ordering of mmap_sem, struct_mutex -
			 * we already hold struct_mutex here and so cannot
			 * call gup without encountering a lock inversion.
			 *
			 * Userspace will keep on repeating the operation
			 * (thanks to EAGAIN) until either we hit the fast
			 * path or the worker completes. If the worker is
			 * cancelled or superseded, the task is still run
			 * but the results ignored. (This leads to
			 * complications that we may have a stray object
			 * refcount that we need to be wary of when
			 * checking for existing objects during creation.)
			 * If the worker encounters an error, it reports
			 * that error back to this function through
			 * obj->userptr.work = ERR_PTR.
			 */
			ret = -EAGAIN;
			if (obj->userptr.work == NULL &&
			    obj->userptr.workers < I915_GEM_USERPTR_MAX_WORKERS) {
				struct get_pages_work *work;

				work = kmalloc(sizeof(*work), GFP_KERNEL);
				if (work != NULL) {
					obj->userptr.work = &work->work;
					obj->userptr.workers++;

					work->obj = obj;
					drm_gem_object_reference(&obj->base);

					work->task = current;
					get_task_struct(work->task);

					INIT_WORK(&work->work, __i915_gem_userptr_get_pages_worker);
					schedule_work(&work->work);
				} else
					ret = -ENOMEM;
			} else {
				if (IS_ERR(obj->userptr.work)) {
					ret = PTR_ERR(obj->userptr.work);
					obj->userptr.work = NULL;
				}
			}
		}
	} else {
		ret = st_set_pages(&obj->pages, pvec, num_pages);
		if (ret == 0) {
			obj->userptr.work = NULL;
			pinned = 0;
		}
	}

	release_pages(pvec, pinned, 0);
	drm_free_large(pvec);
	return ret;
}
static void
i915_gem_userptr_put_pages(struct drm_i915_gem_object *obj)
{
	struct scatterlist *sg;
	int i;

	BUG_ON(obj->userptr.work != NULL);

	if (obj->madv != I915_MADV_WILLNEED)
		obj->dirty = 0;

	for_each_sg(obj->pages->sgl, sg, obj->pages->nents, i) {
		struct page *page = sg_page(sg);

		if (obj->dirty)
			set_page_dirty(page);

		mark_page_accessed(page);
		page_cache_release(page);
	}
	obj->dirty = 0;

	sg_free_table(obj->pages);
	kfree(obj->pages);
}
static void
i915_gem_userptr_release(struct drm_i915_gem_object *obj)
{
	i915_gem_userptr_release__mmu_notifier(obj);

	if (obj->userptr.mm) {
		mmput(obj->userptr.mm);
		obj->userptr.mm = NULL;
	}
}
static int
i915_gem_userptr_dmabuf_export(struct drm_i915_gem_object *obj)
{
	if (obj->userptr.mn)
		return 0;

	return i915_gem_userptr_init__mmu_notifier(obj, 0);
}
static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = {
	.dmabuf_export = i915_gem_userptr_dmabuf_export,
	.get_pages = i915_gem_userptr_get_pages,
	.put_pages = i915_gem_userptr_put_pages,
	.release = i915_gem_userptr_release,
};
/**
 * Creates a new mm object that wraps some normal memory from the process
 * context - user memory.
 *
 * We impose several restrictions upon the memory being mapped
 * into the GPU.
 * 1. It must be page aligned (both start/end addresses, i.e. ptr and size).
 * 2. It cannot overlap any other userptr object in the same address space.
 * 3. It must be normal system memory, not a pointer into another map of IO
 *    space (e.g. it must not be a GTT mmapping of another object).
 * 4. We only allow a bo as large as we could in theory map into the GTT,
 *    that is we limit the size to the total size of the GTT.
 * 5. The bo is marked as being snoopable. The backing pages are left
 *    accessible directly by the CPU, but reads and writes by the GPU may
 *    incur the cost of a snoop (unless you have an LLC architecture).
 *
 * Synchronisation between multiple users and the GPU is left to userspace
 * through the normal set-domain-ioctl. The kernel will enforce that the
 * GPU relinquishes the VMA before it is returned back to the system
 * i.e. upon free(), munmap() or process termination. However, the userspace
 * malloc() library may not immediately relinquish the VMA after free() and
 * instead reuse it whilst the GPU is still reading and writing to the VMA.
 * Caveat emptor.
 *
 * Also note, that the object created here is not currently a "first class"
 * object, in that several ioctls are banned. These are the CPU access
 * ioctls: mmap(), pwrite and pread. In practice, you are expected to use
 * direct access via your pointer rather than use those ioctls.
 *
 * If you think this is a good interface to use to pass GPU memory between
 * drivers, please use dma-buf instead. In fact, wherever possible use
 * dma-buf instead.
 */
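/* As a rough illustration (not part of the driver), a userspace caller is
 * expected to drive this interface roughly as follows, assuming the
 * drm_i915_gem_userptr struct and DRM_IOCTL_I915_GEM_USERPTR request from
 * the i915 uapi header and libdrm's drmIoctl():
 *
 *	struct drm_i915_gem_userptr arg = { 0 };
 *	void *ptr;
 *
 *	posix_memalign(&ptr, 4096, size);	// rule 1: page aligned
 *	arg.user_ptr = (uintptr_t)ptr;
 *	arg.user_size = size;			// rule 4: no larger than the GTT
 *	arg.flags = 0;
 *	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_USERPTR, &arg) == 0)
 *		use_gem_handle(arg.handle);	// hypothetical helper
 *
 * Synchronisation with the GPU remains the caller's responsibility via the
 * set-domain ioctl, exactly as described above.
 */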
int
i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
{
	struct drm_i915_private *dev_priv = dev->dev_private;
	struct drm_i915_gem_userptr *args = data;
	struct drm_i915_gem_object *obj;
	int ret;
	u32 handle;

	if (args->flags & ~(I915_USERPTR_READ_ONLY |
			    I915_USERPTR_UNSYNCHRONIZED))
		return -EINVAL;

	if (offset_in_page(args->user_ptr | args->user_size))
		return -EINVAL;

	if (args->user_size > dev_priv->gtt.base.total)
		return -E2BIG;

	if (!access_ok(args->flags & I915_USERPTR_READ_ONLY ? VERIFY_READ : VERIFY_WRITE,
		       (char __user *)(unsigned long)args->user_ptr, args->user_size))
		return -EFAULT;

	if (args->flags & I915_USERPTR_READ_ONLY) {
		/* On almost all of the current hw, we cannot tell the GPU that a
		 * page is readonly, so this is just a placeholder in the uAPI.
		 */
		return -ENODEV;
	}

	/* Allocate the new object */
	obj = i915_gem_object_alloc(dev);
	if (obj == NULL)
		return -ENOMEM;

	drm_gem_private_object_init(dev, &obj->base, args->user_size);
	i915_gem_object_init(obj, &i915_gem_userptr_ops);
	obj->cache_level = I915_CACHE_LLC;
	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
	obj->base.read_domains = I915_GEM_DOMAIN_CPU;

	obj->userptr.ptr = args->user_ptr;
	obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);

	/* And keep a pointer to the current->mm for resolving the user pages
	 * at binding. This means that we need to hook into the mmu_notifier
	 * in order to detect if the mmu is destroyed.
	 */
	ret = -ENOMEM;
	if ((obj->userptr.mm = get_task_mm(current)))
		ret = i915_gem_userptr_init__mmu_notifier(obj, args->flags);
	if (ret == 0)
		ret = drm_gem_handle_create(file, &obj->base, &handle);

	/* drop reference from allocate - handle holds it now */
	drm_gem_object_unreference_unlocked(&obj->base);
	if (ret)
		return ret;

	args->handle = handle;
	return 0;
}
int
i915_gem_init_userptr(struct drm_device *dev)
{
#if defined(CONFIG_MMU_NOTIFIER)
	struct drm_i915_private *dev_priv = to_i915(dev);
	hash_init(dev_priv->mmu_notifiers);
#endif
	return 0;
}