X-Git-Url: http://drtracing.org/?a=blobdiff_plain;f=drivers%2Fgpu%2Fdrm%2Fi915%2Fintel_lrc.c;h=eda3096a5b754137ea1a50b8840e9ab618c7675c;hb=5f19e2bffa63a91cd4ac1adcec648e14a44277ce;hp=09df74b8e917b1dac90d460be50d1c4c5152881c;hpb=1190944f4b12203330ac5ed8784f6c181bf26f2d;p=deliverable%2Flinux.git diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index 09df74b8e917..eda3096a5b75 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -188,6 +188,15 @@ #define GEN8_CTX_FORCE_RESTORE (1<<2) #define GEN8_CTX_L3LLC_COHERENT (1<<5) #define GEN8_CTX_PRIVILEGE (1<<8) + +#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) { \ + const u64 _addr = test_bit(n, ppgtt->pdp.used_pdpes) ? \ + ppgtt->pdp.page_directory[n]->daddr : \ + ppgtt->scratch_pd->daddr; \ + reg_state[CTX_PDP ## n ## _UDW+1] = upper_32_bits(_addr); \ + reg_state[CTX_PDP ## n ## _LDW+1] = lower_32_bits(_addr); \ +} + enum { ADVANCED_CONTEXT = 0, LEGACY_CONTEXT, @@ -202,6 +211,7 @@ enum { FAULT_AND_CONTINUE /* Unsupported */ }; #define GEN8_CTX_ID_SHIFT 32 +#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17 static int intel_lr_context_pin(struct intel_engine_cs *ring, struct intel_context *ctx); @@ -265,7 +275,8 @@ static uint64_t execlists_ctx_descriptor(struct intel_engine_cs *ring, desc = GEN8_CTX_VALID; desc |= LEGACY_CONTEXT << GEN8_CTX_MODE_SHIFT; - desc |= GEN8_CTX_L3LLC_COHERENT; + if (IS_GEN8(ctx_obj->base.dev)) + desc |= GEN8_CTX_L3LLC_COHERENT; desc |= GEN8_CTX_PRIVILEGE; desc |= lrca; desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT; @@ -305,21 +316,24 @@ static void execlists_elsp_write(struct intel_engine_cs *ring, desc[3] = (u32)(temp >> 32); desc[2] = (u32)temp; - intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL); - I915_WRITE(RING_ELSP(ring), desc[1]); - I915_WRITE(RING_ELSP(ring), desc[0]); - I915_WRITE(RING_ELSP(ring), desc[3]); + spin_lock(&dev_priv->uncore.lock); + intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL); + I915_WRITE_FW(RING_ELSP(ring), desc[1]); + I915_WRITE_FW(RING_ELSP(ring), desc[0]); + I915_WRITE_FW(RING_ELSP(ring), desc[3]); /* The context is automatically loaded after the following */ - I915_WRITE(RING_ELSP(ring), desc[2]); + I915_WRITE_FW(RING_ELSP(ring), desc[2]); /* ELSP is a wo register, so use another nearby reg for posting instead */ - POSTING_READ(RING_EXECLIST_STATUS(ring)); - intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); + POSTING_READ_FW(RING_EXECLIST_STATUS(ring)); + intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL); + spin_unlock(&dev_priv->uncore.lock); } static int execlists_update_context(struct drm_i915_gem_object *ctx_obj, struct drm_i915_gem_object *ring_obj, + struct i915_hw_ppgtt *ppgtt, u32 tail) { struct page *page; @@ -331,6 +345,16 @@ static int execlists_update_context(struct drm_i915_gem_object *ctx_obj, reg_state[CTX_RING_TAIL+1] = tail; reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(ring_obj); + /* True PPGTT with dynamic page allocation: update PDP registers and + * point the unallocated PDPs to the scratch page + */ + if (ppgtt) { + ASSIGN_CTX_PDP(ppgtt, reg_state, 3); + ASSIGN_CTX_PDP(ppgtt, reg_state, 2); + ASSIGN_CTX_PDP(ppgtt, reg_state, 1); + ASSIGN_CTX_PDP(ppgtt, reg_state, 0); + } + kunmap_atomic(reg_state); return 0; @@ -349,7 +373,7 @@ static void execlists_submit_contexts(struct intel_engine_cs *ring, WARN_ON(!i915_gem_obj_is_pinned(ctx_obj0)); WARN_ON(!i915_gem_obj_is_pinned(ringbuf0->obj)); - execlists_update_context(ctx_obj0, ringbuf0->obj, tail0); + execlists_update_context(ctx_obj0, ringbuf0->obj, to0->ppgtt, tail0); if (to1) { ringbuf1 = to1->engine[ring->id].ringbuf; @@ -358,7 +382,7 @@ static void execlists_submit_contexts(struct intel_engine_cs *ring, WARN_ON(!i915_gem_obj_is_pinned(ctx_obj1)); WARN_ON(!i915_gem_obj_is_pinned(ringbuf1->obj)); - execlists_update_context(ctx_obj1, ringbuf1->obj, tail1); + execlists_update_context(ctx_obj1, ringbuf1->obj, to1->ppgtt, tail1); } execlists_elsp_write(ring, ctx_obj0, ctx_obj1); @@ -371,6 +395,12 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring) assert_spin_locked(&ring->execlist_lock); + /* + * If irqs are not active generate a warning as batches that finish + * without the irqs may get lost and a GPU Hang may occur. + */ + WARN_ON(!intel_irqs_enabled(ring->dev->dev_private)); + if (list_empty(&ring->execlist_queue)) return; @@ -398,7 +428,7 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring) * WaIdleLiteRestore: make sure we never cause a lite * restore with HEAD==TAIL */ - if (req0 && req0->elsp_submitted) { + if (req0->elsp_submitted) { /* * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL * as we resubmit the request. See gen8_emit_request() @@ -520,8 +550,6 @@ static int execlists_context_queue(struct intel_engine_cs *ring, struct drm_i915_gem_request *request) { struct drm_i915_gem_request *cursor; - struct drm_i915_private *dev_priv = ring->dev->dev_private; - unsigned long flags; int num_elements = 0; if (to != ring->default_context) @@ -538,7 +566,6 @@ static int execlists_context_queue(struct intel_engine_cs *ring, request->ring = ring; request->ctx = to; kref_init(&request->ref); - request->uniq = dev_priv->request_uniq++; i915_gem_context_reference(request->ctx); } else { i915_gem_request_reference(request); @@ -546,9 +573,7 @@ static int execlists_context_queue(struct intel_engine_cs *ring, } request->tail = tail; - intel_runtime_pm_get(dev_priv); - - spin_lock_irqsave(&ring->execlist_lock, flags); + spin_lock_irq(&ring->execlist_lock); list_for_each_entry(cursor, &ring->execlist_queue, execlist_link) if (++num_elements > 2) @@ -574,7 +599,7 @@ static int execlists_context_queue(struct intel_engine_cs *ring, if (num_elements == 0) execlists_context_unqueue(ring); - spin_unlock_irqrestore(&ring->execlist_lock, flags); + spin_unlock_irq(&ring->execlist_lock); return 0; } @@ -604,6 +629,7 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, struct list_head *vmas) { struct intel_engine_cs *ring = ringbuf->ring; + const unsigned other_rings = ~intel_ring_flag(ring); struct i915_vma *vma; uint32_t flush_domains = 0; bool flush_chipset = false; @@ -612,9 +638,11 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, list_for_each_entry(vma, vmas, exec_list) { struct drm_i915_gem_object *obj = vma->obj; - ret = i915_gem_object_sync(obj, ring); - if (ret) - return ret; + if (obj->active & other_rings) { + ret = i915_gem_object_sync(obj, ring); + if (ret) + return ret; + } if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) flush_chipset |= i915_gem_clflush_object(obj, false); @@ -631,6 +659,188 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, return logical_ring_invalidate_all_caches(ringbuf, ctx); } +int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request) +{ + int ret; + + if (request->ctx != request->ring->default_context) { + ret = intel_lr_context_pin(request->ring, request->ctx); + if (ret) + return ret; + } + + request->ringbuf = request->ctx->engine[request->ring->id].ringbuf; + + return 0; +} + +static int logical_ring_wait_for_space(struct intel_ringbuffer *ringbuf, + struct intel_context *ctx, + int bytes) +{ + struct intel_engine_cs *ring = ringbuf->ring; + struct drm_i915_gem_request *request; + unsigned space; + int ret; + + /* The whole point of reserving space is to not wait! */ + WARN_ON(ringbuf->reserved_in_use); + + if (intel_ring_space(ringbuf) >= bytes) + return 0; + + list_for_each_entry(request, &ring->request_list, list) { + /* + * The request queue is per-engine, so can contain requests + * from multiple ringbuffers. Here, we must ignore any that + * aren't from the ringbuffer we're considering. + */ + if (request->ringbuf != ringbuf) + continue; + + /* Would completion of this request free enough space? */ + space = __intel_ring_space(request->postfix, ringbuf->tail, + ringbuf->size); + if (space >= bytes) + break; + } + + if (WARN_ON(&request->list == &ring->request_list)) + return -ENOSPC; + + ret = i915_wait_request(request); + if (ret) + return ret; + + ringbuf->space = space; + return 0; +} + +/* + * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload + * @ringbuf: Logical Ringbuffer to advance. + * + * The tail is updated in our logical ringbuffer struct, not in the actual context. What + * really happens during submission is that the context and current tail will be placed + * on a queue waiting for the ELSP to be ready to accept a new context submission. At that + * point, the tail *inside* the context is updated and the ELSP written to. + */ +static void +intel_logical_ring_advance_and_submit(struct intel_ringbuffer *ringbuf, + struct intel_context *ctx, + struct drm_i915_gem_request *request) +{ + struct intel_engine_cs *ring = ringbuf->ring; + + intel_logical_ring_advance(ringbuf); + + if (intel_ring_stopped(ring)) + return; + + execlists_context_queue(ring, ctx, ringbuf->tail, request); +} + +static int logical_ring_wrap_buffer(struct intel_ringbuffer *ringbuf, + struct intel_context *ctx) +{ + uint32_t __iomem *virt; + int rem = ringbuf->size - ringbuf->tail; + + /* Can't wrap if space has already been reserved! */ + WARN_ON(ringbuf->reserved_in_use); + + if (ringbuf->space < rem) { + int ret = logical_ring_wait_for_space(ringbuf, ctx, rem); + + if (ret) + return ret; + } + + virt = ringbuf->virtual_start + ringbuf->tail; + rem /= 4; + while (rem--) + iowrite32(MI_NOOP, virt++); + + ringbuf->tail = 0; + intel_ring_update_space(ringbuf); + + return 0; +} + +static int logical_ring_prepare(struct intel_ringbuffer *ringbuf, + struct intel_context *ctx, int bytes) +{ + int ret; + + /* + * Add on the reserved size to the request to make sure that after + * the intended commands have been emitted, there is guaranteed to + * still be enough free space to send them to the hardware. + */ + if (!ringbuf->reserved_in_use) + bytes += ringbuf->reserved_size; + + if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) { + ret = logical_ring_wrap_buffer(ringbuf, ctx); + if (unlikely(ret)) + return ret; + + if(ringbuf->reserved_size) { + uint32_t size = ringbuf->reserved_size; + + intel_ring_reserved_space_cancel(ringbuf); + intel_ring_reserved_space_reserve(ringbuf, size); + } + } + + if (unlikely(ringbuf->space < bytes)) { + ret = logical_ring_wait_for_space(ringbuf, ctx, bytes); + if (unlikely(ret)) + return ret; + } + + return 0; +} + +/** + * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands + * + * @ringbuf: Logical ringbuffer. + * @num_dwords: number of DWORDs that we plan to write to the ringbuffer. + * + * The ringbuffer might not be ready to accept the commands right away (maybe it needs to + * be wrapped, or wait a bit for the tail to be updated). This function takes care of that + * and also preallocates a request (every workload submission is still mediated through + * requests, same as it did with legacy ringbuffer submission). + * + * Return: non-zero if the ringbuffer is not ready to be written to. + */ +static int intel_logical_ring_begin(struct intel_ringbuffer *ringbuf, + struct intel_context *ctx, int num_dwords) +{ + struct intel_engine_cs *ring = ringbuf->ring; + struct drm_device *dev = ring->dev; + struct drm_i915_private *dev_priv = dev->dev_private; + int ret; + + ret = i915_gem_check_wedge(&dev_priv->gpu_error, + dev_priv->mm.interruptible); + if (ret) + return ret; + + ret = logical_ring_prepare(ringbuf, ctx, num_dwords * sizeof(uint32_t)); + if (ret) + return ret; + + /* Preallocate the olr before touching the ring */ + ret = i915_gem_request_alloc(ring, ctx); + if (ret) + return ret; + + ringbuf->space -= num_dwords * sizeof(uint32_t); + return 0; +} + /** * execlists_submission() - submit a batchbuffer for execution, Execlists style * @dev: DRM device. @@ -648,16 +858,15 @@ static int execlists_move_to_gpu(struct intel_ringbuffer *ringbuf, * * Return: non-zero if the submission fails. */ -int intel_execlists_submission(struct drm_device *dev, struct drm_file *file, - struct intel_engine_cs *ring, - struct intel_context *ctx, +int intel_execlists_submission(struct i915_execbuffer_params *params, struct drm_i915_gem_execbuffer2 *args, - struct list_head *vmas, - struct drm_i915_gem_object *batch_obj, - u64 exec_start, u32 dispatch_flags) + struct list_head *vmas) { + struct drm_device *dev = params->dev; + struct intel_engine_cs *ring = params->ring; struct drm_i915_private *dev_priv = dev->dev_private; - struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf; + struct intel_ringbuffer *ringbuf = params->ctx->engine[ring->id].ringbuf; + u64 exec_start; int instp_mode; u32 instp_mask; int ret; @@ -708,13 +917,13 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file, return -EINVAL; } - ret = execlists_move_to_gpu(ringbuf, ctx, vmas); + ret = execlists_move_to_gpu(ringbuf, params->ctx, vmas); if (ret) return ret; if (ring == &dev_priv->ring[RCS] && instp_mode != dev_priv->relative_constants_mode) { - ret = intel_logical_ring_begin(ringbuf, ctx, 4); + ret = intel_logical_ring_begin(ringbuf, params->ctx, 4); if (ret) return ret; @@ -727,14 +936,17 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file, dev_priv->relative_constants_mode = instp_mode; } - ret = ring->emit_bb_start(ringbuf, ctx, exec_start, dispatch_flags); + exec_start = params->batch_obj_vm_offset + + args->batch_start_offset; + + ret = ring->emit_bb_start(ringbuf, params->ctx, exec_start, params->dispatch_flags); if (ret) return ret; - trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), dispatch_flags); + trace_i915_gem_ring_dispatch(intel_ring_get_request(ring), params->dispatch_flags); i915_gem_execbuffer_move_to_active(vmas, ring); - i915_gem_execbuffer_retire_commands(dev, file, ring, batch_obj); + i915_gem_execbuffer_retire_commands(params->dev, params->file, ring, params->batch_obj); return 0; } @@ -742,8 +954,6 @@ int intel_execlists_submission(struct drm_device *dev, struct drm_file *file, void intel_execlists_retire_requests(struct intel_engine_cs *ring) { struct drm_i915_gem_request *req, *tmp; - struct drm_i915_private *dev_priv = ring->dev->dev_private; - unsigned long flags; struct list_head retired_list; WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex)); @@ -751,9 +961,9 @@ void intel_execlists_retire_requests(struct intel_engine_cs *ring) return; INIT_LIST_HEAD(&retired_list); - spin_lock_irqsave(&ring->execlist_lock, flags); + spin_lock_irq(&ring->execlist_lock); list_replace_init(&ring->execlist_retired_req_list, &retired_list); - spin_unlock_irqrestore(&ring->execlist_lock, flags); + spin_unlock_irq(&ring->execlist_lock); list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) { struct intel_context *ctx = req->ctx; @@ -762,7 +972,6 @@ void intel_execlists_retire_requests(struct intel_engine_cs *ring) if (ctx_obj && (ctx != ring->default_context)) intel_lr_context_unpin(ring, ctx); - intel_runtime_pm_put(dev_priv); list_del(&req->execlist_link); i915_gem_request_unreference(req); } @@ -807,30 +1016,6 @@ int logical_ring_flush_all_caches(struct intel_ringbuffer *ringbuf, return 0; } -/* - * intel_logical_ring_advance_and_submit() - advance the tail and submit the workload - * @ringbuf: Logical Ringbuffer to advance. - * - * The tail is updated in our logical ringbuffer struct, not in the actual context. What - * really happens during submission is that the context and current tail will be placed - * on a queue waiting for the ELSP to be ready to accept a new context submission. At that - * point, the tail *inside* the context is updated and the ELSP written to. - */ -static void -intel_logical_ring_advance_and_submit(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, - struct drm_i915_gem_request *request) -{ - struct intel_engine_cs *ring = ringbuf->ring; - - intel_logical_ring_advance(ringbuf); - - if (intel_ring_stopped(ring)) - return; - - execlists_context_queue(ring, ctx, ringbuf->tail, request); -} - static int intel_lr_context_pin(struct intel_engine_cs *ring, struct intel_context *ctx) { @@ -875,255 +1060,259 @@ void intel_lr_context_unpin(struct intel_engine_cs *ring, } } -static int logical_ring_alloc_request(struct intel_engine_cs *ring, - struct intel_context *ctx) +static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring, + struct intel_context *ctx) { - struct drm_i915_gem_request *request; - struct drm_i915_private *dev_private = ring->dev->dev_private; - int ret; + int ret, i; + struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf; + struct drm_device *dev = ring->dev; + struct drm_i915_private *dev_priv = dev->dev_private; + struct i915_workarounds *w = &dev_priv->workarounds; - if (ring->outstanding_lazy_request) + if (WARN_ON_ONCE(w->count == 0)) return 0; - request = kzalloc(sizeof(*request), GFP_KERNEL); - if (request == NULL) - return -ENOMEM; - - if (ctx != ring->default_context) { - ret = intel_lr_context_pin(ring, ctx); - if (ret) { - kfree(request); - return ret; - } - } - - kref_init(&request->ref); - request->ring = ring; - request->uniq = dev_private->request_uniq++; - - ret = i915_gem_get_seqno(ring->dev, &request->seqno); - if (ret) { - intel_lr_context_unpin(ring, ctx); - kfree(request); + ring->gpu_caches_dirty = true; + ret = logical_ring_flush_all_caches(ringbuf, ctx); + if (ret) return ret; - } - request->ctx = ctx; - i915_gem_context_reference(request->ctx); - request->ringbuf = ctx->engine[ring->id].ringbuf; - - ring->outstanding_lazy_request = request; - return 0; -} - -static int logical_ring_wait_request(struct intel_ringbuffer *ringbuf, - int bytes) -{ - struct intel_engine_cs *ring = ringbuf->ring; - struct drm_i915_gem_request *request; - int ret; - - if (intel_ring_space(ringbuf) >= bytes) - return 0; - - list_for_each_entry(request, &ring->request_list, list) { - /* - * The request queue is per-engine, so can contain requests - * from multiple ringbuffers. Here, we must ignore any that - * aren't from the ringbuffer we're considering. - */ - struct intel_context *ctx = request->ctx; - if (ctx->engine[ring->id].ringbuf != ringbuf) - continue; + ret = intel_logical_ring_begin(ringbuf, ctx, w->count * 2 + 2); + if (ret) + return ret; - /* Would completion of this request free enough space? */ - if (__intel_ring_space(request->tail, ringbuf->tail, - ringbuf->size) >= bytes) { - break; - } + intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(w->count)); + for (i = 0; i < w->count; i++) { + intel_logical_ring_emit(ringbuf, w->reg[i].addr); + intel_logical_ring_emit(ringbuf, w->reg[i].value); } + intel_logical_ring_emit(ringbuf, MI_NOOP); - if (&request->list == &ring->request_list) - return -ENOSPC; + intel_logical_ring_advance(ringbuf); - ret = i915_wait_request(request); + ring->gpu_caches_dirty = true; + ret = logical_ring_flush_all_caches(ringbuf, ctx); if (ret) return ret; - i915_gem_retire_requests_ring(ring); - - return intel_ring_space(ringbuf) >= bytes ? 0 : -ENOSPC; + return 0; } -static int logical_ring_wait_for_space(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, - int bytes) +#define wa_ctx_emit(batch, cmd) \ + do { \ + if (WARN_ON(index >= (PAGE_SIZE / sizeof(uint32_t)))) { \ + return -ENOSPC; \ + } \ + batch[index++] = (cmd); \ + } while (0) + +static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx, + uint32_t offset, + uint32_t start_alignment) { - struct intel_engine_cs *ring = ringbuf->ring; - struct drm_device *dev = ring->dev; - struct drm_i915_private *dev_priv = dev->dev_private; - unsigned long end; - int ret; - - ret = logical_ring_wait_request(ringbuf, bytes); - if (ret != -ENOSPC) - return ret; + return wa_ctx->offset = ALIGN(offset, start_alignment); +} - /* Force the context submission in case we have been skipping it */ - intel_logical_ring_advance_and_submit(ringbuf, ctx, NULL); +static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx, + uint32_t offset, + uint32_t size_alignment) +{ + wa_ctx->size = offset - wa_ctx->offset; - /* With GEM the hangcheck timer should kick us out of the loop, - * leaving it early runs the risk of corrupting GEM state (due - * to running on almost untested codepaths). But on resume - * timers don't work yet, so prevent a complete hang in that - * case by choosing an insanely large timeout. */ - end = jiffies + 60 * HZ; + WARN(wa_ctx->size % size_alignment, + "wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n", + wa_ctx->size, size_alignment); + return 0; +} - ret = 0; - do { - if (intel_ring_space(ringbuf) >= bytes) - break; +/** + * gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA + * + * @ring: only applicable for RCS + * @wa_ctx: structure representing wa_ctx + * offset: specifies start of the batch, should be cache-aligned. This is updated + * with the offset value received as input. + * size: size of the batch in DWORDS but HW expects in terms of cachelines + * @batch: page in which WA are loaded + * @offset: This field specifies the start of the batch, it should be + * cache-aligned otherwise it is adjusted accordingly. + * Typically we only have one indirect_ctx and per_ctx batch buffer which are + * initialized at the beginning and shared across all contexts but this field + * helps us to have multiple batches at different offsets and select them based + * on a criteria. At the moment this batch always start at the beginning of the page + * and at this point we don't have multiple wa_ctx batch buffers. + * + * The number of WA applied are not known at the beginning; we use this field + * to return the no of DWORDS written. - msleep(1); + * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END + * so it adds NOOPs as padding to make it cacheline aligned. + * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together + * makes a complete batch buffer. + * + * Return: non-zero if we exceed the PAGE_SIZE limit. + */ - if (dev_priv->mm.interruptible && signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } +static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring, + struct i915_wa_ctx_bb *wa_ctx, + uint32_t *const batch, + uint32_t *offset) +{ + uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); + + /* WaDisableCtxRestoreArbitration:bdw,chv */ + wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_DISABLE); + + /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */ + if (IS_BROADWELL(ring->dev)) { + struct drm_i915_private *dev_priv = to_i915(ring->dev); + uint32_t l3sqc4_flush = (I915_READ(GEN8_L3SQCREG4) | + GEN8_LQSC_FLUSH_COHERENT_LINES); + + wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1)); + wa_ctx_emit(batch, GEN8_L3SQCREG4); + wa_ctx_emit(batch, l3sqc4_flush); + + wa_ctx_emit(batch, GFX_OP_PIPE_CONTROL(6)); + wa_ctx_emit(batch, (PIPE_CONTROL_CS_STALL | + PIPE_CONTROL_DC_FLUSH_ENABLE)); + wa_ctx_emit(batch, 0); + wa_ctx_emit(batch, 0); + wa_ctx_emit(batch, 0); + wa_ctx_emit(batch, 0); + + wa_ctx_emit(batch, MI_LOAD_REGISTER_IMM(1)); + wa_ctx_emit(batch, GEN8_L3SQCREG4); + wa_ctx_emit(batch, l3sqc4_flush & ~GEN8_LQSC_FLUSH_COHERENT_LINES); + } - ret = i915_gem_check_wedge(&dev_priv->gpu_error, - dev_priv->mm.interruptible); - if (ret) - break; + /* Pad to end of cacheline */ + while (index % CACHELINE_DWORDS) + wa_ctx_emit(batch, MI_NOOP); - if (time_after(jiffies, end)) { - ret = -EBUSY; - break; - } - } while (1); + /* + * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because + * execution depends on the length specified in terms of cache lines + * in the register CTX_RCS_INDIRECT_CTX + */ - return ret; + return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS); } -static int logical_ring_wrap_buffer(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx) +/** + * gen8_init_perctx_bb() - initialize per ctx batch with WA + * + * @ring: only applicable for RCS + * @wa_ctx: structure representing wa_ctx + * offset: specifies start of the batch, should be cache-aligned. + * size: size of the batch in DWORDS but HW expects in terms of cachelines + * @offset: This field specifies the start of this batch. + * This batch is started immediately after indirect_ctx batch. Since we ensure + * that indirect_ctx ends on a cacheline this batch is aligned automatically. + * + * The number of DWORDS written are returned using this field. + * + * This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding + * to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant. + */ +static int gen8_init_perctx_bb(struct intel_engine_cs *ring, + struct i915_wa_ctx_bb *wa_ctx, + uint32_t *const batch, + uint32_t *offset) { - uint32_t __iomem *virt; - int rem = ringbuf->size - ringbuf->tail; + uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS); - if (ringbuf->space < rem) { - int ret = logical_ring_wait_for_space(ringbuf, ctx, rem); - - if (ret) - return ret; - } - - virt = ringbuf->virtual_start + ringbuf->tail; - rem /= 4; - while (rem--) - iowrite32(MI_NOOP, virt++); + /* WaDisableCtxRestoreArbitration:bdw,chv */ + wa_ctx_emit(batch, MI_ARB_ON_OFF | MI_ARB_ENABLE); - ringbuf->tail = 0; - intel_ring_update_space(ringbuf); + wa_ctx_emit(batch, MI_BATCH_BUFFER_END); - return 0; + return wa_ctx_end(wa_ctx, *offset = index, 1); } -static int logical_ring_prepare(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, int bytes) +static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size) { int ret; - if (unlikely(ringbuf->tail + bytes > ringbuf->effective_size)) { - ret = logical_ring_wrap_buffer(ringbuf, ctx); - if (unlikely(ret)) - return ret; + ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size)); + if (!ring->wa_ctx.obj) { + DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n"); + return -ENOMEM; } - if (unlikely(ringbuf->space < bytes)) { - ret = logical_ring_wait_for_space(ringbuf, ctx, bytes); - if (unlikely(ret)) - return ret; + ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0); + if (ret) { + DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n", + ret); + drm_gem_object_unreference(&ring->wa_ctx.obj->base); + return ret; } return 0; } -/** - * intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands - * - * @ringbuf: Logical ringbuffer. - * @num_dwords: number of DWORDs that we plan to write to the ringbuffer. - * - * The ringbuffer might not be ready to accept the commands right away (maybe it needs to - * be wrapped, or wait a bit for the tail to be updated). This function takes care of that - * and also preallocates a request (every workload submission is still mediated through - * requests, same as it did with legacy ringbuffer submission). - * - * Return: non-zero if the ringbuffer is not ready to be written to. - */ -int intel_logical_ring_begin(struct intel_ringbuffer *ringbuf, - struct intel_context *ctx, int num_dwords) +static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring) { - struct intel_engine_cs *ring = ringbuf->ring; - struct drm_device *dev = ring->dev; - struct drm_i915_private *dev_priv = dev->dev_private; - int ret; - - ret = i915_gem_check_wedge(&dev_priv->gpu_error, - dev_priv->mm.interruptible); - if (ret) - return ret; - - ret = logical_ring_prepare(ringbuf, ctx, num_dwords * sizeof(uint32_t)); - if (ret) - return ret; - - /* Preallocate the olr before touching the ring */ - ret = logical_ring_alloc_request(ring, ctx); - if (ret) - return ret; - - ringbuf->space -= num_dwords * sizeof(uint32_t); - return 0; + if (ring->wa_ctx.obj) { + i915_gem_object_ggtt_unpin(ring->wa_ctx.obj); + drm_gem_object_unreference(&ring->wa_ctx.obj->base); + ring->wa_ctx.obj = NULL; + } } -static int intel_logical_ring_workarounds_emit(struct intel_engine_cs *ring, - struct intel_context *ctx) +static int intel_init_workaround_bb(struct intel_engine_cs *ring) { - int ret, i; - struct intel_ringbuffer *ringbuf = ctx->engine[ring->id].ringbuf; - struct drm_device *dev = ring->dev; - struct drm_i915_private *dev_priv = dev->dev_private; - struct i915_workarounds *w = &dev_priv->workarounds; + int ret; + uint32_t *batch; + uint32_t offset; + struct page *page; + struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx; - if (WARN_ON_ONCE(w->count == 0)) - return 0; + WARN_ON(ring->id != RCS); - ring->gpu_caches_dirty = true; - ret = logical_ring_flush_all_caches(ringbuf, ctx); - if (ret) - return ret; + /* some WA perform writes to scratch page, ensure it is valid */ + if (ring->scratch.obj == NULL) { + DRM_ERROR("scratch page not allocated for %s\n", ring->name); + return -EINVAL; + } - ret = intel_logical_ring_begin(ringbuf, ctx, w->count * 2 + 2); - if (ret) + ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE); + if (ret) { + DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret); return ret; - - intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(w->count)); - for (i = 0; i < w->count; i++) { - intel_logical_ring_emit(ringbuf, w->reg[i].addr); - intel_logical_ring_emit(ringbuf, w->reg[i].value); } - intel_logical_ring_emit(ringbuf, MI_NOOP); - intel_logical_ring_advance(ringbuf); + page = i915_gem_object_get_page(wa_ctx->obj, 0); + batch = kmap_atomic(page); + offset = 0; - ring->gpu_caches_dirty = true; - ret = logical_ring_flush_all_caches(ringbuf, ctx); + if (INTEL_INFO(ring->dev)->gen == 8) { + ret = gen8_init_indirectctx_bb(ring, + &wa_ctx->indirect_ctx, + batch, + &offset); + if (ret) + goto out; + + ret = gen8_init_perctx_bb(ring, + &wa_ctx->per_ctx, + batch, + &offset); + if (ret) + goto out; + } else { + WARN(INTEL_INFO(ring->dev)->gen >= 8, + "WA batch buffer is not initialized for Gen%d\n", + INTEL_INFO(ring->dev)->gen); + lrc_destroy_wa_ctx_obj(ring); + } + +out: + kunmap_atomic(batch); if (ret) - return ret; + lrc_destroy_wa_ctx_obj(ring); - return 0; + return ret; } static int gen8_init_common_ring(struct intel_engine_cs *ring) @@ -1282,6 +1471,7 @@ static int gen8_emit_flush_render(struct intel_ringbuffer *ringbuf, { struct intel_engine_cs *ring = ringbuf->ring; u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES; + bool vf_flush_wa; u32 flags = 0; int ret; @@ -1303,10 +1493,26 @@ static int gen8_emit_flush_render(struct intel_ringbuffer *ringbuf, flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; } - ret = intel_logical_ring_begin(ringbuf, ctx, 6); + /* + * On GEN9+ Before VF_CACHE_INVALIDATE we need to emit a NULL pipe + * control. + */ + vf_flush_wa = INTEL_INFO(ring->dev)->gen >= 9 && + flags & PIPE_CONTROL_VF_CACHE_INVALIDATE; + + ret = intel_logical_ring_begin(ringbuf, ctx, vf_flush_wa ? 12 : 6); if (ret) return ret; + if (vf_flush_wa) { + intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6)); + intel_logical_ring_emit(ringbuf, 0); + intel_logical_ring_emit(ringbuf, 0); + intel_logical_ring_emit(ringbuf, 0); + intel_logical_ring_emit(ringbuf, 0); + intel_logical_ring_emit(ringbuf, 0); + } + intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6)); intel_logical_ring_emit(ringbuf, flags); intel_logical_ring_emit(ringbuf, scratch_addr); @@ -1394,7 +1600,7 @@ static int intel_lr_context_render_state_init(struct intel_engine_cs *ring, i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), ring); - ret = __i915_add_request(ring, file, so.obj); + __i915_add_request(ring, file, so.obj); /* intel_logical_ring_add_request moves object to inactive if it * fails */ out: @@ -1437,11 +1643,14 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *ring) ring->cleanup(ring); i915_cmd_parser_fini_ring(ring); + i915_gem_batch_pool_fini(&ring->batch_pool); if (ring->status_page.obj) { kunmap(sg_page(ring->status_page.obj->pages->sgl)); ring->status_page.obj = NULL; } + + lrc_destroy_wa_ctx_obj(ring); } static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *ring) @@ -1454,6 +1663,7 @@ static int logical_ring_init(struct drm_device *dev, struct intel_engine_cs *rin ring->dev = dev; INIT_LIST_HEAD(&ring->active_list); INIT_LIST_HEAD(&ring->request_list); + i915_gem_batch_pool_init(dev, &ring->batch_pool); init_waitqueue_head(&ring->irq_queue); INIT_LIST_HEAD(&ring->execlist_queue); @@ -1500,11 +1710,28 @@ static int logical_render_ring_init(struct drm_device *dev) ring->emit_bb_start = gen8_emit_bb_start; ring->dev = dev; - ret = logical_ring_init(dev, ring); + + ret = intel_init_pipe_control(ring); if (ret) return ret; - return intel_init_pipe_control(ring); + ret = intel_init_workaround_bb(ring); + if (ret) { + /* + * We continue even if we fail to initialize WA batch + * because we only expect rare glitches but nothing + * critical to prevent us from using GPU + */ + DRM_ERROR("WA batch buffer initialization failed: %d\n", + ret); + } + + ret = logical_ring_init(dev, ring); + if (ret) { + lrc_destroy_wa_ctx_obj(ring); + } + + return ret; } static int logical_bsd_ring_init(struct drm_device *dev) @@ -1784,15 +2011,27 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o reg_state[CTX_SECOND_BB_STATE] = ring->mmio_base + 0x118; reg_state[CTX_SECOND_BB_STATE+1] = 0; if (ring->id == RCS) { - /* TODO: according to BSpec, the register state context - * for CHV does not have these. OTOH, these registers do - * exist in CHV. I'm waiting for a clarification */ reg_state[CTX_BB_PER_CTX_PTR] = ring->mmio_base + 0x1c0; reg_state[CTX_BB_PER_CTX_PTR+1] = 0; reg_state[CTX_RCS_INDIRECT_CTX] = ring->mmio_base + 0x1c4; reg_state[CTX_RCS_INDIRECT_CTX+1] = 0; reg_state[CTX_RCS_INDIRECT_CTX_OFFSET] = ring->mmio_base + 0x1c8; reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = 0; + if (ring->wa_ctx.obj) { + struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx; + uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj); + + reg_state[CTX_RCS_INDIRECT_CTX+1] = + (ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) | + (wa_ctx->indirect_ctx.size / CACHELINE_DWORDS); + + reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] = + CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6; + + reg_state[CTX_BB_PER_CTX_PTR+1] = + (ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) | + 0x01; + } } reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9); reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED; @@ -1806,14 +2045,14 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1); reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0); reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0); - reg_state[CTX_PDP3_UDW+1] = upper_32_bits(ppgtt->pdp.page_directory[3]->daddr); - reg_state[CTX_PDP3_LDW+1] = lower_32_bits(ppgtt->pdp.page_directory[3]->daddr); - reg_state[CTX_PDP2_UDW+1] = upper_32_bits(ppgtt->pdp.page_directory[2]->daddr); - reg_state[CTX_PDP2_LDW+1] = lower_32_bits(ppgtt->pdp.page_directory[2]->daddr); - reg_state[CTX_PDP1_UDW+1] = upper_32_bits(ppgtt->pdp.page_directory[1]->daddr); - reg_state[CTX_PDP1_LDW+1] = lower_32_bits(ppgtt->pdp.page_directory[1]->daddr); - reg_state[CTX_PDP0_UDW+1] = upper_32_bits(ppgtt->pdp.page_directory[0]->daddr); - reg_state[CTX_PDP0_LDW+1] = lower_32_bits(ppgtt->pdp.page_directory[0]->daddr); + + /* With dynamic page allocation, PDPs may not be allocated at this point, + * Point the unallocated PDPs to the scratch page + */ + ASSIGN_CTX_PDP(ppgtt, reg_state, 3); + ASSIGN_CTX_PDP(ppgtt, reg_state, 2); + ASSIGN_CTX_PDP(ppgtt, reg_state, 1); + ASSIGN_CTX_PDP(ppgtt, reg_state, 0); if (ring->id == RCS) { reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1); reg_state[CTX_R_PWR_CLK_STATE] = GEN8_R_PWR_CLK_STATE; @@ -1930,11 +2169,10 @@ int intel_lr_context_deferred_create(struct intel_context *ctx, context_size = round_up(get_lr_context_size(ring), 4096); - ctx_obj = i915_gem_alloc_context_obj(dev, context_size); - if (IS_ERR(ctx_obj)) { - ret = PTR_ERR(ctx_obj); - DRM_DEBUG_DRIVER("Alloc LRC backing obj failed: %d\n", ret); - return ret; + ctx_obj = i915_gem_alloc_object(dev, context_size); + if (!ctx_obj) { + DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n"); + return -ENOMEM; } if (is_global_default_ctx) {