kernel/drivers/gpu/drm/i915/gt/gen2_engine_cs.c

// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen2_engine_cs.h"
#include "i915_drv.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_ring.h"

/*
 * Emit a flush sequence into the ring of @rq.
 *
 * @rq:   request whose ring receives the commands
 * @mode: EMIT_* flags; when EMIT_INVALIDATE is set, MI_READ_FLUSH is
 *        OR'ed into the flush command
 *
 * The emitted stream is the flush command, then twelve groups of four
 * dwords (an MI_STORE_DWORD_INDEX writing 0 at the I915_GEM_HWS_SCRATCH
 * offset, followed by MI_FLUSH | MI_NO_WRITE_FLUSH), and finally the flush
 * command once more.
 *
 * Returns 0 on success, or the error code carried by the pointer returned
 * from intel_ring_begin().
 */
int gen2_emit_flush(struct i915_request *rq, u32 mode)
{
	const unsigned int num_store_dw = 12;
	unsigned int n;
	u32 flush = MI_FLUSH;
	u32 *cs;

	if (mode & EMIT_INVALIDATE)
		flush |= MI_READ_FLUSH;

	/* Leading + trailing flush, plus four dwords per store group. */
	cs = intel_ring_begin(rq, 2 + 4 * num_store_dw);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = flush;

	for (n = 0; n < num_store_dw; n++) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = 0;
		*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
	}

	*cs++ = flush;

	intel_ring_advance(rq, cs);

	return 0;
}

/*
 * Emit the render-engine flush (and optional invalidate) sequence for @rq.
 *
 * @rq:   request whose ring receives the commands
 * @mode: EMIT_* flags; EMIT_INVALIDATE selects the longer invalidate path
 *
 * Returns 0 on success, or the error code carried by the pointer returned
 * from intel_ring_begin().
 */
int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 flush = MI_FLUSH;
	u32 *cs;
	int dwords = 2; /* the flush command is emitted first and last */
	int n;

	/*
	 * read/write caches:
	 *
	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
	 * only flushed if MI_NO_WRITE_FLUSH is unset.  On 965, it is
	 * also flushed at 2d versus 3d pipeline switches.
	 *
	 * read-only caches:
	 *
	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
	 * MI_READ_FLUSH is set, and is always flushed on 965.
	 *
	 * I915_GEM_DOMAIN_COMMAND may not exist?
	 *
	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
	 * invalidated when MI_EXE_FLUSH is set.
	 *
	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
	 * invalidated with every MI_FLUSH.
	 *
	 * TLBs:
	 *
	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
	 * are flushed at any MI_FLUSH.
	 */
	if (mode & EMIT_INVALIDATE) {
		flush |= MI_EXE_FLUSH;
		if (IS_G4X(rq->engine->i915) ||
		    GRAPHICS_VER(rq->engine->i915) == 5)
			flush |= MI_INVALIDATE_ISP;

		/* Two 4-dword PIPE_CONTROLs bracketing twelve MI_FLUSH. */
		dwords += 20;
	}

	cs = intel_ring_begin(rq, dwords);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = flush;

	/*
	 * A random delay to let the CS invalidate take effect? Without this
	 * delay, the GPU relocation path fails as the CS does not see
	 * the updated contents. Just as important, if we apply the flushes
	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
	 * write and before the invalidate on the next batch), the relocations
	 * still fail. This implies that it is a delay following invalidation
	 * that is required to reset the caches as opposed to a delay to
	 * ensure the memory is written.
	 */
	if (mode & EMIT_INVALIDATE) {
		/* Leading qword write to the default GT scratch field. */
		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;

		/* The delay itself: a dozen back-to-back flushes. */
		for (n = 12; n > 0; n--)
			*cs++ = MI_FLUSH;

		/* Trailing qword write, identical to the leading one. */
		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;
	}

	*cs++ = flush;

	intel_ring_advance(rq, cs);

	return 0;
}

/*
 * Emit a plain MI_FLUSH, padded with an MI_NOOP, into the ring of @rq.
 *
 * @rq:   request whose ring receives the commands
 * @mode: EMIT_* flags; not consulted by this implementation
 *
 * Returns 0 on success, or the error code carried by the pointer returned
 * from intel_ring_begin().
 */
int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	u32 *cs = intel_ring_begin(rq, 2);

	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs[0] = MI_FLUSH;
	cs[1] = MI_NOOP;
	intel_ring_advance(rq, cs + 2);

	return 0;
}

/*
 * Write the breadcrumb for @rq at @cs and record the resulting ring tail.
 *
 * @rq:    request being completed
 * @cs:    position in the ring at which to start writing
 * @flush: number of MI_STORE_DWORD_INDEX stores of the seqno to the
 *         I915_GEM_HWS_SCRATCH offset, emitted before the real stores
 * @post:  number of MI_STORE_DWORD_INDEX stores of the seqno to
 *         I915_GEM_HWS_SEQNO_ADDR
 *
 * The sequence is MI_FLUSH, @flush scratch stores, @post seqno stores and
 * a closing MI_USER_INTERRUPT. rq->tail is then set to the ring offset
 * just past the last dword written.
 *
 * Returns the advanced command-stream pointer.
 */
static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs,
				   int flush, int post)
{
	const u32 seqno = rq->fence.seqno;

	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH;

	for (; flush; flush--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = seqno;
	}

	for (; post; post--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = seqno;
	}

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

/*
 * Breadcrumb emitter using 16 scratch stores followed by 8 seqno stores;
 * see __gen2_emit_breadcrumb() for the emitted sequence.
 *
 * Returns the advanced command-stream pointer.
 */
u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	const int num_flush = 16;
	const int num_post = 8;

	return __gen2_emit_breadcrumb(rq, cs, num_flush, num_post);
}

/*
 * Breadcrumb emitter using 8 scratch stores followed by 8 seqno stores;
 * see __gen2_emit_breadcrumb() for the emitted sequence.
 *
 * Returns the advanced command-stream pointer.
 */
u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	const int num_flush = 8;
	const int num_post = 8;

	return __gen2_emit_breadcrumb(rq, cs, num_flush, num_post);
}

/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
#define I830_BATCH_LIMIT SZ_256K
/* Number of 4K pages touched by the TLB-evicting blit in i830_emit_bb_start() */
#define I830_TLB_ENTRIES (2)
/* Minimum scratch size: large enough for both the TLB pages and a full batch */
#define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT)
/*
 * Emit the commands that start a batch buffer, including the i830 TLB
 * workaround.
 *
 * @rq:             request whose ring receives the commands
 * @offset:         address of the batch buffer to execute
 * @len:            batch length in bytes (compared against I830_BATCH_LIMIT)
 * @dispatch_flags: I915_DISPATCH_* flags; PINNED skips the copy to scratch,
 *                  SECURE suppresses MI_BATCH_NON_SECURE
 *
 * Three separate ring reservations are made, in order:
 *  1. a COLOR_BLT into the GT scratch area (always);
 *  2. unless I915_DISPATCH_PINNED is set, a SRC_COPY_BLT of the batch into
 *     the scratch area plus a flush, after which the scratch copy is what
 *     gets executed;
 *  3. the MI_BATCH_BUFFER_START itself.
 *
 * Returns 0 on success, -ENOSPC if an unpinned batch is longer than
 * I830_BATCH_LIMIT, or the error code carried by the pointer returned from
 * intel_ring_begin(). Note that on the -ENOSPC and later error paths the
 * commands from the earlier steps have already been written to the ring.
 */
int i830_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs, cs_offset =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT);

	/* The scratch area must be able to hold the workaround data. */
	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Evict the invalid PTE TLBs */
	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
	*cs++ = cs_offset;
	*cs++ = 0xdeadbeef; /* fill value written by the blit */
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
		/* The copy below must fit within the scratch area. */
		if (len > I830_BATCH_LIMIT)
			return -ENOSPC;

		/* Six dwords of blit plus MI_FLUSH and MI_NOOP. */
		cs = intel_ring_begin(rq, 6 + 2);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Blit the batch (which has now all relocs applied) to the
		 * stable batch scratch bo area (so that the CS never
		 * stumbles over its tlb invalidation bug) ...
		 */
		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
		/* NOTE(review): presumably rows (4K pages) << 16 | bytes per row — confirm */
		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
		*cs++ = cs_offset;
		*cs++ = 4096;
		*cs++ = offset;

		*cs++ = MI_FLUSH;
		*cs++ = MI_NOOP;
		intel_ring_advance(rq, cs);

		/* ... and execute it. */
		offset = cs_offset;
	}

	if (!(dispatch_flags & I915_DISPATCH_SECURE))
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	*cs++ = offset; /* only the low 32 bits of the u64 are stored */
	intel_ring_advance(rq, cs);

	return 0;
}

/*
 * Emit an MI_BATCH_BUFFER_START for the batch at @offset.
 *
 * @rq:             request whose ring receives the commands
 * @offset:         address of the batch buffer to execute
 * @len:            batch length; not consulted by this implementation
 * @dispatch_flags: I915_DISPATCH_* flags; unless SECURE is set,
 *                  MI_BATCH_NON_SECURE is OR'ed into the address dword
 *
 * Returns 0 on success, or the error code carried by the pointer returned
 * from intel_ring_begin().
 */
int gen3_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs;

	if ((dispatch_flags & I915_DISPATCH_SECURE) == 0)
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs[0] = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	cs[1] = offset;
	intel_ring_advance(rq, cs + 2);

	return 0;
}

/*
 * Emit an MI_BATCH_BUFFER_START for the batch at @offset.
 *
 * @rq:             request whose ring receives the commands
 * @offset:         address of the batch buffer to execute
 * @length:         batch length; not consulted by this implementation
 * @dispatch_flags: I915_DISPATCH_* flags; unless SECURE is set,
 *                  MI_BATCH_NON_SECURE_I965 is OR'ed into the command dword
 *
 * Returns 0 on success, or the error code carried by the pointer returned
 * from intel_ring_begin().
 */
int gen4_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 length,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = (dispatch_flags & I915_DISPATCH_SECURE) ?
		   0 : MI_BATCH_NON_SECURE_I965;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	cs[0] = MI_BATCH_BUFFER_START | MI_BATCH_GTT | security;
	cs[1] = offset;
	intel_ring_advance(rq, cs + 2);

	return 0;
}

/*
 * Unmask this engine's interrupt: clear its bits in the cached irq_mask,
 * write the result to GEN2_IMR with a 16-bit access, then issue a 16-bit
 * posting read of the engine's RING_IMR.
 */
void gen2_irq_enable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask &= ~engine->irq_enable_mask;
	intel_uncore_write16(&engine->i915->uncore, GEN2_IMR,
			     engine->i915->irq_mask);
	ENGINE_POSTING_READ16(engine, RING_IMR);
}

/*
 * Mask this engine's interrupt: set its bits in the cached irq_mask and
 * write the result to GEN2_IMR with a 16-bit access.
 */
void gen2_irq_disable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask |= engine->irq_enable_mask;
	intel_uncore_write16(&engine->i915->uncore, GEN2_IMR,
			     engine->i915->irq_mask);
}

void gen3_irq_enable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask &= ~engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
}

void gen3_irq_disable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask |= engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
}

/* Enable this engine's interrupt bits through the GT interrupt helper. */
void gen5_irq_enable(struct intel_engine_cs *engine)
{
	struct intel_gt *gt = engine->gt;

	gen5_gt_enable_irq(gt, engine->irq_enable_mask);
}

/* Disable this engine's interrupt bits through the GT interrupt helper. */
void gen5_irq_disable(struct intel_engine_cs *engine)
{
	struct intel_gt *gt = engine->gt;

	gen5_gt_disable_irq(gt, engine->irq_enable_mask);
}
init repo. 2024-07-22 17:22:30 +08:00			`// SPDX-License-Identifier: MIT`
			`/*`
			`* Copyright © 2020 Intel Corporation`
			`*/`

			`#include "gen2_engine_cs.h"`
			`#include "i915_drv.h"`
			`#include "intel_engine.h"`
			`#include "intel_gpu_commands.h"`
			`#include "intel_gt.h"`
			`#include "intel_gt_irq.h"`
			`#include "intel_ring.h"`

			`int gen2_emit_flush(struct i915_request *rq, u32 mode)`
			`{`
			`unsigned int num_store_dw = 12;`
			`u32 cmd, *cs;`

			`cmd = MI_FLUSH;`
			`if (mode & EMIT_INVALIDATE)`
			`cmd \|= MI_READ_FLUSH;`

			`cs = intel_ring_begin(rq, 2 + 4 * num_store_dw);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = cmd;`
			`while (num_store_dw--) {`
			`*cs++ = MI_STORE_DWORD_INDEX;`
			`*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);`
			`*cs++ = 0;`
			`*cs++ = MI_FLUSH \| MI_NO_WRITE_FLUSH;`
			`}`
			`*cs++ = cmd;`

			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)`
			`{`
			`u32 cmd, *cs;`
			`int i;`

			`/*`
			`* read/write caches:`
			`*`
			`* I915_GEM_DOMAIN_RENDER is always invalidated, but is`
			`* only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is`
			`* also flushed at 2d versus 3d pipeline switches.`
			`*`
			`* read-only caches:`
			`*`
			`* I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if`
			`* MI_READ_FLUSH is set, and is always flushed on 965.`
			`*`
			`* I915_GEM_DOMAIN_COMMAND may not exist?`
			`*`
			`* I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is`
			`* invalidated when MI_EXE_FLUSH is set.`
			`*`
			`* I915_GEM_DOMAIN_VERTEX, which exists on 965, is`
			`* invalidated with every MI_FLUSH.`
			`*`
			`* TLBs:`
			`*`
			`* On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND`
			`* and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and`
			`* I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER`
			`* are flushed at any MI_FLUSH.`
			`*/`

			`cmd = MI_FLUSH;`
			`if (mode & EMIT_INVALIDATE) {`
			`cmd \|= MI_EXE_FLUSH;`
			`if (IS_G4X(rq->engine->i915) \|\| GRAPHICS_VER(rq->engine->i915) == 5)`
			`cmd \|= MI_INVALIDATE_ISP;`
			`}`

			`i = 2;`
			`if (mode & EMIT_INVALIDATE)`
			`i += 20;`

			`cs = intel_ring_begin(rq, i);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = cmd;`

			`/*`
			`* A random delay to let the CS invalidate take effect? Without this`
			`* delay, the GPU relocation path fails as the CS does not see`
			`* the updated contents. Just as important, if we apply the flushes`
			`* to the EMIT_FLUSH branch (i.e. immediately after the relocation`
			`* write and before the invalidate on the next batch), the relocations`
			`* still fail. This implies that is a delay following invalidation`
			`* that is required to reset the caches as opposed to a delay to`
			`* ensure the memory is written.`
			`*/`
			`if (mode & EMIT_INVALIDATE) {`
			`*cs++ = GFX_OP_PIPE_CONTROL(4) \| PIPE_CONTROL_QW_WRITE;`
			`*cs++ = intel_gt_scratch_offset(rq->engine->gt,`
			`INTEL_GT_SCRATCH_FIELD_DEFAULT) \|`
			`PIPE_CONTROL_GLOBAL_GTT;`
			`*cs++ = 0;`
			`*cs++ = 0;`

			`for (i = 0; i < 12; i++)`
			`*cs++ = MI_FLUSH;`

			`*cs++ = GFX_OP_PIPE_CONTROL(4) \| PIPE_CONTROL_QW_WRITE;`
			`*cs++ = intel_gt_scratch_offset(rq->engine->gt,`
			`INTEL_GT_SCRATCH_FIELD_DEFAULT) \|`
			`PIPE_CONTROL_GLOBAL_GTT;`
			`*cs++ = 0;`
			`*cs++ = 0;`
			`}`

			`*cs++ = cmd;`

			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)`
			`{`
			`u32 *cs;`

			`cs = intel_ring_begin(rq, 2);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = MI_FLUSH;`
			`*cs++ = MI_NOOP;`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs,`
			`int flush, int post)`
			`{`
			`GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);`
			`GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);`

			`*cs++ = MI_FLUSH;`

			`while (flush--) {`
			`*cs++ = MI_STORE_DWORD_INDEX;`
			`*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);`
			`*cs++ = rq->fence.seqno;`
			`}`

			`while (post--) {`
			`*cs++ = MI_STORE_DWORD_INDEX;`
			`*cs++ = I915_GEM_HWS_SEQNO_ADDR;`
			`*cs++ = rq->fence.seqno;`
			`}`

			`*cs++ = MI_USER_INTERRUPT;`

			`rq->tail = intel_ring_offset(rq, cs);`
			`assert_ring_tail_valid(rq->ring, rq->tail);`

			`return cs;`
			`}`

			`u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs)`
			`{`
			`return __gen2_emit_breadcrumb(rq, cs, 16, 8);`
			`}`

			`u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)`
			`{`
			`return __gen2_emit_breadcrumb(rq, cs, 8, 8);`
			`}`

			`/* Just userspace ABI convention to limit the wa batch bo to a resonable size */`
			`#define I830_BATCH_LIMIT SZ_256K`
			`#define I830_TLB_ENTRIES (2)`
			`#define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT)`
			`int i830_emit_bb_start(struct i915_request *rq,`
			`u64 offset, u32 len,`
			`unsigned int dispatch_flags)`
			`{`
			`u32 *cs, cs_offset =`
			`intel_gt_scratch_offset(rq->engine->gt,`
			`INTEL_GT_SCRATCH_FIELD_DEFAULT);`

			`GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);`

			`cs = intel_ring_begin(rq, 6);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`/* Evict the invalid PTE TLBs */`
			`*cs++ = COLOR_BLT_CMD \| BLT_WRITE_RGBA;`
			`*cs++ = BLT_DEPTH_32 \| BLT_ROP_COLOR_COPY \| 4096;`
			`*cs++ = I830_TLB_ENTRIES << 16 \| 4; /* load each page */`
			`*cs++ = cs_offset;`
			`*cs++ = 0xdeadbeef;`
			`*cs++ = MI_NOOP;`
			`intel_ring_advance(rq, cs);`

			`if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {`
			`if (len > I830_BATCH_LIMIT)`
			`return -ENOSPC;`

			`cs = intel_ring_begin(rq, 6 + 2);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`/*`
			`* Blit the batch (which has now all relocs applied) to the`
			`* stable batch scratch bo area (so that the CS never`
			`* stumbles over its tlb invalidation bug) ...`
			`*/`
			`*cs++ = SRC_COPY_BLT_CMD \| BLT_WRITE_RGBA \| (6 - 2);`
			`*cs++ = BLT_DEPTH_32 \| BLT_ROP_SRC_COPY \| 4096;`
			`*cs++ = DIV_ROUND_UP(len, 4096) << 16 \| 4096;`
			`*cs++ = cs_offset;`
			`*cs++ = 4096;`
			`*cs++ = offset;`

			`*cs++ = MI_FLUSH;`
			`*cs++ = MI_NOOP;`
			`intel_ring_advance(rq, cs);`

			`/* ... and execute it. */`
			`offset = cs_offset;`
			`}`

			`if (!(dispatch_flags & I915_DISPATCH_SECURE))`
			`offset \|= MI_BATCH_NON_SECURE;`

			`cs = intel_ring_begin(rq, 2);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = MI_BATCH_BUFFER_START \| MI_BATCH_GTT;`
			`*cs++ = offset;`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`int gen3_emit_bb_start(struct i915_request *rq,`
			`u64 offset, u32 len,`
			`unsigned int dispatch_flags)`
			`{`
			`u32 *cs;`

			`if (!(dispatch_flags & I915_DISPATCH_SECURE))`
			`offset \|= MI_BATCH_NON_SECURE;`

			`cs = intel_ring_begin(rq, 2);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = MI_BATCH_BUFFER_START \| MI_BATCH_GTT;`
			`*cs++ = offset;`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`int gen4_emit_bb_start(struct i915_request *rq,`
			`u64 offset, u32 length,`
			`unsigned int dispatch_flags)`
			`{`
			`u32 security;`
			`u32 *cs;`

			`security = MI_BATCH_NON_SECURE_I965;`
			`if (dispatch_flags & I915_DISPATCH_SECURE)`
			`security = 0;`

			`cs = intel_ring_begin(rq, 2);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = MI_BATCH_BUFFER_START \| MI_BATCH_GTT \| security;`
			`*cs++ = offset;`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`void gen2_irq_enable(struct intel_engine_cs *engine)`
			`{`
			`struct drm_i915_private *i915 = engine->i915;`

			`i915->irq_mask &= ~engine->irq_enable_mask;`
			`intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);`
			`ENGINE_POSTING_READ16(engine, RING_IMR);`
			`}`

			`void gen2_irq_disable(struct intel_engine_cs *engine)`
			`{`
			`struct drm_i915_private *i915 = engine->i915;`

			`i915->irq_mask \|= engine->irq_enable_mask;`
			`intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);`
			`}`

			`void gen3_irq_enable(struct intel_engine_cs *engine)`
			`{`
			`engine->i915->irq_mask &= ~engine->irq_enable_mask;`
			`intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);`
			`intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);`
			`}`

			`void gen3_irq_disable(struct intel_engine_cs *engine)`
			`{`
			`engine->i915->irq_mask \|= engine->irq_enable_mask;`
			`intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);`
			`}`

			`void gen5_irq_enable(struct intel_engine_cs *engine)`
			`{`
			`gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);`
			`}`

			`void gen5_irq_disable(struct intel_engine_cs *engine)`
			`{`
			`gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);`
			`}`