From dc44099798c94c194dedcb107e7aadee0d4c8e0b Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Tue, 4 Jun 2024 15:09:20 +0200
Subject: [PATCH 01/71] !29536

---
src/amd/compiler/aco_interface.cpp | 2 +
src/amd/compiler/aco_ir.h | 1 +
src/amd/compiler/aco_vectorize_spills.cpp | 253 ++++++++++++++++++++++
src/amd/compiler/meson.build | 1 +
4 files changed, 257 insertions(+)
create mode 100644 src/amd/compiler/aco_vectorize_spills.cpp

diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
index bc24b31a5bb6a..32a28908f90f0 100644
--- a/src/amd/compiler/aco_interface.cpp
+++ b/src/amd/compiler/aco_interface.cpp
@@ -152,6 +152,8 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
schedule_program(program.get());
validate(program.get());

+ vectorize_spills(program.get());
+
/* Register Allocation */
register_allocation(program.get());

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 7542c1e0db143..96bc3c540e0bf 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2263,6 +2263,7 @@ void combine_delay_alu(Program* program);
bool dealloc_vgprs(Program* program);
void insert_NOPs(Program* program);
void form_hard_clauses(Program* program);
+void vectorize_spills(Program* program);
unsigned emit_program(Program* program, std::vector<uint32_t>& code,
std::vector<struct aco_symbol>* symbols = NULL, bool append_endpgm = true);
/**
diff --git a/src/amd/compiler/aco_vectorize_spills.cpp b/src/amd/compiler/aco_vectorize_spills.cpp
new file mode 100644
index 0000000000000..b989306b5a3c2
--- /dev/null
+++ b/src/amd/compiler/aco_vectorize_spills.cpp
@@ -0,0 +1,253 @@
+/*
+ * Copyright © 2024 Valve Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "aco_builder.h"
+#include "aco_ir.h"
+
+#include <unordered_set>
+
+namespace aco {
+
+struct vectorize_ctx {
+ std::vector<aco_ptr<Instruction>> instrs_to_vectorize;
+
+ std::vector<aco_ptr<Instruction>> vectors;
+ std::vector<aco_ptr<Instruction>> vectorized_instrs;
+
+ std::vector<unsigned> component_idxs;
+
+ std::unordered_set<unsigned> killed_soffset_ids;
+ std::unordered_set<unsigned> seen_soffset_ids;
+
+ std::vector<aco_ptr<Instruction>>::iterator insert_point;
+ Block* block;
+ Program* program;
+};
+
+void
+vectorize_and_insert(vectorize_ctx& ctx, bool store)
+{
+ std::sort(ctx.instrs_to_vectorize.begin(), ctx.instrs_to_vectorize.end(),
+ [](const auto& one, const auto& other)
+ { return one->scratch().offset < other->scratch().offset; });
+
+ Builder instr_bld(ctx.program, &ctx.vectorized_instrs);
+
+ for (unsigned i = 0; i < ctx.instrs_to_vectorize.size(); ++i) {
+ ctx.component_idxs.push_back(i);
+ for (auto j = i + 1; j < ctx.instrs_to_vectorize.size(); ++j) {
+ const auto& component = ctx.instrs_to_vectorize[ctx.component_idxs.back()];
+ const auto& instr = ctx.instrs_to_vectorize[j];
+ /* skip stores with unrelated soffset */
+ if (instr->operands[1].tempId() != component->operands[1].tempId())
+ continue;
+ int16_t next_offset;
+ if (store)
+ next_offset = component->scratch().offset + (int16_t)component->operands[2].bytes();
+ else
+ next_offset = component->scratch().offset + (int16_t)component->definitions[0].bytes();
+
+ /* there's a gap, can't vectorize across it */
+ if (instr->scratch().offset > next_offset)
+ break;
+ /* XXX: Hitting this means there are intersecting stores. This shouldn't happen! */
+ if (instr->scratch().offset != next_offset)
+ break;
+
+ if (instr->operands[1].isKill())
+ ctx.killed_soffset_ids.insert(instr->operands[1].tempId());
+
+ ctx.component_idxs.push_back(j);
+ }
+
+ if (ctx.component_idxs.empty())
+ continue;
+
+ size_t comp_idx = 0;
+ while (comp_idx < ctx.component_idxs.size()) {
+ size_t vector_size = 4;
+ while (vector_size > ctx.component_idxs.size() - comp_idx)
+ vector_size >>= 1;
+
+ auto& first_component = ctx.instrs_to_vectorize[ctx.component_idxs[comp_idx]];
+
+ if (vector_size == 1) {
+ ctx.vectorized_instrs.emplace_back(std::move(first_component));
+ ++comp_idx;
+ continue;
+ }
+
+ if (store) {
+ Temp vec_tmp = ctx.program->allocateTmp(RegClass(RegType::vgpr, vector_size));
+ Instruction* vec =
+ create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, vector_size, 1);
+ for (unsigned c = 0; c < vector_size; ++c) {
+ auto& component = ctx.instrs_to_vectorize[ctx.component_idxs[comp_idx + c]];
+ vec->operands[c] = component->operands[2];
+ }
+ vec->definitions[0] = Definition(vec_tmp);
+ ctx.vectors.emplace_back(vec);
+
+ aco_opcode opcode;
+ switch (vector_size) {
+ case 4: opcode = aco_opcode::scratch_store_dwordx4; break;
+ case 2: opcode = aco_opcode::scratch_store_dwordx2; break;
+ default: unreachable("invalid vector size");
+ }
+
+ Operand vec_op = Operand(vec_tmp);
+ vec_op.setFirstKill(true);
+ instr_bld.scratch(opcode, Operand(v1), first_component->operands[1], vec_op,
+ first_component->scratch().offset, first_component->scratch().sync);
+ } else {
+ Temp vec_tmp = ctx.program->allocateTmp(RegClass(RegType::vgpr, vector_size));
+
+ aco_opcode opcode;
+ switch (vector_size) {
+ case 4: opcode = aco_opcode::scratch_load_dwordx4; break;
+ case 2: opcode = aco_opcode::scratch_load_dwordx2; break;
+ default: unreachable("invalid vector size");
+ }
+
+ instr_bld.scratch(opcode, Definition(vec_tmp), Operand(v1),
+ first_component->operands[1], first_component->scratch().offset,
+ first_component->scratch().sync);
+
+ Instruction* vec =
+ create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, vector_size);
+ for (unsigned c = 0; c < vector_size; ++c) {
+ auto& component = ctx.instrs_to_vectorize[ctx.component_idxs[comp_idx + c]];
+ vec->definitions[c] = component->definitions[0];
+ }
+ vec->operands[0] = Operand(vec_tmp);
+ vec->operands[0].setFirstKill(true);
+ ctx.vectors.emplace_back(vec);
+ }
+ comp_idx += vector_size;
+ }
+
+ for (unsigned j = 0; j < ctx.component_idxs.size(); ++j) {
+ auto idx = ctx.component_idxs[j];
+ ctx.instrs_to_vectorize.erase(ctx.instrs_to_vectorize.begin() + (idx - j));
+ }
+ /* Adjust for deleted instruction */
+ --i;
+
+ ctx.component_idxs.clear();
+ }
+
+ for (auto it = ctx.vectorized_instrs.rbegin(); it != ctx.vectorized_instrs.rend(); ++it) {
+ auto soffset_id = (*it)->operands[1].tempId();
+ if (ctx.seen_soffset_ids.find(soffset_id) == ctx.seen_soffset_ids.end()) {
+ if (ctx.killed_soffset_ids.find(soffset_id) != ctx.killed_soffset_ids.end())
+ (*it)->operands[1].setFirstKill(true);
+ ctx.seen_soffset_ids.insert(soffset_id);
+ }
+ }
+
+ if (store) {
+ ctx.insert_point =
+ ctx.block->instructions.insert(ctx.insert_point, std::move_iterator(ctx.vectors.begin()),
+ std::move_iterator(ctx.vectors.end()));
+ ctx.insert_point += ctx.vectors.size();
+ ctx.insert_point = ctx.block->instructions.insert(
+ ctx.insert_point, std::move_iterator(ctx.vectorized_instrs.rbegin()),
+ std::move_iterator(ctx.vectorized_instrs.rend()));
+ ctx.insert_point += ctx.vectorized_instrs.size();
+ } else {
+ ctx.insert_point = ctx.block->instructions.insert(
+ ctx.insert_point, std::move_iterator(ctx.vectorized_instrs.rbegin()),
+ std::move_iterator(ctx.vectorized_instrs.rend()));
+ ctx.insert_point += ctx.vectorized_instrs.size();
+ ctx.insert_point =
+ ctx.block->instructions.insert(ctx.insert_point, std::move_iterator(ctx.vectors.begin()),
+ std::move_iterator(ctx.vectors.end()));
+ ctx.insert_point += ctx.vectors.size();
+ }
+
+ ctx.vectors.clear();
+ ctx.vectorized_instrs.clear();
+ ctx.instrs_to_vectorize.clear();
+ ctx.seen_soffset_ids.clear();
+ ctx.killed_soffset_ids.clear();
+}
+
+void
+vectorize_spills(Program* program)
+{
+ vectorize_ctx ctx;
+ ctx.program = program;
+ aco::monotonic_buffer_resource memory;
+
+ for (auto& block : program->blocks) {
+ ctx.block = &block;
+ IDSet conflicting_temps(memory);
+
+ /* Try vectorizing stores */
+ for (auto it = block.instructions.begin(); it != block.instructions.end();) {
+ bool vectorize_now = !(*it)->isVMEM() && it != block.instructions.begin();
+
+ /* Only look for stores that kill their operand. We can move/combine these with other
+ * instructions without affecting register demand.
+ */
+ if ((*it)->opcode == aco_opcode::scratch_store_dword && (*it)->operands[2].isKill() &&
+ !(*it)->operands[2].regClass().is_subdword()) {
+ if (conflicting_temps.count((*it)->operands[2].tempId())) {
+ vectorize_now = true;
+ --it;
+ } else {
+ bool first = ctx.instrs_to_vectorize.empty();
+ ctx.instrs_to_vectorize.emplace_back(std::move(*it));
+ it = block.instructions.erase(it);
+ if (first)
+ ctx.insert_point = it;
+ continue;
+ }
+ }
+
+ if (vectorize_now) {
+ auto clause_size = it - ctx.insert_point;
+ vectorize_and_insert(ctx, true);
+ it = ctx.insert_point + clause_size;
+ conflicting_temps = IDSet(memory);
+ } else {
+ for (auto& def : (*it)->definitions)
+ if (def.isTemp())
+ conflicting_temps.insert(def.tempId());
+ }
+ ++it;
+ }
+ /* Try vectorizing loads */
+ for (auto it = block.instructions.begin(); it != block.instructions.end();) {
+ bool vectorize_now = !(*it)->isVMEM() && it != block.instructions.begin();
+ for (auto& op : (*it)->operands) {
+ if (op.isTemp() && conflicting_temps.count(op.tempId())) {
+ vectorize_now = true;
+ --it;
+ }
+ }
+
+ /* Loads that kill their definition are dead and shouldn't appear with spilling */
+ if (!vectorize_now && (*it)->opcode == aco_opcode::scratch_load_dword &&
+ !(*it)->definitions[0].isKill() && !(*it)->definitions[0].regClass().is_subdword()) {
+ ctx.instrs_to_vectorize.emplace_back(std::move(*it));
+ conflicting_temps.insert((*it)->definitions[0].tempId());
+ it = block.instructions.erase(it);
+ continue;
+ }
+
+ if (vectorize_now) {
+ ctx.insert_point = it;
+ vectorize_and_insert(ctx, false);
+ it = ctx.insert_point;
+ conflicting_temps = IDSet(memory);
+ }
+ ++it;
+ }
+ }
+}
+
+} // namespace aco
diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build
index ae2d6a41b793a..b235f626f97af 100644
--- a/src/amd/compiler/meson.build
+++ b/src/amd/compiler/meson.build
@@ -66,6 +66,7 @@ libaco_files = files(
'aco_statistics.cpp',
'aco_util.h',
'aco_validate.cpp',
+ 'aco_vectorize_spills.cpp',
)

cpp_args_aco = cpp.get_supported_arguments(['-fno-exceptions', '-fno-rtti', '-Wimplicit-fallthrough', '-Wshadow'])
--
GitLab

From 8123a30fc5553bbf237833fbb7a5b39ce677664d Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 25 Mar 2024 16:52:45 +0100
Subject: [PATCH 02/71] !29576

---
src/amd/compiler/aco_ir.h | 1 +
src/amd/compiler/aco_register_allocation.cpp | 316 +++++++++++--------
2 files changed, 193 insertions(+), 124 deletions(-)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 96bc3c540e0bf..8a501797092ed 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -742,6 +742,7 @@ public:
isPrecolored_ = isFixed_;
}

+
constexpr bool isConstant() const noexcept { return isConstant_; }

constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; }
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
|
|
index 7ff35c079e2ed..fc62487627fad 100644
|
|
--- a/src/amd/compiler/aco_register_allocation.cpp
|
|
+++ b/src/amd/compiler/aco_register_allocation.cpp
|
|
@@ -14,8 +14,17 @@
|
|
#include <bitset>
|
|
#include <map>
|
|
#include <optional>
|
|
+#include <set>
|
|
+#include <unordered_map>
|
|
+#include <unordered_set>
|
|
#include <vector>
|
|
|
|
+namespace std {
|
|
+template <> struct hash<aco::PhysReg> {
|
|
+ size_t operator()(aco::PhysReg temp) const noexcept { return std::hash<uint32_t>{}(temp.reg_b); }
|
|
+};
|
|
+} // namespace std
|
|
+
|
|
namespace aco {
|
|
namespace {
|
|
|
|
@@ -29,6 +38,19 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx
|
|
void add_subdword_definition(Program* program, aco_ptr<Instruction>& instr, PhysReg reg,
|
|
bool allow_16bit_write);
|
|
|
|
+struct parallelcopy {
|
|
+ constexpr parallelcopy() : skip_renaming(false) {}
|
|
+ constexpr parallelcopy(Operand op_, Definition def_) : op(op_), def(def_), skip_renaming(false)
|
|
+ {}
|
|
+ constexpr parallelcopy(Operand op_, Definition def_, bool skip_renaming_)
|
|
+ : op(op_), def(def_), skip_renaming(skip_renaming_)
|
|
+ {}
|
|
+
|
|
+ Operand op;
|
|
+ Definition def;
|
|
+ bool skip_renaming;
|
|
+};
|
|
+
|
|
struct assignment {
|
|
PhysReg reg;
|
|
RegClass rc;
|
|
@@ -270,7 +292,11 @@ public:
|
|
std::array<uint32_t, 512> regs;
|
|
std::map<uint32_t, std::array<uint32_t, 4>> subdword_regs;
|
|
|
|
- const uint32_t& operator[](PhysReg index) const { return regs[index]; }
|
|
+ const uint32_t& operator[](PhysReg index) const
|
|
+ {
|
|
+ assert(index.reg() < 512);
|
|
+ return regs[index];
|
|
+ }
|
|
|
|
uint32_t& operator[](PhysReg index) { return regs[index]; }
|
|
|
|
@@ -357,7 +383,12 @@ public:
|
|
}
|
|
}
|
|
|
|
- void clear(Operand op) { clear(op.physReg(), op.regClass()); }
|
|
+ void clear(Operand op)
|
|
+ {
|
|
+ if (op.isTemp() && get_id(op.physReg()) && !is_blocked(op.physReg()))
|
|
+ assert(get_id(op.physReg()) == op.tempId());
|
|
+ clear(op.physReg(), op.regClass());
|
|
+ }
|
|
|
|
void fill(Definition def)
|
|
{
|
|
@@ -805,22 +836,21 @@ enum UpdateRenames {
|
|
MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames);
|
|
|
|
void
|
|
-update_renames(ra_ctx& ctx, RegisterFile& reg_file,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopies,
|
|
+update_renames(ra_ctx& ctx, RegisterFile& reg_file, std::vector<parallelcopy>& parallelcopies,
|
|
aco_ptr<Instruction>& instr, UpdateRenames flags)
|
|
{
|
|
/* clear operands */
|
|
- for (std::pair<Operand, Definition>& copy : parallelcopies) {
|
|
+ for (parallelcopy& copy : parallelcopies) {
|
|
/* the definitions with id are not from this function and already handled */
|
|
- if (copy.second.isTemp())
|
|
+ if (copy.def.isTemp())
|
|
continue;
|
|
- reg_file.clear(copy.first);
|
|
+ reg_file.clear(copy.op);
|
|
}
|
|
|
|
/* allocate id's and rename operands: this is done transparently here */
|
|
auto it = parallelcopies.begin();
|
|
while (it != parallelcopies.end()) {
|
|
- if (it->second.isTemp()) {
|
|
+ if (it->def.isTemp()) {
|
|
++it;
|
|
continue;
|
|
}
|
|
@@ -828,9 +858,9 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file,
|
|
/* check if we moved a definition: change the register and remove copy */
|
|
bool is_def = false;
|
|
for (Definition& def : instr->definitions) {
|
|
- if (def.isTemp() && def.getTemp() == it->first.getTemp()) {
|
|
+ if (def.isTemp() && def.getTemp() == it->op.getTemp()) {
|
|
// FIXME: ensure that the definition can use this reg
|
|
- def.setFixed(it->second.physReg());
|
|
+ def.setFixed(it->def.physReg());
|
|
reg_file.fill(def);
|
|
ctx.assignments[def.tempId()].reg = def.physReg();
|
|
it = parallelcopies.erase(it);
|
|
@@ -842,34 +872,52 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file,
|
|
continue;
|
|
|
|
/* check if we moved another parallelcopy definition */
|
|
- for (std::pair<Operand, Definition>& other : parallelcopies) {
|
|
- if (!other.second.isTemp())
|
|
+ for (parallelcopy& other : parallelcopies) {
|
|
+ if (!other.def.isTemp())
|
|
continue;
|
|
- if (it->first.getTemp() == other.second.getTemp()) {
|
|
- other.second.setFixed(it->second.physReg());
|
|
- ctx.assignments[other.second.tempId()].reg = other.second.physReg();
|
|
- it = parallelcopies.erase(it);
|
|
- is_def = true;
|
|
+ if (it->op.getTemp() == other.def.getTemp()) {
|
|
+ bool other_ensures_precoloring = false;
|
|
+
|
|
/* check if we moved an operand, again */
|
|
bool fill = true;
|
|
for (Operand& op : instr->operands) {
|
|
- if (op.isTemp() && op.tempId() == other.second.tempId()) {
|
|
- // FIXME: ensure that the operand can use this reg
|
|
- op.setFixed(other.second.physReg());
|
|
- fill = !op.isKillBeforeDef();
|
|
- }
|
|
+ if (!op.isTemp() || op.tempId() != other.def.tempId())
|
|
+ continue;
|
|
+ bool isKillBeforeDef = op.isFirstKillBeforeDef();
|
|
+ fill = !isKillBeforeDef;
|
|
+
|
|
+ if (other.def.physReg() == op.physReg() && op.isPrecolored())
|
|
+ other_ensures_precoloring = true;
|
|
+ else
|
|
+ op.setFixed(it->def.physReg());
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ Definition fill_def;
|
|
+
|
|
+ if (other_ensures_precoloring) {
|
|
+ it->op = other.op;
|
|
+ ctx.assignments[other.op.tempId()].reg = it->def.physReg();
|
|
+ fill_def = it->def;
|
|
+ } else {
|
|
+ other.def.setFixed(it->def.physReg());
|
|
+ ctx.assignments[other.def.tempId()].reg = other.def.physReg();
|
|
+ it = parallelcopies.erase(it);
|
|
+ fill_def = other.def;
|
|
}
|
|
+ is_def = true;
|
|
+
|
|
if (fill)
|
|
- reg_file.fill(other.second);
|
|
+ reg_file.fill(fill_def);
|
|
break;
|
|
}
|
|
}
|
|
if (is_def)
|
|
continue;
|
|
|
|
- std::pair<Operand, Definition>& copy = *it;
|
|
- copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass()));
|
|
- ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass());
|
|
+ parallelcopy& copy = *it;
|
|
+ copy.def.setTemp(ctx.program->allocateTmp(copy.def.regClass()));
|
|
+ ctx.assignments.emplace_back(copy.def.physReg(), copy.def.regClass());
|
|
assert(ctx.assignments.size() == ctx.program->peekAllocationId());
|
|
|
|
/* check if we moved an operand */
|
|
@@ -879,19 +927,19 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file,
|
|
Operand& op = instr->operands[i];
|
|
if (!op.isTemp())
|
|
continue;
|
|
- if (op.tempId() == copy.first.tempId()) {
|
|
+ if (op.tempId() == copy.op.tempId()) {
|
|
/* only rename precolored operands if the copy-location matches */
|
|
- bool omit_renaming = op.isPrecolored() && op.physReg() != copy.second.physReg();
|
|
+ bool omit_renaming = op.isPrecolored() && op.physReg() != copy.def.physReg();
|
|
|
|
/* Omit renaming in some cases for p_create_vector in order to avoid
|
|
* unnecessary shuffle code. */
|
|
if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) {
|
|
omit_renaming = true;
|
|
- for (std::pair<Operand, Definition>& pc : parallelcopies) {
|
|
- PhysReg def_reg = pc.second.physReg();
|
|
- omit_renaming &= def_reg > copy.first.physReg()
|
|
- ? (copy.first.physReg() + copy.first.size() <= def_reg.reg())
|
|
- : (def_reg + pc.second.size() <= copy.first.physReg().reg());
|
|
+ for (parallelcopy& pc : parallelcopies) {
|
|
+ PhysReg def_reg = pc.def.physReg();
|
|
+ omit_renaming &= def_reg > copy.op.physReg()
|
|
+ ? (copy.op.physReg() + copy.op.size() <= def_reg.reg())
|
|
+ : (def_reg + pc.def.size() <= copy.op.physReg().reg());
|
|
}
|
|
}
|
|
|
|
@@ -905,8 +953,8 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file,
|
|
if (omit_renaming)
|
|
continue;
|
|
|
|
- op.setTemp(copy.second.getTemp());
|
|
- op.setFixed(copy.second.physReg());
|
|
+ op.setTemp(copy.def.getTemp());
|
|
+ op.setFixed(copy.def.physReg());
|
|
|
|
fill = !op.isKillBeforeDef() || op.isPrecolored();
|
|
}
|
|
@@ -914,7 +962,7 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file,
|
|
|
|
/* Apply changes to register file. */
|
|
if (fill)
|
|
- reg_file.fill(copy.second);
|
|
+ reg_file.fill(copy.def);
|
|
|
|
++it;
|
|
}
|
|
@@ -1050,7 +1098,7 @@ collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_inte
|
|
|
|
std::optional<PhysReg>
|
|
get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopies,
|
|
+ std::vector<parallelcopy>& parallelcopies,
|
|
aco_ptr<Instruction>& instr, const PhysRegInterval def_reg,
|
|
DefInfo info, unsigned id)
|
|
{
|
|
@@ -1102,8 +1150,7 @@ get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file,
|
|
}
|
|
|
|
bool
|
|
-get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopies,
|
|
+get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, std::vector<parallelcopy>& parallelcopies,
|
|
const std::vector<unsigned>& vars, aco_ptr<Instruction>& instr,
|
|
const PhysRegInterval def_reg)
|
|
{
|
|
@@ -1253,9 +1300,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file,
|
|
}
|
|
|
|
std::optional<PhysReg>
|
|
-get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopies, const DefInfo& info,
|
|
- aco_ptr<Instruction>& instr)
|
|
+get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, std::vector<parallelcopy>& parallelcopies,
|
|
+ const DefInfo& info, aco_ptr<Instruction>& instr)
|
|
{
|
|
const PhysRegInterval& bounds = info.bounds;
|
|
uint32_t size = info.size;
|
|
@@ -1381,7 +1427,7 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file,
|
|
if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector)
|
|
tmp_file.fill_killed_operands(instr.get());
|
|
|
|
- std::vector<std::pair<Operand, Definition>> pc;
|
|
+ std::vector<parallelcopy> pc;
|
|
if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win))
|
|
return {};
|
|
|
|
@@ -1460,11 +1506,13 @@ struct IDAndInfo {
|
|
};
|
|
|
|
void
|
|
-add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val)
|
|
+add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val, bool add_to_ctx = true)
|
|
{
|
|
- ctx.renames[ctx.block->index][orig_val.id()] = new_val;
|
|
ctx.orig_names.emplace(new_val.id(), orig_val);
|
|
- ctx.assignments[orig_val.id()].renamed = true;
|
|
+ if (add_to_ctx) {
|
|
+ ctx.renames[ctx.block->index][orig_val.id()] = new_val;
|
|
+ ctx.assignments[orig_val.id()].renamed = true;
|
|
+ }
|
|
}
|
|
|
|
/* Reallocates vars by sorting them and placing each variable after the previous
|
|
@@ -1473,7 +1521,7 @@ add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val)
|
|
*/
|
|
PhysReg
|
|
compact_relocate_vars(ra_ctx& ctx, const std::vector<IDAndRegClass>& vars,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopies, PhysReg start)
|
|
+ std::vector<parallelcopy>& parallelcopies, PhysReg start)
|
|
{
|
|
/* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword
|
|
* temporary sizes to dwords.
|
|
@@ -1624,7 +1672,7 @@ get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr<Ins
|
|
|
|
bool
|
|
compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopies)
|
|
+ std::vector<parallelcopy>& parallelcopies)
|
|
{
|
|
PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true);
|
|
int zeros = reg_file.count_zero(linear_vgpr_bounds);
|
|
@@ -1650,7 +1698,7 @@ compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file,
|
|
*/
|
|
PhysReg
|
|
alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction>& instr,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopies)
|
|
+ std::vector<parallelcopy>& parallelcopies)
|
|
{
|
|
assert(instr->opcode == aco_opcode::p_start_linear_vgpr);
|
|
assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0);
|
|
@@ -1683,7 +1731,7 @@ alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr<Instruction
|
|
tmp_file.fill_killed_operands(instr.get());
|
|
|
|
/* Find new assignments for blocking vars. */
|
|
- std::vector<std::pair<Operand, Definition>> pc;
|
|
+ std::vector<parallelcopy> pc;
|
|
if (!ctx.policy.skip_optimistic_path &&
|
|
get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) {
|
|
parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
|
|
@@ -1734,7 +1782,7 @@ should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file)
|
|
|
|
PhysReg
|
|
get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopies, aco_ptr<Instruction>& instr,
|
|
+ std::vector<parallelcopy>& parallelcopies, aco_ptr<Instruction>& instr,
|
|
int operand_index = -1)
|
|
{
|
|
auto split_vec = ctx.split_vectors.find(temp.id());
|
|
@@ -1808,7 +1856,7 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
|
|
return *res;
|
|
|
|
/* try compacting the linear vgprs to make more space */
|
|
- std::vector<std::pair<Operand, Definition>> pc;
|
|
+ std::vector<parallelcopy> pc;
|
|
if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) &&
|
|
compact_linear_vgprs(ctx, reg_file, pc)) {
|
|
parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end());
|
|
@@ -1816,8 +1864,8 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
|
|
/* We don't need to fill the copy definitions in because we don't care about the linear VGPR
|
|
* space here. */
|
|
RegisterFile tmp_file(reg_file);
|
|
- for (std::pair<Operand, Definition>& copy : pc)
|
|
- tmp_file.clear(copy.first);
|
|
+ for (parallelcopy& copy : pc)
|
|
+ tmp_file.clear(copy.op);
|
|
|
|
return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index);
|
|
}
|
|
@@ -1875,8 +1923,7 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
|
|
|
|
PhysReg
|
|
get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopies,
|
|
- aco_ptr<Instruction>& instr)
|
|
+ std::vector<parallelcopy>& parallelcopies, aco_ptr<Instruction>& instr)
|
|
{
|
|
RegClass rc = temp.regClass();
|
|
/* create_vector instructions have different costs w.r.t. register coalescing */
|
|
@@ -1993,7 +2040,7 @@ get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp,
|
|
std::vector<unsigned> vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size});
|
|
|
|
bool success = false;
|
|
- std::vector<std::pair<Operand, Definition>> pc;
|
|
+ std::vector<parallelcopy> pc;
|
|
success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size});
|
|
|
|
if (!success) {
|
|
@@ -2084,59 +2131,81 @@ operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsign
|
|
|
|
void
|
|
handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopy,
|
|
- aco_ptr<Instruction>& instr)
|
|
+ std::vector<parallelcopy>& parallelcopy, aco_ptr<Instruction>& instr)
|
|
{
|
|
assert(instr->operands.size() <= 128);
|
|
assert(parallelcopy.empty());
|
|
|
|
RegisterFile tmp_file(register_file);
|
|
+ std::unordered_map<unsigned, std::unordered_set<PhysReg>> temp_regs;
|
|
+ std::vector<unsigned> blocking_vars;
|
|
|
|
- BITSET_DECLARE(mask, 128) = {0};
|
|
-
|
|
- for (unsigned i = 0; i < instr->operands.size(); i++) {
|
|
- Operand& op = instr->operands[i];
|
|
-
|
|
- if (!op.isPrecolored())
|
|
+ for (auto it = instr->operands.begin(); it != instr->operands.end(); ++it) {
|
|
+ if (!it->isPrecolored())
|
|
continue;
|
|
|
|
- assert(op.isTemp());
|
|
- PhysReg src = ctx.assignments[op.tempId()].reg;
|
|
- adjust_max_used_regs(ctx, op.regClass(), op.physReg());
|
|
+ assert(it->isTemp());
|
|
+ adjust_max_used_regs(ctx, it->regClass(), it->physReg());
|
|
+ PhysReg src = ctx.assignments[it->tempId()].reg;
|
|
+ temp_regs[it->tempId()].emplace(it->physReg());
|
|
|
|
- if (op.physReg() == src) {
|
|
- tmp_file.block(op.physReg(), op.regClass());
|
|
- continue;
|
|
+ if (src == it->physReg()) {
|
|
+ tmp_file.block(it->physReg(), it->regClass());
|
|
+ } else {
|
|
+ /* clear from register_file so fixed operands are not collected be collect_vars() */
|
|
+ if (!tmp_file.is_blocked(src))
|
|
+ tmp_file.clear(src, it->regClass()); // TODO: try to avoid moving block vars to src
|
|
}
|
|
|
|
/* An instruction can have at most one operand precolored to the same register. */
|
|
assert(std::none_of(parallelcopy.begin(), parallelcopy.end(),
|
|
- [&](auto copy) { return copy.second.physReg() == op.physReg(); }));
|
|
+ [&](auto copy) { return copy.def.physReg() == it->physReg(); }));
|
|
+ }
|
|
+
|
|
+ for (auto& regs : temp_regs) {
|
|
+ PhysReg src = ctx.assignments[regs.first].reg;
|
|
|
|
- /* clear from register_file so fixed operands are not collected be collect_vars() */
|
|
- tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src
|
|
+ PhysReg live_reg = *regs.second.begin();
|
|
+ if (regs.second.size() > 1) {
|
|
+ bool found = false;
|
|
+ for (auto reg : regs.second) {
|
|
+ PhysRegInterval range = {reg, ctx.program->temp_rc[regs.first].size()};
|
|
+ bool intersects_with_def = false;
|
|
+ for (const auto& def : instr->definitions) {
|
|
+ if (!def.isTemp() || !def.isFixed())
|
|
+ continue;
|
|
+ PhysRegInterval def_range = {def.physReg(), def.regClass().size()};
|
|
+ if (intersects(def_range, range)) {
|
|
+ intersects_with_def = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ if (intersects_with_def)
|
|
+ continue;
|
|
|
|
- BITSET_SET(mask, i);
|
|
+ if (!found || reg == src) {
|
|
+ live_reg = reg;
|
|
+ found = true;
|
|
+ if (reg == src)
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
|
|
- Operand pc_op(instr->operands[i].getTemp());
|
|
- pc_op.setFixed(src);
|
|
- Definition pc_def = Definition(op.physReg(), pc_op.regClass());
|
|
- parallelcopy.emplace_back(pc_op, pc_def);
|
|
- }
|
|
+ RegClass rc = ctx.program->temp_rc[regs.first];
|
|
|
|
- if (BITSET_IS_EMPTY(mask))
|
|
- return;
|
|
+ for (auto reg : regs.second) {
|
|
+ if (reg == src)
|
|
+ continue;
|
|
|
|
- unsigned i;
|
|
- std::vector<unsigned> blocking_vars;
|
|
- BITSET_FOREACH_SET (i, mask, instr->operands.size()) {
|
|
- Operand& op = instr->operands[i];
|
|
- PhysRegInterval target{op.physReg(), op.size()};
|
|
- std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
|
|
- blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
|
|
+ Definition copy_def = Definition(reg, rc);
|
|
+ parallelcopy.emplace_back(Operand(Temp(regs.first, rc), src), copy_def, reg != live_reg);
|
|
|
|
- /* prevent get_regs_for_copies() from using these registers */
|
|
- tmp_file.block(op.physReg(), op.regClass());
|
|
+ PhysRegInterval target{reg, rc.size()};
|
|
+ std::vector<unsigned> blocking_vars2 = collect_vars(ctx, tmp_file, target);
|
|
+ blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end());
|
|
+ tmp_file.block(reg, rc);
|
|
+ }
|
|
}
|
|
|
|
get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval());
|
|
@@ -2145,8 +2214,8 @@ handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
|
|
|
|
void
|
|
get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
|
|
- std::vector<std::pair<Operand, Definition>>& parallelcopy,
|
|
- aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
|
|
+ std::vector<parallelcopy>& parallelcopy, aco_ptr<Instruction>& instr,
|
|
+ Operand& operand, unsigned operand_index)
|
|
{
|
|
/* clear the operand in case it's only a stride mismatch */
|
|
PhysReg src = ctx.assignments[operand.tempId()].reg;
|
|
@@ -2166,45 +2235,44 @@ get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file,
|
|
std::vector<aco_ptr<Instruction>>& instructions, Block& block,
|
|
aco_ptr<Instruction>& phi, Temp tmp)
|
|
{
|
|
- std::vector<std::pair<Operand, Definition>> parallelcopy;
|
|
+ std::vector<parallelcopy> parallelcopy;
|
|
PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi);
|
|
update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops);
|
|
|
|
/* process parallelcopy */
|
|
- for (std::pair<Operand, Definition> pc : parallelcopy) {
|
|
+ for (struct parallelcopy pc : parallelcopy) {
|
|
/* see if it's a copy from a different phi */
|
|
// TODO: prefer moving some previous phis over live-ins
|
|
// TODO: somehow prevent phis fixed before the RA from being updated (shouldn't be a
|
|
// problem in practice since they can only be fixed to exec)
|
|
Instruction* prev_phi = NULL;
|
|
for (auto phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) {
|
|
- if ((*phi_it)->definitions[0].tempId() == pc.first.tempId())
|
|
+ if ((*phi_it)->definitions[0].tempId() == pc.op.tempId())
|
|
prev_phi = phi_it->get();
|
|
}
|
|
if (prev_phi) {
|
|
/* if so, just update that phi's register */
|
|
- prev_phi->definitions[0].setFixed(pc.second.physReg());
|
|
+ prev_phi->definitions[0].setFixed(pc.def.physReg());
|
|
register_file.fill(prev_phi->definitions[0]);
|
|
- ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(),
|
|
- pc.second.regClass()};
|
|
+ ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.def.physReg(), pc.def.regClass()};
|
|
continue;
|
|
}
|
|
|
|
/* rename */
|
|
- auto orig_it = ctx.orig_names.find(pc.first.tempId());
|
|
- Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp();
|
|
- add_rename(ctx, orig, pc.second.getTemp());
|
|
+ auto orig_it = ctx.orig_names.find(pc.op.tempId());
|
|
+ Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.op.getTemp();
|
|
+ add_rename(ctx, orig, pc.def.getTemp());
|
|
|
|
/* otherwise, this is a live-in and we need to create a new phi
|
|
* to move it in this block's predecessors */
|
|
aco_opcode opcode =
|
|
- pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
|
|
+ pc.op.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi;
|
|
Block::edge_vec& preds =
|
|
- pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
|
|
+ pc.op.getTemp().is_linear() ? block.linear_preds : block.logical_preds;
|
|
aco_ptr<Instruction> new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)};
|
|
- new_phi->definitions[0] = pc.second;
|
|
+ new_phi->definitions[0] = pc.def;
|
|
for (unsigned i = 0; i < preds.size(); i++)
|
|
- new_phi->operands[i] = Operand(pc.first);
|
|
+ new_phi->operands[i] = Operand(pc.op);
|
|
instructions.emplace_back(std::move(new_phi));
|
|
|
|
/* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is
|
|
@@ -2916,7 +2984,7 @@ optimize_encoding(ra_ctx& ctx, RegisterFile& register_file, aco_ptr<Instruction>
|
|
}
|
|
|
|
void
|
|
-emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
|
|
+emit_parallel_copy_internal(ra_ctx& ctx, std::vector<parallelcopy>& parallelcopy,
|
|
aco_ptr<Instruction>& instr,
|
|
std::vector<aco_ptr<Instruction>>& instructions, bool temp_in_scc,
|
|
RegisterFile& register_file)
|
|
@@ -2931,31 +2999,31 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definiti
|
|
bool sgpr_operands_alias_defs = false;
|
|
uint64_t sgpr_operands[4] = {0, 0, 0, 0};
|
|
for (unsigned i = 0; i < parallelcopy.size(); i++) {
|
|
- linear_vgpr |= parallelcopy[i].first.regClass().is_linear_vgpr();
|
|
+ linear_vgpr |= parallelcopy[i].op.regClass().is_linear_vgpr();
|
|
|
|
- if (temp_in_scc && parallelcopy[i].first.isTemp() &&
|
|
- parallelcopy[i].first.getTemp().type() == RegType::sgpr) {
|
|
+ if (temp_in_scc && parallelcopy[i].op.isTemp() &&
|
|
+ parallelcopy[i].op.getTemp().type() == RegType::sgpr) {
|
|
if (!sgpr_operands_alias_defs) {
|
|
- unsigned reg = parallelcopy[i].first.physReg().reg();
|
|
- unsigned size = parallelcopy[i].first.getTemp().size();
|
|
+ unsigned reg = parallelcopy[i].op.physReg().reg();
|
|
+ unsigned size = parallelcopy[i].op.getTemp().size();
|
|
sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
|
|
|
|
- reg = parallelcopy[i].second.physReg().reg();
|
|
- size = parallelcopy[i].second.getTemp().size();
|
|
+ reg = parallelcopy[i].def.physReg().reg();
|
|
+ size = parallelcopy[i].def.getTemp().size();
|
|
if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
|
|
sgpr_operands_alias_defs = true;
|
|
}
|
|
}
|
|
|
|
- pc->operands[i] = parallelcopy[i].first;
|
|
- pc->definitions[i] = parallelcopy[i].second;
|
|
+ pc->operands[i] = parallelcopy[i].op;
|
|
+ pc->definitions[i] = parallelcopy[i].def;
|
|
assert(pc->operands[i].size() == pc->definitions[i].size());
|
|
|
|
/* it might happen that the operand is already renamed. we have to restore the
|
|
* original name. */
|
|
auto it = ctx.orig_names.find(pc->operands[i].tempId());
|
|
Temp orig = it != ctx.orig_names.end() ? it->second : pc->operands[i].getTemp();
|
|
- add_rename(ctx, orig, pc->definitions[i].getTemp());
|
|
+ add_rename(ctx, orig, pc->definitions[i].getTemp(), !parallelcopy[i].skip_renaming);
|
|
}
|
|
|
|
if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) {
|
|
@@ -2982,18 +3050,18 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector<std::pair<Operand, Definiti
|
|
}
|
|
|
|
void
|
|
-emit_parallel_copy(ra_ctx& ctx, std::vector<std::pair<Operand, Definition>>& parallelcopy,
|
|
+emit_parallel_copy(ra_ctx& ctx, std::vector<parallelcopy>& parallelcopy,
|
|
aco_ptr<Instruction>& instr, std::vector<aco_ptr<Instruction>>& instructions,
|
|
bool temp_in_scc, RegisterFile& register_file)
|
|
{
|
|
if (parallelcopy.empty())
|
|
return;
|
|
|
|
- std::vector<std::pair<Operand, Definition>> linear_vgpr;
|
|
+ std::vector<struct parallelcopy> linear_vgpr;
|
|
if (ctx.num_linear_vgprs) {
|
|
unsigned next = 0;
|
|
for (unsigned i = 0; i < parallelcopy.size(); i++) {
|
|
- if (parallelcopy[i].first.regClass().is_linear_vgpr()) {
|
|
+ if (parallelcopy[i].def.regClass().is_linear_vgpr()) {
|
|
linear_vgpr.push_back(parallelcopy[i]);
|
|
continue;
|
|
}
|
|
@@ -3063,7 +3131,7 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
auto instr_it = std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi);
|
|
for (; instr_it != block.instructions.end(); ++instr_it) {
|
|
aco_ptr<Instruction>& instr = *instr_it;
|
|
- std::vector<std::pair<Operand, Definition>> parallelcopy;
|
|
+ std::vector<parallelcopy> parallelcopy;
|
|
bool temp_in_scc = register_file[scc];
|
|
|
|
if (instr->opcode == aco_opcode::p_branch) {
|
|
@@ -3084,7 +3152,6 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
/* rename operands */
|
|
operand.setTemp(read_variable(ctx, operand.getTemp(), block.index));
|
|
assert(ctx.assignments[operand.tempId()].assigned);
|
|
-
|
|
fixed |=
|
|
operand.isPrecolored() && ctx.assignments[operand.tempId()].reg != operand.physReg();
|
|
}
|
|
@@ -3101,8 +3168,9 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
}
|
|
}
|
|
|
|
- if (fixed)
|
|
+ if (fixed) {
|
|
handle_fixed_operands(ctx, register_file, parallelcopy, instr);
|
|
+ }
|
|
|
|
for (unsigned i = 0; i < instr->operands.size(); ++i) {
|
|
auto& operand = instr->operands[i];
|
|
@@ -3347,7 +3415,7 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
bool temp_in_scc =
|
|
register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc);
|
|
|
|
- std::vector<std::pair<Operand, Definition>> parallelcopy;
|
|
+ std::vector<parallelcopy> parallelcopy;
|
|
compact_linear_vgprs(ctx, register_file, parallelcopy);
|
|
update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops);
|
|
emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc, register_file);
|
|
--
GitLab

From 51acc061a662fc8fcc1e257a12346474af5912d6 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 24 Jun 2024 16:48:43 +0200
Subject: [PATCH 03/71] !29730

---
src/amd/compiler/aco_ir.h | 1 +
src/amd/compiler/aco_live_var_analysis.cpp | 50 +++++---
src/amd/compiler/aco_spill.cpp | 133 ++++++++++++++++++---
3 files changed, 151 insertions(+), 33 deletions(-)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
|
|
index 8a501797092ed..d838b728e19ce 100644
|
|
--- a/src/amd/compiler/aco_ir.h
|
|
+++ b/src/amd/compiler/aco_ir.h
|
|
@@ -2314,6 +2314,7 @@ int get_op_fixed_to_def(Instruction* instr);
|
|
/* utilities for dealing with register demand */
|
|
RegisterDemand get_live_changes(Instruction* instr);
|
|
RegisterDemand get_temp_registers(Instruction* instr);
|
|
+RegisterDemand get_temp_reg_changes(Instruction* instr);
|
|
|
|
/* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
|
|
uint16_t get_extra_sgprs(Program* program);
|
|
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
|
|
index 8744258a1b9aa..a635c94496143 100644
|
|
--- a/src/amd/compiler/aco_live_var_analysis.cpp
|
|
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
|
|
@@ -9,6 +9,29 @@
|
|
|
|
namespace aco {
|
|
|
|
+namespace {
|
|
+void
|
|
+get_temp_register_demand(Instruction* instr, RegisterDemand& demand_before, RegisterDemand& demand_after)
|
|
+{
|
|
+ for (Definition def : instr->definitions) {
|
|
+ if (def.isKill())
|
|
+ demand_after += def.getTemp();
|
|
+ else if (def.isTemp())
|
|
+ demand_before -= def.getTemp();
|
|
+ }
|
|
+
|
|
+ for (Operand op : instr->operands) {
|
|
+ if (op.isFirstKill() || op.isCopyKill()) {
|
|
+ demand_before += op.getTemp();
|
|
+ if (op.isLateKill())
|
|
+ demand_after += op.getTemp();
|
|
+ } else if (op.isClobbered() && !op.isKill()) {
|
|
+ demand_before += op.getTemp();
|
|
+ }
|
|
+ }
|
|
+}
|
|
+}
|
|
+
|
|
RegisterDemand
|
|
get_live_changes(Instruction* instr)
|
|
{
|
|
@@ -34,27 +57,22 @@ get_temp_registers(Instruction* instr)
|
|
RegisterDemand demand_before;
|
|
RegisterDemand demand_after;
|
|
|
|
- for (Definition def : instr->definitions) {
|
|
- if (def.isKill())
|
|
- demand_after += def.getTemp();
|
|
- else if (def.isTemp())
|
|
- demand_before -= def.getTemp();
|
|
- }
|
|
-
|
|
- for (Operand op : instr->operands) {
|
|
- if (op.isFirstKill() || op.isCopyKill()) {
|
|
- demand_before += op.getTemp();
|
|
- if (op.isLateKill())
|
|
- demand_after += op.getTemp();
|
|
- } else if (op.isClobbered() && !op.isKill()) {
|
|
- demand_before += op.getTemp();
|
|
- }
|
|
- }
|
|
+ get_temp_register_demand(instr, demand_before, demand_after);
|
|
|
|
demand_after.update(demand_before);
|
|
return demand_after;
|
|
}
|
|
|
|
+RegisterDemand get_temp_reg_changes(Instruction* instr)
|
|
+{
|
|
+ RegisterDemand demand_before;
|
|
+ RegisterDemand demand_after;
|
|
+
|
|
+ get_temp_register_demand(instr, demand_before, demand_after);
|
|
+
|
|
+ return demand_after - demand_before;
|
|
+}
|
|
+
|
|
namespace {
|
|
|
|
struct live_ctx {
|
|
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
|
|
index ae7ae16e3298b..be45b0eda7632 100644
|
|
--- a/src/amd/compiler/aco_spill.cpp
|
|
+++ b/src/amd/compiler/aco_spill.cpp
|
|
@@ -15,6 +15,7 @@
|
|
#include <algorithm>
|
|
#include <cstring>
|
|
#include <map>
|
|
+#include <optional>
|
|
#include <set>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
@@ -909,7 +910,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
|
|
/* the Operand is spilled: add it to reloads */
|
|
Temp new_tmp = ctx.program->allocateTmp(op.regClass());
|
|
ctx.renames[block_idx][op.getTemp()] = new_tmp;
|
|
- reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]);
|
|
+ reloads[op.getTemp()] = std::make_pair(new_tmp, current_spills[op.getTemp()]);
|
|
current_spills.erase(op.getTemp());
|
|
spilled_registers -= new_tmp;
|
|
}
|
|
@@ -917,13 +918,17 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
|
|
/* check if register demand is low enough during and after the current instruction */
|
|
if (block->register_demand.exceeds(ctx.target_pressure)) {
|
|
RegisterDemand new_demand = instr->register_demand;
|
|
+ std::optional<RegisterDemand> live_changes;
|
|
|
|
/* if reg pressure is too high, spill variable with furthest next use */
|
|
while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
|
|
float score = 0.0;
|
|
Temp to_spill = Temp();
|
|
+ unsigned operand_idx = -1u;
|
|
+ unsigned respill_slot = -1u;
|
|
unsigned do_rematerialize = 0;
|
|
unsigned avoid_respill = 0;
|
|
+
|
|
RegType type = RegType::sgpr;
|
|
if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr)
|
|
type = RegType::vgpr;
|
|
@@ -941,24 +946,68 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
|
|
|
|
if (can_rematerialize > do_rematerialize || loop_variable > avoid_respill ||
|
|
ctx.ssa_infos[t].score() > score) {
|
|
- /* Don't spill operands */
|
|
- if (std::any_of(instr->operands.begin(), instr->operands.end(),
|
|
- [&](Operand& op) { return op.isTemp() && op.getTemp() == var; }))
|
|
+ unsigned cur_operand_idx = -1u;
|
|
+ bool can_spill = true;
|
|
+ for (auto it = instr->operands.begin(); it != instr->operands.end(); ++it) {
|
|
+ if (!it->isTemp() || it->getTemp() != var)
|
|
+ continue;
|
|
+
|
|
+ /* Vectors with size >4 require a p_split_vector. When spilling an operand,
|
|
+ * the p_split_vector cannot kill the vector (because it's also an operand
|
|
+ * to the current instruction) and will therefore increase register demand
|
|
+ * instead of helping reduce it.
|
|
+ */
|
|
+ if (it->regClass().size() > 4) {
|
|
+ can_spill = false;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (!live_changes)
|
|
+ live_changes = get_temp_reg_changes(instr.get());
|
|
+
|
|
+ /* Don't spill operands if killing operands won't help with register pressure */
|
|
+ if ((type == RegType::sgpr && live_changes->sgpr < (int16_t)it->size()) ||
|
|
+ (type == RegType::vgpr && live_changes->vgpr < (int16_t)it->size())) {
|
|
+ can_spill = false;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ cur_operand_idx = it - instr->operands.begin();
|
|
+ if (it->isLateKill() || it->isKill())
|
|
+ can_spill = false;
|
|
+ break;
|
|
+ }
|
|
+ if (!can_spill)
|
|
continue;
|
|
|
|
+ bool is_spilled_operand = reloads.count(var);
|
|
+
|
|
to_spill = var;
|
|
score = ctx.ssa_infos[t].score();
|
|
do_rematerialize = can_rematerialize;
|
|
- avoid_respill = loop_variable;
|
|
+ avoid_respill = loop_variable || is_spilled_operand;
|
|
+ operand_idx = cur_operand_idx;
|
|
+
|
|
+ /* This variable is spilled at the loop-header of the current loop.
|
|
+ * Re-use the spill-slot in order to avoid an extra store.
|
|
+ */
|
|
+ if (loop_variable)
|
|
+ respill_slot = ctx.loop.back().spills[var];
|
|
+ else if (is_spilled_operand)
|
|
+ respill_slot = reloads[var].second;
|
|
}
|
|
}
|
|
assert(to_spill != Temp());
|
|
|
|
- if (avoid_respill) {
|
|
- /* This variable is spilled at the loop-header of the current loop.
|
|
- * Re-use the spill-slot in order to avoid an extra store.
|
|
+ if (operand_idx != -1u) {
|
|
+ /* We might not be able to spill all operands. Keep live_changes up-to-date so we
|
|
+ * stop when we spilled every operand we can.
|
|
*/
|
|
- current_spills[to_spill] = ctx.loop.back().spills[to_spill];
|
|
+ *live_changes -= instr->operands[operand_idx].getTemp();
|
|
+ }
|
|
+
|
|
+ if (avoid_respill) {
|
|
+ current_spills[to_spill] = respill_slot;
|
|
spilled_registers += to_spill;
|
|
continue;
|
|
}
|
|
@@ -1007,7 +1056,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
|
|
/* add reloads and instruction to new instructions */
|
|
for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads) {
|
|
aco_ptr<Instruction> reload =
|
|
- do_reload(ctx, pair.second.first, pair.first, pair.second.second);
|
|
+ do_reload(ctx, pair.first, pair.second.first, pair.second.second);
|
|
instructions.emplace_back(std::move(reload));
|
|
}
|
|
instructions.emplace_back(std::move(instr));
|
|
@@ -1227,7 +1276,7 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
|
|
assert(temp.type() == RegType::vgpr && !temp.is_linear());
|
|
|
|
Builder bld(ctx.program, &instructions);
|
|
- if (temp.size() > 1) {
|
|
+ if (temp.size() > 4) {
|
|
Instruction* split{
|
|
create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())};
|
|
split->operands[0] = Operand(temp);
|
|
@@ -1246,11 +1295,36 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& inst
|
|
instr->mubuf().cache.value = ac_swizzled;
|
|
}
|
|
}
|
|
- } else if (ctx.program->gfx_level >= GFX9) {
|
|
- bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, temp, offset,
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ aco_opcode opcode;
|
|
+ switch (temp.size()) {
|
|
+ case 4:
|
|
+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dwordx4
|
|
+ : aco_opcode::buffer_store_dwordx4;
|
|
+ break;
|
|
+ case 3:
|
|
+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dwordx3
|
|
+ : aco_opcode::buffer_store_dwordx3;
|
|
+ break;
|
|
+ case 2:
|
|
+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dwordx2
|
|
+ : aco_opcode::buffer_store_dwordx2;
|
|
+ break;
|
|
+ case 1:
|
|
+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dword
|
|
+ : aco_opcode::buffer_store_dword;
|
|
+ break;
|
|
+ default:
|
|
+ unreachable("Unhandled vector size!\n");
|
|
+ }
|
|
+
|
|
+ if (ctx.program->gfx_level >= GFX9) {
|
|
+ bld.scratch(opcode, Operand(v1), ctx.scratch_rsrc, temp, offset,
|
|
memory_sync_info(storage_vgpr_spill, semantic_private));
|
|
} else {
|
|
- Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1),
|
|
+ Instruction* instr = bld.mubuf(opcode, ctx.scratch_rsrc, Operand(v1),
|
|
scratch_offset, temp, offset, false);
|
|
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
|
|
instr->mubuf().cache.value = ac_swizzled;
|
|
@@ -1291,11 +1365,36 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector<aco_ptr<Instruction>>& ins
|
|
}
|
|
}
|
|
bld.insert(vec);
|
|
- } else if (ctx.program->gfx_level >= GFX9) {
|
|
- bld.scratch(aco_opcode::scratch_load_dword, def, Operand(v1), ctx.scratch_rsrc, offset,
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ aco_opcode opcode;
|
|
+ switch (def.size()) {
|
|
+ case 4:
|
|
+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dwordx4
|
|
+ : aco_opcode::buffer_load_dwordx4;
|
|
+ break;
|
|
+ case 3:
|
|
+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dwordx3
|
|
+ : aco_opcode::buffer_load_dwordx3;
|
|
+ break;
|
|
+ case 2:
|
|
+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dwordx2
|
|
+ : aco_opcode::buffer_load_dwordx2;
|
|
+ break;
|
|
+ case 1:
|
|
+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dword
|
|
+ : aco_opcode::buffer_load_dword;
|
|
+ break;
|
|
+ default:
|
|
+ unreachable("Unhandled vector size!\n");
|
|
+ }
|
|
+
|
|
+ if (ctx.program->gfx_level >= GFX9) {
|
|
+ bld.scratch(opcode, def, Operand(v1), ctx.scratch_rsrc, offset,
|
|
memory_sync_info(storage_vgpr_spill, semantic_private));
|
|
} else {
|
|
- Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc,
|
|
+ Instruction* instr = bld.mubuf(opcode, def, ctx.scratch_rsrc,
|
|
Operand(v1), scratch_offset, offset, false);
|
|
instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
|
|
instr->mubuf().cache.value = ac_swizzled;
|
|
--
GitLab

From a0276e8120c286a81006d1636f5e5e552c807d69 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 17 Jun 2024 12:55:48 +0200
Subject: [PATCH 04/71] !29577

---
src/compiler/nir/meson.build | 1 +
src/compiler/nir/nir.c | 7 +-
src/compiler/nir/nir.h | 35 ++-
src/compiler/nir/nir_builder.h | 22 ++
src/compiler/nir/nir_clone.c | 1 +
src/compiler/nir/nir_divergence_analysis.c | 31 ++-
src/compiler/nir/nir_functions.c | 5 +-
src/compiler/nir/nir_gather_info.c | 6 +-
src/compiler/nir/nir_inline_helpers.h | 2 +
src/compiler/nir/nir_lower_memory_model.c | 33 +--
src/compiler/nir/nir_metadata.c | 13 ++
src/compiler/nir/nir_opt_call.c | 259 +++++++++++++++++++++
src/compiler/nir/nir_print.c | 7 +
src/compiler/nir/nir_serialize.c | 11 +
src/compiler/nir/nir_sweep.c | 9 -
src/compiler/nir/nir_validate.c | 5 +
src/compiler/spirv/vtn_cfg.c | 3 +
17 files changed, 410 insertions(+), 40 deletions(-)
create mode 100644 src/compiler/nir/nir_opt_call.c

diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
|
|
index 514f5e0e1b7a1..2df6b28d73b39 100644
|
|
--- a/src/compiler/nir/meson.build
|
|
+++ b/src/compiler/nir/meson.build
|
|
@@ -219,6 +219,7 @@ files_libnir = files(
|
|
'nir_normalize_cubemap_coords.c',
|
|
'nir_opt_access.c',
|
|
'nir_opt_barriers.c',
|
|
+ 'nir_opt_call.c',
|
|
'nir_opt_combine_stores.c',
|
|
'nir_opt_comparison_pre.c',
|
|
'nir_opt_conditional_discard.c',
|
|
diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
|
|
index 513fd04f36f99..0b2736e4a0308 100644
|
|
--- a/src/compiler/nir/nir.c
|
|
+++ b/src/compiler/nir/nir.c
|
|
@@ -502,6 +502,7 @@ nir_function_create(nir_shader *shader, const char *name)
|
|
func->is_preamble = false;
|
|
func->dont_inline = false;
|
|
func->should_inline = false;
|
|
+ func->driver_attributes = 0;
|
|
func->is_subroutine = false;
|
|
func->is_tmp_globals_wrapper = false;
|
|
func->subroutine_index = 0;
|
|
@@ -1584,8 +1585,8 @@ nir_def_rewrite_uses_src(nir_def *def, nir_src new_src)
|
|
nir_def_rewrite_uses(def, new_src.ssa);
|
|
}
|
|
|
|
-static bool
|
|
-is_instr_between(nir_instr *start, nir_instr *end, nir_instr *between)
|
|
+bool
|
|
+nir_instr_is_between(nir_instr *start, nir_instr *end, nir_instr *between)
|
|
{
|
|
assert(start->block == end->block);
|
|
|
|
@@ -1629,7 +1630,7 @@ nir_def_rewrite_uses_after(nir_def *def, nir_def *new_ssa,
|
|
* not be dominated by after_me is if it is between def and after_me in
|
|
* the instruction list.
|
|
*/
|
|
- if (is_instr_between(def->parent_instr, after_me, nir_src_parent_instr(use_src)))
|
|
+ if (nir_instr_is_between(def->parent_instr, after_me, nir_src_parent_instr(use_src)))
|
|
continue;
|
|
}
|
|
|
|
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
|
|
index 7a781b7fefb4e..10a592f4b87bb 100644
|
|
--- a/src/compiler/nir/nir.h
|
|
+++ b/src/compiler/nir/nir.h
|
|
@@ -1915,6 +1915,10 @@ typedef struct {
|
|
nir_instr instr;
|
|
|
|
struct nir_function *callee;
|
|
+ /* If this function call is indirect, the function pointer to call.
|
|
+ * Otherwise, null initialized.
|
|
+ */
|
|
+ nir_src indirect_callee;
|
|
|
|
unsigned num_params;
|
|
nir_src params[];
|
|
@@ -3646,13 +3650,28 @@ typedef struct {
|
|
uint8_t num_components;
|
|
uint8_t bit_size;
|
|
|
|
- /* True if this paramater is actually the function return variable */
|
|
+ /* True if this parameter is a deref used for returning values */
|
|
bool is_return;
|
|
|
|
bool implicit_conversion_prohibited;
|
|
|
|
+ /* True if this parameter is not divergent. This is inverted to make
|
|
+ * parameters divergent by default unless explicitly specified
|
|
+ * otherwise.
|
|
+ */
|
|
+ bool is_uniform;
|
|
+
|
|
nir_variable_mode mode;
|
|
|
|
+ /* Drivers may optionally stash flags here describing the parameter.
|
|
+ * For example, this might encode whether the driver expects the value
|
|
+ * to be uniform or divergent, if the driver handles divergent parameters
|
|
+ * differently from uniform ones.
|
|
+ *
|
|
+ * NIR will preserve this value but does not interpret it in any way.
|
|
+ */
|
|
+ uint32_t driver_attributes;
|
|
+
|
|
/* The type of the function param */
|
|
const struct glsl_type *type;
|
|
} nir_parameter;
|
|
@@ -3675,6 +3694,14 @@ typedef struct nir_function {
|
|
*/
|
|
nir_function_impl *impl;
|
|
|
|
+ /* Drivers may optionally stash flags here describing the function call.
|
|
+ * For example, this might encode the ABI used for the call if a driver
|
|
+ * supports multiple ABIs.
|
|
+ *
|
|
+ * NIR will preserve this value but does not interpret it in any way.
|
|
+ */
|
|
+ uint32_t driver_attributes;
|
|
+
|
|
bool is_entrypoint;
|
|
/* from SPIR-V linkage, only for libraries */
|
|
bool is_exported;
|
|
@@ -5053,6 +5080,8 @@ void nir_instr_clear_src(nir_instr *instr, nir_src *src);
|
|
|
|
void nir_instr_move_src(nir_instr *dest_instr, nir_src *dest, nir_src *src);
|
|
|
|
+bool nir_instr_is_between(nir_instr *start, nir_instr *end, nir_instr *between);
|
|
+
|
|
void nir_def_init(nir_instr *instr, nir_def *def,
|
|
unsigned num_components, unsigned bit_size);
|
|
static inline void
|
|
@@ -6789,6 +6818,10 @@ bool nir_opt_combine_barriers(nir_shader *shader,
|
|
void *data);
|
|
bool nir_opt_barrier_modes(nir_shader *shader);
|
|
|
|
+typedef bool (*can_remat_cb)(nir_instr *instr);
|
|
+
|
|
+bool nir_minimize_call_live_states(nir_shader *shader);
|
|
+
|
|
bool nir_opt_combine_stores(nir_shader *shader, nir_variable_mode modes);
|
|
|
|
bool nir_copy_prop_impl(nir_function_impl *impl);
|
|
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
|
|
index 5e07f588b4a5b..97a12e8c9ffc4 100644
|
|
--- a/src/compiler/nir/nir_builder.h
|
|
+++ b/src/compiler/nir/nir_builder.h
|
|
@@ -2218,6 +2218,22 @@ nir_build_call(nir_builder *build, nir_function *func, size_t count,
|
|
nir_builder_instr_insert(build, &call->instr);
|
|
}
|
|
|
|
+static inline void
|
|
+nir_build_indirect_call(nir_builder *build, nir_function *func, nir_def *callee,
|
|
+ size_t count, nir_def **args)
|
|
+{
|
|
+ assert(count == func->num_params && "parameter count must match");
|
|
+ assert(!func->impl && "cannot call directly defined functions indirectly");
|
|
+ nir_call_instr *call = nir_call_instr_create(build->shader, func);
|
|
+
|
|
+ for (unsigned i = 0; i < func->num_params; ++i) {
|
|
+ call->params[i] = nir_src_for_ssa(args[i]);
|
|
+ }
|
|
+ call->indirect_callee = nir_src_for_ssa(callee);
|
|
+
|
|
+ nir_builder_instr_insert(build, &call->instr);
|
|
+}
|
|
+
|
|
static inline void
|
|
nir_discard(nir_builder *build)
|
|
{
|
|
@@ -2251,6 +2267,12 @@ nir_build_string(nir_builder *build, const char *value);
|
|
nir_build_call(build, func, ARRAY_SIZE(args), args); \
|
|
} while (0)
|
|
|
|
+#define nir_call_indirect(build, func, callee, ...) \
|
|
+ do { \
|
|
+ nir_def *_args[] = { __VA_ARGS__ }; \
|
|
+ nir_build_indirect_call(build, func, callee, ARRAY_SIZE(_args), _args); \
|
|
+ } while (0)
|
|
+
|
|
nir_def *
|
|
nir_compare_func(nir_builder *b, enum compare_func func,
|
|
nir_def *src0, nir_def *src1);
|
|
diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c
|
|
index a8359fcd8da76..0bfd9623686ec 100644
|
|
--- a/src/compiler/nir/nir_clone.c
|
|
+++ b/src/compiler/nir/nir_clone.c
|
|
@@ -714,6 +714,7 @@ nir_function_clone(nir_shader *ns, const nir_function *fxn)
|
|
nfxn->should_inline = fxn->should_inline;
|
|
nfxn->dont_inline = fxn->dont_inline;
|
|
nfxn->is_subroutine = fxn->is_subroutine;
|
|
+ nfxn->driver_attributes = fxn->driver_attributes;
|
|
nfxn->is_tmp_globals_wrapper = fxn->is_tmp_globals_wrapper;
|
|
nfxn->num_subroutine_types = fxn->num_subroutine_types;
|
|
nfxn->subroutine_index = fxn->subroutine_index;
|
|
diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
|
|
index 7c1b94ea81eb2..183b3bc81820d 100644
|
|
--- a/src/compiler/nir/nir_divergence_analysis.c
|
|
+++ b/src/compiler/nir/nir_divergence_analysis.c
|
|
@@ -39,6 +39,7 @@
|
|
struct divergence_state {
|
|
const gl_shader_stage stage;
|
|
nir_shader *shader;
|
|
+ nir_function_impl *impl;
|
|
nir_divergence_options options;
|
|
nir_loop *loop;
|
|
|
|
@@ -713,11 +714,15 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|
src_divergent(instr->src[1], state);
|
|
break;
|
|
|
|
+ case nir_intrinsic_load_param:
|
|
+ is_divergent =
|
|
+ !state->impl->function->params[nir_intrinsic_param_idx(instr)].is_uniform;
|
|
+ break;
|
|
+
|
|
/* Intrinsics which are always divergent */
|
|
case nir_intrinsic_inverse_ballot:
|
|
case nir_intrinsic_load_color0:
|
|
case nir_intrinsic_load_color1:
|
|
- case nir_intrinsic_load_param:
|
|
case nir_intrinsic_load_sample_id:
|
|
case nir_intrinsic_load_sample_id_no_per_sample:
|
|
case nir_intrinsic_load_sample_mask_in:
|
|
@@ -1089,8 +1094,9 @@ instr_is_loop_invariant(nir_instr *instr, struct divergence_state *state)
|
|
case nir_instr_type_deref:
|
|
case nir_instr_type_tex:
|
|
return nir_foreach_src(instr, src_invariant, state->loop);
|
|
- case nir_instr_type_phi:
|
|
case nir_instr_type_call:
|
|
+ return false;
|
|
+ case nir_instr_type_phi:
|
|
case nir_instr_type_parallel_copy:
|
|
default:
|
|
unreachable("NIR divergence analysis: Unsupported instruction type.");
|
|
@@ -1115,9 +1121,10 @@ update_instr_divergence(nir_instr *instr, struct divergence_state *state)
|
|
return visit_deref(state->shader, nir_instr_as_deref(instr), state);
|
|
case nir_instr_type_debug_info:
|
|
return false;
|
|
+ case nir_instr_type_call:
|
|
+ return false;
|
|
case nir_instr_type_jump:
|
|
case nir_instr_type_phi:
|
|
- case nir_instr_type_call:
|
|
case nir_instr_type_parallel_copy:
|
|
default:
|
|
unreachable("NIR divergence analysis: Unsupported instruction type.");
|
|
@@ -1405,6 +1412,7 @@ nir_divergence_analysis_impl(nir_function_impl *impl, nir_divergence_options opt
|
|
struct divergence_state state = {
|
|
.stage = impl->function->shader->info.stage,
|
|
.shader = impl->function->shader,
|
|
+ .impl = impl,
|
|
.options = options,
|
|
.loop = NULL,
|
|
.divergent_loop_cf = false,
|
|
@@ -1422,8 +1430,10 @@ void
|
|
nir_divergence_analysis(nir_shader *shader)
|
|
{
|
|
shader->info.divergence_analysis_run = true;
|
|
- nir_divergence_analysis_impl(nir_shader_get_entrypoint(shader),
|
|
- shader->options->divergence_analysis_options);
|
|
+ nir_foreach_function_impl(impl, shader) {
|
|
+ nir_divergence_analysis_impl(impl,
|
|
+ shader->options->divergence_analysis_options);
|
|
+ }
|
|
}
|
|
|
|
/* Compute divergence between vertices of the same primitive. This uses
|
|
@@ -1444,10 +1454,13 @@ nir_vertex_divergence_analysis(nir_shader *shader)
|
|
.first_visit = true,
|
|
};
|
|
|
|
- nir_metadata_require(nir_shader_get_entrypoint(shader),
|
|
- nir_metadata_block_index);
|
|
- visit_cf_list(&nir_shader_get_entrypoint(shader)->body, &state);
|
|
- nir_metadata_preserve(nir_shader_get_entrypoint(shader), nir_metadata_all);
|
|
+ nir_foreach_function_impl(impl, shader) {
|
|
+ state.first_visit = true;
|
|
+ state.impl = impl;
|
|
+ nir_metadata_require(impl, nir_metadata_block_index);
|
|
+ visit_cf_list(&impl->body, &state);
|
|
+ nir_metadata_preserve(impl, nir_metadata_all);
|
|
+ }
|
|
}
|
|
|
|
bool
|
|
diff --git a/src/compiler/nir/nir_functions.c b/src/compiler/nir/nir_functions.c
|
|
index 3ad986f697905..355161cf1b40d 100644
|
|
--- a/src/compiler/nir/nir_functions.c
|
|
+++ b/src/compiler/nir/nir_functions.c
|
|
@@ -194,7 +194,10 @@ static bool inline_functions_pass(nir_builder *b,
|
|
return false;
|
|
|
|
nir_call_instr *call = nir_instr_as_call(instr);
|
|
- assert(call->callee->impl);
|
|
+ if (!call->callee->impl)
|
|
+ return false;
|
|
+
|
|
+ assert(!call->indirect_callee.ssa);
|
|
|
|
if (b->shader->options->driver_functions &&
|
|
b->shader->info.stage == MESA_SHADER_KERNEL) {
|
|
diff --git a/src/compiler/nir/nir_gather_info.c b/src/compiler/nir/nir_gather_info.c
|
|
index a5932cf3b3082..9af452acfb546 100644
|
|
--- a/src/compiler/nir/nir_gather_info.c
|
|
+++ b/src/compiler/nir/nir_gather_info.c
|
|
@@ -954,8 +954,10 @@ gather_func_info(nir_function_impl *func, nir_shader *shader,
|
|
nir_call_instr *call = nir_instr_as_call(instr);
|
|
nir_function_impl *impl = call->callee->impl;
|
|
|
|
- assert(impl || !"nir_shader_gather_info only works with linked shaders");
|
|
- gather_func_info(impl, shader, visited_funcs, dead_ctx);
|
|
+ if (!call->indirect_callee.ssa)
|
|
+ assert(impl || !"nir_shader_gather_info only works with linked shaders");
|
|
+ if (impl)
|
|
+ gather_func_info(impl, shader, visited_funcs, dead_ctx);
|
|
break;
|
|
}
|
|
default:
|
|
diff --git a/src/compiler/nir/nir_inline_helpers.h b/src/compiler/nir/nir_inline_helpers.h
|
|
index 8f3994f5353d6..17f2581cceee1 100644
|
|
--- a/src/compiler/nir/nir_inline_helpers.h
|
|
+++ b/src/compiler/nir/nir_inline_helpers.h
|
|
@@ -107,6 +107,8 @@ nir_foreach_src(nir_instr *instr, nir_foreach_src_cb cb, void *state)
|
|
}
|
|
case nir_instr_type_call: {
|
|
nir_call_instr *call = nir_instr_as_call(instr);
|
|
+ if (call->indirect_callee.ssa && !_nir_visit_src(&call->indirect_callee, cb, state))
|
|
+ return false;
|
|
for (unsigned i = 0; i < call->num_params; i++) {
|
|
if (!_nir_visit_src(&call->params[i], cb, state))
|
|
return false;
|
|
diff --git a/src/compiler/nir/nir_lower_memory_model.c b/src/compiler/nir/nir_lower_memory_model.c
|
|
index 95d9f4e9526dc..c797eae8a4eb5 100644
|
|
--- a/src/compiler/nir/nir_lower_memory_model.c
|
|
+++ b/src/compiler/nir/nir_lower_memory_model.c
|
|
@@ -229,21 +229,24 @@ nir_lower_memory_model(nir_shader *shader)
|
|
{
|
|
bool progress = false;
|
|
|
|
- nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
|
- struct exec_list *cf_list = &impl->body;
|
|
-
|
|
- uint32_t modes = 0;
|
|
- foreach_list_typed(nir_cf_node, cf_node, node, cf_list)
|
|
- progress |= lower_make_visible(cf_node, &modes);
|
|
-
|
|
- modes = 0;
|
|
- foreach_list_typed_reverse(nir_cf_node, cf_node, node, cf_list)
|
|
- progress |= lower_make_available(cf_node, &modes);
|
|
-
|
|
- if (progress)
|
|
- nir_metadata_preserve(impl, nir_metadata_control_flow);
|
|
- else
|
|
- nir_metadata_preserve(impl, nir_metadata_all);
|
|
+ nir_foreach_function_impl(impl, shader) {
|
|
+ bool impl_progress = false;
|
|
+ struct exec_list *cf_list = &impl->body;
|
|
+
|
|
+ uint32_t modes = 0;
|
|
+ foreach_list_typed(nir_cf_node, cf_node, node, cf_list)
|
|
+ impl_progress |= lower_make_visible(cf_node, &modes);
|
|
+
|
|
+ modes = 0;
|
|
+ foreach_list_typed_reverse(nir_cf_node, cf_node, node, cf_list)
|
|
+ impl_progress |= lower_make_available(cf_node, &modes);
|
|
+
|
|
+ if (impl_progress)
|
|
+ nir_metadata_preserve(impl, nir_metadata_control_flow);
|
|
+ else
|
|
+ nir_metadata_preserve(impl, nir_metadata_all);
|
|
+ progress |= impl_progress;
|
|
+ }
|
|
|
|
return progress;
|
|
}
|
|
diff --git a/src/compiler/nir/nir_metadata.c b/src/compiler/nir/nir_metadata.c
|
|
index e0085991bbc06..29e2ceaa499d1 100644
|
|
--- a/src/compiler/nir/nir_metadata.c
|
|
+++ b/src/compiler/nir/nir_metadata.c
|
|
@@ -61,6 +61,19 @@ nir_metadata_require(nir_function_impl *impl, nir_metadata required, ...)
|
|
void
|
|
nir_metadata_preserve(nir_function_impl *impl, nir_metadata preserved)
|
|
{
|
|
+ /* If we discard valid liveness information, immediately free the
|
|
+ * liveness information for each block. For large shaders, it can
|
|
+ * consume a huge amount of memory, and it's usually not immediately
|
|
+ * needed after dirtying.
|
|
+ */
|
|
+ if ((impl->valid_metadata & ~preserved) & nir_metadata_live_defs) {
|
|
+ nir_foreach_block(block, impl) {
|
|
+ ralloc_free(block->live_in);
|
|
+ ralloc_free(block->live_out);
|
|
+ block->live_in = block->live_out = NULL;
|
|
+ }
|
|
+ }
|
|
+
|
|
impl->valid_metadata &= preserved;
|
|
}
|
|
|
|
diff --git a/src/compiler/nir/nir_opt_call.c b/src/compiler/nir/nir_opt_call.c
|
|
new file mode 100644
|
|
index 0000000000000..421f78096042a
|
|
--- /dev/null
|
|
+++ b/src/compiler/nir/nir_opt_call.c
|
|
@@ -0,0 +1,259 @@
|
|
+/*
|
|
+ * Copyright © 2024 Valve Corporation
|
|
+ * SPDX-License-Identifier: MIT
|
|
+ */
|
|
+
|
|
+#include "nir.h"
|
|
+#include "nir_builder.h"
|
|
+#include "nir_phi_builder.h"
|
|
+
|
|
+struct call_liveness_entry {
|
|
+ struct list_head list;
|
|
+ nir_call_instr *instr;
|
|
+ const BITSET_WORD *live_set;
|
|
+};
|
|
+
|
|
+static bool
|
|
+can_remat_instr(nir_instr *instr)
|
|
+{
|
|
+ switch (instr->type) {
|
|
+ case nir_instr_type_alu:
|
|
+ case nir_instr_type_load_const:
|
|
+ case nir_instr_type_undef:
|
|
+ return true;
|
|
+ case nir_instr_type_intrinsic:
|
|
+ switch (nir_instr_as_intrinsic(instr)->intrinsic) {
|
|
+ case nir_intrinsic_load_ray_launch_id:
|
|
+ case nir_intrinsic_load_ray_launch_size:
|
|
+ case nir_intrinsic_vulkan_resource_index:
|
|
+ case nir_intrinsic_vulkan_resource_reindex:
|
|
+ case nir_intrinsic_load_vulkan_descriptor:
|
|
+ case nir_intrinsic_load_push_constant:
|
|
+ case nir_intrinsic_load_global_constant:
|
|
+ case nir_intrinsic_load_smem_amd:
|
|
+ case nir_intrinsic_load_scalar_arg_amd:
|
|
+ case nir_intrinsic_load_vector_arg_amd:
|
|
+ return true;
|
|
+ default:
|
|
+ return false;
|
|
+ }
|
|
+ default:
|
|
+ return false;
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+remat_ssa_def(nir_builder *b, nir_def *def, struct hash_table *remap_table,
|
|
+ struct hash_table *phi_value_table, struct nir_phi_builder *phi_builder,
|
|
+ BITSET_WORD *def_blocks)
|
|
+{
|
|
+ memset(def_blocks, 0, BITSET_WORDS(b->impl->num_blocks) * sizeof(BITSET_WORD));
|
|
+ BITSET_SET(def_blocks, def->parent_instr->block->index);
|
|
+ BITSET_SET(def_blocks, nir_cursor_current_block(b->cursor)->index);
|
|
+ struct nir_phi_builder_value *val = nir_phi_builder_add_value(phi_builder, def->num_components, def->bit_size, def_blocks);
|
|
+ _mesa_hash_table_insert(phi_value_table, def, val);
|
|
+
|
|
+ nir_instr *clone = nir_instr_clone_deep(b->shader, def->parent_instr, remap_table);
|
|
+ nir_builder_instr_insert(b, clone);
|
|
+ nir_def *new_def = nir_instr_def(clone);
|
|
+
|
|
+ _mesa_hash_table_insert(remap_table, def, new_def);
|
|
+ if (nir_cursor_current_block(b->cursor)->index != def->parent_instr->block->index)
|
|
+ nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def);
|
|
+ nir_phi_builder_value_set_block_def(val, nir_cursor_current_block(b->cursor), new_def);
|
|
+}
|
|
+
|
|
+struct remat_chain_check_data {
|
|
+ struct hash_table *remap_table;
|
|
+ unsigned chain_length;
|
|
+};
|
|
+
|
|
+static bool
|
|
+can_remat_chain(nir_src *src, void *data)
|
|
+{
|
|
+ struct remat_chain_check_data *check_data = data;
|
|
+
|
|
+ if (_mesa_hash_table_search(check_data->remap_table, src->ssa))
|
|
+ return true;
|
|
+
|
|
+ if (!can_remat_instr(src->ssa->parent_instr))
|
|
+ return false;
|
|
+
|
|
+ if (check_data->chain_length++ >= 16)
|
|
+ return false;
|
|
+
|
|
+ return nir_foreach_src(src->ssa->parent_instr, can_remat_chain, check_data);
|
|
+}
|
|
+
|
|
+struct remat_chain_data {
|
|
+ nir_builder *b;
|
|
+ struct hash_table *remap_table;
|
|
+ struct hash_table *phi_value_table;
|
|
+ struct nir_phi_builder *phi_builder;
|
|
+ BITSET_WORD *def_blocks;
|
|
+};
|
|
+
|
|
+static bool
|
|
+do_remat_chain(nir_src *src, void *data)
|
|
+{
|
|
+ struct remat_chain_data *remat_data = data;
|
|
+
|
|
+ if (_mesa_hash_table_search(remat_data->remap_table, src->ssa))
|
|
+ return true;
|
|
+
|
|
+ nir_foreach_src(src->ssa->parent_instr, do_remat_chain, remat_data);
|
|
+
|
|
+ remat_ssa_def(remat_data->b, src->ssa, remat_data->remap_table, remat_data->phi_value_table, remat_data->phi_builder, remat_data->def_blocks);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+struct src_rewrite_ctx {
|
|
+ struct hash_table *phi_value_table;
|
|
+ nir_instr *instr;
|
|
+};
|
|
+
|
|
+static bool
|
|
+rewrite_instr_src_from_phi_builder(nir_src *src, void *data)
|
|
+{
|
|
+ struct src_rewrite_ctx *ctx = data;
|
|
+
|
|
+ if (nir_src_is_const(*src)) {
|
|
+ nir_builder b = nir_builder_at(nir_before_instr(ctx->instr));
|
|
+ nir_src_rewrite(src, nir_build_imm(&b, src->ssa->num_components, src->ssa->bit_size, nir_src_as_const_value(*src)));
|
|
+ return true;
|
|
+ }
|
|
+
|
|
+ struct hash_entry *entry = _mesa_hash_table_search(ctx->phi_value_table, src->ssa);
|
|
+ if (!entry)
|
|
+ return true;
|
|
+
|
|
+ nir_block *block = nir_src_parent_instr(src)->block;
|
|
+ nir_def *new_def = nir_phi_builder_value_get_block_def(entry->data, block);
|
|
+
|
|
+ bool can_rewrite = true;
|
|
+ if (new_def->parent_instr->block == block && new_def->index != UINT32_MAX)
|
|
+ can_rewrite = nir_src_parent_instr(src) != nir_block_first_instr(block) &&
|
|
+ !nir_instr_is_between(nir_block_first_instr(block),
|
|
+ new_def->parent_instr,
|
|
+ nir_src_parent_instr(src));
|
|
+
|
|
+ if (can_rewrite)
|
|
+ nir_src_rewrite(src, new_def);
|
|
+ return true;
|
|
+}
|
|
+
|
|
+static bool
|
|
+nir_minimize_call_live_states_impl(nir_function_impl *impl)
|
|
+{
|
|
+ nir_metadata_require(impl, nir_metadata_block_index | nir_metadata_live_defs | nir_metadata_dominance);
|
|
+ bool progress = false;
|
|
+ void *mem_ctx = ralloc_context(NULL);
|
|
+
|
|
+ struct list_head call_list;
|
|
+ list_inithead(&call_list);
|
|
+ unsigned num_defs = impl->ssa_alloc;
|
|
+
|
|
+ nir_def **rematerializable = rzalloc_array_size(mem_ctx, sizeof(nir_def *), num_defs);
|
|
+
|
|
+ nir_foreach_block(block, impl) {
|
|
+ nir_foreach_instr(instr, block) {
|
|
+ nir_def *def = nir_instr_def(instr);
|
|
+ if (def &&
|
|
+ can_remat_instr(instr)) {
|
|
+ rematerializable[def->index] = def;
|
|
+ }
|
|
+
|
|
+ if (instr->type != nir_instr_type_call)
|
|
+ continue;
|
|
+ nir_call_instr *call = nir_instr_as_call(instr);
|
|
+ if (!call->indirect_callee.ssa)
|
|
+ continue;
|
|
+
|
|
+ struct call_liveness_entry *entry = ralloc_size(mem_ctx, sizeof(struct call_liveness_entry));
|
|
+ entry->instr = call;
|
|
+ entry->live_set = nir_get_live_defs(nir_after_instr(instr), mem_ctx);
|
|
+ list_addtail(&entry->list, &call_list);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ const unsigned block_words = BITSET_WORDS(impl->num_blocks);
|
|
+ BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words);
|
|
+
|
|
+ list_for_each_entry(struct call_liveness_entry, entry, &call_list, list) {
|
|
+ unsigned i;
|
|
+
|
|
+ nir_builder b = nir_builder_at(nir_after_instr(&entry->instr->instr));
|
|
+
|
|
+ struct nir_phi_builder *builder = nir_phi_builder_create(impl);
|
|
+ struct hash_table *phi_value_table =
|
|
+ _mesa_pointer_hash_table_create(mem_ctx);
|
|
+ struct hash_table *remap_table =
|
|
+ _mesa_pointer_hash_table_create(mem_ctx);
|
|
+
|
|
+ BITSET_FOREACH_SET(i, entry->live_set, num_defs) {
|
|
+ if (!rematerializable[i] || _mesa_hash_table_search(remap_table, rematerializable[i]))
|
|
+ continue;
|
|
+
|
|
+ progress = true;
|
|
+ assert(!_mesa_hash_table_search(phi_value_table, rematerializable[i]));
|
|
+
|
|
+ struct remat_chain_check_data check_data = {
|
|
+ .remap_table = remap_table,
|
|
+ .chain_length = 1,
|
|
+ };
|
|
+
|
|
+ if (!nir_foreach_src(rematerializable[i]->parent_instr, can_remat_chain, &check_data))
|
|
+ continue;
|
|
+
|
|
+ struct remat_chain_data remat_data = {
|
|
+ .b = &b,
|
|
+ .remap_table = remap_table,
|
|
+ .phi_value_table = phi_value_table,
|
|
+ .phi_builder = builder,
|
|
+ .def_blocks = def_blocks,
|
|
+ };
|
|
+
|
|
+ nir_foreach_src(rematerializable[i]->parent_instr, do_remat_chain, &remat_data);
|
|
+
|
|
+ remat_ssa_def(&b, rematerializable[i], remap_table, phi_value_table, builder, def_blocks);
|
|
+ }
|
|
+ _mesa_hash_table_destroy(remap_table, NULL);
|
|
+
|
|
+ nir_foreach_block(block, impl) {
|
|
+ nir_foreach_instr(instr, block) {
|
|
+ if (instr->type == nir_instr_type_phi)
|
|
+ continue;
|
|
+
|
|
+ struct src_rewrite_ctx ctx = {
|
|
+ .phi_value_table = phi_value_table,
|
|
+ .instr = instr,
|
|
+ };
|
|
+ nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &ctx);
|
|
+ }
|
|
+ }
|
|
+
|
|
+ nir_phi_builder_finish(builder);
|
|
+ _mesa_hash_table_destroy(phi_value_table, NULL);
|
|
+ }
|
|
+
|
|
+ ralloc_free(mem_ctx);
|
|
+
|
|
+ nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
|
|
+ return progress;
|
|
+}
|
|
+
|
|
+/* Tries to rematerialize as many live vars as possible after calls.
|
|
+ * Note: nir_opt_cse will undo any rematerializations done by this pass,
|
|
+ * so it shouldn't be run afterward.
|
|
+ */
|
|
+bool
|
|
+nir_minimize_call_live_states(nir_shader *shader)
|
|
+{
|
|
+ bool progress = false;
|
|
+
|
|
+ nir_foreach_function_impl(impl, shader) {
|
|
+ progress |= nir_minimize_call_live_states_impl(impl);
|
|
+ }
|
|
+
|
|
+ return progress;
|
|
+}
|
|
\ No newline at end of file
|
|
diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c
|
|
index 41f3eae83fe7d..ff90a20320268 100644
|
|
--- a/src/compiler/nir/nir_print.c
|
|
+++ b/src/compiler/nir/nir_print.c
|
|
@@ -1884,7 +1884,14 @@ print_call_instr(nir_call_instr *instr, print_state *state)
|
|
|
|
print_no_dest_padding(state);
|
|
|
|
+ bool indirect = instr->indirect_callee.ssa;
|
|
+
|
|
fprintf(fp, "call %s ", instr->callee->name);
|
|
+ if (indirect) {
|
|
+ fprintf(fp, "(indirect ");
|
|
+ print_src(&instr->indirect_callee, state, nir_type_invalid);
|
|
+ fprintf(fp, ") ");
|
|
+ }
|
|
|
|
for (unsigned i = 0; i < instr->num_params; i++) {
|
|
if (i != 0)
|
|
diff --git a/src/compiler/nir/nir_serialize.c b/src/compiler/nir/nir_serialize.c
|
|
index 2735683dd083f..ad77c88a47840 100644
|
|
--- a/src/compiler/nir/nir_serialize.c
|
|
+++ b/src/compiler/nir/nir_serialize.c
|
|
@@ -1975,6 +1975,8 @@ write_function(write_ctx *ctx, const nir_function *fxn)
|
|
if (fxn->name)
|
|
blob_write_string(ctx->blob, fxn->name);
|
|
|
|
+ blob_write_uint32(ctx->blob, fxn->driver_attributes);
|
|
+
|
|
blob_write_uint32(ctx->blob, fxn->subroutine_index);
|
|
blob_write_uint32(ctx->blob, fxn->num_subroutine_types);
|
|
for (unsigned i = 0; i < fxn->num_subroutine_types; i++) {
|
|
@@ -1988,9 +1990,14 @@ write_function(write_ctx *ctx, const nir_function *fxn)
|
|
uint32_t val =
|
|
((uint32_t)fxn->params[i].num_components) |
|
|
((uint32_t)fxn->params[i].bit_size) << 8;
|
|
+ if (fxn->params[i].is_return)
|
|
+ val |= (1u << 16);
|
|
+ if (fxn->params[i].is_uniform)
|
|
+ val |= (1u << 17);
|
|
blob_write_uint32(ctx->blob, val);
|
|
encode_type_to_blob(ctx->blob, fxn->params[i].type);
|
|
blob_write_uint32(ctx->blob, encode_deref_modes(fxn->params[i].mode));
|
|
+ blob_write_uint32(ctx->blob, fxn->params[i].driver_attributes);
|
|
}
|
|
|
|
/* At first glance, it looks like we should write the function_impl here.
|
|
@@ -2010,6 +2017,7 @@ read_function(read_ctx *ctx)
|
|
|
|
nir_function *fxn = nir_function_create(ctx->nir, name);
|
|
|
|
+ fxn->driver_attributes = blob_read_uint32(ctx->blob);
|
|
fxn->subroutine_index = blob_read_uint32(ctx->blob);
|
|
fxn->num_subroutine_types = blob_read_uint32(ctx->blob);
|
|
for (unsigned i = 0; i < fxn->num_subroutine_types; i++) {
|
|
@@ -2024,8 +2032,11 @@ read_function(read_ctx *ctx)
|
|
uint32_t val = blob_read_uint32(ctx->blob);
|
|
fxn->params[i].num_components = val & 0xff;
|
|
fxn->params[i].bit_size = (val >> 8) & 0xff;
|
|
+ fxn->params[i].is_return = val & (1u << 16);
|
|
+ fxn->params[i].is_uniform = val & (1u << 17);
|
|
fxn->params[i].type = decode_type_from_blob(ctx->blob);
|
|
fxn->params[i].mode = decode_deref_modes(blob_read_uint32(ctx->blob));
|
|
+ fxn->params[i].driver_attributes = blob_read_uint32(ctx->blob);
|
|
}
|
|
|
|
fxn->is_entrypoint = flags & 0x1;
|
|
diff --git a/src/compiler/nir/nir_sweep.c b/src/compiler/nir/nir_sweep.c
|
|
index 9acd60a60b875..009343c3cf957 100644
|
|
--- a/src/compiler/nir/nir_sweep.c
|
|
+++ b/src/compiler/nir/nir_sweep.c
|
|
@@ -47,15 +47,6 @@ sweep_block(nir_shader *nir, nir_block *block)
|
|
{
|
|
ralloc_steal(nir, block);
|
|
|
|
- /* sweep_impl will mark all metadata invalid. We can safely release all of
|
|
- * this here.
|
|
- */
|
|
- ralloc_free(block->live_in);
|
|
- block->live_in = NULL;
|
|
-
|
|
- ralloc_free(block->live_out);
|
|
- block->live_out = NULL;
|
|
-
|
|
nir_foreach_instr(instr, block) {
|
|
gc_mark_live(nir->gctx, instr);
|
|
|
|
diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c
|
|
index ee2c9cd32c4aa..1f712962556d9 100644
|
|
--- a/src/compiler/nir/nir_validate.c
|
|
+++ b/src/compiler/nir/nir_validate.c
|
|
@@ -966,6 +966,11 @@ validate_call_instr(nir_call_instr *instr, validate_state *state)
|
|
{
|
|
validate_assert(state, instr->num_params == instr->callee->num_params);
|
|
|
|
+ if (instr->indirect_callee.ssa) {
|
|
+ validate_assert(state, !instr->callee->impl);
|
|
+ validate_src(&instr->indirect_callee, state);
|
|
+ }
|
|
+
|
|
for (unsigned i = 0; i < instr->num_params; i++) {
|
|
validate_sized_src(&instr->params[i], state,
|
|
instr->callee->params[i].bit_size,
|
|
diff --git a/src/compiler/spirv/vtn_cfg.c b/src/compiler/spirv/vtn_cfg.c
|
|
index e1b9d21ecfc10..e2afb3f8eaaa9 100644
|
|
--- a/src/compiler/spirv/vtn_cfg.c
|
|
+++ b/src/compiler/spirv/vtn_cfg.c
|
|
@@ -55,6 +55,7 @@ glsl_type_add_to_function_params(const struct glsl_type *type,
|
|
func->params[(*param_idx)++] = (nir_parameter) {
|
|
.num_components = glsl_get_vector_elements(type),
|
|
.bit_size = glsl_get_bit_size(type),
|
|
+ .type = type,
|
|
};
|
|
} else if (glsl_type_is_array_or_matrix(type)) {
|
|
unsigned elems = glsl_get_length(type);
|
|
@@ -290,6 +291,8 @@ vtn_cfg_handle_prepass_instruction(struct vtn_builder *b, SpvOp opcode,
|
|
func->params[idx++] = (nir_parameter) {
|
|
.num_components = nir_address_format_num_components(addr_format),
|
|
.bit_size = nir_address_format_bit_size(addr_format),
|
|
+ .is_return = true,
|
|
+ .type = func_type->return_type->type,
|
|
};
|
|
}
|
|
|
|
--
|
|
GitLab
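
For reference, the nir_build_indirect_call()/nir_call_indirect() helpers introduced in the patch above can be used as sketched below. This is an illustrative sketch only, not part of the series; "callee_decl" (a nir_function declaration with no impl and two parameters), "fn_ptr" and the argument defs are hypothetical names.

#include "nir_builder.h"

static void
emit_indirect_call_example(nir_builder *b, nir_function *callee_decl,
                           nir_def *fn_ptr, nir_def *arg0, nir_def *arg1)
{
   /* Expands to nir_build_indirect_call(b, callee_decl, fn_ptr, 2, args),
    * which fills call->params[] and call->indirect_callee. */
   nir_call_indirect(b, callee_decl, fn_ptr, arg0, arg1);
}
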
From 4c4b5a7e7b853d0ddcde5436d58cfa43c310d401 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Thu, 3 Oct 2024 15:58:19 +0200
Subject: [PATCH 05/71] aco/lower_to_hw_instr: Also consider operand alignment
 requirements

---
src/amd/compiler/aco_lower_to_hw_instr.cpp | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 0e18aa66069f8..fa3c805f491b5 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -1191,16 +1191,17 @@ split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) &&
src.def.regClass().type() == RegType::vgpr)
max_size = MIN2(max_size, 4);
- unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
+ unsigned max_def_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;
+ unsigned max_op_align = src.op.regClass().type() == RegType::vgpr ? 4 : 16;

/* make sure the size is a power of two and reg % bytes == 0 */
unsigned bytes = 1;
for (; bytes <= max_size; bytes *= 2) {
unsigned next = bytes * 2u;
- bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
+ bool can_increase = def_reg.reg_b % MIN2(next, max_def_align) == 0 &&
offset + next <= src.bytes && next <= max_size;
if (!src.op.isConstant() && can_increase)
- can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
+ can_increase = op_reg.reg_b % MIN2(next, max_op_align) == 0;
for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
if (!can_increase)
--
GitLab

From 325296b50ec3a85b9400189aec2b65b4c18bc40d Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Thu, 3 Oct 2024 15:58:45 +0200
Subject: [PATCH 06/71] aco/ra: Disallow unaligned SGPR assignment

---
src/amd/compiler/aco_register_allocation.cpp | 3 +++
1 file changed, 3 insertions(+)

diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index fc62487627fad..a8068b0da316a 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -2115,6 +2115,9 @@ operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsign
return false;
}

+ if (rc.type() == RegType::sgpr && reg.reg() % rc.size())
+ return false;
+
switch (instr->format) {
case Format::SMEM:
return reg != scc && reg != exec &&
--
GitLab

From 50d5f59160434a154a93d2c8db9eca0a27551416 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Fri, 4 Oct 2024 07:20:12 +0200
Subject: [PATCH 07/71] aco/ra: Fix SGPR parallelcopy operands straddling
 64-reg boundary

---
src/amd/compiler/aco_register_allocation.cpp | 18 +++++++++++++++---
1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index a8068b0da316a..3ce0680bf52d6 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -3009,12 +3009,24 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector<parallelcopy>& parallelcopy
if (!sgpr_operands_alias_defs) {
unsigned reg = parallelcopy[i].op.physReg().reg();
unsigned size = parallelcopy[i].op.getTemp().size();
- sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
+ if ((reg + size) / 64u == reg / 64u) {
+ sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
+ } else {
+ sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, 64u - (reg % 64u));
+ sgpr_operands[(reg + size) / 64u] |= u_bit_consecutive64(0, (reg + size) % 64u);
+ }

reg = parallelcopy[i].def.physReg().reg();
size = parallelcopy[i].def.getTemp().size();
- if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
- sgpr_operands_alias_defs = true;
+ if ((reg + size) / 64u == reg / 64u) {
+ if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
+ sgpr_operands_alias_defs = true;
+ } else {
+ if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, 64u - (reg % 64u)))
+ sgpr_operands_alias_defs = true;
+ if (sgpr_operands[(reg + size) / 64u] & u_bit_consecutive64(0, (reg + size) % 64u))
+ sgpr_operands_alias_defs = true;
+ }
}
}

--
GitLab
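
The straddling case handled above splits one consecutive SGPR range across two 64-bit mask words. Below is an illustrative, standalone sketch of that mask math, not part of the patch; consecutive_bits64() is a stand-in for the u_bit_consecutive64() helper the patch uses, and mark_sgpr_range() is a hypothetical helper.

#include <stdint.h>
#include <stdio.h>

static uint64_t consecutive_bits64(unsigned start, unsigned count)
{
   return count == 64u ? ~0ull : ((1ull << count) - 1) << start;
}

static void mark_sgpr_range(uint64_t *words, unsigned reg, unsigned size)
{
   if ((reg + size) / 64u == reg / 64u) {
      /* The whole range lives in a single 64-bit word. */
      words[reg / 64u] |= consecutive_bits64(reg % 64u, size);
   } else {
      /* Straddling case: high bits of the first word, low bits of the next. */
      words[reg / 64u] |= consecutive_bits64(reg % 64u, 64u - (reg % 64u));
      words[(reg + size) / 64u] |= consecutive_bits64(0, (reg + size) % 64u);
   }
}

int main(void)
{
   uint64_t words[2] = {0, 0};
   mark_sgpr_range(words, 62, 4); /* s[62:65] crosses the s63/s64 boundary */
   printf("%016llx %016llx\n", (unsigned long long)words[0],
          (unsigned long long)words[1]);
   /* prints c000000000000000 0000000000000003 */
   return 0;
}
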
From 0d80a9a6eb1d317727688914ad8f612dc7bace13 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 17 Jun 2024 13:13:21 +0200
Subject: [PATCH 08/71] radv: Gather info for all functions

---
src/amd/vulkan/radv_pipeline.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index a9df9b6b8aea3..82a5aac71437d 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -431,7 +431,9 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
NIR_PASS(_, stage->nir, nir_opt_constant_folding);

/* Gather info again, to update whether 8/16-bit are used. */
- nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
+ nir_foreach_function_impl (impl, stage->nir)
+ if (impl->function->is_entrypoint || impl->function->is_exported)
+ nir_shader_gather_info(stage->nir, impl);
}
}

--
GitLab

From 5e1e7090670cf7db02ea16a86790104a008c8813 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Wed, 5 Jun 2024 11:27:06 +0200
Subject: [PATCH 09/71] nir/intrinsics: Add incoming/outgoing payload
 load/store instructions

With RT function calls, these are going to get lowered to:
- load/store_param (incoming payload)
- load/store_var (outgoing payload)
---
src/compiler/nir/nir_intrinsics.py | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 31af10c320ba8..798e961c0c8e3 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1703,6 +1703,10 @@ intrinsic("execute_miss_amd", src_comp=[1])
# BASE=dword index
intrinsic("load_hit_attrib_amd", dest_comp=1, bit_sizes=[32], indices=[BASE])
intrinsic("store_hit_attrib_amd", src_comp=[1], indices=[BASE])
+intrinsic("load_incoming_ray_payload_amd", dest_comp=1, bit_sizes=[32], indices=[BASE])
+intrinsic("store_incoming_ray_payload_amd", src_comp=[1], indices=[BASE])
+intrinsic("load_outgoing_ray_payload_amd", dest_comp=1, bit_sizes=[32], indices=[BASE])
+intrinsic("store_outgoing_ray_payload_amd", src_comp=[1], indices=[BASE])

# Load forced VRS rates.
intrinsic("load_force_vrs_rates_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER])
--
GitLab
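
As a sketch of how the new intrinsics are meant to be used (the same builder calls appear in the later payload-lowering patch of this series), BASE selects one 32-bit dword of the payload. Illustrative only, not part of the patch; forward_payload_dword() is a hypothetical helper.

#include "nir_builder.h"

static void
forward_payload_dword(nir_builder *b, unsigned dword)
{
   /* Read one dword of the payload this shader was called with... */
   nir_def *val = nir_load_incoming_ray_payload_amd(b, .base = dword);
   /* ...and store it into the payload passed along with the next call. */
   nir_store_outgoing_ray_payload_amd(b, val, .base = dword);
}
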
From 47aae01aefb03df60f1ca9e6c80f17b76a83f031 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Thu, 6 Jun 2024 08:07:34 +0200
Subject: [PATCH 10/71] radv: Temporarily disable RT pipelines

---
src/amd/vulkan/radv_physical_device.c | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c
index 5022ead6c9d76..98826470d4d60 100644
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@@ -111,6 +111,10 @@ radv_filter_minmax_enabled(const struct radv_physical_device *pdev)
bool
radv_enable_rt(const struct radv_physical_device *pdev, bool rt_pipelines)
{
+ /* Temporarily under construction! */
+ if (rt_pipelines)
+ return false;
+
if (pdev->info.gfx_level < GFX10_3 && !radv_emulate_rt(pdev))
return false;

--
GitLab

From e268331ef1d7dcd0bb7642286f358ce7ccd50a5c Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 11:28:21 +0200
|
|
Subject: [PATCH 11/71] nir: Remove
|
|
nir_intrinsic_load_rt_arg_scratch_offset_amd
|
|
|
|
Not needed anymore.
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 11 -----------
|
|
src/amd/vulkan/radv_pipeline_rt.c | 1 -
|
|
src/compiler/nir/nir_divergence_analysis.c | 1 -
|
|
src/compiler/nir/nir_intrinsics.py | 3 ---
|
|
4 files changed, 16 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index 8dcd853aa724d..9224c169319fc 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -488,10 +488,6 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
|
|
nir_src_rewrite(&intr->src[1], nir_iadd_nuw(b, nir_load_var(b, vars->stack_ptr), intr->src[1].ssa));
|
|
return true;
|
|
}
|
|
- case nir_intrinsic_load_rt_arg_scratch_offset_amd: {
|
|
- ret = nir_load_var(b, vars->arg);
|
|
- break;
|
|
- }
|
|
case nir_intrinsic_load_shader_record_ptr: {
|
|
ret = nir_load_var(b, vars->shader_record_ptr);
|
|
break;
|
|
@@ -1086,12 +1082,6 @@ lower_any_hit_for_intersection(nir_shader *any_hit)
|
|
b->cursor = nir_before_instr(instr);
|
|
nir_src_rewrite(&intrin->src[1], nir_iadd_nuw(b, scratch_offset, intrin->src[1].ssa));
|
|
break;
|
|
- case nir_intrinsic_load_rt_arg_scratch_offset_amd:
|
|
- b->cursor = nir_after_instr(instr);
|
|
- nir_def *arg_offset = nir_isub(b, &intrin->def, scratch_offset);
|
|
- nir_def_rewrite_uses_after(&intrin->def, arg_offset, arg_offset->parent_instr);
|
|
- break;
|
|
-
|
|
default:
|
|
break;
|
|
}
|
|
@@ -1732,7 +1722,6 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_
|
|
nir_store_var(&b, vars.cull_mask_and_flags, nir_load_cull_mask_and_flags_amd(&b), 0x1);
|
|
nir_store_var(&b, vars.origin, nir_load_ray_world_origin(&b), 0x7);
|
|
nir_store_var(&b, vars.direction, nir_load_ray_world_direction(&b), 0x7);
|
|
- nir_store_var(&b, vars.arg, nir_load_rt_arg_scratch_offset_amd(&b), 0x1);
|
|
nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1);
|
|
|
|
radv_build_traversal(device, pipeline, pCreateInfo, false, &b, &vars, false, info);
|
|
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
|
|
index 8d9ba4d6047a6..11acaa74dfc54 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_rt.c
|
|
+++ b/src/amd/vulkan/radv_pipeline_rt.c
|
|
@@ -318,7 +318,6 @@ should_move_rt_instruction(nir_intrinsic_instr *instr)
|
|
switch (instr->intrinsic) {
|
|
case nir_intrinsic_load_hit_attrib_amd:
|
|
return nir_intrinsic_base(instr) < RADV_MAX_HIT_ATTRIB_DWORDS;
|
|
- case nir_intrinsic_load_rt_arg_scratch_offset_amd:
|
|
case nir_intrinsic_load_ray_flags:
|
|
case nir_intrinsic_load_ray_object_origin:
|
|
case nir_intrinsic_load_ray_world_origin:
|
|
diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
|
|
index 183b3bc81820d..78943c897922f 100644
|
|
--- a/src/compiler/nir/nir_divergence_analysis.c
|
|
+++ b/src/compiler/nir/nir_divergence_analysis.c
|
|
@@ -835,7 +835,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|
case nir_intrinsic_load_packed_passthrough_primitive_amd:
|
|
case nir_intrinsic_load_initial_edgeflags_amd:
|
|
case nir_intrinsic_gds_atomic_add_amd:
|
|
- case nir_intrinsic_load_rt_arg_scratch_offset_amd:
|
|
case nir_intrinsic_load_intersection_opaque_amd:
|
|
case nir_intrinsic_load_vector_arg_amd:
|
|
case nir_intrinsic_load_btd_stack_id_intel:
|
|
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
|
|
index 798e961c0c8e3..2a6de0c4b6f25 100644
|
|
--- a/src/compiler/nir/nir_intrinsics.py
|
|
+++ b/src/compiler/nir/nir_intrinsics.py
|
|
@@ -1673,9 +1673,6 @@ intrinsic("bvh64_intersect_ray_amd", [4, 2, 1, 3, 3, 3], 4, flags=[CAN_ELIMINATE
|
|
# Return of a callable in raytracing pipelines
|
|
intrinsic("rt_return_amd")
|
|
|
|
-# offset into scratch for the input callable data in a raytracing pipeline.
|
|
-system_value("rt_arg_scratch_offset_amd", 1)
|
|
-
|
|
# Whether to call the anyhit shader for an intersection in an intersection shader.
|
|
system_value("intersection_opaque_amd", 1, bit_sizes=[1])
|
|
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 8100ae695c5322e10227619b5e1b6027c2b35a02 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 11:31:55 +0200
|
|
Subject: [PATCH 12/71] radv/rt: Remove RT priorities
|
|
|
|
They have been useful for ensuring reconvergence, but RT function
|
|
calls ensure that on their own now.
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 37 -------------------------
|
|
src/amd/vulkan/radv_cmd_buffer.c | 2 +-
|
|
src/amd/vulkan/radv_pipeline_rt.c | 2 +-
|
|
src/amd/vulkan/radv_shader.h | 27 ------------------
|
|
4 files changed, 2 insertions(+), 66 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index 9224c169319fc..3f50c7297baae 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -1843,43 +1843,6 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device,
|
|
lower_hit_attribs(shader, hit_attribs, 0);
|
|
}
|
|
|
|
-/** Select the next shader based on priorities:
|
|
- *
|
|
- * Detect the priority of the shader stage by the lowest bits in the address (low to high):
|
|
- * - Raygen - idx 0
|
|
- * - Traversal - idx 1
|
|
- * - Closest Hit / Miss - idx 2
|
|
- * - Callable - idx 3
|
|
- *
|
|
- *
|
|
- * This gives us the following priorities:
|
|
- * Raygen : Callable > > Traversal > Raygen
|
|
- * Traversal : > Chit / Miss > > Raygen
|
|
- * CHit / Miss : Callable > Chit / Miss > Traversal > Raygen
|
|
- * Callable : Callable > Chit / Miss > > Raygen
|
|
- */
|
|
-static nir_def *
|
|
-select_next_shader(nir_builder *b, nir_def *shader_addr, unsigned wave_size)
|
|
-{
|
|
- gl_shader_stage stage = b->shader->info.stage;
|
|
- nir_def *prio = nir_iand_imm(b, shader_addr, radv_rt_priority_mask);
|
|
- nir_def *ballot = nir_ballot(b, 1, wave_size, nir_imm_bool(b, true));
|
|
- nir_def *ballot_traversal = nir_ballot(b, 1, wave_size, nir_ieq_imm(b, prio, radv_rt_priority_traversal));
|
|
- nir_def *ballot_hit_miss = nir_ballot(b, 1, wave_size, nir_ieq_imm(b, prio, radv_rt_priority_hit_miss));
|
|
- nir_def *ballot_callable = nir_ballot(b, 1, wave_size, nir_ieq_imm(b, prio, radv_rt_priority_callable));
|
|
-
|
|
- if (stage != MESA_SHADER_CALLABLE && stage != MESA_SHADER_INTERSECTION)
|
|
- ballot = nir_bcsel(b, nir_ine_imm(b, ballot_traversal, 0), ballot_traversal, ballot);
|
|
- if (stage != MESA_SHADER_RAYGEN)
|
|
- ballot = nir_bcsel(b, nir_ine_imm(b, ballot_hit_miss, 0), ballot_hit_miss, ballot);
|
|
- if (stage != MESA_SHADER_INTERSECTION)
|
|
- ballot = nir_bcsel(b, nir_ine_imm(b, ballot_callable, 0), ballot_callable, ballot);
|
|
-
|
|
- nir_def *lsb = nir_find_lsb(b, ballot);
|
|
- nir_def *next = nir_read_invocation(b, shader_addr, lsb);
|
|
- return nir_iand_imm(b, next, ~radv_rt_priority_mask);
|
|
-}
|
|
-
|
|
static void
|
|
radv_store_arg(nir_builder *b, const struct radv_shader_args *args, const struct radv_ray_tracing_stage_info *info,
|
|
struct ac_arg arg, nir_def *value)
|
|
diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
|
|
index d205cebbda64c..96bda7c3cf639 100644
|
|
--- a/src/amd/vulkan/radv_cmd_buffer.c
|
|
+++ b/src/amd/vulkan/radv_cmd_buffer.c
|
|
@@ -7551,7 +7551,7 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct radv_compu
|
|
const uint32_t traversal_shader_addr_offset = radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_TRAVERSAL_SHADER_ADDR);
|
|
struct radv_shader *traversal_shader = cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION];
|
|
if (traversal_shader_addr_offset && traversal_shader) {
|
|
- uint64_t traversal_va = traversal_shader->va | radv_rt_priority_traversal;
|
|
+ uint64_t traversal_va = traversal_shader->va;
|
|
radv_emit_shader_pointer(device, cmd_buffer->cs, traversal_shader_addr_offset, traversal_va, true);
|
|
}
|
|
}
|
|
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
|
|
index 11acaa74dfc54..32a1cba1269f3 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_rt.c
|
|
+++ b/src/amd/vulkan/radv_pipeline_rt.c
|
|
@@ -1138,7 +1138,7 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkRayTra
|
|
if (pipeline->groups[i].recursive_shader != VK_SHADER_UNUSED_KHR) {
|
|
struct radv_shader *shader = pipeline->stages[pipeline->groups[i].recursive_shader].shader;
|
|
if (shader)
|
|
- pipeline->groups[i].handle.recursive_shader_ptr = shader->va | radv_get_rt_priority(shader->info.stage);
|
|
+ pipeline->groups[i].handle.recursive_shader_ptr = shader->va;
|
|
}
|
|
}
|
|
|
|
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
|
|
index 300358a346dbb..968ebbe6d4af4 100644
|
|
--- a/src/amd/vulkan/radv_shader.h
|
|
+++ b/src/amd/vulkan/radv_shader.h
|
|
@@ -682,33 +682,6 @@ nir_shader *radv_build_traversal_shader(struct radv_device *device, struct radv_
|
|
const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
|
|
struct radv_ray_tracing_stage_info *info);
|
|
|
|
-enum radv_rt_priority {
|
|
- radv_rt_priority_raygen = 0,
|
|
- radv_rt_priority_traversal = 1,
|
|
- radv_rt_priority_hit_miss = 2,
|
|
- radv_rt_priority_callable = 3,
|
|
- radv_rt_priority_mask = 0x3,
|
|
-};
|
|
-
|
|
-static inline enum radv_rt_priority
|
|
-radv_get_rt_priority(gl_shader_stage stage)
|
|
-{
|
|
- switch (stage) {
|
|
- case MESA_SHADER_RAYGEN:
|
|
- return radv_rt_priority_raygen;
|
|
- case MESA_SHADER_INTERSECTION:
|
|
- case MESA_SHADER_ANY_HIT:
|
|
- return radv_rt_priority_traversal;
|
|
- case MESA_SHADER_CLOSEST_HIT:
|
|
- case MESA_SHADER_MISS:
|
|
- return radv_rt_priority_hit_miss;
|
|
- case MESA_SHADER_CALLABLE:
|
|
- return radv_rt_priority_callable;
|
|
- default:
|
|
- unreachable("Unimplemented RT shader stage.");
|
|
- }
|
|
-}
|
|
-
|
|
struct radv_shader_layout;
|
|
enum radv_pipeline_type;
|
|
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 8849cf03b0c29eb6b864a4056195ca7dc9f53a68 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 11:39:33 +0200
|
|
Subject: [PATCH 13/71] radv/rt: Refactor radv_nir_lower_rt_vars
|
|
|
|
Now we can use it on load/store instructions. It will be used for lowering
|
|
payloads to load/store_*_payload instructions.
|
|
---
|
|
.../nir/radv_nir_lower_hit_attrib_derefs.c | 93 ++++++++++++++-----
|
|
1 file changed, 70 insertions(+), 23 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c
|
|
index 38e14dd4015fc..9db157dd4baf0 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c
|
|
@@ -10,13 +10,19 @@
|
|
#include "radv_constants.h"
|
|
#include "radv_nir.h"
|
|
|
|
+typedef nir_def *(*load_intrin_cb)(nir_builder *b, unsigned base);
|
|
+typedef void (*store_intrin_cb)(nir_builder *b, nir_def *val, unsigned base);
|
|
+
|
|
struct lower_hit_attrib_deref_args {
|
|
nir_variable_mode mode;
|
|
uint32_t base_offset;
|
|
+
|
|
+ load_intrin_cb load_cb;
|
|
+ store_intrin_cb store_cb;
|
|
};
|
|
|
|
static bool
|
|
-lower_hit_attrib_deref(nir_builder *b, nir_instr *instr, void *data)
|
|
+lower_rt_var_deref(nir_builder *b, nir_instr *instr, void *data)
|
|
{
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
return false;
|
|
@@ -48,19 +54,16 @@ lower_hit_attrib_deref(nir_builder *b, nir_instr *instr, void *data)
|
|
uint32_t comp_offset = offset % 4;
|
|
|
|
if (bit_size == 64) {
|
|
- components[comp] = nir_pack_64_2x32_split(b, nir_load_hit_attrib_amd(b, .base = base),
|
|
- nir_load_hit_attrib_amd(b, .base = base + 1));
|
|
+ components[comp] = nir_pack_64_2x32_split(b, args->load_cb(b, base), args->load_cb(b, base + 1));
|
|
} else if (bit_size == 32) {
|
|
- components[comp] = nir_load_hit_attrib_amd(b, .base = base);
|
|
+ components[comp] = args->load_cb(b, base);
|
|
} else if (bit_size == 16) {
|
|
- components[comp] =
|
|
- nir_channel(b, nir_unpack_32_2x16(b, nir_load_hit_attrib_amd(b, .base = base)), comp_offset / 2);
|
|
+ components[comp] = nir_channel(b, nir_unpack_32_2x16(b, args->load_cb(b, base)), comp_offset / 2);
|
|
} else if (bit_size == 8) {
|
|
- components[comp] =
|
|
- nir_channel(b, nir_unpack_bits(b, nir_load_hit_attrib_amd(b, .base = base), 8), comp_offset);
|
|
+ components[comp] = nir_channel(b, nir_unpack_bits(b, args->load_cb(b, base), 8), comp_offset);
|
|
} else {
|
|
assert(bit_size == 1);
|
|
- components[comp] = nir_i2b(b, nir_load_hit_attrib_amd(b, .base = base));
|
|
+ components[comp] = nir_i2b(b, args->load_cb(b, base));
|
|
}
|
|
}
|
|
|
|
@@ -78,25 +81,25 @@ lower_hit_attrib_deref(nir_builder *b, nir_instr *instr, void *data)
|
|
nir_def *component = nir_channel(b, value, comp);
|
|
|
|
if (bit_size == 64) {
|
|
- nir_store_hit_attrib_amd(b, nir_unpack_64_2x32_split_x(b, component), .base = base);
|
|
- nir_store_hit_attrib_amd(b, nir_unpack_64_2x32_split_y(b, component), .base = base + 1);
|
|
+ args->store_cb(b, nir_unpack_64_2x32_split_x(b, component), base);
|
|
+ args->store_cb(b, nir_unpack_64_2x32_split_y(b, component), base + 1);
|
|
} else if (bit_size == 32) {
|
|
- nir_store_hit_attrib_amd(b, component, .base = base);
|
|
+ args->store_cb(b, component, base);
|
|
} else if (bit_size == 16) {
|
|
- nir_def *prev = nir_unpack_32_2x16(b, nir_load_hit_attrib_amd(b, .base = base));
|
|
+ nir_def *prev = nir_unpack_32_2x16(b, args->load_cb(b, base));
|
|
nir_def *components[2];
|
|
for (uint32_t word = 0; word < 2; word++)
|
|
components[word] = (word == comp_offset / 2) ? nir_channel(b, value, comp) : nir_channel(b, prev, word);
|
|
- nir_store_hit_attrib_amd(b, nir_pack_32_2x16(b, nir_vec(b, components, 2)), .base = base);
|
|
+ args->store_cb(b, nir_pack_32_2x16(b, nir_vec(b, components, 2)), base);
|
|
} else if (bit_size == 8) {
|
|
- nir_def *prev = nir_unpack_bits(b, nir_load_hit_attrib_amd(b, .base = base), 8);
|
|
+ nir_def *prev = nir_unpack_bits(b, args->load_cb(b, base), 8);
|
|
nir_def *components[4];
|
|
for (uint32_t byte = 0; byte < 4; byte++)
|
|
components[byte] = (byte == comp_offset) ? nir_channel(b, value, comp) : nir_channel(b, prev, byte);
|
|
- nir_store_hit_attrib_amd(b, nir_pack_32_4x8(b, nir_vec(b, components, 4)), .base = base);
|
|
+ args->store_cb(b, nir_pack_32_4x8(b, nir_vec(b, components, 4)), base);
|
|
} else {
|
|
assert(bit_size == 1);
|
|
- nir_store_hit_attrib_amd(b, nir_b2i32(b, component), .base = base);
|
|
+ args->store_cb(b, nir_b2i32(b, component), base);
|
|
}
|
|
}
|
|
}
|
|
@@ -123,13 +126,14 @@ radv_lower_payload_arg_to_offset(nir_builder *b, nir_intrinsic_instr *instr, voi
|
|
}
|
|
|
|
static bool
|
|
-radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, uint32_t base_offset)
|
|
+radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, load_intrin_cb load_cb, store_intrin_cb store_cb,
|
|
+ uint32_t base_offset)
|
|
{
|
|
bool progress = false;
|
|
|
|
progress |= nir_lower_indirect_derefs(shader, mode, UINT32_MAX);
|
|
|
|
- progress |= nir_lower_vars_to_explicit_types(shader, mode, glsl_get_natural_size_align_bytes);
|
|
+ NIR_PASS(_, shader, nir_lower_vars_to_explicit_types, mode, glsl_get_natural_size_align_bytes);
|
|
|
|
if (shader->info.stage == MESA_SHADER_RAYGEN && mode == nir_var_function_temp)
|
|
progress |= nir_shader_intrinsics_pass(shader, radv_lower_payload_arg_to_offset, nir_metadata_control_flow, NULL);
|
|
@@ -137,9 +141,11 @@ radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, uint32_t base
|
|
struct lower_hit_attrib_deref_args args = {
|
|
.mode = mode,
|
|
.base_offset = base_offset,
|
|
+ .load_cb = load_cb,
|
|
+ .store_cb = store_cb,
|
|
};
|
|
|
|
- progress |= nir_shader_instructions_pass(shader, lower_hit_attrib_deref, nir_metadata_control_flow, &args);
|
|
+ progress |= nir_shader_instructions_pass(shader, lower_rt_var_deref, nir_metadata_control_flow, &args);
|
|
|
|
if (progress) {
|
|
nir_remove_dead_derefs(shader);
|
|
@@ -149,16 +155,57 @@ radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, uint32_t base
|
|
return progress;
|
|
}
|
|
|
|
+static nir_def *
|
|
+load_hit_attrib_cb(nir_builder *b, unsigned base)
|
|
+{
|
|
+ return nir_load_hit_attrib_amd(b, .base = base);
|
|
+}
|
|
+
|
|
+static void
|
|
+store_hit_attrib_cb(nir_builder *b, nir_def *val, unsigned base)
|
|
+{
|
|
+ nir_store_hit_attrib_amd(b, val, .base = base);
|
|
+}
|
|
+
|
|
bool
|
|
radv_nir_lower_hit_attrib_derefs(nir_shader *shader)
|
|
{
|
|
- return radv_nir_lower_rt_vars(shader, nir_var_ray_hit_attrib, 0);
|
|
+ bool progress = false;
|
|
+ progress |= nir_lower_vars_to_explicit_types(shader, nir_var_ray_hit_attrib, glsl_get_natural_size_align_bytes);
|
|
+ progress |= radv_nir_lower_rt_vars(shader, nir_var_ray_hit_attrib, load_hit_attrib_cb, store_hit_attrib_cb, 0);
|
|
+ return progress;
|
|
+}
|
|
+
|
|
+static nir_def *
|
|
+load_incoming_payload_cb(nir_builder *b, unsigned base)
|
|
+{
|
|
+ return nir_load_incoming_ray_payload_amd(b, .base = base);
|
|
+}
|
|
+
|
|
+static void
|
|
+store_incoming_payload_cb(nir_builder *b, nir_def *val, unsigned base)
|
|
+{
|
|
+ nir_store_incoming_ray_payload_amd(b, val, .base = base);
|
|
+}
|
|
+
|
|
+static nir_def *
|
|
+load_outgoing_payload_cb(nir_builder *b, unsigned base)
|
|
+{
|
|
+ return nir_load_outgoing_ray_payload_amd(b, .base = base);
|
|
+}
|
|
+
|
|
+static void
|
|
+store_outgoing_payload_cb(nir_builder *b, nir_def *val, unsigned base)
|
|
+{
|
|
+ nir_store_outgoing_ray_payload_amd(b, val, .base = base);
|
|
}
|
|
|
|
bool
|
|
radv_nir_lower_ray_payload_derefs(nir_shader *shader, uint32_t offset)
|
|
{
|
|
- bool progress = radv_nir_lower_rt_vars(shader, nir_var_function_temp, RADV_MAX_HIT_ATTRIB_SIZE + offset);
|
|
- progress |= radv_nir_lower_rt_vars(shader, nir_var_shader_call_data, RADV_MAX_HIT_ATTRIB_SIZE + offset);
|
|
+ bool progress = radv_nir_lower_rt_vars(shader, nir_var_function_temp, load_outgoing_payload_cb,
|
|
+ store_outgoing_payload_cb, offset);
|
|
+ progress |= radv_nir_lower_rt_vars(shader, nir_var_shader_call_data, load_incoming_payload_cb,
|
|
+ store_incoming_payload_cb, offset);
|
|
return progress;
|
|
}
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 7c120680691e255437116f3219d1d4684d28a180 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 11:46:28 +0200
|
|
Subject: [PATCH 14/71] radv/rt: Pass maximum payload size to
|
|
radv_rt_nir_to_asm
|
|
|
|
---
|
|
src/amd/vulkan/radv_pipeline_rt.c | 27 ++++++++++++++++++++++-----
|
|
1 file changed, 22 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
|
|
index 32a1cba1269f3..0de6d1281b932 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_rt.c
|
|
+++ b/src/amd/vulkan/radv_pipeline_rt.c
|
|
@@ -356,7 +356,7 @@ move_rt_instructions(nir_shader *shader)
|
|
static VkResult
|
|
radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
|
|
const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, struct radv_ray_tracing_pipeline *pipeline,
|
|
- bool monolithic, struct radv_shader_stage *stage, uint32_t *stack_size,
|
|
+ bool monolithic, struct radv_shader_stage *stage, uint32_t *payload_size, uint32_t *stack_size,
|
|
struct radv_ray_tracing_stage_info *stage_info,
|
|
const struct radv_ray_tracing_stage_info *traversal_stage_info,
|
|
struct radv_serialized_shader_arena_block *replay_block, struct radv_shader **out_shader)
|
|
@@ -368,7 +368,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
|
|
bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags);
|
|
bool keep_statistic_info = radv_pipeline_capture_shader_stats(device, pipeline->base.base.create_flags);
|
|
|
|
- radv_nir_lower_rt_io(stage->nir, monolithic, 0);
|
|
+ radv_nir_lower_rt_io(stage->nir, monolithic, 0, payload_size);
|
|
|
|
/* Gather shader info. */
|
|
nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
|
|
@@ -586,6 +586,10 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
|
|
if (!stages)
|
|
return VK_ERROR_OUT_OF_HOST_MEMORY;
|
|
|
|
+ uint32_t payload_size = 0;
|
|
+ if (pCreateInfo->pLibraryInterface)
|
|
+ payload_size = pCreateInfo->pLibraryInterface->maxPipelineRayPayloadSize;
|
|
+
|
|
bool library = pipeline->base.base.create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR;
|
|
|
|
bool monolithic = !library;
|
|
@@ -605,6 +609,19 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
|
|
|
|
NIR_PASS(_, stage->nir, radv_nir_lower_hit_attrib_derefs);
|
|
|
|
+ nir_foreach_variable_with_modes (var, stage->nir, nir_var_shader_call_data) {
|
|
+ unsigned size, alignment;
|
|
+ glsl_get_natural_size_align_bytes(var->type, &size, &alignment);
|
|
+ payload_size = MAX2(payload_size, size);
|
|
+ }
|
|
+ nir_foreach_function_impl (impl, stage->nir) {
|
|
+ nir_foreach_variable_in_list (var, &impl->locals) {
|
|
+ unsigned size, alignment;
|
|
+ glsl_get_natural_size_align_bytes(var->type, &size, &alignment);
|
|
+ payload_size = MAX2(payload_size, size);
|
|
+ }
|
|
+ }
|
|
+
|
|
rt_stages[i].info = radv_gather_ray_tracing_stage_info(stage->nir);
|
|
|
|
stage->feedback.duration = os_time_get_nano() - stage_start;
|
|
@@ -670,8 +687,8 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
|
|
|
|
bool monolithic_raygen = monolithic && stage->stage == MESA_SHADER_RAYGEN;
|
|
|
|
- result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, monolithic_raygen, stage, &stack_size,
|
|
- &rt_stages[idx].info, NULL, replay_block, &rt_stages[idx].shader);
|
|
+ result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, monolithic_raygen, stage, &payload_size,
|
|
+ &stack_size, &rt_stages[idx].info, NULL, replay_block, &rt_stages[idx].shader);
|
|
if (result != VK_SUCCESS)
|
|
goto cleanup;
|
|
|
|
@@ -728,7 +745,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
|
|
.key = stage_keys[MESA_SHADER_INTERSECTION],
|
|
};
|
|
radv_shader_layout_init(pipeline_layout, MESA_SHADER_INTERSECTION, &traversal_stage.layout);
|
|
- result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, NULL, NULL,
|
|
+ result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, &payload_size, NULL, NULL,
|
|
&traversal_info, NULL, &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]);
|
|
ralloc_free(traversal_nir);
|
|
|
|
--
|
|
GitLab
|
|
|
|
|
|
From d7b329a6c5625895e7e020ee948d2c0b9c9e9329 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 11:47:46 +0200
|
|
Subject: [PATCH 15/71] radv/rt: Track traversal shader stack size
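
For illustration, a worked example of the updated compute_rt_stack_size formula
(all sizes hypothetical, in bytes):

    /* raygen = 64, chit/miss = 32, intersection = 16, any-hit = 16,
     * traversal_stack_size = 24, maxPipelineRayRecursionDepth = 2, no callables */
    uint32_t stack_size = 64 +
                          MIN2(2, 1) * (32 + 16 + 16 + 24) + /* first recursion level: 88 */
                          MAX2(0, 2 - 1) * 32 +              /* deeper levels: 32 */
                          2 * 0;                             /* callables */
    /* = 64 + 88 + 32 = 184 bytes */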
|
|
|
|
---
|
|
src/amd/vulkan/radv_pipeline_rt.c | 14 ++++++++------
|
|
src/amd/vulkan/radv_pipeline_rt.h | 1 +
|
|
2 files changed, 9 insertions(+), 6 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
|
|
index 0de6d1281b932..3c848361f13e3 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_rt.c
|
|
+++ b/src/amd/vulkan/radv_pipeline_rt.c
|
|
@@ -745,8 +745,9 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca
|
|
.key = stage_keys[MESA_SHADER_INTERSECTION],
|
|
};
|
|
radv_shader_layout_init(pipeline_layout, MESA_SHADER_INTERSECTION, &traversal_stage.layout);
|
|
- result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, &payload_size, NULL, NULL,
|
|
- &traversal_info, NULL, &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]);
|
|
+ result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, &payload_size,
|
|
+ &pipeline->traversal_stack_size, NULL, &traversal_info, NULL,
|
|
+ &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]);
|
|
ralloc_free(traversal_nir);
|
|
|
|
cleanup:
|
|
@@ -807,10 +808,11 @@ compute_rt_stack_size(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, stru
|
|
unreachable("Invalid stage type in RT shader");
|
|
}
|
|
}
|
|
- pipeline->stack_size =
|
|
- raygen_size +
|
|
- MIN2(pCreateInfo->maxPipelineRayRecursionDepth, 1) * MAX2(chit_miss_size, intersection_size + any_hit_size) +
|
|
- MAX2(0, (int)(pCreateInfo->maxPipelineRayRecursionDepth) - 1) * chit_miss_size + 2 * callable_size;
|
|
+ pipeline->stack_size = raygen_size +
|
|
+ MIN2(pCreateInfo->maxPipelineRayRecursionDepth, 1) *
|
|
+ (chit_miss_size + intersection_size + any_hit_size + pipeline->traversal_stack_size) +
|
|
+ MAX2(0, (int)(pCreateInfo->maxPipelineRayRecursionDepth) - 1) * chit_miss_size +
|
|
+ 2 * callable_size;
|
|
}
|
|
|
|
static void
|
|
diff --git a/src/amd/vulkan/radv_pipeline_rt.h b/src/amd/vulkan/radv_pipeline_rt.h
|
|
index 99c0067325923..acfe978924a17 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_rt.h
|
|
+++ b/src/amd/vulkan/radv_pipeline_rt.h
|
|
@@ -26,6 +26,7 @@ struct radv_ray_tracing_pipeline {
|
|
unsigned group_count;
|
|
|
|
uint32_t stack_size;
|
|
+ uint32_t traversal_stack_size;
|
|
|
|
/* set if any shaders from this pipeline require robustness2 in the merged traversal shader */
|
|
bool traversal_storage_robustness2 : 1;
|
|
--
|
|
GitLab
|
|
|
|
|
|
From a48ee7d583587d09cf042045f5ae89d01a17f4ad Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 11:48:48 +0200
|
|
Subject: [PATCH 16/71] radv/rt: Set stack size to scratch_bytes_per_wave
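
scratch_bytes_per_wave is reported per wave, so it is divided by the wave size
to get the per-lane contribution added to the stack size. A small sketch with
hypothetical numbers:

    /* 8192 bytes of scratch for a wave64 shader -> 128 bytes per lane */
    *stack_size += DIV_ROUND_UP(8192, 64); /* += 128 */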
|
|
|
|
---
|
|
src/amd/vulkan/radv_pipeline_rt.c | 3 +++
|
|
1 file changed, 3 insertions(+)
|
|
|
|
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
|
|
index 3c848361f13e3..c86e292a36244 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_rt.c
|
|
+++ b/src/amd/vulkan/radv_pipeline_rt.c
|
|
@@ -460,6 +460,9 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
|
|
shader = radv_shader_create(device, cache, binary, keep_executable_info || dump_shader);
|
|
|
|
if (shader) {
|
|
+ if (stack_size)
|
|
+ *stack_size += DIV_ROUND_UP(shader->config.scratch_bytes_per_wave, shader->info.wave_size);
|
|
+
|
|
radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, binary, shader, shaders, num_shaders,
|
|
&stage->info);
|
|
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 4af66a35fb348043880ebb4c46893bfd6bebb7fc Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 12:15:20 +0200
|
|
Subject: [PATCH 17/71] radv/rt: Use radv_get_rt_shader_entrypoint instead of
|
|
nir_shader_get_entrypoint
|
|
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 2 +-
|
|
src/amd/vulkan/radv_pipeline_rt.c | 2 +-
|
|
src/amd/vulkan/radv_shader.h | 9 +++++++++
|
|
3 files changed, 11 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index 3f50c7297baae..931c8c3e10ab1 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -1610,7 +1610,7 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
|
|
radv_build_end_trace_token(b, vars, original_tmax, nir_load_var(b, trav_vars.hit),
|
|
nir_load_var(b, iteration_instance_count));
|
|
|
|
- nir_metadata_preserve(nir_shader_get_entrypoint(b->shader), nir_metadata_none);
|
|
+ nir_metadata_preserve(radv_get_rt_shader_entrypoint(b->shader), nir_metadata_none);
|
|
radv_nir_lower_hit_attrib_derefs(b->shader);
|
|
|
|
/* Register storage for hit attributes */
|
|
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
|
|
index c86e292a36244..c4feea4a6f95b 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_rt.c
|
|
+++ b/src/amd/vulkan/radv_pipeline_rt.c
|
|
@@ -424,7 +424,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
|
|
pipeline, monolithic, traversal_stage_info);
|
|
|
|
/* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */
|
|
- nir_shader_gather_info(temp_stage.nir, nir_shader_get_entrypoint(temp_stage.nir));
|
|
+ nir_shader_gather_info(temp_stage.nir, radv_get_rt_shader_entrypoint(temp_stage.nir));
|
|
|
|
radv_optimize_nir(temp_stage.nir, stage->key.optimisations_disabled);
|
|
radv_postprocess_nir(device, NULL, &temp_stage);
|
|
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
|
|
index 968ebbe6d4af4..36ad1d0dd8bf9 100644
|
|
--- a/src/amd/vulkan/radv_shader.h
|
|
+++ b/src/amd/vulkan/radv_shader.h
|
|
@@ -507,6 +507,15 @@ struct radv_shader_stage;
|
|
void radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively);
|
|
void radv_optimize_nir_algebraic(nir_shader *shader, bool opt_offsets, bool opt_mqsad);
|
|
|
|
+static inline nir_function_impl *
|
|
+radv_get_rt_shader_entrypoint(nir_shader *shader)
|
|
+{
|
|
+ nir_foreach_function_impl (impl, shader)
|
|
+ if (impl->function->is_entrypoint || impl->function->is_exported)
|
|
+ return impl;
|
|
+ return NULL;
|
|
+}
|
|
+
|
|
void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset);
|
|
|
|
struct radv_ray_tracing_stage_info;
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 38ac43cce19772daf5b566eee5128805a90e75a7 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Fri, 4 Oct 2024 05:48:26 +0200
|
|
Subject: [PATCH 18/71] radv/rt: Only lower vars to explicit types for
|
|
monolithic shaders
|
|
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c | 2 --
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 2 ++
|
|
2 files changed, 2 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c
|
|
index 9db157dd4baf0..7efcad3675c6b 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c
|
|
@@ -133,8 +133,6 @@ radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, load_intrin_c
|
|
|
|
progress |= nir_lower_indirect_derefs(shader, mode, UINT32_MAX);
|
|
|
|
- NIR_PASS(_, shader, nir_lower_vars_to_explicit_types, mode, glsl_get_natural_size_align_bytes);
|
|
-
|
|
if (shader->info.stage == MESA_SHADER_RAYGEN && mode == nir_var_function_temp)
|
|
progress |= nir_shader_intrinsics_pass(shader, radv_lower_payload_arg_to_offset, nir_metadata_control_flow, NULL);
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index 931c8c3e10ab1..c2b0e99f74129 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -851,6 +851,8 @@ radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset)
|
|
|
|
NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset);
|
|
} else {
|
|
+ NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_natural_size_align_bytes);
|
|
+ NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_shader_temp, glsl_get_natural_size_align_bytes);
|
|
NIR_PASS(_, nir, radv_nir_lower_ray_payload_derefs, payload_offset);
|
|
}
|
|
}
|
|
--
|
|
GitLab
|
|
|
|
|
|
From c75c5ab22d84c3e168f0879aca26412d0d6d3668 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 11:54:05 +0200
|
|
Subject: [PATCH 19/71] radv/rt: Lower monolithic ray payload load/store
|
|
instructions
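
As a sketch of the resulting layout (payload size hypothetical): a 12-byte
payload is split into per-dword shader_temp variables, and the
load/store_{incoming,outgoing}_ray_payload_amd intrinsics select a dword via
their .base index:

    unsigned num_slots = DIV_ROUND_UP(12, 4); /* 3 payload dwords */
    for (unsigned i = 0; i < num_slots; ++i)
       payload_vars[i] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "_payload");
    /* store_outgoing_ray_payload_amd with .base = i becomes a store to payload_vars[i] */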
|
|
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 98 +++++++++++++++++--------
|
|
src/amd/vulkan/radv_shader.h | 2 +-
|
|
2 files changed, 69 insertions(+), 31 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index c2b0e99f74129..061c58d45949f 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -731,12 +731,13 @@ lower_rt_instructions(nir_shader *shader, struct rt_variables *vars, bool late_l
|
|
nir_shader_instructions_pass(shader, radv_lower_rt_instruction, nir_metadata_none, &data);
|
|
}
|
|
|
|
-/* Lowers hit attributes to registers or shared memory. If hit_attribs is NULL, attributes are
|
|
+/* Lowers RT I/O vars to registers or shared memory. If hit_attribs is NULL, attributes are
|
|
* lowered to shared memory. */
|
|
static void
|
|
-lower_hit_attribs(nir_shader *shader, nir_variable **hit_attribs, uint32_t workgroup_size)
|
|
+lower_rt_storage(nir_shader *shader, nir_variable **hit_attribs, nir_deref_instr **payload_in,
|
|
+ nir_variable **payload_out, uint32_t workgroup_size)
|
|
{
|
|
- nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
|
+ nir_function_impl *impl = radv_get_rt_shader_entrypoint(shader);
|
|
|
|
nir_foreach_variable_with_modes (attrib, shader, nir_var_ray_hit_attrib)
|
|
attrib->data.mode = nir_var_shader_temp;
|
|
@@ -750,29 +751,55 @@ lower_hit_attribs(nir_shader *shader, nir_variable **hit_attribs, uint32_t workg
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
|
if (intrin->intrinsic != nir_intrinsic_load_hit_attrib_amd &&
|
|
- intrin->intrinsic != nir_intrinsic_store_hit_attrib_amd)
|
|
+ intrin->intrinsic != nir_intrinsic_store_hit_attrib_amd &&
|
|
+ intrin->intrinsic != nir_intrinsic_load_incoming_ray_payload_amd &&
|
|
+ intrin->intrinsic != nir_intrinsic_store_incoming_ray_payload_amd &&
|
|
+ intrin->intrinsic != nir_intrinsic_load_outgoing_ray_payload_amd &&
|
|
+ intrin->intrinsic != nir_intrinsic_store_outgoing_ray_payload_amd)
|
|
continue;
|
|
|
|
b.cursor = nir_after_instr(instr);
|
|
|
|
- nir_def *offset;
|
|
- if (!hit_attribs)
|
|
- offset = nir_imul_imm(
|
|
- &b, nir_iadd_imm(&b, nir_load_local_invocation_index(&b), nir_intrinsic_base(intrin) * workgroup_size),
|
|
- sizeof(uint32_t));
|
|
-
|
|
- if (intrin->intrinsic == nir_intrinsic_load_hit_attrib_amd) {
|
|
- nir_def *ret;
|
|
- if (hit_attribs)
|
|
- ret = nir_load_var(&b, hit_attribs[nir_intrinsic_base(intrin)]);
|
|
+ if (intrin->intrinsic == nir_intrinsic_load_hit_attrib_amd ||
|
|
+ intrin->intrinsic == nir_intrinsic_store_hit_attrib_amd) {
|
|
+ nir_def *offset;
|
|
+ if (!hit_attribs)
|
|
+ offset = nir_imul_imm(
|
|
+ &b,
|
|
+ nir_iadd_imm(&b, nir_load_local_invocation_index(&b), nir_intrinsic_base(intrin) * workgroup_size),
|
|
+ sizeof(uint32_t));
|
|
+
|
|
+ if (intrin->intrinsic == nir_intrinsic_load_hit_attrib_amd) {
|
|
+ nir_def *ret;
|
|
+ if (hit_attribs)
|
|
+ ret = nir_load_var(&b, hit_attribs[nir_intrinsic_base(intrin)]);
|
|
+ else
|
|
+ ret = nir_load_shared(&b, 1, 32, offset, .base = 0, .align_mul = 4);
|
|
+ nir_def_rewrite_uses(nir_instr_def(instr), ret);
|
|
+ } else {
|
|
+ if (hit_attribs)
|
|
+ nir_store_var(&b, hit_attribs[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1);
|
|
+ else
|
|
+ nir_store_shared(&b, intrin->src->ssa, offset, .base = 0, .align_mul = 4);
|
|
+ }
|
|
+ } else if (intrin->intrinsic == nir_intrinsic_load_incoming_ray_payload_amd ||
|
|
+ intrin->intrinsic == nir_intrinsic_store_incoming_ray_payload_amd) {
|
|
+ if (!payload_in)
|
|
+ continue;
|
|
+ if (intrin->intrinsic == nir_intrinsic_load_incoming_ray_payload_amd)
|
|
+ nir_def_rewrite_uses(nir_instr_def(instr), nir_load_deref(&b, payload_in[nir_intrinsic_base(intrin)]));
|
|
else
|
|
- ret = nir_load_shared(&b, 1, 32, offset, .base = 0, .align_mul = 4);
|
|
- nir_def_rewrite_uses(nir_instr_def(instr), ret);
|
|
- } else {
|
|
- if (hit_attribs)
|
|
- nir_store_var(&b, hit_attribs[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1);
|
|
+ nir_store_deref(&b, payload_in[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1);
|
|
+ } else if (intrin->intrinsic == nir_intrinsic_load_outgoing_ray_payload_amd ||
|
|
+ intrin->intrinsic == nir_intrinsic_store_outgoing_ray_payload_amd) {
|
|
+ if (!payload_out)
|
|
+ continue;
|
|
+ if (intrin->intrinsic == nir_intrinsic_load_outgoing_ray_payload_amd)
|
|
+ nir_def_rewrite_uses(nir_instr_def(instr), nir_load_var(&b, payload_out[nir_intrinsic_base(intrin)]));
|
|
else
|
|
- nir_store_shared(&b, intrin->src->ssa, offset, .base = 0, .align_mul = 4);
|
|
+ nir_store_var(&b, payload_out[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1);
|
|
+ } else {
|
|
+ continue;
|
|
}
|
|
nir_instr_remove(instr);
|
|
}
|
|
@@ -1620,10 +1647,9 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
|
|
|
|
if (!monolithic) {
|
|
for (uint32_t i = 0; i < ARRAY_SIZE(hit_attribs); i++)
|
|
- hit_attribs[i] =
|
|
- nir_local_variable_create(nir_shader_get_entrypoint(b->shader), glsl_uint_type(), "ahit_attrib");
|
|
+ hit_attribs[i] = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "ahit_attrib");
|
|
|
|
- lower_hit_attribs(b->shader, hit_attribs, pdev->rt_wave_size);
|
|
+ lower_rt_storage(b->shader, hit_attribs, NULL, NULL, pdev->rt_wave_size);
|
|
}
|
|
|
|
/* Initialize follow-up shader. */
|
|
@@ -1819,10 +1845,11 @@ radv_count_hit_attrib_slots(nir_builder *b, nir_intrinsic_instr *instr, void *da
|
|
|
|
static void
|
|
lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device,
|
|
- struct radv_ray_tracing_pipeline *pipeline,
|
|
- const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, struct rt_variables *vars)
|
|
+ struct radv_ray_tracing_pipeline *pipeline, const struct radv_shader_info *info,
|
|
+ const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, uint32_t payload_size,
|
|
+ struct rt_variables *vars)
|
|
{
|
|
- nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
|
+ nir_function_impl *impl = radv_get_rt_shader_entrypoint(shader);
|
|
|
|
struct lower_rt_instruction_monolithic_state state = {
|
|
.device = device,
|
|
@@ -1842,7 +1869,17 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device,
|
|
for (uint32_t i = 0; i < hit_attrib_count; i++)
|
|
hit_attribs[i] = nir_local_variable_create(impl, glsl_uint_type(), "ahit_attrib");
|
|
|
|
- lower_hit_attribs(shader, hit_attribs, 0);
|
|
+ nir_builder b = nir_builder_create(impl);
|
|
+ b.cursor = nir_before_impl(impl);
|
|
+ nir_variable **payload_vars = rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4));
|
|
+ nir_deref_instr **payload_storage =
|
|
+ rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4));
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) {
|
|
+ payload_vars[i] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "_payload");
|
|
+ payload_storage[i] = nir_build_deref_var(&b, payload_vars[i]);
|
|
+ }
|
|
+
|
|
+ lower_rt_storage(shader, hit_attribs, payload_storage, payload_vars, info->wave_size);
|
|
}
|
|
|
|
static void
|
|
@@ -1857,8 +1894,9 @@ radv_store_arg(nir_builder *b, const struct radv_shader_args *args, const struct
|
|
void
|
|
radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
|
|
const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *stack_size,
|
|
- bool resume_shader, struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline,
|
|
- bool monolithic, const struct radv_ray_tracing_stage_info *traversal_info)
|
|
+ bool resume_shader, uint32_t payload_size, struct radv_device *device,
|
|
+ struct radv_ray_tracing_pipeline *pipeline, bool monolithic,
|
|
+ const struct radv_ray_tracing_stage_info *traversal_info)
|
|
{
|
|
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
|
|
|
@@ -1867,7 +1905,7 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH
|
|
struct rt_variables vars = create_rt_variables(shader, device, create_flags, monolithic);
|
|
|
|
if (monolithic)
|
|
- lower_rt_instructions_monolithic(shader, device, pipeline, pCreateInfo, &vars);
|
|
+ lower_rt_instructions_monolithic(shader, device, pipeline, info, pCreateInfo, payload_size, &vars);
|
|
|
|
struct radv_rt_shader_info rt_info = {0};
|
|
|
|
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
|
|
index 36ad1d0dd8bf9..4ba7e36d16952 100644
|
|
--- a/src/amd/vulkan/radv_shader.h
|
|
+++ b/src/amd/vulkan/radv_shader.h
|
|
@@ -522,7 +522,7 @@ struct radv_ray_tracing_stage_info;
|
|
|
|
void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
|
|
const struct radv_shader_args *args, const struct radv_shader_info *info,
|
|
- uint32_t *stack_size, bool resume_shader, struct radv_device *device,
|
|
+ uint32_t *stack_size, bool resume_shader, uint32_t payload_size, struct radv_device *device,
|
|
struct radv_ray_tracing_pipeline *pipeline, bool monolithic,
|
|
const struct radv_ray_tracing_stage_info *traversal_info);
|
|
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 1ef679cac11353eba65d518f0728747550d40926 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Mon, 17 Jun 2024 13:02:28 +0200
|
|
Subject: [PATCH 20/71] aco: Add function call attributes
|
|
|
|
ACO needs RADV to set call ABI, function and parameter attributes on NIR functions to help with
|
|
compilation of function calls.
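
A minimal sketch of the intended usage, mirroring how later patches in this
series consume the header (this assumes the nir_function/nir_parameter
driver_attributes fields the series relies on):

    #include "aco_nir_call_attribs.h"

    /* A recursively callable RT stage whose call target can diverge per lane: */
    function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL;
    /* A parameter no callee reads, so it does not need to be preserved: */
    function->params[0].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE;
    /* ACO recovers the ABI via the mask: */
    enum aco_nir_call_abi abi = (enum aco_nir_call_abi)(function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK);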
|
|
---
|
|
src/amd/compiler/aco_nir_call_attribs.h | 29 +++++++++++++++++++++++++
|
|
1 file changed, 29 insertions(+)
|
|
create mode 100644 src/amd/compiler/aco_nir_call_attribs.h
|
|
|
|
diff --git a/src/amd/compiler/aco_nir_call_attribs.h b/src/amd/compiler/aco_nir_call_attribs.h
|
|
new file mode 100644
|
|
index 0000000000000..33dc011914cd9
|
|
--- /dev/null
|
|
+++ b/src/amd/compiler/aco_nir_call_attribs.h
|
|
@@ -0,0 +1,29 @@
|
|
+/*
|
|
+ * Copyright © 2024 Valve Corporation
|
|
+ *
|
|
+ * SPDX-License-Identifier: MIT
|
|
+ */
|
|
+
|
|
+#ifndef ACO_NIR_CALL_ATTRIBS_H
|
|
+#define ACO_NIR_CALL_ATTRIBS_H
|
|
+
|
|
+enum aco_nir_call_abi {
|
|
+ ACO_NIR_CALL_ABI_RT_RECURSIVE,
|
|
+ ACO_NIR_CALL_ABI_TRAVERSAL,
|
|
+ ACO_NIR_CALL_ABI_AHIT_ISEC,
|
|
+};
|
|
+
|
|
+enum aco_nir_function_attribs {
|
|
+ ACO_NIR_FUNCTION_ATTRIB_ABI_MASK = 0x7F,
|
|
+ /* Different lanes can have different values for the function pointer to call */
|
|
+ ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL = 0x1 << 7,
|
|
+ /* Function will never return */
|
|
+ ACO_NIR_FUNCTION_ATTRIB_NORETURN = 0x2 << 7,
|
|
+};
|
|
+
|
|
+enum aco_nir_parameter_attribs {
|
|
+ /* Parameter value is not used by any callee and does not need to be preserved */
|
|
+ ACO_NIR_PARAM_ATTRIB_DISCARDABLE = 0x1,
|
|
+};
|
|
+
|
|
+#endif /* ACO_NIR_CALL_ATTRIBS_H */
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 10abf8a72b902de027999226432bca4621cde2de Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Thu, 3 Oct 2024 12:34:25 +0200
|
|
Subject: [PATCH 21/71] radv/rt: Lower descriptor loads to param loads
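
For illustration, the parameter-based path load_desc_ptr takes for a
closest-hit or miss shader after this change (mirroring the hunk below):

    /* The 32-bit descriptor base arrives as a function parameter
     * instead of a user SGPR argument: */
    nir_def *addr = nir_load_param(b, CHIT_MISS_ARG_DESCRIPTORS);
    addr = convert_pointer_to_64_bit(b, state, addr);
    nir_def *set_ptr = nir_load_smem_amd(b, 1, addr, nir_imm_int(b, set * 4));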
|
|
|
|
---
|
|
.../nir/radv_nir_apply_pipeline_layout.c | 46 +++++++++++++++++--
|
|
1 file changed, 42 insertions(+), 4 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c b/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c
|
|
index fd67c3eb18f5e..991cc31eadafd 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c
|
|
@@ -5,6 +5,7 @@
|
|
*/
|
|
#include "ac_descriptors.h"
|
|
#include "ac_shader_util.h"
|
|
+#include "aco_nir_call_attribs.h"
|
|
#include "nir.h"
|
|
#include "nir_builder.h"
|
|
#include "radv_descriptor_set.h"
|
|
@@ -34,6 +35,42 @@ get_scalar_arg(nir_builder *b, unsigned size, struct ac_arg arg)
|
|
return nir_load_scalar_arg_amd(b, size, .base = arg.arg_index);
|
|
}
|
|
|
|
+static nir_def *
|
|
+get_indirect_descriptors_addr(nir_builder *b, apply_layout_state *state)
|
|
+{
|
|
+ switch (b->shader->info.stage) {
|
|
+ case MESA_SHADER_RAYGEN:
|
|
+ case MESA_SHADER_CALLABLE:
|
|
+ return nir_load_param(b, RAYGEN_ARG_DESCRIPTORS);
|
|
+ case MESA_SHADER_INTERSECTION:
|
|
+ return nir_load_param(b, TRAVERSAL_ARG_DESCRIPTORS);
|
|
+ case MESA_SHADER_CLOSEST_HIT:
|
|
+ case MESA_SHADER_MISS:
|
|
+ return nir_load_param(b, CHIT_MISS_ARG_DESCRIPTORS);
|
|
+ default:
|
|
+ assert(!gl_shader_stage_is_rt(b->shader->info.stage));
|
|
+ return get_scalar_arg(b, 1, state->args->descriptor_sets[0]);
|
|
+ }
|
|
+}
|
|
+
|
|
+static nir_def *
|
|
+get_indirect_push_constants_addr(nir_builder *b, apply_layout_state *state)
|
|
+{
|
|
+ switch (b->shader->info.stage) {
|
|
+ case MESA_SHADER_RAYGEN:
|
|
+ case MESA_SHADER_CALLABLE:
|
|
+ return nir_load_param(b, RAYGEN_ARG_PUSH_CONSTANTS);
|
|
+ case MESA_SHADER_INTERSECTION:
|
|
+ return nir_load_param(b, TRAVERSAL_ARG_PUSH_CONSTANTS);
|
|
+ case MESA_SHADER_CLOSEST_HIT:
|
|
+ case MESA_SHADER_MISS:
|
|
+ return nir_load_param(b, CHIT_MISS_ARG_PUSH_CONSTANTS);
|
|
+ default:
|
|
+ assert(!gl_shader_stage_is_rt(b->shader->info.stage));
|
|
+ return get_scalar_arg(b, 1, state->args->ac.push_constants);
|
|
+ }
|
|
+}
|
|
+
|
|
static nir_def *
|
|
convert_pointer_to_64_bit(nir_builder *b, apply_layout_state *state, nir_def *ptr)
|
|
{
|
|
@@ -44,8 +81,9 @@ static nir_def *
|
|
load_desc_ptr(nir_builder *b, apply_layout_state *state, unsigned set)
|
|
{
|
|
const struct radv_userdata_locations *user_sgprs_locs = &state->info->user_sgprs_locs;
|
|
- if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) {
|
|
- nir_def *addr = get_scalar_arg(b, 1, state->args->descriptor_sets[0]);
|
|
+ if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1 ||
|
|
+ gl_shader_stage_is_rt(b->shader->info.stage)) {
|
|
+ nir_def *addr = get_indirect_descriptors_addr(b, state);
|
|
addr = convert_pointer_to_64_bit(b, state, addr);
|
|
return nir_load_smem_amd(b, 1, addr, nir_imm_int(b, set * 4));
|
|
}
|
|
@@ -67,7 +105,7 @@ visit_vulkan_resource_index(nir_builder *b, apply_layout_state *state, nir_intri
|
|
if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
|
|
layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) {
|
|
unsigned idx = state->layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset;
|
|
- set_ptr = get_scalar_arg(b, 1, state->args->ac.push_constants);
|
|
+ set_ptr = get_indirect_push_constants_addr(b, state);
|
|
offset = state->layout->push_constant_size + idx * 16;
|
|
stride = 16;
|
|
} else {
|
|
@@ -379,7 +417,7 @@ load_push_constant(nir_builder *b, apply_layout_state *state, nir_intrinsic_inst
|
|
}
|
|
|
|
if (!offset) {
|
|
- addr = get_scalar_arg(b, 1, state->args->ac.push_constants);
|
|
+ addr = get_indirect_push_constants_addr(b, state);
|
|
addr = convert_pointer_to_64_bit(b, state, addr);
|
|
offset = nir_iadd_imm_nuw(b, intrin->src[0].ssa, base);
|
|
}
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 41079fe63f7877dacb9fd3d8dc67740ed100439e Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 11:56:09 +0200
|
|
Subject: [PATCH 22/71] radv/rt: Create RT functions to call
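
A minimal sketch of how a callee signature is declared with the new helper
(payload size hypothetical):

    /* A closest-hit/miss callee with a 16-byte payload gets
     * DIV_ROUND_UP(16, 4) = 4 extra 32-bit payload parameters: */
    nir_function *chit_miss_func = nir_function_create(shader, "chit_miss_func");
    radv_nir_init_rt_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, 16);
    /* CHIT_MISS_ARG_PAYLOAD_BASE + i then indexes payload dword i. */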
|
|
|
|
---
|
|
src/amd/compiler/aco_nir_call_attribs.h | 59 +++++
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 276 +++++++++++++++++++++++-
|
|
2 files changed, 331 insertions(+), 4 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_nir_call_attribs.h b/src/amd/compiler/aco_nir_call_attribs.h
|
|
index 33dc011914cd9..a879c51ebb3c2 100644
|
|
--- a/src/amd/compiler/aco_nir_call_attribs.h
|
|
+++ b/src/amd/compiler/aco_nir_call_attribs.h
|
|
@@ -26,4 +26,63 @@ enum aco_nir_parameter_attribs {
|
|
ACO_NIR_PARAM_ATTRIB_DISCARDABLE = 0x1,
|
|
};
|
|
|
|
+enum aco_nir_raygen_function_arg {
|
|
+ RAYGEN_ARG_LAUNCH_ID = 0,
|
|
+ RAYGEN_ARG_LAUNCH_SIZE,
|
|
+ RAYGEN_ARG_DESCRIPTORS,
|
|
+ RAYGEN_ARG_PUSH_CONSTANTS,
|
|
+ RAYGEN_ARG_SBT_DESCRIPTORS,
|
|
+ RAYGEN_ARG_TRAVERSAL_ADDR,
|
|
+ RAYGEN_ARG_SHADER_RECORD_PTR,
|
|
+ RAYGEN_ARG_COUNT,
|
|
+};
|
|
+
|
|
+enum aco_nir_traversal_function_arg {
|
|
+ TRAVERSAL_ARG_LAUNCH_ID = 0,
|
|
+ TRAVERSAL_ARG_LAUNCH_SIZE,
|
|
+ TRAVERSAL_ARG_DESCRIPTORS,
|
|
+ TRAVERSAL_ARG_PUSH_CONSTANTS,
|
|
+ TRAVERSAL_ARG_SBT_DESCRIPTORS,
|
|
+ TRAVERSAL_ARG_TRAVERSAL_ADDR,
|
|
+ TRAVERSAL_ARG_SHADER_RECORD_PTR,
|
|
+ TRAVERSAL_ARG_ACCEL_STRUCT,
|
|
+ TRAVERSAL_ARG_CULL_MASK_AND_FLAGS,
|
|
+ TRAVERSAL_ARG_SBT_OFFSET,
|
|
+ TRAVERSAL_ARG_SBT_STRIDE,
|
|
+ TRAVERSAL_ARG_MISS_INDEX,
|
|
+ TRAVERSAL_ARG_RAY_ORIGIN,
|
|
+ TRAVERSAL_ARG_RAY_TMIN,
|
|
+ TRAVERSAL_ARG_RAY_DIRECTION,
|
|
+ TRAVERSAL_ARG_RAY_TMAX,
|
|
+ TRAVERSAL_ARG_PRIMITIVE_ID,
|
|
+ TRAVERSAL_ARG_INSTANCE_ADDR,
|
|
+ TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS,
|
|
+ TRAVERSAL_ARG_HIT_KIND,
|
|
+ TRAVERSAL_ARG_PAYLOAD_BASE,
|
|
+};
|
|
+
|
|
+enum aco_nir_chit_miss_function_arg {
|
|
+ CHIT_MISS_ARG_LAUNCH_ID = 0,
|
|
+ CHIT_MISS_ARG_LAUNCH_SIZE,
|
|
+ CHIT_MISS_ARG_DESCRIPTORS,
|
|
+ CHIT_MISS_ARG_PUSH_CONSTANTS,
|
|
+ CHIT_MISS_ARG_SBT_DESCRIPTORS,
|
|
+ CHIT_MISS_ARG_TRAVERSAL_ADDR,
|
|
+ CHIT_MISS_ARG_SHADER_RECORD_PTR,
|
|
+ CHIT_MISS_ARG_ACCEL_STRUCT,
|
|
+ CHIT_MISS_ARG_CULL_MASK_AND_FLAGS,
|
|
+ CHIT_MISS_ARG_SBT_OFFSET,
|
|
+ CHIT_MISS_ARG_SBT_STRIDE,
|
|
+ CHIT_MISS_ARG_MISS_INDEX,
|
|
+ CHIT_MISS_ARG_RAY_ORIGIN,
|
|
+ CHIT_MISS_ARG_RAY_TMIN,
|
|
+ CHIT_MISS_ARG_RAY_DIRECTION,
|
|
+ CHIT_MISS_ARG_RAY_TMAX,
|
|
+ CHIT_MISS_ARG_PRIMITIVE_ID,
|
|
+ CHIT_MISS_ARG_INSTANCE_ADDR,
|
|
+ CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS,
|
|
+ CHIT_MISS_ARG_HIT_KIND,
|
|
+ CHIT_MISS_ARG_PAYLOAD_BASE,
|
|
+};
|
|
+
|
|
#endif /* ACO_NIR_CALL_ATTRIBS_H */
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index 061c58d45949f..165c7e18578e0 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -16,6 +16,8 @@
|
|
#include "radv_pipeline_rt.h"
|
|
#include "radv_shader.h"
|
|
|
|
+#include "aco_nir_call_attribs.h"
|
|
+
|
|
#include "vk_pipeline.h"
|
|
|
|
/* Traversal stack size. This stack is put in LDS and experimentally 16 entries results in best
|
|
@@ -164,6 +166,243 @@ lower_rt_derefs(nir_shader *shader)
|
|
return progress;
|
|
}
|
|
|
|
+static void
|
|
+radv_nir_init_rt_function_params(nir_function *function, gl_shader_stage stage, unsigned payload_size)
|
|
+{
|
|
+ unsigned payload_base = -1u;
|
|
+
|
|
+ switch (stage) {
|
|
+ case MESA_SHADER_RAYGEN:
|
|
+ function->num_params = RAYGEN_ARG_COUNT;
|
|
+ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params);
|
|
+ function->params[RAYGEN_ARG_LAUNCH_ID].num_components = 3;
|
|
+ function->params[RAYGEN_ARG_LAUNCH_ID].bit_size = 32;
|
|
+ function->params[RAYGEN_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3);
|
|
+ function->params[RAYGEN_ARG_LAUNCH_SIZE].num_components = 3;
|
|
+ function->params[RAYGEN_ARG_LAUNCH_SIZE].bit_size = 32;
|
|
+ function->params[RAYGEN_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3);
|
|
+ function->params[RAYGEN_ARG_LAUNCH_SIZE].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_DESCRIPTORS].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_DESCRIPTORS].bit_size = 32;
|
|
+ function->params[RAYGEN_ARG_DESCRIPTORS].type = glsl_uint_type();
|
|
+ function->params[RAYGEN_ARG_DESCRIPTORS].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].bit_size = 32;
|
|
+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].type = glsl_uint_type();
|
|
+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].bit_size = 64;
|
|
+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type();
|
|
+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].bit_size = 64;
|
|
+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type();
|
|
+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].bit_size = 64;
|
|
+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type();
|
|
+ function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_NORETURN;
|
|
+ break;
|
|
+ case MESA_SHADER_CALLABLE:
|
|
+ function->num_params = RAYGEN_ARG_COUNT + DIV_ROUND_UP(payload_size, 4);
|
|
+ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params);
|
|
+ function->params[RAYGEN_ARG_LAUNCH_ID].num_components = 3;
|
|
+ function->params[RAYGEN_ARG_LAUNCH_ID].bit_size = 32;
|
|
+ function->params[RAYGEN_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3);
|
|
+ function->params[RAYGEN_ARG_LAUNCH_SIZE].num_components = 3;
|
|
+ function->params[RAYGEN_ARG_LAUNCH_SIZE].bit_size = 32;
|
|
+ function->params[RAYGEN_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3);
|
|
+ function->params[RAYGEN_ARG_LAUNCH_SIZE].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_DESCRIPTORS].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_DESCRIPTORS].bit_size = 32;
|
|
+ function->params[RAYGEN_ARG_DESCRIPTORS].type = glsl_uint_type();
|
|
+ function->params[RAYGEN_ARG_DESCRIPTORS].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].bit_size = 32;
|
|
+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].type = glsl_uint_type();
|
|
+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].bit_size = 64;
|
|
+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type();
|
|
+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].bit_size = 64;
|
|
+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type();
|
|
+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].is_uniform = true;
|
|
+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].num_components = 1;
|
|
+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].bit_size = 64;
|
|
+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type();
|
|
+
|
|
+ function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL;
|
|
+ payload_base = RAYGEN_ARG_COUNT;
|
|
+ break;
|
|
+ case MESA_SHADER_INTERSECTION:
|
|
+ function->num_params = TRAVERSAL_ARG_PAYLOAD_BASE + DIV_ROUND_UP(payload_size, 4);
|
|
+ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params);
|
|
+ function->params[TRAVERSAL_ARG_LAUNCH_ID].num_components = 3;
|
|
+ function->params[TRAVERSAL_ARG_LAUNCH_ID].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3);
|
|
+ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].num_components = 3;
|
|
+ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3);
|
|
+ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].is_uniform = true;
|
|
+ function->params[TRAVERSAL_ARG_DESCRIPTORS].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_DESCRIPTORS].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_DESCRIPTORS].type = glsl_uint_type();
|
|
+ function->params[TRAVERSAL_ARG_DESCRIPTORS].is_uniform = true;
|
|
+ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].type = glsl_uint_type();
|
|
+ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].is_uniform = true;
|
|
+ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].bit_size = 64;
|
|
+ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type();
|
|
+ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].is_uniform = true;
|
|
+ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].bit_size = 64;
|
|
+ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type();
|
|
+ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].is_uniform = true;
|
|
+ function->params[TRAVERSAL_ARG_SHADER_RECORD_PTR].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_SHADER_RECORD_PTR].bit_size = 64;
|
|
+ function->params[TRAVERSAL_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type();
|
|
+ function->params[TRAVERSAL_ARG_ACCEL_STRUCT].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_ACCEL_STRUCT].bit_size = 64;
|
|
+ function->params[TRAVERSAL_ARG_ACCEL_STRUCT].type = glsl_uint64_t_type();
|
|
+ function->params[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS].type = glsl_uint_type();
|
|
+ function->params[TRAVERSAL_ARG_SBT_OFFSET].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_SBT_OFFSET].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_SBT_OFFSET].type = glsl_uint_type();
|
|
+ function->params[TRAVERSAL_ARG_SBT_STRIDE].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_SBT_STRIDE].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_SBT_STRIDE].type = glsl_uint_type();
|
|
+ function->params[TRAVERSAL_ARG_MISS_INDEX].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_MISS_INDEX].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_MISS_INDEX].type = glsl_uint_type();
|
|
+ function->params[TRAVERSAL_ARG_RAY_ORIGIN].num_components = 3;
|
|
+ function->params[TRAVERSAL_ARG_RAY_ORIGIN].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_RAY_ORIGIN].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
|
|
+ function->params[TRAVERSAL_ARG_RAY_TMIN].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_RAY_TMIN].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_RAY_TMIN].type = glsl_float_type();
|
|
+ function->params[TRAVERSAL_ARG_RAY_DIRECTION].num_components = 3;
|
|
+ function->params[TRAVERSAL_ARG_RAY_DIRECTION].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_RAY_DIRECTION].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
|
|
+ function->params[TRAVERSAL_ARG_RAY_TMAX].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_RAY_TMAX].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_RAY_TMAX].type = glsl_float_type();
|
|
+ function->params[TRAVERSAL_ARG_PRIMITIVE_ID].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_PRIMITIVE_ID].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_PRIMITIVE_ID].type = glsl_uint_type();
|
|
+ function->params[TRAVERSAL_ARG_INSTANCE_ADDR].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_INSTANCE_ADDR].bit_size = 64;
|
|
+ function->params[TRAVERSAL_ARG_INSTANCE_ADDR].type = glsl_uint64_t_type();
|
|
+ function->params[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS].type = glsl_uint_type();
|
|
+ function->params[TRAVERSAL_ARG_HIT_KIND].num_components = 1;
|
|
+ function->params[TRAVERSAL_ARG_HIT_KIND].bit_size = 32;
|
|
+ function->params[TRAVERSAL_ARG_HIT_KIND].type = glsl_uint_type();
|
|
+
|
|
+ function->driver_attributes = ACO_NIR_CALL_ABI_TRAVERSAL;
|
|
+ payload_base = TRAVERSAL_ARG_PAYLOAD_BASE;
|
|
+ break;
|
|
+ case MESA_SHADER_CLOSEST_HIT:
|
|
+ case MESA_SHADER_MISS:
|
|
+ function->num_params = CHIT_MISS_ARG_PAYLOAD_BASE + DIV_ROUND_UP(payload_size, 4);
|
|
+ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params);
|
|
+ function->params[CHIT_MISS_ARG_LAUNCH_ID].num_components = 3;
|
|
+ function->params[CHIT_MISS_ARG_LAUNCH_ID].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3);
|
|
+ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].num_components = 3;
|
|
+ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3);
|
|
+ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].is_uniform = true;
|
|
+ function->params[CHIT_MISS_ARG_DESCRIPTORS].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_DESCRIPTORS].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_DESCRIPTORS].type = glsl_uint_type();
|
|
+ function->params[CHIT_MISS_ARG_DESCRIPTORS].is_uniform = true;
|
|
+ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].type = glsl_uint_type();
|
|
+ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].is_uniform = true;
|
|
+ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].bit_size = 64;
|
|
+ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type();
|
|
+ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].is_uniform = true;
|
|
+ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].bit_size = 64;
|
|
+ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type();
|
|
+ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].is_uniform = true;
|
|
+ function->params[CHIT_MISS_ARG_SHADER_RECORD_PTR].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_SHADER_RECORD_PTR].bit_size = 64;
|
|
+ function->params[CHIT_MISS_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type();
|
|
+ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].bit_size = 64;
|
|
+ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE;
|
|
+ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].type = glsl_uint64_t_type();
|
|
+ function->params[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS].type = glsl_uint_type();
|
|
+ function->params[CHIT_MISS_ARG_SBT_OFFSET].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_SBT_OFFSET].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_SBT_OFFSET].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE;
|
|
+ function->params[CHIT_MISS_ARG_SBT_OFFSET].type = glsl_uint_type();
|
|
+ function->params[CHIT_MISS_ARG_SBT_STRIDE].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_SBT_STRIDE].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_SBT_STRIDE].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE;
|
|
+ function->params[CHIT_MISS_ARG_SBT_STRIDE].type = glsl_uint_type();
|
|
+ function->params[CHIT_MISS_ARG_MISS_INDEX].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_MISS_INDEX].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_MISS_INDEX].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE;
|
|
+ function->params[CHIT_MISS_ARG_MISS_INDEX].type = glsl_uint_type();
|
|
+ function->params[CHIT_MISS_ARG_RAY_ORIGIN].num_components = 3;
|
|
+ function->params[CHIT_MISS_ARG_RAY_ORIGIN].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_RAY_ORIGIN].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
|
|
+ function->params[CHIT_MISS_ARG_RAY_TMIN].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_RAY_TMIN].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_RAY_TMIN].type = glsl_float_type();
|
|
+ function->params[CHIT_MISS_ARG_RAY_DIRECTION].num_components = 3;
|
|
+ function->params[CHIT_MISS_ARG_RAY_DIRECTION].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_RAY_DIRECTION].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
|
|
+ function->params[CHIT_MISS_ARG_RAY_TMAX].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_RAY_TMAX].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_RAY_TMAX].type = glsl_float_type();
|
|
+ function->params[CHIT_MISS_ARG_PRIMITIVE_ID].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_PRIMITIVE_ID].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_PRIMITIVE_ID].type = glsl_uint_type();
|
|
+ function->params[CHIT_MISS_ARG_INSTANCE_ADDR].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_INSTANCE_ADDR].bit_size = 64;
|
|
+ function->params[CHIT_MISS_ARG_INSTANCE_ADDR].type = glsl_uint64_t_type();
|
|
+ function->params[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS].type = glsl_uint_type();
|
|
+ function->params[CHIT_MISS_ARG_HIT_KIND].num_components = 1;
|
|
+ function->params[CHIT_MISS_ARG_HIT_KIND].bit_size = 32;
|
|
+ function->params[CHIT_MISS_ARG_HIT_KIND].type = glsl_uint_type();
|
|
+
|
|
+ function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL;
|
|
+ payload_base = CHIT_MISS_ARG_PAYLOAD_BASE;
|
|
+ break;
|
|
+ default:
|
|
+ unreachable("invalid RT stage");
|
|
+ }
|
|
+
|
|
+ if (payload_base != -1u) {
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) {
|
|
+ function->params[payload_base + i].num_components = 1;
|
|
+ function->params[payload_base + i].bit_size = 32;
|
|
+ function->params[payload_base + i].is_return = true;
|
|
+ function->params[payload_base + i].type = glsl_uint_type();
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* Entrypoints can't have parameters. Consider RT stages as callable functions */
|
|
+ function->is_exported = true;
|
|
+ function->is_entrypoint = false;
|
|
+}
|
|
+
|
|
/*
|
|
* Global variables for an RT pipeline
|
|
*/
|
|
@@ -180,6 +419,8 @@ struct rt_variables {
|
|
nir_variable *shader_addr;
|
|
nir_variable *traversal_addr;
|
|
|
|
+ nir_variable *sbt_descriptors;
|
|
+
|
|
/* scratch offset of the argument area relative to stack_ptr */
|
|
nir_variable *arg;
|
|
uint32_t payload_offset;
|
|
@@ -217,12 +458,19 @@ struct rt_variables {
|
|
nir_variable *ahit_terminate;
|
|
nir_variable *terminated;
|
|
|
|
+ nir_variable **out_payload_storage;
|
|
+ unsigned payload_size;
|
|
+
|
|
+ nir_function *trace_ray_func;
|
|
+ nir_function *chit_miss_func;
|
|
+ nir_function *callable_func;
|
|
+
|
|
unsigned stack_size;
|
|
};
|
|
|
|
static struct rt_variables
|
|
create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipelineCreateFlags2KHR flags,
|
|
- bool monolithic)
|
|
+ unsigned max_payload_size, bool monolithic)
|
|
{
|
|
struct rt_variables vars = {
|
|
.device = device,
|
|
@@ -236,6 +484,8 @@ create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipe
|
|
vars.stack_ptr = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "stack_ptr");
|
|
vars.shader_record_ptr = nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "shader_record_ptr");
|
|
|
|
+ vars.sbt_descriptors = nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "sbt_descriptors");
|
|
+
|
|
vars.launch_sizes[0] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "launch_size_x");
|
|
vars.launch_sizes[1] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "launch_size_y");
|
|
vars.launch_sizes[2] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "launch_size_z");
|
|
@@ -269,6 +519,23 @@ create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipe
|
|
vars.ahit_terminate = nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "ahit_terminate");
|
|
vars.terminated = nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "terminated");
|
|
|
|
+ if (max_payload_size)
|
|
+ vars.out_payload_storage = rzalloc_array_size(shader, DIV_ROUND_UP(max_payload_size, 4), sizeof(nir_variable *));
|
|
+ vars.payload_size = max_payload_size;
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(max_payload_size, 4); ++i) {
|
|
+ vars.out_payload_storage[i] =
|
|
+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "out_payload_storage");
|
|
+ }
|
|
+
|
|
+ nir_function *trace_ray_func = nir_function_create(shader, "trace_ray_func");
|
|
+ radv_nir_init_rt_function_params(trace_ray_func, MESA_SHADER_INTERSECTION, max_payload_size);
|
|
+ vars.trace_ray_func = trace_ray_func;
|
|
+ nir_function *chit_miss_func = nir_function_create(shader, "chit_miss_func");
|
|
+ radv_nir_init_rt_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, max_payload_size);
|
|
+ vars.chit_miss_func = chit_miss_func;
|
|
+ nir_function *callable_func = nir_function_create(shader, "callable_func");
|
|
+ radv_nir_init_rt_function_params(callable_func, MESA_SHADER_CALLABLE, max_payload_size);
|
|
+ vars.callable_func = callable_func;
|
|
return vars;
|
|
}
|
|
|
|
@@ -850,7 +1117,8 @@ insert_rt_case(nir_builder *b, nir_shader *shader, struct rt_variables *vars, ni
|
|
|
|
nir_opt_dead_cf(shader);
|
|
|
|
- struct rt_variables src_vars = create_rt_variables(shader, vars->device, vars->flags, vars->monolithic);
|
|
+ struct rt_variables src_vars =
|
|
+ create_rt_variables(shader, vars->device, vars->flags, vars->payload_size, vars->monolithic);
|
|
map_rt_variables(var_remap, &src_vars, vars);
|
|
|
|
NIR_PASS_V(shader, lower_rt_instructions, &src_vars, false, NULL);
|
|
@@ -1723,7 +1991,7 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_
|
|
b.shader->info.workgroup_size[0] = 8;
|
|
b.shader->info.workgroup_size[1] = pdev->rt_wave_size == 64 ? 8 : 4;
|
|
b.shader->info.shared_size = pdev->rt_wave_size * MAX_STACK_ENTRY_COUNT * sizeof(uint32_t);
|
|
- struct rt_variables vars = create_rt_variables(b.shader, device, create_flags, false);
|
|
+ struct rt_variables vars = create_rt_variables(b.shader, device, create_flags, 0, false);
|
|
|
|
if (info->tmin.state == RADV_RT_CONST_ARG_STATE_VALID)
|
|
nir_store_var(&b, vars.tmin, nir_imm_int(&b, info->tmin.value), 0x1);
|
|
@@ -1902,7 +2170,7 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH
|
|
|
|
const VkPipelineCreateFlagBits2KHR create_flags = vk_rt_pipeline_create_flags(pCreateInfo);
|
|
|
|
- struct rt_variables vars = create_rt_variables(shader, device, create_flags, monolithic);
|
|
+ struct rt_variables vars = create_rt_variables(shader, device, create_flags, payload_size, monolithic);
|
|
|
|
if (monolithic)
|
|
lower_rt_instructions_monolithic(shader, device, pipeline, info, pCreateInfo, payload_size, &vars);
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 7c97b73c788dcab2347225073bac244aa8aea252 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 12:04:36 +0200
|
|
Subject: [PATCH 23/71] radv/rt: Convert lower_rt_derefs to register payloads
|
|
|
|
All payloads alias the same registers by the time RT functions get
|
|
called. In order to pretend that the payload variables (represented by
|
|
function_temp vars) are separate, payload values are copied to the
|
|
"global" payload variables (shader_temp variables) just before a shader
|
|
call, and copied from there immediately after the shader call.
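
A minimal sketch of the copies the pass emits around a trace_ray/execute_*
call site (variable names illustrative):

    nir_builder b = nir_builder_at(nir_before_instr(call_instr));
    nir_deref_instr *old_deref = nir_build_deref_var(&b, payload_var);   /* original var, now shader_temp */
    nir_deref_instr *new_deref = nir_build_deref_var(&b, payload_clone); /* per-impl clone in impl->locals */
    nir_copy_deref(&b, new_deref, old_deref);  /* copy before the call */
    b.cursor = nir_after_instr(call_instr);
    nir_copy_deref(&b, old_deref, new_deref);  /* copy back after the call */
    /* the call's payload source is rewritten to new_deref */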
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 84 ++++++++++++++++++++-----
|
|
1 file changed, 68 insertions(+), 16 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index 165c7e18578e0..0ebb095f52e1c 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -126,6 +126,62 @@ radv_visit_inlined_shaders(nir_builder *b, nir_def *sbt_idx, bool can_have_null_
|
|
free(cases);
|
|
}
|
|
|
|
+static void
|
|
+lower_rt_deref_var(nir_shader *shader, nir_function_impl *impl, nir_instr *instr, struct hash_table *cloned_vars)
|
|
+{
|
|
+ nir_deref_instr *deref = nir_instr_as_deref(instr);
|
|
+ nir_variable *var = deref->var;
|
|
+ struct hash_entry *entry = _mesa_hash_table_search(cloned_vars, var);
|
|
+ if (!(var->data.mode & nir_var_function_temp) && !entry)
|
|
+ return;
|
|
+
|
|
+ hash_table_foreach (cloned_vars, cloned_entry) {
|
|
+ if (var == cloned_entry->data)
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ nir_variable *new_var;
|
|
+ if (entry) {
|
|
+ new_var = entry->data;
|
|
+ } else {
|
|
+ new_var = nir_variable_clone(var, shader);
|
|
+ _mesa_hash_table_insert(cloned_vars, var, new_var);
|
|
+
|
|
+ exec_node_remove(&var->node);
|
|
+ var->data.mode = nir_var_shader_temp;
|
|
+ exec_list_push_tail(&shader->variables, &var->node);
|
|
+
|
|
+ exec_list_push_tail(&impl->locals, &new_var->node);
|
|
+ }
|
|
+
|
|
+ deref->modes = nir_var_shader_temp;
|
|
+
|
|
+ nir_foreach_use_safe (use, nir_instr_def(instr)) {
|
|
+ if (nir_src_is_if(use))
|
|
+ continue;
|
|
+
|
|
+ nir_instr *parent = nir_src_parent_instr(use);
|
|
+ if (parent->type != nir_instr_type_intrinsic)
|
|
+ continue;
|
|
+
|
|
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(parent);
|
|
+ if (intrin->intrinsic != nir_intrinsic_trace_ray && intrin->intrinsic != nir_intrinsic_execute_callable &&
|
|
+ intrin->intrinsic != nir_intrinsic_execute_closest_hit_amd &&
|
|
+ intrin->intrinsic != nir_intrinsic_execute_miss_amd)
|
|
+ continue;
|
|
+
|
|
+ nir_builder b = nir_builder_at(nir_before_instr(parent));
|
|
+ nir_deref_instr *old_deref = nir_build_deref_var(&b, var);
|
|
+ nir_deref_instr *new_deref = nir_build_deref_var(&b, new_var);
|
|
+
|
|
+ nir_copy_deref(&b, new_deref, old_deref);
|
|
+ b.cursor = nir_after_instr(parent);
|
|
+ nir_copy_deref(&b, old_deref, new_deref);
|
|
+
|
|
+ nir_src_rewrite(use, nir_instr_def(&new_deref->instr));
|
|
+ }
|
|
+}
|
|
+
|
|
static bool
|
|
lower_rt_derefs(nir_shader *shader)
|
|
{
|
|
@@ -133,9 +189,7 @@ lower_rt_derefs(nir_shader *shader)
|
|
|
|
bool progress = false;
|
|
|
|
- nir_builder b = nir_builder_at(nir_before_impl(impl));
|
|
-
|
|
- nir_def *arg_offset = nir_load_rt_arg_scratch_offset_amd(&b);
|
|
+ struct hash_table *cloned_vars = _mesa_pointer_hash_table_create(shader);
|
|
|
|
nir_foreach_block (block, impl) {
|
|
nir_foreach_instr_safe (instr, block) {
|
|
@@ -143,17 +197,18 @@ lower_rt_derefs(nir_shader *shader)
|
|
continue;
|
|
|
|
nir_deref_instr *deref = nir_instr_as_deref(instr);
|
|
- if (!nir_deref_mode_is(deref, nir_var_shader_call_data))
|
|
+ if (!nir_deref_mode_is(deref, nir_var_function_temp))
|
|
continue;
|
|
|
|
- deref->modes = nir_var_function_temp;
|
|
- progress = true;
|
|
-
|
|
if (deref->deref_type == nir_deref_type_var) {
|
|
- b.cursor = nir_before_instr(&deref->instr);
|
|
- nir_deref_instr *replacement =
|
|
- nir_build_deref_cast(&b, arg_offset, nir_var_function_temp, deref->var->type, 0);
|
|
- nir_def_replace(&deref->def, &replacement->def);
|
|
+ lower_rt_deref_var(shader, impl, instr, cloned_vars);
|
|
+ progress = true;
|
|
+ } else {
|
|
+ assert(deref->deref_type != nir_deref_type_cast);
|
|
+ /* Parent modes might have changed, propagate change */
|
|
+ nir_deref_instr *parent = nir_src_as_deref(deref->parent);
|
|
+ if (parent->modes != deref->modes)
|
|
+ deref->modes = parent->modes;
|
|
}
|
|
}
|
|
}
|
|
@@ -1139,12 +1194,9 @@ void
|
|
radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset)
|
|
{
|
|
if (!monolithic) {
|
|
- NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp | nir_var_shader_call_data,
|
|
- glsl_get_natural_size_align_bytes);
|
|
-
|
|
NIR_PASS(_, nir, lower_rt_derefs);
|
|
-
|
|
- NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset);
|
|
+ NIR_PASS(_, nir, nir_split_var_copies);
|
|
+ NIR_PASS(_, nir, nir_lower_var_copies);
|
|
} else {
|
|
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_natural_size_align_bytes);
|
|
NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_shader_temp, glsl_get_natural_size_align_bytes);
|
|
--
|
|
GitLab
|
|
|
|
|
|
From c45e4fbee8cb3c930d13c5ce1c1478b68fdcbbb5 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 12:09:15 +0200
|
|
Subject: [PATCH 24/71] radv/rt: Align radv_nir_lower_rt_io to new lowering
|
|
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 10 +++++-----
|
|
src/amd/vulkan/radv_shader.h | 2 +-
|
|
2 files changed, 6 insertions(+), 6 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index 0ebb095f52e1c..7708dd8809b79 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -1191,7 +1191,7 @@ insert_rt_case(nir_builder *b, nir_shader *shader, struct rt_variables *vars, ni
|
|
}
|
|
|
|
void
|
|
-radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset)
|
|
+radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset, uint32_t *payload_size)
|
|
{
|
|
if (!monolithic) {
|
|
NIR_PASS(_, nir, lower_rt_derefs);
|
|
@@ -1625,7 +1625,7 @@ radv_build_ahit_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g
|
|
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir);
|
|
assert(nir_stage);
|
|
|
|
- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset);
|
|
+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL);
|
|
|
|
insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.any_hit_index);
|
|
ralloc_free(nir_stage);
|
|
@@ -1649,7 +1649,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g
|
|
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->intersection_shader].nir);
|
|
assert(nir_stage);
|
|
|
|
- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset);
|
|
+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL);
|
|
|
|
nir_shader *any_hit_stage = NULL;
|
|
if (group->any_hit_shader != VK_SHADER_UNUSED_KHR) {
|
|
@@ -1657,7 +1657,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g
|
|
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir);
|
|
assert(any_hit_stage);
|
|
|
|
- radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset);
|
|
+ radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset, NULL);
|
|
|
|
/* reserve stack size for any_hit before it is inlined */
|
|
data->pipeline->stages[group->any_hit_shader].stack_size = any_hit_stage->scratch_size;
|
|
@@ -1701,7 +1701,7 @@ radv_build_recursive_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_trac
|
|
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->recursive_shader].nir);
|
|
assert(nir_stage);
|
|
|
|
- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset);
|
|
+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL);
|
|
|
|
insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.general_index);
|
|
ralloc_free(nir_stage);
|
|
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
|
|
index 4ba7e36d16952..f6a0f35c23333 100644
|
|
--- a/src/amd/vulkan/radv_shader.h
|
|
+++ b/src/amd/vulkan/radv_shader.h
|
|
@@ -516,7 +516,7 @@ radv_get_rt_shader_entrypoint(nir_shader *shader)
|
|
return NULL;
|
|
}
|
|
|
|
-void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset);
|
|
+void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset, uint32_t *payload_size);
|
|
|
|
struct radv_ray_tracing_stage_info;
|
|
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 4b54715289586c84b393e264d99e85c327f614f6 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 12:10:31 +0200
|
|
Subject: [PATCH 25/71] radv/rt: Include inlined shader scratch size in
|
|
traversal scratch
|
|
|
|
When calls without tail-call optimization happen, the traversal shader
|
|
must spill, and spilled vars must be placed after shader scratch.
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 3 +++
|
|
1 file changed, 3 insertions(+)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index 7708dd8809b79..f29f91ce18178 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -1626,6 +1626,7 @@ radv_build_ahit_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g
|
|
assert(nir_stage);
|
|
|
|
radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL);
|
|
+ b->shader->scratch_size = MAX2(nir_stage->scratch_size, b->shader->scratch_size);
|
|
|
|
insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.any_hit_index);
|
|
ralloc_free(nir_stage);
|
|
@@ -1661,10 +1662,12 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g
|
|
|
|
/* reserve stack size for any_hit before it is inlined */
|
|
data->pipeline->stages[group->any_hit_shader].stack_size = any_hit_stage->scratch_size;
|
|
+ b->shader->scratch_size = MAX2(any_hit_stage->scratch_size, b->shader->scratch_size);
|
|
|
|
nir_lower_intersection_shader(nir_stage, any_hit_stage);
|
|
ralloc_free(any_hit_stage);
|
|
}
|
|
+ b->shader->scratch_size = MAX2(nir_stage->scratch_size, b->shader->scratch_size);
|
|
|
|
insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.intersection_index);
|
|
ralloc_free(nir_stage);
|
|
--
|
|
GitLab
|
|
|
|
|
|
From a86319221ae3924ff785061af08f4ae16cc851e9 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 12:17:15 +0200
|
|
Subject: [PATCH 26/71] radv/rt: Don't store vars->shader_record_ptr directly
|
|
in load_sbt_entry
|
|
|
|
When calling functions, we don't want the new shader record to stick
|
|
beyond the function call, so only store it when not calling functions.
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 20 ++++++++++++--------
|
|
1 file changed, 12 insertions(+), 8 deletions(-)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index f29f91ce18178..eeec13b0f539c 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -684,7 +684,7 @@ enum sbt_entry {
|
|
SBT_ANY_HIT_IDX = offsetof(struct radv_pipeline_group_handle, any_hit_index),
|
|
};
|
|
|
|
-static void
|
|
+static nir_def *
|
|
load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_def *idx, enum sbt_type binding,
|
|
enum sbt_entry offset)
|
|
{
|
|
@@ -704,7 +704,7 @@ load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_def *idx, en
|
|
}
|
|
|
|
nir_def *record_addr = nir_iadd_imm(b, addr, RADV_RT_HANDLE_SIZE - offset);
|
|
- nir_store_var(b, vars->shader_record_ptr, record_addr, 1);
|
|
+ return record_addr;
|
|
}
|
|
|
|
struct radv_rt_shader_info {
|
|
@@ -987,7 +987,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
|
|
nir_store_var(b, vars->instance_addr, intr->src[3].ssa, 0x1);
|
|
nir_store_var(b, vars->geometry_id_and_flags, intr->src[4].ssa, 0x1);
|
|
nir_store_var(b, vars->hit_kind, intr->src[5].ssa, 0x1);
|
|
- load_sbt_entry(b, vars, intr->src[0].ssa, SBT_HIT, SBT_RECURSIVE_PTR);
|
|
+ nir_def *record = load_sbt_entry(b, vars, intr->src[0].ssa, SBT_HIT, SBT_RECURSIVE_PTR);
|
|
|
|
nir_def *should_return =
|
|
nir_test_mask(b, nir_load_var(b, vars->cull_mask_and_flags), SpvRayFlagsSkipClosestHitShaderKHRMask);
|
|
@@ -1011,7 +1011,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
|
|
nir_store_var(b, vars->geometry_id_and_flags, undef, 0x1);
|
|
nir_store_var(b, vars->hit_kind, undef, 0x1);
|
|
nir_def *miss_index = nir_load_var(b, vars->miss_index);
|
|
- load_sbt_entry(b, vars, miss_index, SBT_MISS, SBT_RECURSIVE_PTR);
|
|
+ nir_def *record = load_sbt_entry(b, vars, miss_index, SBT_MISS, SBT_RECURSIVE_PTR);
|
|
|
|
if (!(vars->flags & VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) {
|
|
/* In case of a NULL miss shader, do nothing and just return. */
|
|
@@ -1741,7 +1741,8 @@ handle_candidate_triangle(nir_builder *b, struct radv_triangle_intersection *int
|
|
nir_store_var(b, inner_vars.instance_addr, nir_load_var(b, data->trav_vars->instance_addr), 0x1);
|
|
nir_store_var(b, inner_vars.hit_kind, hit_kind, 0x1);
|
|
|
|
- load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_ANY_HIT_IDX);
|
|
+ nir_def *record = load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_ANY_HIT_IDX);
|
|
+ nir_store_var(b, inner_vars.shader_record_ptr, record, 0x1);
|
|
|
|
struct radv_rt_case_data case_data = {
|
|
.device = data->device,
|
|
@@ -1805,7 +1806,8 @@ handle_candidate_aabb(nir_builder *b, struct radv_leaf_intersection *intersectio
|
|
nir_store_var(b, inner_vars.instance_addr, nir_load_var(b, data->trav_vars->instance_addr), 0x1);
|
|
nir_store_var(b, inner_vars.opaque, intersection->opaque, 1);
|
|
|
|
- load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_INTERSECTION_IDX);
|
|
+ nir_def *record = load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_INTERSECTION_IDX);
|
|
+ nir_store_var(b, inner_vars.shader_record_ptr, record, 0x1);
|
|
|
|
nir_store_var(b, data->vars->ahit_accept, nir_imm_false(b), 0x1);
|
|
nir_store_var(b, data->vars->ahit_terminate, nir_imm_false(b), 0x1);
|
|
@@ -1979,7 +1981,8 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
|
|
nir_push_if(b, nir_load_var(b, trav_vars.hit));
|
|
{
|
|
if (monolithic) {
|
|
- load_sbt_entry(b, vars, nir_load_var(b, vars->idx), SBT_HIT, SBT_CLOSEST_HIT_IDX);
|
|
+ nir_def *record = load_sbt_entry(b, vars, nir_load_var(b, vars->idx), SBT_HIT, SBT_CLOSEST_HIT_IDX);
|
|
+ nir_store_var(b, vars->shader_record_ptr, record, 0x1);
|
|
|
|
nir_def *should_return =
|
|
nir_test_mask(b, nir_load_var(b, vars->cull_mask_and_flags), SpvRayFlagsSkipClosestHitShaderKHRMask);
|
|
@@ -2011,7 +2014,8 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin
|
|
nir_push_else(b, NULL);
|
|
{
|
|
if (monolithic) {
|
|
- load_sbt_entry(b, vars, nir_load_var(b, vars->miss_index), SBT_MISS, SBT_GENERAL_IDX);
|
|
+ nir_def *record = load_sbt_entry(b, vars, nir_load_var(b, vars->miss_index), SBT_MISS, SBT_GENERAL_IDX);
|
|
+ nir_store_var(b, vars->shader_record_ptr, record, 0x1);
|
|
|
|
struct radv_rt_case_data case_data = {
|
|
.device = device,
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 07a8a0b29f5e3d5b969ec8164af8fdefd8ffc28a Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Thu, 3 Oct 2024 15:59:01 +0200
|
|
Subject: [PATCH 27/71] radv/rt: Load SBT descriptor from NIR variables
|
|
|
|
---
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 5 +++++
|
|
1 file changed, 5 insertions(+)
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index eeec13b0f539c..2f13831d9d473 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -965,6 +965,10 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
|
|
nir_pop_if(b, NULL);
|
|
break;
|
|
}
|
|
+ case nir_intrinsic_load_sbt_base_amd: {
|
|
+ ret = nir_load_var(b, vars->sbt_descriptors);
|
|
+ break;
|
|
+ }
|
|
case nir_intrinsic_load_sbt_offset_amd: {
|
|
ret = nir_load_var(b, vars->sbt_offset);
|
|
break;
|
|
@@ -2077,6 +2081,7 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_
|
|
nir_store_var(&b, vars.cull_mask_and_flags, nir_load_cull_mask_and_flags_amd(&b), 0x1);
|
|
nir_store_var(&b, vars.origin, nir_load_ray_world_origin(&b), 0x7);
|
|
nir_store_var(&b, vars.direction, nir_load_ray_world_direction(&b), 0x7);
|
|
+ nir_store_var(&b, vars.sbt_descriptors, nir_load_sbt_base_amd(&b), 0x1);
|
|
nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1);
|
|
|
|
radv_build_traversal(device, pipeline, pCreateInfo, false, &b, &vars, false, info);
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 565c4764726d6a68e785c019f49914b00b8930ed Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 12:21:29 +0200
|
|
Subject: [PATCH 28/71] radv/rt: Use function calls for shader calls
|
|
|
|
Don't call nir_lower_shader_calls anymore, but emit nir_call
|
|
instructions for trace_ray and friends. Also, switch from shader args
|
|
to parameters for most things, and change lowerings accordingly.
|
|
---
|
|
src/amd/common/ac_shader_args.h | 16 -
|
|
src/amd/vulkan/nir/radv_nir_rt_shader.c | 487 +++++++++++++-----------
|
|
src/amd/vulkan/radv_pipeline_rt.c | 62 +--
|
|
src/amd/vulkan/radv_shader.h | 7 +-
|
|
src/amd/vulkan/radv_shader_args.c | 20 +-
|
|
5 files changed, 290 insertions(+), 302 deletions(-)
|
|
|
|
diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h
|
|
index 62ac708c3d185..030a271e22ff3 100644
|
|
--- a/src/amd/common/ac_shader_args.h
|
|
+++ b/src/amd/common/ac_shader_args.h
|
|
@@ -179,29 +179,13 @@ struct ac_shader_args {
|
|
|
|
/* RT */
|
|
struct {
|
|
- struct ac_arg uniform_shader_addr;
|
|
struct ac_arg sbt_descriptors;
|
|
struct ac_arg launch_sizes[3];
|
|
struct ac_arg launch_size_addr;
|
|
struct ac_arg launch_ids[3];
|
|
struct ac_arg dynamic_callable_stack_base;
|
|
struct ac_arg traversal_shader_addr;
|
|
- struct ac_arg shader_addr;
|
|
- struct ac_arg shader_record;
|
|
struct ac_arg payload_offset;
|
|
- struct ac_arg ray_origin;
|
|
- struct ac_arg ray_tmin;
|
|
- struct ac_arg ray_direction;
|
|
- struct ac_arg ray_tmax;
|
|
- struct ac_arg cull_mask_and_flags;
|
|
- struct ac_arg sbt_offset;
|
|
- struct ac_arg sbt_stride;
|
|
- struct ac_arg miss_index;
|
|
- struct ac_arg accel_struct;
|
|
- struct ac_arg primitive_id;
|
|
- struct ac_arg instance_addr;
|
|
- struct ac_arg geometry_id_and_flags;
|
|
- struct ac_arg hit_kind;
|
|
} rt;
|
|
};
|
|
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
index 2f13831d9d473..7968cb36f5d87 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
|
|
@@ -688,7 +688,7 @@ static nir_def *
|
|
load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_def *idx, enum sbt_type binding,
|
|
enum sbt_entry offset)
|
|
{
|
|
- nir_def *desc_base_addr = nir_load_sbt_base_amd(b);
|
|
+ nir_def *desc_base_addr = nir_load_var(b, vars->sbt_descriptors);
|
|
|
|
nir_def *desc = nir_pack_64_2x32(b, nir_load_smem_amd(b, 2, desc_base_addr, nir_imm_int(b, binding)));
|
|
|
|
@@ -742,74 +742,58 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
|
|
|
|
nir_def *ret = NULL;
|
|
switch (intr->intrinsic) {
|
|
- case nir_intrinsic_rt_execute_callable: {
|
|
- uint32_t size = align(nir_intrinsic_stack_size(intr), 16);
|
|
- nir_def *ret_ptr = nir_load_resume_shader_address_amd(b, nir_intrinsic_call_idx(intr));
|
|
- ret_ptr = nir_ior_imm(b, ret_ptr, radv_get_rt_priority(b->shader->info.stage));
|
|
-
|
|
- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), size), 1);
|
|
- nir_store_scratch(b, ret_ptr, nir_load_var(b, vars->stack_ptr), .align_mul = 16);
|
|
-
|
|
- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), 16), 1);
|
|
- load_sbt_entry(b, vars, intr->src[0].ssa, SBT_CALLABLE, SBT_RECURSIVE_PTR);
|
|
-
|
|
- nir_store_var(b, vars->arg, nir_iadd_imm(b, intr->src[1].ssa, -size - 16), 1);
|
|
-
|
|
- vars->stack_size = MAX2(vars->stack_size, size + 16);
|
|
+ case nir_intrinsic_execute_callable: {
|
|
+ nir_def *record = load_sbt_entry(b, vars, intr->src[0].ssa, SBT_CALLABLE, SBT_RECURSIVE_PTR);
|
|
+
|
|
+ unsigned param_count = RAYGEN_ARG_COUNT + DIV_ROUND_UP(vars->payload_size, 4);
|
|
+ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count);
|
|
+ args[RAYGEN_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2]));
|
|
+ args[RAYGEN_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2]));
|
|
+ args[RAYGEN_ARG_DESCRIPTORS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN || b->shader->info.stage == MESA_SHADER_CALLABLE ? RAYGEN_ARG_DESCRIPTORS : CHIT_MISS_ARG_DESCRIPTORS);
|
|
+ args[RAYGEN_ARG_PUSH_CONSTANTS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN || b->shader->info.stage == MESA_SHADER_CALLABLE ? RAYGEN_ARG_PUSH_CONSTANTS : CHIT_MISS_ARG_PUSH_CONSTANTS);
|
|
+ args[RAYGEN_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors);
|
|
+ args[RAYGEN_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr);
|
|
+ args[RAYGEN_ARG_SHADER_RECORD_PTR] = record;
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) {
|
|
+ args[RAYGEN_ARG_COUNT + i] = nir_instr_def(&nir_build_deref_var(b, vars->out_payload_storage[i])->instr);
|
|
+ }
|
|
+ nir_build_indirect_call(b, vars->callable_func, nir_load_var(b, vars->shader_addr), param_count, args);
|
|
break;
|
|
}
|
|
- case nir_intrinsic_rt_trace_ray: {
|
|
- uint32_t size = align(nir_intrinsic_stack_size(intr), 16);
|
|
- nir_def *ret_ptr = nir_load_resume_shader_address_amd(b, nir_intrinsic_call_idx(intr));
|
|
- ret_ptr = nir_ior_imm(b, ret_ptr, radv_get_rt_priority(b->shader->info.stage));
|
|
-
|
|
- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), size), 1);
|
|
- nir_store_scratch(b, ret_ptr, nir_load_var(b, vars->stack_ptr), .align_mul = 16);
|
|
-
|
|
- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), 16), 1);
|
|
-
|
|
- nir_store_var(b, vars->shader_addr, nir_load_var(b, vars->traversal_addr), 1);
|
|
- nir_store_var(b, vars->arg, nir_iadd_imm(b, intr->src[10].ssa, -size - 16), 1);
|
|
-
|
|
- vars->stack_size = MAX2(vars->stack_size, size + 16);
|
|
-
|
|
+ case nir_intrinsic_trace_ray: {
|
|
+ nir_def *undef = nir_undef(b, 1, 32);
|
|
/* Per the SPIR-V extension spec we have to ignore some bits for some arguments. */
|
|
- nir_store_var(b, vars->accel_struct, intr->src[0].ssa, 0x1);
|
|
- nir_store_var(b, vars->cull_mask_and_flags, nir_ior(b, nir_ishl_imm(b, intr->src[2].ssa, 24), intr->src[1].ssa),
|
|
- 0x1);
|
|
- nir_store_var(b, vars->sbt_offset, nir_iand_imm(b, intr->src[3].ssa, 0xf), 0x1);
|
|
- nir_store_var(b, vars->sbt_stride, nir_iand_imm(b, intr->src[4].ssa, 0xf), 0x1);
|
|
- nir_store_var(b, vars->miss_index, nir_iand_imm(b, intr->src[5].ssa, 0xffff), 0x1);
|
|
- nir_store_var(b, vars->origin, intr->src[6].ssa, 0x7);
|
|
- nir_store_var(b, vars->tmin, intr->src[7].ssa, 0x1);
|
|
- nir_store_var(b, vars->direction, intr->src[8].ssa, 0x7);
|
|
- nir_store_var(b, vars->tmax, intr->src[9].ssa, 0x1);
|
|
- break;
|
|
- }
|
|
- case nir_intrinsic_rt_resume: {
|
|
- uint32_t size = align(nir_intrinsic_stack_size(intr), 16);
|
|
-
|
|
- nir_store_var(b, vars->stack_ptr, nir_iadd_imm(b, nir_load_var(b, vars->stack_ptr), -size), 1);
|
|
- break;
|
|
- }
|
|
- case nir_intrinsic_rt_return_amd: {
|
|
- if (b->shader->info.stage == MESA_SHADER_RAYGEN) {
|
|
- nir_terminate(b);
|
|
- break;
|
|
+ nir_def *cull_mask_and_flags = nir_ior(b, nir_ishl_imm(b, intr->src[2].ssa, 24), intr->src[1].ssa);
|
|
+
|
|
+ unsigned param_count = TRAVERSAL_ARG_PAYLOAD_BASE + DIV_ROUND_UP(vars->payload_size, 4);
|
|
+ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count);
|
|
+ args[TRAVERSAL_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2]));
|
|
+ args[TRAVERSAL_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2]));
|
|
+ args[TRAVERSAL_ARG_DESCRIPTORS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN ? RAYGEN_ARG_DESCRIPTORS : CHIT_MISS_ARG_DESCRIPTORS);
|
|
+ args[TRAVERSAL_ARG_PUSH_CONSTANTS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN ? RAYGEN_ARG_PUSH_CONSTANTS : CHIT_MISS_ARG_PUSH_CONSTANTS);
|
|
+ args[TRAVERSAL_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors);
|
|
+ args[TRAVERSAL_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr);
|
|
+ args[TRAVERSAL_ARG_SHADER_RECORD_PTR] = nir_load_var(b, vars->shader_record_ptr);
|
|
+ args[TRAVERSAL_ARG_ACCEL_STRUCT] = intr->src[0].ssa;
|
|
+ args[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS] = cull_mask_and_flags;
|
|
+ args[TRAVERSAL_ARG_SBT_OFFSET] = nir_iand_imm(b, intr->src[3].ssa, 0xf);
|
|
+ args[TRAVERSAL_ARG_SBT_STRIDE] = nir_iand_imm(b, intr->src[4].ssa, 0xf);
|
|
+ args[TRAVERSAL_ARG_MISS_INDEX] = nir_iand_imm(b, intr->src[5].ssa, 0xffff);
|
|
+ args[TRAVERSAL_ARG_RAY_ORIGIN] = intr->src[6].ssa;
|
|
+ args[TRAVERSAL_ARG_RAY_TMIN] = intr->src[7].ssa;
|
|
+ args[TRAVERSAL_ARG_RAY_DIRECTION] = intr->src[8].ssa;
|
|
+ args[TRAVERSAL_ARG_RAY_TMAX] = intr->src[9].ssa;
|
|
+ args[TRAVERSAL_ARG_PRIMITIVE_ID] = undef;
|
|
+ args[TRAVERSAL_ARG_INSTANCE_ADDR] = nir_undef(b, 1, 64);
|
|
+ args[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS] = undef;
|
|
+ args[TRAVERSAL_ARG_HIT_KIND] = undef;
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) {
|
|
+ args[TRAVERSAL_ARG_PAYLOAD_BASE + i] =
|
|
+ nir_instr_def(&nir_build_deref_var(b, vars->out_payload_storage[i])->instr);
|
|
}
|
|
- insert_rt_return(b, vars);
|
|
+ nir_build_indirect_call(b, vars->trace_ray_func, nir_load_var(b, vars->traversal_addr), param_count, args);
|
|
break;
|
|
}
|
|
- case nir_intrinsic_load_scratch: {
|
|
- if (data->late_lowering)
|
|
- nir_src_rewrite(&intr->src[0], nir_iadd_nuw(b, nir_load_var(b, vars->stack_ptr), intr->src[0].ssa));
|
|
- return true;
|
|
- }
|
|
- case nir_intrinsic_store_scratch: {
|
|
- if (data->late_lowering)
|
|
- nir_src_rewrite(&intr->src[1], nir_iadd_nuw(b, nir_load_var(b, vars->stack_ptr), intr->src[1].ssa));
|
|
- return true;
|
|
- }
|
|
case nir_intrinsic_load_shader_record_ptr: {
|
|
ret = nir_load_var(b, vars->shader_record_ptr);
|
|
break;
|
|
@@ -986,11 +970,6 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
|
|
break;
|
|
}
|
|
case nir_intrinsic_execute_closest_hit_amd: {
|
|
- nir_store_var(b, vars->tmax, intr->src[1].ssa, 0x1);
|
|
- nir_store_var(b, vars->primitive_id, intr->src[2].ssa, 0x1);
|
|
- nir_store_var(b, vars->instance_addr, intr->src[3].ssa, 0x1);
|
|
- nir_store_var(b, vars->geometry_id_and_flags, intr->src[4].ssa, 0x1);
|
|
- nir_store_var(b, vars->hit_kind, intr->src[5].ssa, 0x1);
|
|
nir_def *record = load_sbt_entry(b, vars, intr->src[0].ssa, SBT_HIT, SBT_RECURSIVE_PTR);
|
|
|
|
nir_def *should_return =
|
|
@@ -1002,28 +981,82 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
|
|
|
|
/* should_return is set if we had a hit but we won't be calling the closest hit
|
|
* shader and hence need to return immediately to the calling shader. */
|
|
- nir_push_if(b, should_return);
|
|
- insert_rt_return(b, vars);
|
|
+ nir_push_if(b, nir_inot(b, should_return));
|
|
+ unsigned param_count = CHIT_MISS_ARG_PAYLOAD_BASE + DIV_ROUND_UP(vars->payload_size, 4);
|
|
+ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count);
|
|
+ args[CHIT_MISS_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2]));
|
|
+ args[CHIT_MISS_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2]));
|
|
+ args[CHIT_MISS_ARG_DESCRIPTORS] = nir_load_param(b, TRAVERSAL_ARG_DESCRIPTORS);
|
|
+ args[CHIT_MISS_ARG_PUSH_CONSTANTS] = nir_load_param(b, TRAVERSAL_ARG_PUSH_CONSTANTS);
|
|
+ args[CHIT_MISS_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors);
|
|
+ args[CHIT_MISS_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr);
|
|
+ args[CHIT_MISS_ARG_SHADER_RECORD_PTR] = record;
|
|
+ args[CHIT_MISS_ARG_ACCEL_STRUCT] = nir_load_var(b, vars->accel_struct);
|
|
+ args[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS] = nir_load_var(b, vars->cull_mask_and_flags);
|
|
+ args[CHIT_MISS_ARG_SBT_OFFSET] = nir_load_var(b, vars->sbt_offset);
|
|
+ args[CHIT_MISS_ARG_SBT_STRIDE] = nir_load_var(b, vars->sbt_stride);
|
|
+ args[CHIT_MISS_ARG_MISS_INDEX] = nir_load_var(b, vars->miss_index);
|
|
+ args[CHIT_MISS_ARG_RAY_ORIGIN] = nir_load_var(b, vars->origin);
|
|
+ args[CHIT_MISS_ARG_RAY_TMIN] = nir_load_var(b, vars->tmin);
|
|
+ args[CHIT_MISS_ARG_RAY_DIRECTION] = nir_load_var(b, vars->direction);
|
|
+ args[CHIT_MISS_ARG_RAY_TMAX] = intr->src[1].ssa;
|
|
+ args[CHIT_MISS_ARG_PRIMITIVE_ID] = intr->src[2].ssa;
|
|
+ args[CHIT_MISS_ARG_INSTANCE_ADDR] = intr->src[3].ssa;
|
|
+ args[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS] = intr->src[4].ssa;
|
|
+ args[CHIT_MISS_ARG_HIT_KIND] = intr->src[5].ssa;
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) {
|
|
+ args[CHIT_MISS_ARG_PAYLOAD_BASE + i] =
|
|
+ nir_instr_def(&nir_build_deref_cast(b, nir_load_param(b, TRAVERSAL_ARG_PAYLOAD_BASE + i),
|
|
+ nir_var_shader_call_data, glsl_uint_type(), 4)
|
|
+ ->instr);
|
|
+ }
|
|
+ nir_build_indirect_call(b, vars->chit_miss_func, nir_load_var(b, vars->shader_addr), param_count, args);
|
|
nir_pop_if(b, NULL);
|
|
break;
|
|
}
|
|
case nir_intrinsic_execute_miss_amd: {
|
|
- nir_store_var(b, vars->tmax, intr->src[0].ssa, 0x1);
|
|
nir_def *undef = nir_undef(b, 1, 32);
|
|
- nir_store_var(b, vars->primitive_id, undef, 0x1);
|
|
- nir_store_var(b, vars->instance_addr, nir_undef(b, 1, 64), 0x1);
|
|
- nir_store_var(b, vars->geometry_id_and_flags, undef, 0x1);
|
|
- nir_store_var(b, vars->hit_kind, undef, 0x1);
|
|
nir_def *miss_index = nir_load_var(b, vars->miss_index);
|
|
nir_def *record = load_sbt_entry(b, vars, miss_index, SBT_MISS, SBT_RECURSIVE_PTR);
|
|
|
|
if (!(vars->flags & VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) {
|
|
/* In case of a NULL miss shader, do nothing and just return. */
|
|
- nir_push_if(b, nir_ieq_imm(b, nir_load_var(b, vars->shader_addr), 0));
|
|
- insert_rt_return(b, vars);
|
|
- nir_pop_if(b, NULL);
|
|
+ nir_push_if(b, nir_ine_imm(b, nir_load_var(b, vars->shader_addr), 0));
|
|
}
|
|
|
|
+ unsigned param_count = CHIT_MISS_ARG_PAYLOAD_BASE + DIV_ROUND_UP(vars->payload_size, 4);
|
|
+ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count);
|
|
+ args[CHIT_MISS_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2]));
|
|
+ args[CHIT_MISS_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2]));
|
|
+ args[CHIT_MISS_ARG_DESCRIPTORS] = nir_load_param(b, TRAVERSAL_ARG_DESCRIPTORS);
|
|
+ args[CHIT_MISS_ARG_PUSH_CONSTANTS] = nir_load_param(b, TRAVERSAL_ARG_PUSH_CONSTANTS);
|
|
+ args[CHIT_MISS_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors);
|
|
+ args[CHIT_MISS_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr);
|
|
+ args[CHIT_MISS_ARG_SHADER_RECORD_PTR] = record;
|
|
+ args[CHIT_MISS_ARG_ACCEL_STRUCT] = nir_load_var(b, vars->accel_struct);
|
|
+ args[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS] = nir_load_var(b, vars->cull_mask_and_flags);
|
|
+ args[CHIT_MISS_ARG_SBT_OFFSET] = nir_load_var(b, vars->sbt_offset);
|
|
+ args[CHIT_MISS_ARG_SBT_STRIDE] = nir_load_var(b, vars->sbt_stride);
|
|
+ args[CHIT_MISS_ARG_MISS_INDEX] = nir_load_var(b, vars->miss_index);
|
|
+ args[CHIT_MISS_ARG_RAY_ORIGIN] = nir_load_var(b, vars->origin);
|
|
+ args[CHIT_MISS_ARG_RAY_TMIN] = nir_load_var(b, vars->tmin);
|
|
+ args[CHIT_MISS_ARG_RAY_DIRECTION] = nir_load_var(b, vars->direction);
|
|
+ args[CHIT_MISS_ARG_RAY_TMAX] = intr->src[0].ssa;
|
|
+ args[CHIT_MISS_ARG_PRIMITIVE_ID] = undef;
|
|
+ args[CHIT_MISS_ARG_INSTANCE_ADDR] = nir_undef(b, 1, 64);
|
|
+ args[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS] = undef;
|
|
+ args[CHIT_MISS_ARG_HIT_KIND] = undef;
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) {
|
|
+ args[CHIT_MISS_ARG_PAYLOAD_BASE + i] =
|
|
+ nir_instr_def(&nir_build_deref_cast(b, nir_load_param(b, TRAVERSAL_ARG_PAYLOAD_BASE + i),
|
|
+ nir_var_shader_call_data, glsl_uint_type(), 4)
|
|
+ ->instr);
|
|
+ }
|
|
+ nir_build_indirect_call(b, vars->chit_miss_func, nir_load_var(b, vars->shader_addr), param_count, args);
|
|
+
|
|
+ if (!(vars->flags & VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR))
|
|
+ nir_pop_if(b, NULL);
|
|
+
|
|
break;
|
|
}
|
|
case nir_intrinsic_load_ray_triangle_vertex_positions: {
|
|
@@ -1032,6 +1065,14 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data)
|
|
ret = radv_load_vertex_position(vars->device, b, instance_node_addr, primitive_id, nir_intrinsic_column(intr));
|
|
break;
|
|
}
|
|
+ case nir_intrinsic_rt_trace_ray:
|
|
+ unreachable("nir_intrinsic_rt_trace_ray");
|
|
+ case nir_intrinsic_rt_execute_callable:
|
|
+ unreachable("nir_intrinsic_rt_execute_callable");
|
|
+ case nir_intrinsic_rt_resume:
|
|
+ unreachable("nir_intrinsic_rt_resume");
|
|
+ case nir_intrinsic_rt_return_amd:
|
|
+ unreachable("nir_intrinsic_rt_return_amd");
|
|
default:
|
|
return false;
|
|
}
|
|
@@ -1195,7 +1236,7 @@ insert_rt_case(nir_builder *b, nir_shader *shader, struct rt_variables *vars, ni
|
|
}
|
|
|
|
void
|
|
-radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset, uint32_t *payload_size)
|
|
+radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset)
|
|
{
|
|
if (!monolithic) {
|
|
NIR_PASS(_, nir, lower_rt_derefs);
|
|
@@ -1629,7 +1670,7 @@ radv_build_ahit_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g
|
|
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir);
|
|
assert(nir_stage);
|
|
|
|
- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL);
|
|
+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset);
|
|
b->shader->scratch_size = MAX2(nir_stage->scratch_size, b->shader->scratch_size);
|
|
|
|
insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.any_hit_index);
|
|
@@ -1654,7 +1695,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g
|
|
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->intersection_shader].nir);
|
|
assert(nir_stage);
|
|
|
|
- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL);
|
|
+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset);
|
|
|
|
nir_shader *any_hit_stage = NULL;
|
|
if (group->any_hit_shader != VK_SHADER_UNUSED_KHR) {
|
|
@@ -1662,7 +1703,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g
|
|
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir);
|
|
assert(any_hit_stage);
|
|
|
|
- radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset, NULL);
|
|
+ radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset);
|
|
|
|
/* reserve stack size for any_hit before it is inlined */
|
|
data->pipeline->stages[group->any_hit_shader].stack_size = any_hit_stage->scratch_size;
|
|
@@ -1708,7 +1749,7 @@ radv_build_recursive_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_trac
|
|
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->recursive_shader].nir);
|
|
assert(nir_stage);
|
|
|
|
- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL);
|
|
+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset);
|
|
|
|
insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.general_index);
|
|
ralloc_free(nir_stage);
|
|
@@ -2175,10 +2216,23 @@ radv_count_hit_attrib_slots(nir_builder *b, nir_intrinsic_instr *instr, void *da
|
|
return false;
|
|
}
|
|
|
|
+static bool
|
|
+radv_count_ray_payload_size(nir_builder *b, nir_intrinsic_instr *instr, void *data)
|
|
+{
|
|
+ uint32_t *count = data;
|
|
+ if (instr->intrinsic == nir_intrinsic_load_incoming_ray_payload_amd ||
|
|
+ instr->intrinsic == nir_intrinsic_load_outgoing_ray_payload_amd ||
|
|
+ instr->intrinsic == nir_intrinsic_store_incoming_ray_payload_amd ||
|
|
+ instr->intrinsic == nir_intrinsic_store_outgoing_ray_payload_amd)
|
|
+ *count = MAX2(*count, (nir_intrinsic_base(instr) + 1) * 4);
|
|
+
|
|
+ return false;
|
|
+}
|
|
+
|
|
static void
|
|
lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device,
|
|
struct radv_ray_tracing_pipeline *pipeline, const struct radv_shader_info *info,
|
|
- const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, uint32_t payload_size,
|
|
+ const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, uint32_t *payload_size,
|
|
struct rt_variables *vars)
|
|
{
|
|
nir_function_impl *impl = radv_get_rt_shader_entrypoint(shader);
|
|
@@ -2195,6 +2249,7 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device,
|
|
|
|
uint32_t hit_attrib_count = 0;
|
|
nir_shader_intrinsics_pass(shader, radv_count_hit_attrib_slots, nir_metadata_all, &hit_attrib_count);
|
|
+ nir_shader_intrinsics_pass(shader, radv_count_ray_payload_size, nir_metadata_all, payload_size);
|
|
|
|
/* Register storage for hit attributes */
|
|
STACK_ARRAY(nir_variable *, hit_attribs, hit_attrib_count);
|
|
@@ -2203,10 +2258,10 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device,
|
|
|
|
nir_builder b = nir_builder_create(impl);
|
|
b.cursor = nir_before_impl(impl);
|
|
- nir_variable **payload_vars = rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4));
|
|
+ nir_variable **payload_vars = rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(*payload_size, 4));
|
|
nir_deref_instr **payload_storage =
|
|
- rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4));
|
|
- for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) {
|
|
+ rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(*payload_size, 4));
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(*payload_size, 4); ++i) {
|
|
payload_vars[i] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "_payload");
|
|
payload_storage[i] = nir_build_deref_var(&b, payload_vars[i]);
|
|
}
|
|
@@ -2215,26 +2270,28 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device,
|
|
}
|
|
|
|
static void
|
|
-radv_store_arg(nir_builder *b, const struct radv_shader_args *args, const struct radv_ray_tracing_stage_info *info,
|
|
- struct ac_arg arg, nir_def *value)
|
|
+store_param_var(nir_builder *b, nir_variable *var, unsigned param_index, unsigned num_components, unsigned bit_size)
|
|
{
|
|
- /* Do not pass unused data to the next stage. */
|
|
- if (!info || !BITSET_TEST(info->unused_args, arg.arg_index))
|
|
- ac_nir_store_arg(b, &args->ac, arg, value);
|
|
+ if (param_index != -1u)
|
|
+ nir_store_var(b, var, nir_load_param(b, param_index), (1 << num_components) - 1);
|
|
+ else
|
|
+ nir_store_var(b, var, nir_undef(b, num_components, bit_size), (1 << num_components) - 1);
|
|
}
|
|
|
|
void
|
|
radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
|
|
- const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *stack_size,
|
|
- bool resume_shader, uint32_t payload_size, struct radv_device *device,
|
|
- struct radv_ray_tracing_pipeline *pipeline, bool monolithic,
|
|
- const struct radv_ray_tracing_stage_info *traversal_info)
|
|
+ const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *payload_size,
|
|
+ uint32_t *stack_size, struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline,
|
|
+ bool monolithic)
|
|
{
|
|
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
|
|
+ nir_function *entrypoint_function = impl->function;
|
|
+
|
|
+ radv_nir_init_function_params(entrypoint_function, shader->info.stage, *payload_size);
|
|
|
|
const VkPipelineCreateFlagBits2KHR create_flags = vk_rt_pipeline_create_flags(pCreateInfo);
|
|
|
|
- struct rt_variables vars = create_rt_variables(shader, device, create_flags, payload_size, monolithic);
|
|
+ struct rt_variables vars = create_rt_variables(shader, device, create_flags, *payload_size, monolithic);
|
|
|
|
if (monolithic)
|
|
lower_rt_instructions_monolithic(shader, device, pipeline, info, pCreateInfo, payload_size, &vars);
|
|
@@ -2247,152 +2304,158 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH
|
|
vars.stack_size = MAX2(vars.stack_size, shader->scratch_size);
|
|
*stack_size = MAX2(*stack_size, vars.stack_size);
|
|
}
|
|
- shader->scratch_size = 0;
|
|
|
|
NIR_PASS(_, shader, nir_lower_returns);
|
|
|
|
- nir_cf_list list;
|
|
- nir_cf_extract(&list, nir_before_impl(impl), nir_after_impl(impl));
|
|
+ unsigned shader_record_ptr_arg = -1u;
|
|
+ unsigned launch_id_arg = -1u;
|
|
+ unsigned launch_size_arg = -1u;
|
|
+ unsigned sbt_descriptors_arg = -1u;
|
|
+ unsigned traversal_addr_arg = -1u;
|
|
+ unsigned accel_struct_arg = -1u;
|
|
+ unsigned cull_mask_and_flags_arg = -1u;
|
|
+ unsigned sbt_offset_arg = -1u;
|
|
+ unsigned sbt_stride_arg = -1u;
|
|
+ unsigned miss_index_arg = -1u;
|
|
+ unsigned ray_origin_arg = -1u;
|
|
+ unsigned ray_tmin_arg = -1u;
|
|
+ unsigned ray_direction_arg = -1u;
|
|
+ unsigned ray_tmax_arg = -1u;
|
|
+ unsigned primitive_id_arg = -1u;
|
|
+ unsigned instance_addr_arg = -1u;
|
|
+ unsigned geometry_id_and_flags_arg = -1u;
|
|
+ unsigned hit_kind_arg = -1u;
|
|
+ unsigned in_payload_base_arg = -1u;
|
|
+
|
|
+ switch (shader->info.stage) {
|
|
+ case MESA_SHADER_CALLABLE:
|
|
+ in_payload_base_arg = RAYGEN_ARG_COUNT;
|
|
+ shader_record_ptr_arg = RAYGEN_ARG_SHADER_RECORD_PTR;
|
|
+ launch_id_arg = RAYGEN_ARG_LAUNCH_ID;
|
|
+ launch_size_arg = RAYGEN_ARG_LAUNCH_SIZE;
|
|
+ sbt_descriptors_arg = RAYGEN_ARG_SBT_DESCRIPTORS;
|
|
+ traversal_addr_arg = RAYGEN_ARG_TRAVERSAL_ADDR;
|
|
+ break;
|
|
+ case MESA_SHADER_RAYGEN:
|
|
+ shader_record_ptr_arg = RAYGEN_ARG_SHADER_RECORD_PTR;
|
|
+ launch_id_arg = RAYGEN_ARG_LAUNCH_ID;
|
|
+ launch_size_arg = RAYGEN_ARG_LAUNCH_SIZE;
|
|
+ sbt_descriptors_arg = RAYGEN_ARG_SBT_DESCRIPTORS;
|
|
+ traversal_addr_arg = RAYGEN_ARG_TRAVERSAL_ADDR;
|
|
+ break;
|
|
+ case MESA_SHADER_INTERSECTION:
|
|
+ launch_id_arg = TRAVERSAL_ARG_LAUNCH_ID;
|
|
+ launch_size_arg = TRAVERSAL_ARG_LAUNCH_SIZE;
|
|
+ sbt_descriptors_arg = TRAVERSAL_ARG_SBT_DESCRIPTORS;
|
|
+ traversal_addr_arg = TRAVERSAL_ARG_TRAVERSAL_ADDR;
|
|
+ shader_record_ptr_arg = TRAVERSAL_ARG_SHADER_RECORD_PTR;
|
|
+ accel_struct_arg = TRAVERSAL_ARG_ACCEL_STRUCT;
|
|
+ cull_mask_and_flags_arg = TRAVERSAL_ARG_CULL_MASK_AND_FLAGS;
|
|
+ sbt_offset_arg = TRAVERSAL_ARG_SBT_OFFSET;
|
|
+ sbt_stride_arg = TRAVERSAL_ARG_SBT_STRIDE;
|
|
+ miss_index_arg = TRAVERSAL_ARG_MISS_INDEX;
|
|
+ ray_origin_arg = TRAVERSAL_ARG_RAY_ORIGIN;
|
|
+ ray_tmin_arg = TRAVERSAL_ARG_RAY_TMIN;
|
|
+ ray_direction_arg = TRAVERSAL_ARG_RAY_DIRECTION;
|
|
+ ray_tmax_arg = TRAVERSAL_ARG_RAY_TMAX;
|
|
+ in_payload_base_arg = TRAVERSAL_ARG_PAYLOAD_BASE;
|
|
+ break;
|
|
+ case MESA_SHADER_CLOSEST_HIT:
|
|
+ case MESA_SHADER_MISS:
|
|
+ launch_id_arg = CHIT_MISS_ARG_LAUNCH_ID;
|
|
+ launch_size_arg = CHIT_MISS_ARG_LAUNCH_SIZE;
|
|
+ sbt_descriptors_arg = CHIT_MISS_ARG_SBT_DESCRIPTORS;
|
|
+ traversal_addr_arg = CHIT_MISS_ARG_TRAVERSAL_ADDR;
|
|
+ shader_record_ptr_arg = CHIT_MISS_ARG_SHADER_RECORD_PTR;
|
|
+ accel_struct_arg = CHIT_MISS_ARG_ACCEL_STRUCT;
|
|
+ cull_mask_and_flags_arg = CHIT_MISS_ARG_CULL_MASK_AND_FLAGS;
|
|
+ sbt_offset_arg = CHIT_MISS_ARG_SBT_OFFSET;
|
|
+ sbt_stride_arg = CHIT_MISS_ARG_SBT_STRIDE;
|
|
+ miss_index_arg = CHIT_MISS_ARG_MISS_INDEX;
|
|
+ ray_origin_arg = CHIT_MISS_ARG_RAY_ORIGIN;
|
|
+ ray_tmin_arg = CHIT_MISS_ARG_RAY_TMIN;
|
|
+ ray_direction_arg = CHIT_MISS_ARG_RAY_DIRECTION;
|
|
+ ray_tmax_arg = CHIT_MISS_ARG_RAY_TMAX;
|
|
+ primitive_id_arg = CHIT_MISS_ARG_PRIMITIVE_ID;
|
|
+ instance_addr_arg = CHIT_MISS_ARG_INSTANCE_ADDR;
|
|
+ geometry_id_and_flags_arg = CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS;
|
|
+ hit_kind_arg = CHIT_MISS_ARG_HIT_KIND;
|
|
+ in_payload_base_arg = CHIT_MISS_ARG_PAYLOAD_BASE;
|
|
+ break;
|
|
+ default:
|
|
+ break;
|
|
+ }
|
|
|
|
/* initialize variables */
|
|
nir_builder b = nir_builder_at(nir_before_impl(impl));
|
|
|
|
- nir_def *descriptor_sets = ac_nir_load_arg(&b, &args->ac, args->descriptor_sets[0]);
|
|
- nir_def *push_constants = ac_nir_load_arg(&b, &args->ac, args->ac.push_constants);
|
|
- nir_def *sbt_descriptors = ac_nir_load_arg(&b, &args->ac, args->ac.rt.sbt_descriptors);
|
|
-
|
|
nir_def *launch_sizes[3];
|
|
+ nir_def *launch_size_vec = nir_load_param(&b, launch_size_arg);
|
|
for (uint32_t i = 0; i < ARRAY_SIZE(launch_sizes); i++) {
|
|
- launch_sizes[i] = ac_nir_load_arg(&b, &args->ac, args->ac.rt.launch_sizes[i]);
|
|
+ launch_sizes[i] = nir_channel(&b, launch_size_vec, i);
|
|
nir_store_var(&b, vars.launch_sizes[i], launch_sizes[i], 1);
|
|
}
|
|
|
|
- nir_def *scratch_offset = NULL;
|
|
- if (args->ac.scratch_offset.used)
|
|
- scratch_offset = ac_nir_load_arg(&b, &args->ac, args->ac.scratch_offset);
|
|
- nir_def *ring_offsets = NULL;
|
|
- if (args->ac.ring_offsets.used)
|
|
- ring_offsets = ac_nir_load_arg(&b, &args->ac, args->ac.ring_offsets);
|
|
-
|
|
nir_def *launch_ids[3];
|
|
+ nir_def *launch_id_vec = nir_load_param(&b, launch_id_arg);
|
|
for (uint32_t i = 0; i < ARRAY_SIZE(launch_ids); i++) {
|
|
- launch_ids[i] = ac_nir_load_arg(&b, &args->ac, args->ac.rt.launch_ids[i]);
|
|
+ launch_ids[i] = nir_channel(&b, launch_id_vec, i);
|
|
nir_store_var(&b, vars.launch_ids[i], launch_ids[i], 1);
|
|
}
|
|
|
|
- nir_def *traversal_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.traversal_shader_addr);
|
|
- nir_store_var(&b, vars.traversal_addr, nir_pack_64_2x32(&b, traversal_addr), 1);
|
|
+ nir_store_var(&b, vars.traversal_addr, nir_load_param(&b, traversal_addr_arg), 1);
|
|
|
|
- nir_def *shader_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.shader_addr);
|
|
- shader_addr = nir_pack_64_2x32(&b, shader_addr);
|
|
- nir_store_var(&b, vars.shader_addr, shader_addr, 1);
|
|
+ nir_store_var(&b, vars.sbt_descriptors, nir_load_param(&b, sbt_descriptors_arg), 1);
|
|
|
|
- nir_store_var(&b, vars.stack_ptr, ac_nir_load_arg(&b, &args->ac, args->ac.rt.dynamic_callable_stack_base), 1);
|
|
- nir_def *record_ptr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.shader_record);
|
|
- nir_store_var(&b, vars.shader_record_ptr, nir_pack_64_2x32(&b, record_ptr), 1);
|
|
- nir_store_var(&b, vars.arg, ac_nir_load_arg(&b, &args->ac, args->ac.rt.payload_offset), 1);
|
|
-
|
|
- nir_def *accel_struct = ac_nir_load_arg(&b, &args->ac, args->ac.rt.accel_struct);
|
|
- nir_store_var(&b, vars.accel_struct, nir_pack_64_2x32(&b, accel_struct), 1);
|
|
- nir_store_var(&b, vars.cull_mask_and_flags, ac_nir_load_arg(&b, &args->ac, args->ac.rt.cull_mask_and_flags), 1);
|
|
- nir_store_var(&b, vars.sbt_offset, ac_nir_load_arg(&b, &args->ac, args->ac.rt.sbt_offset), 1);
|
|
- nir_store_var(&b, vars.sbt_stride, ac_nir_load_arg(&b, &args->ac, args->ac.rt.sbt_stride), 1);
|
|
- nir_store_var(&b, vars.origin, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_origin), 0x7);
|
|
- nir_store_var(&b, vars.tmin, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_tmin), 1);
|
|
- nir_store_var(&b, vars.direction, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_direction), 0x7);
|
|
- nir_store_var(&b, vars.tmax, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_tmax), 1);
|
|
-
|
|
- if (traversal_info && traversal_info->miss_index.state == RADV_RT_CONST_ARG_STATE_VALID)
|
|
- nir_store_var(&b, vars.miss_index, nir_imm_int(&b, traversal_info->miss_index.value), 0x1);
|
|
- else
|
|
- nir_store_var(&b, vars.miss_index, ac_nir_load_arg(&b, &args->ac, args->ac.rt.miss_index), 0x1);
|
|
-
|
|
- nir_store_var(&b, vars.primitive_id, ac_nir_load_arg(&b, &args->ac, args->ac.rt.primitive_id), 1);
|
|
- nir_def *instance_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.instance_addr);
|
|
- nir_store_var(&b, vars.instance_addr, nir_pack_64_2x32(&b, instance_addr), 1);
|
|
- nir_store_var(&b, vars.geometry_id_and_flags, ac_nir_load_arg(&b, &args->ac, args->ac.rt.geometry_id_and_flags), 1);
|
|
- nir_store_var(&b, vars.hit_kind, ac_nir_load_arg(&b, &args->ac, args->ac.rt.hit_kind), 1);
|
|
-
|
|
- /* guard the shader, so that only the correct invocations execute it */
|
|
- nir_if *shader_guard = NULL;
|
|
- if (shader->info.stage != MESA_SHADER_RAYGEN || resume_shader) {
|
|
- nir_def *uniform_shader_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.uniform_shader_addr);
|
|
- uniform_shader_addr = nir_pack_64_2x32(&b, uniform_shader_addr);
|
|
- uniform_shader_addr = nir_ior_imm(&b, uniform_shader_addr, radv_get_rt_priority(shader->info.stage));
|
|
-
|
|
- shader_guard = nir_push_if(&b, nir_ieq(&b, uniform_shader_addr, shader_addr));
|
|
- shader_guard->control = nir_selection_control_divergent_always_taken;
|
|
- }
|
|
-
|
|
- nir_cf_reinsert(&list, b.cursor);
|
|
-
|
|
- if (shader_guard)
|
|
- nir_pop_if(&b, shader_guard);
|
|
+ if (monolithic) {
|
|
+ nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 1);
|
|
+
|
|
+ nir_store_var(&b, vars.arg, nir_imm_int(&b, 0), 1);
|
|
+ }
|
|
+
|
|
+ store_param_var(&b, vars.shader_record_ptr, shader_record_ptr_arg, 1, 64);
|
|
+ store_param_var(&b, vars.accel_struct, accel_struct_arg, 1, 64);
|
|
+ store_param_var(&b, vars.cull_mask_and_flags, cull_mask_and_flags_arg, 1, 32);
|
|
+ store_param_var(&b, vars.sbt_offset, sbt_offset_arg, 1, 32);
|
|
+ store_param_var(&b, vars.sbt_stride, sbt_stride_arg, 1, 32);
|
|
+ store_param_var(&b, vars.miss_index, miss_index_arg, 1, 32);
|
|
+ store_param_var(&b, vars.origin, ray_origin_arg, 3, 32);
|
|
+ store_param_var(&b, vars.tmin, ray_tmin_arg, 1, 32);
|
|
+ store_param_var(&b, vars.direction, ray_direction_arg, 3, 32);
|
|
+ store_param_var(&b, vars.tmax, ray_tmax_arg, 1, 32);
|
|
+ store_param_var(&b, vars.primitive_id, primitive_id_arg, 1, 32);
|
|
+ store_param_var(&b, vars.instance_addr, instance_addr_arg, 1, 64);
|
|
+ store_param_var(&b, vars.geometry_id_and_flags, geometry_id_and_flags_arg, 1, 32);
|
|
+ store_param_var(&b, vars.hit_kind, hit_kind_arg, 1, 32);
|
|
|
|
b.cursor = nir_after_impl(impl);
|
|
|
|
if (monolithic) {
|
|
nir_terminate(&b);
|
|
- } else {
|
|
- /* select next shader */
|
|
- shader_addr = nir_load_var(&b, vars.shader_addr);
|
|
- nir_def *next = select_next_shader(&b, shader_addr, info->wave_size);
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.rt.uniform_shader_addr, next);
|
|
-
|
|
- ac_nir_store_arg(&b, &args->ac, args->descriptor_sets[0], descriptor_sets);
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.push_constants, push_constants);
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.rt.sbt_descriptors, sbt_descriptors);
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.rt.traversal_shader_addr, traversal_addr);
|
|
-
|
|
- for (uint32_t i = 0; i < ARRAY_SIZE(launch_sizes); i++) {
|
|
- if (rt_info.uses_launch_size)
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.rt.launch_sizes[i], launch_sizes[i]);
|
|
- else
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.launch_sizes[i], launch_sizes[i]);
|
|
- }
|
|
-
|
|
- if (scratch_offset)
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.scratch_offset, scratch_offset);
|
|
- if (ring_offsets)
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.ring_offsets, ring_offsets);
|
|
-
|
|
- for (uint32_t i = 0; i < ARRAY_SIZE(launch_ids); i++) {
|
|
- if (rt_info.uses_launch_id)
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.rt.launch_ids[i], launch_ids[i]);
|
|
- else
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.launch_ids[i], launch_ids[i]);
|
|
- }
|
|
-
|
|
- /* store back all variables to registers */
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.rt.dynamic_callable_stack_base, nir_load_var(&b, vars.stack_ptr));
|
|
- ac_nir_store_arg(&b, &args->ac, args->ac.rt.shader_addr, shader_addr);
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.shader_record, nir_load_var(&b, vars.shader_record_ptr));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.payload_offset, nir_load_var(&b, vars.arg));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.accel_struct, nir_load_var(&b, vars.accel_struct));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.cull_mask_and_flags,
|
|
- nir_load_var(&b, vars.cull_mask_and_flags));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.sbt_offset, nir_load_var(&b, vars.sbt_offset));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.sbt_stride, nir_load_var(&b, vars.sbt_stride));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.miss_index, nir_load_var(&b, vars.miss_index));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_origin, nir_load_var(&b, vars.origin));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_tmin, nir_load_var(&b, vars.tmin));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_direction, nir_load_var(&b, vars.direction));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_tmax, nir_load_var(&b, vars.tmax));
|
|
-
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.primitive_id, nir_load_var(&b, vars.primitive_id));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.instance_addr, nir_load_var(&b, vars.instance_addr));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.geometry_id_and_flags,
|
|
- nir_load_var(&b, vars.geometry_id_and_flags));
|
|
- radv_store_arg(&b, args, traversal_info, args->ac.rt.hit_kind, nir_load_var(&b, vars.hit_kind));
|
|
}
|
|
|
|
nir_metadata_preserve(impl, nir_metadata_none);
|
|
|
|
/* cleanup passes */
|
|
+ if (!monolithic) {
|
|
+ NIR_PASS_V(shader, radv_nir_lower_ray_payload_derefs, 0);
|
|
+
|
|
+ b.cursor = nir_before_impl(impl);
|
|
+ nir_deref_instr **payload_in_storage =
|
|
+ rzalloc_array_size(shader, sizeof(nir_deref_instr *), DIV_ROUND_UP(*payload_size, 4));
|
|
+ if (in_payload_base_arg != -1u) {
|
|
+ for (unsigned i = 0; i < DIV_ROUND_UP(*payload_size, 4); ++i) {
|
|
+ payload_in_storage[i] = nir_build_deref_cast(&b, nir_load_param(&b, in_payload_base_arg + i),
|
|
+ nir_var_shader_call_data, glsl_uint_type(), 4);
|
|
+ }
|
|
+ }
|
|
+ NIR_PASS_V(shader, lower_rt_storage, NULL, payload_in_storage, vars.out_payload_storage, info->wave_size);
|
|
+
|
|
+ nir_remove_dead_derefs(shader);
|
|
+ nir_remove_dead_variables(shader, nir_var_function_temp | nir_var_shader_call_data, NULL);
|
|
+ }
|
|
NIR_PASS_V(shader, nir_lower_global_vars_to_local);
|
|
NIR_PASS_V(shader, nir_lower_vars_to_ssa);
|
|
- if (shader->info.stage == MESA_SHADER_CLOSEST_HIT || shader->info.stage == MESA_SHADER_INTERSECTION)
|
|
- NIR_PASS_V(shader, lower_hit_attribs, NULL, info->wave_size);
|
|
}
|
|
|
|
static bool
|
|
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
|
|
index c4feea4a6f95b..196f8aa23a032 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_rt.c
|
|
+++ b/src/amd/vulkan/radv_pipeline_rt.c
|
|
@@ -368,7 +368,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
|
|
bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags);
|
|
bool keep_statistic_info = radv_pipeline_capture_shader_stats(device, pipeline->base.base.create_flags);
|
|
|
|
- radv_nir_lower_rt_io(stage->nir, monolithic, 0, payload_size);
|
|
+ radv_nir_lower_rt_io(stage->nir, monolithic, 0);
|
|
|
|
/* Gather shader info. */
|
|
nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir));
|
|
@@ -382,70 +382,30 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
|
|
stage->info.user_sgprs_locs = stage->args.user_sgprs_locs;
|
|
stage->info.inline_push_constant_mask = stage->args.ac.inline_push_const_mask;
|
|
|
|
- /* Move ray tracing system values to the top that are set by rt_trace_ray
|
|
- * to prevent them from being overwritten by other rt_trace_ray calls.
|
|
- */
|
|
- NIR_PASS_V(stage->nir, move_rt_instructions);
|
|
-
|
|
- uint32_t num_resume_shaders = 0;
|
|
- nir_shader **resume_shaders = NULL;
|
|
-
|
|
- if (stage->stage != MESA_SHADER_INTERSECTION && !monolithic) {
|
|
- nir_builder b = nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(stage->nir)));
|
|
- nir_rt_return_amd(&b);
|
|
-
|
|
- const nir_lower_shader_calls_options opts = {
|
|
- .address_format = nir_address_format_32bit_offset,
|
|
- .stack_alignment = 16,
|
|
- .localized_loads = true,
|
|
- .vectorizer_callback = ac_nir_mem_vectorize_callback,
|
|
- .vectorizer_data = &pdev->info.gfx_level,
|
|
- };
|
|
- nir_lower_shader_calls(stage->nir, &opts, &resume_shaders, &num_resume_shaders, stage->nir);
|
|
- }
|
|
-
|
|
- unsigned num_shaders = num_resume_shaders + 1;
|
|
- nir_shader **shaders = ralloc_array(stage->nir, nir_shader *, num_shaders);
|
|
- if (!shaders)
|
|
- return VK_ERROR_OUT_OF_HOST_MEMORY;
|
|
-
|
|
- shaders[0] = stage->nir;
|
|
- for (uint32_t i = 0; i < num_resume_shaders; i++)
|
|
- shaders[i + 1] = resume_shaders[i];
|
|
-
|
|
if (stage_info)
|
|
memset(stage_info->unused_args, 0xFF, sizeof(stage_info->unused_args));
|
|
|
|
/* Postprocess shader parts. */
|
|
- for (uint32_t i = 0; i < num_shaders; i++) {
|
|
- struct radv_shader_stage temp_stage = *stage;
|
|
- temp_stage.nir = shaders[i];
|
|
- radv_nir_lower_rt_abi(temp_stage.nir, pCreateInfo, &temp_stage.args, &stage->info, stack_size, i > 0, device,
|
|
- pipeline, monolithic, traversal_stage_info);
|
|
+ radv_nir_lower_rt_abi(stage->nir, pCreateInfo, &stage->args, &stage->info, payload_size, stack_size, device,
|
|
+ pipeline, monolithic);
|
|
|
|
- /* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */
|
|
- nir_shader_gather_info(temp_stage.nir, radv_get_rt_shader_entrypoint(temp_stage.nir));
|
|
+ /* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */
|
|
+ nir_shader_gather_info(stage->nir, radv_get_rt_shader_entrypoint(stage->nir));
|
|
|
|
- radv_optimize_nir(temp_stage.nir, stage->key.optimisations_disabled);
|
|
- radv_postprocess_nir(device, NULL, &temp_stage);
|
|
-
|
|
- if (stage_info)
|
|
- radv_gather_unused_args(stage_info, shaders[i]);
|
|
- }
|
|
+ radv_optimize_nir(stage->nir, stage->key.optimisations_disabled);
|
|
+ radv_postprocess_nir(device, NULL, stage);
|
|
|
|
- bool dump_shader = radv_can_dump_shader(device, shaders[0], false);
|
|
+ bool dump_shader = radv_can_dump_shader(device, stage->nir, false);
|
|
bool replayable =
|
|
pipeline->base.base.create_flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR;
|
|
|
|
if (dump_shader) {
|
|
simple_mtx_lock(&instance->shader_dump_mtx);
|
|
- for (uint32_t i = 0; i < num_shaders; i++)
|
|
- nir_print_shader(shaders[i], stderr);
|
|
+ nir_print_shader(stage->nir, stderr);
|
|
}
|
|
|
|
/* Compile NIR shader to AMD assembly. */
|
|
- binary =
|
|
- radv_shader_nir_to_asm(device, stage, shaders, num_shaders, NULL, keep_executable_info, keep_statistic_info);
|
|
+ binary = radv_shader_nir_to_asm(device, stage, &stage->nir, 1, NULL, keep_executable_info, keep_statistic_info);
|
|
struct radv_shader *shader;
|
|
if (replay_block || replayable) {
|
|
VkResult result = radv_shader_create_uncached(device, binary, replayable, replay_block, &shader);
|
|
@@ -463,7 +423,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache,
|
|
if (stack_size)
|
|
*stack_size += DIV_ROUND_UP(shader->config.scratch_bytes_per_wave, shader->info.wave_size);
|
|
|
|
- radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, binary, shader, shaders, num_shaders,
|
|
+ radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, binary, shader, &stage->nir, 1,
|
|
&stage->info);

   if (shader && keep_executable_info && stage->spirv.size) {
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index f6a0f35c23333..654ae528866d8 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -516,15 +516,14 @@ radv_get_rt_shader_entrypoint(nir_shader *shader)
   return NULL;
}

-void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset, uint32_t *payload_size);
+void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset);

struct radv_ray_tracing_stage_info;

void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
const struct radv_shader_args *args, const struct radv_shader_info *info,
- uint32_t *stack_size, bool resume_shader, uint32_t payload_size, struct radv_device *device,
- struct radv_ray_tracing_pipeline *pipeline, bool monolithic,
- const struct radv_ray_tracing_stage_info *traversal_info);
+ uint32_t *payload_size, uint32_t *stack_size, struct radv_device *device,
+ struct radv_ray_tracing_pipeline *pipeline, bool monolithic);

void radv_gather_unused_args(struct radv_ray_tracing_stage_info *info, nir_shader *nir);

diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c
index 75f5a66444f91..e52fc48c33ebc 100644
--- a/src/amd/vulkan/radv_shader_args.c
+++ b/src/amd/vulkan/radv_shader_args.c
@@ -316,7 +316,7 @@ radv_init_shader_args(const struct radv_device *device, gl_shader_stage stage, s
void
radv_declare_rt_shader_args(enum amd_gfx_level gfx_level, struct radv_shader_args *args)
{
- add_ud_arg(args, 2, AC_ARG_CONST_PTR, &args->ac.rt.uniform_shader_addr, AC_UD_SCRATCH_RING_OFFSETS);
+ add_ud_arg(args, 2, AC_ARG_CONST_PTR, &args->ac.ring_offsets, AC_UD_SCRATCH_RING_OFFSETS);
add_ud_arg(args, 1, AC_ARG_CONST_PTR_PTR, &args->descriptor_sets[0], AC_UD_INDIRECT_DESCRIPTOR_SETS);
ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, &args->ac.push_constants);
ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ac.rt.sbt_descriptors);
@@ -334,25 +334,8 @@ radv_declare_rt_shader_args(enum amd_gfx_level gfx_level, struct radv_shader_arg
ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.launch_ids[i]);

ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.dynamic_callable_stack_base);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.shader_addr);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.shader_record);

ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.payload_offset);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_FLOAT, &args->ac.rt.ray_origin);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_FLOAT, &args->ac.rt.ray_direction);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.rt.ray_tmin);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.rt.ray_tmax);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.cull_mask_and_flags);
-
- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.accel_struct);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.sbt_offset);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.sbt_stride);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.miss_index);
-
- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.instance_addr);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.primitive_id);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.geometry_id_and_flags);
- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.hit_kind);
}

static bool
@@ -548,7 +531,6 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics
radv_init_shader_args(device, stage, args);

if (gl_shader_stage_is_rt(stage)) {
- radv_declare_rt_shader_args(gfx_level, args);
return;
}

--
GitLab


From 7a6a16e551cf02df8e14d8b729584ca9d8bf5443 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Wed, 5 Jun 2024 12:22:46 +0200
Subject: [PATCH 29/71] radv/rt: Remove radv_gather_unused_args

Not needed anymore.
---
src/amd/vulkan/nir/radv_nir_rt_shader.c | 47 -------------------------
src/amd/vulkan/radv_shader.h | 2 --
2 files changed, 49 deletions(-)

diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
index 7968cb36f5d87..d0e43ebd406b7 100644
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
@@ -2457,50 +2457,3 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH
NIR_PASS_V(shader, nir_lower_global_vars_to_local);
NIR_PASS_V(shader, nir_lower_vars_to_ssa);
}
-
-static bool
-radv_arg_def_is_unused(nir_def *def)
-{
- nir_foreach_use (use, def) {
- nir_instr *use_instr = nir_src_parent_instr(use);
- if (use_instr->type == nir_instr_type_intrinsic) {
- nir_intrinsic_instr *use_intr = nir_instr_as_intrinsic(use_instr);
- if (use_intr->intrinsic == nir_intrinsic_store_scalar_arg_amd ||
- use_intr->intrinsic == nir_intrinsic_store_vector_arg_amd)
- continue;
- } else if (use_instr->type == nir_instr_type_phi) {
- nir_cf_node *prev_node = nir_cf_node_prev(&use_instr->block->cf_node);
- if (!prev_node)
- return false;
-
- nir_phi_instr *phi = nir_instr_as_phi(use_instr);
- if (radv_arg_def_is_unused(&phi->def))
- continue;
- }
-
- return false;
- }
-
- return true;
-}
-
-static bool
-radv_gather_unused_args_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data)
-{
- if (instr->intrinsic != nir_intrinsic_load_scalar_arg_amd && instr->intrinsic != nir_intrinsic_load_vector_arg_amd)
- return false;
-
- if (!radv_arg_def_is_unused(&instr->def)) {
- /* This arg is used for more than passing data to the next stage. */
- struct radv_ray_tracing_stage_info *info = data;
- BITSET_CLEAR(info->unused_args, nir_intrinsic_base(instr));
- }
-
- return false;
-}
-
-void
-radv_gather_unused_args(struct radv_ray_tracing_stage_info *info, nir_shader *nir)
-{
- nir_shader_intrinsics_pass(nir, radv_gather_unused_args_instr, nir_metadata_all, info);
-}
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 654ae528866d8..7dacf66a7a3fa 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -525,8 +525,6 @@ void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateI
uint32_t *payload_size, uint32_t *stack_size, struct radv_device *device,
struct radv_ray_tracing_pipeline *pipeline, bool monolithic);

-void radv_gather_unused_args(struct radv_ray_tracing_stage_info *info, nir_shader *nir);
-
struct radv_shader_stage;

nir_shader *radv_shader_spirv_to_nir(struct radv_device *device, const struct radv_shader_stage *stage,
--
GitLab


From c4aa21f8f03032e97d13aece927b62240986fd39 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Sat, 4 May 2024 17:51:17 +0200
Subject: [PATCH 30/71] radv/rt: make radv_nir_init_rt_function_params public

---
src/amd/vulkan/nir/radv_nir_rt_shader.c | 10 +++++-----
src/amd/vulkan/radv_shader.h | 1 +
2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c
index d0e43ebd406b7..aa9af1eeefd54 100644
--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c
+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c
@@ -221,7 +221,7 @@ lower_rt_derefs(nir_shader *shader)
return progress;
}

-static void
+void
radv_nir_init_rt_function_params(nir_function *function, gl_shader_stage stage, unsigned payload_size)
{
unsigned payload_base = -1u;
@@ -583,13 +583,13 @@ create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipe
}

nir_function *trace_ray_func = nir_function_create(shader, "trace_ray_func");
- radv_nir_init_function_params(trace_ray_func, MESA_SHADER_INTERSECTION, max_payload_size);
+ radv_nir_init_rt_function_params(trace_ray_func, MESA_SHADER_INTERSECTION, max_payload_size);
vars.trace_ray_func = trace_ray_func;
nir_function *chit_miss_func = nir_function_create(shader, "chit_miss_func");
- radv_nir_init_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, max_payload_size);
+ radv_nir_init_rt_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, max_payload_size);
vars.chit_miss_func = chit_miss_func;
nir_function *callable_func = nir_function_create(shader, "callable_func");
- radv_nir_init_function_params(callable_func, MESA_SHADER_CALLABLE, max_payload_size);
+ radv_nir_init_rt_function_params(callable_func, MESA_SHADER_CALLABLE, max_payload_size);
vars.callable_func = callable_func;
return vars;
}
@@ -2287,7 +2287,7 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH
nir_function_impl *impl = nir_shader_get_entrypoint(shader);
nir_function *entrypoint_function = impl->function;

- radv_nir_init_function_params(entrypoint_function, shader->info.stage, *payload_size);
+ radv_nir_init_rt_function_params(entrypoint_function, shader->info.stage, *payload_size);

const VkPipelineCreateFlagBits2KHR create_flags = vk_rt_pipeline_create_flags(pCreateInfo);

diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 7dacf66a7a3fa..10e062fb041b9 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -520,6 +520,7 @@ void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_

struct radv_ray_tracing_stage_info;

+void radv_nir_init_rt_function_params(nir_function *function, gl_shader_stage stage, unsigned payload_size);
void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo,
const struct radv_shader_args *args, const struct radv_shader_info *info,
uint32_t *payload_size, uint32_t *stack_size, struct radv_device *device,
--
GitLab


From 98acf10bc32ec843f53497bc701a673777232c65 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Fri, 3 May 2024 17:36:43 +0200
Subject: [PATCH 31/71] radv: Use call optimization

---
src/amd/vulkan/radv_pipeline.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 82a5aac71437d..daaf4e9ba4f00 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -643,6 +643,8 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
* spilling.
*/
NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons);
+
+ NIR_PASS(_, stage->nir, nir_minimize_call_live_states);
}
}

--
GitLab


From 872b8a249c2fa92a5425c4476d7021d881d76990 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Thu, 28 Dec 2023 20:03:05 +0100
Subject: [PATCH 32/71] aco: Add ABI and Pseudo CALL format

---
src/amd/compiler/aco_builder_h.py | 29 +++
.../compiler/aco_instruction_selection.cpp | 23 ++
src/amd/compiler/aco_ir.cpp | 1 +
src/amd/compiler/aco_ir.h | 235 +++++++++++++++++-
src/amd/compiler/aco_opcodes.py | 7 +-
src/amd/compiler/aco_register_allocation.cpp | 71 ------
6 files changed, 292 insertions(+), 74 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
|
|
index b1054bda76fd6..3d47be6101248 100644
|
|
--- a/src/amd/compiler/aco_builder_h.py
|
|
+++ b/src/amd/compiler/aco_builder_h.py
|
|
@@ -567,6 +567,7 @@ formats = [("pseudo", [Format.PSEUDO], list(itertools.product(range(5), range(6)
|
|
("branch", [Format.PSEUDO_BRANCH], itertools.product([1], [0, 1])),
|
|
("barrier", [Format.PSEUDO_BARRIER], [(0, 0)]),
|
|
("reduction", [Format.PSEUDO_REDUCTION], [(3, 3)]),
|
|
+ ("call", [Format.PSEUDO_CALL], [(0, 0)]),
|
|
("vop1", [Format.VOP1], [(0, 0), (1, 1), (2, 2)]),
|
|
("vop1_sdwa", [Format.VOP1, Format.SDWA], [(1, 1)]),
|
|
("vop2", [Format.VOP2], itertools.product([1, 2], [2, 3])),
|
|
@@ -603,6 +604,7 @@ formats = [("pseudo", [Format.PSEUDO], list(itertools.product(range(5), range(6)
|
|
formats = [(f if len(f) == 5 else f + ('',)) for f in formats]
|
|
%>\\
|
|
% for name, formats, shapes, extra_field_setup in formats:
|
|
+ % if shapes:
|
|
% for num_definitions, num_operands in shapes:
|
|
<%
|
|
args = ['aco_opcode opcode']
|
|
@@ -655,6 +657,33 @@ formats = [(f if len(f) == 5 else f + ('',)) for f in formats]
|
|
|
|
% endif
|
|
% endfor
|
|
+% else:
|
|
+ <%
|
|
+ args = ['aco_opcode opcode', 'aco::span<Definition> definitions', 'aco::span<Operand> operands' ]
|
|
+ for f in formats:
|
|
+ args += f.get_builder_field_decls()
|
|
+ %>\\
|
|
+
|
|
+ Result ${name}(${', '.join(args)})
|
|
+ {
|
|
+ ${struct} *instr = create_instruction<${struct}>(opcode, (Format)(${'|'.join('(int)Format::%s' % f.name for f in formats)}), operands.size(), definitions.size());
|
|
+ for (unsigned i = 0; i < definitions.size(); ++i) {
|
|
+ instr->definitions[i] = definitions[i];
|
|
+ instr->definitions[i].setPrecise(is_precise);
|
|
+ instr->definitions[i].setNUW(is_nuw);
|
|
+ }
|
|
+ for (unsigned i = 0; i < operands.size(); ++i)
|
|
+ instr->operands[i] = operands[i];
|
|
+ % for f in formats:
|
|
+ % for dest, field_name in zip(f.get_builder_field_dests(), f.get_builder_field_names()):
|
|
+ instr->${dest} = ${field_name};
|
|
+ % endfor
|
|
+ ${f.get_builder_initialization(num_operands)}
|
|
+ % endfor
|
|
+ ${extra_field_setup}
|
|
+ return insert(instr);
|
|
+ }
|
|
+% endif
|
|
% endfor
|
|
};
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index 30f0bdd1cb8f8..662b6cccc0abf 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -10537,6 +10537,29 @@ visit_jump(isel_context* ctx, nir_jump_instr* instr)
|
|
}
|
|
}
|
|
|
|
+ABI
|
|
+make_abi(const ABI& base, Program* program)
|
|
+{
|
|
+ ABI abi = base;
|
|
+
|
|
+ unsigned sgpr_limit = program->dev.sgpr_limit;
|
|
+ /* GFX8- needs a scratch_rsrc that we need to keep around somewhere */
|
|
+ if (program->gfx_level < GFX9)
|
|
+ sgpr_limit -= (align(sgpr_limit, 4) - sgpr_limit) + 4;
|
|
+ unsigned vgpr_limit = program->dev.vgpr_limit;
|
|
+
|
|
+ abi.parameterSpace.sgpr.size =
|
|
+ std::min(abi.parameterSpace.sgpr.size, sgpr_limit - abi.parameterSpace.sgpr.lo());
|
|
+ abi.parameterSpace.vgpr.size =
|
|
+ std::min(abi.parameterSpace.vgpr.size, vgpr_limit - (abi.parameterSpace.vgpr.lo() - 256));
|
|
+ abi.clobberedRegs.sgpr.size =
|
|
+ std::min(abi.clobberedRegs.sgpr.size, sgpr_limit - abi.clobberedRegs.sgpr.lo());
|
|
+ abi.clobberedRegs.vgpr.size =
|
|
+ std::min(abi.clobberedRegs.vgpr.size, vgpr_limit - (abi.clobberedRegs.vgpr.lo() - 256));
|
|
+
|
|
+ return abi;
|
|
+}
|
|
+
|
|
void
|
|
visit_block(isel_context* ctx, nir_block* block)
|
|
{
|
|
diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp
|
|
index 2c0b17a82cae6..afa1364a83f59 100644
|
|
--- a/src/amd/compiler/aco_ir.cpp
|
|
+++ b/src/amd/compiler/aco_ir.cpp
|
|
@@ -1541,6 +1541,7 @@ get_instr_data_size(Format format)
|
|
case Format::PSEUDO_BARRIER: return sizeof(Pseudo_barrier_instruction);
|
|
case Format::PSEUDO_REDUCTION: return sizeof(Pseudo_reduction_instruction);
|
|
case Format::PSEUDO_BRANCH: return sizeof(Pseudo_branch_instruction);
|
|
+ case Format::PSEUDO_CALL: return sizeof(Pseudo_call_instruction);
|
|
case Format::DS: return sizeof(DS_instruction);
|
|
case Format::FLAT:
|
|
case Format::GLOBAL:
|
|
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
|
|
index d838b728e19ce..62661b8918a9e 100644
|
|
--- a/src/amd/compiler/aco_ir.h
|
|
+++ b/src/amd/compiler/aco_ir.h
|
|
@@ -441,6 +441,215 @@ static constexpr PhysReg exec_hi{127};
|
|
static constexpr PhysReg pops_exiting_wave_id{239}; /* GFX9-GFX10.3 */
|
|
static constexpr PhysReg scc{253};
|
|
|
|
+/* Iterator type for making PhysRegInterval compatible with range-based for */
|
|
+struct PhysRegIterator {
|
|
+ using difference_type = int;
|
|
+ using value_type = unsigned;
|
|
+ using reference = const unsigned&;
|
|
+ using pointer = const unsigned*;
|
|
+ using iterator_category = std::bidirectional_iterator_tag;
|
|
+
|
|
+ PhysReg reg;
|
|
+
|
|
+ PhysReg operator*() const { return reg; }
|
|
+
|
|
+ PhysRegIterator& operator++()
|
|
+ {
|
|
+ reg.reg_b += 4;
|
|
+ return *this;
|
|
+ }
|
|
+
|
|
+ PhysRegIterator& operator--()
|
|
+ {
|
|
+ reg.reg_b -= 4;
|
|
+ return *this;
|
|
+ }
|
|
+
|
|
+ bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }
|
|
+
|
|
+ bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }
|
|
+
|
|
+ bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
|
|
+};
|
|
+
|
|
+/* Half-open register interval used in "sliding window"-style for-loops */
|
|
+struct PhysRegInterval {
|
|
+ PhysReg lo_;
|
|
+ unsigned size;
|
|
+
|
|
+ /* Inclusive lower bound */
|
|
+ PhysReg lo() const { return lo_; }
|
|
+
|
|
+ /* Exclusive upper bound */
|
|
+ PhysReg hi() const { return PhysReg{lo() + size}; }
|
|
+
|
|
+ PhysRegInterval& operator+=(uint32_t stride)
|
|
+ {
|
|
+ lo_ = PhysReg{lo_.reg() + stride};
|
|
+ return *this;
|
|
+ }
|
|
+
|
|
+ bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }
|
|
+
|
|
+ /* Construct a half-open interval, excluding the end register */
|
|
+ static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }
|
|
+
|
|
+ bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }
|
|
+
|
|
+ bool contains(const PhysRegInterval& needle) const
|
|
+ {
|
|
+ return needle.lo() >= lo() && needle.hi() <= hi();
|
|
+ }
|
|
+
|
|
+ PhysRegIterator begin() const { return {lo_}; }
|
|
+
|
|
+ PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
|
|
+};
|
|
+
|
|
+inline bool
|
|
+intersects(const PhysRegInterval& a, const PhysRegInterval& b)
|
|
+{
|
|
+ return a.hi() > b.lo() && b.hi() > a.lo();
|
|
+}
|
|
+
|
|
+struct GPRInterval {
|
|
+ PhysRegInterval sgpr;
|
|
+ PhysRegInterval vgpr;
|
|
+};
|
|
+
|
|
+struct ABI {
|
|
+ GPRInterval parameterSpace;
|
|
+ GPRInterval clobberedRegs;
|
|
+
|
|
+ bool clobbersVCC;
|
|
+ bool clobbersSCC;
|
|
+};
|
|
+
|
|
+static constexpr ABI rtRaygenABI = {
|
|
+ .parameterSpace =
|
|
+ {
|
|
+ .sgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(0),
|
|
+ .size = 32,
|
|
+ },
|
|
+ .vgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(256),
|
|
+ .size = 32,
|
|
+ },
|
|
+ },
|
|
+ .clobberedRegs =
|
|
+ {
|
|
+ .sgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(0),
|
|
+ .size = 108,
|
|
+ },
|
|
+ .vgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(256),
|
|
+ .size = 128,
|
|
+ },
|
|
+ },
|
|
+ .clobbersVCC = true,
|
|
+ .clobbersSCC = true,
|
|
+};
|
|
+
|
|
+static constexpr ABI rtTraversalABI = {
|
|
+ .parameterSpace =
|
|
+ {
|
|
+ .sgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(0),
|
|
+ .size = 32,
|
|
+ },
|
|
+ .vgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(256),
|
|
+ .size = 32,
|
|
+ },
|
|
+ },
|
|
+ .clobberedRegs =
|
|
+ {
|
|
+ /* TODO: maybe find better values */
|
|
+ .sgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(0),
|
|
+ .size = 108,
|
|
+ },
|
|
+ .vgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(256),
|
|
+ .size = 128,
|
|
+ },
|
|
+ },
|
|
+ .clobbersVCC = true,
|
|
+ .clobbersSCC = true,
|
|
+};
|
|
+
|
|
+static constexpr ABI rtAnyHitABI = {
|
|
+ .parameterSpace =
|
|
+ {
|
|
+ .sgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(0),
|
|
+ .size = 32,
|
|
+ },
|
|
+ .vgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(256),
|
|
+ .size = 32,
|
|
+ },
|
|
+ },
|
|
+ .clobberedRegs =
|
|
+ {
|
|
+ .sgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(80),
|
|
+ .size = 16,
|
|
+ },
|
|
+ .vgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(256 + 80),
|
|
+ .size = 32,
|
|
+ },
|
|
+ },
|
|
+ .clobbersVCC = true,
|
|
+ .clobbersSCC = true,
|
|
+};
|
|
+
|
|
+static constexpr ABI rtClosestHitMissABI = {
|
|
+ .parameterSpace =
|
|
+ {
|
|
+ .sgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(0),
|
|
+ .size = 32,
|
|
+ },
|
|
+ .vgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(256),
|
|
+ .size = 32,
|
|
+ },
|
|
+ },
|
|
+ .clobberedRegs =
|
|
+ {
|
|
+ .sgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(0),
|
|
+ .size = 108,
|
|
+ },
|
|
+ .vgpr =
|
|
+ {
|
|
+ .lo_ = PhysReg(256),
|
|
+ .size = 128,
|
|
+ },
|
|
+ },
|
|
+ .clobbersVCC = true,
|
|
+ .clobbersSCC = true,
|
|
+};
|
|
+
|
|
/**
|
|
* Operand Class
|
|
* Initially, each Operand refers to either
|
|
@@ -1095,6 +1304,7 @@ struct FLAT_instruction;
|
|
struct Pseudo_branch_instruction;
|
|
struct Pseudo_barrier_instruction;
|
|
struct Pseudo_reduction_instruction;
|
|
+struct Pseudo_call_instruction;
|
|
struct VALU_instruction;
|
|
struct VINTERP_inreg_instruction;
|
|
struct VINTRP_instruction;
|
|
@@ -1295,6 +1505,17 @@ struct Instruction {
|
|
return *(Pseudo_reduction_instruction*)this;
|
|
}
|
|
constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; }
|
|
+ Pseudo_call_instruction& call() noexcept
|
|
+ {
|
|
+ assert(isCall());
|
|
+ return *(Pseudo_call_instruction*)this;
|
|
+ }
|
|
+ const Pseudo_call_instruction& call() const noexcept
|
|
+ {
|
|
+ assert(isCall());
|
|
+ return *(Pseudo_call_instruction*)this;
|
|
+ }
|
|
+ constexpr bool isCall() const noexcept { return format == Format::PSEUDO_CALL; }
|
|
constexpr bool isVOP3P() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3P; }
|
|
VINTERP_inreg_instruction& vinterp_inreg() noexcept
|
|
{
|
|
@@ -1773,6 +1994,16 @@ struct Pseudo_reduction_instruction : public Instruction {
|
|
static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4,
|
|
"Unexpected padding");
|
|
|
|
+struct Pseudo_call_instruction : public Instruction {
|
|
+ ABI abi;
|
|
+ /*
|
|
+ * Register demand that's exclusively used for blocking registers for ABI compatibility.
|
|
+ * Set by live var analysis.
|
|
+ */
|
|
+ RegisterDemand blocked_abi_demand;
|
|
+};
|
|
+static_assert(sizeof(Pseudo_call_instruction) == sizeof(Instruction) + 40, "Unexpected padding");
|
|
+
|
|
inline bool
|
|
Instruction::accessesLDS() const noexcept
|
|
{
|
|
@@ -1845,8 +2076,8 @@ memory_sync_info get_sync_info(const Instruction* instr);
|
|
inline bool
|
|
is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
|
|
{
|
|
- if (instr->definitions.empty() || instr->isBranch() || instr->opcode == aco_opcode::p_startpgm ||
|
|
- instr->opcode == aco_opcode::p_init_scratch ||
|
|
+ if (instr->definitions.empty() || instr->isBranch() || instr->isCall() ||
|
|
+ instr->opcode == aco_opcode::p_startpgm || instr->opcode == aco_opcode::p_init_scratch ||
|
|
instr->opcode == aco_opcode::p_dual_src_export_gfx11)
|
|
return false;
|
|
|
|
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
|
|
index 6e37ee6fad6c9..d828f1642658b 100644
|
|
--- a/src/amd/compiler/aco_opcodes.py
|
|
+++ b/src/amd/compiler/aco_opcodes.py
|
|
@@ -50,6 +50,7 @@ class Format(IntEnum):
|
|
PSEUDO_BRANCH = auto()
|
|
PSEUDO_BARRIER = auto()
|
|
PSEUDO_REDUCTION = auto()
|
|
+ PSEUDO_CALL = auto()
|
|
# Scalar ALU & Control Formats
|
|
SOP1 = auto()
|
|
SOP2 = auto()
|
|
@@ -93,7 +94,7 @@ class Format(IntEnum):
|
|
return "salu"
|
|
elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]:
|
|
return "flatlike"
|
|
- elif self in [Format.PSEUDO_BRANCH, Format.PSEUDO_REDUCTION, Format.PSEUDO_BARRIER]:
|
|
+ elif self in [Format.PSEUDO_BRANCH, Format.PSEUDO_REDUCTION, Format.PSEUDO_BARRIER, Format.PSEUDO_CALL]:
|
|
return self.name.split("_")[-1].lower()
|
|
else:
|
|
return self.name.lower()
|
|
@@ -162,6 +163,8 @@ class Format(IntEnum):
|
|
elif self == Format.PSEUDO_BARRIER:
|
|
return [('memory_sync_info', 'sync', None),
|
|
('sync_scope', 'exec_scope', 'scope_invocation')]
|
|
+ elif self == Format.PSEUDO_CALL:
|
|
+ return [('ABI', 'abi', None)]
|
|
elif self == Format.VINTRP:
|
|
return [('unsigned', 'attribute', None),
|
|
('unsigned', 'component', None),
|
|
@@ -351,6 +354,8 @@ insn("p_cbranch_nz", format=Format.PSEUDO_BRANCH)
|
|
|
|
insn("p_barrier", format=Format.PSEUDO_BARRIER)
|
|
|
|
+insn("p_call", format=Format.PSEUDO_CALL)
|
|
+
|
|
# Primitive Ordered Pixel Shading pseudo-instructions.
|
|
|
|
# For querying whether the current wave can enter the ordered section on GFX9-10.3, doing
|
|
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
|
|
index 3ce0680bf52d6..4d73525bd0660 100644
|
|
--- a/src/amd/compiler/aco_register_allocation.cpp
|
|
+++ b/src/amd/compiler/aco_register_allocation.cpp
|
|
@@ -74,37 +74,6 @@ struct assignment {
|
|
}
|
|
};
|
|
|
|
-/* Iterator type for making PhysRegInterval compatible with range-based for */
|
|
-struct PhysRegIterator {
|
|
- using difference_type = int;
|
|
- using value_type = unsigned;
|
|
- using reference = const unsigned&;
|
|
- using pointer = const unsigned*;
|
|
- using iterator_category = std::bidirectional_iterator_tag;
|
|
-
|
|
- PhysReg reg;
|
|
-
|
|
- PhysReg operator*() const { return reg; }
|
|
-
|
|
- PhysRegIterator& operator++()
|
|
- {
|
|
- reg.reg_b += 4;
|
|
- return *this;
|
|
- }
|
|
-
|
|
- PhysRegIterator& operator--()
|
|
- {
|
|
- reg.reg_b -= 4;
|
|
- return *this;
|
|
- }
|
|
-
|
|
- bool operator==(PhysRegIterator oth) const { return reg == oth.reg; }
|
|
-
|
|
- bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; }
|
|
-
|
|
- bool operator<(PhysRegIterator oth) const { return reg < oth.reg; }
|
|
-};
|
|
-
|
|
struct vector_info {
|
|
vector_info() : is_weak(false), num_parts(0), parts(NULL) {}
|
|
vector_info(Instruction* instr, unsigned start = 0, bool weak = false)
|
|
@@ -162,46 +131,6 @@ struct ra_ctx {
|
|
}
|
|
};
|
|
|
|
-/* Half-open register interval used in "sliding window"-style for-loops */
|
|
-struct PhysRegInterval {
|
|
- PhysReg lo_;
|
|
- unsigned size;
|
|
-
|
|
- /* Inclusive lower bound */
|
|
- PhysReg lo() const { return lo_; }
|
|
-
|
|
- /* Exclusive upper bound */
|
|
- PhysReg hi() const { return PhysReg{lo() + size}; }
|
|
-
|
|
- PhysRegInterval& operator+=(uint32_t stride)
|
|
- {
|
|
- lo_ = PhysReg{lo_.reg() + stride};
|
|
- return *this;
|
|
- }
|
|
-
|
|
- bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; }
|
|
-
|
|
- /* Construct a half-open interval, excluding the end register */
|
|
- static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; }
|
|
-
|
|
- bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); }
|
|
-
|
|
- bool contains(const PhysRegInterval& needle) const
|
|
- {
|
|
- return needle.lo() >= lo() && needle.hi() <= hi();
|
|
- }
|
|
-
|
|
- PhysRegIterator begin() const { return {lo_}; }
|
|
-
|
|
- PhysRegIterator end() const { return {PhysReg{lo_ + size}}; }
|
|
-};
|
|
-
|
|
-bool
|
|
-intersects(const PhysRegInterval& a, const PhysRegInterval& b)
|
|
-{
|
|
- return a.hi() > b.lo() && b.hi() > a.lo();
|
|
-}
|
|
-
|
|
/* Gets the stride for full (non-subdword) registers */
|
|
uint32_t
|
|
get_stride(RegClass rc)
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 9d88284e83bab4a0ba20700dc3be48c646284a79 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Tue, 9 Apr 2024 08:08:07 +0200
Subject: [PATCH 33/71] aco: Add pseudo instr to calculate a function callee's
stack pointer

---
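Worked example for the p_callee_stack_ptr lowering below (numbers picked purely
for illustration): with scratch_bytes_per_wave = 8192 and wave_size = 64,
caller_stack_size = 8192 / 64 = 128 bytes per lane; a 16-byte scratch parameter
area then gives callee_stack_start = 128 + 16 = 144. On GFX8 and older the
offset is per-wave rather than per-lane, so it is scaled by wave_size to
144 * 64 = 9216.
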
src/amd/compiler/aco_lower_to_hw_instr.cpp | 14 ++++++++++++++
src/amd/compiler/aco_opcodes.py | 2 ++
2 files changed, 16 insertions(+)

diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index fa3c805f491b5..1e1737319c3f6 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -2817,6 +2817,20 @@ lower_to_hw_instr(Program* program)
((32 - 1) << 11) | shader_cycles_hi);
break;
}
+ case aco_opcode::p_callee_stack_ptr: {
+ unsigned caller_stack_size =
+ ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size;
+ unsigned scratch_param_size = instr->operands[0].constantValue();
+ unsigned callee_stack_start = caller_stack_size + scratch_param_size;
+ if (ctx.program->gfx_level < GFX9)
+ callee_stack_start *= ctx.program->wave_size;
+ if (instr->operands.size() < 2)
+ bld.sop1(aco_opcode::s_mov_b32, instr->definitions[0],
+ Operand::c32(callee_stack_start));
+ else
+ bld.sop2(aco_opcode::s_add_u32, instr->definitions[0], Definition(scc, s1),
+ instr->operands[1], Operand::c32(callee_stack_start));
+ }
default: break;
}
} else if (instr->isBranch()) {
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index d828f1642658b..696a5a945b310 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -331,6 +331,8 @@ insn("p_boolean_phi")
insn("p_as_uniform")
insn("p_unit_test")

+insn("p_callee_stack_ptr")
+
insn("p_create_vector")
insn("p_extract_vector")
insn("p_split_vector")
--
GitLab


From 0e07c86fd764126d0af3bfb2041d680e9367ee6e Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 22 Apr 2024 06:50:54 +0200
Subject: [PATCH 34/71] aco: Add scratch stack pointer

Function callees shouldn't overwrite caller's stacks.
Track where to write scratch data with a stack pointer.
---
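A minimal sketch of the addressing scheme this and the following patches
implement (names are simplified stand-ins for illustration, not the actual ACO
code): the caller hands the callee a stack pointer past its own frame, and the
callee rebases every scratch access on it, so the two frames never alias.

   static unsigned
   callee_stack_ptr(unsigned caller_stack_ptr, unsigned caller_frame_size,
                    unsigned scratch_param_size)
   {
      /* first scratch byte the callee may use */
      return caller_stack_ptr + caller_frame_size + scratch_param_size;
   }

   static unsigned
   callee_scratch_address(unsigned stack_ptr, unsigned frame_offset)
   {
      /* every callee scratch load/store is offset by the inherited stack pointer */
      return stack_ptr + frame_offset;
   }
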
src/amd/compiler/aco_ir.h | 1 +
src/amd/compiler/aco_reindex_ssa.cpp | 1 +
2 files changed, 2 insertions(+)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 62661b8918a9e..ef2a6a0255664 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2361,6 +2361,7 @@ public:
std::vector<uint8_t> constant_data;
Temp private_segment_buffer;
Temp scratch_offset;
+ Temp stack_ptr = {};

uint16_t num_waves = 0;
uint16_t min_waves = 0;
diff --git a/src/amd/compiler/aco_reindex_ssa.cpp b/src/amd/compiler/aco_reindex_ssa.cpp
index 7c30e5b53656e..5e135a8ff83fe 100644
--- a/src/amd/compiler/aco_reindex_ssa.cpp
+++ b/src/amd/compiler/aco_reindex_ssa.cpp
@@ -73,6 +73,7 @@ reindex_program(idx_ctx& ctx, Program* program)
program->private_segment_buffer.regClass());
program->scratch_offset =
Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass());
+ program->stack_ptr = Temp(ctx.renames[program->stack_ptr.id()], program->stack_ptr.regClass());
program->temp_rc = ctx.temp_rc;
}

--
GitLab


From e876db458963a92579827a04a21b1427c0442c72 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 22 Apr 2024 06:51:10 +0200
Subject: [PATCH 35/71] aco/spill: Use scratch stack pointer

---
src/amd/compiler/aco_spill.cpp | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index be45b0eda7632..2e30bf9e2783e 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1240,7 +1240,12 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
}

/* GFX9+ uses scratch_* instructions, which don't use a resource. */
- ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr));
+ if (ctx.program->stack_ptr.id())
+ ctx.scratch_rsrc =
+ offset_bld.sop2(aco_opcode::s_add_u32, offset_bld.def(s1), Definition(scc, s1),
+ Operand(ctx.program->stack_ptr), Operand::c32(saddr));
+ else
+ ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr));
}
} else {
if (ctx.scratch_rsrc == Temp())
--
GitLab


From 968bea7283d902a01843297661c63ea802a67a04 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Sat, 4 May 2024 16:01:59 +0200
Subject: [PATCH 36/71] nir: Allow forward-declaring nir_parameter

---
src/compiler/nir/nir.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 10a592f4b87bb..dc6b15cd082b4 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -3646,7 +3646,7 @@ nir_cf_list_is_empty_block(struct exec_list *cf_list)
return false;
}

-typedef struct {
+typedef struct nir_parameter {
uint8_t num_components;
uint8_t bit_size;

--
GitLab


From e245c9553b06094af7afc232d8db158bd2e7b3d6 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 6 Mar 2024 13:27:17 +0100
|
|
Subject: [PATCH 37/71] aco: Add call info
|
|
|
|
---
|
|
.../compiler/aco_instruction_selection.cpp | 80 +++++++++++++++++++
|
|
src/amd/compiler/aco_instruction_selection.h | 32 ++++++++
|
|
.../aco_instruction_selection_setup.cpp | 8 ++
|
|
src/amd/compiler/aco_ir.h | 4 +
|
|
4 files changed, 124 insertions(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index 662b6cccc0abf..0875d1c7a20f4 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -10,6 +10,7 @@
|
|
#include "aco_builder.h"
|
|
#include "aco_interface.h"
|
|
#include "aco_ir.h"
|
|
+#include "aco_nir_call_attribs.h"
|
|
|
|
#include "common/ac_descriptors.h"
|
|
#include "common/ac_gpu_info.h"
|
|
@@ -10560,6 +10561,85 @@ make_abi(const ABI& base, Program* program)
|
|
return abi;
|
|
}
|
|
|
|
+struct callee_info
|
|
+get_callee_info(const ABI& abi, unsigned param_count, const nir_parameter* parameters,
|
|
+ Program* program)
|
|
+{
|
|
+ struct callee_info info = {};
|
|
+ info.param_infos.reserve(param_count);
|
|
+
|
|
+ unsigned sgpr_reg_byte_offset = 0;
|
|
+ unsigned vgpr_reg_byte_offset = 0;
|
|
+ unsigned scratch_param_byte_offset = 0;
|
|
+
|
|
+ Temp return_addr = program ? program->allocateTmp(s2) : Temp();
|
|
+ Definition return_def = Definition(return_addr);
|
|
+ return_def.setPrecolored(abi.parameterSpace.sgpr.lo().advance(sgpr_reg_byte_offset));
|
|
+ sgpr_reg_byte_offset += 8;
|
|
+
|
|
+ info.return_address = parameter_info{
|
|
+ .discardable = false,
|
|
+ .is_reg = true,
|
|
+ .def = return_def,
|
|
+ };
|
|
+
|
|
+ for (unsigned i = 0; i < param_count; ++i) {
|
|
+ unsigned* reg_byte_offset;
|
|
+ PhysRegInterval interval;
|
|
+ RegType type;
|
|
+ if (parameters[i].is_uniform) {
|
|
+ reg_byte_offset = &sgpr_reg_byte_offset;
|
|
+ interval = abi.parameterSpace.sgpr;
|
|
+ /* Explicitly reserve space for the stack pointer, which is allocated last */
|
|
+ interval.size -= 1;
|
|
+ type = RegType::sgpr;
|
|
+ } else {
|
|
+ reg_byte_offset = &vgpr_reg_byte_offset;
|
|
+ interval = abi.parameterSpace.vgpr;
|
|
+ type = RegType::vgpr;
|
|
+ }
|
|
+
|
|
+ unsigned byte_size = align(parameters[i].bit_size, 32) / 8 * parameters[i].num_components;
|
|
+ RegClass rc = RegClass(type, byte_size / 4);
|
|
+ PhysReg param_reg = interval.lo().advance(*reg_byte_offset);
|
|
+
|
|
+ if (param_reg < interval.hi()) {
|
|
+ ++info.reg_param_count;
|
|
+ if (parameters[i].is_return)
|
|
+ ++info.reg_return_param_count;
|
|
+ Temp dst = program ? program->allocateTmp(rc) : Temp();
|
|
+ Definition def = Definition(dst);
|
|
+ def.setPrecolored(param_reg);
|
|
+ *reg_byte_offset += byte_size;
|
|
+ info.param_infos.emplace_back(parameter_info{
|
|
+ .discardable = !!(parameters[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE),
|
|
+ .is_reg = true,
|
|
+ .def = def,
|
|
+ });
|
|
+ } else {
|
|
+ info.param_infos.emplace_back(parameter_info{
|
|
+ .discardable = !!(parameters[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE),
|
|
+ .is_reg = false,
|
|
+ .scratch_offset = scratch_param_byte_offset,
|
|
+ });
|
|
+ scratch_param_byte_offset += byte_size;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ Temp stack_ptr = program ? program->allocateTmp(s1) : Temp();
|
|
+ Definition stack_def = Definition(stack_ptr);
|
|
+ stack_def.setPrecolored(abi.parameterSpace.sgpr.lo().advance(sgpr_reg_byte_offset));
|
|
+ sgpr_reg_byte_offset += 4;
|
|
+ info.stack_ptr = parameter_info{
|
|
+ .discardable = false,
|
|
+ .is_reg = true,
|
|
+ .def = stack_def,
|
|
+ };
|
|
+
|
|
+ info.scratch_param_size = scratch_param_byte_offset;
|
|
+ return info;
|
|
+}
|
|
+
|
|
void
|
|
visit_block(isel_context* ctx, nir_block* block)
|
|
{
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h
|
|
index d7464811def91..1682ed262f1e5 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.h
|
|
+++ b/src/amd/compiler/aco_instruction_selection.h
|
|
@@ -35,6 +35,31 @@ struct shader_io_state {
|
|
}
|
|
};
|
|
|
|
+struct parameter_info {
|
|
+ bool discardable;
|
|
+ bool is_reg;
|
|
+ union {
|
|
+ Definition def;
|
|
+ unsigned scratch_offset;
|
|
+ };
|
|
+};
|
|
+
|
|
+struct call_info {
|
|
+ nir_call_instr* nir_instr;
|
|
+ Instruction* aco_instr;
|
|
+ std::vector<parameter_info> return_info;
|
|
+ unsigned scratch_param_size;
|
|
+};
|
|
+
|
|
+struct callee_info {
|
|
+ std::vector<parameter_info> param_infos;
|
|
+ parameter_info return_address;
|
|
+ parameter_info stack_ptr;
|
|
+ unsigned reg_param_count = 0;
|
|
+ unsigned reg_return_param_count = 0;
|
|
+ unsigned scratch_param_size = 0;
|
|
+};
|
|
+
|
|
struct exec_info {
|
|
/* Set to false when loop_nest_depth==0 && parent_if.is_divergent==false */
|
|
bool potentially_empty_discard = false;
|
|
@@ -111,6 +136,13 @@ struct isel_context {
|
|
uint32_t wqm_instruction_idx;
|
|
|
|
BITSET_DECLARE(output_args, AC_MAX_ARGS);
|
|
+
|
|
+ /* Function information */
|
|
+ ABI callee_abi;
|
|
+ struct callee_info callee_info;
|
|
+ std::vector<call_info> call_infos;
|
|
+ Temp next_divergent_pc;
|
|
+ Temp next_pc;
|
|
};
|
|
|
|
inline Temp
|
|
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
|
|
index 28708503c6b38..f1cd92aad5fd2 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
|
|
@@ -393,6 +393,8 @@ init_context(isel_context* ctx, nir_shader* shader)
|
|
ctx->program->allocateRange(impl->ssa_alloc);
|
|
RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id;
|
|
|
|
+ unsigned call_count = 0;
|
|
+
|
|
/* TODO: make this recursive to improve compile times */
|
|
bool done = false;
|
|
while (!done) {
|
|
@@ -699,12 +701,18 @@ init_context(isel_context* ctx, nir_shader* shader)
|
|
regclasses[phi->def.index] = rc;
|
|
break;
|
|
}
|
|
+ case nir_instr_type_call: {
|
|
+ ++call_count;
|
|
+ break;
|
|
+ }
|
|
default: break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ ctx->call_infos.reserve(call_count);
|
|
+
|
|
ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena;
|
|
ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr;
|
|
|
|
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
|
|
index ef2a6a0255664..920174ac50798 100644
|
|
--- a/src/amd/compiler/aco_ir.h
|
|
+++ b/src/amd/compiler/aco_ir.h
|
|
@@ -2385,6 +2385,10 @@ public:
|
|
/* For shader part with previous shader part that has lds access. */
|
|
bool pending_lds_access = false;
|
|
|
|
+ ABI callee_abi = {};
|
|
+ unsigned short arg_sgpr_count;
|
|
+ unsigned short arg_vgpr_count;
|
|
+
|
|
struct {
|
|
monotonic_buffer_resource memory;
|
|
/* live-in temps per block */
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 112032179e9758b2c24ab0184b3dd73ff34d7266 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Sun, 21 Apr 2024 17:52:58 +0200
|
|
Subject: [PATCH 38/71] aco/isel: Use stack pointer parameter in
|
|
load/store_scratch
|
|
|
|
---
|
|
.../compiler/aco_instruction_selection.cpp | 32 +++++++++++++++++--
|
|
1 file changed, 29 insertions(+), 3 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index 0875d1c7a20f4..f985685b1d524 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -7751,11 +7751,28 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
if (ctx->program->gfx_level >= GFX9) {
|
|
if (nir_src_is_const(instr->src[0])) {
|
|
uint32_t max = ctx->program->dev.scratch_global_offset_max + 1;
|
|
- info.offset =
|
|
- bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
|
|
+ if (ctx->callee_info.stack_ptr.is_reg)
|
|
+ info.offset =
|
|
+ bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
|
|
+ Operand(ctx->callee_info.stack_ptr.def.getTemp()),
|
|
+ Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
|
|
+ else
|
|
+ info.offset = bld.copy(
|
|
+ bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max)));
|
|
info.const_offset = nir_src_as_uint(instr->src[0]) % max;
|
|
} else {
|
|
- info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
|
|
+ if (ctx->callee_info.stack_ptr.is_reg) {
|
|
+ Temp store_offset = get_ssa_temp(ctx, instr->src[0].ssa);
|
|
+ if (store_offset.type() == RegType::sgpr)
|
|
+ info.offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
|
|
+ Operand(ctx->callee_info.stack_ptr.def.getTemp()),
|
|
+ Operand(store_offset));
|
|
+ else
|
|
+ info.offset = bld.vop2(aco_opcode::v_add_u32, bld.def(v1),
|
|
+ Operand(ctx->callee_info.stack_ptr.def.getTemp()),
|
|
+ Operand(store_offset));
|
|
+ } else
|
|
+ info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa));
|
|
}
|
|
EmitLoadParameters params = scratch_flat_load_params;
|
|
params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
|
|
@@ -7775,6 +7792,15 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
|
|
Temp offset = get_ssa_temp(ctx, instr->src[1].ssa);
|
|
|
|
+ if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9) {
|
|
+ if (offset.type() == RegType::sgpr)
|
|
+ offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
|
|
+ Operand(ctx->callee_info.stack_ptr.def.getTemp()), Operand(offset));
|
|
+ else
|
|
+ offset = bld.vop2(aco_opcode::v_add_u32, bld.def(v1),
|
|
+ Operand(ctx->callee_info.stack_ptr.def.getTemp()), Operand(offset));
|
|
+ }
|
|
+
|
|
unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8;
|
|
unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes);
|
|
|
|
--
|
|
GitLab
|
|
|
|
|
|
From b8e49a1b7325c6b46fa2bd27732047b213ef5bda Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Sat, 9 Mar 2024 11:15:43 +0100
|
|
Subject: [PATCH 39/71] nir,aco: Add set_next_call_pc_amd intrinsic
|
|
|
|
Used for lowering function calls
|
|
---
|
|
src/amd/compiler/aco_instruction_selection.cpp | 5 +++++
|
|
src/compiler/nir/nir_intrinsics.py | 2 ++
|
|
2 files changed, 7 insertions(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index f985685b1d524..d83801d8e35cc 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -9640,6 +9640,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)),
|
|
Operand::c32(nir_intrinsic_base(instr)));
|
|
break;
|
|
+ case nir_intrinsic_set_next_call_pc_amd: {
|
|
+ ctx->next_divergent_pc = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
|
|
+ ctx->next_pc = get_ssa_temp(ctx, instr->src[1].ssa);
|
|
+ break;
|
|
+ }
|
|
default:
|
|
isel_err(&instr->instr, "Unimplemented intrinsic instr");
|
|
abort();
|
|
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
|
|
index 2a6de0c4b6f25..1e3efcf06446d 100644
|
|
--- a/src/compiler/nir/nir_intrinsics.py
|
|
+++ b/src/compiler/nir/nir_intrinsics.py
|
|
@@ -2374,3 +2374,5 @@ intrinsic("enqueue_node_payloads", src_comp=[-1])
|
|
|
|
# Returns true if it has been called for every payload.
|
|
intrinsic("finalize_incoming_node_payload", src_comp=[-1], dest_comp=1)
|
|
+
|
|
+intrinsic("set_next_call_pc_amd", src_comp=[1, 1], bit_sizes=[64])
|
|
--
|
|
GitLab
|
|
|
|
|
|
From c8aec7b77ef0fd5e1bb36cbf06929fd75523b8ca Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 26 Feb 2024 12:20:26 +0100
Subject: [PATCH 40/71] nir,aco: add call_return_address sysval

---
src/amd/compiler/aco_instruction_selection.cpp | 5 +++++
src/compiler/nir/nir_divergence_analysis.c | 1 +
src/compiler/nir/nir_intrinsics.py | 1 +
3 files changed, 7 insertions(+)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index d83801d8e35cc..d0d0dc1b036df 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -9640,6 +9640,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)),
|
|
Operand::c32(nir_intrinsic_base(instr)));
|
|
break;
|
|
+ case nir_intrinsic_load_call_return_address_amd: {
|
|
+ bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
|
|
+ Operand(ctx->callee_info.return_address.def.getTemp()));
|
|
+ break;
|
|
+ }
|
|
case nir_intrinsic_set_next_call_pc_amd: {
|
|
ctx->next_divergent_pc = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa));
|
|
ctx->next_pc = get_ssa_temp(ctx, instr->src[1].ssa);
|
|
diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
|
|
index 78943c897922f..2fc4eda71aeb0 100644
|
|
--- a/src/compiler/nir/nir_divergence_analysis.c
|
|
+++ b/src/compiler/nir/nir_divergence_analysis.c
|
|
@@ -344,6 +344,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|
case nir_intrinsic_load_samples_log2_agx:
|
|
case nir_intrinsic_load_active_subgroup_count_agx:
|
|
case nir_intrinsic_load_constant_base_ptr:
|
|
+ case nir_intrinsic_load_call_return_address_amd:
|
|
is_divergent = false;
|
|
break;
|
|
|
|
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
|
|
index 1e3efcf06446d..808ee31420ba0 100644
|
|
--- a/src/compiler/nir/nir_intrinsics.py
|
|
+++ b/src/compiler/nir/nir_intrinsics.py
|
|
@@ -2375,4 +2375,5 @@ intrinsic("enqueue_node_payloads", src_comp=[-1])
|
|
# Returns true if it has been called for every payload.
|
|
intrinsic("finalize_incoming_node_payload", src_comp=[-1], dest_comp=1)
|
|
|
|
+system_value("call_return_address_amd", 1, bit_sizes=[64])
|
|
intrinsic("set_next_call_pc_amd", src_comp=[1, 1], bit_sizes=[64])
|
|
--
|
|
GitLab
|
|
|
|
|
|
From a61f79118bc11db5dbbc1ef19c521c834936a637 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Sun, 7 Jan 2024 22:15:13 +0100
|
|
Subject: [PATCH 41/71] radv/nir: Lower NIR function call ABI
|
|
|
|
---
|
|
src/amd/vulkan/meson.build | 1 +
|
|
src/amd/vulkan/nir/radv_nir.h | 4 +
|
|
src/amd/vulkan/nir/radv_nir_lower_call_abi.c | 433 +++++++++++++++++++
|
|
src/amd/vulkan/radv_pipeline.c | 4 +
|
|
src/compiler/nir/nir_divergence_analysis.c | 1 +
|
|
src/compiler/nir/nir_intrinsics.py | 3 +
|
|
6 files changed, 446 insertions(+)
|
|
create mode 100644 src/amd/vulkan/nir/radv_nir_lower_call_abi.c
|
|
|
|
diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build
|
|
index 5976bef8b85cf..84414ad41b7c0 100644
|
|
--- a/src/amd/vulkan/meson.build
|
|
+++ b/src/amd/vulkan/meson.build
|
|
@@ -66,6 +66,7 @@ libradv_files = files(
|
|
'nir/radv_nir_apply_pipeline_layout.c',
|
|
'nir/radv_nir_export_multiview.c',
|
|
'nir/radv_nir_lower_abi.c',
|
|
+ 'nir/radv_nir_lower_call_abi.c',
|
|
'nir/radv_nir_lower_cooperative_matrix.c',
|
|
'nir/radv_nir_lower_fs_barycentric.c',
|
|
'nir/radv_nir_lower_fs_intrinsics.c',
|
|
diff --git a/src/amd/vulkan/nir/radv_nir.h b/src/amd/vulkan/nir/radv_nir.h
|
|
index cd779d64e857c..e004de467ed3e 100644
|
|
--- a/src/amd/vulkan/nir/radv_nir.h
|
|
+++ b/src/amd/vulkan/nir/radv_nir.h
|
|
@@ -90,6 +90,10 @@ typedef struct radv_nir_opt_tid_function_options {
|
|
|
|
bool radv_nir_opt_tid_function(nir_shader *shader, const radv_nir_opt_tid_function_options *options);
|
|
|
|
+void radv_nir_lower_callee_signature(nir_function *function, struct set *visited_funcs);
|
|
+
|
|
+bool radv_nir_lower_call_abi(nir_shader *shader, unsigned wave_size);
|
|
+
|
|
#ifdef __cplusplus
|
|
}
|
|
#endif
|
|
diff --git a/src/amd/vulkan/nir/radv_nir_lower_call_abi.c b/src/amd/vulkan/nir/radv_nir_lower_call_abi.c
|
|
new file mode 100644
|
|
index 0000000000000..5f18f9aea0f28
|
|
--- /dev/null
|
|
+++ b/src/amd/vulkan/nir/radv_nir_lower_call_abi.c
|
|
@@ -0,0 +1,433 @@
|
|
+/*
|
|
+ * Copyright © 2023 Valve Corporation
|
|
+ *
|
|
+ * Permission is hereby granted, free of charge, to any person obtaining a
|
|
+ * copy of this software and associated documentation files (the "Software"),
|
|
+ * to deal in the Software without restriction, including without limitation
|
|
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
+ * and/or sell copies of the Software, and to permit persons to whom the
|
|
+ * Software is furnished to do so, subject to the following conditions:
|
|
+ *
|
|
+ * The above copyright notice and this permission notice (including the next
|
|
+ * paragraph) shall be included in all copies or substantial portions of the
|
|
+ * Software.
|
|
+ *
|
|
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
+ * IN THE SOFTWARE.
|
|
+ */
|
|
+
|
|
+#include "aco_nir_call_attribs.h"
|
|
+#include "nir_builder.h"
|
|
+#include "radv_nir.h"
|
|
+
|
|
+void
|
|
+radv_nir_lower_callee_signature(nir_function *function, struct set *visited_funcs)
|
|
+{
|
|
+ if (visited_funcs) {
|
|
+ if (_mesa_set_search(visited_funcs, function))
|
|
+ return;
|
|
+ _mesa_set_add(visited_funcs, function);
|
|
+ }
|
|
+
|
|
+ nir_parameter *old_params = function->params;
|
|
+ unsigned old_num_params = function->num_params;
|
|
+
|
|
+ function->num_params += 2;
|
|
+ function->params = rzalloc_array_size(function->shader, function->num_params, sizeof(nir_parameter));
|
|
+
|
|
+ memcpy(function->params + 2, old_params, old_num_params * sizeof(nir_parameter));
|
|
+
|
|
+ function->params[0].num_components = 1;
|
|
+ function->params[0].bit_size = 64;
|
|
+ function->params[1].num_components = 1;
|
|
+ function->params[1].bit_size = 64;
|
|
+ function->params[1].is_uniform = true;
|
|
+
|
|
+ nir_function_impl *impl = function->impl;
|
|
+
|
|
+ if (!impl)
|
|
+ return;
|
|
+
|
|
+ nir_foreach_block (block, impl) {
|
|
+ nir_foreach_instr_safe (instr, block) {
|
|
+ if (instr->type != nir_instr_type_intrinsic)
|
|
+ continue;
|
|
+
|
|
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
|
+
|
|
+ if (intr->intrinsic == nir_intrinsic_load_param)
|
|
+ nir_intrinsic_set_param_idx(intr, nir_intrinsic_param_idx(intr) + 2);
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+/* Checks if caller can call callee using tail calls.
|
|
+ *
|
|
+ * If the ABIs mismatch, we might need to insert move instructions to move return values from callee return registers to
|
|
+ * caller return registers after the call. In that case, tail-calls are impossible to do correctly.
|
|
+ */
|
|
+static bool
|
|
+is_tail_call_compatible(nir_function *caller, nir_function *callee)
|
|
+{
|
|
+ /* If the caller doesn't return at all, we don't need to care if return params are compatible */
|
|
+ if (caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_NORETURN)
|
|
+ return true;
|
|
+ /* The same ABI can't mismatch */
|
|
+ if ((caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) ==
|
|
+ (callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK))
|
|
+ return true;
|
|
+ /* The recursive shader ABI and the traversal shader ABI are built so that return parameters occupy exactly
|
|
+ * the same registers, to allow tail calls from the traversal shader. */
|
|
+ if ((caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == ACO_NIR_CALL_ABI_TRAVERSAL &&
|
|
+ (callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == ACO_NIR_CALL_ABI_RT_RECURSIVE)
|
|
+ return true;
|
|
+ return false;
|
|
+}
|
|
+
|
|
+static void
|
|
+gather_tail_call_instrs_block(nir_function *caller, const struct nir_block *block, struct set *tail_calls)
|
|
+{
|
|
+ nir_foreach_instr_reverse (instr, block) {
|
|
+ switch (instr->type) {
|
|
+ case nir_instr_type_phi:
|
|
+ case nir_instr_type_undef:
|
|
+ case nir_instr_type_load_const:
|
|
+ continue;
|
|
+ case nir_instr_type_alu:
|
|
+ if (!nir_op_is_vec_or_mov(nir_instr_as_alu(instr)->op))
|
|
+ return;
|
|
+ continue;
|
|
+ case nir_instr_type_call: {
|
|
+ nir_call_instr *call = nir_instr_as_call(instr);
|
|
+
|
|
+ if (!is_tail_call_compatible(caller, call->callee))
|
|
+ return;
|
|
+
|
|
+ for (unsigned i = 0; i < call->num_params; ++i) {
|
|
+ if (call->callee->params[i].is_return != caller->params[i].is_return)
|
|
+ return;
|
|
+ /* We can only do tail calls if the caller returns exactly the callee return values */
|
|
+ if (caller->params[i].is_return) {
|
|
+ assert(call->params[i].ssa->parent_instr->type == nir_instr_type_deref);
|
|
+ nir_deref_instr *deref_root = nir_instr_as_deref(call->params[i].ssa->parent_instr);
|
|
+ while (nir_deref_instr_parent(deref_root))
|
|
+ deref_root = nir_deref_instr_parent(deref_root);
|
|
+
|
|
+ if (!deref_root->parent.ssa)
|
|
+ return;
|
|
+ if (deref_root->parent.ssa->parent_instr->type != nir_instr_type_intrinsic)
|
|
+ return;
|
|
+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(deref_root->parent.ssa->parent_instr);
|
|
+ if (intrin->intrinsic != nir_intrinsic_load_param)
|
|
+ return;
|
|
+ if (nir_intrinsic_param_idx(intrin) != i)
|
|
+ return;
|
|
+ }
|
|
+ if (call->callee->params[i].is_uniform != caller->params[i].is_uniform)
|
|
+ return;
|
|
+ if (call->callee->params[i].bit_size != caller->params[i].bit_size)
|
|
+ return;
|
|
+ if (call->callee->params[i].num_components != caller->params[i].num_components)
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ _mesa_set_add(tail_calls, instr);
|
|
+ continue;
|
|
+ }
|
|
+ default:
|
|
+ return;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ set_foreach (block->predecessors, pred) {
|
|
+ gather_tail_call_instrs_block(caller, pred->key, tail_calls);
|
|
+ }
|
|
+}
|
|
+
|
|
+struct lower_param_info {
|
|
+ /* */
|
|
+ nir_def *load_param_def;
|
|
+
|
|
+ nir_def *return_deref;
|
|
+ bool has_store;
|
|
+};
|
|
+
|
|
+static void
|
|
+check_param_uses_for_stores(nir_deref_instr *instr, struct lower_param_info *info)
|
|
+{
|
|
+ nir_foreach_use (deref_use, &instr->def) {
|
|
+ nir_instr *use_instr = nir_src_parent_instr(deref_use);
|
|
+ if (use_instr->type == nir_instr_type_deref)
|
|
+ check_param_uses_for_stores(nir_instr_as_deref(use_instr), info);
|
|
+ else if ((use_instr->type == nir_instr_type_intrinsic &&
|
|
+ nir_instr_as_intrinsic(use_instr)->intrinsic == nir_intrinsic_store_deref) ||
|
|
+ use_instr->type == nir_instr_type_call)
|
|
+ info->has_store = true;
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+rewrite_return_param_uses(nir_intrinsic_instr *intr, unsigned param_idx, struct lower_param_info *param_defs)
|
|
+{
|
|
+ nir_foreach_use_safe (use, &intr->def) {
|
|
+ nir_instr *use_instr = nir_src_parent_instr(use);
|
|
+ assert(use_instr && use_instr->type == nir_instr_type_deref &&
|
|
+ nir_instr_as_deref(use_instr)->deref_type == nir_deref_type_cast);
|
|
+ check_param_uses_for_stores(nir_instr_as_deref(use_instr), ¶m_defs[param_idx]);
|
|
+ nir_def_rewrite_uses(&nir_instr_as_deref(use_instr)->def, param_defs[param_idx].return_deref);
|
|
+
|
|
+ nir_instr_remove(use_instr);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+lower_call_abi_for_callee(nir_function *function, unsigned wave_size, struct set *visited_funcs)
|
|
+{
|
|
+ nir_function_impl *impl = function->impl;
|
|
+
|
|
+ nir_builder b = nir_builder_create(impl);
|
|
+ b.cursor = nir_before_impl(impl);
|
|
+
|
|
+ nir_variable *tail_call_pc =
|
|
+ nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint64_t_type(), "_tail_call_pc");
|
|
+ nir_store_var(&b, tail_call_pc, nir_imm_int64(&b, 0), 0x1);
|
|
+
|
|
+ struct set *tail_call_instrs = _mesa_set_create(b.shader, _mesa_hash_pointer, _mesa_key_pointer_equal);
|
|
+ gather_tail_call_instrs_block(function, nir_impl_last_block(impl), tail_call_instrs);
|
|
+
|
|
+ radv_nir_lower_callee_signature(function, visited_funcs);
|
|
+
|
|
+ /* guard the shader, so that only the correct invocations execute it */
|
|
+
|
|
+ nir_def *guard_condition = NULL;
|
|
+ nir_def *shader_addr;
|
|
+ nir_def *uniform_shader_addr;
|
|
+ if (function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL) {
|
|
+ nir_cf_list list;
|
|
+ nir_cf_extract(&list, nir_before_impl(impl), nir_after_impl(impl));
|
|
+
|
|
+ b.cursor = nir_before_impl(impl);
|
|
+
|
|
+ shader_addr = nir_load_param(&b, 0);
|
|
+ uniform_shader_addr = nir_load_param(&b, 1);
|
|
+
|
|
+ guard_condition = nir_ieq(&b, uniform_shader_addr, shader_addr);
|
|
+ nir_if *shader_guard = nir_push_if(&b, guard_condition);
|
|
+ shader_guard->control = nir_selection_control_divergent_always_taken;
|
|
+ nir_cf_reinsert(&list, b.cursor);
|
|
+ nir_pop_if(&b, shader_guard);
|
|
+ } else {
|
|
+ shader_addr = nir_load_param(&b, 0);
|
|
+ }
|
|
+
|
|
+ b.cursor = nir_before_impl(impl);
|
|
+ struct lower_param_info *param_infos = ralloc_size(b.shader, function->num_params * sizeof(struct lower_param_info));
|
|
+ nir_variable **param_vars = ralloc_size(b.shader, function->num_params * sizeof(nir_variable *));
|
|
+
|
|
+ for (unsigned i = 2; i < function->num_params; ++i) {
|
|
+ param_vars[i] = nir_local_variable_create(impl, function->params[i].type, "_param");
|
|
+ unsigned num_components = glsl_get_vector_elements(function->params[i].type);
|
|
+
|
|
+ if (function->params[i].is_return) {
|
|
+ assert(!glsl_type_is_array(function->params[i].type) && !glsl_type_is_struct(function->params[i].type));
|
|
+
|
|
+ function->params[i].bit_size = glsl_get_bit_size(function->params[i].type);
|
|
+ function->params[i].num_components = num_components;
|
|
+
|
|
+ param_infos[i].return_deref = &nir_build_deref_var(&b, param_vars[i])->def;
|
|
+ } else {
|
|
+ param_infos[i].return_deref = NULL;
|
|
+ }
|
|
+
|
|
+ param_infos[i].has_store = false;
|
|
+ param_infos[i].load_param_def = nir_load_param(&b, i);
|
|
+ nir_store_var(&b, param_vars[i], param_infos[i].load_param_def, (0x1 << num_components) - 1);
|
|
+ }
|
|
+
|
|
+ unsigned max_tail_call_param = 0;
|
|
+
|
|
+ nir_foreach_block (block, impl) {
|
|
+ bool progress;
|
|
+ do {
|
|
+ progress = false;
|
|
+ nir_foreach_instr_safe (instr, block) {
|
|
+ if (instr->type == nir_instr_type_call && _mesa_set_search(tail_call_instrs, instr)) {
|
|
+ nir_call_instr *call = nir_instr_as_call(instr);
|
|
+ b.cursor = nir_before_instr(instr);
|
|
+
|
|
+ for (unsigned i = 0; i < call->num_params; ++i) {
|
|
+ if (call->callee->params[i].is_return)
|
|
+ nir_store_var(&b, param_vars[i + 2],
|
|
+ nir_load_deref(&b, nir_instr_as_deref(call->params[i].ssa->parent_instr)),
|
|
+ (0x1 << glsl_get_vector_elements(call->callee->params[i].type)) - 1);
|
|
+ else
|
|
+ nir_store_var(&b, param_vars[i + 2], call->params[i].ssa,
|
|
+ (0x1 << call->params[i].ssa->num_components) - 1);
|
|
+ param_infos[i + 2].has_store = true;
|
|
+ }
|
|
+
|
|
+ nir_store_var(&b, tail_call_pc, call->indirect_callee.ssa, 0x1);
|
|
+ max_tail_call_param = MAX2(max_tail_call_param, call->num_params + 2);
|
|
+
|
|
+ nir_instr_remove(instr);
|
|
+
|
|
+ progress = true;
|
|
+ break;
|
|
+ }
|
|
+
|
|
+ if (instr->type != nir_instr_type_intrinsic)
|
|
+ continue;
|
|
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
|
|
+ if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_param) {
|
|
+ unsigned param_idx = nir_intrinsic_param_idx(intr);
|
|
+
|
|
+ if (param_idx >= 2 && &intr->def != param_infos[param_idx].load_param_def) {
|
|
+ if (function->params[param_idx].is_return)
|
|
+ rewrite_return_param_uses(intr, param_idx, param_infos);
|
|
+ else
|
|
+ nir_def_rewrite_uses(&intr->def, param_infos[param_idx].load_param_def);
|
|
+ nir_instr_remove(instr);
|
|
+ progress = true;
|
|
+ break;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ } while (progress);
|
|
+ }
|
|
+
|
|
+ b.cursor = nir_after_impl(impl);
|
|
+
|
|
+ for (unsigned i = 2; i < function->num_params; ++i) {
|
|
+ if (param_infos[i].has_store)
|
|
+ nir_store_param_amd(&b, nir_load_var(&b, param_vars[i]), .param_idx = i);
|
|
+ }
|
|
+
|
|
+ if (guard_condition)
|
|
+ shader_addr = nir_bcsel(&b, guard_condition, nir_load_var(&b, tail_call_pc), shader_addr);
|
|
+ else
|
|
+ shader_addr = nir_load_var(&b, tail_call_pc);
|
|
+ nir_def *ballot = nir_ballot(&b, 1, wave_size, nir_ine_imm(&b, shader_addr, 0));
|
|
+ nir_def *ballot_addr = nir_read_invocation(&b, shader_addr, nir_find_lsb(&b, ballot));
|
|
+ uniform_shader_addr = nir_bcsel(&b, nir_ieq_imm(&b, ballot, 0), nir_load_call_return_address_amd(&b), ballot_addr);
|
|
+
|
|
+ if (!(function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_NORETURN)) {
|
|
+ nir_push_if(&b, nir_ieq_imm(&b, uniform_shader_addr, 0));
|
|
+ nir_terminate(&b);
|
|
+ nir_pop_if(&b, NULL);
|
|
+
|
|
+ nir_set_next_call_pc_amd(&b, shader_addr, uniform_shader_addr);
|
|
+ }
|
|
+}
|
|
+
|
|
+static void
|
|
+lower_call_abi_for_call(nir_builder *b, nir_call_instr *call, unsigned *cur_call_idx, struct set *visited_funcs,
|
|
+ struct set *visited_calls)
|
|
+{
|
|
+ unsigned call_idx = (*cur_call_idx)++;
|
|
+
|
|
+ for (unsigned i = 0; i < call->num_params; ++i) {
|
|
+ unsigned callee_param_idx = i;
|
|
+ if (_mesa_set_search(visited_funcs, call->callee))
|
|
+ callee_param_idx += 2;
|
|
+
|
|
+ if (!call->callee->params[callee_param_idx].is_return)
|
|
+ continue;
|
|
+
|
|
+ b->cursor = nir_before_instr(&call->instr);
|
|
+
|
|
+ nir_src *old_src = &call->params[i];
|
|
+
|
|
+ assert(old_src->ssa->parent_instr->type == nir_instr_type_deref);
|
|
+ nir_deref_instr *param_deref = nir_instr_as_deref(old_src->ssa->parent_instr);
|
|
+ assert(param_deref->deref_type == nir_deref_type_var);
|
|
+
|
|
+ nir_src_rewrite(old_src, nir_load_deref(b, param_deref));
|
|
+
|
|
+ b->cursor = nir_after_instr(&call->instr);
|
|
+
|
|
+ unsigned num_components = glsl_get_vector_elements(param_deref->type);
|
|
+
|
|
+ nir_store_deref(
|
|
+ b, param_deref,
|
|
+ nir_load_return_param_amd(b, num_components, glsl_base_type_get_bit_size(param_deref->type->base_type),
|
|
+ .call_idx = call_idx, .param_idx = i + 2),
|
|
+ (1u << num_components) - 1);
|
|
+
|
|
+ assert(call->callee->params[callee_param_idx].bit_size == glsl_get_bit_size(param_deref->type));
|
|
+ assert(call->callee->params[callee_param_idx].num_components == num_components);
|
|
+ }
|
|
+
|
|
+ radv_nir_lower_callee_signature(call->callee, visited_funcs);
|
|
+
|
|
+ b->cursor = nir_after_instr(&call->instr);
|
|
+
|
|
+ nir_call_instr *new_call = nir_call_instr_create(b->shader, call->callee);
|
|
+ new_call->indirect_callee = nir_src_for_ssa(call->indirect_callee.ssa);
|
|
+ new_call->params[0] = nir_src_for_ssa(call->indirect_callee.ssa);
|
|
+ new_call->params[1] = nir_src_for_ssa(nir_read_first_invocation(b, call->indirect_callee.ssa));
|
|
+ for (unsigned i = 2; i < new_call->num_params; ++i)
|
|
+ new_call->params[i] = nir_src_for_ssa(call->params[i - 2].ssa);
|
|
+
|
|
+ nir_builder_instr_insert(b, &new_call->instr);
|
|
+ b->cursor = nir_after_instr(&new_call->instr);
|
|
+ _mesa_set_add(visited_calls, new_call);
|
|
+
|
|
+ nir_instr_remove(&call->instr);
|
|
+}
|
|
+
|
|
+static bool
|
|
+lower_call_abi_for_caller(nir_function_impl *impl, struct set *visited_funcs)
|
|
+{
|
|
+ bool progress = false;
|
|
+ unsigned cur_call_idx = 0;
|
|
+ struct set *visited_calls = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
|
|
+
|
|
+ nir_foreach_block (block, impl) {
|
|
+ nir_foreach_instr_safe (instr, block) {
|
|
+ if (instr->type != nir_instr_type_call)
|
|
+ continue;
|
|
+ nir_call_instr *call = nir_instr_as_call(instr);
|
|
+ if (call->callee->impl)
|
|
+ continue;
|
|
+ if (_mesa_set_search(visited_calls, call))
|
|
+ continue;
|
|
+
|
|
+ nir_builder b = nir_builder_create(impl);
|
|
+ lower_call_abi_for_call(&b, call, &cur_call_idx, visited_funcs, visited_calls);
|
|
+ progress = true;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ _mesa_set_destroy(visited_calls, NULL);
|
|
+
|
|
+ return progress;
|
|
+}
|
|
+
|
|
+bool
|
|
+radv_nir_lower_call_abi(nir_shader *shader, unsigned wave_size)
|
|
+{
|
|
+ struct set *visited_funcs = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
|
|
+
|
|
+ bool progress = false;
|
|
+ nir_foreach_function_with_impl (function, impl, shader) {
|
|
+ bool func_progress = false;
|
|
+ if (function->is_exported) {
|
|
+ lower_call_abi_for_callee(function, wave_size, visited_funcs);
|
|
+ func_progress = true;
|
|
+ }
|
|
+ func_progress |= lower_call_abi_for_caller(impl, visited_funcs);
|
|
+
|
|
+ if (func_progress)
|
|
+ nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance);
|
|
+ progress |= func_progress;
|
|
+ }
|
|
+
|
|
+ _mesa_set_destroy(visited_funcs, NULL);
|
|
+
|
|
+ return progress;
|
|
+}
|
|
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
|
|
index daaf4e9ba4f00..fc7195b5067ca 100644
|
|
--- a/src/amd/vulkan/radv_pipeline.c
|
|
+++ b/src/amd/vulkan/radv_pipeline.c
|
|
@@ -575,6 +575,10 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat
|
|
stage->nir, io_to_mem || lowered_ngg || stage->stage == MESA_SHADER_COMPUTE || stage->stage == MESA_SHADER_TASK,
|
|
gfx_level >= GFX8);
|
|
|
|
+ NIR_PASS(_, stage->nir, radv_nir_lower_call_abi, stage->info.wave_size);
|
|
+ NIR_PASS(_, stage->nir, nir_lower_global_vars_to_local);
|
|
+ NIR_PASS(_, stage->nir, nir_lower_vars_to_ssa);
|
|
+
|
|
NIR_PASS(_, stage->nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64);
|
|
|
|
if (stage->nir->info.bit_sizes_int & (8 | 16)) {
|
|
diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
|
|
index 2fc4eda71aeb0..1f780f0621cac 100644
|
|
--- a/src/compiler/nir/nir_divergence_analysis.c
|
|
+++ b/src/compiler/nir/nir_divergence_analysis.c
|
|
@@ -892,6 +892,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
|
|
case nir_intrinsic_load_sample_mask:
|
|
case nir_intrinsic_quad_ballot_agx:
|
|
case nir_intrinsic_load_agx:
|
|
+ case nir_intrinsic_load_return_param_amd:
|
|
is_divergent = true;
|
|
break;
|
|
|
|
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
|
|
index 808ee31420ba0..32ab9b8a6acb8 100644
|
|
--- a/src/compiler/nir/nir_intrinsics.py
|
|
+++ b/src/compiler/nir/nir_intrinsics.py
|
|
@@ -2375,5 +2375,8 @@ intrinsic("enqueue_node_payloads", src_comp=[-1])
|
|
# Returns true if it has been called for every payload.
|
|
intrinsic("finalize_incoming_node_payload", src_comp=[-1], dest_comp=1)
|
|
|
|
+intrinsic("store_param_amd", src_comp=[-1], indices=[PARAM_IDX])
|
|
+intrinsic("load_return_param_amd", dest_comp=0, indices=[CALL_IDX, PARAM_IDX])
|
|
+
|
|
system_value("call_return_address_amd", 1, bit_sizes=[64])
|
|
intrinsic("set_next_call_pc_amd", src_comp=[1, 1], bit_sizes=[64])
|
|
--
|
|
GitLab
From fbe63f63878376a556e9eab7999edab5f332f257 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Sun, 7 Jan 2024 22:42:03 +0100
|
|
Subject: [PATCH 42/71] aco: Compile all functions in RT shaders
---
|
|
.../compiler/aco_instruction_selection.cpp | 43 +-
|
|
.../aco_instruction_selection_setup.cpp | 639 +++++++++---------
|
|
2 files changed, 345 insertions(+), 337 deletions(-)
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index d0d0dc1b036df..95baf3a302d0c 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -11891,30 +11891,35 @@ void
|
|
select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders,
|
|
const struct ac_shader_args* args)
|
|
{
|
|
+ bool first_block = true;
|
|
for (unsigned i = 0; i < shader_count; i++) {
|
|
- if (i) {
|
|
- ctx.block = ctx.program->create_and_insert_block();
|
|
- ctx.block->kind = block_kind_top_level | block_kind_resume;
|
|
- }
|
|
+ nir_foreach_function_impl (impl, shaders[i]) {
|
|
+ if (!first_block) {
|
|
+ ctx.block = ctx.program->create_and_insert_block();
|
|
+ ctx.block->kind = block_kind_top_level | block_kind_resume;
|
|
+ }
|
|
+ nir_shader* nir = shaders[i];
|
|
|
|
- nir_shader* nir = shaders[i];
|
|
- init_context(&ctx, nir);
|
|
- setup_fp_mode(&ctx, nir);
|
|
+ init_context(&ctx, nir);
|
|
+ setup_fp_mode(&ctx, nir);
|
|
|
|
- Instruction* startpgm = add_startpgm(&ctx);
|
|
- append_logical_start(ctx.block);
|
|
- split_arguments(&ctx, startpgm);
|
|
- visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body);
|
|
- append_logical_end(ctx.block);
|
|
- ctx.block->kind |= block_kind_uniform;
|
|
+ Instruction* startpgm = add_startpgm(&ctx);
|
|
+ append_logical_start(ctx.block);
|
|
+ split_arguments(&ctx, startpgm);
|
|
+ visit_cf_list(&ctx, &impl->body);
|
|
+ append_logical_end(ctx.block);
|
|
+ ctx.block->kind |= block_kind_uniform;
|
|
|
|
- /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen
|
|
- * shader without shader calls.
|
|
- */
|
|
- if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN)
|
|
- insert_rt_jump_next(ctx, args);
|
|
+ /* Fix output registers and jump to next shader. We can skip this when dealing with a
|
|
+ * raygen shader without shader calls.
|
|
+ */
|
|
+ if ((shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN) &&
|
|
+ impl == nir_shader_get_entrypoint(nir))
|
|
+ insert_rt_jump_next(ctx, args);
|
|
|
|
- cleanup_context(&ctx);
|
|
+ cleanup_context(&ctx);
|
|
+ first_block = false;
|
|
+ }
|
|
}
|
|
|
|
ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val;
|
|
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
|
|
index f1cd92aad5fd2..600c63c8b9ce3 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
|
|
@@ -257,8 +257,8 @@ setup_nir(isel_context* ctx, nir_shader* nir)
|
|
nir_opt_dce(nir);
|
|
}
|
|
|
|
- nir_function_impl* func = nir_shader_get_entrypoint(nir);
|
|
- nir_index_ssa_defs(func);
|
|
+ nir_foreach_function_impl (impl, nir)
|
|
+ nir_index_ssa_defs(impl);
|
|
}
|
|
|
|
/* Returns true if we can skip uniformization of a merge phi. This makes the destination divergent,
|
|
@@ -349,7 +349,6 @@ skip_uniformize_merge_phi(nir_def* ssa, unsigned depth)
|
|
void
|
|
init_context(isel_context* ctx, nir_shader* shader)
|
|
{
|
|
- nir_function_impl* impl = nir_shader_get_entrypoint(shader);
|
|
ctx->shader = shader;
|
|
|
|
/* Init NIR range analysis. */
|
|
@@ -366,356 +365,359 @@ init_context(isel_context* ctx, nir_shader* shader)
|
|
|
|
ac_nir_opt_shared_append(shader);
|
|
|
|
- uint32_t options =
|
|
- shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs;
|
|
- nir_divergence_analysis_impl(impl, (nir_divergence_options)options);
|
|
- shader->info.divergence_analysis_run = true;
|
|
- if (nir_opt_uniform_atomics(shader, false)) {
|
|
- nir_lower_int64(shader);
|
|
+ nir_foreach_function_impl (impl, shader) {
|
|
+ uint32_t options =
|
|
+ shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs;
|
|
nir_divergence_analysis_impl(impl, (nir_divergence_options)options);
|
|
- }
|
|
+ shader->info.divergence_analysis_run = true;
|
|
+ if (nir_opt_uniform_atomics(shader, false)) {
|
|
+ nir_lower_int64(shader);
|
|
+ nir_divergence_analysis_impl(impl, (nir_divergence_options)options);
|
|
+ }
|
|
|
|
- apply_nuw_to_offsets(ctx, impl);
|
|
+ apply_nuw_to_offsets(ctx, impl);
|
|
|
|
- /* sanitize control flow */
|
|
- sanitize_cf_list(impl, &impl->body);
|
|
- nir_metadata_preserve(impl, nir_metadata_none);
|
|
+ /* sanitize control flow */
|
|
+ sanitize_cf_list(impl, &impl->body);
|
|
+ nir_metadata_preserve(impl, nir_metadata_none);
|
|
|
|
- /* we'll need these for isel */
|
|
- nir_metadata_require(impl, nir_metadata_block_index | nir_metadata_dominance);
|
|
+ /* we'll need these for isel */
|
|
+ nir_metadata_require(impl, nir_metadata_block_index | nir_metadata_dominance);
|
|
|
|
- if (ctx->options->dump_preoptir) {
|
|
- fprintf(stderr, "NIR shader before instruction selection:\n");
|
|
- nir_print_shader(shader, stderr);
|
|
- }
|
|
+ if (ctx->options->dump_preoptir) {
|
|
+ fprintf(stderr, "NIR shader before instruction selection:\n");
|
|
+ nir_print_shader(shader, stderr);
|
|
+ }
|
|
|
|
- ctx->first_temp_id = ctx->program->peekAllocationId();
|
|
- ctx->program->allocateRange(impl->ssa_alloc);
|
|
- RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id;
|
|
-
|
|
- unsigned call_count = 0;
|
|
-
|
|
- /* TODO: make this recursive to improve compile times */
|
|
- bool done = false;
|
|
- while (!done) {
|
|
- done = true;
|
|
- nir_foreach_block (block, impl) {
|
|
- nir_foreach_instr (instr, block) {
|
|
- switch (instr->type) {
|
|
- case nir_instr_type_alu: {
|
|
- nir_alu_instr* alu_instr = nir_instr_as_alu(instr);
|
|
- RegType type = alu_instr->def.divergent ? RegType::vgpr : RegType::sgpr;
|
|
-
|
|
- /* packed 16bit instructions have to be VGPR */
|
|
- if (alu_instr->def.num_components == 2 &&
|
|
- nir_op_infos[alu_instr->op].output_size == 0)
|
|
+ ctx->first_temp_id = ctx->program->peekAllocationId();
|
|
+ ctx->program->allocateRange(impl->ssa_alloc);
|
|
+ RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id;
|
|
+
|
|
+ unsigned call_count = 0;
|
|
+
|
|
+ /* TODO: make this recursive to improve compile times */
|
|
+ bool done = false;
|
|
+ while (!done) {
|
|
+ done = true;
|
|
+ nir_foreach_block (block, impl) {
|
|
+ nir_foreach_instr (instr, block) {
|
|
+ switch (instr->type) {
|
|
+ case nir_instr_type_alu: {
|
|
+ nir_alu_instr* alu_instr = nir_instr_as_alu(instr);
|
|
+ RegType type = alu_instr->def.divergent ? RegType::vgpr : RegType::sgpr;
|
|
+
|
|
+ /* packed 16bit instructions have to be VGPR */
|
|
+ if (alu_instr->def.num_components == 2 &&
|
|
+ nir_op_infos[alu_instr->op].output_size == 0)
|
|
type = RegType::vgpr;
|
|
|
|
- switch (alu_instr->op) {
|
|
- case nir_op_f2i16:
|
|
- case nir_op_f2u16:
|
|
- case nir_op_f2i32:
|
|
- case nir_op_f2u32:
|
|
- case nir_op_b2i8:
|
|
- case nir_op_b2i16:
|
|
- case nir_op_b2i32:
|
|
- case nir_op_b2b32:
|
|
- case nir_op_b2f16:
|
|
- case nir_op_b2f32:
|
|
- case nir_op_mov: break;
|
|
- case nir_op_fmulz:
|
|
- case nir_op_ffmaz:
|
|
- case nir_op_f2f64:
|
|
- case nir_op_u2f64:
|
|
- case nir_op_i2f64:
|
|
- case nir_op_pack_unorm_2x16:
|
|
- case nir_op_pack_snorm_2x16:
|
|
- case nir_op_pack_uint_2x16:
|
|
- case nir_op_pack_sint_2x16:
|
|
- case nir_op_ldexp:
|
|
- case nir_op_frexp_sig:
|
|
- case nir_op_frexp_exp:
|
|
- case nir_op_cube_amd:
|
|
- case nir_op_msad_4x8:
|
|
- case nir_op_mqsad_4x8:
|
|
- case nir_op_udot_4x8_uadd:
|
|
- case nir_op_sdot_4x8_iadd:
|
|
- case nir_op_sudot_4x8_iadd:
|
|
- case nir_op_udot_4x8_uadd_sat:
|
|
- case nir_op_sdot_4x8_iadd_sat:
|
|
- case nir_op_sudot_4x8_iadd_sat:
|
|
- case nir_op_udot_2x16_uadd:
|
|
- case nir_op_sdot_2x16_iadd:
|
|
- case nir_op_udot_2x16_uadd_sat:
|
|
- case nir_op_sdot_2x16_iadd_sat: type = RegType::vgpr; break;
|
|
- case nir_op_fmul:
|
|
- case nir_op_ffma:
|
|
- case nir_op_fadd:
|
|
- case nir_op_fsub:
|
|
- case nir_op_fmax:
|
|
- case nir_op_fmin:
|
|
- case nir_op_fsat:
|
|
- case nir_op_fneg:
|
|
- case nir_op_fabs:
|
|
- case nir_op_fsign:
|
|
- case nir_op_i2f16:
|
|
- case nir_op_i2f32:
|
|
- case nir_op_u2f16:
|
|
- case nir_op_u2f32:
|
|
- case nir_op_f2f16:
|
|
- case nir_op_f2f16_rtz:
|
|
- case nir_op_f2f16_rtne:
|
|
- case nir_op_f2f32:
|
|
- case nir_op_fquantize2f16:
|
|
- case nir_op_ffract:
|
|
- case nir_op_ffloor:
|
|
- case nir_op_fceil:
|
|
- case nir_op_ftrunc:
|
|
- case nir_op_fround_even:
|
|
- case nir_op_frcp:
|
|
- case nir_op_frsq:
|
|
- case nir_op_fsqrt:
|
|
- case nir_op_fexp2:
|
|
- case nir_op_flog2:
|
|
- case nir_op_fsin_amd:
|
|
- case nir_op_fcos_amd:
|
|
- case nir_op_pack_half_2x16_rtz_split:
|
|
- case nir_op_pack_half_2x16_split:
|
|
- case nir_op_unpack_half_2x16_split_x:
|
|
- case nir_op_unpack_half_2x16_split_y: {
|
|
- if (ctx->program->gfx_level < GFX11_5 ||
|
|
+ switch (alu_instr->op) {
|
|
+ case nir_op_f2i16:
|
|
+ case nir_op_f2u16:
|
|
+ case nir_op_f2i32:
|
|
+ case nir_op_f2u32:
|
|
+ case nir_op_b2i8:
|
|
+ case nir_op_b2i16:
|
|
+ case nir_op_b2i32:
|
|
+ case nir_op_b2b32:
|
|
+ case nir_op_b2f16:
|
|
+ case nir_op_b2f32:
|
|
+ case nir_op_mov: break;
|
|
+ case nir_op_fmulz:
|
|
+ case nir_op_ffmaz:
|
|
+ case nir_op_f2f64:
|
|
+ case nir_op_u2f64:
|
|
+ case nir_op_i2f64:
|
|
+ case nir_op_pack_unorm_2x16:
|
|
+ case nir_op_pack_snorm_2x16:
|
|
+ case nir_op_pack_uint_2x16:
|
|
+ case nir_op_pack_sint_2x16:
|
|
+ case nir_op_ldexp:
|
|
+ case nir_op_frexp_sig:
|
|
+ case nir_op_frexp_exp:
|
|
+ case nir_op_cube_amd:
|
|
+ case nir_op_msad_4x8:
|
|
+ case nir_op_mqsad_4x8:
|
|
+ case nir_op_udot_4x8_uadd:
|
|
+ case nir_op_sdot_4x8_iadd:
|
|
+ case nir_op_sudot_4x8_iadd:
|
|
+ case nir_op_udot_4x8_uadd_sat:
|
|
+ case nir_op_sdot_4x8_iadd_sat:
|
|
+ case nir_op_sudot_4x8_iadd_sat:
|
|
+ case nir_op_udot_2x16_uadd:
|
|
+ case nir_op_sdot_2x16_iadd:
|
|
+ case nir_op_udot_2x16_uadd_sat:
|
|
+ case nir_op_sdot_2x16_iadd_sat: type = RegType::vgpr; break;
|
|
+ case nir_op_fmul:
|
|
+ case nir_op_ffma:
|
|
+ case nir_op_fadd:
|
|
+ case nir_op_fsub:
|
|
+ case nir_op_fmax:
|
|
+ case nir_op_fmin:
|
|
+ case nir_op_fsat:
|
|
+ case nir_op_fneg:
|
|
+ case nir_op_fabs:
|
|
+ case nir_op_fsign:
|
|
+ case nir_op_i2f16:
|
|
+ case nir_op_i2f32:
|
|
+ case nir_op_u2f16:
|
|
+ case nir_op_u2f32:
|
|
+ case nir_op_f2f16:
|
|
+ case nir_op_f2f16_rtz:
|
|
+ case nir_op_f2f16_rtne:
|
|
+ case nir_op_f2f32:
|
|
+ case nir_op_fquantize2f16:
|
|
+ case nir_op_ffract:
|
|
+ case nir_op_ffloor:
|
|
+ case nir_op_fceil:
|
|
+ case nir_op_ftrunc:
|
|
+ case nir_op_fround_even:
|
|
+ case nir_op_frcp:
|
|
+ case nir_op_frsq:
|
|
+ case nir_op_fsqrt:
|
|
+ case nir_op_fexp2:
|
|
+ case nir_op_flog2:
|
|
+ case nir_op_fsin_amd:
|
|
+ case nir_op_fcos_amd:
|
|
+ case nir_op_pack_half_2x16_rtz_split:
|
|
+ case nir_op_pack_half_2x16_split:
|
|
+ case nir_op_unpack_half_2x16_split_x:
|
|
+ case nir_op_unpack_half_2x16_split_y: {
|
|
+ if (ctx->program->gfx_level < GFX11_5 ||
|
|
alu_instr->src[0].src.ssa->bit_size > 32) {
|
|
type = RegType::vgpr;
|
|
break;
|
|
}
|
|
- FALLTHROUGH;
|
|
- }
|
|
- default:
|
|
- for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
|
|
- if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
|
|
- type = RegType::vgpr;
|
|
+ FALLTHROUGH;}
|
|
+ default:
|
|
+ for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) {
|
|
+ if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr)
|
|
+ type = RegType::vgpr;
|
|
+ }
|
|
+ break;
|
|
}
|
|
- break;
|
|
- }
|
|
|
|
- RegClass rc =
|
|
- get_reg_class(ctx, type, alu_instr->def.num_components, alu_instr->def.bit_size);
|
|
- regclasses[alu_instr->def.index] = rc;
|
|
- break;
|
|
- }
|
|
- case nir_instr_type_load_const: {
|
|
- unsigned num_components = nir_instr_as_load_const(instr)->def.num_components;
|
|
- unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size;
|
|
- RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
|
|
- regclasses[nir_instr_as_load_const(instr)->def.index] = rc;
|
|
- break;
|
|
- }
|
|
- case nir_instr_type_intrinsic: {
|
|
- nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr);
|
|
- if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
|
|
+ RegClass rc = get_reg_class(ctx, type, alu_instr->def.num_components,
|
|
+ alu_instr->def.bit_size);
|
|
+ regclasses[alu_instr->def.index] = rc;
|
|
break;
|
|
- if (intrinsic->intrinsic == nir_intrinsic_strict_wqm_coord_amd) {
|
|
- regclasses[intrinsic->def.index] =
|
|
- RegClass::get(RegType::vgpr, intrinsic->def.num_components * 4 +
|
|
- nir_intrinsic_base(intrinsic))
|
|
- .as_linear();
|
|
+ }
|
|
+ case nir_instr_type_load_const: {
|
|
+ unsigned num_components = nir_instr_as_load_const(instr)->def.num_components;
|
|
+ unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size;
|
|
+ RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
|
|
+ regclasses[nir_instr_as_load_const(instr)->def.index] = rc;
|
|
break;
|
|
}
|
|
- RegType type = RegType::sgpr;
|
|
- switch (intrinsic->intrinsic) {
|
|
- case nir_intrinsic_load_push_constant:
|
|
- case nir_intrinsic_load_workgroup_id:
|
|
- case nir_intrinsic_load_num_workgroups:
|
|
- case nir_intrinsic_load_sbt_base_amd:
|
|
- case nir_intrinsic_load_subgroup_id:
|
|
- case nir_intrinsic_load_num_subgroups:
|
|
- case nir_intrinsic_load_first_vertex:
|
|
- case nir_intrinsic_load_base_instance:
|
|
- case nir_intrinsic_vote_all:
|
|
- case nir_intrinsic_vote_any:
|
|
- case nir_intrinsic_read_first_invocation:
|
|
- case nir_intrinsic_as_uniform:
|
|
- case nir_intrinsic_read_invocation:
|
|
- case nir_intrinsic_first_invocation:
|
|
- case nir_intrinsic_ballot:
|
|
- case nir_intrinsic_ballot_relaxed:
|
|
- case nir_intrinsic_bindless_image_samples:
|
|
- case nir_intrinsic_load_scalar_arg_amd:
|
|
- case nir_intrinsic_load_lds_ngg_scratch_base_amd:
|
|
- case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd:
|
|
- case nir_intrinsic_load_smem_amd:
|
|
- case nir_intrinsic_unit_test_uniform_amd: type = RegType::sgpr; break;
|
|
- case nir_intrinsic_load_sample_id:
|
|
- case nir_intrinsic_load_input:
|
|
- case nir_intrinsic_load_per_primitive_input:
|
|
- case nir_intrinsic_load_output:
|
|
- case nir_intrinsic_load_input_vertex:
|
|
- case nir_intrinsic_load_per_vertex_input:
|
|
- case nir_intrinsic_load_per_vertex_output:
|
|
- case nir_intrinsic_load_vertex_id_zero_base:
|
|
- case nir_intrinsic_load_barycentric_sample:
|
|
- case nir_intrinsic_load_barycentric_pixel:
|
|
- case nir_intrinsic_load_barycentric_model:
|
|
- case nir_intrinsic_load_barycentric_centroid:
|
|
- case nir_intrinsic_load_barycentric_at_offset:
|
|
- case nir_intrinsic_load_interpolated_input:
|
|
- case nir_intrinsic_load_frag_coord:
|
|
- case nir_intrinsic_load_frag_shading_rate:
|
|
- case nir_intrinsic_load_sample_pos:
|
|
- case nir_intrinsic_load_local_invocation_id:
|
|
- case nir_intrinsic_load_local_invocation_index:
|
|
- case nir_intrinsic_load_subgroup_invocation:
|
|
- case nir_intrinsic_load_tess_coord:
|
|
- case nir_intrinsic_write_invocation_amd:
|
|
- case nir_intrinsic_mbcnt_amd:
|
|
- case nir_intrinsic_lane_permute_16_amd:
|
|
- case nir_intrinsic_dpp16_shift_amd:
|
|
- case nir_intrinsic_load_instance_id:
|
|
- case nir_intrinsic_ssbo_atomic:
|
|
- case nir_intrinsic_ssbo_atomic_swap:
|
|
- case nir_intrinsic_global_atomic_amd:
|
|
- case nir_intrinsic_global_atomic_swap_amd:
|
|
- case nir_intrinsic_bindless_image_atomic:
|
|
- case nir_intrinsic_bindless_image_atomic_swap:
|
|
- case nir_intrinsic_bindless_image_size:
|
|
- case nir_intrinsic_shared_atomic:
|
|
- case nir_intrinsic_shared_atomic_swap:
|
|
- case nir_intrinsic_load_scratch:
|
|
- case nir_intrinsic_load_invocation_id:
|
|
- case nir_intrinsic_load_primitive_id:
|
|
- case nir_intrinsic_load_typed_buffer_amd:
|
|
- case nir_intrinsic_load_buffer_amd:
|
|
- case nir_intrinsic_load_initial_edgeflags_amd:
|
|
- case nir_intrinsic_gds_atomic_add_amd:
|
|
- case nir_intrinsic_bvh64_intersect_ray_amd:
|
|
- case nir_intrinsic_load_vector_arg_amd:
|
|
- case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
|
|
- case nir_intrinsic_cmat_muladd_amd:
|
|
- case nir_intrinsic_unit_test_divergent_amd: type = RegType::vgpr; break;
|
|
- case nir_intrinsic_load_shared:
|
|
- case nir_intrinsic_load_shared2_amd:
|
|
- /* When the result of these loads is only used by cross-lane instructions,
|
|
- * it is beneficial to use a VGPR destination. This is because this allows
|
|
- * to put the s_waitcnt further down, which decreases latency.
|
|
- */
|
|
- if (only_used_by_cross_lane_instrs(&intrinsic->def)) {
|
|
- type = RegType::vgpr;
|
|
+ case nir_instr_type_intrinsic: {
|
|
+ nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr);
|
|
+ if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest)
|
|
+ break;
|
|
+ if (intrinsic->intrinsic == nir_intrinsic_strict_wqm_coord_amd) {
|
|
+ regclasses[intrinsic->def.index] =
|
|
+ RegClass::get(RegType::vgpr, intrinsic->def.num_components * 4 +
|
|
+ nir_intrinsic_base(intrinsic))
|
|
+ .as_linear();
|
|
break;
|
|
}
|
|
- FALLTHROUGH;
|
|
- case nir_intrinsic_shuffle:
|
|
- case nir_intrinsic_quad_broadcast:
|
|
- case nir_intrinsic_quad_swap_horizontal:
|
|
- case nir_intrinsic_quad_swap_vertical:
|
|
- case nir_intrinsic_quad_swap_diagonal:
|
|
- case nir_intrinsic_quad_swizzle_amd:
|
|
- case nir_intrinsic_masked_swizzle_amd:
|
|
- case nir_intrinsic_rotate:
|
|
- case nir_intrinsic_inclusive_scan:
|
|
- case nir_intrinsic_exclusive_scan:
|
|
- case nir_intrinsic_reduce:
|
|
- case nir_intrinsic_load_ubo:
|
|
- case nir_intrinsic_load_ssbo:
|
|
- case nir_intrinsic_load_global_amd:
|
|
- type = intrinsic->def.divergent ? RegType::vgpr : RegType::sgpr;
|
|
- break;
|
|
- case nir_intrinsic_ddx:
|
|
- case nir_intrinsic_ddy:
|
|
- case nir_intrinsic_ddx_fine:
|
|
- case nir_intrinsic_ddy_fine:
|
|
- case nir_intrinsic_ddx_coarse:
|
|
- case nir_intrinsic_ddy_coarse:
|
|
+ RegType type = RegType::sgpr;
|
|
+ switch (intrinsic->intrinsic) {
|
|
+ case nir_intrinsic_load_push_constant:
|
|
+ case nir_intrinsic_load_workgroup_id:
|
|
+ case nir_intrinsic_load_num_workgroups:
|
|
+ case nir_intrinsic_load_ray_launch_size:
|
|
+ case nir_intrinsic_load_sbt_base_amd:
|
|
+ case nir_intrinsic_load_subgroup_id:
|
|
+ case nir_intrinsic_load_num_subgroups:
|
|
+ case nir_intrinsic_load_first_vertex:
|
|
+ case nir_intrinsic_load_base_instance:
|
|
+ case nir_intrinsic_vote_all:
|
|
+ case nir_intrinsic_vote_any:
|
|
+ case nir_intrinsic_read_first_invocation:
|
|
+ case nir_intrinsic_as_uniform:
|
|
+ case nir_intrinsic_read_invocation:
|
|
+ case nir_intrinsic_first_invocation:
|
|
+ case nir_intrinsic_ballot:
|
|
+ case nir_intrinsic_ballot_relaxed:
|
|
+ case nir_intrinsic_bindless_image_samples:
|
|
+ case nir_intrinsic_load_scalar_arg_amd:
|
|
+ case nir_intrinsic_load_lds_ngg_scratch_base_amd:
|
|
+ case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd:
|
|
+ case nir_intrinsic_load_smem_amd:
|
|
+ case nir_intrinsic_unit_test_uniform_amd: type = RegType::sgpr; break;
|
|
+ case nir_intrinsic_load_sample_id:
|
|
+ case nir_intrinsic_load_input:
|
|
+ case nir_intrinsic_load_per_primitive_input:
|
|
+ case nir_intrinsic_load_output:
|
|
+ case nir_intrinsic_load_input_vertex:
|
|
+ case nir_intrinsic_load_per_vertex_input:
|
|
+ case nir_intrinsic_load_per_vertex_output:
|
|
+ case nir_intrinsic_load_vertex_id_zero_base:
|
|
+ case nir_intrinsic_load_barycentric_sample:
|
|
+ case nir_intrinsic_load_barycentric_pixel:
|
|
+ case nir_intrinsic_load_barycentric_model:
|
|
+ case nir_intrinsic_load_barycentric_centroid:
|
|
+ case nir_intrinsic_load_barycentric_at_offset:
|
|
+ case nir_intrinsic_load_interpolated_input:
|
|
+ case nir_intrinsic_load_frag_coord:
|
|
+ case nir_intrinsic_load_frag_shading_rate:
|
|
+ case nir_intrinsic_load_sample_pos:
|
|
+ case nir_intrinsic_load_local_invocation_id:
|
|
+ case nir_intrinsic_load_local_invocation_index:
|
|
+ case nir_intrinsic_load_subgroup_invocation:
|
|
+ case nir_intrinsic_load_ray_launch_id:
|
|
+ case nir_intrinsic_load_tess_coord:
|
|
+ case nir_intrinsic_write_invocation_amd:
|
|
+ case nir_intrinsic_mbcnt_amd:
|
|
+ case nir_intrinsic_lane_permute_16_amd:
|
|
+ case nir_intrinsic_dpp16_shift_amd:
|
|
+ case nir_intrinsic_load_instance_id:
|
|
+ case nir_intrinsic_ssbo_atomic:
|
|
+ case nir_intrinsic_ssbo_atomic_swap:
|
|
+ case nir_intrinsic_global_atomic_amd:
|
|
+ case nir_intrinsic_global_atomic_swap_amd:
|
|
+ case nir_intrinsic_bindless_image_atomic:
|
|
+ case nir_intrinsic_bindless_image_atomic_swap:
|
|
+ case nir_intrinsic_bindless_image_size:
|
|
+ case nir_intrinsic_shared_atomic:
|
|
+ case nir_intrinsic_shared_atomic_swap:
|
|
+ case nir_intrinsic_load_scratch:
|
|
+ case nir_intrinsic_load_invocation_id:
|
|
+ case nir_intrinsic_load_primitive_id:
|
|
+ case nir_intrinsic_load_typed_buffer_amd:
|
|
+ case nir_intrinsic_load_buffer_amd:
|
|
+ case nir_intrinsic_load_initial_edgeflags_amd:
|
|
+ case nir_intrinsic_gds_atomic_add_amd:
|
|
+ case nir_intrinsic_bvh64_intersect_ray_amd:
|
|
+ case nir_intrinsic_load_vector_arg_amd:
|
|
+ case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd:
|
|
+ case nir_intrinsic_cmat_muladd_amd:
|
|
+ case nir_intrinsic_unit_test_divergent_amd: type = RegType::vgpr; break;
|
|
+ case nir_intrinsic_load_shared:
|
|
+ case nir_intrinsic_load_shared2_amd:
|
|
+ /* When the result of these loads is only used by cross-lane instructions,
|
|
+ * it is beneficial to use a VGPR destination. This is because this allows
|
|
+ * to put the s_waitcnt further down, which decreases latency.
|
|
+ */
|
|
+ if (only_used_by_cross_lane_instrs(&intrinsic->def)) {
|
|
+ type = RegType::vgpr;
|
|
+ break;
|
|
+ }
|
|
+ FALLTHROUGH;
|
|
+ case nir_intrinsic_shuffle:
|
|
+ case nir_intrinsic_quad_broadcast:
|
|
+ case nir_intrinsic_quad_swap_horizontal:
|
|
+ case nir_intrinsic_quad_swap_vertical:
|
|
+ case nir_intrinsic_quad_swap_diagonal:
|
|
+ case nir_intrinsic_quad_swizzle_amd:
|
|
+ case nir_intrinsic_masked_swizzle_amd:
|
|
+ case nir_intrinsic_rotate:
|
|
+ case nir_intrinsic_inclusive_scan:
|
|
+ case nir_intrinsic_exclusive_scan:
|
|
+ case nir_intrinsic_reduce:
|
|
+ case nir_intrinsic_load_ubo:
|
|
+ case nir_intrinsic_load_ssbo:
|
|
+ case nir_intrinsic_load_global_amd:
|
|
+ type = intrinsic->def.divergent ? RegType::vgpr : RegType::sgpr;
|
|
+ break;
|
|
+ case nir_intrinsic_ddx:
|
|
+ case nir_intrinsic_ddy:
|
|
+ case nir_intrinsic_ddx_fine:
|
|
+ case nir_intrinsic_ddy_fine:
|
|
+ case nir_intrinsic_ddx_coarse:
|
|
+ case nir_intrinsic_ddy_coarse:
|
|
type = RegType::vgpr;
|
|
break;
|
|
- case nir_intrinsic_load_view_index:
|
|
- type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
|
|
- break;
|
|
- default:
|
|
- for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
|
|
- i++) {
|
|
- if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
|
|
- type = RegType::vgpr;
|
|
+ case nir_intrinsic_load_view_index:
|
|
+ type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
|
|
+ break;
|
|
+ default:
|
|
+ for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
|
|
+ i++) {
|
|
+ if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr)
|
|
+ type = RegType::vgpr;
|
|
+ }
|
|
+ break;
|
|
}
|
|
+ RegClass rc = get_reg_class(ctx, type, intrinsic->def.num_components,
|
|
+ intrinsic->def.bit_size);
|
|
+ regclasses[intrinsic->def.index] = rc;
|
|
break;
|
|
}
|
|
- RegClass rc =
|
|
- get_reg_class(ctx, type, intrinsic->def.num_components, intrinsic->def.bit_size);
|
|
- regclasses[intrinsic->def.index] = rc;
|
|
- break;
|
|
- }
|
|
- case nir_instr_type_tex: {
|
|
- nir_tex_instr* tex = nir_instr_as_tex(instr);
|
|
- RegType type = tex->def.divergent ? RegType::vgpr : RegType::sgpr;
|
|
+ case nir_instr_type_tex: {
|
|
+ nir_tex_instr* tex = nir_instr_as_tex(instr);
|
|
+ RegType type = tex->def.divergent ? RegType::vgpr : RegType::sgpr;
|
|
|
|
- if (tex->op == nir_texop_texture_samples) {
|
|
- assert(!tex->def.divergent);
|
|
- }
|
|
-
|
|
- RegClass rc = get_reg_class(ctx, type, tex->def.num_components, tex->def.bit_size);
|
|
- regclasses[tex->def.index] = rc;
|
|
- break;
|
|
- }
|
|
- case nir_instr_type_undef: {
|
|
- unsigned num_components = nir_instr_as_undef(instr)->def.num_components;
|
|
- unsigned bit_size = nir_instr_as_undef(instr)->def.bit_size;
|
|
- RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
|
|
- regclasses[nir_instr_as_undef(instr)->def.index] = rc;
|
|
- break;
|
|
- }
|
|
- case nir_instr_type_phi: {
|
|
- nir_phi_instr* phi = nir_instr_as_phi(instr);
|
|
- RegType type = RegType::sgpr;
|
|
- unsigned num_components = phi->def.num_components;
|
|
- assert((phi->def.bit_size != 1 || num_components == 1) &&
|
|
- "Multiple components not supported on boolean phis.");
|
|
-
|
|
- if (phi->def.divergent) {
|
|
- type = RegType::vgpr;
|
|
- } else {
|
|
- bool vgpr_src = false;
|
|
- nir_foreach_phi_src (src, phi)
|
|
- vgpr_src |= regclasses[src->src.ssa->index].type() == RegType::vgpr;
|
|
+ if (tex->op == nir_texop_texture_samples) {
|
|
+ assert(!tex->def.divergent);
|
|
+ }
|
|
|
|
- if (vgpr_src) {
|
|
+ RegClass rc =
|
|
+ get_reg_class(ctx, type, tex->def.num_components, tex->def.bit_size);
|
|
+ regclasses[tex->def.index] = rc;
|
|
+ break;
|
|
+ }
|
|
+ case nir_instr_type_undef: {
|
|
+ unsigned num_components = nir_instr_as_undef(instr)->def.num_components;
|
|
+ unsigned bit_size = nir_instr_as_undef(instr)->def.bit_size;
|
|
+ RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size);
|
|
+ regclasses[nir_instr_as_undef(instr)->def.index] = rc;
|
|
+ break;
|
|
+ }
|
|
+ case nir_instr_type_phi: {
|
|
+ nir_phi_instr* phi = nir_instr_as_phi(instr);
|
|
+ RegType type = RegType::sgpr;
|
|
+ unsigned num_components = phi->def.num_components;
|
|
+ assert((phi->def.bit_size != 1 || num_components == 1) &&
|
|
+ "Multiple components not supported on boolean phis.");
|
|
+
|
|
+ if (phi->def.divergent) {
|
|
type = RegType::vgpr;
|
|
+ } else {
|
|
+ bool vgpr_src = false;
|
|
+ nir_foreach_phi_src (src, phi)
|
|
+ vgpr_src |= regclasses[src->src.ssa->index].type() == RegType::vgpr;
|
|
|
|
- /* This might be the case because of nir_divergence_ignore_undef_if_phi_srcs. */
|
|
- bool divergent_merge = false;
|
|
- if (nir_cf_node_prev(&block->cf_node) &&
|
|
- nir_cf_node_prev(&block->cf_node)->type == nir_cf_node_if) {
|
|
- nir_if* nif = nir_cf_node_as_if(nir_cf_node_prev(&block->cf_node));
|
|
- divergent_merge = nir_src_is_divergent(&nif->condition);
|
|
- }
|
|
+ if (vgpr_src) {
|
|
+ type = RegType::vgpr;
|
|
|
|
- /* In case of uniform phis after divergent merges, ensure that the dst is an
|
|
- * SGPR and does not contain undefined values for some invocations.
|
|
- */
|
|
- if (divergent_merge && !skip_uniformize_merge_phi(&phi->def, 0))
|
|
- type = RegType::sgpr;
|
|
+ /* This might be the case because of nir_divergence_ignore_undef_if_phi_srcs. */
|
|
+ bool divergent_merge = false;
|
|
+ if (nir_cf_node_prev(&block->cf_node) &&
|
|
+ nir_cf_node_prev(&block->cf_node)->type == nir_cf_node_if) {
|
|
+ nir_if* nif = nir_cf_node_as_if(nir_cf_node_prev(&block->cf_node));
|
|
+ divergent_merge = nir_src_is_divergent(&nif->condition);
|
|
+ }
|
|
+
|
|
+ /* In case of uniform phis after divergent merges, ensure that the dst is an
|
|
+ * SGPR and does not contain undefined values for some invocations.
|
|
+ */
|
|
+ if (divergent_merge && !skip_uniformize_merge_phi(&phi->def, 0))
|
|
+ type = RegType::sgpr;
|
|
+ }
|
|
}
|
|
- }
|
|
|
|
- RegClass rc = get_reg_class(ctx, type, num_components, phi->def.bit_size);
|
|
- if (rc != regclasses[phi->def.index])
|
|
- done = false;
|
|
- regclasses[phi->def.index] = rc;
|
|
- break;
|
|
- }
|
|
- case nir_instr_type_call: {
|
|
- ++call_count;
|
|
- break;
|
|
- }
|
|
- default: break;
|
|
+ RegClass rc = get_reg_class(ctx, type, num_components, phi->def.bit_size);
|
|
+ if (rc != regclasses[phi->def.index])
|
|
+ done = false;
|
|
+ regclasses[phi->def.index] = rc;
|
|
+ break;
|
|
+ }
|
|
+ case nir_instr_type_call: {
|
|
+ ++call_count;
|
|
+ break;
|
|
+ }
|
|
+ default: break;
|
|
+ }
|
|
}
|
|
}
|
|
}
|
|
- }
|
|
-
|
|
- ctx->call_infos.reserve(call_count);
|
|
|
|
- ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena;
|
|
- ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr;
|
|
+ ctx->call_infos.reserve(call_count);
|
|
|
|
+ ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena;
|
|
+ ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr;
|
|
+ }
|
|
/* align and copy constant data */
|
|
while (ctx->program->constant_data.size() % 4u)
|
|
ctx->program->constant_data.push_back(0);
|
|
@@ -794,7 +796,8 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c
|
|
|
|
unsigned nir_num_blocks = 0;
|
|
for (unsigned i = 0; i < shader_count; i++)
|
|
- nir_num_blocks += nir_shader_get_entrypoint(shaders[i])->num_blocks;
|
|
+ nir_foreach_function_impl (impl, shaders[i])
|
|
+ nir_num_blocks += impl->num_blocks;
|
|
ctx.program->blocks.reserve(nir_num_blocks * 2);
|
|
ctx.block = ctx.program->create_and_insert_block();
|
|
ctx.block->kind = block_kind_top_level;
|
|
--
|
|
GitLab
From 5bfdc4d5da9fd66e98e3d04f0320719331a5bfaa Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Sat, 23 Mar 2024 11:20:58 +0100
|
|
Subject: [PATCH 43/71] aco: Add param temps in startpgm
---
|
|
src/amd/compiler/aco_assembler.cpp | 3 ++-
|
|
.../compiler/aco_instruction_selection.cpp | 23 ++++++++++++++++++-
|
|
src/amd/compiler/aco_ir.h | 1 +
|
|
3 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp
|
|
index 9f50c3f59821b..9a774aec8621c 100644
|
|
--- a/src/amd/compiler/aco_assembler.cpp
|
|
+++ b/src/amd/compiler/aco_assembler.cpp
|
|
@@ -1755,7 +1755,8 @@ emit_program(Program* program, std::vector<uint32_t>& code, std::vector<struct a
|
|
(uint32_t*)(program->constant_data.data() + program->constant_data.size()));
|
|
|
|
program->config->scratch_bytes_per_wave =
|
|
- align(program->config->scratch_bytes_per_wave, program->dev.scratch_alloc_granule);
|
|
+ align(program->config->scratch_bytes_per_wave + program->scratch_arg_size,
|
|
+ program->dev.scratch_alloc_granule);
|
|
|
|
return exec_size;
|
|
}
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index 95baf3a302d0c..c44a7324d58e8 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -11557,8 +11557,12 @@ create_fs_end_for_epilog(isel_context* ctx)
|
|
}
|
|
|
|
Instruction*
|
|
-add_startpgm(struct isel_context* ctx)
|
|
+add_startpgm(struct isel_context* ctx, bool is_callee = false)
|
|
{
|
|
+ ctx->program->arg_sgpr_count = ctx->args->num_sgprs_used;
|
|
+ ctx->program->arg_vgpr_count = ctx->args->num_vgprs_used;
|
|
+ ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size;
|
|
+
|
|
unsigned def_count = 0;
|
|
for (unsigned i = 0; i < ctx->args->arg_count; i++) {
|
|
if (ctx->args->args[i].skip)
|
|
@@ -11569,6 +11573,9 @@ add_startpgm(struct isel_context* ctx)
|
|
else
|
|
def_count++;
|
|
}
|
|
+ unsigned used_arg_count = def_count;
|
|
+ def_count +=
|
|
+ ctx->callee_info.reg_param_count + (is_callee ? 2 : 0); /* parameters + return address */
|
|
|
|
if (ctx->stage.hw == AC_HW_COMPUTE_SHADER && ctx->program->gfx_level >= GFX12)
|
|
def_count += 3;
|
|
@@ -11634,6 +11641,20 @@ add_startpgm(struct isel_context* ctx)
|
|
ctx->workgroup_id[i] = ids[i].used ? Operand(get_arg(ctx, ids[i])) : Operand::zero();
|
|
}
|
|
|
|
+ if (is_callee) {
|
|
+ unsigned def_idx = used_arg_count;
|
|
+
|
|
+ ctx->program->stack_ptr = ctx->callee_info.stack_ptr.def.getTemp();
|
|
+ startpgm->definitions[def_idx++] = ctx->callee_info.stack_ptr.def;
|
|
+ startpgm->definitions[def_idx++] = ctx->callee_info.return_address.def;
|
|
+
|
|
+ for (auto& info : ctx->callee_info.param_infos) {
|
|
+ if (!info.is_reg)
|
|
+ continue;
|
|
+ startpgm->definitions[def_idx++] = info.def;
|
|
+ }
|
|
+ }
|
|
+
|
|
/* epilog has no scratch */
|
|
if (ctx->args->scratch_offset.used) {
|
|
if (ctx->program->gfx_level < GFX9) {
|
|
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
|
|
index 920174ac50798..7989d27dfe75b 100644
|
|
--- a/src/amd/compiler/aco_ir.h
|
|
+++ b/src/amd/compiler/aco_ir.h
|
|
@@ -2388,6 +2388,7 @@ public:
|
|
ABI callee_abi = {};
|
|
unsigned short arg_sgpr_count;
|
|
unsigned short arg_vgpr_count;
|
|
+ unsigned scratch_arg_size = 0;
|
|
|
|
struct {
|
|
monotonic_buffer_resource memory;
|
|
--
|
|
GitLab
From c2b0a99236c67af869bef06a2e3d2af329206ef7 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 6 Mar 2024 13:27:56 +0100
|
|
Subject: [PATCH 44/71] aco: Implement call parameter intrinsics
---
|
|
.../compiler/aco_instruction_selection.cpp | 158 ++++++++++++++++++
|
|
.../aco_instruction_selection_setup.cpp | 13 +-
|
|
2 files changed, 170 insertions(+), 1 deletion(-)
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index c44a7324d58e8..f3ec6fa04dd36 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -8341,6 +8341,107 @@ visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
emit_split_vector(ctx, dst, instr->def.num_components);
|
|
}
|
|
|
|
+void
|
|
+load_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr,
|
|
+ unsigned scratch_param_size, Temp dst)
|
|
+{
|
|
+ int32_t const_offset = param.scratch_offset - scratch_param_size;
|
|
+ unsigned byte_size = dst.bytes();
|
|
+ if (ctx->program->gfx_level < GFX9) {
|
|
+ Temp scratch_rsrc = load_scratch_resource(ctx->program, bld, true, false);
|
|
+
|
|
+ Temp soffset = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
|
|
+ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
|
|
+ Operand::c32(-const_offset * ctx->program->wave_size));
|
|
+
|
|
+ aco_opcode op;
|
|
+ switch (byte_size) {
|
|
+ case 4: op = aco_opcode::buffer_load_dword; break;
|
|
+ case 8: op = aco_opcode::buffer_load_dwordx2; break;
|
|
+ case 12: op = aco_opcode::buffer_load_dwordx3; break;
|
|
+ case 16: op = aco_opcode::buffer_load_dwordx4; break;
|
|
+ default: unreachable("Unexpected param size");
|
|
+ }
|
|
+
|
|
+ Instruction* instr =
|
|
+ bld.mubuf(op, Definition(dst), scratch_rsrc, Operand(v1), soffset, 0, false);
|
|
+ instr->mubuf().sync = memory_sync_info(storage_scratch);
|
|
+ instr->mubuf().cache.value = ac_swizzled;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (const_offset < ctx->program->dev.scratch_global_offset_min) {
|
|
+ stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
|
|
+ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
|
|
+ Operand::c32(const_offset));
|
|
+ const_offset = 0;
|
|
+ }
|
|
+
|
|
+ aco_opcode op;
|
|
+ switch (byte_size) {
|
|
+ case 4: op = aco_opcode::scratch_load_dword; break;
|
|
+ case 8: op = aco_opcode::scratch_load_dwordx2; break;
|
|
+ case 12: op = aco_opcode::scratch_load_dwordx3; break;
|
|
+ case 16: op = aco_opcode::scratch_load_dwordx4; break;
|
|
+ default: unreachable("Unexpected param size");
|
|
+ }
|
|
+
|
|
+ bld.scratch(op, Definition(dst), Operand(v1),
|
|
+ stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr), (int16_t)const_offset,
|
|
+ memory_sync_info(storage_scratch));
|
|
+}
|
|
+
|
|
+void
|
|
+store_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr,
|
|
+ unsigned scratch_param_size, Temp data)
|
|
+{
|
|
+ int32_t const_offset = param.scratch_offset - scratch_param_size;
|
|
+ unsigned byte_size = data.bytes();
|
|
+ if (ctx->program->gfx_level < GFX9) {
|
|
+ Temp scratch_rsrc = load_scratch_resource(ctx->program, bld, true, false);
|
|
+
|
|
+ Temp soffset = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc),
|
|
+ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
|
|
+ Operand::c32(-const_offset * ctx->program->wave_size));
|
|
+
|
|
+ assert(-const_offset * ctx->program->wave_size < 0x1ff00);
|
|
+
|
|
+ aco_opcode op;
|
|
+ switch (byte_size) {
|
|
+ case 4: op = aco_opcode::buffer_store_dword; break;
|
|
+ case 8: op = aco_opcode::buffer_store_dwordx2; break;
|
|
+ case 12: op = aco_opcode::buffer_store_dwordx3; break;
|
|
+ case 16: op = aco_opcode::buffer_store_dwordx4; break;
|
|
+ default: unreachable("Unexpected param size");
|
|
+ }
|
|
+
|
|
+ Instruction* instr =
|
|
+ bld.mubuf(op, scratch_rsrc, Operand(v1), Operand(soffset), as_vgpr(bld, data), 0, false);
|
|
+ instr->mubuf().sync = memory_sync_info(storage_scratch);
|
|
+ instr->mubuf().cache.value = ac_swizzled;
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ if (const_offset < ctx->program->dev.scratch_global_offset_min) {
|
|
+ stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc),
|
|
+ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr),
|
|
+ Operand::c32(const_offset));
|
|
+ const_offset = 0;
|
|
+ }
|
|
+
|
|
+ aco_opcode op;
|
|
+ switch (byte_size) {
|
|
+ case 4: op = aco_opcode::scratch_store_dword; break;
|
|
+ case 8: op = aco_opcode::scratch_store_dwordx2; break;
|
|
+ case 12: op = aco_opcode::scratch_store_dwordx3; break;
|
|
+ case 16: op = aco_opcode::scratch_store_dwordx4; break;
|
|
+ default: unreachable("Unexpected param size");
|
|
+ }
|
|
+
|
|
+ bld.scratch(op, Operand(v1), stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr),
|
|
+ as_vgpr(bld, data), (int16_t)const_offset, memory_sync_info(storage_scratch));
|
|
+}
|
|
+
|
|
void
|
|
visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
{
|
|
@@ -9640,6 +9741,63 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)),
|
|
Operand::c32(nir_intrinsic_base(instr)));
|
|
break;
|
|
+ case nir_intrinsic_load_return_param_amd: {
|
|
+ call_info& info = ctx->call_infos[nir_intrinsic_call_idx(instr)];
|
|
+
|
|
+ assert(nir_intrinsic_param_idx(instr) < info.nir_instr->callee->num_params);
|
|
+
|
|
+ unsigned index_in_return_params = 0u;
|
|
+ for (unsigned i = 0; i < info.nir_instr->callee->num_params; ++i) {
|
|
+ if (nir_intrinsic_param_idx(instr) == i) {
|
|
+ assert(info.nir_instr->callee->params[i].is_return);
|
|
+ break;
|
|
+ }
|
|
+ if (info.nir_instr->callee->params[i].is_return) {
|
|
+ ++index_in_return_params;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (info.return_info[index_in_return_params].is_reg) {
|
|
+ bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
|
|
+ Operand(info.return_info[index_in_return_params].def.getTemp()));
|
|
+ } else {
|
|
+ Temp stack_ptr;
|
|
+ if (ctx->callee_info.stack_ptr.is_reg)
|
|
+ stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
|
|
+ Operand::c32(info.scratch_param_size),
|
|
+ Operand(ctx->callee_info.stack_ptr.def.getTemp()));
|
|
+ else
|
|
+ stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
|
|
+ Operand::c32(info.scratch_param_size));
|
|
+ load_scratch_param(ctx, bld, info.return_info[index_in_return_params], stack_ptr,
|
|
+ info.scratch_param_size, get_ssa_temp(ctx, &instr->def));
|
|
+ }
|
|
+ break;
|
|
+ }
|
|
+ case nir_intrinsic_load_param: {
|
|
+ const auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)];
|
|
+ if (param.is_reg)
|
|
+ bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), Operand(param.def.getTemp()));
|
|
+ else
|
|
+ load_scratch_param(
|
|
+ ctx, bld, param,
|
|
+ ctx->callee_info.stack_ptr.is_reg ? ctx->callee_info.stack_ptr.def.getTemp() : Temp(),
|
|
+ ctx->callee_info.scratch_param_size, get_ssa_temp(ctx, &instr->def));
|
|
+ break;
|
|
+ }
|
|
+ case nir_intrinsic_store_param_amd: {
|
|
+ auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)];
|
|
+ if (param.is_reg)
|
|
+ param.def.setTemp(param.def.regClass().type() == RegType::vgpr
|
|
+ ? as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa))
|
|
+ : bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)));
|
|
+ else
|
|
+ store_scratch_param(
|
|
+ ctx, bld, param,
|
|
+ ctx->callee_info.stack_ptr.is_reg ? ctx->callee_info.stack_ptr.def.getTemp() : Temp(),
|
|
+ ctx->callee_info.scratch_param_size, get_ssa_temp(ctx, instr->src[0].ssa));
|
|
+ break;
|
|
+ }
|
|
case nir_intrinsic_load_call_return_address_amd: {
|
|
bld.copy(Definition(get_ssa_temp(ctx, &instr->def)),
|
|
Operand(ctx->callee_info.return_address.def.getTemp()));
|
|
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
|
|
index 600c63c8b9ce3..1bdbe28ec17bd 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
|
|
@@ -5,12 +5,13 @@
|
|
*/
|
|
|
|
#include "aco_instruction_selection.h"
|
|
+#include "aco_nir_call_attribs.h"
|
|
|
|
#include "common/ac_nir.h"
|
|
#include "common/sid.h"
|
|
|
|
-#include "nir_control_flow.h"
|
|
#include "nir_builder.h"
|
|
+#include "nir_control_flow.h"
|
|
|
|
#include <vector>
|
|
|
|
@@ -631,6 +632,16 @@ init_context(isel_context* ctx, nir_shader* shader)
|
|
case nir_intrinsic_load_view_index:
|
|
type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr;
|
|
break;
|
|
+ case nir_intrinsic_load_return_param_amd: {
|
|
+ type = RegType::vgpr;
|
|
+ break;
|
|
+ }
|
|
+ case nir_intrinsic_load_param: {
|
|
+ nir_parameter* param =
|
|
+ &impl->function->params[nir_intrinsic_param_idx(intrinsic)];
|
|
+ type = param->is_uniform ? RegType::sgpr : RegType::vgpr;
|
|
+ break;
|
|
+ }
|
|
default:
|
|
for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs;
|
|
i++) {
|
|
--
|
|
GitLab
From 04c145740dcc48f05926edf8db90fc38b02bf2e5 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Thu, 6 Jun 2024 07:17:15 +0200
|
|
Subject: [PATCH 45/71] aco: Add common utility to load scratch descriptor
Also modifies the scratch descriptor to take the stack pointer into
|
|
account.
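
A minimal sketch of that adjustment, mirroring the load_scratch_resource() helper added
below (assumptions: addr_lo/addr_hi are the split halves of the 64-bit scratch base
address, bld is an ACO Builder, and program->stack_ptr is only applied when the caller
asks for it):

   /* Offset the 64-bit scratch base by the callee's stack pointer; the carry out of the
    * low half is propagated into the high half. */
   Temp carry = bld.tmp(s1);
   addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)),
                      addr_lo, program->stack_ptr);
   addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc),
                      addr_hi, Operand::c32(0), bld.scc(carry));

The adjusted base is then recombined with p_create_vector and packed into the buffer
descriptor dwords, so scratch accesses made through the resource are relative to the
callee's stack.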
|
|
---
|
|
.../compiler/aco_instruction_selection.cpp | 40 +--------
|
|
src/amd/compiler/aco_scratch_rsrc.h | 82 +++++++++++++++++++
|
|
src/amd/compiler/aco_spill.cpp | 54 +-----------
|
|
3 files changed, 87 insertions(+), 89 deletions(-)
|
|
create mode 100644 src/amd/compiler/aco_scratch_rsrc.h
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index f3ec6fa04dd36..6ed8dd84c777f 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -11,6 +11,7 @@
|
|
#include "aco_interface.h"
|
|
#include "aco_ir.h"
|
|
#include "aco_nir_call_attribs.h"
|
|
+#include "aco_scratch_rsrc.h"
|
|
|
|
#include "common/ac_descriptors.h"
|
|
#include "common/ac_gpu_info.h"
|
|
@@ -7701,41 +7702,6 @@ visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
}
|
|
}
|
|
|
|
-Temp
|
|
-get_scratch_resource(isel_context* ctx)
|
|
-{
|
|
- Builder bld(ctx->program, ctx->block);
|
|
- Temp scratch_addr = ctx->program->private_segment_buffer;
|
|
- if (!scratch_addr.bytes()) {
|
|
- Temp addr_lo =
|
|
- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
|
|
- Temp addr_hi =
|
|
- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
|
|
- scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
|
|
- } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) {
|
|
- scratch_addr =
|
|
- bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero());
|
|
- }
|
|
-
|
|
- struct ac_buffer_state ac_state = {0};
|
|
- uint32_t desc[4];
|
|
-
|
|
- ac_state.size = 0xffffffff;
|
|
- ac_state.format = PIPE_FORMAT_R32_FLOAT;
|
|
- for (int i = 0; i < 4; i++)
|
|
- ac_state.swizzle[i] = PIPE_SWIZZLE_0;
|
|
- /* older generations need element size = 4 bytes. element size removed in GFX9 */
|
|
- ac_state.element_size = ctx->program->gfx_level <= GFX8 ? 1u : 0u;
|
|
- ac_state.index_stride = ctx->program->wave_size == 64 ? 3u : 2u;
|
|
- ac_state.add_tid = true;
|
|
- ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
|
|
-
|
|
- ac_build_buffer_descriptor(ctx->program->gfx_level, &ac_state, desc);
|
|
-
|
|
- return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(desc[2]),
|
|
- Operand::c32(desc[3]));
|
|
-}
|
|
-
|
|
void
|
|
visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
{
|
|
@@ -7778,7 +7744,7 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1;
|
|
emit_load(ctx, bld, info, params);
|
|
} else {
|
|
- info.resource = get_scratch_resource(ctx);
|
|
+ info.resource = load_scratch_resource(ctx->program, bld, false, true);
|
|
info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)));
|
|
info.soffset = ctx->program->scratch_offset;
|
|
emit_load(ctx, bld, info, scratch_mubuf_load_params);
|
|
@@ -7841,7 +7807,7 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr)
|
|
memory_sync_info(storage_scratch, semantic_private));
|
|
}
|
|
} else {
|
|
- Temp rsrc = get_scratch_resource(ctx);
|
|
+ Temp rsrc = load_scratch_resource(ctx->program, bld, false, true);
|
|
offset = as_vgpr(ctx, offset);
|
|
for (unsigned i = 0; i < write_count; i++) {
|
|
aco_opcode op = get_buffer_store_op(write_datas[i].bytes());
|
|
diff --git a/src/amd/compiler/aco_scratch_rsrc.h b/src/amd/compiler/aco_scratch_rsrc.h
|
|
new file mode 100644
|
|
index 0000000000000..5b0af2bca46f0
|
|
--- /dev/null
|
|
+++ b/src/amd/compiler/aco_scratch_rsrc.h
|
|
@@ -0,0 +1,82 @@
|
|
+/*
|
|
+ * Copyright © 2024 Valve Corporation.
|
|
+ *
|
|
+ * SPDX-License-Identifier: MIT
|
|
+ */
|
|
+
|
|
+#include "aco_builder.h"
|
|
+#include "aco_ir.h"
|
|
+
|
|
+#include "ac_descriptors.h"
|
|
+#include "amdgfxregs.h"
|
|
+
|
|
+#ifndef ACO_SCRATCH_RSRC_H
|
|
+#define ACO_SCRATCH_RSRC_H
|
|
+
|
|
+namespace aco {
|
|
+
|
|
+inline Temp
|
|
+load_scratch_resource(Program* program, Builder& bld, bool apply_scratch_offset,
|
|
+ bool apply_stack_ptr)
|
|
+{
|
|
+ Temp private_segment_buffer = program->private_segment_buffer;
|
|
+ if (!private_segment_buffer.bytes()) {
|
|
+ Temp addr_lo =
|
|
+ bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
|
|
+ Temp addr_hi =
|
|
+ bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
|
|
+ private_segment_buffer =
|
|
+ bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
|
|
+ } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) {
|
|
+ private_segment_buffer =
|
|
+ bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero());
|
|
+ }
|
|
+
|
|
+ if ((apply_stack_ptr && program->stack_ptr != Temp()) || apply_scratch_offset) {
|
|
+ Temp addr_lo = bld.tmp(s1);
|
|
+ Temp addr_hi = bld.tmp(s1);
|
|
+ bld.pseudo(aco_opcode::p_split_vector, Definition(addr_lo), Definition(addr_hi),
|
|
+ private_segment_buffer);
|
|
+
|
|
+ if (apply_stack_ptr && program->stack_ptr != Temp()) {
|
|
+ Temp carry = bld.tmp(s1);
|
|
+ addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo,
|
|
+ program->stack_ptr);
|
|
+ addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi,
|
|
+ Operand::c32(0), bld.scc(carry));
|
|
+ }
|
|
+
|
|
+ if (apply_scratch_offset) {
|
|
+ Temp carry = bld.tmp(s1);
|
|
+ addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo,
|
|
+ program->scratch_offset);
|
|
+ addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi,
|
|
+ Operand::c32(0), bld.scc(carry));
|
|
+ }
|
|
+
|
|
+ private_segment_buffer =
|
|
+ bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
|
|
+ }
|
|
+
|
|
+ struct ac_buffer_state ac_state = {0};
|
|
+ uint32_t desc[4];
|
|
+
|
|
+ ac_state.size = 0xffffffff;
|
|
+ ac_state.format = PIPE_FORMAT_R32_FLOAT;
|
|
+ for (int i = 0; i < 4; i++)
|
|
+ ac_state.swizzle[i] = PIPE_SWIZZLE_0;
|
|
+ /* older generations need element size = 4 bytes. element size removed in GFX9 */
|
|
+ ac_state.element_size = program->gfx_level <= GFX8 ? 1u : 0u;
|
|
+ ac_state.index_stride = program->wave_size == 64 ? 3u : 2u;
|
|
+ ac_state.add_tid = true;
|
|
+ ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
|
|
+
|
|
+ ac_build_buffer_descriptor(program->gfx_level, &ac_state, desc);
|
|
+
|
|
+ return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
|
|
+ Operand::c32(desc[2]), Operand::c32(desc[3]));
|
|
+}
|
|
+
|
|
+} // namespace aco
|
|
+
|
|
+#endif // ACO_SCRATCH_RSRC_H
|
|
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
|
|
index 2e30bf9e2783e..c271cbcf01eb8 100644
|
|
--- a/src/amd/compiler/aco_spill.cpp
|
|
+++ b/src/amd/compiler/aco_spill.cpp
|
|
@@ -7,6 +7,7 @@
|
|
|
|
#include "aco_builder.h"
|
|
#include "aco_ir.h"
|
|
+#include "aco_scratch_rsrc.h"
|
|
#include "aco_util.h"
|
|
|
|
#include "common/ac_descriptors.h"
|
|
@@ -1134,57 +1135,6 @@ spill_block(spill_ctx& ctx, unsigned block_idx)
|
|
}
|
|
}
|
|
|
|
-Temp
|
|
-load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset)
|
|
-{
|
|
- Temp private_segment_buffer = ctx.program->private_segment_buffer;
|
|
- if (!private_segment_buffer.bytes()) {
|
|
- Temp addr_lo =
|
|
- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo));
|
|
- Temp addr_hi =
|
|
- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi));
|
|
- private_segment_buffer =
|
|
- bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
|
|
- } else if (ctx.program->stage.hw != AC_HW_COMPUTE_SHADER) {
|
|
- private_segment_buffer =
|
|
- bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero());
|
|
- }
|
|
-
|
|
- if (apply_scratch_offset) {
|
|
- Temp addr_lo = bld.tmp(s1);
|
|
- Temp addr_hi = bld.tmp(s1);
|
|
- bld.pseudo(aco_opcode::p_split_vector, Definition(addr_lo), Definition(addr_hi),
|
|
- private_segment_buffer);
|
|
-
|
|
- Temp carry = bld.tmp(s1);
|
|
- addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo,
|
|
- ctx.program->scratch_offset);
|
|
- addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi,
|
|
- Operand::c32(0), bld.scc(carry));
|
|
-
|
|
- private_segment_buffer =
|
|
- bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi);
|
|
- }
|
|
-
|
|
- struct ac_buffer_state ac_state = {0};
|
|
- uint32_t desc[4];
|
|
-
|
|
- ac_state.size = 0xffffffff;
|
|
- ac_state.format = PIPE_FORMAT_R32_FLOAT;
|
|
- for (int i = 0; i < 4; i++)
|
|
- ac_state.swizzle[i] = PIPE_SWIZZLE_0;
|
|
- /* older generations need element size = 4 bytes. element size removed in GFX9 */
|
|
- ac_state.element_size = ctx.program->gfx_level <= GFX8 ? 1u : 0u;
|
|
- ac_state.index_stride = ctx.program->wave_size == 64 ? 3u : 2u;
|
|
- ac_state.add_tid = true;
|
|
- ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW;
|
|
-
|
|
- ac_build_buffer_descriptor(ctx.program->gfx_level, &ac_state, desc);
|
|
-
|
|
- return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer,
|
|
- Operand::c32(desc[2]), Operand::c32(desc[3]));
|
|
-}
|
|
-
|
|
void
|
|
setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
|
|
std::vector<aco_ptr<Instruction>>& instructions, uint32_t spill_slot,
|
|
@@ -1249,7 +1199,7 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
|
|
}
|
|
} else {
|
|
if (ctx.scratch_rsrc == Temp())
|
|
- ctx.scratch_rsrc = load_scratch_resource(ctx, rsrc_bld, overflow);
|
|
+ ctx.scratch_rsrc = load_scratch_resource(ctx.program, rsrc_bld, overflow, true);
|
|
|
|
if (overflow) {
|
|
uint32_t soffset =
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 912041711336f7e14a19439aeffd8a404990fd55 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 13 May 2024 06:14:32 +0200
Subject: [PATCH 46/71] aco: Add Program::is_callee and set it for RT shaders

---
src/amd/compiler/aco_instruction_selection.cpp | 2 ++
src/amd/compiler/aco_ir.h | 1 +
2 files changed, 3 insertions(+)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 6ed8dd84c777f..d3d15c9500d5e 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -12048,6 +12048,8 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
init_context(&ctx, nir);
setup_fp_mode(&ctx, nir);

+ ctx.program->is_callee = true;
+
Instruction* startpgm = add_startpgm(&ctx);
append_logical_start(ctx.block);
split_arguments(&ctx, startpgm);
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 7989d27dfe75b..2bc7b91c81584 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2385,6 +2385,7 @@ public:
/* For shader part with previous shader part that has lds access. */
bool pending_lds_access = false;

+ bool is_callee = false;
ABI callee_abi = {};
unsigned short arg_sgpr_count;
unsigned short arg_vgpr_count;
--
GitLab

From 441ab8b850fb95ed9a8cfc7ae0fe0e258385fdaa Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Sat, 4 May 2024 17:54:14 +0200
|
|
Subject: [PATCH 47/71] radv,aco: Use function call structure for RT programs
|
|
|
|
---
|
|
.../compiler/aco_instruction_selection.cpp | 208 ++++++++++++------
|
|
src/amd/compiler/aco_interface.cpp | 7 +-
|
|
src/amd/compiler/aco_interface.h | 4 +-
|
|
src/amd/compiler/aco_ir.h | 4 +-
|
|
src/amd/vulkan/radv_pipeline_rt.c | 6 +-
|
|
src/amd/vulkan/radv_shader.c | 8 +-
|
|
src/amd/vulkan/radv_shader.h | 3 +-
|
|
7 files changed, 165 insertions(+), 75 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index d3d15c9500d5e..901b9ca843eb1 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -12003,33 +12003,53 @@ merged_wave_info_to_mask(isel_context* ctx, unsigned i)
|
|
return lanecount_to_mask(ctx, count, i * 8u);
|
|
}
|
|
|
|
-static void
|
|
-insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args)
|
|
+void
|
|
+insert_return(isel_context& ctx)
|
|
{
|
|
- unsigned src_count = 0;
|
|
- for (unsigned i = 0; i < ctx.args->arg_count; i++)
|
|
- src_count += !!BITSET_TEST(ctx.output_args, i);
|
|
-
|
|
+ unsigned return_param_count = 0;
|
|
+ for (auto& param_def : ctx.callee_info.param_infos) {
|
|
+ if (!param_def.is_reg || param_def.discardable)
|
|
+ continue;
|
|
+ ++return_param_count;
|
|
+ }
|
|
+ unsigned src_count = return_param_count + 2;
|
|
+ if (ctx.next_pc != Temp())
|
|
+ src_count += ctx.args->arg_count;
|
|
Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0);
|
|
ctx.block->instructions.emplace_back(ret);
|
|
|
|
- src_count = 0;
|
|
- for (unsigned i = 0; i < ctx.args->arg_count; i++) {
|
|
- if (!BITSET_TEST(ctx.output_args, i))
|
|
- continue;
|
|
-
|
|
- enum ac_arg_regfile file = ctx.args->args[i].file;
|
|
- unsigned size = ctx.args->args[i].size;
|
|
- unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
|
|
- RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
|
|
- Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
|
|
- : Operand(PhysReg{reg}, type);
|
|
- ret->operands[src_count] = op;
|
|
- src_count++;
|
|
+ if (ctx.next_pc != Temp()) {
|
|
+ for (unsigned i = 0; i < ctx.args->arg_count; i++) {
|
|
+ enum ac_arg_regfile file = ctx.args->args[i].file;
|
|
+ unsigned size = ctx.args->args[i].size;
|
|
+ unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
|
|
+ RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
|
|
+ Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg})
|
|
+ : Operand(PhysReg{reg}, type);
|
|
+ ret->operands[i] = op;
|
|
+ }
|
|
}
|
|
|
|
- Builder bld(ctx.program, ctx.block);
|
|
- bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr));
|
|
+ unsigned def_idx = ctx.next_pc != Temp() ? ctx.args->arg_count : 0;
|
|
+ for (unsigned i = 0; i < ctx.callee_info.param_infos.size(); ++i) {
|
|
+ const auto& param_info = ctx.callee_info.param_infos[i];
|
|
+ if (!param_info.is_reg || param_info.discardable)
|
|
+ continue;
|
|
+ Temp param_temp = param_info.def.getTemp();
|
|
+ if (i == 0 && ctx.next_pc != Temp())
|
|
+ param_temp = ctx.next_divergent_pc;
|
|
+ else if (i == 1 && ctx.next_pc != Temp())
|
|
+ param_temp = ctx.next_pc;
|
|
+ Operand op = Operand(param_temp);
|
|
+ op.setPrecolored(param_info.def.physReg());
|
|
+ ret->operands[def_idx++] = op;
|
|
+ }
|
|
+ Operand op = Operand(ctx.callee_info.return_address.def.getTemp());
|
|
+ op.setPrecolored(ctx.callee_info.return_address.def.physReg());
|
|
+ ret->operands[def_idx++] = op;
|
|
+ Operand stack_op = Operand(ctx.callee_info.stack_ptr.def.getTemp());
|
|
+ stack_op.setPrecolored(ctx.callee_info.stack_ptr.def.physReg());
|
|
+ ret->operands[def_idx++] = stack_op;
|
|
}
|
|
|
|
void
|
|
@@ -12048,21 +12068,38 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
|
|
init_context(&ctx, nir);
|
|
setup_fp_mode(&ctx, nir);
|
|
|
|
+ ABI abi;
|
|
+ /* TODO: callable abi? */
|
|
+ switch (shaders[i]->info.stage) {
|
|
+ case MESA_SHADER_RAYGEN:
|
|
+ case MESA_SHADER_CLOSEST_HIT:
|
|
+ case MESA_SHADER_MISS:
|
|
+ case MESA_SHADER_CALLABLE: abi = rtRaygenABI; break;
|
|
+ case MESA_SHADER_INTERSECTION: abi = rtTraversalABI; break;
|
|
+ case MESA_SHADER_ANY_HIT: abi = rtAnyHitABI; break;
|
|
+ default: unreachable("invalid RT shader stage");
|
|
+ }
|
|
+
|
|
+ ctx.callee_abi = make_abi(abi, ctx.program);
|
|
+ ctx.program->callee_abi = ctx.callee_abi;
|
|
+ ctx.callee_info = get_callee_info(ctx.callee_abi, impl->function->num_params,
|
|
+ impl->function->params, ctx.program);
|
|
ctx.program->is_callee = true;
|
|
|
|
- Instruction* startpgm = add_startpgm(&ctx);
|
|
+ Instruction* startpgm = add_startpgm(&ctx, true);
|
|
append_logical_start(ctx.block);
|
|
split_arguments(&ctx, startpgm);
|
|
visit_cf_list(&ctx, &impl->body);
|
|
append_logical_end(ctx.block);
|
|
ctx.block->kind |= block_kind_uniform;
|
|
|
|
- /* Fix output registers and jump to next shader. We can skip this when dealing with a
|
|
- * raygen shader without shader calls.
|
|
- */
|
|
- if ((shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN) &&
|
|
- impl == nir_shader_get_entrypoint(nir))
|
|
- insert_rt_jump_next(ctx, args);
|
|
+ if (ctx.next_pc != Temp()) {
|
|
+ insert_return(ctx);
|
|
+
|
|
+ Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc));
|
|
+ } else {
|
|
+ Builder(ctx.program, ctx.block).sopp(aco_opcode::s_endpgm);
|
|
+ }
|
|
|
|
cleanup_context(&ctx);
|
|
first_block = false;
|
|
@@ -12879,7 +12916,8 @@ calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args,
|
|
void
|
|
select_rt_prolog(Program* program, ac_shader_config* config,
|
|
const struct aco_compiler_options* options, const struct aco_shader_info* info,
|
|
- const struct ac_shader_args* in_args, const struct ac_shader_args* out_args)
|
|
+ const struct ac_shader_args* in_args, const struct ac_arg* descriptors,
|
|
+ unsigned raygen_param_count, nir_parameter* raygen_params)
|
|
{
|
|
init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode,
|
|
config);
|
|
@@ -12890,8 +12928,11 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|
calc_min_waves(program);
|
|
Builder bld(program, block);
|
|
block->instructions.reserve(32);
|
|
- unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used);
|
|
- unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used);
|
|
+ unsigned num_sgprs = in_args->num_sgprs_used;
|
|
+ unsigned num_vgprs = in_args->num_vgprs_used;
|
|
+
|
|
+ struct callee_info raygen_info =
|
|
+ get_callee_info(make_abi(rtRaygenABI, program), raygen_param_count, raygen_params, NULL);
|
|
|
|
/* Inputs:
|
|
* Ring offsets: s[0-1]
|
|
@@ -12906,9 +12947,11 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|
* Local invocation IDs: v[0-2]
|
|
*/
|
|
PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets);
|
|
+ PhysReg in_descriptors = get_arg_reg(in_args, *descriptors);
|
|
+ PhysReg in_push_constants = get_arg_reg(in_args, in_args->push_constants);
|
|
PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors);
|
|
+ PhysReg in_traversal_addr = get_arg_reg(in_args, in_args->rt.traversal_shader_addr);
|
|
PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr);
|
|
- PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base);
|
|
PhysReg in_wg_id_x;
|
|
PhysReg in_wg_id_y;
|
|
PhysReg in_wg_id_z;
|
|
@@ -12942,46 +12985,84 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|
* Shader VA: v[4-5]
|
|
* Shader Record Ptr: v[6-7]
|
|
*/
|
|
- PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr);
|
|
- PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]);
|
|
- PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]);
|
|
- PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]);
|
|
+ assert(raygen_info.stack_ptr.is_reg);
|
|
+ assert(raygen_info.return_address.is_reg);
|
|
+ assert(raygen_info.param_infos[0].is_reg);
|
|
+ assert(raygen_info.param_infos[1].is_reg);
|
|
+ assert(raygen_info.param_infos[RAYGEN_ARG_LAUNCH_ID + 2].is_reg);
|
|
+ assert(raygen_info.param_infos[RAYGEN_ARG_LAUNCH_SIZE + 2].is_reg);
|
|
+ assert(raygen_info.param_infos[RAYGEN_ARG_DESCRIPTORS + 2].is_reg);
|
|
+ assert(raygen_info.param_infos[RAYGEN_ARG_PUSH_CONSTANTS + 2].is_reg);
|
|
+ assert(raygen_info.param_infos[RAYGEN_ARG_SBT_DESCRIPTORS + 2].is_reg);
|
|
+ assert(raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].is_reg);
|
|
+ assert(raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].is_reg);
|
|
+ PhysReg out_stack_ptr_param = raygen_info.stack_ptr.def.physReg();
|
|
+ PhysReg out_return_shader_addr = raygen_info.return_address.def.physReg();
|
|
+ PhysReg out_divergent_shader_addr = raygen_info.param_infos[0].def.physReg();
|
|
+ PhysReg out_uniform_shader_addr = raygen_info.param_infos[1].def.physReg();
|
|
+ PhysReg out_launch_size_x = raygen_info.param_infos[RAYGEN_ARG_LAUNCH_SIZE + 2].def.physReg();
|
|
+ PhysReg out_launch_size_y = out_launch_size_x.advance(4);
|
|
+ PhysReg out_launch_size_z = out_launch_size_y.advance(4);
|
|
PhysReg out_launch_ids[3];
|
|
- for (unsigned i = 0; i < 3; i++)
|
|
- out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]);
|
|
- PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base);
|
|
- PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record);
|
|
+ out_launch_ids[0] = raygen_info.param_infos[RAYGEN_ARG_LAUNCH_ID + 2].def.physReg();
|
|
+ for (unsigned i = 1; i < 3; i++)
|
|
+ out_launch_ids[i] = out_launch_ids[i - 1].advance(4);
|
|
+ PhysReg out_descriptors = raygen_info.param_infos[RAYGEN_ARG_DESCRIPTORS + 2].def.physReg();
|
|
+ PhysReg out_push_constants =
|
|
+ raygen_info.param_infos[RAYGEN_ARG_PUSH_CONSTANTS + 2].def.physReg();
|
|
+ PhysReg out_sbt_descriptors =
|
|
+ raygen_info.param_infos[RAYGEN_ARG_SBT_DESCRIPTORS + 2].def.physReg();
|
|
+ PhysReg out_traversal_addr =
|
|
+ raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].def.physReg();
|
|
+ PhysReg out_record_ptr = raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].def.physReg();
|
|
+
|
|
+ num_sgprs = std::max(num_sgprs, out_stack_ptr_param.reg());
|
|
+ num_vgprs = std::max(num_vgprs, out_record_ptr.reg() - 256 + 2);
|
|
|
|
/* Temporaries: */
|
|
num_sgprs = align(num_sgprs, 2);
|
|
+ num_sgprs += 2;
|
|
PhysReg tmp_raygen_sbt = PhysReg{num_sgprs};
|
|
num_sgprs += 2;
|
|
+ PhysReg tmp_launch_size_addr = PhysReg{num_sgprs};
|
|
+ num_sgprs += 2;
|
|
PhysReg tmp_ring_offsets = PhysReg{num_sgprs};
|
|
num_sgprs += 2;
|
|
+ PhysReg tmp_traversal_addr = PhysReg{num_sgprs};
|
|
+ num_sgprs += 2;
|
|
PhysReg tmp_wg_id_x_times_size = PhysReg{num_sgprs};
|
|
num_sgprs++;
|
|
|
|
PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++};
|
|
|
|
/* Confirm some assumptions about register aliasing */
|
|
- assert(in_ring_offsets == out_uniform_shader_addr);
|
|
- assert(get_arg_reg(in_args, in_args->push_constants) ==
|
|
- get_arg_reg(out_args, out_args->push_constants));
|
|
- assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) ==
|
|
- get_arg_reg(out_args, out_args->rt.sbt_descriptors));
|
|
- assert(in_launch_size_addr == out_launch_size_x);
|
|
- assert(in_stack_base == out_launch_size_z);
|
|
- assert(in_local_ids[0] == out_launch_ids[0]);
|
|
+ assert(in_descriptors == out_uniform_shader_addr);
|
|
+ assert(in_sbt_desc == out_launch_size_x);
|
|
+ assert(in_traversal_addr == out_launch_size_z);
|
|
+ assert(in_wg_id_x == out_traversal_addr);
|
|
|
|
/* <gfx9 reads in_scratch_offset at the end of the prolog to write out the scratch_offset
|
|
* arg. Make sure no other outputs have overwritten it by then.
|
|
*/
|
|
- assert(options->gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used);
|
|
+ assert(options->gfx_level >= GFX9 ||
|
|
+ in_scratch_offset.reg() >=
|
|
+ raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR].def.physReg());
|
|
|
|
/* load raygen sbt */
|
|
bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2),
|
|
Operand::c32(0u));
|
|
|
|
+ bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_launch_size_addr, s2),
|
|
+ Operand(in_launch_size_addr, s2));
|
|
+ bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_traversal_addr, s2),
|
|
+ Operand(in_traversal_addr, s2));
|
|
+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_descriptors, s1), Operand(in_descriptors, s1));
|
|
+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_push_constants, s1),
|
|
+ Operand(in_push_constants, s1));
|
|
+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors, s1), Operand(in_sbt_desc, s1));
|
|
+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors.advance(4), s1),
|
|
+ Operand(in_sbt_desc.advance(4), s1));
|
|
+
|
|
/* init scratch */
|
|
if (options->gfx_level < GFX9) {
|
|
/* copy ring offsets to temporary location*/
|
|
@@ -12992,18 +13073,15 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|
Operand(in_scratch_offset, s1));
|
|
}
|
|
|
|
- /* set stack ptr */
|
|
- bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1));
|
|
-
|
|
/* load raygen address */
|
|
bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2),
|
|
Operand(tmp_raygen_sbt, s2), Operand::c32(0u));
|
|
|
|
/* load ray launch sizes */
|
|
bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1),
|
|
- Operand(in_launch_size_addr, s2), Operand::c32(8u));
|
|
+ Operand(tmp_launch_size_addr, s2), Operand::c32(8u));
|
|
bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2),
|
|
- Operand(in_launch_size_addr, s2), Operand::c32(0u));
|
|
+ Operand(tmp_launch_size_addr, s2), Operand::c32(0u));
|
|
|
|
/* calculate ray launch ids */
|
|
if (options->gfx_level >= GFX11) {
|
|
@@ -13059,6 +13137,11 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|
Operand::c32(-1u), Operand(tmp_invocation_idx, v1));
|
|
}
|
|
|
|
+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr, s1),
|
|
+ Operand(tmp_traversal_addr, s1));
|
|
+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr.advance(4), s1),
|
|
+ Operand(tmp_traversal_addr.advance(4), s1));
|
|
+
|
|
/* Make fixup operations a no-op if this is not a converted 2D dispatch. */
|
|
bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1),
|
|
Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1));
|
|
@@ -13070,14 +13153,15 @@ select_rt_prolog(Program* program, ac_shader_config* config,
|
|
bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(),
|
|
Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm));
|
|
|
|
- if (options->gfx_level < GFX9) {
|
|
- /* write scratch/ring offsets to outputs, if needed */
|
|
- bld.sop1(aco_opcode::s_mov_b32,
|
|
- Definition(get_arg_reg(out_args, out_args->scratch_offset), s1),
|
|
- Operand(in_scratch_offset, s1));
|
|
- bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2),
|
|
- Operand(tmp_ring_offsets, s2));
|
|
- }
|
|
+ if (program->gfx_level < GFX8)
|
|
+ bld.vop3(aco_opcode::v_lshr_b64, Definition(out_divergent_shader_addr, v2),
|
|
+ Operand(out_uniform_shader_addr, s2), Operand::c32(0));
|
|
+ else
|
|
+ bld.vop3(aco_opcode::v_lshrrev_b64, Definition(out_divergent_shader_addr, v2),
|
|
+ Operand::c32(0), Operand(out_uniform_shader_addr, s2));
|
|
+ bld.sop1(aco_opcode::s_mov_b64, Definition(out_return_shader_addr, s2), Operand::c32(0));
|
|
+
|
|
+ bld.sopk(aco_opcode::s_movk_i32, Definition(out_stack_ptr_param, s1), 0);
|
|
|
|
/* jump to raygen */
|
|
bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2));
|
|
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
|
|
index 32a28908f90f0..5c7956caeedd4 100644
|
|
--- a/src/amd/compiler/aco_interface.cpp
|
|
+++ b/src/amd/compiler/aco_interface.cpp
|
|
@@ -307,8 +307,8 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_
|
|
void
|
|
aco_compile_rt_prolog(const struct aco_compiler_options* options,
|
|
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
|
|
- const struct ac_shader_args* out_args, aco_callback* build_prolog,
|
|
- void** binary)
|
|
+ const struct ac_arg* descriptors, unsigned raygen_param_count,
|
|
+ nir_parameter* raygen_params, aco_callback* build_prolog, void** binary)
|
|
{
|
|
init();
|
|
|
|
@@ -319,7 +319,8 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options,
|
|
program->debug.func = NULL;
|
|
program->debug.private_data = NULL;
|
|
|
|
- select_rt_prolog(program.get(), &config, options, info, in_args, out_args);
|
|
+ select_rt_prolog(program.get(), &config, options, info, in_args, descriptors, raygen_param_count,
|
|
+ raygen_params);
|
|
validate(program.get());
|
|
insert_waitcnt(program.get());
|
|
insert_NOPs(program.get());
|
|
diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h
|
|
index 462727432a1ac..efc3172647183 100644
|
|
--- a/src/amd/compiler/aco_interface.h
|
|
+++ b/src/amd/compiler/aco_interface.h
|
|
@@ -49,8 +49,8 @@ void aco_compile_shader(const struct aco_compiler_options* options,
|
|
|
|
void aco_compile_rt_prolog(const struct aco_compiler_options* options,
|
|
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
|
|
- const struct ac_shader_args* out_args, aco_callback* build_prolog,
|
|
- void** binary);
|
|
+ const struct ac_arg* descriptors, unsigned raygen_param_count,
|
|
+ nir_parameter* raygen_params, aco_callback* build_prolog, void** binary);
|
|
|
|
void aco_compile_vs_prolog(const struct aco_compiler_options* options,
|
|
const struct aco_shader_info* info,
|
|
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
|
|
index 2bc7b91c81584..ccf2710d5453f 100644
|
|
--- a/src/amd/compiler/aco_ir.h
|
|
+++ b/src/amd/compiler/aco_ir.h
|
|
@@ -23,6 +23,7 @@
|
|
#include <vector>
|
|
|
|
typedef struct nir_shader nir_shader;
|
|
+typedef struct nir_parameter nir_parameter;
|
|
|
|
namespace aco {
|
|
|
|
@@ -2462,7 +2463,8 @@ void select_trap_handler_shader(Program* program, struct nir_shader* shader,
|
|
void select_rt_prolog(Program* program, ac_shader_config* config,
|
|
const struct aco_compiler_options* options,
|
|
const struct aco_shader_info* info, const struct ac_shader_args* in_args,
|
|
- const struct ac_shader_args* out_args);
|
|
+ const struct ac_arg* descriptors, unsigned raygen_param_count,
|
|
+ nir_parameter* raygen_params);
|
|
void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo,
|
|
ac_shader_config* config, const struct aco_compiler_options* options,
|
|
const struct aco_shader_info* info, const struct ac_shader_args* args);
|
|
diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c
|
|
index 196f8aa23a032..216eb1bb09f89 100644
|
|
--- a/src/amd/vulkan/radv_pipeline_rt.c
|
|
+++ b/src/amd/vulkan/radv_pipeline_rt.c
|
|
@@ -808,8 +808,12 @@ static void
|
|
compile_rt_prolog(struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline)
|
|
{
|
|
const struct radv_physical_device *pdev = radv_device_physical(device);
|
|
+ struct nir_function raygen_stub = {};
|
|
|
|
- pipeline->prolog = radv_create_rt_prolog(device);
|
|
+ /* Create a dummy function signature for raygen shaders in order to pass parameter info to the prolog */
|
|
+ radv_nir_init_rt_function_params(&raygen_stub, MESA_SHADER_RAYGEN, 0);
|
|
+ radv_nir_lower_callee_signature(&raygen_stub, NULL);
|
|
+ pipeline->prolog = radv_create_rt_prolog(device, raygen_stub.num_params, raygen_stub.params);
|
|
|
|
/* create combined config */
|
|
struct ac_shader_config *config = &pipeline->prolog->config;
|
|
diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
|
|
index e5aa2ff636e1f..60648b2405321 100644
|
|
--- a/src/amd/vulkan/radv_shader.c
|
|
+++ b/src/amd/vulkan/radv_shader.c
|
|
@@ -3073,13 +3073,12 @@ radv_aco_build_shader_part(void **bin, uint32_t num_sgprs, uint32_t num_vgprs, c
|
|
}
|
|
|
|
struct radv_shader *
|
|
-radv_create_rt_prolog(struct radv_device *device)
|
|
+radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count, nir_parameter *raygen_params)
|
|
{
|
|
const struct radv_physical_device *pdev = radv_device_physical(device);
|
|
const struct radv_instance *instance = radv_physical_device_instance(pdev);
|
|
struct radv_shader *prolog;
|
|
struct radv_shader_args in_args = {0};
|
|
- struct radv_shader_args out_args = {0};
|
|
struct radv_nir_compiler_options options = {0};
|
|
radv_fill_nir_compiler_options(&options, device, NULL, false, instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS, false,
|
|
radv_device_fault_detection_enabled(device), false);
|
|
@@ -3100,7 +3099,6 @@ radv_create_rt_prolog(struct radv_device *device)
|
|
info.cs.uses_block_id[i] = true;
|
|
|
|
radv_declare_shader_args(device, NULL, &info, MESA_SHADER_COMPUTE, MESA_SHADER_NONE, &in_args);
|
|
- radv_declare_rt_shader_args(options.info->gfx_level, &out_args);
|
|
info.user_sgprs_locs = in_args.user_sgprs_locs;
|
|
|
|
#if AMD_LLVM_AVAILABLE
|
|
@@ -3114,8 +3112,8 @@ radv_create_rt_prolog(struct radv_device *device)
|
|
struct aco_compiler_options ac_opts;
|
|
radv_aco_convert_shader_info(&ac_info, &info, &in_args, &device->cache_key, options.info->gfx_level);
|
|
radv_aco_convert_opts(&ac_opts, &options, &in_args, &stage_key);
|
|
- aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &out_args.ac, &radv_aco_build_shader_binary,
|
|
- (void **)&binary);
|
|
+ aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &in_args.descriptor_sets[0], raygen_param_count, raygen_params,
|
|
+ &radv_aco_build_shader_binary, (void **)&binary);
|
|
binary->info = info;
|
|
|
|
radv_postprocess_binary_config(device, binary, &in_args);
|
|
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
|
|
index 10e062fb041b9..5ee1ee40466cf 100644
|
|
--- a/src/amd/vulkan/radv_shader.h
|
|
+++ b/src/amd/vulkan/radv_shader.h
|
|
@@ -576,7 +576,8 @@ void radv_free_shader_memory(struct radv_device *device, union radv_shader_arena
|
|
|
|
struct radv_shader *radv_create_trap_handler_shader(struct radv_device *device);
|
|
|
|
-struct radv_shader *radv_create_rt_prolog(struct radv_device *device);
|
|
+struct radv_shader *radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count,
|
|
+ nir_parameter *raygen_params);
|
|
|
|
struct radv_shader_part *radv_shader_part_create(struct radv_device *device, struct radv_shader_part_binary *binary,
|
|
unsigned wave_size);
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 26d71a1077a1d0b29e4e426c5a83d0a04a7b18d6 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 13 May 2024 06:17:34 +0200
Subject: [PATCH 48/71] aco/ssa_elimination: Don't remove exec writes for last
 blocks of callee shaders

The caller is going to use the exec mask written there.
---
src/amd/compiler/aco_ssa_elimination.cpp | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp
index a1477244f51d9..e63dd63ad917c 100644
--- a/src/amd/compiler/aco_ssa_elimination.cpp
+++ b/src/amd/compiler/aco_ssa_elimination.cpp
@@ -758,7 +758,8 @@ eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block)
/* Check if any successor needs the outgoing exec mask from the current block. */

bool exec_write_used;
- if (block.kind & block_kind_end_with_regs) {
+ if (block.kind & block_kind_end_with_regs ||
+ (block.linear_succs.empty() && ctx.program->is_callee)) {
/* Last block of a program with succeed shader part should respect final exec write. */
exec_write_used = true;
} else {
--
GitLab

From 6935d9d0a326ae77622e57057ee433faf3c33146 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 6 Mar 2024 14:53:39 +0100
|
|
Subject: [PATCH 49/71] aco/isel: Handle calls
|
|
|
|
---
|
|
.../compiler/aco_instruction_selection.cpp | 130 ++++++++++++++++++
|
|
1 file changed, 130 insertions(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index 901b9ca843eb1..b926d357739a4 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -10800,6 +10800,135 @@ get_callee_info(const ABI& abi, unsigned param_count, const nir_parameter* param
|
|
return info;
|
|
}
|
|
|
|
+void
|
|
+visit_call(isel_context* ctx, nir_call_instr* instr)
|
|
+{
|
|
+ Builder bld(ctx->program, ctx->block);
|
|
+
|
|
+ ABI abi;
|
|
+ /* TODO: callable abi? */
|
|
+ switch (instr->callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) {
|
|
+ case ACO_NIR_CALL_ABI_RT_RECURSIVE: abi = make_abi(rtRaygenABI, ctx->program); break;
|
|
+ case ACO_NIR_CALL_ABI_TRAVERSAL: abi = make_abi(rtTraversalABI, ctx->program); break;
|
|
+ case ACO_NIR_CALL_ABI_AHIT_ISEC: abi = make_abi(rtAnyHitABI, ctx->program); break;
|
|
+ default: unreachable("invalid abi");
|
|
+ }
|
|
+
|
|
+ struct callee_info info =
|
|
+ get_callee_info(abi, instr->callee->num_params, instr->callee->params, nullptr);
|
|
+ std::vector<parameter_info> return_infos;
|
|
+
|
|
+ Instruction* stack_instr;
|
|
+ Definition stack_ptr;
|
|
+ if (info.stack_ptr.is_reg) {
|
|
+ stack_instr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
|
|
+ Operand::c32(info.scratch_param_size),
|
|
+ Operand(ctx->callee_info.stack_ptr.def.getTemp()));
|
|
+ stack_ptr = ctx->callee_info.stack_ptr.def;
|
|
+ } else {
|
|
+ stack_instr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1),
|
|
+ Operand::c32(info.scratch_param_size));
|
|
+ stack_ptr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1), Operand::c32(0)).def(0);
|
|
+ }
|
|
+
|
|
+ for (unsigned i = 0; i < info.param_infos.size(); ++i) {
|
|
+ if (info.param_infos[i].is_reg)
|
|
+ continue;
|
|
+
|
|
+ store_scratch_param(ctx, bld, info.param_infos[i], stack_instr->definitions[0].getTemp(),
|
|
+ info.scratch_param_size, get_ssa_temp(ctx, instr->params[i].ssa));
|
|
+ }
|
|
+
|
|
+ unsigned extra_def_count = 1;
|
|
+
|
|
+ Temp vcc_backup;
|
|
+ if (ctx->program->dev.sgpr_limit <= vcc_hi.reg()) {
|
|
+ vcc_backup = bld.copy(bld.def(bld.lm), Operand(vcc, bld.lm));
|
|
+ --extra_def_count;
|
|
+ }
|
|
+
|
|
+ unsigned extra_param_count = 3;
|
|
+ if (ctx->program->gfx_level < GFX9)
|
|
+ ++extra_param_count;
|
|
+
|
|
+ unsigned param_size = info.scratch_param_size;
|
|
+ if (ctx->program->gfx_level < GFX9)
|
|
+ param_size *= ctx->program->wave_size;
|
|
+
|
|
+ Instruction* call_instr =
|
|
+ create_instruction(aco_opcode::p_call, Format::PSEUDO_CALL,
|
|
+ info.reg_param_count + ctx->args->arg_count + extra_param_count,
|
|
+ info.reg_return_param_count + extra_def_count);
|
|
+ call_instr->call().abi = abi;
|
|
+ call_instr->operands[0] = Operand(ctx->callee_info.return_address.def.getTemp(),
|
|
+ info.return_address.def.physReg());
|
|
+ call_instr->operands[1] = Operand(stack_ptr.getTemp(), info.stack_ptr.def.physReg());
|
|
+ call_instr->operands[2] = Operand::c32(param_size);
|
|
+ if (ctx->program->gfx_level < GFX9) {
|
|
+ call_instr->operands[info.reg_param_count + ctx->args->arg_count + 3] =
|
|
+ Operand(load_scratch_resource(ctx->program, bld, true, false));
|
|
+ call_instr->operands[info.reg_param_count + ctx->args->arg_count + 3].setLateKill(true);
|
|
+ }
|
|
+
|
|
+ unsigned reg_return_param_idx = 0;
|
|
+ for (unsigned i = 0; i < info.param_infos.size(); ++i) {
|
|
+ if (!info.param_infos[i].is_reg) {
|
|
+ if (instr->callee->params[i].is_return) {
|
|
+ return_infos.emplace_back(parameter_info{
|
|
+ .is_reg = false,
|
|
+ .scratch_offset = info.param_infos[i].scratch_offset,
|
|
+ });
|
|
+ }
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (instr->callee->params[i].is_uniform)
|
|
+ call_instr->operands[i + 3] = Operand(get_ssa_temp(ctx, instr->params[i].ssa));
|
|
+ else
|
|
+ call_instr->operands[i + 3] =
|
|
+ Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->params[i].ssa)));
|
|
+
|
|
+ if (instr->callee->params[i].is_return) {
|
|
+ assert(!instr->callee->params[i].is_uniform);
|
|
+ Definition def =
|
|
+ bld.def(RegClass(RegType::vgpr, DIV_ROUND_UP(instr->callee->params[i].bit_size, 32)),
|
|
+ info.param_infos[i].def.physReg());
|
|
+ call_instr->definitions[extra_def_count + reg_return_param_idx++] = def;
|
|
+ return_infos.emplace_back(parameter_info{
|
|
+ .is_reg = true,
|
|
+ .def = def,
|
|
+ });
|
|
+ }
|
|
+
|
|
+ call_instr->operands[i + 3].setPrecolored(info.param_infos[i].def.physReg());
|
|
+ }
|
|
+
|
|
+ for (unsigned i = 0; i < ctx->args->arg_count; i++) {
|
|
+ enum ac_arg_regfile file = ctx->args->args[i].file;
|
|
+ unsigned size = ctx->args->args[i].size;
|
|
+ unsigned reg = ctx->args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256);
|
|
+ RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size);
|
|
+ Operand op = ctx->arg_temps[i].id() ? Operand(ctx->arg_temps[i], PhysReg{reg})
|
|
+ : Operand(PhysReg{reg}, type);
|
|
+ op.setLateKill(true);
|
|
+ call_instr->operands[info.reg_param_count + 3 + i] = op;
|
|
+ }
|
|
+
|
|
+ if (ctx->program->dev.sgpr_limit <= vcc_hi.reg())
|
|
+ bld.copy(bld.def(bld.lm, vcc), Operand(vcc_backup));
|
|
+ else
|
|
+ call_instr->definitions[0] = bld.def(s2, vcc);
|
|
+
|
|
+ ctx->block->instructions.emplace_back(static_cast<Instruction*>(call_instr));
|
|
+
|
|
+ ctx->call_infos.emplace_back(call_info{
|
|
+ .nir_instr = instr,
|
|
+ .aco_instr = call_instr,
|
|
+ .return_info = std::move(return_infos),
|
|
+ .scratch_param_size = info.scratch_param_size,
|
|
+ });
|
|
+}
|
|
+
|
|
void
|
|
visit_block(isel_context* ctx, nir_block* block)
|
|
{
|
|
@@ -10823,6 +10952,7 @@ visit_block(isel_context* ctx, nir_block* block)
|
|
case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break;
|
|
case nir_instr_type_deref: break;
|
|
case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
|
|
+ case nir_instr_type_call: visit_call(ctx, nir_instr_as_call(instr)); break;
|
|
default: isel_err(instr, "Unknown NIR instr type");
|
|
}
|
|
}
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 5a1503448739d2e2012bb0392711e3f6612df00f Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Wed, 6 Mar 2024 14:56:16 +0100
Subject: [PATCH 50/71] aco/lower_to_hw_instr: Lower calls

---
src/amd/compiler/aco_lower_to_hw_instr.cpp | 10 ++++++++++
1 file changed, 10 insertions(+)

diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 1e1737319c3f6..c9a918d8a373f 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -3017,6 +3017,16 @@ lower_to_hw_instr(Program* program)
} else if (instr->isMIMG() && instr->mimg().strict_wqm) {
lower_image_sample(&ctx, instr);
ctx.instructions.emplace_back(std::move(instr));
+ } else if (instr->isCall()) {
+ PhysReg stack_reg = instr->operands[1].physReg();
+ if (instr->operands[2].constantValue())
+ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1),
+ Operand(stack_reg, s1), instr->operands[2]);
+ bld.sop1(aco_opcode::s_swappc_b64, Definition(instr->operands[0].physReg(), s2),
+ Operand(instr->operands[4].physReg(), s2));
+ if (instr->operands[2].constantValue())
+ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1),
+ Operand(stack_reg, s1), instr->operands[2]);
} else {
ctx.instructions.emplace_back(std::move(instr));
}
--
GitLab

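The lowering in the hunk above reduces a p_call to a stack bump around s_swappc_b64. The sketch below restates that sequence with comments; the operand layout (return address in operand 0, stack pointer in operand 1, scratch parameter size in operand 2, callee address in operand 4) is assumed from the p_call emission in patch 49, and `call` stands in for the call instruction, so treat it as illustrative rather than a copy of the pass.

    /* Illustrative sketch of the sequence emitted for one call site. */
    PhysReg stack_reg = call->operands[1].physReg();
    if (call->operands[2].constantValue()) /* reserve scratch space for stack-passed params */
       bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1),
                Operand(stack_reg, s1), call->operands[2]);
    /* jump to the callee; the old PC lands in the return-address SGPR pair */
    bld.sop1(aco_opcode::s_swappc_b64, Definition(call->operands[0].physReg(), s2),
             Operand(call->operands[4].physReg(), s2));
    if (call->operands[2].constantValue()) /* release the callee's scratch area after return */
       bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1),
                Operand(stack_reg, s1), call->operands[2]);
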
From 6a4e937529ba36e41712205f201a308e98c6a8c9 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 13 Mar 2024 10:59:52 +0100
|
|
Subject: [PATCH 51/71] aco/live_var_analysis: Handle calls
|
|
|
|
---
|
|
src/amd/compiler/aco_live_var_analysis.cpp | 47 ++++++++++++++++++++++
|
|
1 file changed, 47 insertions(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
|
|
index a635c94496143..64814e983bb2e 100644
|
|
--- a/src/amd/compiler/aco_live_var_analysis.cpp
|
|
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
|
|
@@ -29,9 +29,46 @@ get_temp_register_demand(Instruction* instr, RegisterDemand& demand_before, Regi
|
|
demand_before += op.getTemp();
|
|
}
|
|
}
|
|
+
|
|
+ if (instr->isCall())
|
|
+ demand_after += instr->call().blocked_abi_demand;
|
|
}
|
|
}
|
|
|
|
+void
|
|
+compute_blocked_abi_demand(Program* program, unsigned linear_vgpr_demand, Pseudo_call_instruction& instr)
|
|
+{
|
|
+ const unsigned max_vgpr = get_addr_vgpr_from_waves(program, program->min_waves);
|
|
+ /* Linear VGPRs can intersect with preserved VGPRs, we insert spill code for them in
|
|
+ * spill_preserved.
|
|
+ */
|
|
+ unsigned preserved_vgprs = max_vgpr - (instr.abi.clobberedRegs.vgpr.hi() - 256);
|
|
+ linear_vgpr_demand -= std::min(preserved_vgprs, linear_vgpr_demand);
|
|
+
|
|
+ unsigned preserved_vgpr_demand =
|
|
+ instr.abi.clobberedRegs.vgpr.size -
|
|
+ std::min(linear_vgpr_demand, instr.abi.clobberedRegs.vgpr.size);
|
|
+ unsigned preserved_sgpr_demand = instr.abi.clobberedRegs.sgpr.size;
|
|
+
|
|
+ /* Don't count definitions contained in clobbered call regs twice */
|
|
+ for (auto& definition : instr.definitions) {
|
|
+ if (definition.isTemp() && definition.isFixed()) {
|
|
+ auto def_regs = PhysRegInterval{PhysReg{definition.physReg().reg()}, definition.size()};
|
|
+ for (auto reg : def_regs) {
|
|
+ if (instr.abi.clobberedRegs.sgpr.contains(reg))
|
|
+ --preserved_sgpr_demand;
|
|
+ if (instr.abi.clobberedRegs.vgpr.contains(reg))
|
|
+ --preserved_vgpr_demand;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ if (instr.abi.clobberedRegs.sgpr.contains(instr.operands[1].physReg()) &&
|
|
+ !instr.operands[1].isKill())
|
|
+ --preserved_sgpr_demand;
|
|
+
|
|
+ instr.blocked_abi_demand = RegisterDemand(preserved_vgpr_demand, preserved_sgpr_demand);
|
|
+}
|
|
+
|
|
RegisterDemand
|
|
get_live_changes(Instruction* instr)
|
|
{
|
|
@@ -313,6 +350,16 @@ process_live_temps_per_block(live_ctx& ctx, Block* block)
|
|
}
|
|
}
|
|
|
|
+ if (insn->isCall()) {
|
|
+ unsigned linear_vgpr_demand = 0;
|
|
+ for (unsigned t : live) {
|
|
+ if (ctx.program->temp_rc[t].is_linear_vgpr())
|
|
+ linear_vgpr_demand += ctx.program->temp_rc[t].size();
|
|
+ }
|
|
+ compute_blocked_abi_demand(ctx.program, linear_vgpr_demand, insn->call());
|
|
+ insn->register_demand += insn->call().blocked_abi_demand;
|
|
+ }
|
|
+
|
|
operand_demand += new_demand;
|
|
insn->register_demand.update(operand_demand);
|
|
block->register_demand.update(insn->register_demand);
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 66c7c6cc5e167e8a763fe17520e575ad6cae7f50 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Sat, 23 Mar 2024 10:29:13 +0100
Subject: [PATCH 52/71] aco/ra: add utility to block interval

---
src/amd/compiler/aco_register_allocation.cpp | 2 ++
1 file changed, 2 insertions(+)

diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 4d73525bd0660..9012a742bda33 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -264,6 +264,8 @@ public:
fill(start, rc.size(), 0xFFFFFFFF);
}

+ void block(PhysRegInterval interval) { fill(interval.lo(), interval.size, 0xFFFFFFFF); }
+
bool is_blocked(PhysReg start) const
{
if (regs[start] == 0xFFFFFFFF)
--
GitLab

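RegisterFile::block() by itself only marks a whole PhysRegInterval as occupied; it becomes useful once calls reach register allocation in the next patch. A minimal usage sketch, assuming `call` points at a Pseudo_call_instruction whose ABI describes the clobbered ranges:

    /* Sketch: before picking destinations for parallelcopies around a call, block the
     * callee-clobbered ABI ranges so no live value is moved into them. */
    RegisterFile tmp_file(register_file);
    tmp_file.fill_killed_operands(call);
    tmp_file.block(call->abi.clobberedRegs.sgpr);
    tmp_file.block(call->abi.clobberedRegs.vgpr);
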
From f2f3a2b63f646a30906c47bac0bb095618b12e9f Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Sat, 23 Mar 2024 10:31:35 +0100
|
|
Subject: [PATCH 53/71] aco/ra: handle clobbered regions by calls
|
|
|
|
---
|
|
src/amd/compiler/aco_register_allocation.cpp | 53 ++++++++++++++++++++
|
|
1 file changed, 53 insertions(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
|
|
index 9012a742bda33..68502a79476e2 100644
|
|
--- a/src/amd/compiler/aco_register_allocation.cpp
|
|
+++ b/src/amd/compiler/aco_register_allocation.cpp
|
|
@@ -2104,6 +2104,12 @@ handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file,
|
|
bool found = false;
|
|
for (auto reg : regs.second) {
|
|
PhysRegInterval range = {reg, ctx.program->temp_rc[regs.first].size()};
|
|
+ if (instr->isCall()) {
|
|
+ if (intersects(instr->call().abi.clobberedRegs.vgpr, range))
|
|
+ continue;
|
|
+ if (intersects(instr->call().abi.clobberedRegs.sgpr, range))
|
|
+ continue;
|
|
+ }
|
|
bool intersects_with_def = false;
|
|
for (const auto& def : instr->definitions) {
|
|
if (!def.isTemp() || !def.isFixed())
|
|
@@ -3142,6 +3148,49 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
register_file.clear(op);
|
|
}
|
|
|
|
+ if (instr->isCall()) {
|
|
+ /* create parallelcopy pair to move blocking vars */
|
|
+ RegisterFile tmp_file = register_file;
|
|
+ std::vector<unsigned> vars =
|
|
+ collect_vars(ctx, tmp_file, instr->call().abi.clobberedRegs.sgpr);
|
|
+ std::vector<unsigned> vars2 =
|
|
+ collect_vars(ctx, tmp_file, instr->call().abi.clobberedRegs.vgpr);
|
|
+
|
|
+ /* Allow linear VGPRs in the clobbered range, they are spilled in spill_preserved. */
|
|
+ for (auto it = vars2.begin(); it != vars2.end();) {
|
|
+ if (program->temp_rc[*it].is_linear_vgpr()) {
|
|
+ it = vars2.erase(it);
|
|
+ tmp_file.block(ctx.assignments[*it].reg, program->temp_rc[*it]);
|
|
+ } else {
|
|
+ ++it;
|
|
+ }
|
|
+ }
|
|
+ for (auto it = vars.begin(); it != vars.end();) {
|
|
+ if (instr->operands[1].tempId() == *it)
|
|
+ it = vars.erase(it);
|
|
+ else
|
|
+ ++it;
|
|
+ }
|
|
+
|
|
+ vars.insert(vars.end(), vars2.begin(), vars2.end());
|
|
+
|
|
+ tmp_file.fill_killed_operands(instr.get());
|
|
+ tmp_file.block(instr->call().abi.clobberedRegs.sgpr);
|
|
+ tmp_file.block(instr->call().abi.clobberedRegs.vgpr);
|
|
+
|
|
+ adjust_max_used_regs(ctx, RegClass::s1,
|
|
+ instr->call().abi.clobberedRegs.sgpr.hi().reg() - 1);
|
|
+ adjust_max_used_regs(ctx, RegClass::v1,
|
|
+ instr->call().abi.clobberedRegs.vgpr.hi().reg() - 1);
|
|
+
|
|
+ ASSERTED bool success = false;
|
|
+ success =
|
|
+ get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, PhysRegInterval{});
|
|
+ assert(success);
|
|
+
|
|
+ update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops);
|
|
+ }
|
|
+
|
|
optimize_encoding(ctx, register_file, instr);
|
|
|
|
/* Handle definitions which must have the same register as an operand.
|
|
@@ -3171,6 +3220,10 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
RegisterFile tmp_file(register_file);
|
|
/* re-enable the killed operands, so that we don't move the blocking vars there */
|
|
tmp_file.fill_killed_operands(instr.get());
|
|
+ if (instr->isCall()) {
|
|
+ tmp_file.block(instr->call().abi.clobberedRegs.sgpr);
|
|
+ tmp_file.block(instr->call().abi.clobberedRegs.vgpr);
|
|
+ }
|
|
|
|
ASSERTED bool success = false;
|
|
success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs);
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 04f918a810d1b5953922cf91c9ea068a3d6c54db Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Fri, 3 May 2024 17:37:04 +0200
Subject: [PATCH 54/71] aco/insert_waitcnt: Insert waitcnts before s_swappc too

---
src/amd/compiler/aco_insert_waitcnt.cpp | 4 ++++
1 file changed, 4 insertions(+)

diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp
index e6263d6f191f2..510ed8613c41d 100644
--- a/src/amd/compiler/aco_insert_waitcnt.cpp
+++ b/src/amd/compiler/aco_insert_waitcnt.cpp
@@ -344,6 +344,10 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf
force_waitcnt(ctx, imm);
}

+ if (instr->opcode == aco_opcode::s_swappc_b64)
+ u_foreach_bit (i, (~counter_vs) & ctx.nonzero)
+ imm[i] = 0;
+
/* Make sure POPS coherent memory accesses have reached the L2 cache before letting the
* overlapping waves proceed into the ordered section.
*/
--
GitLab

From 35688a25c2e66aa5a8ddbe2c2700cf0fe0e7642b Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Mon, 13 May 2024 06:30:07 +0200
Subject: [PATCH 55/71] aco/ra: Add utility to clear PhysRegInterval

---
src/amd/compiler/aco_register_allocation.cpp | 2 ++
1 file changed, 2 insertions(+)

diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 68502a79476e2..eb87bf111f5a8 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -266,6 +266,8 @@ public:

void block(PhysRegInterval interval) { fill(interval.lo(), interval.size, 0xFFFFFFFF); }

+ void clear(PhysRegInterval interval) { fill(interval.lo(), interval.size, 0); }
+
bool is_blocked(PhysReg start) const
{
if (regs[start] == 0xFFFFFFFF)
--
GitLab

From 15ce5c3c90909b56b7c62d00d7e5022f4244140e Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Sat, 18 May 2024 10:19:58 +0200
Subject: [PATCH 56/71] aco/util: Add aco::unordered_set

---
src/amd/compiler/aco_util.h | 9 +++++++++
1 file changed, 9 insertions(+)

diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h
index 68a6c686408f0..0c5f9566bd213 100644
--- a/src/amd/compiler/aco_util.h
+++ b/src/amd/compiler/aco_util.h
@@ -20,6 +20,7 @@
#include <map>
#include <type_traits>
#include <unordered_map>
+#include <unordered_set>
#include <vector>

namespace aco {

@@ -390,6 +391,14 @@ template <class Key, class T, class Hash = std::hash<Key>, class Pred = std::equ
using unordered_map =
std::unordered_map<Key, T, Hash, Pred, aco::monotonic_allocator<std::pair<const Key, T>>>;

+/*
+ * aco::unordered_set - alias for std::unordered_map with monotonic_allocator
+ *
+ * This template specialization mimics std::pmr::unordered_set.
+ */
+template <class T, class Hash = std::hash<T>, class Pred = std::equal_to<T>>
+using unordered_set = std::unordered_set<T, Hash, Pred, aco::monotonic_allocator<T>>;
+
/*
* Cache-friendly set of 32-bit IDs with fast insert/erase/lookup and
* the ability to efficiently iterate over contained elements.
--
GitLab

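aco::unordered_set follows the existing aco::unordered_map pattern: only the allocator changes, so the container has to be handed a monotonic memory pool when it is created. The sketch below shows the intended usage; the pool construction mirrors how aco::unordered_map is used elsewhere in ACO and is an assumption, not code taken from these patches.

    /* Sketch: node memory comes from the monotonic resource and is released in one go
     * when the resource goes out of scope. */
    aco::monotonic_buffer_resource memory;
    aco::unordered_set<unsigned> seen_ids(memory);
    seen_ids.insert(17);
    bool already_seen = seen_ids.count(17);
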
From be7080caa16a484d00a6213c284f91421bb9abb1 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Mon, 13 May 2024 06:23:55 +0200
|
|
Subject: [PATCH 57/71] aco: Add pass for spilling call-related VGPRs
|
|
|
|
Spills preserved VGPRs for callees and linear VGPRs added by the
|
|
spiller.
|
|
---
|
|
.../compiler/aco_instruction_selection.cpp | 65 ++-
|
|
src/amd/compiler/aco_interface.cpp | 2 +
|
|
src/amd/compiler/aco_ir.h | 11 +-
|
|
src/amd/compiler/aco_opcodes.py | 3 +
|
|
src/amd/compiler/aco_opt_value_numbering.cpp | 3 +-
|
|
src/amd/compiler/aco_register_allocation.cpp | 62 +-
|
|
src/amd/compiler/aco_spill_preserved.cpp | 547 ++++++++++++++++++
|
|
src/amd/compiler/meson.build | 1 +
|
|
8 files changed, 670 insertions(+), 24 deletions(-)
|
|
create mode 100644 src/amd/compiler/aco_spill_preserved.cpp
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index b926d357739a4..deb97c1867667 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -106,9 +106,21 @@ append_logical_start(Block* b)
|
|
}
|
|
|
|
static void
|
|
-append_logical_end(Block* b)
|
|
+append_logical_end(isel_context* ctx)
|
|
{
|
|
- Builder(NULL, b).pseudo(aco_opcode::p_logical_end);
|
|
+ Builder bld(ctx->program, ctx->block);
|
|
+
|
|
+ Operand stack_ptr_op;
|
|
+ if (ctx->program->gfx_level >= GFX9)
|
|
+ stack_ptr_op = Operand(ctx->callee_info.stack_ptr.def.getTemp());
|
|
+ else
|
|
+ stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, true, true));
|
|
+ stack_ptr_op.setLateKill(true);
|
|
+ if (ctx->program->is_callee)
|
|
+ bld.pseudo(aco_opcode::p_reload_preserved_vgpr, bld.def(s1), bld.def(bld.lm),
|
|
+ bld.def(s1, scc), stack_ptr_op);
|
|
+
|
|
+ bld.pseudo(aco_opcode::p_logical_end);
|
|
}
|
|
|
|
Temp
|
|
@@ -10485,7 +10497,7 @@ void
|
|
begin_loop(isel_context* ctx, loop_context* lc)
|
|
{
|
|
// TODO: we might want to wrap the loop around a branch if exec.potentially_empty=true
|
|
- append_logical_end(ctx->block);
|
|
+ append_logical_end(ctx);
|
|
ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform;
|
|
Builder bld(ctx->program, ctx->block);
|
|
bld.branch(aco_opcode::p_branch, bld.def(s2));
|
|
@@ -10543,7 +10555,7 @@ end_loop(isel_context* ctx, loop_context* lc)
|
|
if (!ctx->cf_info.has_branch) {
|
|
unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx;
|
|
Builder bld(ctx->program, ctx->block);
|
|
- append_logical_end(ctx->block);
|
|
+ append_logical_end(ctx);
|
|
|
|
/* No need to check exec.potentially_empty_break/continue originating inside the loop. In the
|
|
* only case where it's possible at this point (divergent break after divergent continue), we
|
|
@@ -10610,7 +10622,7 @@ emit_loop_jump(isel_context* ctx, bool is_break)
|
|
{
|
|
Builder bld(ctx->program, ctx->block);
|
|
Block* logical_target;
|
|
- append_logical_end(ctx->block);
|
|
+ append_logical_end(ctx);
|
|
unsigned idx = ctx->block->index;
|
|
|
|
if (is_break) {
|
|
@@ -11072,7 +11084,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond,
|
|
{
|
|
ic->cond = cond;
|
|
|
|
- append_logical_end(ctx->block);
|
|
+ append_logical_end(ctx);
|
|
ctx->block->kind |= block_kind_branch;
|
|
|
|
/* branch to linear then block */
|
|
@@ -11118,7 +11130,7 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic,
|
|
nir_selection_control sel_ctrl = nir_selection_control_none)
|
|
{
|
|
Block* BB_then_logical = ctx->block;
|
|
- append_logical_end(BB_then_logical);
|
|
+ append_logical_end(ctx);
|
|
/* branch from logical then block to invert block */
|
|
aco_ptr<Instruction> branch;
|
|
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
|
|
@@ -11177,7 +11189,7 @@ static void
|
|
end_divergent_if(isel_context* ctx, if_context* ic)
|
|
{
|
|
Block* BB_else_logical = ctx->block;
|
|
- append_logical_end(BB_else_logical);
|
|
+ append_logical_end(ctx);
|
|
|
|
/* branch from logical else block to endif block */
|
|
aco_ptr<Instruction> branch;
|
|
@@ -11222,7 +11234,7 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
|
|
{
|
|
assert(cond.regClass() == s1);
|
|
|
|
- append_logical_end(ctx->block);
|
|
+ append_logical_end(ctx);
|
|
ctx->block->kind |= block_kind_uniform;
|
|
|
|
aco_ptr<Instruction> branch;
|
|
@@ -11257,7 +11269,7 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic)
|
|
Block* BB_then = ctx->block;
|
|
|
|
if (!ctx->cf_info.has_branch) {
|
|
- append_logical_end(BB_then);
|
|
+ append_logical_end(ctx);
|
|
/* branch from then block to endif block */
|
|
aco_ptr<Instruction> branch;
|
|
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
|
|
@@ -11291,7 +11303,7 @@ end_uniform_if(isel_context* ctx, if_context* ic)
|
|
Block* BB_else = ctx->block;
|
|
|
|
if (!ctx->cf_info.has_branch) {
|
|
- append_logical_end(BB_else);
|
|
+ append_logical_end(ctx);
|
|
/* branch from then block to endif block */
|
|
aco_ptr<Instruction> branch;
|
|
branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1));
|
|
@@ -12217,13 +12229,34 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
|
|
ctx.program->is_callee = true;
|
|
|
|
Instruction* startpgm = add_startpgm(&ctx, true);
|
|
+
|
|
+ Builder bld(ctx.program, ctx.block);
|
|
+
|
|
+ Operand stack_ptr_op;
|
|
+ if (ctx.program->gfx_level >= GFX9)
|
|
+ stack_ptr_op = Operand(ctx.callee_info.stack_ptr.def.getTemp());
|
|
+ else
|
|
+ stack_ptr_op = Operand(load_scratch_resource(ctx.program, bld, true, true));
|
|
+ stack_ptr_op.setLateKill(true);
|
|
+ bld.pseudo(aco_opcode::p_spill_preserved_vgpr, bld.def(s1), bld.def(bld.lm),
|
|
+ bld.def(s1, scc), stack_ptr_op);
|
|
+
|
|
append_logical_start(ctx.block);
|
|
split_arguments(&ctx, startpgm);
|
|
visit_cf_list(&ctx, &impl->body);
|
|
- append_logical_end(ctx.block);
|
|
+ append_logical_end(&ctx);
|
|
ctx.block->kind |= block_kind_uniform;
|
|
|
|
if (ctx.next_pc != Temp()) {
|
|
+ bld = Builder(ctx.program, ctx.block);
|
|
+ if (ctx.program->gfx_level >= GFX9)
|
|
+ stack_ptr_op = Operand(ctx.callee_info.stack_ptr.def.getTemp());
|
|
+ else
|
|
+ stack_ptr_op = Operand(load_scratch_resource(ctx.program, bld, true, true));
|
|
+ stack_ptr_op.setLateKill(true);
|
|
+ bld.pseudo(aco_opcode::p_reload_preserved_vgpr, bld.def(s1), bld.def(bld.lm),
|
|
+ bld.def(s1, scc), stack_ptr_op);
|
|
+
|
|
insert_return(ctx);
|
|
|
|
Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc));
|
|
@@ -12503,7 +12536,7 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons
|
|
if (need_endpgm) {
|
|
program->config->float_mode = program->blocks[0].fp_mode.val;
|
|
|
|
- append_logical_end(ctx.block);
|
|
+ append_logical_end(&ctx);
|
|
ctx.block->kind |= block_kind_uniform;
|
|
|
|
if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) ||
|
|
@@ -12918,7 +12951,7 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade
|
|
|
|
program->config->float_mode = program->blocks[0].fp_mode.val;
|
|
|
|
- append_logical_end(ctx.block);
|
|
+ append_logical_end(&ctx);
|
|
ctx.block->kind |= block_kind_uniform;
|
|
bld.sopp(aco_opcode::s_endpgm);
|
|
|
|
@@ -13864,7 +13897,7 @@ select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config,
|
|
|
|
program->config->float_mode = program->blocks[0].fp_mode.val;
|
|
|
|
- append_logical_end(ctx.block);
|
|
+ append_logical_end(&ctx);
|
|
ctx.block->kind |= block_kind_export_end;
|
|
bld.reset(ctx.block);
|
|
bld.sopp(aco_opcode::s_endpgm);
|
|
@@ -13900,7 +13933,7 @@ select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
|
|
|
|
program->config->float_mode = program->blocks[0].fp_mode.val;
|
|
|
|
- append_logical_end(ctx.block);
|
|
+ append_logical_end(&ctx);
|
|
|
|
build_end_with_regs(&ctx, regs);
|
|
|
|
diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp
|
|
index 5c7956caeedd4..921fc3894c694 100644
|
|
--- a/src/amd/compiler/aco_interface.cpp
|
|
+++ b/src/amd/compiler/aco_interface.cpp
|
|
@@ -172,6 +172,8 @@ aco_postprocess_shader(const struct aco_compiler_options* options,
|
|
validate(program.get());
|
|
}
|
|
|
|
+ spill_preserved(program.get());
|
|
+
|
|
ssa_elimination(program.get());
|
|
}
|
|
|
|
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
|
|
index ccf2710d5453f..e2101ae5162bc 100644
|
|
--- a/src/amd/compiler/aco_ir.h
|
|
+++ b/src/amd/compiler/aco_ir.h
|
|
@@ -2079,7 +2079,9 @@ is_dead(const std::vector<uint16_t>& uses, const Instruction* instr)
|
|
{
|
|
if (instr->definitions.empty() || instr->isBranch() || instr->isCall() ||
|
|
instr->opcode == aco_opcode::p_startpgm || instr->opcode == aco_opcode::p_init_scratch ||
|
|
- instr->opcode == aco_opcode::p_dual_src_export_gfx11)
|
|
+ instr->opcode == aco_opcode::p_dual_src_export_gfx11 ||
|
|
+ instr->opcode == aco_opcode::p_spill_preserved_vgpr ||
|
|
+ instr->opcode == aco_opcode::p_reload_preserved_vgpr)
|
|
return false;
|
|
|
|
if (std::any_of(instr->definitions.begin(), instr->definitions.end(),
|
|
@@ -2492,6 +2494,7 @@ void setup_reduce_temp(Program* program);
|
|
void lower_to_cssa(Program* program);
|
|
void register_allocation(Program* program, ra_test_policy = {});
|
|
void reindex_ssa(Program* program);
|
|
+void spill_preserved(Program* program);
|
|
void ssa_elimination(Program* program);
|
|
void lower_to_hw_instr(Program* program);
|
|
void schedule_program(Program* program);
|
|
@@ -2608,4 +2611,10 @@ extern const Info instr_info;
|
|
|
|
} // namespace aco
|
|
|
|
+namespace std {
|
|
+template <> struct hash<aco::PhysReg> {
|
|
+ size_t operator()(aco::PhysReg temp) const noexcept { return std::hash<uint32_t>{}(temp.reg_b); }
|
|
+};
|
|
+} // namespace std
|
|
+
|
|
#endif /* ACO_IR_H */
|
|
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
|
|
index 696a5a945b310..8d0b93a044270 100644
|
|
--- a/src/amd/compiler/aco_opcodes.py
|
|
+++ b/src/amd/compiler/aco_opcodes.py
|
|
@@ -333,6 +333,9 @@ insn("p_unit_test")
|
|
|
|
insn("p_callee_stack_ptr")
|
|
|
|
+insn("p_spill_preserved_vgpr")
|
|
+insn("p_reload_preserved_vgpr")
|
|
+
|
|
insn("p_create_vector")
|
|
insn("p_extract_vector")
|
|
insn("p_split_vector")
|
|
diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
|
|
index d5be9e9302d66..a199216907a5e 100644
|
|
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
|
|
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
|
|
@@ -313,7 +313,8 @@ can_eliminate(aco_ptr<Instruction>& instr)
|
|
if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi ||
|
|
instr->opcode == aco_opcode::p_linear_phi ||
|
|
instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id ||
|
|
- instr->definitions[0].isNoCSE())
|
|
+ instr->definitions[0].isNoCSE() || instr->opcode == aco_opcode::p_spill_preserved_vgpr ||
|
|
+ instr->opcode == aco_opcode::p_reload_preserved_vgpr)
|
|
return false;
|
|
|
|
return true;
|
|
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
|
|
index eb87bf111f5a8..88f40f894e79c 100644
|
|
--- a/src/amd/compiler/aco_register_allocation.cpp
|
|
+++ b/src/amd/compiler/aco_register_allocation.cpp
|
|
@@ -19,12 +19,6 @@
|
|
#include <unordered_set>
|
|
#include <vector>
|
|
|
|
-namespace std {
|
|
-template <> struct hash<aco::PhysReg> {
|
|
- size_t operator()(aco::PhysReg temp) const noexcept { return std::hash<uint32_t>{}(temp.reg_b); }
|
|
-};
|
|
-} // namespace std
|
|
-
|
|
namespace aco {
|
|
namespace {
|
|
|
|
@@ -2492,6 +2486,23 @@ init_reg_file(ra_ctx& ctx, const std::vector<IDSet>& live_out_per_block, Block&
|
|
const IDSet& live_in = live_out_per_block[block.index];
|
|
assert(block.index != 0 || live_in.empty());
|
|
|
|
+ /* Callee shaders only get a chance to spill preserved registers after p_startpgm.
|
|
+ * To make sure nothing uses these regs until we can spill them, block them here.
|
|
+ */
|
|
+ if (block.index == 0 && ctx.program->is_callee) {
|
|
+ PhysRegInterval preserved_vgpr_lo = PhysRegInterval{
|
|
+ .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count},
|
|
+ .size =
|
|
+ ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256 - ctx.program->arg_vgpr_count,
|
|
+ };
|
|
+ PhysRegInterval preserved_vgpr_hi = PhysRegInterval{
|
|
+ .lo_ = ctx.program->callee_abi.clobberedRegs.vgpr.hi(),
|
|
+ .size = PhysReg{256u + ctx.vgpr_limit} - ctx.program->callee_abi.clobberedRegs.vgpr.hi(),
|
|
+ };
|
|
+ register_file.block(preserved_vgpr_hi);
|
|
+ register_file.block(preserved_vgpr_lo);
|
|
+ }
|
|
+
|
|
if (block.kind & block_kind_loop_header) {
|
|
ctx.loop_header.emplace_back(block.index);
|
|
/* already rename phis incoming value */
|
|
@@ -3093,6 +3104,31 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
instructions.emplace_back(std::move(instr));
|
|
break;
|
|
}
|
|
+ if (instr->opcode == aco_opcode::p_reload_preserved_vgpr && block.linear_succs.empty()) {
|
|
+ PhysRegInterval preserved_vgpr_lo = PhysRegInterval{
|
|
+ .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count},
|
|
+ .size = ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256u -
|
|
+ ctx.program->arg_vgpr_count,
|
|
+ };
|
|
+ PhysRegInterval preserved_vgpr_hi = PhysRegInterval{
|
|
+ .lo_ = ctx.program->callee_abi.clobberedRegs.vgpr.hi(),
|
|
+ .size =
|
|
+ PhysReg{256u + ctx.vgpr_limit} - ctx.program->callee_abi.clobberedRegs.vgpr.hi(),
|
|
+ };
|
|
+ std::vector<unsigned> vars = collect_vars(ctx, register_file, preserved_vgpr_lo);
|
|
+ std::vector<unsigned> vars2 = collect_vars(ctx, register_file, preserved_vgpr_hi);
|
|
+ vars.insert(vars.end(), vars2.begin(), vars2.end());
|
|
+
|
|
+ register_file.block(preserved_vgpr_lo);
|
|
+ register_file.block(preserved_vgpr_hi);
|
|
+
|
|
+ ASSERTED bool success = false;
|
|
+ success = get_regs_for_copies(ctx, register_file, parallelcopy, vars, instr,
|
|
+ PhysRegInterval{});
|
|
+ assert(success);
|
|
+
|
|
+ update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
|
|
+ }
|
|
|
|
assert(!is_phi(instr));
|
|
|
|
@@ -3397,6 +3433,20 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
instr->format = asVOP3(instr->format);
|
|
}
|
|
|
|
+ if (instr->opcode == aco_opcode::p_spill_preserved_vgpr) {
|
|
+ PhysRegInterval preserved_vgpr_lo = PhysRegInterval{
|
|
+ .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count},
|
|
+ .size = ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256u -
|
|
+ ctx.program->arg_vgpr_count,
|
|
+ };
|
|
+ PhysRegInterval preserved_vgpr_hi = PhysRegInterval{
|
|
+ .lo_ = ctx.program->callee_abi.clobberedRegs.vgpr.hi(),
|
|
+ .size =
|
|
+ PhysReg{256u + ctx.vgpr_limit} - ctx.program->callee_abi.clobberedRegs.vgpr.hi(),
|
|
+ };
|
|
+ register_file.clear(preserved_vgpr_hi);
|
|
+ register_file.clear(preserved_vgpr_lo);
|
|
+ }
|
|
instructions.emplace_back(std::move(*instr_it));
|
|
|
|
} /* end for Instr */
|
|
diff --git a/src/amd/compiler/aco_spill_preserved.cpp b/src/amd/compiler/aco_spill_preserved.cpp
|
|
new file mode 100644
|
|
index 0000000000000..a6a6dd04c2d9f
|
|
--- /dev/null
|
|
+++ b/src/amd/compiler/aco_spill_preserved.cpp
|
|
@@ -0,0 +1,547 @@
|
|
+/*
|
|
+ * Copyright © 2024 Valve Corporation
|
|
+ *
|
|
+ * SPDX-License-Identifier: MIT
|
|
+ */
|
|
+
|
|
+#include "aco_builder.h"
|
|
+#include "aco_ir.h"
|
|
+
|
|
+#include <set>
|
|
+#include <unordered_set>
|
|
+
|
|
+namespace aco {
|
|
+
|
|
+struct postdom_info {
|
|
+ unsigned logical_imm_postdom;
|
|
+ unsigned linear_imm_postdom;
|
|
+};
|
|
+
|
|
+struct spill_preserved_ctx {
|
|
+ Program* program;
|
|
+ aco::monotonic_buffer_resource memory;
|
|
+
|
|
+ aco::unordered_map<PhysReg, uint32_t> preserved_spill_offsets;
|
|
+ aco::unordered_set<PhysReg> preserved_regs;
|
|
+ aco::unordered_set<PhysReg> preserved_linear_regs;
|
|
+
|
|
+ aco::unordered_map<PhysReg, std::unordered_set<unsigned>> reg_block_uses;
|
|
+ std::vector<postdom_info> dom_info;
|
|
+
|
|
+ unsigned next_preserved_offset;
|
|
+
|
|
+ explicit spill_preserved_ctx(Program* program_)
|
|
+ : program(program_), memory(), preserved_spill_offsets(memory), preserved_regs(memory),
|
|
+ preserved_linear_regs(memory), reg_block_uses(memory),
|
|
+ next_preserved_offset(
|
|
+ DIV_ROUND_UP(program_->config->scratch_bytes_per_wave, program_->wave_size))
|
|
+ {
|
|
+ dom_info.resize(program->blocks.size(), {-1u, -1u});
|
|
+ }
|
|
+};
|
|
+
|
|
+void
|
|
+add_instr(spill_preserved_ctx& ctx, unsigned block_index, bool seen_reload,
|
|
+ const aco_ptr<Instruction>& instr)
|
|
+{
|
|
+ for (auto& def : instr->definitions) {
|
|
+ assert(def.isFixed());
|
|
+ if (def.regClass().type() == RegType::sgpr)
|
|
+ continue;
|
|
+ /* Round down subdword registers to their base */
|
|
+ PhysReg start_reg = PhysReg{def.physReg().reg()};
|
|
+ for (auto reg : PhysRegInterval{start_reg, def.regClass().size()}) {
|
|
+ if (reg < 256u + ctx.program->arg_vgpr_count)
|
|
+ continue;
|
|
+ if (ctx.program->callee_abi.clobberedRegs.vgpr.contains(reg) &&
|
|
+ !def.regClass().is_linear_vgpr())
|
|
+ continue;
|
|
+ /* Don't count start_linear_vgpr without a copy as a use since the value doesn't matter.
|
|
+ * This allows us to move reloads a bit further up the CF.
|
|
+ */
|
|
+ if (instr->opcode == aco_opcode::p_start_linear_vgpr && instr->operands.empty())
|
|
+ continue;
|
|
+
|
|
+ if (def.regClass().is_linear_vgpr())
|
|
+ ctx.preserved_linear_regs.insert(reg);
|
|
+ else
|
|
+ ctx.preserved_regs.insert(reg);
|
|
+
|
|
+ if (seen_reload) {
|
|
+ if (def.regClass().is_linear_vgpr())
|
|
+ for (auto succ : ctx.program->blocks[block_index].linear_succs)
|
|
+ ctx.reg_block_uses[reg].emplace(succ);
|
|
+ else
|
|
+ for (auto succ : ctx.program->blocks[block_index].logical_succs)
|
|
+ ctx.reg_block_uses[reg].emplace(succ);
|
|
+ } else {
|
|
+ ctx.reg_block_uses[reg].emplace(block_index);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+ for (auto& op : instr->operands) {
|
|
+ assert(op.isFixed());
|
|
+ if (op.regClass().type() == RegType::sgpr)
|
|
+ continue;
|
|
+ if (op.isConstant())
|
|
+ continue;
|
|
+ /* Round down subdword registers to their base */
|
|
+ PhysReg start_reg = PhysReg{op.physReg().reg()};
|
|
+ for (auto reg : PhysRegInterval{start_reg, op.regClass().size()}) {
|
|
+ if (reg < 256u + ctx.program->arg_vgpr_count)
|
|
+ continue;
|
|
+ /* Don't count end_linear_vgpr as a use since the value doesn't matter.
|
|
+ * This allows us to move reloads a bit further up the CF.
|
|
+ */
|
|
+ if (instr->opcode == aco_opcode::p_end_linear_vgpr)
|
|
+ continue;
|
|
+ if (ctx.program->callee_abi.clobberedRegs.vgpr.contains(reg) &&
|
|
+ !op.regClass().is_linear_vgpr())
|
|
+ continue;
|
|
+ if (op.regClass().is_linear_vgpr())
|
|
+ ctx.preserved_linear_regs.insert(reg);
|
|
+
|
|
+ if (seen_reload) {
|
|
+ if (op.regClass().is_linear_vgpr())
|
|
+ for (auto succ : ctx.program->blocks[block_index].linear_succs)
|
|
+ ctx.reg_block_uses[reg].emplace(succ);
|
|
+ else
|
|
+ for (auto succ : ctx.program->blocks[block_index].logical_succs)
|
|
+ ctx.reg_block_uses[reg].emplace(succ);
|
|
+ } else {
|
|
+ ctx.reg_block_uses[reg].emplace(block_index);
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+}
|
|
+
|
|
+void
|
|
+spill_preserved(spill_preserved_ctx& ctx, PhysReg reg, std::vector<std::pair<PhysReg, int>>& spills,
|
|
+ std::vector<std::pair<PhysReg, int>>& lvgpr_spills)
|
|
+{
|
|
+ unsigned offset;
|
|
+
|
|
+ auto offset_iter = ctx.preserved_spill_offsets.find(reg);
|
|
+ if (offset_iter == ctx.preserved_spill_offsets.end()) {
|
|
+ offset = ctx.next_preserved_offset;
|
|
+ ctx.next_preserved_offset += 4;
|
|
+ ctx.preserved_spill_offsets.emplace(reg, offset);
|
|
+ } else {
|
|
+ offset = offset_iter->second;
|
|
+ }
|
|
+
|
|
+ if (ctx.preserved_linear_regs.find(reg) != ctx.preserved_linear_regs.end())
|
|
+ lvgpr_spills.emplace_back(reg, offset);
|
|
+ else
|
|
+ spills.emplace_back(reg, offset);
|
|
+}
|
|
+
|
|
+void
|
|
+emit_spills_reloads_internal(spill_preserved_ctx& ctx, Builder& bld,
|
|
+ std::vector<std::pair<PhysReg, int>>& spills, PhysReg stack_reg,
|
|
+ PhysReg soffset, bool reload, bool linear, bool soffset_valid)
|
|
+{
|
|
+ if (spills.empty())
|
|
+ return;
|
|
+
|
|
+ int end_offset = spills.back().second;
|
|
+ int start_offset = spills.front().second;
|
|
+ if (ctx.program->gfx_level >= GFX9)
|
|
+ assert(end_offset - start_offset < ctx.program->dev.scratch_global_offset_max);
|
|
+
|
|
+ bool overflow =
|
|
+ end_offset > ctx.program->dev.scratch_global_offset_max || ctx.program->gfx_level < GFX9;
|
|
+ if (overflow) {
|
|
+ if (ctx.program->gfx_level >= GFX9)
|
|
+ bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), Definition(scc, s1),
|
|
+ Operand(stack_reg, s1), Operand::c32(start_offset));
|
|
+ else if (soffset_valid)
|
|
+ bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), Definition(scc, s1),
|
|
+ Operand(soffset, s1), Operand::c32(start_offset * ctx.program->wave_size));
|
|
+ else
|
|
+ bld.sop1(aco_opcode::s_mov_b32, Definition(soffset, s1),
|
|
+ Operand::c32(start_offset * ctx.program->wave_size));
|
|
+ }
|
|
+
|
|
+ Operand soffset_op;
|
|
+ if (ctx.program->gfx_level >= GFX9)
|
|
+ soffset_op = Operand(overflow ? soffset : stack_reg, s1);
|
|
+ else
|
|
+ soffset_op = soffset_valid || overflow ? Operand(soffset, s1) : Operand(sgpr_null, s1);
|
|
+
|
|
+ for (const auto& spill : spills) {
|
|
+ if (ctx.program->gfx_level >= GFX9) {
|
|
+ if (reload)
|
|
+ bld.scratch(aco_opcode::scratch_load_dword,
|
|
+ Definition(spill.first, linear ? v1.as_linear() : v1), Operand(v1),
|
|
+ soffset_op, overflow ? spill.second - start_offset : spill.second,
|
|
+ memory_sync_info(storage_vgpr_spill, semantic_private));
|
|
+ else
|
|
+ bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), soffset_op,
|
|
+ Operand(spill.first, linear ? v1.as_linear() : v1),
|
|
+ overflow ? spill.second - start_offset : spill.second,
|
|
+ memory_sync_info(storage_vgpr_spill, semantic_private));
|
|
+ } else {
|
|
+ if (reload) {
|
|
+ Instruction* instr = bld.mubuf(
|
|
+ aco_opcode::buffer_load_dword, Definition(spill.first, linear ? v1.as_linear() : v1),
|
|
+ Operand(stack_reg, s4), Operand(v1), soffset_op,
|
|
+ overflow ? spill.second - start_offset : spill.second, false);
|
|
+ instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
|
|
+ instr->mubuf().cache.value = ac_swizzled;
|
|
+ } else {
|
|
+ Instruction* instr =
|
|
+ bld.mubuf(aco_opcode::buffer_store_dword, Operand(stack_reg, s4), Operand(v1),
|
|
+ soffset_op, Operand(spill.first, linear ? v1.as_linear() : v1),
|
|
+ overflow ? spill.second - start_offset : spill.second, false);
|
|
+ instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private);
|
|
+ instr->mubuf().cache.value = ac_swizzled;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ if (overflow && ctx.program->gfx_level < GFX9)
|
|
+ bld.sop2(aco_opcode::s_sub_i32, Definition(soffset, s1), Definition(scc, s1),
|
|
+ Operand(soffset, s1), Operand::c32(start_offset * ctx.program->wave_size));
|
|
+}
|
|
+
|
|
+void
|
|
+emit_spills_reloads(spill_preserved_ctx& ctx, std::vector<aco_ptr<Instruction>>& instructions,
|
|
+ std::vector<aco_ptr<Instruction>>::iterator& insert_point,
|
|
+ std::vector<std::pair<PhysReg, int>>& spills,
|
|
+ std::vector<std::pair<PhysReg, int>>& lvgpr_spills, bool reload)
|
|
+{
|
|
+ auto spill_reload_compare = [](const auto& first, const auto& second)
|
|
+ { return first.second < second.second; };
|
|
+
|
|
+ std::sort(spills.begin(), spills.end(), spill_reload_compare);
|
|
+ std::sort(lvgpr_spills.begin(), lvgpr_spills.end(), spill_reload_compare);
|
|
+
|
|
+ PhysReg stack_reg = (*insert_point)->operands[0].physReg();
|
|
+ PhysReg soffset = (*insert_point)->definitions[0].physReg();
|
|
+ PhysReg exec_backup = (*insert_point)->definitions[1].physReg();
|
|
+
|
|
+ std::vector<aco_ptr<Instruction>> spill_instructions;
|
|
+ Builder bld(ctx.program, &spill_instructions);
|
|
+
|
|
+ emit_spills_reloads_internal(ctx, bld, spills, stack_reg, soffset, reload, false, false);
|
|
+ if (!lvgpr_spills.empty()) {
|
|
+ bld.sop1(Builder::s_or_saveexec, Definition(exec_backup, bld.lm), Definition(scc, s1),
|
|
+ Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));
|
|
+ emit_spills_reloads_internal(ctx, bld, lvgpr_spills, stack_reg, soffset, reload, true, false);
|
|
+ bld.sop1(Builder::WaveSpecificOpcode::s_mov, Definition(exec, bld.lm),
|
|
+ Operand(exec_backup, bld.lm));
|
|
+ }
|
|
+
|
|
+ insert_point = instructions.erase(insert_point);
|
|
+ instructions.insert(insert_point, std::move_iterator(spill_instructions.begin()),
|
|
+ std::move_iterator(spill_instructions.end()));
|
|
+}
|
|
+
|
|
+void
|
|
+init_block_info(spill_preserved_ctx& ctx)
|
|
+{
|
|
+ unsigned cur_loop_header = -1u;
|
|
+ for (unsigned index = ctx.program->blocks.size() - 1; index < ctx.program->blocks.size();) {
|
|
+ const Block& block = ctx.program->blocks[index];
|
|
+
|
|
+ if (block.linear_succs.empty()) {
|
|
+ ctx.dom_info[index].logical_imm_postdom = block.index;
|
|
+ ctx.dom_info[index].linear_imm_postdom = block.index;
|
|
+ } else {
|
|
+ int new_logical_postdom = -1;
|
|
+ int new_linear_postdom = -1;
|
|
+ for (unsigned succ_idx : block.logical_succs) {
|
|
+ if ((int)ctx.dom_info[succ_idx].logical_imm_postdom == -1) {
|
|
+ assert(cur_loop_header == -1u || succ_idx >= cur_loop_header);
|
|
+ if (cur_loop_header == -1u)
|
|
+ cur_loop_header = succ_idx;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (new_logical_postdom == -1) {
|
|
+ new_logical_postdom = (int)succ_idx;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ while ((int)succ_idx != new_logical_postdom) {
|
|
+ if ((int)succ_idx < new_logical_postdom)
|
|
+ succ_idx = ctx.dom_info[succ_idx].logical_imm_postdom;
|
|
+ if ((int)succ_idx > new_logical_postdom)
|
|
+ new_logical_postdom = (int)ctx.dom_info[new_logical_postdom].logical_imm_postdom;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ for (unsigned succ_idx : block.linear_succs) {
|
|
+ if ((int)ctx.dom_info[succ_idx].linear_imm_postdom == -1) {
|
|
+ assert(cur_loop_header == -1u || succ_idx >= cur_loop_header);
|
|
+ if (cur_loop_header == -1u)
|
|
+ cur_loop_header = succ_idx;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ if (new_linear_postdom == -1) {
|
|
+ new_linear_postdom = (int)succ_idx;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ while ((int)succ_idx != new_linear_postdom) {
|
|
+ if ((int)succ_idx < new_linear_postdom)
|
|
+ succ_idx = ctx.dom_info[succ_idx].linear_imm_postdom;
|
|
+ if ((int)succ_idx > new_linear_postdom)
|
|
+ new_linear_postdom = (int)ctx.dom_info[new_linear_postdom].linear_imm_postdom;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ctx.dom_info[index].logical_imm_postdom = new_logical_postdom;
|
|
+ ctx.dom_info[index].linear_imm_postdom = new_linear_postdom;
|
|
+ }
|
|
+
|
|
+ bool seen_reload_vgpr = false;
|
|
+ for (auto& instr : block.instructions) {
|
|
+ if (instr->opcode == aco_opcode::p_reload_preserved_vgpr) {
|
|
+ seen_reload_vgpr = true;
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ add_instr(ctx, index, seen_reload_vgpr, instr);
|
|
+ }
|
|
+
|
|
+ /* Process predecessors of loop headers again, since post-dominance information of the header
|
|
+ * was not available the first time
|
|
+ */
|
|
+ unsigned next_idx = index - 1;
|
|
+ if (index == cur_loop_header) {
|
|
+ assert(block.kind & block_kind_loop_header);
|
|
+ for (auto pred : block.logical_preds)
|
|
+ if (ctx.dom_info[pred].logical_imm_postdom == -1u)
|
|
+ next_idx = std::max(next_idx, pred);
|
|
+ for (auto pred : block.linear_preds)
|
|
+ if (ctx.dom_info[pred].linear_imm_postdom == -1u)
|
|
+ next_idx = std::max(next_idx, pred);
|
|
+ cur_loop_header = -1u;
|
|
+ }
|
|
+ index = next_idx;
|
|
+ }
|
|
+}
|
|
+
|
|
+struct call_spill {
|
|
+ unsigned instr_idx;
|
|
+ std::vector<std::pair<PhysReg, int>> spills;
|
|
+};
|
|
+
|
|
+void
|
|
+emit_call_spills(spill_preserved_ctx& ctx)
|
|
+{
|
|
+ std::set<PhysReg> linear_vgprs;
|
|
+ std::unordered_map<unsigned, std::vector<call_spill>> block_call_spills;
|
|
+
|
|
+ unsigned max_scratch_offset = ctx.next_preserved_offset;
|
|
+
|
|
+ for (auto& block : ctx.program->blocks) {
|
|
+ for (auto it = block.instructions.begin(); it != block.instructions.end(); ++it) {
|
|
+ auto& instr = *it;
|
|
+
|
|
+ if (instr->opcode == aco_opcode::p_call) {
|
|
+ unsigned scratch_offset = ctx.next_preserved_offset;
|
|
+ struct call_spill spill = {
|
|
+ .instr_idx = (unsigned)(it - block.instructions.begin()),
|
|
+ };
|
|
+ for (auto& reg : linear_vgprs) {
|
|
+ if (!instr->call().abi.clobberedRegs.vgpr.contains(reg))
|
|
+ continue;
|
|
+ spill.spills.emplace_back(reg, scratch_offset);
|
|
+ scratch_offset += 4;
|
|
+ }
|
|
+ max_scratch_offset = std::max(max_scratch_offset, scratch_offset);
|
|
+
|
|
+ block_call_spills[block.index].emplace_back(std::move(spill));
|
|
+ } else if (instr->opcode == aco_opcode::p_start_linear_vgpr) {
|
|
+ linear_vgprs.insert(instr->definitions[0].physReg());
|
|
+ } else if (instr->opcode == aco_opcode::p_end_linear_vgpr) {
|
|
+ for (auto& op : instr->operands)
|
|
+ linear_vgprs.erase(op.physReg());
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ /* XXX: This should also be possible on GFX9, although small negative scratch offsets
|
|
+ * seem to hang the GPU, so disable it there for now.
|
|
+ */
|
|
+ if (ctx.program->gfx_level >= GFX10)
|
|
+ for (auto& block_calls : block_call_spills)
|
|
+ for (auto& call_spills : block_calls.second)
|
|
+ for (auto& spill : call_spills.spills)
|
|
+ spill.second -= max_scratch_offset;
|
|
+
|
|
+ for (auto& block_calls : block_call_spills) {
|
|
+ for (unsigned i = 0; i < block_calls.second.size(); ++i) {
|
|
+ auto& block = ctx.program->blocks[block_calls.first];
|
|
+ auto& call = block_calls.second[i];
|
|
+ auto& instr = block.instructions[call.instr_idx];
|
|
+ auto it = block.instructions.begin() + call.instr_idx;
|
|
+ unsigned num_inserted_instrs = 0;
|
|
+
|
|
+ std::vector<aco_ptr<Instruction>> spill_instructions;
|
|
+ Builder bld(ctx.program, &spill_instructions);
|
|
+
|
|
+ PhysReg stack_reg = instr->operands[1].physReg();
|
|
+ PhysReg soffset = PhysReg{UINT32_MAX};
|
|
+ PhysReg scratch_rsrc = PhysReg{UINT32_MAX};
|
|
+ if (ctx.program->gfx_level < GFX9)
|
|
+ scratch_rsrc = instr->operands.back().physReg();
|
|
+
|
|
+ if (ctx.program->gfx_level >= GFX10) {
|
|
+ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1),
|
|
+ Operand(stack_reg, s1), Operand::c32(max_scratch_offset));
|
|
+ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, false, true,
|
|
+ false);
|
|
+ } else if (ctx.program->gfx_level == GFX9) {
|
|
+ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, false, true,
|
|
+ false);
|
|
+ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1),
|
|
+ Operand(stack_reg, s1), Operand::c32(max_scratch_offset));
|
|
+ } else {
|
|
+ emit_spills_reloads_internal(ctx, bld, call.spills, scratch_rsrc, stack_reg, false,
|
|
+ true, true);
|
|
+ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1),
|
|
+ Operand(stack_reg, s1),
|
|
+ Operand::c32(max_scratch_offset * ctx.program->wave_size));
|
|
+ }
|
|
+
|
|
+ it = block.instructions.insert(it, std::move_iterator(spill_instructions.begin()),
|
|
+ std::move_iterator(spill_instructions.end()));
|
|
+ it += spill_instructions.size() + 1;
|
|
+ num_inserted_instrs += spill_instructions.size();
|
|
+
|
|
+ spill_instructions.clear();
|
|
+
|
|
+ if (ctx.program->gfx_level >= GFX10) {
|
|
+ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, true, true,
|
|
+ false);
|
|
+ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1),
|
|
+ Operand(stack_reg, s1), Operand::c32(max_scratch_offset));
|
|
+ } else if (ctx.program->gfx_level == GFX9) {
|
|
+ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1),
|
|
+ Operand(stack_reg, s1), Operand::c32(max_scratch_offset));
|
|
+ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, true, true,
|
|
+ false);
|
|
+ } else {
|
|
+ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1),
|
|
+ Operand(stack_reg, s1),
|
|
+ Operand::c32(max_scratch_offset * ctx.program->wave_size));
|
|
+ emit_spills_reloads_internal(ctx, bld, call.spills, scratch_rsrc, stack_reg, true, true,
|
|
+ true);
|
|
+ }
|
|
+
|
|
+ block.instructions.insert(it, std::move_iterator(spill_instructions.begin()),
|
|
+ std::move_iterator(spill_instructions.end()));
|
|
+ num_inserted_instrs += spill_instructions.size();
|
|
+
|
|
+ for (unsigned j = i + 1; j < block_calls.second.size(); ++j)
|
|
+ block_calls.second[j].instr_idx += num_inserted_instrs;
|
|
+ }
|
|
+ }
|
|
+
|
|
+ ctx.next_preserved_offset = max_scratch_offset;
|
|
+}
|
|
+
|
|
+void
|
|
+emit_preserved_spills(spill_preserved_ctx& ctx)
|
|
+{
|
|
+ std::vector<std::pair<PhysReg, int>> spills;
|
|
+ std::vector<std::pair<PhysReg, int>> lvgpr_spills;
|
|
+
|
|
+ for (auto reg : ctx.preserved_regs)
|
|
+ spill_preserved(ctx, reg, spills, lvgpr_spills);
|
|
+ for (auto reg : ctx.preserved_linear_regs)
|
|
+ spill_preserved(ctx, reg, spills, lvgpr_spills);
|
|
+
|
|
+ auto start_instr = std::find_if(ctx.program->blocks.front().instructions.begin(),
|
|
+ ctx.program->blocks.front().instructions.end(),
|
|
+ [](const auto& instr)
|
|
+ { return instr->opcode == aco_opcode::p_spill_preserved_vgpr; });
|
|
+ emit_spills_reloads(ctx, ctx.program->blocks.front().instructions, start_instr, spills,
|
|
+ lvgpr_spills, false);
|
|
+
|
|
+ auto block_reloads =
|
|
+ std::vector<std::vector<std::pair<PhysReg, int>>>(ctx.program->blocks.size());
|
|
+ auto lvgpr_block_reloads =
|
|
+ std::vector<std::vector<std::pair<PhysReg, int>>>(ctx.program->blocks.size());
|
|
+
|
|
+ for (auto it = ctx.reg_block_uses.begin(); it != ctx.reg_block_uses.end();) {
|
|
+ bool is_linear = ctx.preserved_linear_regs.find(it->first) != ctx.preserved_linear_regs.end();
|
|
+
|
|
+ if (!is_linear && ctx.preserved_regs.find(it->first) == ctx.preserved_regs.end()) {
|
|
+ it = ctx.reg_block_uses.erase(it);
|
|
+ continue;
|
|
+ }
|
|
+
|
|
+ unsigned min_common_postdom = 0;
|
|
+
|
|
+ for (auto succ_idx : it->second) {
|
|
+ while (succ_idx != min_common_postdom) {
|
|
+ if (min_common_postdom < succ_idx) {
|
|
+ min_common_postdom = is_linear
|
|
+ ? ctx.dom_info[min_common_postdom].linear_imm_postdom
|
|
+ : ctx.dom_info[min_common_postdom].logical_imm_postdom;
|
|
+ } else {
|
|
+ succ_idx = is_linear ? ctx.dom_info[succ_idx].linear_imm_postdom
|
|
+ : ctx.dom_info[succ_idx].logical_imm_postdom;
|
|
+ }
|
|
+ }
|
|
+ }
|
|
+
|
|
+ while (std::find_if(ctx.program->blocks[min_common_postdom].instructions.rbegin(),
|
|
+ ctx.program->blocks[min_common_postdom].instructions.rend(),
|
|
+ [](const auto& instr) {
|
|
+ return instr->opcode == aco_opcode::p_reload_preserved_vgpr;
|
|
+ }) == ctx.program->blocks[min_common_postdom].instructions.rend())
|
|
+ min_common_postdom = is_linear ? ctx.dom_info[min_common_postdom].linear_imm_postdom
|
|
+ : ctx.dom_info[min_common_postdom].logical_imm_postdom;
|
|
+
|
|
+ if (is_linear) {
|
|
+ lvgpr_block_reloads[min_common_postdom].emplace_back(
|
|
+ it->first, ctx.preserved_spill_offsets[it->first]);
|
|
+ ctx.preserved_linear_regs.erase(it->first);
|
|
+ } else {
|
|
+ block_reloads[min_common_postdom].emplace_back(it->first,
|
|
+ ctx.preserved_spill_offsets[it->first]);
|
|
+ ctx.preserved_regs.erase(it->first);
|
|
+ }
|
|
+
|
|
+ it = ctx.reg_block_uses.erase(it);
|
|
+ }
|
|
+
|
|
+ for (unsigned i = 0; i < ctx.program->blocks.size(); ++i) {
|
|
+ auto instr_it = std::find_if(
|
|
+ ctx.program->blocks[i].instructions.rbegin(), ctx.program->blocks[i].instructions.rend(),
|
|
+ [](const auto& instr) { return instr->opcode == aco_opcode::p_reload_preserved_vgpr; });
|
|
+ if (instr_it == ctx.program->blocks[i].instructions.rend()) {
|
|
+ assert(block_reloads[i].empty() && lvgpr_block_reloads[i].empty());
|
|
+ continue;
|
|
+ }
|
|
+ auto end_instr = std::prev(instr_it.base());
|
|
+ emit_spills_reloads(ctx, ctx.program->blocks[i].instructions, end_instr, block_reloads[i],
|
|
+ lvgpr_block_reloads[i], true);
|
|
+ }
|
|
+}
|
|
+
|
|
+void
|
|
+spill_preserved(Program* program)
|
|
+{
|
|
+ if (!program->is_callee)
|
|
+ return;
|
|
+
|
|
+ spill_preserved_ctx ctx(program);
|
|
+
|
|
+ init_block_info(ctx);
|
|
+
|
|
+ if (!program->bypass_reg_preservation)
|
|
+ emit_preserved_spills(ctx);
|
|
+
|
|
+ emit_call_spills(ctx);
|
|
+
|
|
+ program->config->scratch_bytes_per_wave = ctx.next_preserved_offset * program->wave_size;
|
|
+}
|
|
+} // namespace aco
|
|
diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build
|
|
index b235f626f97af..38006e78543dc 100644
|
|
--- a/src/amd/compiler/meson.build
|
|
+++ b/src/amd/compiler/meson.build
|
|
@@ -62,6 +62,7 @@ libaco_files = files(
|
|
'aco_scheduler.cpp',
|
|
'aco_scheduler_ilp.cpp',
|
|
'aco_spill.cpp',
|
|
+ 'aco_spill_preserved.cpp',
|
|
'aco_ssa_elimination.cpp',
|
|
'aco_statistics.cpp',
|
|
'aco_util.h',
|
|
--
|
|
GitLab
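Note: the reload placement in aco_spill_preserved.cpp above boils down to finding the nearest block that post-dominates every block using a preserved register, by repeatedly intersecting walks up the post-dominator tree. A minimal standalone sketch of that intersection follows (toy CFG and invented names, not ACO code; it assumes a block's immediate post-dominator always has a higher index than the block itself, which is what the pass relies on for program-ordered blocks with the exit block last):

/* Illustrative sketch only, not ACO code: nearest common post-dominator of a set of
 * blocks, assuming each block's immediate post-dominator has a higher index than the
 * block itself (blocks numbered in program order, exit block last). */
#include <cstdio>
#include <vector>

static unsigned
nearest_common_postdom(const std::vector<unsigned>& ipostdom, const std::vector<unsigned>& blocks)
{
   unsigned common = blocks.front();
   for (unsigned b : blocks) {
      while (b != common) {
         if (common < b)
            common = ipostdom[common]; /* walk the earlier block up the post-dominator tree */
         else
            b = ipostdom[b];
      }
   }
   return common;
}

int
main()
{
   /* toy CFG: 0 -> {1, 2}, 1 -> 3, 2 -> 3, 3 -> 4 (exit) */
   std::vector<unsigned> ipostdom = {3, 3, 3, 4, 4};
   std::printf("place reload in block %u\n", nearest_common_postdom(ipostdom, {1, 2}));
   return 0;
}

In the toy CFG, a register used in blocks 1 and 2 gets its reload placed in their join block 3.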
|
|
|
|
|
|
From 35220611d653ced3a7ed06565c71815e9d135b5e Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Mon, 13 May 2024 06:26:51 +0200
|
|
Subject: [PATCH 58/71] aco: Add cur_reg_demand to Program
|
|
|
|
For checking whether spilling of preserved SGPRs is needed.
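As a rough illustration (simplified stand-in types and limit, not the actual ACO structures), recording the current demand lets a later pass compare SGPR pressure against the callee ABI limit before deciding whether preserved SGPRs must be spilled:

/* Sketch only: simplified stand-ins for Program/RegisterDemand. */
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct RegisterDemand { int16_t vgpr = 0, sgpr = 0; };

struct Program {
   RegisterDemand max_reg_demand;
   RegisterDemand cur_reg_demand; /* demand as of the last live-var update */
   bool is_callee = false;
   int16_t callee_clobberable_sgprs = 0; /* SGPRs the ABI lets the callee clobber */
};

static void
update_demand(Program& p, RegisterDemand d)
{
   p.cur_reg_demand = d;
   p.max_reg_demand.vgpr = std::max(p.max_reg_demand.vgpr, d.vgpr);
   p.max_reg_demand.sgpr = std::max(p.max_reg_demand.sgpr, d.sgpr);
}

static bool
needs_preserved_sgpr_spills(const Program& p)
{
   return p.is_callee && p.cur_reg_demand.sgpr > p.callee_clobberable_sgprs;
}

int
main()
{
   Program p{};
   p.is_callee = true;
   p.callee_clobberable_sgprs = 32;
   update_demand(p, RegisterDemand{24, 40});
   std::printf("spill preserved SGPRs: %s\n", needs_preserved_sgpr_spills(p) ? "yes" : "no");
   return 0;
}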
|
|
---
|
|
src/amd/compiler/aco_ir.h | 1 +
|
|
src/amd/compiler/aco_live_var_analysis.cpp | 1 +
|
|
2 files changed, 2 insertions(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
|
|
index e2101ae5162bc..6f510fef17a04 100644
|
|
--- a/src/amd/compiler/aco_ir.h
|
|
+++ b/src/amd/compiler/aco_ir.h
|
|
@@ -2345,6 +2345,7 @@ public:
|
|
std::vector<Block> blocks;
|
|
std::vector<RegClass> temp_rc = {s1};
|
|
RegisterDemand max_reg_demand = RegisterDemand();
|
|
+ RegisterDemand cur_reg_demand = RegisterDemand();
|
|
ac_shader_config* config;
|
|
struct aco_shader_info info;
|
|
enum amd_gfx_level gfx_level;
|
|
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
|
|
index 64814e983bb2e..52561464b0e1e 100644
|
|
--- a/src/amd/compiler/aco_live_var_analysis.cpp
|
|
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
|
|
@@ -565,6 +565,7 @@ update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
|
|
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
|
|
uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
|
|
|
|
+ program->cur_reg_demand = new_demand;
|
|
/* this won't compile, register pressure reduction necessary */
|
|
if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {
|
|
program->num_waves = 0;
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 20e1d11ec9b648ecc2d41bd5974c91545880e7b8 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Mon, 13 May 2024 06:28:31 +0200
|
|
Subject: [PATCH 59/71] aco: Spill callee-preserved SGPRs
|
|
|
|
---
|
|
src/amd/compiler/aco_opcodes.py | 2 +
|
|
src/amd/compiler/aco_register_allocation.cpp | 46 ++++++-
|
|
src/amd/compiler/aco_scheduler.cpp | 8 ++
|
|
src/amd/compiler/aco_spill.cpp | 119 +++++++++++++++++--
|
|
4 files changed, 167 insertions(+), 8 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
|
|
index 8d0b93a044270..a2f0876838f92 100644
|
|
--- a/src/amd/compiler/aco_opcodes.py
|
|
+++ b/src/amd/compiler/aco_opcodes.py
|
|
@@ -334,7 +334,9 @@ insn("p_unit_test")
|
|
insn("p_callee_stack_ptr")
|
|
|
|
insn("p_spill_preserved_vgpr")
|
|
+insn("p_spill_preserved_sgpr")
|
|
insn("p_reload_preserved_vgpr")
|
|
+insn("p_reload_preserved_sgpr")
|
|
|
|
insn("p_create_vector")
|
|
insn("p_extract_vector")
|
|
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
|
|
index 88f40f894e79c..b8915e520e8e1 100644
|
|
--- a/src/amd/compiler/aco_register_allocation.cpp
|
|
+++ b/src/amd/compiler/aco_register_allocation.cpp
|
|
@@ -3054,11 +3054,35 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
ra_ctx ctx(program, policy);
|
|
get_affinities(ctx);
|
|
|
|
+ std::unordered_set<PhysReg> blocked_sgpr;
|
|
+ if (ctx.program->is_callee) {
|
|
+ PhysRegInterval preserved_sgpr_lo = PhysRegInterval{
|
|
+ .lo_ = PhysReg{ctx.program->arg_sgpr_count},
|
|
+ .size = ctx.program->callee_abi.clobberedRegs.sgpr.lo() - ctx.program->arg_sgpr_count,
|
|
+ };
|
|
+ PhysRegInterval preserved_sgpr_hi = PhysRegInterval{
|
|
+ .lo_ = ctx.program->callee_abi.clobberedRegs.sgpr.hi(),
|
|
+ .size = PhysReg{ctx.sgpr_limit} - ctx.program->callee_abi.clobberedRegs.sgpr.hi(),
|
|
+ };
|
|
+ for (auto reg : preserved_sgpr_lo) {
|
|
+ blocked_sgpr.insert(reg);
|
|
+ adjust_max_used_regs(ctx, RegClass::s1, reg);
|
|
+ }
|
|
+ for (auto reg : preserved_sgpr_hi) {
|
|
+ blocked_sgpr.insert(reg);
|
|
+ adjust_max_used_regs(ctx, RegClass::s1, reg);
|
|
+ }
|
|
+ }
|
|
+
|
|
for (Block& block : program->blocks) {
|
|
ctx.block = &block;
|
|
|
|
/* initialize register file */
|
|
RegisterFile register_file = init_reg_file(ctx, program->live.live_in, block);
|
|
+ for (auto& reg : blocked_sgpr) {
|
|
+ if (register_file.is_empty_or_blocked(reg))
|
|
+ register_file.block(reg, s1);
|
|
+ }
|
|
ctx.war_hint.reset();
|
|
ctx.rr_vgpr_it = {PhysReg{256}};
|
|
ctx.rr_sgpr_it = {PhysReg{0}};
|
|
@@ -3104,7 +3128,27 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
instructions.emplace_back(std::move(instr));
|
|
break;
|
|
}
|
|
- if (instr->opcode == aco_opcode::p_reload_preserved_vgpr && block.linear_succs.empty()) {
|
|
+ if (instr->opcode == aco_opcode::p_spill_preserved_sgpr) {
|
|
+ if (register_file.is_blocked(instr->operands[0].physReg()))
|
|
+ register_file.clear(instr->operands[0]);
|
|
+ blocked_sgpr.erase(instr->operands[0].physReg());
|
|
+ continue;
|
|
+ } else if (instr->opcode == aco_opcode::p_reload_preserved_sgpr) {
|
|
+ blocked_sgpr.insert(instr->operands[0].physReg());
|
|
+ std::vector<unsigned> vars = collect_vars(
|
|
+ ctx, register_file, {instr->operands[0].physReg(), instr->operands[0].size()});
|
|
+ register_file.block(instr->operands[0].physReg(), instr->operands[0].regClass());
|
|
+ ASSERTED bool success = false;
|
|
+ success = get_regs_for_copies(ctx, register_file, parallelcopy, vars, instr,
|
|
+ PhysRegInterval{});
|
|
+ assert(success);
|
|
+
|
|
+ update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0);
|
|
+ register_file.block(instr->operands[0].physReg(), instr->operands[0].regClass());
|
|
+ emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file);
|
|
+ continue;
|
|
+ } else if (instr->opcode == aco_opcode::p_reload_preserved_vgpr &&
|
|
+ block.linear_succs.empty()) {
|
|
PhysRegInterval preserved_vgpr_lo = PhysRegInterval{
|
|
.lo_ = PhysReg{256u + ctx.program->arg_vgpr_count},
|
|
.size = ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256u -
|
|
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
|
|
index 4115c0bf3d7cf..e6eb1e49a4021 100644
|
|
--- a/src/amd/compiler/aco_scheduler.cpp
|
|
+++ b/src/amd/compiler/aco_scheduler.cpp
|
|
@@ -1266,6 +1266,14 @@ schedule_program(Program* program)
|
|
assert(ctx.num_waves > 0);
|
|
ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2),
|
|
int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))};
|
|
+ /* If not all preserved SGPRs in callee shaders were spilled, don't try using them for
|
|
+ * scheduling.
|
|
+ */
|
|
+ if (program->is_callee) {
|
|
+ ctx.mv.max_registers.sgpr =
|
|
+ std::max(std::min(ctx.mv.max_registers.sgpr, program->cur_reg_demand.sgpr),
|
|
+ (int16_t)program->callee_abi.clobberedRegs.sgpr.size);
|
|
+ }
|
|
|
|
/* NGG culling shaders are very sensitive to position export scheduling.
|
|
* Schedule less aggressively when early primitive export is used, and
|
|
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
|
|
index c271cbcf01eb8..e143b51809570 100644
|
|
--- a/src/amd/compiler/aco_spill.cpp
|
|
+++ b/src/amd/compiler/aco_spill.cpp
|
|
@@ -75,6 +75,8 @@ struct spill_ctx {
|
|
std::vector<aco::unordered_map<Temp, uint32_t>> spills_entry;
|
|
std::vector<aco::unordered_map<Temp, uint32_t>> spills_exit;
|
|
|
|
+ std::vector<uint32_t> preserved_spill_ids;
|
|
+
|
|
std::vector<bool> processed;
|
|
std::vector<loop_info> loop;
|
|
|
|
@@ -138,11 +140,27 @@ struct spill_ctx {
|
|
for (auto pair : loop.back().spills)
|
|
add_interference(spill_id, pair.second);
|
|
}
|
|
+ for (auto id : preserved_spill_ids)
|
|
+ add_interference(spill_id, id);
|
|
|
|
spills[to_spill] = spill_id;
|
|
return spill_id;
|
|
}
|
|
|
|
+ uint32_t add_preserved_spill(RegClass rc,
|
|
+ std::vector<aco::unordered_map<Temp, uint32_t>>& block_spills)
|
|
+ {
|
|
+ const uint32_t spill_id = allocate_spill_id(rc);
|
|
+ for (auto& spills : block_spills)
|
|
+ for (auto pair : spills)
|
|
+ add_interference(spill_id, pair.second);
|
|
+ for (auto id : preserved_spill_ids)
|
|
+ add_interference(spill_id, id);
|
|
+ preserved_spill_ids.push_back(spill_id);
|
|
+
|
|
+ return spill_id;
|
|
+ }
|
|
+
|
|
void add_interference(uint32_t first, uint32_t second)
|
|
{
|
|
if (interferences[first].first.type() != interferences[second].first.type())
|
|
@@ -1461,6 +1479,8 @@ end_unused_spill_vgprs(spill_ctx& ctx, Block& block, std::vector<Temp>& vgpr_spi
|
|
if (pair.first.type() == RegType::sgpr && ctx.is_reloaded[pair.second])
|
|
is_used[slots[pair.second] / ctx.wave_size] = true;
|
|
}
|
|
+ for (auto preserved : ctx.preserved_spill_ids)
|
|
+ is_used[slots[preserved] / ctx.wave_size] = true;
|
|
|
|
std::vector<Temp> temps;
|
|
for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) {
|
|
@@ -1635,6 +1655,13 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
|
|
}
|
|
}
|
|
|
|
+ if (!(*it)->definitions[0].isTemp()) {
|
|
+ auto id_it = std::find(ctx.preserved_spill_ids.begin(),
|
|
+ ctx.preserved_spill_ids.end(), spill_id);
|
|
+ assert(id_it != ctx.preserved_spill_ids.end());
|
|
+ ctx.preserved_spill_ids.erase(id_it);
|
|
+ }
|
|
+
|
|
/* reload sgpr: just add the vgpr temp to operands */
|
|
Instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1);
|
|
reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]);
|
|
@@ -1653,6 +1680,37 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
|
|
ctx.program->config->scratch_bytes_per_wave += ctx.vgpr_spill_slots * 4 * ctx.program->wave_size;
|
|
}
|
|
|
|
+void
|
|
+spill_reload_preserved_sgpr(spill_ctx& ctx, std::vector<aco_ptr<Instruction>>& spill_instructions,
|
|
+ std::vector<aco_ptr<Instruction>>& reload_instructions, PhysReg reg)
|
|
+{
|
|
+ uint32_t spill_id = ctx.add_preserved_spill(RegClass::s1, ctx.spills_exit);
|
|
+
|
|
+ aco_ptr<Instruction> spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)};
|
|
+ spill->operands[0] = Operand(reg, RegClass::s1);
|
|
+ spill->operands[1] = Operand::c32(spill_id);
|
|
+
|
|
+ aco_ptr<Instruction> unblock{
|
|
+ create_instruction(aco_opcode::p_spill_preserved_sgpr, Format::PSEUDO, 1, 0)};
|
|
+ unblock->operands[0] = Operand(reg, RegClass::s1);
|
|
+
|
|
+ spill_instructions.emplace_back(std::move(spill));
|
|
+ spill_instructions.emplace_back(std::move(unblock));
|
|
+
|
|
+ aco_ptr<Instruction> block{
|
|
+ create_instruction(aco_opcode::p_reload_preserved_sgpr, Format::PSEUDO, 1, 0)};
|
|
+ block->operands[0] = Operand(reg, RegClass::s1);
|
|
+
|
|
+ aco_ptr<Instruction> reload{create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)};
|
|
+ reload->operands[0] = Operand::c32(spill_id);
|
|
+ reload->definitions[0] = Definition(reg, RegClass::s1);
|
|
+
|
|
+ reload_instructions.emplace_back(std::move(block));
|
|
+ reload_instructions.emplace_back(std::move(reload));
|
|
+
|
|
+ ctx.is_reloaded[spill_id] = true;
|
|
+}
|
|
+
|
|
} /* end namespace */
|
|
|
|
void
|
|
@@ -1663,8 +1721,16 @@ spill(Program* program)
|
|
|
|
program->progress = CompilationProgress::after_spilling;
|
|
|
|
+ const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
|
|
+ const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
|
|
+ uint16_t abi_sgpr_limit =
|
|
+ std::min((uint16_t)(program->callee_abi.clobberedRegs.sgpr.size + program->arg_sgpr_count),
|
|
+ sgpr_limit);
|
|
+ if (!program->is_callee)
|
|
+ abi_sgpr_limit = sgpr_limit;
|
|
+
|
|
/* no spilling when register pressure is low enough */
|
|
- if (program->num_waves > 0)
|
|
+ if (program->num_waves > 0 && program->cur_reg_demand.sgpr <= abi_sgpr_limit)
|
|
return;
|
|
|
|
/* lower to CSSA before spilling to ensure correctness w.r.t. phis */
|
|
@@ -1672,14 +1738,12 @@ spill(Program* program)
|
|
|
|
/* calculate target register demand */
|
|
const RegisterDemand demand = program->max_reg_demand; /* current max */
|
|
- const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
|
|
- const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);
|
|
uint16_t extra_vgprs = 0;
|
|
uint16_t extra_sgprs = 0;
|
|
|
|
/* calculate extra VGPRs required for spilling SGPRs */
|
|
- if (demand.sgpr > sgpr_limit) {
|
|
- unsigned sgpr_spills = demand.sgpr - sgpr_limit;
|
|
+ if (demand.sgpr > abi_sgpr_limit) {
|
|
+ unsigned sgpr_spills = demand.sgpr - abi_sgpr_limit;
|
|
extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1;
|
|
}
|
|
/* add extra SGPRs required for spilling VGPRs */
|
|
@@ -1688,9 +1752,9 @@ spill(Program* program)
|
|
extra_sgprs = 1; /* SADDR */
|
|
else
|
|
extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */
|
|
- if (demand.sgpr + extra_sgprs > sgpr_limit) {
|
|
+ if (demand.sgpr + extra_sgprs > abi_sgpr_limit) {
|
|
/* re-calculate in case something has changed */
|
|
- unsigned sgpr_spills = demand.sgpr + extra_sgprs - sgpr_limit;
|
|
+ unsigned sgpr_spills = demand.sgpr + extra_sgprs - abi_sgpr_limit;
|
|
extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1;
|
|
}
|
|
}
|
|
@@ -1702,10 +1766,51 @@ spill(Program* program)
|
|
gather_ssa_use_info(ctx);
|
|
get_rematerialize_info(ctx);
|
|
|
|
+ /* Prepare spilling of preserved SGPRs. Don't insert the instructions yet so live info
|
|
+ * stays valid.
|
|
+ */
|
|
+ std::vector<aco_ptr<Instruction>> preserved_spill_instructions;
|
|
+ std::vector<aco_ptr<Instruction>> preserved_reload_instructions;
|
|
+ if (demand.sgpr > abi_sgpr_limit && ctx.program->is_callee) {
|
|
+ ctx.preserved_spill_ids.reserve(demand.sgpr - abi_sgpr_limit);
|
|
+
|
|
+ for (PhysReg reg = PhysReg{program->arg_sgpr_count};
|
|
+ reg < program->callee_abi.clobberedRegs.sgpr.lo(); reg = reg.advance(4))
|
|
+ spill_reload_preserved_sgpr(ctx, preserved_spill_instructions,
|
|
+ preserved_reload_instructions, reg);
|
|
+
|
|
+ unsigned max_reg =
|
|
+ std::min((unsigned)program->cur_reg_demand.sgpr + extra_sgprs, (unsigned)sgpr_limit);
|
|
+ for (PhysReg reg = program->callee_abi.clobberedRegs.sgpr.hi(); reg < max_reg;
|
|
+ reg = reg.advance(4))
|
|
+ spill_reload_preserved_sgpr(ctx, preserved_spill_instructions,
|
|
+ preserved_reload_instructions, reg);
|
|
+ }
|
|
+
|
|
/* create spills and reloads */
|
|
for (unsigned i = 0; i < program->blocks.size(); i++)
|
|
spill_block(ctx, i);
|
|
|
|
+ if (!preserved_spill_instructions.empty()) {
|
|
+ auto spill_insert_point = std::find_if(
|
|
+ program->blocks.front().instructions.begin(), program->blocks.front().instructions.end(),
|
|
+ [](const auto& instr) { return instr->opcode == aco_opcode::p_spill_preserved_vgpr; });
|
|
+ assert(spill_insert_point != program->blocks.front().instructions.end());
|
|
+
|
|
+ spill_insert_point = std::next(spill_insert_point);
|
|
+ program->blocks.front().instructions.insert(
|
|
+ spill_insert_point, std::move_iterator(preserved_spill_instructions.begin()),
|
|
+ std::move_iterator(preserved_spill_instructions.end()));
|
|
+
|
|
+ auto reload_insert_point = std::find_if(
|
|
+ program->blocks.back().instructions.begin(), program->blocks.back().instructions.end(),
|
|
+ [](const auto& instr) { return instr->opcode == aco_opcode::p_reload_preserved_vgpr; });
|
|
+ assert(reload_insert_point != program->blocks.back().instructions.end());
|
|
+ program->blocks.back().instructions.insert(
|
|
+ reload_insert_point, std::move_iterator(preserved_reload_instructions.begin()),
|
|
+ std::move_iterator(preserved_reload_instructions.end()));
|
|
+ }
|
|
+
|
|
/* assign spill slots and DCE rematerialized code */
|
|
assign_spill_slots(ctx, extra_vgprs);
|
|
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 3f8defc2ff59734c6e9b2bdc2554fc4f30204a1a Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Mon, 13 May 2024 06:29:40 +0200
|
|
Subject: [PATCH 60/71] aco/ra: Also consider blocked registers as not
|
|
containing temps
|
|
|
|
---
|
|
src/amd/compiler/aco_register_allocation.cpp | 12 ++++++++++--
|
|
1 file changed, 10 insertions(+), 2 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
|
|
index b8915e520e8e1..ff8475e19014d 100644
|
|
--- a/src/amd/compiler/aco_register_allocation.cpp
|
|
+++ b/src/amd/compiler/aco_register_allocation.cpp
|
|
@@ -231,6 +231,14 @@ public:
|
|
return res;
|
|
}
|
|
|
|
+ unsigned count_zero_or_blocked(PhysRegInterval reg_interval) const
|
|
+ {
|
|
+ unsigned res = 0;
|
|
+ for (PhysReg reg : reg_interval)
|
|
+ res += !regs[reg] || regs[reg] == 0xFFFFFFFF;
|
|
+ return res;
|
|
+ }
|
|
+
|
|
/* Returns true if any of the bytes in the given range are allocated or blocked */
|
|
bool test(PhysReg start, unsigned num_bytes) const
|
|
{
|
|
@@ -3501,8 +3509,8 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
|
|
ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false);
|
|
ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false);
|
|
- assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds);
|
|
- assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds);
|
|
+ assert(register_file.count_zero_or_blocked(vgpr_bounds) == ctx.vgpr_bounds);
|
|
+ assert(register_file.count_zero_or_blocked(sgpr_bounds) == ctx.sgpr_bounds);
|
|
} else if (should_compact_linear_vgprs(ctx, register_file)) {
|
|
aco_ptr<Instruction> br = std::move(instructions.back());
|
|
instructions.pop_back();
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 475664aaa95eaf7cf58abef67f524a658363d379 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Mon, 13 May 2024 06:30:35 +0200
|
|
Subject: [PATCH 61/71] aco/ra: Skip blocked regs in get_reg_impl
|
|
|
|
---
|
|
src/amd/compiler/aco_register_allocation.cpp | 2 +-
|
|
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
|
|
index ff8475e19014d..aec47824719a9 100644
|
|
--- a/src/amd/compiler/aco_register_allocation.cpp
|
|
+++ b/src/amd/compiler/aco_register_allocation.cpp
|
|
@@ -1307,7 +1307,7 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, std::vector<parallelcopy
|
|
continue;
|
|
}
|
|
|
|
- if (reg_file[j] == 0 || reg_file[j] == last_var)
|
|
+ if (reg_file[j] == 0 || reg_file[j] == 0xFFFFFFFF || reg_file[j] == last_var)
|
|
continue;
|
|
|
|
if (reg_file[j] == 0xF0000000) {
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 47242773a9872ee2756ff2bceeee154db4c6e5d2 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Mon, 13 May 2024 06:31:01 +0200
|
|
Subject: [PATCH 62/71] aco/isel: Bypass reg preservation for noreturn shaders
|
|
|
|
---
|
|
src/amd/compiler/aco_instruction_selection.cpp | 1 +
|
|
src/amd/compiler/aco_ir.h | 1 +
|
|
2 files changed, 2 insertions(+)
|
|
|
|
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
|
|
index deb97c1867667..6c98777b12689 100644
|
|
--- a/src/amd/compiler/aco_instruction_selection.cpp
|
|
+++ b/src/amd/compiler/aco_instruction_selection.cpp
|
|
@@ -12261,6 +12261,7 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c
|
|
|
|
Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc));
|
|
} else {
|
|
+ ctx.program->bypass_reg_preservation = true;
|
|
Builder(ctx.program, ctx.block).sopp(aco_opcode::s_endpgm);
|
|
}
|
|
|
|
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
|
|
index 6f510fef17a04..2ab9eaa5d653c 100644
|
|
--- a/src/amd/compiler/aco_ir.h
|
|
+++ b/src/amd/compiler/aco_ir.h
|
|
@@ -2390,6 +2390,7 @@ public:
|
|
bool pending_lds_access = false;
|
|
|
|
bool is_callee = false;
|
|
+ bool bypass_reg_preservation = false;
|
|
ABI callee_abi = {};
|
|
unsigned short arg_sgpr_count;
|
|
unsigned short arg_vgpr_count;
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 8de0e68756db0eea3b7e332bf47b295863de41a1 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Fri, 31 May 2024 16:46:28 +0200
|
|
Subject: [PATCH 63/71] aco/ra: Add separate counter for blocked registers
|
|
|
|
We can't assume blocked registers are free in get_reg_impl, but
|
|
we don't want to pessimize register usage estimations either.
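A minimal sketch of the idea, with invented names rather than the actual ra_ctx fields: keep the highest register that actually holds a value separate from the highest register that is merely blocked, and only merge the two when reporting demand:

/* Sketch only: blocked-but-unused registers should not look free to the allocator,
 * yet should only raise the reported demand, not the "actually used" high-water mark. */
#include <algorithm>
#include <cstdint>
#include <cstdio>

struct ra_stats {
   uint16_t max_used = 0;    /* highest reg holding a live temp */
   uint16_t max_blocked = 0; /* highest reg merely reserved (e.g. call-clobbered or preserved) */

   void note_used(uint16_t reg) { max_used = std::max<uint16_t>(max_used, reg + 1); }
   void note_blocked(uint16_t reg) { max_blocked = std::max<uint16_t>(max_blocked, reg + 1); }
   uint16_t reported_demand() const { return std::max(max_used, max_blocked); }
};

int
main()
{
   ra_stats sgprs;
   sgprs.note_used(17);     /* a temp landed in s17 */
   sgprs.note_blocked(105); /* an ABI blocks registers up to s105 around a call */
   std::printf("used %u, reported demand %u\n", (unsigned)sgprs.max_used,
               (unsigned)sgprs.reported_demand());
   return 0;
}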
|
|
---
|
|
src/amd/compiler/aco_register_allocation.cpp | 25 ++++++++++++++++----
|
|
1 file changed, 21 insertions(+), 4 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
|
|
index aec47824719a9..5b4b50652006e 100644
|
|
--- a/src/amd/compiler/aco_register_allocation.cpp
|
|
+++ b/src/amd/compiler/aco_register_allocation.cpp
|
|
@@ -97,6 +97,8 @@ struct ra_ctx {
|
|
aco_ptr<Instruction> phi_dummy;
|
|
uint16_t max_used_sgpr = 0;
|
|
uint16_t max_used_vgpr = 0;
|
|
+ uint16_t max_blocked_sgpr = 0;
|
|
+ uint16_t max_blocked_vgpr = 0;
|
|
uint16_t sgpr_limit;
|
|
uint16_t vgpr_limit;
|
|
std::bitset<512> war_hint;
|
|
@@ -765,6 +767,21 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
|
|
}
|
|
}
|
|
|
|
+void
|
|
+adjust_max_blocked_regs(ra_ctx& ctx, RegType type, unsigned reg)
|
|
+{
|
|
+ uint16_t max_addressible_sgpr = ctx.sgpr_limit;
|
|
+ if (type == RegType::vgpr) {
|
|
+ assert(reg >= 256);
|
|
+ uint16_t hi = reg - 256 - 1;
|
|
+ assert(hi <= 255);
|
|
+ ctx.max_blocked_vgpr = std::max(ctx.max_blocked_vgpr, hi);
|
|
+ } else if (reg <= max_addressible_sgpr) {
|
|
+ uint16_t hi = reg - 1;
|
|
+ ctx.max_blocked_sgpr = std::max(ctx.max_blocked_sgpr, std::min(hi, max_addressible_sgpr));
|
|
+ }
|
|
+}
|
|
+
|
|
enum UpdateRenames {
|
|
rename_not_killed_ops = 0x1,
|
|
};
|
|
@@ -3268,10 +3285,10 @@ register_allocation(Program* program, ra_test_policy policy)
|
|
tmp_file.block(instr->call().abi.clobberedRegs.sgpr);
|
|
tmp_file.block(instr->call().abi.clobberedRegs.vgpr);
|
|
|
|
- adjust_max_used_regs(ctx, RegClass::s1,
|
|
- instr->call().abi.clobberedRegs.sgpr.hi().reg() - 1);
|
|
- adjust_max_used_regs(ctx, RegClass::v1,
|
|
- instr->call().abi.clobberedRegs.vgpr.hi().reg() - 1);
|
|
+ adjust_max_blocked_regs(ctx, RegType::sgpr,
|
|
+ instr->call().abi.clobberedRegs.sgpr.hi().reg());
|
|
+ adjust_max_blocked_regs(ctx, RegType::vgpr,
|
|
+ instr->call().abi.clobberedRegs.vgpr.hi().reg());
|
|
|
|
ASSERTED bool success = false;
|
|
success =
|
|
--
|
|
GitLab
|
|
|
|
|
|
From ffb65b8b229cab1e36a6334344088aa9f0928d3a Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Sat, 1 Jun 2024 11:50:04 +0200
|
|
Subject: [PATCH 64/71] aco/spill: Don't spill scratch_rsrc-related temps
|
|
|
|
These temps are used to create the scratch_rsrc. Spilling them will
|
|
never benefit anything, because assign_spill_slots will insert code
|
|
that keeps them live. Since the spiller assumes all spilled variables
|
|
to be dead, this can cause more variables to be live than intended and
|
|
spilling to fail.
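A small standalone sketch of the resulting rule (invented names, not the spiller's real data structures): candidate selection simply skips temps that spill lowering itself must keep live:

/* Sketch only: when picking a spill candidate by score, skip temps that the spill
 * lowering itself will need to keep live (expressed here as a simple exclusion set). */
#include <cstdio>
#include <unordered_set>
#include <vector>

struct candidate { unsigned temp_id; float score; };

static int
pick_spill_candidate(const std::vector<candidate>& live,
                     const std::unordered_set<unsigned>& never_spill)
{
   int best = -1;
   float best_score = 0.0f;
   for (unsigned i = 0; i < live.size(); i++) {
      if (never_spill.count(live[i].temp_id))
         continue; /* e.g. stack pointer / scratch offset / scratch rsrc temps */
      if (live[i].score > best_score) {
         best = (int)i;
         best_score = live[i].score;
      }
   }
   return best; /* -1 if nothing can be spilled */
}

int
main()
{
   std::vector<candidate> live = {{1, 5.0f}, {2, 9.0f}, {3, 7.0f}};
   std::unordered_set<unsigned> never_spill = {2}; /* pretend temp 2 builds the scratch rsrc */
   std::printf("spill candidate index: %d\n", pick_spill_candidate(live, never_spill));
   return 0;
}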
|
|
---
|
|
src/amd/compiler/aco_spill.cpp | 10 +++++++++-
|
|
1 file changed, 9 insertions(+), 1 deletion(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
|
|
index e143b51809570..b36a15b68e553 100644
|
|
--- a/src/amd/compiler/aco_spill.cpp
|
|
+++ b/src/amd/compiler/aco_spill.cpp
|
|
@@ -371,6 +371,9 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)
|
|
if (var.type() != type || ctx.spills_entry[block_idx].count(var) ||
|
|
var.regClass().is_linear_vgpr())
|
|
continue;
|
|
+ if (var == ctx.program->stack_ptr || var == ctx.program->scratch_offset ||
|
|
+ var == ctx.program->private_segment_buffer)
|
|
+ continue;
|
|
|
|
unsigned can_remat = ctx.remat.count(var);
|
|
if (can_remat > remat || (can_remat == remat && ctx.ssa_infos[t].score() > score)) {
|
|
@@ -415,7 +418,8 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx)
|
|
continue;
|
|
Temp var = phi->definitions[0].getTemp();
|
|
if (var.type() == type && !ctx.spills_entry[block_idx].count(var) &&
|
|
- ctx.ssa_infos[var.id()].score() > score) {
|
|
+ ctx.ssa_infos[var.id()].score() > score && var != ctx.program->stack_ptr &&
|
|
+ var != ctx.program->scratch_offset && var != ctx.program->private_segment_buffer) {
|
|
to_spill = var;
|
|
score = ctx.ssa_infos[var.id()].score();
|
|
}
|
|
@@ -965,6 +969,10 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
|
|
|
|
if (can_rematerialize > do_rematerialize || loop_variable > avoid_respill ||
|
|
ctx.ssa_infos[t].score() > score) {
|
|
+ if (var == ctx.program->stack_ptr || var == ctx.program->scratch_offset ||
|
|
+ var == ctx.program->private_segment_buffer)
|
|
+ continue;
|
|
+
|
|
unsigned cur_operand_idx = -1u;
|
|
bool can_spill = true;
|
|
for (auto it = instr->operands.begin(); it != instr->operands.end(); ++it) {
|
|
--
|
|
GitLab
|
|
|
|
|
|
From 524d5f329cc352e8049ef573a728d47f2f6741e3 Mon Sep 17 00:00:00 2001
|
|
From: Friedrich Vock <friedrich.vock@gmx.de>
|
|
Date: Wed, 5 Jun 2024 11:06:32 +0200
|
|
Subject: [PATCH 65/71] aco/spill: Ignore extra VGPRs/SGPRs for calls
|
|
|
|
For VGPRs, we make sure they're spilled in the spill_preserved pass.
|
|
For SGPRs, we make sure to reinitialize scratch_rsrc after calls.
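Roughly, the spiller's pressure check can be relaxed around calls by the amount that other passes take care of; a hedged standalone sketch with invented names, not the spiller's actual types:

/* Sketch only: around a call, the spill target is relaxed by the registers handled
 * elsewhere (preserved-VGPR spilling, scratch rsrc rebuild), so the spiller does not
 * try to free them itself. */
#include <cstdio>

struct demand { int vgpr, sgpr; };

static bool
must_spill_here(demand current, demand target, demand ignored_around_call, bool at_call)
{
   demand limit = target;
   if (at_call) {
      limit.vgpr += ignored_around_call.vgpr;
      limit.sgpr += ignored_around_call.sgpr;
   }
   return current.vgpr > limit.vgpr || current.sgpr > limit.sgpr;
}

int
main()
{
   demand target = {256, 100};
   demand extra = {2, 5};
   demand at_instr = {257, 103};
   std::printf("spill at call: %s\n", must_spill_here(at_instr, target, extra, true) ? "yes" : "no");
   std::printf("spill elsewhere: %s\n", must_spill_here(at_instr, target, extra, false) ? "yes" : "no");
   return 0;
}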
|
|
---
|
|
src/amd/compiler/aco_spill.cpp | 20 +++++++++++++++-----
|
|
1 file changed, 15 insertions(+), 5 deletions(-)
|
|
|
|
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
|
|
index b36a15b68e553..943a3788a15c2 100644
|
|
--- a/src/amd/compiler/aco_spill.cpp
|
|
+++ b/src/amd/compiler/aco_spill.cpp
|
|
@@ -88,17 +88,20 @@ struct spill_ctx {
|
|
std::set<Instruction*> unused_remats;
|
|
unsigned wave_size;
|
|
|
|
+ RegisterDemand extra_demand;
|
|
+
|
|
unsigned sgpr_spill_slots;
|
|
unsigned vgpr_spill_slots;
|
|
Temp scratch_rsrc;
|
|
|
|
- spill_ctx(const RegisterDemand target_pressure_, Program* program_)
|
|
+ spill_ctx(const RegisterDemand target_pressure_, RegisterDemand extra_demand_, Program* program_)
|
|
: target_pressure(target_pressure_), program(program_), memory(),
|
|
renames(program->blocks.size(), aco::map<Temp, Temp>(memory)),
|
|
spills_entry(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
|
|
spills_exit(program->blocks.size(), aco::unordered_map<Temp, uint32_t>(memory)),
|
|
processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()),
|
|
- remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0)
|
|
+ remat(memory), wave_size(program->wave_size), extra_demand(extra_demand_),
|
|
+ sgpr_spill_slots(0), vgpr_spill_slots(0)
|
|
{}
|
|
|
|
void add_affinity(uint32_t first, uint32_t second)
|
|
@@ -943,8 +946,14 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
|
|
RegisterDemand new_demand = instr->register_demand;
|
|
std::optional<RegisterDemand> live_changes;
|
|
|
|
+ RegisterDemand ignored_regs = {};
|
|
+
|
|
+ /* We spill linear VGPRs for calls in spill_preserved */
|
|
+ if (instr->isCall() || (!instructions.empty() && instructions.back()->isCall()))
|
|
+ ignored_regs += ctx.extra_demand;
|
|
+
|
|
/* if reg pressure is too high, spill variable with furthest next use */
|
|
- while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) {
|
|
+ while ((new_demand - spilled_registers).exceeds(ctx.target_pressure + ignored_regs)) {
|
|
float score = 0.0;
|
|
Temp to_spill = Temp();
|
|
unsigned operand_idx = -1u;
|
|
@@ -953,7 +962,8 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
|
|
unsigned avoid_respill = 0;
|
|
|
|
RegType type = RegType::sgpr;
|
|
- if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr)
|
|
+ if (new_demand.vgpr - spilled_registers.vgpr >
|
|
+ (ctx.target_pressure.vgpr + ignored_regs.vgpr))
|
|
type = RegType::vgpr;
|
|
|
|
for (unsigned t : ctx.program->live.live_in[block_idx]) {
|
|
@@ -1770,7 +1780,7 @@ spill(Program* program)
|
|
const RegisterDemand target(vgpr_limit - extra_vgprs, sgpr_limit - extra_sgprs);
|
|
|
|
/* initialize ctx */
|
|
- spill_ctx ctx(target, program);
|
|
+ spill_ctx ctx(target, RegisterDemand(extra_vgprs, extra_sgprs), program);
|
|
gather_ssa_use_info(ctx);
|
|
get_rematerialize_info(ctx);
|
|
|
|
--
|
|
GitLab
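The core of the change above is that the spill threshold is widened at call boundaries by the demand
that is only reserved for the call. A self-contained sketch of that comparison, with a minimal
Demand struct standing in for aco::RegisterDemand (names and numbers here are illustrative only):

  #include <cstdint>
  #include <iostream>

  // Minimal stand-in for aco::RegisterDemand.
  struct Demand {
     int16_t vgpr = 0, sgpr = 0;
     Demand operator-(Demand o) const { return {int16_t(vgpr - o.vgpr), int16_t(sgpr - o.sgpr)}; }
     Demand operator+(Demand o) const { return {int16_t(vgpr + o.vgpr), int16_t(sgpr + o.sgpr)}; }
     bool exceeds(Demand limit) const { return vgpr > limit.vgpr || sgpr > limit.sgpr; }
  };

  // Decide whether the spiller has to keep spilling at this instruction.
  // At a call (or right after one), the registers reserved for the call ABI
  // are ignored: spilling cannot free them anyway.
  bool must_spill(Demand new_demand, Demand spilled, Demand target,
                  Demand extra_call_demand, bool at_call)
  {
     Demand ignored = at_call ? extra_call_demand : Demand{};
     return (new_demand - spilled).exceeds(target + ignored);
  }

  int main()
  {
     Demand target{64, 80}, extra{8, 6};
     Demand demand{70, 80}, spilled{0, 0};
     std::cout << must_spill(demand, spilled, target, extra, false) << '\n'; // 1: spill
     std::cout << must_spill(demand, spilled, target, extra, true) << '\n';  // 0: tolerated at the call
     return 0;
  }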
From 9bedff4e6eef064be53aaa64c14cb40318e311b9 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Sat, 1 Jun 2024 16:38:24 +0200
Subject: [PATCH 66/71] aco: Add and set block->contains_call

---
src/amd/compiler/aco_instruction_selection.cpp | 1 +
src/amd/compiler/aco_ir.h | 1 +
2 files changed, 2 insertions(+)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index 6c98777b12689..fae8d57479bb8 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -10939,6 +10939,7 @@ visit_call(isel_context* ctx, nir_call_instr* instr)
.return_info = std::move(return_infos),
.scratch_param_size = info.scratch_param_size,
});
+ ctx->block->contains_call = true;
}

void
diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 2ab9eaa5d653c..14f2c07eda7a8 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2219,6 +2219,7 @@ struct Block {
/* this information is needed for predecessors to blocks with phis when
* moving out of ssa */
bool scc_live_out = false;
+ bool contains_call = true;

Block() : index(0) {}
};
--
GitLab
From ca4c18e7be750667c68229346bba989d28255ceb Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Sat, 1 Jun 2024 12:00:48 +0200
Subject: [PATCH 67/71] aco/spill: Reset scratch_rsrc on calls

---
src/amd/compiler/aco_spill.cpp | 46 ++++++++++++++++++++++++++++------
1 file changed, 39 insertions(+), 7 deletions(-)

diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index 943a3788a15c2..61ffd57b497f9 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -93,6 +93,7 @@ struct spill_ctx {
unsigned sgpr_spill_slots;
unsigned vgpr_spill_slots;
Temp scratch_rsrc;
+ unsigned scratch_rsrc_block = -1u;

spill_ctx(const RegisterDemand target_pressure_, RegisterDemand extra_demand_, Program* program_)
: target_pressure(target_pressure_), program(program_), memory(),
@@ -1192,19 +1193,28 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
bool overflow = (ctx.vgpr_spill_slots - 1) * 4 > offset_range;

Builder rsrc_bld(ctx.program);
+ unsigned bld_block = block.index;
if (block.kind & block_kind_top_level) {
rsrc_bld.reset(&instructions);
} else if (ctx.scratch_rsrc == Temp() && (!overflow || ctx.program->gfx_level < GFX9)) {
Block* tl_block = &block;
- while (!(tl_block->kind & block_kind_top_level))
+ while (!(tl_block->kind & block_kind_top_level) &&
+ std::find_if(tl_block->instructions.begin(), tl_block->instructions.end(),
+ [](auto& instr)
+ { return !instr || instr->isCall(); }) == tl_block->instructions.end())
tl_block = &ctx.program->blocks[tl_block->linear_idom];

/* find p_logical_end */
- std::vector<aco_ptr<Instruction>>& prev_instructions = tl_block->instructions;
- unsigned idx = prev_instructions.size() - 1;
- while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end)
- idx--;
- rsrc_bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx));
+ if (tl_block->kind & block_kind_top_level) {
+ std::vector<aco_ptr<Instruction>>& prev_instructions = tl_block->instructions;
+ unsigned idx = prev_instructions.size() - 1;
+ while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end)
+ idx--;
+ rsrc_bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx));
+ bld_block = tl_block->index;
+ } else {
+ rsrc_bld.reset(&instructions);
+ }
}

/* If spilling overflows the constant offset range at any point, we need to emit the soffset
@@ -1232,10 +1242,13 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block,
Operand(ctx.program->stack_ptr), Operand::c32(saddr));
else
ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr));
+ ctx.scratch_rsrc_block = bld_block;
}
} else {
- if (ctx.scratch_rsrc == Temp())
+ if (ctx.scratch_rsrc == Temp()) {
ctx.scratch_rsrc = load_scratch_resource(ctx.program, rsrc_bld, overflow, true);
+ ctx.scratch_rsrc_block = bld_block;
+ }

if (overflow) {
uint32_t soffset =
@@ -1571,6 +1584,22 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
unsigned last_top_level_block_idx = 0;
for (Block& block : ctx.program->blocks) {

+ if (ctx.scratch_rsrc_block < ctx.program->blocks.size() &&
+ !(ctx.program->blocks[ctx.scratch_rsrc_block].kind & block_kind_top_level))
+ ctx.scratch_rsrc = Temp();
+
+ if (block.kind & block_kind_loop_header) {
+ for (unsigned index = block.index;
+ index < ctx.program->blocks.size() &&
+ ctx.program->blocks[index].loop_nest_depth >= block.loop_nest_depth;
+ ++index) {
+ if (ctx.program->blocks[index].contains_call) {
+ ctx.scratch_rsrc = Temp();
+ break;
+ }
+ }
+ }
+
if (block.kind & block_kind_top_level) {
last_top_level_block_idx = block.index;

@@ -1588,6 +1617,9 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr)
Builder bld(ctx.program, &instructions);
for (it = block.instructions.begin(); it != block.instructions.end(); ++it) {

+ if ((*it)->isCall())
+ ctx.scratch_rsrc = Temp();
+
if ((*it)->opcode == aco_opcode::p_spill) {
uint32_t spill_id = (*it)->operands[1].constantValue();

--
GitLab
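What the hunks above implement is essentially cache invalidation: a cached scratch_rsrc may only be
reused while no call could have clobbered the SGPRs holding it. A toy, standalone model of that
discipline (plain structs and placeholder values, not the real spill_ctx):

  #include <cstdint>
  #include <iostream>
  #include <optional>

  // Toy model of the cached scratch resource: we only track whether the
  // cached value is still valid and how often it had to be rebuilt.
  struct ScratchRsrcCache {
     std::optional<uint32_t> cached;   // stand-in for spill_ctx::scratch_rsrc
     unsigned rebuilds = 0;

     uint32_t get()
     {
        if (!cached) {
           cached = 0xC0FFEE + rebuilds; // pretend to emit the descriptor setup code
           ++rebuilds;
        }
        return *cached;
     }
     // Calls clobber the SGPRs holding the descriptor, so drop the cache.
     void on_call() { cached.reset(); }
  };

  int main()
  {
     ScratchRsrcCache cache;
     cache.get();      // first spill: build the descriptor
     cache.get();      // reuse
     cache.on_call();  // a call was emitted in between
     cache.get();      // must be rebuilt after the call
     std::cout << "rebuilds: " << cache.rebuilds << '\n'; // prints 2
     return 0;
  }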
From b90eeacb2aa89b4d33315cc3e49c13611710d945 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Thu, 6 Jun 2024 08:08:02 +0200
Subject: [PATCH 68/71] radv: Re-enable RT pipelines on GFX9+

---
src/amd/vulkan/radv_physical_device.c | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c
index 98826470d4d60..382195e70a77e 100644
--- a/src/amd/vulkan/radv_physical_device.c
+++ b/src/amd/vulkan/radv_physical_device.c
@@ -111,14 +111,10 @@ radv_filter_minmax_enabled(const struct radv_physical_device *pdev)
bool
radv_enable_rt(const struct radv_physical_device *pdev, bool rt_pipelines)
{
- /* Temporarily under construction! */
- if (rt_pipelines)
- return false;
-
if (pdev->info.gfx_level < GFX10_3 && !radv_emulate_rt(pdev))
return false;

- if (rt_pipelines && pdev->use_llvm)
+ if (rt_pipelines && (pdev->use_llvm || pdev->info.gfx_level < GFX9))
return false;

return true;
--
GitLab
From c73f158059b287185f612d3ea1e1ef8bcc46f58b Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Thu, 6 Jun 2024 08:03:43 +0200
Subject: [PATCH 69/71] aco: Add separate register usage tracking for
 ABI-preserved regs

If a shader uses fewer registers than are preserved by an ABI, we'll
want to set the register demand to the actual register usage instead of
the demand set by preserved call registers.

Totals from 11 (0.01% of 81072) affected shaders:
MaxWaves: 120 -> 176 (+46.67%)
Instrs: 9493 -> 9516 (+0.24%)
CodeSize: 54868 -> 55012 (+0.26%); split: -0.03%, +0.29%
VGPRs: 1088 -> 640 (-41.18%)
Latency: 140184 -> 141125 (+0.67%); split: -0.06%, +0.73%
InvThroughput: 38824 -> 35752 (-7.91%); split: -7.93%, +0.02%
VClause: 256 -> 262 (+2.34%)
SClause: 129 -> 136 (+5.43%)
Copies: 1379 -> 1402 (+1.67%); split: -0.15%, +1.81%
VALU: 6386 -> 6405 (+0.30%); split: -0.03%, +0.33%
SALU: 968 -> 972 (+0.41%)
VMEM: 1028 -> 1030 (+0.19%)
---
src/amd/compiler/aco_ir.h | 7 +++-
src/amd/compiler/aco_live_var_analysis.cpp | 24 +++++++----
src/amd/compiler/aco_lower_to_cssa.cpp | 10 ++++-
src/amd/compiler/aco_register_allocation.cpp | 22 +++++-----
src/amd/compiler/aco_scheduler.cpp | 43 +++++++++++++++++++-
src/amd/compiler/aco_spill.cpp | 4 +-
6 files changed, 84 insertions(+), 26 deletions(-)

diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h
index 14f2c07eda7a8..92b21a8b4ed6a 100644
--- a/src/amd/compiler/aco_ir.h
+++ b/src/amd/compiler/aco_ir.h
@@ -2346,7 +2346,7 @@ public:
std::vector<Block> blocks;
std::vector<RegClass> temp_rc = {s1};
RegisterDemand max_reg_demand = RegisterDemand();
- RegisterDemand cur_reg_demand = RegisterDemand();
+ RegisterDemand max_real_reg_demand = RegisterDemand();
ac_shader_config* config;
struct aco_shader_info info;
enum amd_gfx_level gfx_level;
@@ -2485,7 +2485,8 @@ void select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config,
void lower_phis(Program* program);
void lower_subdword(Program* program);
void calc_min_waves(Program* program);
-void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand);
+void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand,
+ const RegisterDemand new_real_demand);
void live_var_analysis(Program* program);
std::vector<uint16_t> dead_code_analysis(Program* program);
void dominator_tree(Program* program);
@@ -2561,6 +2562,8 @@ int get_op_fixed_to_def(Instruction* instr);
RegisterDemand get_live_changes(Instruction* instr);
RegisterDemand get_temp_registers(Instruction* instr);
RegisterDemand get_temp_reg_changes(Instruction* instr);
+void compute_blocked_abi_demand(Program* program, unsigned linear_vgpr_demand,
+ Pseudo_call_instruction& instr);

/* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */
uint16_t get_extra_sgprs(Program* program);
diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp
index 52561464b0e1e..9d6284b38e0a3 100644
--- a/src/amd/compiler/aco_live_var_analysis.cpp
+++ b/src/amd/compiler/aco_live_var_analysis.cpp
@@ -207,6 +207,7 @@ void
process_live_temps_per_block(live_ctx& ctx, Block* block)
{
RegisterDemand new_demand;
+ RegisterDemand real_block_demand;
block->register_demand = RegisterDemand();
IDSet live = compute_live_out(ctx, block);

@@ -363,6 +364,10 @@ process_live_temps_per_block(live_ctx& ctx, Block* block)
operand_demand += new_demand;
insn->register_demand.update(operand_demand);
block->register_demand.update(insn->register_demand);
+ if (insn->isCall())
+ real_block_demand.update(insn->register_demand - insn->call().blocked_abi_demand);
+ else
+ real_block_demand.update(insn->register_demand);
}

/* handle phi definitions */
@@ -419,6 +424,7 @@ process_live_temps_per_block(live_ctx& ctx, Block* block)
block->live_in_demand = new_demand;
block->live_in_demand.sgpr += 2; /* Add 2 SGPRs for potential long-jumps. */
block->register_demand.update(block->live_in_demand);
+ ctx.program->max_real_reg_demand.update(real_block_demand);
ctx.program->max_reg_demand.update(block->register_demand);
ctx.handled_once = std::min(ctx.handled_once, block->index);

@@ -559,29 +565,30 @@ max_suitable_waves(Program* program, uint16_t waves)
}

void
-update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand)
+update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand,
+ const RegisterDemand new_real_demand)
{
assert(program->min_waves >= 1);
uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);

- program->cur_reg_demand = new_demand;
+ program->max_reg_demand = new_demand;
/* this won't compile, register pressure reduction necessary */
if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) {
program->num_waves = 0;
- program->max_reg_demand = new_demand;
} else {
- program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr);
+ program->num_waves =
+ program->dev.physical_sgprs / get_sgpr_alloc(program, new_real_demand.sgpr);
uint16_t vgpr_demand =
- get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2;
+ get_vgpr_alloc(program, new_real_demand.vgpr) + program->config->num_shared_vgprs / 2;
program->num_waves =
std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand);
program->num_waves = std::min(program->num_waves, program->dev.max_waves_per_simd);

/* Adjust for LDS and workgroup multiples and calculate max_reg_demand */
program->num_waves = max_suitable_waves(program, program->num_waves);
- program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
- program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
+ program->max_real_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves);
+ program->max_real_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves);
}
}

@@ -592,6 +599,7 @@ live_var_analysis(Program* program)
program->live.memory.release();
program->live.live_in.resize(program->blocks.size(), IDSet(program->live.memory));
program->max_reg_demand = RegisterDemand();
+ program->max_real_reg_demand = RegisterDemand();
program->needs_vcc = program->gfx_level >= GFX10;

live_ctx ctx;
@@ -607,7 +615,7 @@ live_var_analysis(Program* program)

/* calculate the program's register demand and number of waves */
if (program->progress < CompilationProgress::after_ra)
- update_vgpr_sgpr_demand(program, program->max_reg_demand);
+ update_vgpr_sgpr_demand(program, program->max_reg_demand, program->max_real_reg_demand);
}

} // namespace aco
diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp
index 4268e21d820d2..237aaa01f4bc7 100644
--- a/src/amd/compiler/aco_lower_to_cssa.cpp
+++ b/src/amd/compiler/aco_lower_to_cssa.cpp
@@ -519,6 +519,7 @@ emit_parallelcopies(cssa_ctx& ctx)
}

RegisterDemand new_demand;
+ RegisterDemand real_new_demand;
for (Block& block : ctx.program->blocks) {
/* Finally, rename coalesced phi operands */
for (aco_ptr<Instruction>& phi : block.instructions) {
@@ -538,13 +539,18 @@ emit_parallelcopies(cssa_ctx& ctx)

/* Resummarize the block's register demand */
block.register_demand = block.live_in_demand;
- for (const aco_ptr<Instruction>& instr : block.instructions)
+ for (const aco_ptr<Instruction>& instr : block.instructions) {
block.register_demand.update(instr->register_demand);
+ if (instr->isCall())
+ real_new_demand.update(instr->register_demand - instr->call().blocked_abi_demand);
+ else
+ real_new_demand.update(instr->register_demand);
+ }
new_demand.update(block.register_demand);
}

/* Update max_reg_demand and num_waves */
- update_vgpr_sgpr_demand(ctx.program, new_demand);
+ update_vgpr_sgpr_demand(ctx.program, new_demand, real_new_demand);

assert(renames.empty());
}
diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp
index 5b4b50652006e..e0d6f6bfeaf5a 100644
--- a/src/amd/compiler/aco_register_allocation.cpp
+++ b/src/amd/compiler/aco_register_allocation.cpp
@@ -121,8 +121,8 @@ struct ra_ctx {
sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves);
vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves);

- sgpr_bounds = program->max_reg_demand.sgpr;
- vgpr_bounds = program->max_reg_demand.vgpr;
+ sgpr_bounds = program->max_real_reg_demand.sgpr;
+ vgpr_bounds = program->max_real_reg_demand.vgpr;
num_linear_vgprs = 0;
}
};
@@ -1426,16 +1426,18 @@ increase_register_file(ra_ctx& ctx, RegClass rc)
{
if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 &&
ctx.vgpr_bounds < ctx.vgpr_limit) {
+ RegisterDemand new_demand =
+ RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_real_reg_demand.sgpr);
/* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */
- update_vgpr_sgpr_demand(
- ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr));
+ update_vgpr_sgpr_demand(ctx.program, new_demand, new_demand);

- ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr;
- } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) {
- update_vgpr_sgpr_demand(
- ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1));
+ ctx.vgpr_bounds = ctx.program->max_real_reg_demand.vgpr;
+ } else if (rc.type() == RegType::sgpr && ctx.program->max_real_reg_demand.sgpr < ctx.sgpr_limit) {
+ RegisterDemand new_demand =
+ RegisterDemand(ctx.program->max_real_reg_demand.vgpr, ctx.sgpr_bounds + 1);
+ update_vgpr_sgpr_demand(ctx.program, new_demand, new_demand);

- ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr;
+ ctx.sgpr_bounds = ctx.program->max_real_reg_demand.sgpr;
} else {
return false;
}
@@ -2049,7 +2051,7 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr)
;
if (reg < 0) {
reg = ctx.max_used_sgpr + 1;
- for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
+ for (; reg < ctx.program->max_real_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++)
;
}

diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index e6eb1e49a4021..438e45def661c 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -1271,7 +1271,7 @@ schedule_program(Program* program)
*/
if (program->is_callee) {
ctx.mv.max_registers.sgpr =
- std::max(std::min(ctx.mv.max_registers.sgpr, program->cur_reg_demand.sgpr),
+ std::max(std::min(ctx.mv.max_registers.sgpr, program->max_reg_demand.sgpr),
(int16_t)program->callee_abi.clobberedRegs.sgpr.size);
}

@@ -1291,10 +1291,49 @@ schedule_program(Program* program)

/* update max_reg_demand and num_waves */
RegisterDemand new_demand;
+ RegisterDemand real_new_demand;
for (Block& block : program->blocks) {
new_demand.update(block.register_demand);
+ if (block.contains_call) {
+ unsigned linear_vgpr_demand = 0;
+ for (auto t : program->live.live_in[block.index])
+ if (program->temp_rc[t].is_linear_vgpr())
+ linear_vgpr_demand += program->temp_rc[t].size();
+
+ for (unsigned i = block.instructions.size() - 1; i < block.instructions.size(); --i) {
+ Instruction* instr = block.instructions[i].get();
+
+ for (auto& def : instr->definitions) {
+ if (def.regClass().is_linear_vgpr() && !def.isKill())
+ linear_vgpr_demand -= def.size();
+ }
+ for (auto& op : instr->operands) {
+ if (op.regClass().is_linear_vgpr() && op.isFirstKill())
+ linear_vgpr_demand += op.size();
+ }
+
+ if (!block.instructions[i]->isCall()) {
+ real_new_demand.update(block.instructions[i]->register_demand);
+ continue;
+ }
+
+ compute_blocked_abi_demand(program, linear_vgpr_demand, instr->call());
+
+ const unsigned max_vgpr = get_addr_vgpr_from_waves(program, program->min_waves);
+ const unsigned max_sgpr = get_addr_sgpr_from_waves(program, program->min_waves);
+
+ if (instr->call().abi.clobberedRegs.vgpr.hi() == PhysReg{256 + max_vgpr} &&
+ instr->call().abi.clobberedRegs.sgpr.hi() == PhysReg{max_sgpr})
+ real_new_demand.update(block.instructions[i]->register_demand -
+ instr->call().blocked_abi_demand);
+ else
+ real_new_demand.update(block.instructions[i]->register_demand);
+ }
+ } else {
+ real_new_demand.update(block.register_demand);
+ }
}
- update_vgpr_sgpr_demand(program, new_demand);
+ update_vgpr_sgpr_demand(program, new_demand, real_new_demand);

/* Validate live variable information */
if (!validate_live_vars(program))
diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index 61ffd57b497f9..2ebe7c28fa8fd 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -1780,7 +1780,7 @@ spill(Program* program)
abi_sgpr_limit = sgpr_limit;

/* no spilling when register pressure is low enough */
- if (program->num_waves > 0 && program->cur_reg_demand.sgpr <= abi_sgpr_limit)
+ if (program->num_waves > 0 && program->max_reg_demand.sgpr <= abi_sgpr_limit)
return;

/* lower to CSSA before spilling to ensure correctness w.r.t. phis */
@@ -1830,7 +1830,7 @@ spill(Program* program)
preserved_reload_instructions, reg);

unsigned max_reg =
- std::min((unsigned)program->cur_reg_demand.sgpr + extra_sgprs, (unsigned)sgpr_limit);
+ std::min((unsigned)program->max_reg_demand.sgpr + extra_sgprs, (unsigned)sgpr_limit);
for (PhysReg reg = program->callee_abi.clobberedRegs.sgpr.hi(); reg < max_reg;
reg = reg.advance(4))
spill_reload_preserved_sgpr(ctx, preserved_spill_instructions,
--
GitLab
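The split the patch above introduces can be summarized as: the demand including ABI-blocked
registers bounds register allocation, while only the registers that actually hold values should
limit occupancy. A small standalone model of that computation, with stand-in types and toy
occupancy numbers rather than real hardware limits:

  #include <algorithm>
  #include <cstdint>
  #include <iostream>

  struct Demand {
     int16_t vgpr = 0, sgpr = 0;
     Demand operator-(Demand o) const { return {int16_t(vgpr - o.vgpr), int16_t(sgpr - o.sgpr)}; }
     void update(Demand o) { vgpr = std::max(vgpr, o.vgpr); sgpr = std::max(sgpr, o.sgpr); }
  };

  struct InstrModel {
     Demand register_demand;      // demand with ABI-blocked regs included
     Demand blocked_abi_demand;   // zero for non-call instructions
     bool is_call = false;
  };

  // Toy occupancy model: 256 addressable VGPRs, 4-register allocation granularity.
  static unsigned waves_for_vgprs(int16_t vgprs)
  {
     int granule = (std::max<int>(vgprs, 4) + 3) / 4 * 4;
     return std::min(16, 256 / granule);
  }

  int main()
  {
     InstrModel instrs[] = {
        {{32, 20}, {}, false},
        {{160, 40}, {112, 16}, true}, // call: most of the demand is only blocked
        {{40, 24}, {}, false},
     };

     Demand max_demand, max_real_demand;
     for (const InstrModel& i : instrs) {
        max_demand.update(i.register_demand);
        max_real_demand.update(i.is_call ? i.register_demand - i.blocked_abi_demand
                                         : i.register_demand);
     }

     std::cout << "RA bound (vgpr):   " << max_demand.vgpr << '\n';       // 160
     std::cout << "occupancy by real: " << waves_for_vgprs(max_real_demand.vgpr) << " waves\n";
     std::cout << "occupancy by max:  " << waves_for_vgprs(max_demand.vgpr) << " waves\n";
     return 0;
  }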
From 450c3456e89dd5d8604128482be7768eebda4b1e Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Tue, 4 Jun 2024 15:08:48 +0200
Subject: [PATCH 70/71] aco/spill: Restore registers spilled by call
 immediately

Makes for better latency hiding if we're not short on registers
otherwise.

Totals from 7 (0.01% of 81072) affected shaders:
Instrs: 9084 -> 8980 (-1.14%)
CodeSize: 52564 -> 51976 (-1.12%)
SpillSGPRs: 244 -> 248 (+1.64%); split: -3.28%, +4.92%
SpillVGPRs: 360 -> 367 (+1.94%)
Latency: 138989 -> 135669 (-2.39%); split: -2.49%, +0.10%
InvThroughput: 35120 -> 35301 (+0.52%); split: -0.06%, +0.57%
VClause: 258 -> 241 (-6.59%)
SClause: 116 -> 117 (+0.86%)
Copies: 1290 -> 1311 (+1.63%)
Branches: 131 -> 119 (-9.16%)
VALU: 6125 -> 6143 (+0.29%); split: -0.20%, +0.49%
SALU: 920 -> 913 (-0.76%); split: -0.98%, +0.22%
VMEM: 1026 -> 989 (-3.61%)
---
src/amd/compiler/aco_spill.cpp | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)

diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp
index 2ebe7c28fa8fd..dea810ce42cf4 100644
--- a/src/amd/compiler/aco_spill.cpp
+++ b/src/amd/compiler/aco_spill.cpp
@@ -908,6 +908,8 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s

auto& current_spills = ctx.spills_exit[block_idx];

+ std::vector<Temp> call_spills;
+
while (idx < block->instructions.size()) {
aco_ptr<Instruction>& instr = block->instructions[idx];

@@ -922,6 +924,22 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s

std::map<Temp, std::pair<Temp, uint32_t>> reloads;

+ if (!call_spills.empty()) {
+ RegisterDemand demand = instr->register_demand;
+ while (!(demand - spilled_registers).exceeds(ctx.target_pressure) &&
+ !call_spills.empty()) {
+ Temp old_tmp = call_spills.back();
+ call_spills.pop_back();
+
+ Temp new_tmp = ctx.program->allocateTmp(ctx.program->temp_rc[old_tmp.id()]);
+ ctx.renames[block_idx][old_tmp] = new_tmp;
+ reloads[old_tmp] = std::make_pair(new_tmp, current_spills[old_tmp]);
+ current_spills.erase(old_tmp);
+ spilled_registers -= new_tmp;
+ }
+ call_spills.clear();
+ }
+
/* rename and reload operands */
for (Operand& op : instr->operands) {
if (!op.isTemp())
@@ -1051,6 +1069,9 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s
}

uint32_t spill_id = ctx.add_to_spills(to_spill, current_spills);
+ if (instr->isCall())
+ call_spills.emplace_back(to_spill);
+
/* add interferences with reloads */
for (std::pair<const Temp, std::pair<Temp, uint32_t>>& pair : reloads)
ctx.add_interference(spill_id, pair.second.second);
--
GitLab
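The added loop drains the call-induced spills right after the call for as long as reloading stays
below the pressure target. A simplified, self-contained sketch of that drain loop (stand-in types,
one register per temp, not the actual spill_ctx logic):

  #include <cstdint>
  #include <iostream>
  #include <vector>

  struct Demand {
     int16_t vgpr = 0;
     Demand operator-(Demand o) const { return {int16_t(vgpr - o.vgpr)}; }
     bool exceeds(Demand limit) const { return vgpr > limit.vgpr; }
  };

  // Temps spilled only because of the call ABI, each one register wide here.
  // Reload as many as fit below the pressure target; the rest stay spilled.
  unsigned reload_after_call(std::vector<uint32_t>& call_spills, Demand demand,
                             Demand& spilled, Demand target)
  {
     unsigned reloaded = 0;
     while (!call_spills.empty() && !(demand - spilled).exceeds(target)) {
        call_spills.pop_back(); // would emit the reload and rename uses here
        spilled.vgpr -= 1;      // the value is live again
        ++reloaded;
     }
     call_spills.clear();
     return reloaded;
  }

  int main()
  {
     std::vector<uint32_t> call_spills = {10, 11, 12, 13};
     Demand spilled{4};              // all four are currently in scratch
     Demand demand{67}, target{64};  // only head-room for two reloads
     unsigned n = reload_after_call(call_spills, demand, spilled, target);
     std::cout << n << " reloaded, " << spilled.vgpr << " still spilled\n"; // 2 reloaded, 2 still spilled
     return 0;
  }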
From ceea1b8cab549bf5b79c51c54c6f995a5fa79a62 Mon Sep 17 00:00:00 2001
From: Friedrich Vock <friedrich.vock@gmx.de>
Date: Tue, 4 Jun 2024 15:12:21 +0200
Subject: [PATCH 71/71] aco/vn: Don't combine expressions across calls

Combining expressions across calls increases the state that is live across
the call, which in turn increases spilling and makes for slower shaders
overall.

Totals from 7 (0.01% of 81072) affected shaders:
Instrs: 8980 -> 8955 (-0.28%); split: -0.88%, +0.60%
CodeSize: 51976 -> 51684 (-0.56%); split: -1.02%, +0.46%
SpillSGPRs: 248 -> 244 (-1.61%); split: -3.63%, +2.02%
SpillVGPRs: 367 -> 365 (-0.54%); split: -1.09%, +0.54%
Scratch: 32768 -> 31744 (-3.12%)
Latency: 135669 -> 128720 (-5.12%); split: -5.13%, +0.01%
InvThroughput: 35301 -> 34783 (-1.47%); split: -1.51%, +0.05%
VClause: 241 -> 242 (+0.41%)
SClause: 117 -> 120 (+2.56%)
Copies: 1311 -> 1338 (+2.06%); split: -0.69%, +2.75%
PreSGPRs: 899 -> 895 (-0.44%); split: -1.56%, +1.11%
PreVGPRs: 1103 -> 1099 (-0.36%)
VALU: 6143 -> 6098 (-0.73%); split: -1.22%, +0.49%
SALU: 913 -> 933 (+2.19%); split: -0.11%, +2.30%
VMEM: 989 -> 967 (-2.22%)
SMEM: 201 -> 214 (+6.47%)
---
src/amd/compiler/aco_opt_value_numbering.cpp | 24 ++++++++++++++++++++
1 file changed, 24 insertions(+)

diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
index a199216907a5e..c35dbdaa5fcb7 100644
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@@ -43,6 +43,8 @@ struct InstrHash {
for (const Operand& op : instr->operands)
hash = murmur_32_scramble(hash, op.constantValue());

+ hash = murmur_32_scramble(hash, instr->pass_flags >> 16);
+
size_t data_size = get_instr_data_size(instr->format);

/* skip format, opcode and pass_flags and op/def spans */
@@ -240,6 +242,9 @@ struct vn_ctx {
expr_set expr_values;
aco::unordered_map<uint32_t, Temp> renames;

+ /* For each block, a counter of how many calls were encountered in the linear/logical CFG. */
+ std::vector<std::pair<uint32_t, uint32_t>> call_indices;
+
/* The exec id should be the same on the same level of control flow depth.
* Together with the check for dominator relations, it is safe to assume
* that the same exec_id also means the same execution mask.
@@ -254,6 +259,7 @@ struct vn_ctx {
for (Block& block : program->blocks)
size += block.instructions.size();
expr_values.reserve(size);
+ call_indices.resize(program->blocks.size(), {0, 0});
}
};

@@ -341,6 +347,13 @@ process_block(vn_ctx& ctx, Block& block)
std::vector<aco_ptr<Instruction>> new_instructions;
new_instructions.reserve(block.instructions.size());

+ uint32_t linear_call_idx = 0;
+ uint32_t logical_call_idx = 0;
+ for (auto index : block.linear_preds)
+ linear_call_idx = std::max(linear_call_idx, ctx.call_indices[index].first);
+ for (auto index : block.logical_preds)
+ logical_call_idx = std::max(logical_call_idx, ctx.call_indices[index].second);
+
for (aco_ptr<Instruction>& instr : block.instructions) {
/* first, rename operands */
for (Operand& op : instr->operands) {
@@ -354,6 +367,10 @@ process_block(vn_ctx& ctx, Block& block)
if (instr->opcode == aco_opcode::p_discard_if ||
instr->opcode == aco_opcode::p_demote_to_helper || instr->opcode == aco_opcode::p_end_wqm)
ctx.exec_id++;
+ if (instr->isCall()) {
+ ++linear_call_idx;
+ ++logical_call_idx;
+ }

/* simple copy-propagation through renaming */
bool copy_instr =
@@ -370,7 +387,12 @@ process_block(vn_ctx& ctx, Block& block)
continue;
}

+ bool use_linear_call_idx =
+ std::any_of(instr->definitions.begin(), instr->definitions.end(),
+ [](const auto& def) { return def.regClass().is_linear(); });
+
instr->pass_flags = ctx.exec_id;
+ instr->pass_flags |= (use_linear_call_idx ? linear_call_idx : logical_call_idx) << 16;
std::pair<expr_set::iterator, bool> res = ctx.expr_values.emplace(instr.get(), block.index);

/* if there was already an expression with the same value number */
@@ -409,6 +431,8 @@ process_block(vn_ctx& ctx, Block& block)
}
}

+ ctx.call_indices[block.index] = {linear_call_idx, logical_call_idx};
+
block.instructions = std::move(new_instructions);
}

--
GitLab
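The mechanism added above tags each instruction's pass_flags with a per-block call counter, so
otherwise identical expressions on opposite sides of a call hash differently and are never merged.
A self-contained sketch of the same idea on a toy value-numbering key (not the real InstrHash):

  #include <cstdint>
  #include <iostream>
  #include <tuple>
  #include <unordered_set>

  // Toy CSE key: an opcode, one operand id, and the index of the last call
  // seen before the instruction. Bumping the call index after every call
  // keeps expressions from being combined across the call.
  struct Key {
     uint32_t opcode, operand, call_idx;
     bool operator==(const Key& o) const
     { return std::tie(opcode, operand, call_idx) == std::tie(o.opcode, o.operand, o.call_idx); }
  };
  struct KeyHash {
     size_t operator()(const Key& k) const
     { return (size_t(k.opcode) * 0x9E3779B9u) ^ (k.operand << 8) ^ (size_t(k.call_idx) << 24); }
  };

  int main()
  {
     std::unordered_set<Key, KeyHash> seen;
     uint32_t call_idx = 0;

     auto try_cse = [&](uint32_t opcode, uint32_t operand) {
        return !seen.insert(Key{opcode, operand, call_idx}).second; // true: reuse earlier value
     };

     std::cout << try_cse(1, 42) << '\n'; // 0: first occurrence
     std::cout << try_cse(1, 42) << '\n'; // 1: combined, no call in between
     ++call_idx;                          // a call instruction was encountered
     std::cout << try_cse(1, 42) << '\n'; // 0: not combined across the call
     return 0;
  }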