diff --git a/patches/rtperf.diff b/patches/rtperf.diff deleted file mode 100644 index f102864..0000000 --- a/patches/rtperf.diff +++ /dev/null @@ -1,11451 +0,0 @@ -From dc44099798c94c194dedcb107e7aadee0d4c8e0b Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Tue, 4 Jun 2024 15:09:20 +0200 -Subject: [PATCH 01/71] !29536 - ---- - src/amd/compiler/aco_interface.cpp | 2 + - src/amd/compiler/aco_ir.h | 1 + - src/amd/compiler/aco_vectorize_spills.cpp | 253 ++++++++++++++++++++++ - src/amd/compiler/meson.build | 1 + - 4 files changed, 257 insertions(+) - create mode 100644 src/amd/compiler/aco_vectorize_spills.cpp - -diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp -index bc24b31a5bb6a..32a28908f90f0 100644 ---- a/src/amd/compiler/aco_interface.cpp -+++ b/src/amd/compiler/aco_interface.cpp -@@ -152,6 +152,8 @@ aco_postprocess_shader(const struct aco_compiler_options* options, - schedule_program(program.get()); - validate(program.get()); - -+ vectorize_spills(program.get()); -+ - /* Register Allocation */ - register_allocation(program.get()); - -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 7542c1e0db143..96bc3c540e0bf 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2263,6 +2263,7 @@ void combine_delay_alu(Program* program); - bool dealloc_vgprs(Program* program); - void insert_NOPs(Program* program); - void form_hard_clauses(Program* program); -+void vectorize_spills(Program* program); - unsigned emit_program(Program* program, std::vector& code, - std::vector* symbols = NULL, bool append_endpgm = true); - /** -diff --git a/src/amd/compiler/aco_vectorize_spills.cpp b/src/amd/compiler/aco_vectorize_spills.cpp -new file mode 100644 -index 0000000000000..b989306b5a3c2 ---- /dev/null -+++ b/src/amd/compiler/aco_vectorize_spills.cpp -@@ -0,0 +1,253 @@ -+/* -+ * Copyright © 2024 Valve Corporation -+ * -+ * SPDX-License-Identifier: MIT -+ */ -+ -+#include "aco_builder.h" -+#include "aco_ir.h" -+ -+#include -+ -+namespace aco { -+ -+struct vectorize_ctx { -+ std::vector> instrs_to_vectorize; -+ -+ std::vector> vectors; -+ std::vector> vectorized_instrs; -+ -+ std::vector component_idxs; -+ -+ std::unordered_set killed_soffset_ids; -+ std::unordered_set seen_soffset_ids; -+ -+ std::vector>::iterator insert_point; -+ Block* block; -+ Program* program; -+}; -+ -+void -+vectorize_and_insert(vectorize_ctx& ctx, bool store) -+{ -+ std::sort(ctx.instrs_to_vectorize.begin(), ctx.instrs_to_vectorize.end(), -+ [](const auto& one, const auto& other) -+ { return one->scratch().offset < other->scratch().offset; }); -+ -+ Builder instr_bld(ctx.program, &ctx.vectorized_instrs); -+ -+ for (unsigned i = 0; i < ctx.instrs_to_vectorize.size(); ++i) { -+ ctx.component_idxs.push_back(i); -+ for (auto j = i + 1; j < ctx.instrs_to_vectorize.size(); ++j) { -+ const auto& component = ctx.instrs_to_vectorize[ctx.component_idxs.back()]; -+ const auto& instr = ctx.instrs_to_vectorize[j]; -+ /* skip stores with unrelated soffset */ -+ if (instr->operands[1].tempId() != component->operands[1].tempId()) -+ continue; -+ int16_t next_offset; -+ if (store) -+ next_offset = component->scratch().offset + (int16_t)component->operands[2].bytes(); -+ else -+ next_offset = component->scratch().offset + (int16_t)component->definitions[0].bytes(); -+ -+ /* there's a gap, can't vectorize across it */ -+ if (instr->scratch().offset > next_offset) -+ break; -+ /* XXX: Hitting this means there are intersecting stores. This shouldn't happen! 
*/ -+ if (instr->scratch().offset != next_offset) -+ break; -+ -+ if (instr->operands[1].isKill()) -+ ctx.killed_soffset_ids.insert(instr->operands[1].tempId()); -+ -+ ctx.component_idxs.push_back(j); -+ } -+ -+ if (ctx.component_idxs.empty()) -+ continue; -+ -+ size_t comp_idx = 0; -+ while (comp_idx < ctx.component_idxs.size()) { -+ size_t vector_size = 4; -+ while (vector_size > ctx.component_idxs.size() - comp_idx) -+ vector_size >>= 1; -+ -+ auto& first_component = ctx.instrs_to_vectorize[ctx.component_idxs[comp_idx]]; -+ -+ if (vector_size == 1) { -+ ctx.vectorized_instrs.emplace_back(std::move(first_component)); -+ ++comp_idx; -+ continue; -+ } -+ -+ if (store) { -+ Temp vec_tmp = ctx.program->allocateTmp(RegClass(RegType::vgpr, vector_size)); -+ Instruction* vec = -+ create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, vector_size, 1); -+ for (unsigned c = 0; c < vector_size; ++c) { -+ auto& component = ctx.instrs_to_vectorize[ctx.component_idxs[comp_idx + c]]; -+ vec->operands[c] = component->operands[2]; -+ } -+ vec->definitions[0] = Definition(vec_tmp); -+ ctx.vectors.emplace_back(vec); -+ -+ aco_opcode opcode; -+ switch (vector_size) { -+ case 4: opcode = aco_opcode::scratch_store_dwordx4; break; -+ case 2: opcode = aco_opcode::scratch_store_dwordx2; break; -+ default: unreachable("invalid vector size"); -+ } -+ -+ Operand vec_op = Operand(vec_tmp); -+ vec_op.setFirstKill(true); -+ instr_bld.scratch(opcode, Operand(v1), first_component->operands[1], vec_op, -+ first_component->scratch().offset, first_component->scratch().sync); -+ } else { -+ Temp vec_tmp = ctx.program->allocateTmp(RegClass(RegType::vgpr, vector_size)); -+ -+ aco_opcode opcode; -+ switch (vector_size) { -+ case 4: opcode = aco_opcode::scratch_load_dwordx4; break; -+ case 2: opcode = aco_opcode::scratch_load_dwordx2; break; -+ default: unreachable("invalid vector size"); -+ } -+ -+ instr_bld.scratch(opcode, Definition(vec_tmp), Operand(v1), -+ first_component->operands[1], first_component->scratch().offset, -+ first_component->scratch().sync); -+ -+ Instruction* vec = -+ create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, vector_size); -+ for (unsigned c = 0; c < vector_size; ++c) { -+ auto& component = ctx.instrs_to_vectorize[ctx.component_idxs[comp_idx + c]]; -+ vec->definitions[c] = component->definitions[0]; -+ } -+ vec->operands[0] = Operand(vec_tmp); -+ vec->operands[0].setFirstKill(true); -+ ctx.vectors.emplace_back(vec); -+ } -+ comp_idx += vector_size; -+ } -+ -+ for (unsigned j = 0; j < ctx.component_idxs.size(); ++j) { -+ auto idx = ctx.component_idxs[j]; -+ ctx.instrs_to_vectorize.erase(ctx.instrs_to_vectorize.begin() + (idx - j)); -+ } -+ /* Adjust for deleted instruction */ -+ --i; -+ -+ ctx.component_idxs.clear(); -+ } -+ -+ for (auto it = ctx.vectorized_instrs.rbegin(); it != ctx.vectorized_instrs.rend(); ++it) { -+ auto soffset_id = (*it)->operands[1].tempId(); -+ if (ctx.seen_soffset_ids.find(soffset_id) == ctx.seen_soffset_ids.end()) { -+ if (ctx.killed_soffset_ids.find(soffset_id) != ctx.killed_soffset_ids.end()) -+ (*it)->operands[1].setFirstKill(true); -+ ctx.seen_soffset_ids.insert(soffset_id); -+ } -+ } -+ -+ if (store) { -+ ctx.insert_point = -+ ctx.block->instructions.insert(ctx.insert_point, std::move_iterator(ctx.vectors.begin()), -+ std::move_iterator(ctx.vectors.end())); -+ ctx.insert_point += ctx.vectors.size(); -+ ctx.insert_point = ctx.block->instructions.insert( -+ ctx.insert_point, std::move_iterator(ctx.vectorized_instrs.rbegin()), -+ 
std::move_iterator(ctx.vectorized_instrs.rend())); -+ ctx.insert_point += ctx.vectorized_instrs.size(); -+ } else { -+ ctx.insert_point = ctx.block->instructions.insert( -+ ctx.insert_point, std::move_iterator(ctx.vectorized_instrs.rbegin()), -+ std::move_iterator(ctx.vectorized_instrs.rend())); -+ ctx.insert_point += ctx.vectorized_instrs.size(); -+ ctx.insert_point = -+ ctx.block->instructions.insert(ctx.insert_point, std::move_iterator(ctx.vectors.begin()), -+ std::move_iterator(ctx.vectors.end())); -+ ctx.insert_point += ctx.vectors.size(); -+ } -+ -+ ctx.vectors.clear(); -+ ctx.vectorized_instrs.clear(); -+ ctx.instrs_to_vectorize.clear(); -+ ctx.seen_soffset_ids.clear(); -+ ctx.killed_soffset_ids.clear(); -+} -+ -+void -+vectorize_spills(Program* program) -+{ -+ vectorize_ctx ctx; -+ ctx.program = program; -+ aco::monotonic_buffer_resource memory; -+ -+ for (auto& block : program->blocks) { -+ ctx.block = █ -+ IDSet conflicting_temps(memory); -+ -+ /* Try vectorizing stores */ -+ for (auto it = block.instructions.begin(); it != block.instructions.end();) { -+ bool vectorize_now = !(*it)->isVMEM() && it != block.instructions.begin(); -+ -+ /* Only look for stores that kill their operand. We can move/combine these with other -+ * instructions without affecting register demand. -+ */ -+ if ((*it)->opcode == aco_opcode::scratch_store_dword && (*it)->operands[2].isKill() && -+ !(*it)->operands[2].regClass().is_subdword()) { -+ if (conflicting_temps.count((*it)->operands[2].tempId())) { -+ vectorize_now = true; -+ --it; -+ } else { -+ bool first = ctx.instrs_to_vectorize.empty(); -+ ctx.instrs_to_vectorize.emplace_back(std::move(*it)); -+ it = block.instructions.erase(it); -+ if (first) -+ ctx.insert_point = it; -+ continue; -+ } -+ } -+ -+ if (vectorize_now) { -+ auto clause_size = it - ctx.insert_point; -+ vectorize_and_insert(ctx, true); -+ it = ctx.insert_point + clause_size; -+ conflicting_temps = IDSet(memory); -+ } else { -+ for (auto& def : (*it)->definitions) -+ if (def.isTemp()) -+ conflicting_temps.insert(def.tempId()); -+ } -+ ++it; -+ } -+ /* Try vectorizing loads */ -+ for (auto it = block.instructions.begin(); it != block.instructions.end();) { -+ bool vectorize_now = !(*it)->isVMEM() && it != block.instructions.begin(); -+ for (auto& op : (*it)->operands) { -+ if (op.isTemp() && conflicting_temps.count(op.tempId())) { -+ vectorize_now = true; -+ --it; -+ } -+ } -+ -+ /* Loads that kill their definition are dead and shouldn't appear with spilling */ -+ if (!vectorize_now && (*it)->opcode == aco_opcode::scratch_load_dword && -+ !(*it)->definitions[0].isKill() && !(*it)->definitions[0].regClass().is_subdword()) { -+ ctx.instrs_to_vectorize.emplace_back(std::move(*it)); -+ conflicting_temps.insert((*it)->definitions[0].tempId()); -+ it = block.instructions.erase(it); -+ continue; -+ } -+ -+ if (vectorize_now) { -+ ctx.insert_point = it; -+ vectorize_and_insert(ctx, false); -+ it = ctx.insert_point; -+ conflicting_temps = IDSet(memory); -+ } -+ ++it; -+ } -+ } -+} -+ -+} // namespace aco -diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build -index ae2d6a41b793a..b235f626f97af 100644 ---- a/src/amd/compiler/meson.build -+++ b/src/amd/compiler/meson.build -@@ -66,6 +66,7 @@ libaco_files = files( - 'aco_statistics.cpp', - 'aco_util.h', - 'aco_validate.cpp', -+ 'aco_vectorize_spills.cpp', - ) - - cpp_args_aco = cpp.get_supported_arguments(['-fno-exceptions', '-fno-rtti', '-Wimplicit-fallthrough', '-Wshadow']) --- -GitLab - - -From 
8123a30fc5553bbf237833fbb7a5b39ce677664d Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 25 Mar 2024 16:52:45 +0100 -Subject: [PATCH 02/71] !29576 - ---- - src/amd/compiler/aco_ir.h | 1 + - src/amd/compiler/aco_register_allocation.cpp | 316 +++++++++++-------- - 2 files changed, 193 insertions(+), 124 deletions(-) - -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 96bc3c540e0bf..8a501797092ed 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -742,6 +742,7 @@ public: - isPrecolored_ = isFixed_; - } - -+ - constexpr bool isConstant() const noexcept { return isConstant_; } - - constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; } -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index 7ff35c079e2ed..fc62487627fad 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -14,8 +14,17 @@ - #include - #include - #include -+#include -+#include -+#include - #include - -+namespace std { -+template <> struct hash { -+ size_t operator()(aco::PhysReg temp) const noexcept { return std::hash{}(temp.reg_b); } -+}; -+} // namespace std -+ - namespace aco { - namespace { - -@@ -29,6 +38,19 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx - void add_subdword_definition(Program* program, aco_ptr& instr, PhysReg reg, - bool allow_16bit_write); - -+struct parallelcopy { -+ constexpr parallelcopy() : skip_renaming(false) {} -+ constexpr parallelcopy(Operand op_, Definition def_) : op(op_), def(def_), skip_renaming(false) -+ {} -+ constexpr parallelcopy(Operand op_, Definition def_, bool skip_renaming_) -+ : op(op_), def(def_), skip_renaming(skip_renaming_) -+ {} -+ -+ Operand op; -+ Definition def; -+ bool skip_renaming; -+}; -+ - struct assignment { - PhysReg reg; - RegClass rc; -@@ -270,7 +292,11 @@ public: - std::array regs; - std::map> subdword_regs; - -- const uint32_t& operator[](PhysReg index) const { return regs[index]; } -+ const uint32_t& operator[](PhysReg index) const -+ { -+ assert(index.reg() < 512); -+ return regs[index]; -+ } - - uint32_t& operator[](PhysReg index) { return regs[index]; } - -@@ -357,7 +383,12 @@ public: - } - } - -- void clear(Operand op) { clear(op.physReg(), op.regClass()); } -+ void clear(Operand op) -+ { -+ if (op.isTemp() && get_id(op.physReg()) && !is_blocked(op.physReg())) -+ assert(get_id(op.physReg()) == op.tempId()); -+ clear(op.physReg(), op.regClass()); -+ } - - void fill(Definition def) - { -@@ -805,22 +836,21 @@ enum UpdateRenames { - MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames); - - void --update_renames(ra_ctx& ctx, RegisterFile& reg_file, -- std::vector>& parallelcopies, -+update_renames(ra_ctx& ctx, RegisterFile& reg_file, std::vector& parallelcopies, - aco_ptr& instr, UpdateRenames flags) - { - /* clear operands */ -- for (std::pair& copy : parallelcopies) { -+ for (parallelcopy& copy : parallelcopies) { - /* the definitions with id are not from this function and already handled */ -- if (copy.second.isTemp()) -+ if (copy.def.isTemp()) - continue; -- reg_file.clear(copy.first); -+ reg_file.clear(copy.op); - } - - /* allocate id's and rename operands: this is done transparently here */ - auto it = parallelcopies.begin(); - while (it != parallelcopies.end()) { -- if (it->second.isTemp()) { -+ if (it->def.isTemp()) { - ++it; - continue; - } -@@ -828,9 +858,9 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, - /* check if we 
moved a definition: change the register and remove copy */ - bool is_def = false; - for (Definition& def : instr->definitions) { -- if (def.isTemp() && def.getTemp() == it->first.getTemp()) { -+ if (def.isTemp() && def.getTemp() == it->op.getTemp()) { - // FIXME: ensure that the definition can use this reg -- def.setFixed(it->second.physReg()); -+ def.setFixed(it->def.physReg()); - reg_file.fill(def); - ctx.assignments[def.tempId()].reg = def.physReg(); - it = parallelcopies.erase(it); -@@ -842,34 +872,52 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, - continue; - - /* check if we moved another parallelcopy definition */ -- for (std::pair& other : parallelcopies) { -- if (!other.second.isTemp()) -+ for (parallelcopy& other : parallelcopies) { -+ if (!other.def.isTemp()) - continue; -- if (it->first.getTemp() == other.second.getTemp()) { -- other.second.setFixed(it->second.physReg()); -- ctx.assignments[other.second.tempId()].reg = other.second.physReg(); -- it = parallelcopies.erase(it); -- is_def = true; -+ if (it->op.getTemp() == other.def.getTemp()) { -+ bool other_ensures_precoloring = false; -+ - /* check if we moved an operand, again */ - bool fill = true; - for (Operand& op : instr->operands) { -- if (op.isTemp() && op.tempId() == other.second.tempId()) { -- // FIXME: ensure that the operand can use this reg -- op.setFixed(other.second.physReg()); -- fill = !op.isKillBeforeDef(); -- } -+ if (!op.isTemp() || op.tempId() != other.def.tempId()) -+ continue; -+ bool isKillBeforeDef = op.isFirstKillBeforeDef(); -+ fill = !isKillBeforeDef; -+ -+ if (other.def.physReg() == op.physReg() && op.isPrecolored()) -+ other_ensures_precoloring = true; -+ else -+ op.setFixed(it->def.physReg()); -+ break; -+ } -+ -+ Definition fill_def; -+ -+ if (other_ensures_precoloring) { -+ it->op = other.op; -+ ctx.assignments[other.op.tempId()].reg = it->def.physReg(); -+ fill_def = it->def; -+ } else { -+ other.def.setFixed(it->def.physReg()); -+ ctx.assignments[other.def.tempId()].reg = other.def.physReg(); -+ it = parallelcopies.erase(it); -+ fill_def = other.def; - } -+ is_def = true; -+ - if (fill) -- reg_file.fill(other.second); -+ reg_file.fill(fill_def); - break; - } - } - if (is_def) - continue; - -- std::pair& copy = *it; -- copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass())); -- ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass()); -+ parallelcopy& copy = *it; -+ copy.def.setTemp(ctx.program->allocateTmp(copy.def.regClass())); -+ ctx.assignments.emplace_back(copy.def.physReg(), copy.def.regClass()); - assert(ctx.assignments.size() == ctx.program->peekAllocationId()); - - /* check if we moved an operand */ -@@ -879,19 +927,19 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, - Operand& op = instr->operands[i]; - if (!op.isTemp()) - continue; -- if (op.tempId() == copy.first.tempId()) { -+ if (op.tempId() == copy.op.tempId()) { - /* only rename precolored operands if the copy-location matches */ -- bool omit_renaming = op.isPrecolored() && op.physReg() != copy.second.physReg(); -+ bool omit_renaming = op.isPrecolored() && op.physReg() != copy.def.physReg(); - - /* Omit renaming in some cases for p_create_vector in order to avoid - * unnecessary shuffle code. */ - if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) { - omit_renaming = true; -- for (std::pair& pc : parallelcopies) { -- PhysReg def_reg = pc.second.physReg(); -- omit_renaming &= def_reg > copy.first.physReg() -- ? 
(copy.first.physReg() + copy.first.size() <= def_reg.reg()) -- : (def_reg + pc.second.size() <= copy.first.physReg().reg()); -+ for (parallelcopy& pc : parallelcopies) { -+ PhysReg def_reg = pc.def.physReg(); -+ omit_renaming &= def_reg > copy.op.physReg() -+ ? (copy.op.physReg() + copy.op.size() <= def_reg.reg()) -+ : (def_reg + pc.def.size() <= copy.op.physReg().reg()); - } - } - -@@ -905,8 +953,8 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, - if (omit_renaming) - continue; - -- op.setTemp(copy.second.getTemp()); -- op.setFixed(copy.second.physReg()); -+ op.setTemp(copy.def.getTemp()); -+ op.setFixed(copy.def.physReg()); - - fill = !op.isKillBeforeDef() || op.isPrecolored(); - } -@@ -914,7 +962,7 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, - - /* Apply changes to register file. */ - if (fill) -- reg_file.fill(copy.second); -+ reg_file.fill(copy.def); - - ++it; - } -@@ -1050,7 +1098,7 @@ collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_inte - - std::optional - get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file, -- std::vector>& parallelcopies, -+ std::vector& parallelcopies, - aco_ptr& instr, const PhysRegInterval def_reg, - DefInfo info, unsigned id) - { -@@ -1102,8 +1150,7 @@ get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file, - } - - bool --get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, -- std::vector>& parallelcopies, -+get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, std::vector& parallelcopies, - const std::vector& vars, aco_ptr& instr, - const PhysRegInterval def_reg) - { -@@ -1253,9 +1300,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, - } - - std::optional --get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, -- std::vector>& parallelcopies, const DefInfo& info, -- aco_ptr& instr) -+get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, std::vector& parallelcopies, -+ const DefInfo& info, aco_ptr& instr) - { - const PhysRegInterval& bounds = info.bounds; - uint32_t size = info.size; -@@ -1381,7 +1427,7 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, - if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) - tmp_file.fill_killed_operands(instr.get()); - -- std::vector> pc; -+ std::vector pc; - if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win)) - return {}; - -@@ -1460,11 +1506,13 @@ struct IDAndInfo { - }; - - void --add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val) -+add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val, bool add_to_ctx = true) - { -- ctx.renames[ctx.block->index][orig_val.id()] = new_val; - ctx.orig_names.emplace(new_val.id(), orig_val); -- ctx.assignments[orig_val.id()].renamed = true; -+ if (add_to_ctx) { -+ ctx.renames[ctx.block->index][orig_val.id()] = new_val; -+ ctx.assignments[orig_val.id()].renamed = true; -+ } - } - - /* Reallocates vars by sorting them and placing each variable after the previous -@@ -1473,7 +1521,7 @@ add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val) - */ - PhysReg - compact_relocate_vars(ra_ctx& ctx, const std::vector& vars, -- std::vector>& parallelcopies, PhysReg start) -+ std::vector& parallelcopies, PhysReg start) - { - /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword - * temporary sizes to dwords. 
-@@ -1624,7 +1672,7 @@ get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr>& parallelcopies) -+ std::vector& parallelcopies) - { - PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true); - int zeros = reg_file.count_zero(linear_vgpr_bounds); -@@ -1650,7 +1698,7 @@ compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file, - */ - PhysReg - alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr& instr, -- std::vector>& parallelcopies) -+ std::vector& parallelcopies) - { - assert(instr->opcode == aco_opcode::p_start_linear_vgpr); - assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0); -@@ -1683,7 +1731,7 @@ alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr> pc; -+ std::vector pc; - if (!ctx.policy.skip_optimistic_path && - get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) { - parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); -@@ -1734,7 +1782,7 @@ should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file) - - PhysReg - get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, -- std::vector>& parallelcopies, aco_ptr& instr, -+ std::vector& parallelcopies, aco_ptr& instr, - int operand_index = -1) - { - auto split_vec = ctx.split_vectors.find(temp.id()); -@@ -1808,7 +1856,7 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, - return *res; - - /* try compacting the linear vgprs to make more space */ -- std::vector> pc; -+ std::vector pc; - if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) && - compact_linear_vgprs(ctx, reg_file, pc)) { - parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); -@@ -1816,8 +1864,8 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, - /* We don't need to fill the copy definitions in because we don't care about the linear VGPR - * space here. */ - RegisterFile tmp_file(reg_file); -- for (std::pair& copy : pc) -- tmp_file.clear(copy.first); -+ for (parallelcopy& copy : pc) -+ tmp_file.clear(copy.op); - - return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index); - } -@@ -1875,8 +1923,7 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, - - PhysReg - get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, -- std::vector>& parallelcopies, -- aco_ptr& instr) -+ std::vector& parallelcopies, aco_ptr& instr) - { - RegClass rc = temp.regClass(); - /* create_vector instructions have different costs w.r.t. 
register coalescing */ -@@ -1993,7 +2040,7 @@ get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, - std::vector vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); - - bool success = false; -- std::vector> pc; -+ std::vector pc; - success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size}); - - if (!success) { -@@ -2084,59 +2131,81 @@ operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr& instr, unsign - - void - handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, -- std::vector>& parallelcopy, -- aco_ptr& instr) -+ std::vector& parallelcopy, aco_ptr& instr) - { - assert(instr->operands.size() <= 128); - assert(parallelcopy.empty()); - - RegisterFile tmp_file(register_file); -+ std::unordered_map> temp_regs; -+ std::vector blocking_vars; - -- BITSET_DECLARE(mask, 128) = {0}; -- -- for (unsigned i = 0; i < instr->operands.size(); i++) { -- Operand& op = instr->operands[i]; -- -- if (!op.isPrecolored()) -+ for (auto it = instr->operands.begin(); it != instr->operands.end(); ++it) { -+ if (!it->isPrecolored()) - continue; - -- assert(op.isTemp()); -- PhysReg src = ctx.assignments[op.tempId()].reg; -- adjust_max_used_regs(ctx, op.regClass(), op.physReg()); -+ assert(it->isTemp()); -+ adjust_max_used_regs(ctx, it->regClass(), it->physReg()); -+ PhysReg src = ctx.assignments[it->tempId()].reg; -+ temp_regs[it->tempId()].emplace(it->physReg()); - -- if (op.physReg() == src) { -- tmp_file.block(op.physReg(), op.regClass()); -- continue; -+ if (src == it->physReg()) { -+ tmp_file.block(it->physReg(), it->regClass()); -+ } else { -+ /* clear from register_file so fixed operands are not collected be collect_vars() */ -+ if (!tmp_file.is_blocked(src)) -+ tmp_file.clear(src, it->regClass()); // TODO: try to avoid moving block vars to src - } - - /* An instruction can have at most one operand precolored to the same register. 
*/ - assert(std::none_of(parallelcopy.begin(), parallelcopy.end(), -- [&](auto copy) { return copy.second.physReg() == op.physReg(); })); -+ [&](auto copy) { return copy.def.physReg() == it->physReg(); })); -+ } -+ -+ for (auto& regs : temp_regs) { -+ PhysReg src = ctx.assignments[regs.first].reg; - -- /* clear from register_file so fixed operands are not collected be collect_vars() */ -- tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src -+ PhysReg live_reg = *regs.second.begin(); -+ if (regs.second.size() > 1) { -+ bool found = false; -+ for (auto reg : regs.second) { -+ PhysRegInterval range = {reg, ctx.program->temp_rc[regs.first].size()}; -+ bool intersects_with_def = false; -+ for (const auto& def : instr->definitions) { -+ if (!def.isTemp() || !def.isFixed()) -+ continue; -+ PhysRegInterval def_range = {def.physReg(), def.regClass().size()}; -+ if (intersects(def_range, range)) { -+ intersects_with_def = true; -+ break; -+ } -+ } -+ if (intersects_with_def) -+ continue; - -- BITSET_SET(mask, i); -+ if (!found || reg == src) { -+ live_reg = reg; -+ found = true; -+ if (reg == src) -+ break; -+ } -+ } -+ } - -- Operand pc_op(instr->operands[i].getTemp()); -- pc_op.setFixed(src); -- Definition pc_def = Definition(op.physReg(), pc_op.regClass()); -- parallelcopy.emplace_back(pc_op, pc_def); -- } -+ RegClass rc = ctx.program->temp_rc[regs.first]; - -- if (BITSET_IS_EMPTY(mask)) -- return; -+ for (auto reg : regs.second) { -+ if (reg == src) -+ continue; - -- unsigned i; -- std::vector blocking_vars; -- BITSET_FOREACH_SET (i, mask, instr->operands.size()) { -- Operand& op = instr->operands[i]; -- PhysRegInterval target{op.physReg(), op.size()}; -- std::vector blocking_vars2 = collect_vars(ctx, tmp_file, target); -- blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end()); -+ Definition copy_def = Definition(reg, rc); -+ parallelcopy.emplace_back(Operand(Temp(regs.first, rc), src), copy_def, reg != live_reg); - -- /* prevent get_regs_for_copies() from using these registers */ -- tmp_file.block(op.physReg(), op.regClass()); -+ PhysRegInterval target{reg, rc.size()}; -+ std::vector blocking_vars2 = collect_vars(ctx, tmp_file, target); -+ blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end()); -+ tmp_file.block(reg, rc); -+ } - } - - get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval()); -@@ -2145,8 +2214,8 @@ handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, - - void - get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, -- std::vector>& parallelcopy, -- aco_ptr& instr, Operand& operand, unsigned operand_index) -+ std::vector& parallelcopy, aco_ptr& instr, -+ Operand& operand, unsigned operand_index) - { - /* clear the operand in case it's only a stride mismatch */ - PhysReg src = ctx.assignments[operand.tempId()].reg; -@@ -2166,45 +2235,44 @@ get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file, - std::vector>& instructions, Block& block, - aco_ptr& phi, Temp tmp) - { -- std::vector> parallelcopy; -+ std::vector parallelcopy; - PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi); - update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops); - - /* process parallelcopy */ -- for (std::pair pc : parallelcopy) { -+ for (struct parallelcopy pc : parallelcopy) { - /* see if it's a copy from a different phi */ - // TODO: prefer moving some previous phis over live-ins - // TODO: somehow prevent 
phis fixed before the RA from being updated (shouldn't be a - // problem in practice since they can only be fixed to exec) - Instruction* prev_phi = NULL; - for (auto phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) { -- if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) -+ if ((*phi_it)->definitions[0].tempId() == pc.op.tempId()) - prev_phi = phi_it->get(); - } - if (prev_phi) { - /* if so, just update that phi's register */ -- prev_phi->definitions[0].setFixed(pc.second.physReg()); -+ prev_phi->definitions[0].setFixed(pc.def.physReg()); - register_file.fill(prev_phi->definitions[0]); -- ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), -- pc.second.regClass()}; -+ ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.def.physReg(), pc.def.regClass()}; - continue; - } - - /* rename */ -- auto orig_it = ctx.orig_names.find(pc.first.tempId()); -- Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp(); -- add_rename(ctx, orig, pc.second.getTemp()); -+ auto orig_it = ctx.orig_names.find(pc.op.tempId()); -+ Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.op.getTemp(); -+ add_rename(ctx, orig, pc.def.getTemp()); - - /* otherwise, this is a live-in and we need to create a new phi - * to move it in this block's predecessors */ - aco_opcode opcode = -- pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; -+ pc.op.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; - Block::edge_vec& preds = -- pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; -+ pc.op.getTemp().is_linear() ? block.linear_preds : block.logical_preds; - aco_ptr new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; -- new_phi->definitions[0] = pc.second; -+ new_phi->definitions[0] = pc.def; - for (unsigned i = 0; i < preds.size(); i++) -- new_phi->operands[i] = Operand(pc.first); -+ new_phi->operands[i] = Operand(pc.op); - instructions.emplace_back(std::move(new_phi)); - - /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is -@@ -2916,7 +2984,7 @@ optimize_encoding(ra_ctx& ctx, RegisterFile& register_file, aco_ptr - } - - void --emit_parallel_copy_internal(ra_ctx& ctx, std::vector>& parallelcopy, -+emit_parallel_copy_internal(ra_ctx& ctx, std::vector& parallelcopy, - aco_ptr& instr, - std::vector>& instructions, bool temp_in_scc, - RegisterFile& register_file) -@@ -2931,31 +2999,31 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vectoroperands[i] = parallelcopy[i].first; -- pc->definitions[i] = parallelcopy[i].second; -+ pc->operands[i] = parallelcopy[i].op; -+ pc->definitions[i] = parallelcopy[i].def; - assert(pc->operands[i].size() == pc->definitions[i].size()); - - /* it might happen that the operand is already renamed. we have to restore the - * original name. */ - auto it = ctx.orig_names.find(pc->operands[i].tempId()); - Temp orig = it != ctx.orig_names.end() ? 
it->second : pc->operands[i].getTemp(); -- add_rename(ctx, orig, pc->definitions[i].getTemp()); -+ add_rename(ctx, orig, pc->definitions[i].getTemp(), !parallelcopy[i].skip_renaming); - } - - if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) { -@@ -2982,18 +3050,18 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector>& parallelcopy, -+emit_parallel_copy(ra_ctx& ctx, std::vector& parallelcopy, - aco_ptr& instr, std::vector>& instructions, - bool temp_in_scc, RegisterFile& register_file) - { - if (parallelcopy.empty()) - return; - -- std::vector> linear_vgpr; -+ std::vector linear_vgpr; - if (ctx.num_linear_vgprs) { - unsigned next = 0; - for (unsigned i = 0; i < parallelcopy.size(); i++) { -- if (parallelcopy[i].first.regClass().is_linear_vgpr()) { -+ if (parallelcopy[i].def.regClass().is_linear_vgpr()) { - linear_vgpr.push_back(parallelcopy[i]); - continue; - } -@@ -3063,7 +3131,7 @@ register_allocation(Program* program, ra_test_policy policy) - auto instr_it = std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi); - for (; instr_it != block.instructions.end(); ++instr_it) { - aco_ptr& instr = *instr_it; -- std::vector> parallelcopy; -+ std::vector parallelcopy; - bool temp_in_scc = register_file[scc]; - - if (instr->opcode == aco_opcode::p_branch) { -@@ -3084,7 +3152,6 @@ register_allocation(Program* program, ra_test_policy policy) - /* rename operands */ - operand.setTemp(read_variable(ctx, operand.getTemp(), block.index)); - assert(ctx.assignments[operand.tempId()].assigned); -- - fixed |= - operand.isPrecolored() && ctx.assignments[operand.tempId()].reg != operand.physReg(); - } -@@ -3101,8 +3168,9 @@ register_allocation(Program* program, ra_test_policy policy) - } - } - -- if (fixed) -+ if (fixed) { - handle_fixed_operands(ctx, register_file, parallelcopy, instr); -+ } - - for (unsigned i = 0; i < instr->operands.size(); ++i) { - auto& operand = instr->operands[i]; -@@ -3347,7 +3415,7 @@ register_allocation(Program* program, ra_test_policy policy) - bool temp_in_scc = - register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc); - -- std::vector> parallelcopy; -+ std::vector parallelcopy; - compact_linear_vgprs(ctx, register_file, parallelcopy); - update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops); - emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc, register_file); --- -GitLab - - -From 51acc061a662fc8fcc1e257a12346474af5912d6 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 24 Jun 2024 16:48:43 +0200 -Subject: [PATCH 03/71] !29730 - ---- - src/amd/compiler/aco_ir.h | 1 + - src/amd/compiler/aco_live_var_analysis.cpp | 50 +++++--- - src/amd/compiler/aco_spill.cpp | 133 ++++++++++++++++++--- - 3 files changed, 151 insertions(+), 33 deletions(-) - -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 8a501797092ed..d838b728e19ce 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2314,6 +2314,7 @@ int get_op_fixed_to_def(Instruction* instr); - /* utilities for dealing with register demand */ - RegisterDemand get_live_changes(Instruction* instr); - RegisterDemand get_temp_registers(Instruction* instr); -+RegisterDemand get_temp_reg_changes(Instruction* instr); - - /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */ - uint16_t get_extra_sgprs(Program* program); -diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp -index 
8744258a1b9aa..a635c94496143 100644 ---- a/src/amd/compiler/aco_live_var_analysis.cpp -+++ b/src/amd/compiler/aco_live_var_analysis.cpp -@@ -9,6 +9,29 @@ - - namespace aco { - -+namespace { -+void -+get_temp_register_demand(Instruction* instr, RegisterDemand& demand_before, RegisterDemand& demand_after) -+{ -+ for (Definition def : instr->definitions) { -+ if (def.isKill()) -+ demand_after += def.getTemp(); -+ else if (def.isTemp()) -+ demand_before -= def.getTemp(); -+ } -+ -+ for (Operand op : instr->operands) { -+ if (op.isFirstKill() || op.isCopyKill()) { -+ demand_before += op.getTemp(); -+ if (op.isLateKill()) -+ demand_after += op.getTemp(); -+ } else if (op.isClobbered() && !op.isKill()) { -+ demand_before += op.getTemp(); -+ } -+ } -+} -+} -+ - RegisterDemand - get_live_changes(Instruction* instr) - { -@@ -34,27 +57,22 @@ get_temp_registers(Instruction* instr) - RegisterDemand demand_before; - RegisterDemand demand_after; - -- for (Definition def : instr->definitions) { -- if (def.isKill()) -- demand_after += def.getTemp(); -- else if (def.isTemp()) -- demand_before -= def.getTemp(); -- } -- -- for (Operand op : instr->operands) { -- if (op.isFirstKill() || op.isCopyKill()) { -- demand_before += op.getTemp(); -- if (op.isLateKill()) -- demand_after += op.getTemp(); -- } else if (op.isClobbered() && !op.isKill()) { -- demand_before += op.getTemp(); -- } -- } -+ get_temp_register_demand(instr, demand_before, demand_after); - - demand_after.update(demand_before); - return demand_after; - } - -+RegisterDemand get_temp_reg_changes(Instruction* instr) -+{ -+ RegisterDemand demand_before; -+ RegisterDemand demand_after; -+ -+ get_temp_register_demand(instr, demand_before, demand_after); -+ -+ return demand_after - demand_before; -+} -+ - namespace { - - struct live_ctx { -diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp -index ae7ae16e3298b..be45b0eda7632 100644 ---- a/src/amd/compiler/aco_spill.cpp -+++ b/src/amd/compiler/aco_spill.cpp -@@ -15,6 +15,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -909,7 +910,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - /* the Operand is spilled: add it to reloads */ - Temp new_tmp = ctx.program->allocateTmp(op.regClass()); - ctx.renames[block_idx][op.getTemp()] = new_tmp; -- reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]); -+ reloads[op.getTemp()] = std::make_pair(new_tmp, current_spills[op.getTemp()]); - current_spills.erase(op.getTemp()); - spilled_registers -= new_tmp; - } -@@ -917,13 +918,17 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - /* check if register demand is low enough during and after the current instruction */ - if (block->register_demand.exceeds(ctx.target_pressure)) { - RegisterDemand new_demand = instr->register_demand; -+ std::optional live_changes; - - /* if reg pressure is too high, spill variable with furthest next use */ - while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) { - float score = 0.0; - Temp to_spill = Temp(); -+ unsigned operand_idx = -1u; -+ unsigned respill_slot = -1u; - unsigned do_rematerialize = 0; - unsigned avoid_respill = 0; -+ - RegType type = RegType::sgpr; - if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) - type = RegType::vgpr; -@@ -941,24 +946,68 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - - if (can_rematerialize > do_rematerialize || 
loop_variable > avoid_respill || - ctx.ssa_infos[t].score() > score) { -- /* Don't spill operands */ -- if (std::any_of(instr->operands.begin(), instr->operands.end(), -- [&](Operand& op) { return op.isTemp() && op.getTemp() == var; })) -+ unsigned cur_operand_idx = -1u; -+ bool can_spill = true; -+ for (auto it = instr->operands.begin(); it != instr->operands.end(); ++it) { -+ if (!it->isTemp() || it->getTemp() != var) -+ continue; -+ -+ /* Vectors with size >4 require a p_split_vector. When spilling an operand, -+ * the p_split_vector cannot kill the vector (because it's also an operand -+ * to the current instruction) and will therefore increase register demand -+ * instead of helping reduce it. -+ */ -+ if (it->regClass().size() > 4) { -+ can_spill = false; -+ break; -+ } -+ -+ if (!live_changes) -+ live_changes = get_temp_reg_changes(instr.get()); -+ -+ /* Don't spill operands if killing operands won't help with register pressure */ -+ if ((type == RegType::sgpr && live_changes->sgpr < (int16_t)it->size()) || -+ (type == RegType::vgpr && live_changes->vgpr < (int16_t)it->size())) { -+ can_spill = false; -+ break; -+ } -+ -+ cur_operand_idx = it - instr->operands.begin(); -+ if (it->isLateKill() || it->isKill()) -+ can_spill = false; -+ break; -+ } -+ if (!can_spill) - continue; - -+ bool is_spilled_operand = reloads.count(var); -+ - to_spill = var; - score = ctx.ssa_infos[t].score(); - do_rematerialize = can_rematerialize; -- avoid_respill = loop_variable; -+ avoid_respill = loop_variable || is_spilled_operand; -+ operand_idx = cur_operand_idx; -+ -+ /* This variable is spilled at the loop-header of the current loop. -+ * Re-use the spill-slot in order to avoid an extra store. -+ */ -+ if (loop_variable) -+ respill_slot = ctx.loop.back().spills[var]; -+ else if (is_spilled_operand) -+ respill_slot = reloads[var].second; - } - } - assert(to_spill != Temp()); - -- if (avoid_respill) { -- /* This variable is spilled at the loop-header of the current loop. -- * Re-use the spill-slot in order to avoid an extra store. -+ if (operand_idx != -1u) { -+ /* We might not be able to spill all operands. Keep live_changes up-to-date so we -+ * stop when we spilled every operand we can. 
- */ -- current_spills[to_spill] = ctx.loop.back().spills[to_spill]; -+ *live_changes -= instr->operands[operand_idx].getTemp(); -+ } -+ -+ if (avoid_respill) { -+ current_spills[to_spill] = respill_slot; - spilled_registers += to_spill; - continue; - } -@@ -1007,7 +1056,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - /* add reloads and instruction to new instructions */ - for (std::pair>& pair : reloads) { - aco_ptr reload = -- do_reload(ctx, pair.second.first, pair.first, pair.second.second); -+ do_reload(ctx, pair.first, pair.second.first, pair.second.second); - instructions.emplace_back(std::move(reload)); - } - instructions.emplace_back(std::move(instr)); -@@ -1227,7 +1276,7 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector>& inst - assert(temp.type() == RegType::vgpr && !temp.is_linear()); - - Builder bld(ctx.program, &instructions); -- if (temp.size() > 1) { -+ if (temp.size() > 4) { - Instruction* split{ - create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())}; - split->operands[0] = Operand(temp); -@@ -1246,11 +1295,36 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector>& inst - instr->mubuf().cache.value = ac_swizzled; - } - } -- } else if (ctx.program->gfx_level >= GFX9) { -- bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, temp, offset, -+ return; -+ } -+ -+ aco_opcode opcode; -+ switch (temp.size()) { -+ case 4: -+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dwordx4 -+ : aco_opcode::buffer_store_dwordx4; -+ break; -+ case 3: -+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dwordx3 -+ : aco_opcode::buffer_store_dwordx3; -+ break; -+ case 2: -+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dwordx2 -+ : aco_opcode::buffer_store_dwordx2; -+ break; -+ case 1: -+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dword -+ : aco_opcode::buffer_store_dword; -+ break; -+ default: -+ unreachable("Unhandled vector size!\n"); -+ } -+ -+ if (ctx.program->gfx_level >= GFX9) { -+ bld.scratch(opcode, Operand(v1), ctx.scratch_rsrc, temp, offset, - memory_sync_info(storage_vgpr_spill, semantic_private)); - } else { -- Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1), -+ Instruction* instr = bld.mubuf(opcode, ctx.scratch_rsrc, Operand(v1), - scratch_offset, temp, offset, false); - instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); - instr->mubuf().cache.value = ac_swizzled; -@@ -1291,11 +1365,36 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector>& ins - } - } - bld.insert(vec); -- } else if (ctx.program->gfx_level >= GFX9) { -- bld.scratch(aco_opcode::scratch_load_dword, def, Operand(v1), ctx.scratch_rsrc, offset, -+ return; -+ } -+ -+ aco_opcode opcode; -+ switch (def.size()) { -+ case 4: -+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dwordx4 -+ : aco_opcode::buffer_load_dwordx4; -+ break; -+ case 3: -+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dwordx3 -+ : aco_opcode::buffer_load_dwordx3; -+ break; -+ case 2: -+ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dwordx2 -+ : aco_opcode::buffer_load_dwordx2; -+ break; -+ case 1: -+ opcode = ctx.program->gfx_level >= GFX9 ? 
aco_opcode::scratch_load_dword -+ : aco_opcode::buffer_load_dword; -+ break; -+ default: -+ unreachable("Unhandled vector size!\n"); -+ } -+ -+ if (ctx.program->gfx_level >= GFX9) { -+ bld.scratch(opcode, def, Operand(v1), ctx.scratch_rsrc, offset, - memory_sync_info(storage_vgpr_spill, semantic_private)); - } else { -- Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc, -+ Instruction* instr = bld.mubuf(opcode, def, ctx.scratch_rsrc, - Operand(v1), scratch_offset, offset, false); - instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); - instr->mubuf().cache.value = ac_swizzled; --- -GitLab - - -From a0276e8120c286a81006d1636f5e5e552c807d69 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 17 Jun 2024 12:55:48 +0200 -Subject: [PATCH 04/71] !29577 - ---- - src/compiler/nir/meson.build | 1 + - src/compiler/nir/nir.c | 7 +- - src/compiler/nir/nir.h | 35 ++- - src/compiler/nir/nir_builder.h | 22 ++ - src/compiler/nir/nir_clone.c | 1 + - src/compiler/nir/nir_divergence_analysis.c | 31 ++- - src/compiler/nir/nir_functions.c | 5 +- - src/compiler/nir/nir_gather_info.c | 6 +- - src/compiler/nir/nir_inline_helpers.h | 2 + - src/compiler/nir/nir_lower_memory_model.c | 33 +-- - src/compiler/nir/nir_metadata.c | 13 ++ - src/compiler/nir/nir_opt_call.c | 259 +++++++++++++++++++++ - src/compiler/nir/nir_print.c | 7 + - src/compiler/nir/nir_serialize.c | 11 + - src/compiler/nir/nir_sweep.c | 9 - - src/compiler/nir/nir_validate.c | 5 + - src/compiler/spirv/vtn_cfg.c | 3 + - 17 files changed, 410 insertions(+), 40 deletions(-) - create mode 100644 src/compiler/nir/nir_opt_call.c - -diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build -index 514f5e0e1b7a1..2df6b28d73b39 100644 ---- a/src/compiler/nir/meson.build -+++ b/src/compiler/nir/meson.build -@@ -219,6 +219,7 @@ files_libnir = files( - 'nir_normalize_cubemap_coords.c', - 'nir_opt_access.c', - 'nir_opt_barriers.c', -+ 'nir_opt_call.c', - 'nir_opt_combine_stores.c', - 'nir_opt_comparison_pre.c', - 'nir_opt_conditional_discard.c', -diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c -index 513fd04f36f99..0b2736e4a0308 100644 ---- a/src/compiler/nir/nir.c -+++ b/src/compiler/nir/nir.c -@@ -502,6 +502,7 @@ nir_function_create(nir_shader *shader, const char *name) - func->is_preamble = false; - func->dont_inline = false; - func->should_inline = false; -+ func->driver_attributes = 0; - func->is_subroutine = false; - func->is_tmp_globals_wrapper = false; - func->subroutine_index = 0; -@@ -1584,8 +1585,8 @@ nir_def_rewrite_uses_src(nir_def *def, nir_src new_src) - nir_def_rewrite_uses(def, new_src.ssa); - } - --static bool --is_instr_between(nir_instr *start, nir_instr *end, nir_instr *between) -+bool -+nir_instr_is_between(nir_instr *start, nir_instr *end, nir_instr *between) - { - assert(start->block == end->block); - -@@ -1629,7 +1630,7 @@ nir_def_rewrite_uses_after(nir_def *def, nir_def *new_ssa, - * not be dominated by after_me is if it is between def and after_me in - * the instruction list. 
- */ -- if (is_instr_between(def->parent_instr, after_me, nir_src_parent_instr(use_src))) -+ if (nir_instr_is_between(def->parent_instr, after_me, nir_src_parent_instr(use_src))) - continue; - } - -diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h -index 7a781b7fefb4e..10a592f4b87bb 100644 ---- a/src/compiler/nir/nir.h -+++ b/src/compiler/nir/nir.h -@@ -1915,6 +1915,10 @@ typedef struct { - nir_instr instr; - - struct nir_function *callee; -+ /* If this function call is indirect, the function pointer to call. -+ * Otherwise, null initialized. -+ */ -+ nir_src indirect_callee; - - unsigned num_params; - nir_src params[]; -@@ -3646,13 +3650,28 @@ typedef struct { - uint8_t num_components; - uint8_t bit_size; - -- /* True if this paramater is actually the function return variable */ -+ /* True if this parameter is a deref used for returning values */ - bool is_return; - - bool implicit_conversion_prohibited; - -+ /* True if this parameter is not divergent. This is inverted to make -+ * parameters divergent by default unless explicitly specified -+ * otherwise. -+ */ -+ bool is_uniform; -+ - nir_variable_mode mode; - -+ /* Drivers may optionally stash flags here describing the parameter. -+ * For example, this might encode whether the driver expects the value -+ * to be uniform or divergent, if the driver handles divergent parameters -+ * differently from uniform ones. -+ * -+ * NIR will preserve this value but does not interpret it in any way. -+ */ -+ uint32_t driver_attributes; -+ - /* The type of the function param */ - const struct glsl_type *type; - } nir_parameter; -@@ -3675,6 +3694,14 @@ typedef struct nir_function { - */ - nir_function_impl *impl; - -+ /* Drivers may optionally stash flags here describing the function call. -+ * For example, this might encode the ABI used for the call if a driver -+ * supports multiple ABIs. -+ * -+ * NIR will preserve this value but does not interpret it in any way. 
-+ */ -+ uint32_t driver_attributes; -+ - bool is_entrypoint; - /* from SPIR-V linkage, only for libraries */ - bool is_exported; -@@ -5053,6 +5080,8 @@ void nir_instr_clear_src(nir_instr *instr, nir_src *src); - - void nir_instr_move_src(nir_instr *dest_instr, nir_src *dest, nir_src *src); - -+bool nir_instr_is_between(nir_instr *start, nir_instr *end, nir_instr *between); -+ - void nir_def_init(nir_instr *instr, nir_def *def, - unsigned num_components, unsigned bit_size); - static inline void -@@ -6789,6 +6818,10 @@ bool nir_opt_combine_barriers(nir_shader *shader, - void *data); - bool nir_opt_barrier_modes(nir_shader *shader); - -+typedef bool (*can_remat_cb)(nir_instr *instr); -+ -+bool nir_minimize_call_live_states(nir_shader *shader); -+ - bool nir_opt_combine_stores(nir_shader *shader, nir_variable_mode modes); - - bool nir_copy_prop_impl(nir_function_impl *impl); -diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h -index 5e07f588b4a5b..97a12e8c9ffc4 100644 ---- a/src/compiler/nir/nir_builder.h -+++ b/src/compiler/nir/nir_builder.h -@@ -2218,6 +2218,22 @@ nir_build_call(nir_builder *build, nir_function *func, size_t count, - nir_builder_instr_insert(build, &call->instr); - } - -+static inline void -+nir_build_indirect_call(nir_builder *build, nir_function *func, nir_def *callee, -+ size_t count, nir_def **args) -+{ -+ assert(count == func->num_params && "parameter count must match"); -+ assert(!func->impl && "cannot call directly defined functions indirectly"); -+ nir_call_instr *call = nir_call_instr_create(build->shader, func); -+ -+ for (unsigned i = 0; i < func->num_params; ++i) { -+ call->params[i] = nir_src_for_ssa(args[i]); -+ } -+ call->indirect_callee = nir_src_for_ssa(callee); -+ -+ nir_builder_instr_insert(build, &call->instr); -+} -+ - static inline void - nir_discard(nir_builder *build) - { -@@ -2251,6 +2267,12 @@ nir_build_string(nir_builder *build, const char *value); - nir_build_call(build, func, ARRAY_SIZE(args), args); \ - } while (0) - -+#define nir_call_indirect(build, func, callee, ...) 
\ -+ do { \ -+ nir_def *_args[] = { __VA_ARGS__ }; \ -+ nir_build_indirect_call(build, func, callee, ARRAY_SIZE(_args), _args); \ -+ } while (0) -+ - nir_def * - nir_compare_func(nir_builder *b, enum compare_func func, - nir_def *src0, nir_def *src1); -diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c -index a8359fcd8da76..0bfd9623686ec 100644 ---- a/src/compiler/nir/nir_clone.c -+++ b/src/compiler/nir/nir_clone.c -@@ -714,6 +714,7 @@ nir_function_clone(nir_shader *ns, const nir_function *fxn) - nfxn->should_inline = fxn->should_inline; - nfxn->dont_inline = fxn->dont_inline; - nfxn->is_subroutine = fxn->is_subroutine; -+ nfxn->driver_attributes = fxn->driver_attributes; - nfxn->is_tmp_globals_wrapper = fxn->is_tmp_globals_wrapper; - nfxn->num_subroutine_types = fxn->num_subroutine_types; - nfxn->subroutine_index = fxn->subroutine_index; -diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c -index 7c1b94ea81eb2..183b3bc81820d 100644 ---- a/src/compiler/nir/nir_divergence_analysis.c -+++ b/src/compiler/nir/nir_divergence_analysis.c -@@ -39,6 +39,7 @@ - struct divergence_state { - const gl_shader_stage stage; - nir_shader *shader; -+ nir_function_impl *impl; - nir_divergence_options options; - nir_loop *loop; - -@@ -713,11 +714,15 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) - src_divergent(instr->src[1], state); - break; - -+ case nir_intrinsic_load_param: -+ is_divergent = -+ !state->impl->function->params[nir_intrinsic_param_idx(instr)].is_uniform; -+ break; -+ - /* Intrinsics which are always divergent */ - case nir_intrinsic_inverse_ballot: - case nir_intrinsic_load_color0: - case nir_intrinsic_load_color1: -- case nir_intrinsic_load_param: - case nir_intrinsic_load_sample_id: - case nir_intrinsic_load_sample_id_no_per_sample: - case nir_intrinsic_load_sample_mask_in: -@@ -1089,8 +1094,9 @@ instr_is_loop_invariant(nir_instr *instr, struct divergence_state *state) - case nir_instr_type_deref: - case nir_instr_type_tex: - return nir_foreach_src(instr, src_invariant, state->loop); -- case nir_instr_type_phi: - case nir_instr_type_call: -+ return false; -+ case nir_instr_type_phi: - case nir_instr_type_parallel_copy: - default: - unreachable("NIR divergence analysis: Unsupported instruction type."); -@@ -1115,9 +1121,10 @@ update_instr_divergence(nir_instr *instr, struct divergence_state *state) - return visit_deref(state->shader, nir_instr_as_deref(instr), state); - case nir_instr_type_debug_info: - return false; -+ case nir_instr_type_call: -+ return false; - case nir_instr_type_jump: - case nir_instr_type_phi: -- case nir_instr_type_call: - case nir_instr_type_parallel_copy: - default: - unreachable("NIR divergence analysis: Unsupported instruction type."); -@@ -1405,6 +1412,7 @@ nir_divergence_analysis_impl(nir_function_impl *impl, nir_divergence_options opt - struct divergence_state state = { - .stage = impl->function->shader->info.stage, - .shader = impl->function->shader, -+ .impl = impl, - .options = options, - .loop = NULL, - .divergent_loop_cf = false, -@@ -1422,8 +1430,10 @@ void - nir_divergence_analysis(nir_shader *shader) - { - shader->info.divergence_analysis_run = true; -- nir_divergence_analysis_impl(nir_shader_get_entrypoint(shader), -- shader->options->divergence_analysis_options); -+ nir_foreach_function_impl(impl, shader) { -+ nir_divergence_analysis_impl(impl, -+ shader->options->divergence_analysis_options); -+ } - } - - /* Compute divergence between vertices of 
the same primitive. This uses -@@ -1444,10 +1454,13 @@ nir_vertex_divergence_analysis(nir_shader *shader) - .first_visit = true, - }; - -- nir_metadata_require(nir_shader_get_entrypoint(shader), -- nir_metadata_block_index); -- visit_cf_list(&nir_shader_get_entrypoint(shader)->body, &state); -- nir_metadata_preserve(nir_shader_get_entrypoint(shader), nir_metadata_all); -+ nir_foreach_function_impl(impl, shader) { -+ state.first_visit = true; -+ state.impl = impl; -+ nir_metadata_require(impl, nir_metadata_block_index); -+ visit_cf_list(&impl->body, &state); -+ nir_metadata_preserve(impl, nir_metadata_all); -+ } - } - - bool -diff --git a/src/compiler/nir/nir_functions.c b/src/compiler/nir/nir_functions.c -index 3ad986f697905..355161cf1b40d 100644 ---- a/src/compiler/nir/nir_functions.c -+++ b/src/compiler/nir/nir_functions.c -@@ -194,7 +194,10 @@ static bool inline_functions_pass(nir_builder *b, - return false; - - nir_call_instr *call = nir_instr_as_call(instr); -- assert(call->callee->impl); -+ if (!call->callee->impl) -+ return false; -+ -+ assert(!call->indirect_callee.ssa); - - if (b->shader->options->driver_functions && - b->shader->info.stage == MESA_SHADER_KERNEL) { -diff --git a/src/compiler/nir/nir_gather_info.c b/src/compiler/nir/nir_gather_info.c -index a5932cf3b3082..9af452acfb546 100644 ---- a/src/compiler/nir/nir_gather_info.c -+++ b/src/compiler/nir/nir_gather_info.c -@@ -954,8 +954,10 @@ gather_func_info(nir_function_impl *func, nir_shader *shader, - nir_call_instr *call = nir_instr_as_call(instr); - nir_function_impl *impl = call->callee->impl; - -- assert(impl || !"nir_shader_gather_info only works with linked shaders"); -- gather_func_info(impl, shader, visited_funcs, dead_ctx); -+ if (!call->indirect_callee.ssa) -+ assert(impl || !"nir_shader_gather_info only works with linked shaders"); -+ if (impl) -+ gather_func_info(impl, shader, visited_funcs, dead_ctx); - break; - } - default: -diff --git a/src/compiler/nir/nir_inline_helpers.h b/src/compiler/nir/nir_inline_helpers.h -index 8f3994f5353d6..17f2581cceee1 100644 ---- a/src/compiler/nir/nir_inline_helpers.h -+++ b/src/compiler/nir/nir_inline_helpers.h -@@ -107,6 +107,8 @@ nir_foreach_src(nir_instr *instr, nir_foreach_src_cb cb, void *state) - } - case nir_instr_type_call: { - nir_call_instr *call = nir_instr_as_call(instr); -+ if (call->indirect_callee.ssa && !_nir_visit_src(&call->indirect_callee, cb, state)) -+ return false; - for (unsigned i = 0; i < call->num_params; i++) { - if (!_nir_visit_src(&call->params[i], cb, state)) - return false; -diff --git a/src/compiler/nir/nir_lower_memory_model.c b/src/compiler/nir/nir_lower_memory_model.c -index 95d9f4e9526dc..c797eae8a4eb5 100644 ---- a/src/compiler/nir/nir_lower_memory_model.c -+++ b/src/compiler/nir/nir_lower_memory_model.c -@@ -229,21 +229,24 @@ nir_lower_memory_model(nir_shader *shader) - { - bool progress = false; - -- nir_function_impl *impl = nir_shader_get_entrypoint(shader); -- struct exec_list *cf_list = &impl->body; -- -- uint32_t modes = 0; -- foreach_list_typed(nir_cf_node, cf_node, node, cf_list) -- progress |= lower_make_visible(cf_node, &modes); -- -- modes = 0; -- foreach_list_typed_reverse(nir_cf_node, cf_node, node, cf_list) -- progress |= lower_make_available(cf_node, &modes); -- -- if (progress) -- nir_metadata_preserve(impl, nir_metadata_control_flow); -- else -- nir_metadata_preserve(impl, nir_metadata_all); -+ nir_foreach_function_impl(impl, shader) { -+ bool impl_progress = false; -+ struct exec_list *cf_list = &impl->body; -+ -+ 
uint32_t modes = 0; -+ foreach_list_typed(nir_cf_node, cf_node, node, cf_list) -+ impl_progress |= lower_make_visible(cf_node, &modes); -+ -+ modes = 0; -+ foreach_list_typed_reverse(nir_cf_node, cf_node, node, cf_list) -+ impl_progress |= lower_make_available(cf_node, &modes); -+ -+ if (impl_progress) -+ nir_metadata_preserve(impl, nir_metadata_control_flow); -+ else -+ nir_metadata_preserve(impl, nir_metadata_all); -+ progress |= impl_progress; -+ } - - return progress; - } -diff --git a/src/compiler/nir/nir_metadata.c b/src/compiler/nir/nir_metadata.c -index e0085991bbc06..29e2ceaa499d1 100644 ---- a/src/compiler/nir/nir_metadata.c -+++ b/src/compiler/nir/nir_metadata.c -@@ -61,6 +61,19 @@ nir_metadata_require(nir_function_impl *impl, nir_metadata required, ...) - void - nir_metadata_preserve(nir_function_impl *impl, nir_metadata preserved) - { -+ /* If we discard valid liveness information, immediately free the -+ * liveness information for each block. For large shaders, it can -+ * consume a huge amount of memory, and it's usually not immediately -+ * needed after dirtying. -+ */ -+ if ((impl->valid_metadata & ~preserved) & nir_metadata_live_defs) { -+ nir_foreach_block(block, impl) { -+ ralloc_free(block->live_in); -+ ralloc_free(block->live_out); -+ block->live_in = block->live_out = NULL; -+ } -+ } -+ - impl->valid_metadata &= preserved; - } - -diff --git a/src/compiler/nir/nir_opt_call.c b/src/compiler/nir/nir_opt_call.c -new file mode 100644 -index 0000000000000..421f78096042a ---- /dev/null -+++ b/src/compiler/nir/nir_opt_call.c -@@ -0,0 +1,259 @@ -+/* -+ * Copyright © 2024 Valve Corporation -+ * SPDX-License-Identifier: MIT -+ */ -+ -+#include "nir.h" -+#include "nir_builder.h" -+#include "nir_phi_builder.h" -+ -+struct call_liveness_entry { -+ struct list_head list; -+ nir_call_instr *instr; -+ const BITSET_WORD *live_set; -+}; -+ -+static bool -+can_remat_instr(nir_instr *instr) -+{ -+ switch (instr->type) { -+ case nir_instr_type_alu: -+ case nir_instr_type_load_const: -+ case nir_instr_type_undef: -+ return true; -+ case nir_instr_type_intrinsic: -+ switch (nir_instr_as_intrinsic(instr)->intrinsic) { -+ case nir_intrinsic_load_ray_launch_id: -+ case nir_intrinsic_load_ray_launch_size: -+ case nir_intrinsic_vulkan_resource_index: -+ case nir_intrinsic_vulkan_resource_reindex: -+ case nir_intrinsic_load_vulkan_descriptor: -+ case nir_intrinsic_load_push_constant: -+ case nir_intrinsic_load_global_constant: -+ case nir_intrinsic_load_smem_amd: -+ case nir_intrinsic_load_scalar_arg_amd: -+ case nir_intrinsic_load_vector_arg_amd: -+ return true; -+ default: -+ return false; -+ } -+ default: -+ return false; -+ } -+} -+ -+static void -+remat_ssa_def(nir_builder *b, nir_def *def, struct hash_table *remap_table, -+ struct hash_table *phi_value_table, struct nir_phi_builder *phi_builder, -+ BITSET_WORD *def_blocks) -+{ -+ memset(def_blocks, 0, BITSET_WORDS(b->impl->num_blocks) * sizeof(BITSET_WORD)); -+ BITSET_SET(def_blocks, def->parent_instr->block->index); -+ BITSET_SET(def_blocks, nir_cursor_current_block(b->cursor)->index); -+ struct nir_phi_builder_value *val = nir_phi_builder_add_value(phi_builder, def->num_components, def->bit_size, def_blocks); -+ _mesa_hash_table_insert(phi_value_table, def, val); -+ -+ nir_instr *clone = nir_instr_clone_deep(b->shader, def->parent_instr, remap_table); -+ nir_builder_instr_insert(b, clone); -+ nir_def *new_def = nir_instr_def(clone); -+ -+ _mesa_hash_table_insert(remap_table, def, new_def); -+ if (nir_cursor_current_block(b->cursor)->index 
!= def->parent_instr->block->index) -+ nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def); -+ nir_phi_builder_value_set_block_def(val, nir_cursor_current_block(b->cursor), new_def); -+} -+ -+struct remat_chain_check_data { -+ struct hash_table *remap_table; -+ unsigned chain_length; -+}; -+ -+static bool -+can_remat_chain(nir_src *src, void *data) -+{ -+ struct remat_chain_check_data *check_data = data; -+ -+ if (_mesa_hash_table_search(check_data->remap_table, src->ssa)) -+ return true; -+ -+ if (!can_remat_instr(src->ssa->parent_instr)) -+ return false; -+ -+ if (check_data->chain_length++ >= 16) -+ return false; -+ -+ return nir_foreach_src(src->ssa->parent_instr, can_remat_chain, check_data); -+} -+ -+struct remat_chain_data { -+ nir_builder *b; -+ struct hash_table *remap_table; -+ struct hash_table *phi_value_table; -+ struct nir_phi_builder *phi_builder; -+ BITSET_WORD *def_blocks; -+}; -+ -+static bool -+do_remat_chain(nir_src *src, void *data) -+{ -+ struct remat_chain_data *remat_data = data; -+ -+ if (_mesa_hash_table_search(remat_data->remap_table, src->ssa)) -+ return true; -+ -+ nir_foreach_src(src->ssa->parent_instr, do_remat_chain, remat_data); -+ -+ remat_ssa_def(remat_data->b, src->ssa, remat_data->remap_table, remat_data->phi_value_table, remat_data->phi_builder, remat_data->def_blocks); -+ return true; -+} -+ -+struct src_rewrite_ctx { -+ struct hash_table *phi_value_table; -+ nir_instr *instr; -+}; -+ -+static bool -+rewrite_instr_src_from_phi_builder(nir_src *src, void *data) -+{ -+ struct src_rewrite_ctx *ctx = data; -+ -+ if (nir_src_is_const(*src)) { -+ nir_builder b = nir_builder_at(nir_before_instr(ctx->instr)); -+ nir_src_rewrite(src, nir_build_imm(&b, src->ssa->num_components, src->ssa->bit_size, nir_src_as_const_value(*src))); -+ return true; -+ } -+ -+ struct hash_entry *entry = _mesa_hash_table_search(ctx->phi_value_table, src->ssa); -+ if (!entry) -+ return true; -+ -+ nir_block *block = nir_src_parent_instr(src)->block; -+ nir_def *new_def = nir_phi_builder_value_get_block_def(entry->data, block); -+ -+ bool can_rewrite = true; -+ if (new_def->parent_instr->block == block && new_def->index != UINT32_MAX) -+ can_rewrite = nir_src_parent_instr(src) != nir_block_first_instr(block) && -+ !nir_instr_is_between(nir_block_first_instr(block), -+ new_def->parent_instr, -+ nir_src_parent_instr(src)); -+ -+ if (can_rewrite) -+ nir_src_rewrite(src, new_def); -+ return true; -+} -+ -+static bool -+nir_minimize_call_live_states_impl(nir_function_impl *impl) -+{ -+ nir_metadata_require(impl, nir_metadata_block_index | nir_metadata_live_defs | nir_metadata_dominance); -+ bool progress = false; -+ void *mem_ctx = ralloc_context(NULL); -+ -+ struct list_head call_list; -+ list_inithead(&call_list); -+ unsigned num_defs = impl->ssa_alloc; -+ -+ nir_def **rematerializable = rzalloc_array_size(mem_ctx, sizeof(nir_def *), num_defs); -+ -+ nir_foreach_block(block, impl) { -+ nir_foreach_instr(instr, block) { -+ nir_def *def = nir_instr_def(instr); -+ if (def && -+ can_remat_instr(instr)) { -+ rematerializable[def->index] = def; -+ } -+ -+ if (instr->type != nir_instr_type_call) -+ continue; -+ nir_call_instr *call = nir_instr_as_call(instr); -+ if (!call->indirect_callee.ssa) -+ continue; -+ -+ struct call_liveness_entry *entry = ralloc_size(mem_ctx, sizeof(struct call_liveness_entry)); -+ entry->instr = call; -+ entry->live_set = nir_get_live_defs(nir_after_instr(instr), mem_ctx); -+ list_addtail(&entry->list, &call_list); -+ } -+ } -+ -+ const unsigned 
block_words = BITSET_WORDS(impl->num_blocks); -+ BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words); -+ -+ list_for_each_entry(struct call_liveness_entry, entry, &call_list, list) { -+ unsigned i; -+ -+ nir_builder b = nir_builder_at(nir_after_instr(&entry->instr->instr)); -+ -+ struct nir_phi_builder *builder = nir_phi_builder_create(impl); -+ struct hash_table *phi_value_table = -+ _mesa_pointer_hash_table_create(mem_ctx); -+ struct hash_table *remap_table = -+ _mesa_pointer_hash_table_create(mem_ctx); -+ -+ BITSET_FOREACH_SET(i, entry->live_set, num_defs) { -+ if (!rematerializable[i] || _mesa_hash_table_search(remap_table, rematerializable[i])) -+ continue; -+ -+ progress = true; -+ assert(!_mesa_hash_table_search(phi_value_table, rematerializable[i])); -+ -+ struct remat_chain_check_data check_data = { -+ .remap_table = remap_table, -+ .chain_length = 1, -+ }; -+ -+ if (!nir_foreach_src(rematerializable[i]->parent_instr, can_remat_chain, &check_data)) -+ continue; -+ -+ struct remat_chain_data remat_data = { -+ .b = &b, -+ .remap_table = remap_table, -+ .phi_value_table = phi_value_table, -+ .phi_builder = builder, -+ .def_blocks = def_blocks, -+ }; -+ -+ nir_foreach_src(rematerializable[i]->parent_instr, do_remat_chain, &remat_data); -+ -+ remat_ssa_def(&b, rematerializable[i], remap_table, phi_value_table, builder, def_blocks); -+ } -+ _mesa_hash_table_destroy(remap_table, NULL); -+ -+ nir_foreach_block(block, impl) { -+ nir_foreach_instr(instr, block) { -+ if (instr->type == nir_instr_type_phi) -+ continue; -+ -+ struct src_rewrite_ctx ctx = { -+ .phi_value_table = phi_value_table, -+ .instr = instr, -+ }; -+ nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &ctx); -+ } -+ } -+ -+ nir_phi_builder_finish(builder); -+ _mesa_hash_table_destroy(phi_value_table, NULL); -+ } -+ -+ ralloc_free(mem_ctx); -+ -+ nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); -+ return progress; -+} -+ -+/* Tries to rematerialize as many live vars as possible after calls. -+ * Note: nir_opt_cse will undo any rematerializations done by this pass, -+ * so it shouldn't be run afterward. 
-+ */ -+bool -+nir_minimize_call_live_states(nir_shader *shader) -+{ -+ bool progress = false; -+ -+ nir_foreach_function_impl(impl, shader) { -+ progress |= nir_minimize_call_live_states_impl(impl); -+ } -+ -+ return progress; -+} -\ No newline at end of file -diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c -index 41f3eae83fe7d..ff90a20320268 100644 ---- a/src/compiler/nir/nir_print.c -+++ b/src/compiler/nir/nir_print.c -@@ -1884,7 +1884,14 @@ print_call_instr(nir_call_instr *instr, print_state *state) - - print_no_dest_padding(state); - -+ bool indirect = instr->indirect_callee.ssa; -+ - fprintf(fp, "call %s ", instr->callee->name); -+ if (indirect) { -+ fprintf(fp, "(indirect "); -+ print_src(&instr->indirect_callee, state, nir_type_invalid); -+ fprintf(fp, ") "); -+ } - - for (unsigned i = 0; i < instr->num_params; i++) { - if (i != 0) -diff --git a/src/compiler/nir/nir_serialize.c b/src/compiler/nir/nir_serialize.c -index 2735683dd083f..ad77c88a47840 100644 ---- a/src/compiler/nir/nir_serialize.c -+++ b/src/compiler/nir/nir_serialize.c -@@ -1975,6 +1975,8 @@ write_function(write_ctx *ctx, const nir_function *fxn) - if (fxn->name) - blob_write_string(ctx->blob, fxn->name); - -+ blob_write_uint32(ctx->blob, fxn->driver_attributes); -+ - blob_write_uint32(ctx->blob, fxn->subroutine_index); - blob_write_uint32(ctx->blob, fxn->num_subroutine_types); - for (unsigned i = 0; i < fxn->num_subroutine_types; i++) { -@@ -1988,9 +1990,14 @@ write_function(write_ctx *ctx, const nir_function *fxn) - uint32_t val = - ((uint32_t)fxn->params[i].num_components) | - ((uint32_t)fxn->params[i].bit_size) << 8; -+ if (fxn->params[i].is_return) -+ val |= (1u << 16); -+ if (fxn->params[i].is_uniform) -+ val |= (1u << 17); - blob_write_uint32(ctx->blob, val); - encode_type_to_blob(ctx->blob, fxn->params[i].type); - blob_write_uint32(ctx->blob, encode_deref_modes(fxn->params[i].mode)); -+ blob_write_uint32(ctx->blob, fxn->params[i].driver_attributes); - } - - /* At first glance, it looks like we should write the function_impl here. -@@ -2010,6 +2017,7 @@ read_function(read_ctx *ctx) - - nir_function *fxn = nir_function_create(ctx->nir, name); - -+ fxn->driver_attributes = blob_read_uint32(ctx->blob); - fxn->subroutine_index = blob_read_uint32(ctx->blob); - fxn->num_subroutine_types = blob_read_uint32(ctx->blob); - for (unsigned i = 0; i < fxn->num_subroutine_types; i++) { -@@ -2024,8 +2032,11 @@ read_function(read_ctx *ctx) - uint32_t val = blob_read_uint32(ctx->blob); - fxn->params[i].num_components = val & 0xff; - fxn->params[i].bit_size = (val >> 8) & 0xff; -+ fxn->params[i].is_return = val & (1u << 16); -+ fxn->params[i].is_uniform = val & (1u << 17); - fxn->params[i].type = decode_type_from_blob(ctx->blob); - fxn->params[i].mode = decode_deref_modes(blob_read_uint32(ctx->blob)); -+ fxn->params[i].driver_attributes = blob_read_uint32(ctx->blob); - } - - fxn->is_entrypoint = flags & 0x1; -diff --git a/src/compiler/nir/nir_sweep.c b/src/compiler/nir/nir_sweep.c -index 9acd60a60b875..009343c3cf957 100644 ---- a/src/compiler/nir/nir_sweep.c -+++ b/src/compiler/nir/nir_sweep.c -@@ -47,15 +47,6 @@ sweep_block(nir_shader *nir, nir_block *block) - { - ralloc_steal(nir, block); - -- /* sweep_impl will mark all metadata invalid. We can safely release all of -- * this here. 
-- */ -- ralloc_free(block->live_in); -- block->live_in = NULL; -- -- ralloc_free(block->live_out); -- block->live_out = NULL; -- - nir_foreach_instr(instr, block) { - gc_mark_live(nir->gctx, instr); - -diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c -index ee2c9cd32c4aa..1f712962556d9 100644 ---- a/src/compiler/nir/nir_validate.c -+++ b/src/compiler/nir/nir_validate.c -@@ -966,6 +966,11 @@ validate_call_instr(nir_call_instr *instr, validate_state *state) - { - validate_assert(state, instr->num_params == instr->callee->num_params); - -+ if (instr->indirect_callee.ssa) { -+ validate_assert(state, !instr->callee->impl); -+ validate_src(&instr->indirect_callee, state); -+ } -+ - for (unsigned i = 0; i < instr->num_params; i++) { - validate_sized_src(&instr->params[i], state, - instr->callee->params[i].bit_size, -diff --git a/src/compiler/spirv/vtn_cfg.c b/src/compiler/spirv/vtn_cfg.c -index e1b9d21ecfc10..e2afb3f8eaaa9 100644 ---- a/src/compiler/spirv/vtn_cfg.c -+++ b/src/compiler/spirv/vtn_cfg.c -@@ -55,6 +55,7 @@ glsl_type_add_to_function_params(const struct glsl_type *type, - func->params[(*param_idx)++] = (nir_parameter) { - .num_components = glsl_get_vector_elements(type), - .bit_size = glsl_get_bit_size(type), -+ .type = type, - }; - } else if (glsl_type_is_array_or_matrix(type)) { - unsigned elems = glsl_get_length(type); -@@ -290,6 +291,8 @@ vtn_cfg_handle_prepass_instruction(struct vtn_builder *b, SpvOp opcode, - func->params[idx++] = (nir_parameter) { - .num_components = nir_address_format_num_components(addr_format), - .bit_size = nir_address_format_bit_size(addr_format), -+ .is_return = true, -+ .type = func_type->return_type->type, - }; - } - --- -GitLab - - -From 4c4b5a7e7b853d0ddcde5436d58cfa43c310d401 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Thu, 3 Oct 2024 15:58:19 +0200 -Subject: [PATCH 05/71] aco/lower_to_hw_instr: Also consider operand alignment - requirements - ---- - src/amd/compiler/aco_lower_to_hw_instr.cpp | 7 ++++--- - 1 file changed, 4 insertions(+), 3 deletions(-) - -diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp -index 0e18aa66069f8..fa3c805f491b5 100644 ---- a/src/amd/compiler/aco_lower_to_hw_instr.cpp -+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp -@@ -1191,16 +1191,17 @@ split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op, - if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) && - src.def.regClass().type() == RegType::vgpr) - max_size = MIN2(max_size, 4); -- unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16; -+ unsigned max_def_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16; -+ unsigned max_op_align = src.op.regClass().type() == RegType::vgpr ? 
4 : 16; - - /* make sure the size is a power of two and reg % bytes == 0 */ - unsigned bytes = 1; - for (; bytes <= max_size; bytes *= 2) { - unsigned next = bytes * 2u; -- bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 && -+ bool can_increase = def_reg.reg_b % MIN2(next, max_def_align) == 0 && - offset + next <= src.bytes && next <= max_size; - if (!src.op.isConstant() && can_increase) -- can_increase = op_reg.reg_b % MIN2(next, max_align) == 0; -+ can_increase = op_reg.reg_b % MIN2(next, max_op_align) == 0; - for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++) - can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0); - if (!can_increase) --- -GitLab - - -From 325296b50ec3a85b9400189aec2b65b4c18bc40d Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Thu, 3 Oct 2024 15:58:45 +0200 -Subject: [PATCH 06/71] aco/ra: Disallow unaligned SGPR assignment - ---- - src/amd/compiler/aco_register_allocation.cpp | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index fc62487627fad..a8068b0da316a 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -2115,6 +2115,9 @@ operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr& instr, unsign - return false; - } - -+ if (rc.type() == RegType::sgpr && reg.reg() % rc.size()) -+ return false; -+ - switch (instr->format) { - case Format::SMEM: - return reg != scc && reg != exec && --- -GitLab - - -From 50d5f59160434a154a93d2c8db9eca0a27551416 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Fri, 4 Oct 2024 07:20:12 +0200 -Subject: [PATCH 07/71] aco/ra: Fix SGPR parallelcopy operands straddling - 64-reg boundary - ---- - src/amd/compiler/aco_register_allocation.cpp | 18 +++++++++++++++--- - 1 file changed, 15 insertions(+), 3 deletions(-) - -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index a8068b0da316a..3ce0680bf52d6 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -3009,12 +3009,24 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector& parallelcopy - if (!sgpr_operands_alias_defs) { - unsigned reg = parallelcopy[i].op.physReg().reg(); - unsigned size = parallelcopy[i].op.getTemp().size(); -- sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); -+ if ((reg + size) / 64u == reg / 64u) { -+ sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); -+ } else { -+ sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, 64u - (reg % 64u)); -+ sgpr_operands[(reg + size) / 64u] |= u_bit_consecutive64(0, (reg + size) % 64u); -+ } - - reg = parallelcopy[i].def.physReg().reg(); - size = parallelcopy[i].def.getTemp().size(); -- if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) -- sgpr_operands_alias_defs = true; -+ if ((reg + size) / 64u == reg / 64u) { -+ if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) -+ sgpr_operands_alias_defs = true; -+ } else { -+ if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, 64u - (reg % 64u))) -+ sgpr_operands_alias_defs = true; -+ if (sgpr_operands[(reg + size) / 64u] & u_bit_consecutive64(0, (reg + size) % 64u)) -+ sgpr_operands_alias_defs = true; -+ } - } - } - --- -GitLab - - -From 0d80a9a6eb1d317727688914ad8f612dc7bace13 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 17 Jun 2024 13:13:21 +0200 -Subject: 
[PATCH 08/71] radv: Gather info for all functions - ---- - src/amd/vulkan/radv_pipeline.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c -index a9df9b6b8aea3..82a5aac71437d 100644 ---- a/src/amd/vulkan/radv_pipeline.c -+++ b/src/amd/vulkan/radv_pipeline.c -@@ -431,7 +431,9 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat - NIR_PASS(_, stage->nir, nir_opt_constant_folding); - - /* Gather info again, to update whether 8/16-bit are used. */ -- nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir)); -+ nir_foreach_function_impl (impl, stage->nir) -+ if (impl->function->is_entrypoint || impl->function->is_exported) -+ nir_shader_gather_info(stage->nir, impl); - } - } - --- -GitLab - - -From 5e1e7090670cf7db02ea16a86790104a008c8813 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:27:06 +0200 -Subject: [PATCH 09/71] nir/intrinsics: Add incoming/outgoing payload - load/store instructions - -With RT function calls, these are going to get lowered to: -- load/store_param (incoming payload) -- load/store_var (outgoing payload) ---- - src/compiler/nir/nir_intrinsics.py | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py -index 31af10c320ba8..798e961c0c8e3 100644 ---- a/src/compiler/nir/nir_intrinsics.py -+++ b/src/compiler/nir/nir_intrinsics.py -@@ -1703,6 +1703,10 @@ intrinsic("execute_miss_amd", src_comp=[1]) - # BASE=dword index - intrinsic("load_hit_attrib_amd", dest_comp=1, bit_sizes=[32], indices=[BASE]) - intrinsic("store_hit_attrib_amd", src_comp=[1], indices=[BASE]) -+intrinsic("load_incoming_ray_payload_amd", dest_comp=1, bit_sizes=[32], indices=[BASE]) -+intrinsic("store_incoming_ray_payload_amd", src_comp=[1], indices=[BASE]) -+intrinsic("load_outgoing_ray_payload_amd", dest_comp=1, bit_sizes=[32], indices=[BASE]) -+intrinsic("store_outgoing_ray_payload_amd", src_comp=[1], indices=[BASE]) - - # Load forced VRS rates. - intrinsic("load_force_vrs_rates_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER]) --- -GitLab - - -From 47aae01aefb03df60f1ca9e6c80f17b76a83f031 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Thu, 6 Jun 2024 08:07:34 +0200 -Subject: [PATCH 10/71] radv: Temporarily disable RT pipelines - ---- - src/amd/vulkan/radv_physical_device.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c -index 5022ead6c9d76..98826470d4d60 100644 ---- a/src/amd/vulkan/radv_physical_device.c -+++ b/src/amd/vulkan/radv_physical_device.c -@@ -111,6 +111,10 @@ radv_filter_minmax_enabled(const struct radv_physical_device *pdev) - bool - radv_enable_rt(const struct radv_physical_device *pdev, bool rt_pipelines) - { -+ /* Temporarily under construction! */ -+ if (rt_pipelines) -+ return false; -+ - if (pdev->info.gfx_level < GFX10_3 && !radv_emulate_rt(pdev)) - return false; - --- -GitLab - - -From e268331ef1d7dcd0bb7642286f358ce7ccd50a5c Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:28:21 +0200 -Subject: [PATCH 11/71] nir: Remove - nir_intrinsic_load_rt_arg_scratch_offset_amd - -Not needed anymore. 
---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 11 ----------- - src/amd/vulkan/radv_pipeline_rt.c | 1 - - src/compiler/nir/nir_divergence_analysis.c | 1 - - src/compiler/nir/nir_intrinsics.py | 3 --- - 4 files changed, 16 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 8dcd853aa724d..9224c169319fc 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -488,10 +488,6 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) - nir_src_rewrite(&intr->src[1], nir_iadd_nuw(b, nir_load_var(b, vars->stack_ptr), intr->src[1].ssa)); - return true; - } -- case nir_intrinsic_load_rt_arg_scratch_offset_amd: { -- ret = nir_load_var(b, vars->arg); -- break; -- } - case nir_intrinsic_load_shader_record_ptr: { - ret = nir_load_var(b, vars->shader_record_ptr); - break; -@@ -1086,12 +1082,6 @@ lower_any_hit_for_intersection(nir_shader *any_hit) - b->cursor = nir_before_instr(instr); - nir_src_rewrite(&intrin->src[1], nir_iadd_nuw(b, scratch_offset, intrin->src[1].ssa)); - break; -- case nir_intrinsic_load_rt_arg_scratch_offset_amd: -- b->cursor = nir_after_instr(instr); -- nir_def *arg_offset = nir_isub(b, &intrin->def, scratch_offset); -- nir_def_rewrite_uses_after(&intrin->def, arg_offset, arg_offset->parent_instr); -- break; -- - default: - break; - } -@@ -1732,7 +1722,6 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_ - nir_store_var(&b, vars.cull_mask_and_flags, nir_load_cull_mask_and_flags_amd(&b), 0x1); - nir_store_var(&b, vars.origin, nir_load_ray_world_origin(&b), 0x7); - nir_store_var(&b, vars.direction, nir_load_ray_world_direction(&b), 0x7); -- nir_store_var(&b, vars.arg, nir_load_rt_arg_scratch_offset_amd(&b), 0x1); - nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1); - - radv_build_traversal(device, pipeline, pCreateInfo, false, &b, &vars, false, info); -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index 8d9ba4d6047a6..11acaa74dfc54 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -318,7 +318,6 @@ should_move_rt_instruction(nir_intrinsic_instr *instr) - switch (instr->intrinsic) { - case nir_intrinsic_load_hit_attrib_amd: - return nir_intrinsic_base(instr) < RADV_MAX_HIT_ATTRIB_DWORDS; -- case nir_intrinsic_load_rt_arg_scratch_offset_amd: - case nir_intrinsic_load_ray_flags: - case nir_intrinsic_load_ray_object_origin: - case nir_intrinsic_load_ray_world_origin: -diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c -index 183b3bc81820d..78943c897922f 100644 ---- a/src/compiler/nir/nir_divergence_analysis.c -+++ b/src/compiler/nir/nir_divergence_analysis.c -@@ -835,7 +835,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) - case nir_intrinsic_load_packed_passthrough_primitive_amd: - case nir_intrinsic_load_initial_edgeflags_amd: - case nir_intrinsic_gds_atomic_add_amd: -- case nir_intrinsic_load_rt_arg_scratch_offset_amd: - case nir_intrinsic_load_intersection_opaque_amd: - case nir_intrinsic_load_vector_arg_amd: - case nir_intrinsic_load_btd_stack_id_intel: -diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py -index 798e961c0c8e3..2a6de0c4b6f25 100644 ---- a/src/compiler/nir/nir_intrinsics.py -+++ b/src/compiler/nir/nir_intrinsics.py -@@ -1673,9 +1673,6 @@ intrinsic("bvh64_intersect_ray_amd", [4, 2, 1, 3, 3, 3], 4, 
flags=[CAN_ELIMINATE - # Return of a callable in raytracing pipelines - intrinsic("rt_return_amd") - --# offset into scratch for the input callable data in a raytracing pipeline. --system_value("rt_arg_scratch_offset_amd", 1) -- - # Whether to call the anyhit shader for an intersection in an intersection shader. - system_value("intersection_opaque_amd", 1, bit_sizes=[1]) - --- -GitLab - - -From 8100ae695c5322e10227619b5e1b6027c2b35a02 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:31:55 +0200 -Subject: [PATCH 12/71] radv/rt: Remove RT priorities - -They have been been useful for ensuring reconvergence, but RT function -calls ensure that on their own now. ---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 37 ------------------------- - src/amd/vulkan/radv_cmd_buffer.c | 2 +- - src/amd/vulkan/radv_pipeline_rt.c | 2 +- - src/amd/vulkan/radv_shader.h | 27 ------------------ - 4 files changed, 2 insertions(+), 66 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 9224c169319fc..3f50c7297baae 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -1843,43 +1843,6 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, - lower_hit_attribs(shader, hit_attribs, 0); - } - --/** Select the next shader based on priorities: -- * -- * Detect the priority of the shader stage by the lowest bits in the address (low to high): -- * - Raygen - idx 0 -- * - Traversal - idx 1 -- * - Closest Hit / Miss - idx 2 -- * - Callable - idx 3 -- * -- * -- * This gives us the following priorities: -- * Raygen : Callable > > Traversal > Raygen -- * Traversal : > Chit / Miss > > Raygen -- * CHit / Miss : Callable > Chit / Miss > Traversal > Raygen -- * Callable : Callable > Chit / Miss > > Raygen -- */ --static nir_def * --select_next_shader(nir_builder *b, nir_def *shader_addr, unsigned wave_size) --{ -- gl_shader_stage stage = b->shader->info.stage; -- nir_def *prio = nir_iand_imm(b, shader_addr, radv_rt_priority_mask); -- nir_def *ballot = nir_ballot(b, 1, wave_size, nir_imm_bool(b, true)); -- nir_def *ballot_traversal = nir_ballot(b, 1, wave_size, nir_ieq_imm(b, prio, radv_rt_priority_traversal)); -- nir_def *ballot_hit_miss = nir_ballot(b, 1, wave_size, nir_ieq_imm(b, prio, radv_rt_priority_hit_miss)); -- nir_def *ballot_callable = nir_ballot(b, 1, wave_size, nir_ieq_imm(b, prio, radv_rt_priority_callable)); -- -- if (stage != MESA_SHADER_CALLABLE && stage != MESA_SHADER_INTERSECTION) -- ballot = nir_bcsel(b, nir_ine_imm(b, ballot_traversal, 0), ballot_traversal, ballot); -- if (stage != MESA_SHADER_RAYGEN) -- ballot = nir_bcsel(b, nir_ine_imm(b, ballot_hit_miss, 0), ballot_hit_miss, ballot); -- if (stage != MESA_SHADER_INTERSECTION) -- ballot = nir_bcsel(b, nir_ine_imm(b, ballot_callable, 0), ballot_callable, ballot); -- -- nir_def *lsb = nir_find_lsb(b, ballot); -- nir_def *next = nir_read_invocation(b, shader_addr, lsb); -- return nir_iand_imm(b, next, ~radv_rt_priority_mask); --} -- - static void - radv_store_arg(nir_builder *b, const struct radv_shader_args *args, const struct radv_ray_tracing_stage_info *info, - struct ac_arg arg, nir_def *value) -diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c -index d205cebbda64c..96bda7c3cf639 100644 ---- a/src/amd/vulkan/radv_cmd_buffer.c -+++ b/src/amd/vulkan/radv_cmd_buffer.c -@@ -7551,7 +7551,7 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct 
radv_compu - const uint32_t traversal_shader_addr_offset = radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_TRAVERSAL_SHADER_ADDR); - struct radv_shader *traversal_shader = cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION]; - if (traversal_shader_addr_offset && traversal_shader) { -- uint64_t traversal_va = traversal_shader->va | radv_rt_priority_traversal; -+ uint64_t traversal_va = traversal_shader->va; - radv_emit_shader_pointer(device, cmd_buffer->cs, traversal_shader_addr_offset, traversal_va, true); - } - } -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index 11acaa74dfc54..32a1cba1269f3 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -1138,7 +1138,7 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkRayTra - if (pipeline->groups[i].recursive_shader != VK_SHADER_UNUSED_KHR) { - struct radv_shader *shader = pipeline->stages[pipeline->groups[i].recursive_shader].shader; - if (shader) -- pipeline->groups[i].handle.recursive_shader_ptr = shader->va | radv_get_rt_priority(shader->info.stage); -+ pipeline->groups[i].handle.recursive_shader_ptr = shader->va; - } - } - -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index 300358a346dbb..968ebbe6d4af4 100644 ---- a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -682,33 +682,6 @@ nir_shader *radv_build_traversal_shader(struct radv_device *device, struct radv_ - const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, - struct radv_ray_tracing_stage_info *info); - --enum radv_rt_priority { -- radv_rt_priority_raygen = 0, -- radv_rt_priority_traversal = 1, -- radv_rt_priority_hit_miss = 2, -- radv_rt_priority_callable = 3, -- radv_rt_priority_mask = 0x3, --}; -- --static inline enum radv_rt_priority --radv_get_rt_priority(gl_shader_stage stage) --{ -- switch (stage) { -- case MESA_SHADER_RAYGEN: -- return radv_rt_priority_raygen; -- case MESA_SHADER_INTERSECTION: -- case MESA_SHADER_ANY_HIT: -- return radv_rt_priority_traversal; -- case MESA_SHADER_CLOSEST_HIT: -- case MESA_SHADER_MISS: -- return radv_rt_priority_hit_miss; -- case MESA_SHADER_CALLABLE: -- return radv_rt_priority_callable; -- default: -- unreachable("Unimplemented RT shader stage."); -- } --} -- - struct radv_shader_layout; - enum radv_pipeline_type; - --- -GitLab - - -From 8849cf03b0c29eb6b864a4056195ca7dc9f53a68 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:39:33 +0200 -Subject: [PATCH 13/71] radv/rt: Refactor radv_nir_lower_rt_vars - -Now we can use it on load/store instruction. Will be used for lowering -payloads to load/store_*_payload instructions. 
---- - .../nir/radv_nir_lower_hit_attrib_derefs.c | 93 ++++++++++++++----- - 1 file changed, 70 insertions(+), 23 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c -index 38e14dd4015fc..9db157dd4baf0 100644 ---- a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c -+++ b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c -@@ -10,13 +10,19 @@ - #include "radv_constants.h" - #include "radv_nir.h" - -+typedef nir_def *(*load_intrin_cb)(nir_builder *b, unsigned base); -+typedef void (*store_intrin_cb)(nir_builder *b, nir_def *val, unsigned base); -+ - struct lower_hit_attrib_deref_args { - nir_variable_mode mode; - uint32_t base_offset; -+ -+ load_intrin_cb load_cb; -+ store_intrin_cb store_cb; - }; - - static bool --lower_hit_attrib_deref(nir_builder *b, nir_instr *instr, void *data) -+lower_rt_var_deref(nir_builder *b, nir_instr *instr, void *data) - { - if (instr->type != nir_instr_type_intrinsic) - return false; -@@ -48,19 +54,16 @@ lower_hit_attrib_deref(nir_builder *b, nir_instr *instr, void *data) - uint32_t comp_offset = offset % 4; - - if (bit_size == 64) { -- components[comp] = nir_pack_64_2x32_split(b, nir_load_hit_attrib_amd(b, .base = base), -- nir_load_hit_attrib_amd(b, .base = base + 1)); -+ components[comp] = nir_pack_64_2x32_split(b, args->load_cb(b, base), args->load_cb(b, base + 1)); - } else if (bit_size == 32) { -- components[comp] = nir_load_hit_attrib_amd(b, .base = base); -+ components[comp] = args->load_cb(b, base); - } else if (bit_size == 16) { -- components[comp] = -- nir_channel(b, nir_unpack_32_2x16(b, nir_load_hit_attrib_amd(b, .base = base)), comp_offset / 2); -+ components[comp] = nir_channel(b, nir_unpack_32_2x16(b, args->load_cb(b, base)), comp_offset / 2); - } else if (bit_size == 8) { -- components[comp] = -- nir_channel(b, nir_unpack_bits(b, nir_load_hit_attrib_amd(b, .base = base), 8), comp_offset); -+ components[comp] = nir_channel(b, nir_unpack_bits(b, args->load_cb(b, base), 8), comp_offset); - } else { - assert(bit_size == 1); -- components[comp] = nir_i2b(b, nir_load_hit_attrib_amd(b, .base = base)); -+ components[comp] = nir_i2b(b, args->load_cb(b, base)); - } - } - -@@ -78,25 +81,25 @@ lower_hit_attrib_deref(nir_builder *b, nir_instr *instr, void *data) - nir_def *component = nir_channel(b, value, comp); - - if (bit_size == 64) { -- nir_store_hit_attrib_amd(b, nir_unpack_64_2x32_split_x(b, component), .base = base); -- nir_store_hit_attrib_amd(b, nir_unpack_64_2x32_split_y(b, component), .base = base + 1); -+ args->store_cb(b, nir_unpack_64_2x32_split_x(b, component), base); -+ args->store_cb(b, nir_unpack_64_2x32_split_y(b, component), base + 1); - } else if (bit_size == 32) { -- nir_store_hit_attrib_amd(b, component, .base = base); -+ args->store_cb(b, component, base); - } else if (bit_size == 16) { -- nir_def *prev = nir_unpack_32_2x16(b, nir_load_hit_attrib_amd(b, .base = base)); -+ nir_def *prev = nir_unpack_32_2x16(b, args->load_cb(b, base)); - nir_def *components[2]; - for (uint32_t word = 0; word < 2; word++) - components[word] = (word == comp_offset / 2) ? 
nir_channel(b, value, comp) : nir_channel(b, prev, word); -- nir_store_hit_attrib_amd(b, nir_pack_32_2x16(b, nir_vec(b, components, 2)), .base = base); -+ args->store_cb(b, nir_pack_32_2x16(b, nir_vec(b, components, 2)), base); - } else if (bit_size == 8) { -- nir_def *prev = nir_unpack_bits(b, nir_load_hit_attrib_amd(b, .base = base), 8); -+ nir_def *prev = nir_unpack_bits(b, args->load_cb(b, base), 8); - nir_def *components[4]; - for (uint32_t byte = 0; byte < 4; byte++) - components[byte] = (byte == comp_offset) ? nir_channel(b, value, comp) : nir_channel(b, prev, byte); -- nir_store_hit_attrib_amd(b, nir_pack_32_4x8(b, nir_vec(b, components, 4)), .base = base); -+ args->store_cb(b, nir_pack_32_4x8(b, nir_vec(b, components, 4)), base); - } else { - assert(bit_size == 1); -- nir_store_hit_attrib_amd(b, nir_b2i32(b, component), .base = base); -+ args->store_cb(b, nir_b2i32(b, component), base); - } - } - } -@@ -123,13 +126,14 @@ radv_lower_payload_arg_to_offset(nir_builder *b, nir_intrinsic_instr *instr, voi - } - - static bool --radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, uint32_t base_offset) -+radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, load_intrin_cb load_cb, store_intrin_cb store_cb, -+ uint32_t base_offset) - { - bool progress = false; - - progress |= nir_lower_indirect_derefs(shader, mode, UINT32_MAX); - -- progress |= nir_lower_vars_to_explicit_types(shader, mode, glsl_get_natural_size_align_bytes); -+ NIR_PASS(_, shader, nir_lower_vars_to_explicit_types, mode, glsl_get_natural_size_align_bytes); - - if (shader->info.stage == MESA_SHADER_RAYGEN && mode == nir_var_function_temp) - progress |= nir_shader_intrinsics_pass(shader, radv_lower_payload_arg_to_offset, nir_metadata_control_flow, NULL); -@@ -137,9 +141,11 @@ radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, uint32_t base - struct lower_hit_attrib_deref_args args = { - .mode = mode, - .base_offset = base_offset, -+ .load_cb = load_cb, -+ .store_cb = store_cb, - }; - -- progress |= nir_shader_instructions_pass(shader, lower_hit_attrib_deref, nir_metadata_control_flow, &args); -+ progress |= nir_shader_instructions_pass(shader, lower_rt_var_deref, nir_metadata_control_flow, &args); - - if (progress) { - nir_remove_dead_derefs(shader); -@@ -149,16 +155,57 @@ radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, uint32_t base - return progress; - } - -+static nir_def * -+load_hit_attrib_cb(nir_builder *b, unsigned base) -+{ -+ return nir_load_hit_attrib_amd(b, .base = base); -+} -+ -+static void -+store_hit_attrib_cb(nir_builder *b, nir_def *val, unsigned base) -+{ -+ nir_store_hit_attrib_amd(b, val, .base = base); -+} -+ - bool - radv_nir_lower_hit_attrib_derefs(nir_shader *shader) - { -- return radv_nir_lower_rt_vars(shader, nir_var_ray_hit_attrib, 0); -+ bool progress = false; -+ progress |= nir_lower_vars_to_explicit_types(shader, nir_var_ray_hit_attrib, glsl_get_natural_size_align_bytes); -+ progress |= radv_nir_lower_rt_vars(shader, nir_var_ray_hit_attrib, load_hit_attrib_cb, store_hit_attrib_cb, 0); -+ return progress; -+} -+ -+static nir_def * -+load_incoming_payload_cb(nir_builder *b, unsigned base) -+{ -+ return nir_load_incoming_ray_payload_amd(b, .base = base); -+} -+ -+static void -+store_incoming_payload_cb(nir_builder *b, nir_def *val, unsigned base) -+{ -+ nir_store_incoming_ray_payload_amd(b, val, .base = base); -+} -+ -+static nir_def * -+load_outgoing_payload_cb(nir_builder *b, unsigned base) -+{ -+ return 
nir_load_outgoing_ray_payload_amd(b, .base = base); -+} -+ -+static void -+store_outgoing_payload_cb(nir_builder *b, nir_def *val, unsigned base) -+{ -+ nir_store_outgoing_ray_payload_amd(b, val, .base = base); - } - - bool - radv_nir_lower_ray_payload_derefs(nir_shader *shader, uint32_t offset) - { -- bool progress = radv_nir_lower_rt_vars(shader, nir_var_function_temp, RADV_MAX_HIT_ATTRIB_SIZE + offset); -- progress |= radv_nir_lower_rt_vars(shader, nir_var_shader_call_data, RADV_MAX_HIT_ATTRIB_SIZE + offset); -+ bool progress = radv_nir_lower_rt_vars(shader, nir_var_function_temp, load_outgoing_payload_cb, -+ store_outgoing_payload_cb, offset); -+ progress |= radv_nir_lower_rt_vars(shader, nir_var_shader_call_data, load_incoming_payload_cb, -+ store_incoming_payload_cb, offset); - return progress; - } --- -GitLab - - -From 7c120680691e255437116f3219d1d4684d28a180 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:46:28 +0200 -Subject: [PATCH 14/71] radv/rt: Pass maximum payload size to - radv_rt_nir_to_asm - ---- - src/amd/vulkan/radv_pipeline_rt.c | 27 ++++++++++++++++++++++----- - 1 file changed, 22 insertions(+), 5 deletions(-) - -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index 32a1cba1269f3..0de6d1281b932 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -356,7 +356,7 @@ move_rt_instructions(nir_shader *shader) - static VkResult - radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, struct radv_ray_tracing_pipeline *pipeline, -- bool monolithic, struct radv_shader_stage *stage, uint32_t *stack_size, -+ bool monolithic, struct radv_shader_stage *stage, uint32_t *payload_size, uint32_t *stack_size, - struct radv_ray_tracing_stage_info *stage_info, - const struct radv_ray_tracing_stage_info *traversal_stage_info, - struct radv_serialized_shader_arena_block *replay_block, struct radv_shader **out_shader) -@@ -368,7 +368,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags); - bool keep_statistic_info = radv_pipeline_capture_shader_stats(device, pipeline->base.base.create_flags); - -- radv_nir_lower_rt_io(stage->nir, monolithic, 0); -+ radv_nir_lower_rt_io(stage->nir, monolithic, 0, payload_size); - - /* Gather shader info. 
*/ - nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir)); -@@ -586,6 +586,10 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca - if (!stages) - return VK_ERROR_OUT_OF_HOST_MEMORY; - -+ uint32_t payload_size = 0; -+ if (pCreateInfo->pLibraryInterface) -+ payload_size = pCreateInfo->pLibraryInterface->maxPipelineRayPayloadSize; -+ - bool library = pipeline->base.base.create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR; - - bool monolithic = !library; -@@ -605,6 +609,19 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca - - NIR_PASS(_, stage->nir, radv_nir_lower_hit_attrib_derefs); - -+ nir_foreach_variable_with_modes (var, stage->nir, nir_var_shader_call_data) { -+ unsigned size, alignment; -+ glsl_get_natural_size_align_bytes(var->type, &size, &alignment); -+ payload_size = MAX2(payload_size, size); -+ } -+ nir_foreach_function_impl (impl, stage->nir) { -+ nir_foreach_variable_in_list (var, &impl->locals) { -+ unsigned size, alignment; -+ glsl_get_natural_size_align_bytes(var->type, &size, &alignment); -+ payload_size = MAX2(payload_size, size); -+ } -+ } -+ - rt_stages[i].info = radv_gather_ray_tracing_stage_info(stage->nir); - - stage->feedback.duration = os_time_get_nano() - stage_start; -@@ -670,8 +687,8 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca - - bool monolithic_raygen = monolithic && stage->stage == MESA_SHADER_RAYGEN; - -- result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, monolithic_raygen, stage, &stack_size, -- &rt_stages[idx].info, NULL, replay_block, &rt_stages[idx].shader); -+ result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, monolithic_raygen, stage, &payload_size, -+ &stack_size, &rt_stages[idx].info, NULL, replay_block, &rt_stages[idx].shader); - if (result != VK_SUCCESS) - goto cleanup; - -@@ -728,7 +745,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca - .key = stage_keys[MESA_SHADER_INTERSECTION], - }; - radv_shader_layout_init(pipeline_layout, MESA_SHADER_INTERSECTION, &traversal_stage.layout); -- result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, NULL, NULL, -+ result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, &payload_size, NULL, NULL, - &traversal_info, NULL, &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]); - ralloc_free(traversal_nir); - --- -GitLab - - -From d7b329a6c5625895e7e020ee948d2c0b9c9e9329 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:47:46 +0200 -Subject: [PATCH 15/71] radv/rt: Track traversal shader stack size - ---- - src/amd/vulkan/radv_pipeline_rt.c | 14 ++++++++------ - src/amd/vulkan/radv_pipeline_rt.h | 1 + - 2 files changed, 9 insertions(+), 6 deletions(-) - -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index 0de6d1281b932..3c848361f13e3 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -745,8 +745,9 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca - .key = stage_keys[MESA_SHADER_INTERSECTION], - }; - radv_shader_layout_init(pipeline_layout, MESA_SHADER_INTERSECTION, &traversal_stage.layout); -- result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, &payload_size, NULL, NULL, -- &traversal_info, NULL, &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]); -+ result = 
radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, &payload_size, -+ &pipeline->traversal_stack_size, NULL, &traversal_info, NULL, -+ &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]); - ralloc_free(traversal_nir); - - cleanup: -@@ -807,10 +808,11 @@ compute_rt_stack_size(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, stru - unreachable("Invalid stage type in RT shader"); - } - } -- pipeline->stack_size = -- raygen_size + -- MIN2(pCreateInfo->maxPipelineRayRecursionDepth, 1) * MAX2(chit_miss_size, intersection_size + any_hit_size) + -- MAX2(0, (int)(pCreateInfo->maxPipelineRayRecursionDepth) - 1) * chit_miss_size + 2 * callable_size; -+ pipeline->stack_size = raygen_size + -+ MIN2(pCreateInfo->maxPipelineRayRecursionDepth, 1) * -+ (chit_miss_size + intersection_size + any_hit_size + pipeline->traversal_stack_size) + -+ MAX2(0, (int)(pCreateInfo->maxPipelineRayRecursionDepth) - 1) * chit_miss_size + -+ 2 * callable_size; - } - - static void -diff --git a/src/amd/vulkan/radv_pipeline_rt.h b/src/amd/vulkan/radv_pipeline_rt.h -index 99c0067325923..acfe978924a17 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.h -+++ b/src/amd/vulkan/radv_pipeline_rt.h -@@ -26,6 +26,7 @@ struct radv_ray_tracing_pipeline { - unsigned group_count; - - uint32_t stack_size; -+ uint32_t traversal_stack_size; - - /* set if any shaders from this pipeline require robustness2 in the merged traversal shader */ - bool traversal_storage_robustness2 : 1; --- -GitLab - - -From a48ee7d583587d09cf042045f5ae89d01a17f4ad Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:48:48 +0200 -Subject: [PATCH 16/71] radv/rt: Set stack size to scratch_bytes_per_wave - ---- - src/amd/vulkan/radv_pipeline_rt.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index 3c848361f13e3..c86e292a36244 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -460,6 +460,9 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - shader = radv_shader_create(device, cache, binary, keep_executable_info || dump_shader); - - if (shader) { -+ if (stack_size) -+ *stack_size += DIV_ROUND_UP(shader->config.scratch_bytes_per_wave, shader->info.wave_size); -+ - radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, binary, shader, shaders, num_shaders, - &stage->info); - --- -GitLab - - -From 4af66a35fb348043880ebb4c46893bfd6bebb7fc Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 12:15:20 +0200 -Subject: [PATCH 17/71] radv/rt: Use radv_get_rt_shader_entrypoint instead of - nir_shader_get_entrypoint - ---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 2 +- - src/amd/vulkan/radv_pipeline_rt.c | 2 +- - src/amd/vulkan/radv_shader.h | 9 +++++++++ - 3 files changed, 11 insertions(+), 2 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 3f50c7297baae..931c8c3e10ab1 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -1610,7 +1610,7 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin - radv_build_end_trace_token(b, vars, original_tmax, nir_load_var(b, trav_vars.hit), - nir_load_var(b, iteration_instance_count)); - -- nir_metadata_preserve(nir_shader_get_entrypoint(b->shader), nir_metadata_none); -+ nir_metadata_preserve(radv_get_rt_shader_entrypoint(b->shader), 
nir_metadata_none); - radv_nir_lower_hit_attrib_derefs(b->shader); - - /* Register storage for hit attributes */ -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index c86e292a36244..c4feea4a6f95b 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -424,7 +424,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - pipeline, monolithic, traversal_stage_info); - - /* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */ -- nir_shader_gather_info(temp_stage.nir, nir_shader_get_entrypoint(temp_stage.nir)); -+ nir_shader_gather_info(temp_stage.nir, radv_get_rt_shader_entrypoint(temp_stage.nir)); - - radv_optimize_nir(temp_stage.nir, stage->key.optimisations_disabled); - radv_postprocess_nir(device, NULL, &temp_stage); -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index 968ebbe6d4af4..36ad1d0dd8bf9 100644 ---- a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -507,6 +507,15 @@ struct radv_shader_stage; - void radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively); - void radv_optimize_nir_algebraic(nir_shader *shader, bool opt_offsets, bool opt_mqsad); - -+static inline nir_function_impl * -+radv_get_rt_shader_entrypoint(nir_shader *shader) -+{ -+ nir_foreach_function_impl (impl, shader) -+ if (impl->function->is_entrypoint || impl->function->is_exported) -+ return impl; -+ return NULL; -+} -+ - void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset); - - struct radv_ray_tracing_stage_info; --- -GitLab - - -From 38ac43cce19772daf5b566eee5128805a90e75a7 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Fri, 4 Oct 2024 05:48:26 +0200 -Subject: [PATCH 18/71] radv/rt: Only lower vars to explicit types for - monolithic shaders - ---- - src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c | 2 -- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 2 ++ - 2 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c -index 9db157dd4baf0..7efcad3675c6b 100644 ---- a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c -+++ b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c -@@ -133,8 +133,6 @@ radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, load_intrin_c - - progress |= nir_lower_indirect_derefs(shader, mode, UINT32_MAX); - -- NIR_PASS(_, shader, nir_lower_vars_to_explicit_types, mode, glsl_get_natural_size_align_bytes); -- - if (shader->info.stage == MESA_SHADER_RAYGEN && mode == nir_var_function_temp) - progress |= nir_shader_intrinsics_pass(shader, radv_lower_payload_arg_to_offset, nir_metadata_control_flow, NULL); - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 931c8c3e10ab1..c2b0e99f74129 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -851,6 +851,8 @@ radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset) - - NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset); - } else { -+ NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_natural_size_align_bytes); -+ NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_shader_temp, glsl_get_natural_size_align_bytes); - NIR_PASS(_, nir, radv_nir_lower_ray_payload_derefs, payload_offset); 
- } - } --- -GitLab - - -From c75c5ab22d84c3e168f0879aca26412d0d6d3668 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:54:05 +0200 -Subject: [PATCH 19/71] radv/rt: Lower monolithic ray payload load/store - instructions - ---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 98 +++++++++++++++++-------- - src/amd/vulkan/radv_shader.h | 2 +- - 2 files changed, 69 insertions(+), 31 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index c2b0e99f74129..061c58d45949f 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -731,12 +731,13 @@ lower_rt_instructions(nir_shader *shader, struct rt_variables *vars, bool late_l - nir_shader_instructions_pass(shader, radv_lower_rt_instruction, nir_metadata_none, &data); - } - --/* Lowers hit attributes to registers or shared memory. If hit_attribs is NULL, attributes are -+/* Lowers RT I/O vars to registers or shared memory. If hit_attribs is NULL, attributes are - * lowered to shared memory. */ - static void --lower_hit_attribs(nir_shader *shader, nir_variable **hit_attribs, uint32_t workgroup_size) -+lower_rt_storage(nir_shader *shader, nir_variable **hit_attribs, nir_deref_instr **payload_in, -+ nir_variable **payload_out, uint32_t workgroup_size) - { -- nir_function_impl *impl = nir_shader_get_entrypoint(shader); -+ nir_function_impl *impl = radv_get_rt_shader_entrypoint(shader); - - nir_foreach_variable_with_modes (attrib, shader, nir_var_ray_hit_attrib) - attrib->data.mode = nir_var_shader_temp; -@@ -750,29 +751,55 @@ lower_hit_attribs(nir_shader *shader, nir_variable **hit_attribs, uint32_t workg - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - if (intrin->intrinsic != nir_intrinsic_load_hit_attrib_amd && -- intrin->intrinsic != nir_intrinsic_store_hit_attrib_amd) -+ intrin->intrinsic != nir_intrinsic_store_hit_attrib_amd && -+ intrin->intrinsic != nir_intrinsic_load_incoming_ray_payload_amd && -+ intrin->intrinsic != nir_intrinsic_store_incoming_ray_payload_amd && -+ intrin->intrinsic != nir_intrinsic_load_outgoing_ray_payload_amd && -+ intrin->intrinsic != nir_intrinsic_store_outgoing_ray_payload_amd) - continue; - - b.cursor = nir_after_instr(instr); - -- nir_def *offset; -- if (!hit_attribs) -- offset = nir_imul_imm( -- &b, nir_iadd_imm(&b, nir_load_local_invocation_index(&b), nir_intrinsic_base(intrin) * workgroup_size), -- sizeof(uint32_t)); -- -- if (intrin->intrinsic == nir_intrinsic_load_hit_attrib_amd) { -- nir_def *ret; -- if (hit_attribs) -- ret = nir_load_var(&b, hit_attribs[nir_intrinsic_base(intrin)]); -+ if (intrin->intrinsic == nir_intrinsic_load_hit_attrib_amd || -+ intrin->intrinsic == nir_intrinsic_store_hit_attrib_amd) { -+ nir_def *offset; -+ if (!hit_attribs) -+ offset = nir_imul_imm( -+ &b, -+ nir_iadd_imm(&b, nir_load_local_invocation_index(&b), nir_intrinsic_base(intrin) * workgroup_size), -+ sizeof(uint32_t)); -+ -+ if (intrin->intrinsic == nir_intrinsic_load_hit_attrib_amd) { -+ nir_def *ret; -+ if (hit_attribs) -+ ret = nir_load_var(&b, hit_attribs[nir_intrinsic_base(intrin)]); -+ else -+ ret = nir_load_shared(&b, 1, 32, offset, .base = 0, .align_mul = 4); -+ nir_def_rewrite_uses(nir_instr_def(instr), ret); -+ } else { -+ if (hit_attribs) -+ nir_store_var(&b, hit_attribs[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1); -+ else -+ nir_store_shared(&b, intrin->src->ssa, offset, .base = 0, .align_mul = 4); -+ } -+ } else if (intrin->intrinsic == 
nir_intrinsic_load_incoming_ray_payload_amd || -+ intrin->intrinsic == nir_intrinsic_store_incoming_ray_payload_amd) { -+ if (!payload_in) -+ continue; -+ if (intrin->intrinsic == nir_intrinsic_load_incoming_ray_payload_amd) -+ nir_def_rewrite_uses(nir_instr_def(instr), nir_load_deref(&b, payload_in[nir_intrinsic_base(intrin)])); - else -- ret = nir_load_shared(&b, 1, 32, offset, .base = 0, .align_mul = 4); -- nir_def_rewrite_uses(nir_instr_def(instr), ret); -- } else { -- if (hit_attribs) -- nir_store_var(&b, hit_attribs[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1); -+ nir_store_deref(&b, payload_in[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1); -+ } else if (intrin->intrinsic == nir_intrinsic_load_outgoing_ray_payload_amd || -+ intrin->intrinsic == nir_intrinsic_store_outgoing_ray_payload_amd) { -+ if (!payload_out) -+ continue; -+ if (intrin->intrinsic == nir_intrinsic_load_outgoing_ray_payload_amd) -+ nir_def_rewrite_uses(nir_instr_def(instr), nir_load_var(&b, payload_out[nir_intrinsic_base(intrin)])); - else -- nir_store_shared(&b, intrin->src->ssa, offset, .base = 0, .align_mul = 4); -+ nir_store_var(&b, payload_out[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1); -+ } else { -+ continue; - } - nir_instr_remove(instr); - } -@@ -1620,10 +1647,9 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin - - if (!monolithic) { - for (uint32_t i = 0; i < ARRAY_SIZE(hit_attribs); i++) -- hit_attribs[i] = -- nir_local_variable_create(nir_shader_get_entrypoint(b->shader), glsl_uint_type(), "ahit_attrib"); -+ hit_attribs[i] = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "ahit_attrib"); - -- lower_hit_attribs(b->shader, hit_attribs, pdev->rt_wave_size); -+ lower_rt_storage(b->shader, hit_attribs, NULL, NULL, pdev->rt_wave_size); - } - - /* Initialize follow-up shader. 
*/ -@@ -1819,10 +1845,11 @@ radv_count_hit_attrib_slots(nir_builder *b, nir_intrinsic_instr *instr, void *da - - static void - lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, -- struct radv_ray_tracing_pipeline *pipeline, -- const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, struct rt_variables *vars) -+ struct radv_ray_tracing_pipeline *pipeline, const struct radv_shader_info *info, -+ const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, uint32_t payload_size, -+ struct rt_variables *vars) - { -- nir_function_impl *impl = nir_shader_get_entrypoint(shader); -+ nir_function_impl *impl = radv_get_rt_shader_entrypoint(shader); - - struct lower_rt_instruction_monolithic_state state = { - .device = device, -@@ -1842,7 +1869,17 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, - for (uint32_t i = 0; i < hit_attrib_count; i++) - hit_attribs[i] = nir_local_variable_create(impl, glsl_uint_type(), "ahit_attrib"); - -- lower_hit_attribs(shader, hit_attribs, 0); -+ nir_builder b = nir_builder_create(impl); -+ b.cursor = nir_before_impl(impl); -+ nir_variable **payload_vars = rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4)); -+ nir_deref_instr **payload_storage = -+ rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4)); -+ for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) { -+ payload_vars[i] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "_payload"); -+ payload_storage[i] = nir_build_deref_var(&b, payload_vars[i]); -+ } -+ -+ lower_rt_storage(shader, hit_attribs, payload_storage, payload_vars, info->wave_size); - } - - static void -@@ -1857,8 +1894,9 @@ radv_store_arg(nir_builder *b, const struct radv_shader_args *args, const struct - void - radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, - const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *stack_size, -- bool resume_shader, struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline, -- bool monolithic, const struct radv_ray_tracing_stage_info *traversal_info) -+ bool resume_shader, uint32_t payload_size, struct radv_device *device, -+ struct radv_ray_tracing_pipeline *pipeline, bool monolithic, -+ const struct radv_ray_tracing_stage_info *traversal_info) - { - nir_function_impl *impl = nir_shader_get_entrypoint(shader); - -@@ -1867,7 +1905,7 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH - struct rt_variables vars = create_rt_variables(shader, device, create_flags, monolithic); - - if (monolithic) -- lower_rt_instructions_monolithic(shader, device, pipeline, pCreateInfo, &vars); -+ lower_rt_instructions_monolithic(shader, device, pipeline, info, pCreateInfo, payload_size, &vars); - - struct radv_rt_shader_info rt_info = {0}; - -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index 36ad1d0dd8bf9..4ba7e36d16952 100644 ---- a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -522,7 +522,7 @@ struct radv_ray_tracing_stage_info; - - void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, - const struct radv_shader_args *args, const struct radv_shader_info *info, -- uint32_t *stack_size, bool resume_shader, struct radv_device *device, -+ uint32_t *stack_size, bool resume_shader, uint32_t payload_size, struct radv_device *device, - struct radv_ray_tracing_pipeline *pipeline, bool 
monolithic, - const struct radv_ray_tracing_stage_info *traversal_info); - --- -GitLab - - -From 1ef679cac11353eba65d518f0728747550d40926 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 17 Jun 2024 13:02:28 +0200 -Subject: [PATCH 20/71] aco: Add function call attributes - -ACO needs RADV to set certain attributes on NIR functions to help with -compilation of function calls. ---- - src/amd/compiler/aco_nir_call_attribs.h | 29 +++++++++++++++++++++++++ - 1 file changed, 29 insertions(+) - create mode 100644 src/amd/compiler/aco_nir_call_attribs.h - -diff --git a/src/amd/compiler/aco_nir_call_attribs.h b/src/amd/compiler/aco_nir_call_attribs.h -new file mode 100644 -index 0000000000000..33dc011914cd9 ---- /dev/null -+++ b/src/amd/compiler/aco_nir_call_attribs.h -@@ -0,0 +1,29 @@ -+/* -+ * Copyright © 2024 Valve Corporation -+ * -+ * SPDX-License-Identifier: MIT -+ */ -+ -+#ifndef ACO_NIR_CALL_ATTRIBS_H -+#define ACO_NIR_CALL_ATTRIBS_H -+ -+enum aco_nir_call_abi { -+ ACO_NIR_CALL_ABI_RT_RECURSIVE, -+ ACO_NIR_CALL_ABI_TRAVERSAL, -+ ACO_NIR_CALL_ABI_AHIT_ISEC, -+}; -+ -+enum aco_nir_function_attribs { -+ ACO_NIR_FUNCTION_ATTRIB_ABI_MASK = 0x7F, -+ /* Different lanes can have different values for the function pointer to call */ -+ ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL = 0x1 << 7, -+ /* Function will never return */ -+ ACO_NIR_FUNCTION_ATTRIB_NORETURN = 0x2 << 7, -+}; -+ -+enum aco_nir_parameter_attribs { -+ /* Parameter value is not used by any callee and does not need to be preserved */ -+ ACO_NIR_PARAM_ATTRIB_DISCARDABLE = 0x1, -+}; -+ -+#endif /* ACO_NIR_CALL_ATTRIBS_H */ --- -GitLab - - -From 10abf8a72b902de027999226432bca4621cde2de Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Thu, 3 Oct 2024 12:34:25 +0200 -Subject: [PATCH 21/71] radv/rt: Lower descriptor loads to param loads - ---- - .../nir/radv_nir_apply_pipeline_layout.c | 46 +++++++++++++++++-- - 1 file changed, 42 insertions(+), 4 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c b/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c -index fd67c3eb18f5e..991cc31eadafd 100644 ---- a/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c -+++ b/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c -@@ -5,6 +5,7 @@ - */ - #include "ac_descriptors.h" - #include "ac_shader_util.h" -+#include "aco_nir_call_attribs.h" - #include "nir.h" - #include "nir_builder.h" - #include "radv_descriptor_set.h" -@@ -34,6 +35,42 @@ get_scalar_arg(nir_builder *b, unsigned size, struct ac_arg arg) - return nir_load_scalar_arg_amd(b, size, .base = arg.arg_index); - } - -+static nir_def * -+get_indirect_descriptors_addr(nir_builder *b, apply_layout_state *state) -+{ -+ switch (b->shader->info.stage) { -+ case MESA_SHADER_RAYGEN: -+ case MESA_SHADER_CALLABLE: -+ return nir_load_param(b, RAYGEN_ARG_DESCRIPTORS); -+ case MESA_SHADER_INTERSECTION: -+ return nir_load_param(b, TRAVERSAL_ARG_DESCRIPTORS); -+ case MESA_SHADER_CLOSEST_HIT: -+ case MESA_SHADER_MISS: -+ return nir_load_param(b, CHIT_MISS_ARG_DESCRIPTORS); -+ default: -+ assert(!gl_shader_stage_is_rt(b->shader->info.stage)); -+ return get_scalar_arg(b, 1, state->args->descriptor_sets[0]); -+ } -+} -+ -+static nir_def * -+get_indirect_push_constants_addr(nir_builder *b, apply_layout_state *state) -+{ -+ switch (b->shader->info.stage) { -+ case MESA_SHADER_RAYGEN: -+ case MESA_SHADER_CALLABLE: -+ return nir_load_param(b, RAYGEN_ARG_PUSH_CONSTANTS); -+ case MESA_SHADER_INTERSECTION: -+ return nir_load_param(b, TRAVERSAL_ARG_PUSH_CONSTANTS); -+ case 
MESA_SHADER_CLOSEST_HIT: -+ case MESA_SHADER_MISS: -+ return nir_load_param(b, CHIT_MISS_ARG_PUSH_CONSTANTS); -+ default: -+ assert(!gl_shader_stage_is_rt(b->shader->info.stage)); -+ return get_scalar_arg(b, 1, state->args->ac.push_constants); -+ } -+} -+ - static nir_def * - convert_pointer_to_64_bit(nir_builder *b, apply_layout_state *state, nir_def *ptr) - { -@@ -44,8 +81,9 @@ static nir_def * - load_desc_ptr(nir_builder *b, apply_layout_state *state, unsigned set) - { - const struct radv_userdata_locations *user_sgprs_locs = &state->info->user_sgprs_locs; -- if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) { -- nir_def *addr = get_scalar_arg(b, 1, state->args->descriptor_sets[0]); -+ if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1 || -+ gl_shader_stage_is_rt(b->shader->info.stage)) { -+ nir_def *addr = get_indirect_descriptors_addr(b, state); - addr = convert_pointer_to_64_bit(b, state, addr); - return nir_load_smem_amd(b, 1, addr, nir_imm_int(b, set * 4)); - } -@@ -67,7 +105,7 @@ visit_vulkan_resource_index(nir_builder *b, apply_layout_state *state, nir_intri - if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || - layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { - unsigned idx = state->layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; -- set_ptr = get_scalar_arg(b, 1, state->args->ac.push_constants); -+ set_ptr = get_indirect_push_constants_addr(b, state); - offset = state->layout->push_constant_size + idx * 16; - stride = 16; - } else { -@@ -379,7 +417,7 @@ load_push_constant(nir_builder *b, apply_layout_state *state, nir_intrinsic_inst - } - - if (!offset) { -- addr = get_scalar_arg(b, 1, state->args->ac.push_constants); -+ addr = get_indirect_push_constants_addr(b, state); - addr = convert_pointer_to_64_bit(b, state, addr); - offset = nir_iadd_imm_nuw(b, intrin->src[0].ssa, base); - } --- -GitLab - - -From 41079fe63f7877dacb9fd3d8dc67740ed100439e Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:56:09 +0200 -Subject: [PATCH 22/71] radv/rt: Create RT functions to call - ---- - src/amd/compiler/aco_nir_call_attribs.h | 59 +++++ - src/amd/vulkan/nir/radv_nir_rt_shader.c | 276 +++++++++++++++++++++++- - 2 files changed, 331 insertions(+), 4 deletions(-) - -diff --git a/src/amd/compiler/aco_nir_call_attribs.h b/src/amd/compiler/aco_nir_call_attribs.h -index 33dc011914cd9..a879c51ebb3c2 100644 ---- a/src/amd/compiler/aco_nir_call_attribs.h -+++ b/src/amd/compiler/aco_nir_call_attribs.h -@@ -26,4 +26,63 @@ enum aco_nir_parameter_attribs { - ACO_NIR_PARAM_ATTRIB_DISCARDABLE = 0x1, - }; - -+enum aco_nir_raygen_function_arg { -+ RAYGEN_ARG_LAUNCH_ID = 0, -+ RAYGEN_ARG_LAUNCH_SIZE, -+ RAYGEN_ARG_DESCRIPTORS, -+ RAYGEN_ARG_PUSH_CONSTANTS, -+ RAYGEN_ARG_SBT_DESCRIPTORS, -+ RAYGEN_ARG_TRAVERSAL_ADDR, -+ RAYGEN_ARG_SHADER_RECORD_PTR, -+ RAYGEN_ARG_COUNT, -+}; -+ -+enum aco_nir_traversal_function_arg { -+ TRAVERSAL_ARG_LAUNCH_ID = 0, -+ TRAVERSAL_ARG_LAUNCH_SIZE, -+ TRAVERSAL_ARG_DESCRIPTORS, -+ TRAVERSAL_ARG_PUSH_CONSTANTS, -+ TRAVERSAL_ARG_SBT_DESCRIPTORS, -+ TRAVERSAL_ARG_TRAVERSAL_ADDR, -+ TRAVERSAL_ARG_SHADER_RECORD_PTR, -+ TRAVERSAL_ARG_ACCEL_STRUCT, -+ TRAVERSAL_ARG_CULL_MASK_AND_FLAGS, -+ TRAVERSAL_ARG_SBT_OFFSET, -+ TRAVERSAL_ARG_SBT_STRIDE, -+ TRAVERSAL_ARG_MISS_INDEX, -+ TRAVERSAL_ARG_RAY_ORIGIN, -+ TRAVERSAL_ARG_RAY_TMIN, -+ TRAVERSAL_ARG_RAY_DIRECTION, -+ TRAVERSAL_ARG_RAY_TMAX, -+ 
TRAVERSAL_ARG_PRIMITIVE_ID, -+ TRAVERSAL_ARG_INSTANCE_ADDR, -+ TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS, -+ TRAVERSAL_ARG_HIT_KIND, -+ TRAVERSAL_ARG_PAYLOAD_BASE, -+}; -+ -+enum aco_nir_chit_miss_function_arg { -+ CHIT_MISS_ARG_LAUNCH_ID = 0, -+ CHIT_MISS_ARG_LAUNCH_SIZE, -+ CHIT_MISS_ARG_DESCRIPTORS, -+ CHIT_MISS_ARG_PUSH_CONSTANTS, -+ CHIT_MISS_ARG_SBT_DESCRIPTORS, -+ CHIT_MISS_ARG_TRAVERSAL_ADDR, -+ CHIT_MISS_ARG_SHADER_RECORD_PTR, -+ CHIT_MISS_ARG_ACCEL_STRUCT, -+ CHIT_MISS_ARG_CULL_MASK_AND_FLAGS, -+ CHIT_MISS_ARG_SBT_OFFSET, -+ CHIT_MISS_ARG_SBT_STRIDE, -+ CHIT_MISS_ARG_MISS_INDEX, -+ CHIT_MISS_ARG_RAY_ORIGIN, -+ CHIT_MISS_ARG_RAY_TMIN, -+ CHIT_MISS_ARG_RAY_DIRECTION, -+ CHIT_MISS_ARG_RAY_TMAX, -+ CHIT_MISS_ARG_PRIMITIVE_ID, -+ CHIT_MISS_ARG_INSTANCE_ADDR, -+ CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS, -+ CHIT_MISS_ARG_HIT_KIND, -+ CHIT_MISS_ARG_PAYLOAD_BASE, -+}; -+ - #endif /* ACO_NIR_CALL_ATTRIBS_H */ -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 061c58d45949f..165c7e18578e0 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -16,6 +16,8 @@ - #include "radv_pipeline_rt.h" - #include "radv_shader.h" - -+#include "aco_nir_call_attribs.h" -+ - #include "vk_pipeline.h" - - /* Traversal stack size. This stack is put in LDS and experimentally 16 entries results in best -@@ -164,6 +166,243 @@ lower_rt_derefs(nir_shader *shader) - return progress; - } - -+static void -+radv_nir_init_rt_function_params(nir_function *function, gl_shader_stage stage, unsigned payload_size) -+{ -+ unsigned payload_base = -1u; -+ -+ switch (stage) { -+ case MESA_SHADER_RAYGEN: -+ function->num_params = RAYGEN_ARG_COUNT; -+ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params); -+ function->params[RAYGEN_ARG_LAUNCH_ID].num_components = 3; -+ function->params[RAYGEN_ARG_LAUNCH_ID].bit_size = 32; -+ function->params[RAYGEN_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3); -+ function->params[RAYGEN_ARG_LAUNCH_SIZE].num_components = 3; -+ function->params[RAYGEN_ARG_LAUNCH_SIZE].bit_size = 32; -+ function->params[RAYGEN_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3); -+ function->params[RAYGEN_ARG_LAUNCH_SIZE].is_uniform = true; -+ function->params[RAYGEN_ARG_DESCRIPTORS].num_components = 1; -+ function->params[RAYGEN_ARG_DESCRIPTORS].bit_size = 32; -+ function->params[RAYGEN_ARG_DESCRIPTORS].type = glsl_uint_type(); -+ function->params[RAYGEN_ARG_DESCRIPTORS].is_uniform = true; -+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].num_components = 1; -+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].bit_size = 32; -+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].type = glsl_uint_type(); -+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].is_uniform = true; -+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].num_components = 1; -+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].bit_size = 64; -+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type(); -+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].is_uniform = true; -+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].num_components = 1; -+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].bit_size = 64; -+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type(); -+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].is_uniform = true; -+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].num_components = 1; -+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].bit_size = 64; -+ 
function->params[RAYGEN_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type(); -+ function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_NORETURN; -+ break; -+ case MESA_SHADER_CALLABLE: -+ function->num_params = RAYGEN_ARG_COUNT + DIV_ROUND_UP(payload_size, 4); -+ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params); -+ function->params[RAYGEN_ARG_LAUNCH_ID].num_components = 3; -+ function->params[RAYGEN_ARG_LAUNCH_ID].bit_size = 32; -+ function->params[RAYGEN_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3); -+ function->params[RAYGEN_ARG_LAUNCH_SIZE].num_components = 3; -+ function->params[RAYGEN_ARG_LAUNCH_SIZE].bit_size = 32; -+ function->params[RAYGEN_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3); -+ function->params[RAYGEN_ARG_LAUNCH_SIZE].is_uniform = true; -+ function->params[RAYGEN_ARG_DESCRIPTORS].num_components = 1; -+ function->params[RAYGEN_ARG_DESCRIPTORS].bit_size = 32; -+ function->params[RAYGEN_ARG_DESCRIPTORS].type = glsl_uint_type(); -+ function->params[RAYGEN_ARG_DESCRIPTORS].is_uniform = true; -+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].num_components = 1; -+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].bit_size = 32; -+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].type = glsl_uint_type(); -+ function->params[RAYGEN_ARG_PUSH_CONSTANTS].is_uniform = true; -+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].num_components = 1; -+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].bit_size = 64; -+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type(); -+ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].is_uniform = true; -+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].num_components = 1; -+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].bit_size = 64; -+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type(); -+ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].is_uniform = true; -+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].num_components = 1; -+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].bit_size = 64; -+ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type(); -+ -+ function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL; -+ payload_base = RAYGEN_ARG_COUNT; -+ break; -+ case MESA_SHADER_INTERSECTION: -+ function->num_params = TRAVERSAL_ARG_PAYLOAD_BASE + DIV_ROUND_UP(payload_size, 4); -+ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params); -+ function->params[TRAVERSAL_ARG_LAUNCH_ID].num_components = 3; -+ function->params[TRAVERSAL_ARG_LAUNCH_ID].bit_size = 32; -+ function->params[TRAVERSAL_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3); -+ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].num_components = 3; -+ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].bit_size = 32; -+ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3); -+ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].is_uniform = true; -+ function->params[TRAVERSAL_ARG_DESCRIPTORS].num_components = 1; -+ function->params[TRAVERSAL_ARG_DESCRIPTORS].bit_size = 32; -+ function->params[TRAVERSAL_ARG_DESCRIPTORS].type = glsl_uint_type(); -+ function->params[TRAVERSAL_ARG_DESCRIPTORS].is_uniform = true; -+ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].num_components = 1; -+ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].bit_size = 32; -+ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].type = glsl_uint_type(); -+ 
function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].is_uniform = true; -+ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].num_components = 1; -+ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].bit_size = 64; -+ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type(); -+ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].is_uniform = true; -+ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].num_components = 1; -+ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].bit_size = 64; -+ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type(); -+ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].is_uniform = true; -+ function->params[TRAVERSAL_ARG_SHADER_RECORD_PTR].num_components = 1; -+ function->params[TRAVERSAL_ARG_SHADER_RECORD_PTR].bit_size = 64; -+ function->params[TRAVERSAL_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type(); -+ function->params[TRAVERSAL_ARG_ACCEL_STRUCT].num_components = 1; -+ function->params[TRAVERSAL_ARG_ACCEL_STRUCT].bit_size = 64; -+ function->params[TRAVERSAL_ARG_ACCEL_STRUCT].type = glsl_uint64_t_type(); -+ function->params[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS].num_components = 1; -+ function->params[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS].bit_size = 32; -+ function->params[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS].type = glsl_uint_type(); -+ function->params[TRAVERSAL_ARG_SBT_OFFSET].num_components = 1; -+ function->params[TRAVERSAL_ARG_SBT_OFFSET].bit_size = 32; -+ function->params[TRAVERSAL_ARG_SBT_OFFSET].type = glsl_uint_type(); -+ function->params[TRAVERSAL_ARG_SBT_STRIDE].num_components = 1; -+ function->params[TRAVERSAL_ARG_SBT_STRIDE].bit_size = 32; -+ function->params[TRAVERSAL_ARG_SBT_STRIDE].type = glsl_uint_type(); -+ function->params[TRAVERSAL_ARG_MISS_INDEX].num_components = 1; -+ function->params[TRAVERSAL_ARG_MISS_INDEX].bit_size = 32; -+ function->params[TRAVERSAL_ARG_MISS_INDEX].type = glsl_uint_type(); -+ function->params[TRAVERSAL_ARG_RAY_ORIGIN].num_components = 3; -+ function->params[TRAVERSAL_ARG_RAY_ORIGIN].bit_size = 32; -+ function->params[TRAVERSAL_ARG_RAY_ORIGIN].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); -+ function->params[TRAVERSAL_ARG_RAY_TMIN].num_components = 1; -+ function->params[TRAVERSAL_ARG_RAY_TMIN].bit_size = 32; -+ function->params[TRAVERSAL_ARG_RAY_TMIN].type = glsl_float_type(); -+ function->params[TRAVERSAL_ARG_RAY_DIRECTION].num_components = 3; -+ function->params[TRAVERSAL_ARG_RAY_DIRECTION].bit_size = 32; -+ function->params[TRAVERSAL_ARG_RAY_DIRECTION].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); -+ function->params[TRAVERSAL_ARG_RAY_TMAX].num_components = 1; -+ function->params[TRAVERSAL_ARG_RAY_TMAX].bit_size = 32; -+ function->params[TRAVERSAL_ARG_RAY_TMAX].type = glsl_float_type(); -+ function->params[TRAVERSAL_ARG_PRIMITIVE_ID].num_components = 1; -+ function->params[TRAVERSAL_ARG_PRIMITIVE_ID].bit_size = 32; -+ function->params[TRAVERSAL_ARG_PRIMITIVE_ID].type = glsl_uint_type(); -+ function->params[TRAVERSAL_ARG_INSTANCE_ADDR].num_components = 1; -+ function->params[TRAVERSAL_ARG_INSTANCE_ADDR].bit_size = 64; -+ function->params[TRAVERSAL_ARG_INSTANCE_ADDR].type = glsl_uint64_t_type(); -+ function->params[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS].num_components = 1; -+ function->params[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS].bit_size = 32; -+ function->params[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS].type = glsl_uint_type(); -+ function->params[TRAVERSAL_ARG_HIT_KIND].num_components = 1; -+ function->params[TRAVERSAL_ARG_HIT_KIND].bit_size = 32; -+ function->params[TRAVERSAL_ARG_HIT_KIND].type = glsl_uint_type(); 
-+ -+ function->driver_attributes = ACO_NIR_CALL_ABI_TRAVERSAL; -+ payload_base = TRAVERSAL_ARG_PAYLOAD_BASE; -+ break; -+ case MESA_SHADER_CLOSEST_HIT: -+ case MESA_SHADER_MISS: -+ function->num_params = CHIT_MISS_ARG_PAYLOAD_BASE + DIV_ROUND_UP(payload_size, 4); -+ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params); -+ function->params[CHIT_MISS_ARG_LAUNCH_ID].num_components = 3; -+ function->params[CHIT_MISS_ARG_LAUNCH_ID].bit_size = 32; -+ function->params[CHIT_MISS_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3); -+ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].num_components = 3; -+ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].bit_size = 32; -+ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3); -+ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].is_uniform = true; -+ function->params[CHIT_MISS_ARG_DESCRIPTORS].num_components = 1; -+ function->params[CHIT_MISS_ARG_DESCRIPTORS].bit_size = 32; -+ function->params[CHIT_MISS_ARG_DESCRIPTORS].type = glsl_uint_type(); -+ function->params[CHIT_MISS_ARG_DESCRIPTORS].is_uniform = true; -+ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].num_components = 1; -+ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].bit_size = 32; -+ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].type = glsl_uint_type(); -+ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].is_uniform = true; -+ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].num_components = 1; -+ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].bit_size = 64; -+ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type(); -+ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].is_uniform = true; -+ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].num_components = 1; -+ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].bit_size = 64; -+ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type(); -+ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].is_uniform = true; -+ function->params[CHIT_MISS_ARG_SHADER_RECORD_PTR].num_components = 1; -+ function->params[CHIT_MISS_ARG_SHADER_RECORD_PTR].bit_size = 64; -+ function->params[CHIT_MISS_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type(); -+ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].num_components = 1; -+ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].bit_size = 64; -+ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE; -+ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].type = glsl_uint64_t_type(); -+ function->params[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS].num_components = 1; -+ function->params[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS].bit_size = 32; -+ function->params[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS].type = glsl_uint_type(); -+ function->params[CHIT_MISS_ARG_SBT_OFFSET].num_components = 1; -+ function->params[CHIT_MISS_ARG_SBT_OFFSET].bit_size = 32; -+ function->params[CHIT_MISS_ARG_SBT_OFFSET].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE; -+ function->params[CHIT_MISS_ARG_SBT_OFFSET].type = glsl_uint_type(); -+ function->params[CHIT_MISS_ARG_SBT_STRIDE].num_components = 1; -+ function->params[CHIT_MISS_ARG_SBT_STRIDE].bit_size = 32; -+ function->params[CHIT_MISS_ARG_SBT_STRIDE].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE; -+ function->params[CHIT_MISS_ARG_SBT_STRIDE].type = glsl_uint_type(); -+ function->params[CHIT_MISS_ARG_MISS_INDEX].num_components = 1; -+ function->params[CHIT_MISS_ARG_MISS_INDEX].bit_size = 32; -+ function->params[CHIT_MISS_ARG_MISS_INDEX].driver_attributes = 
ACO_NIR_PARAM_ATTRIB_DISCARDABLE; -+ function->params[CHIT_MISS_ARG_MISS_INDEX].type = glsl_uint_type(); -+ function->params[CHIT_MISS_ARG_RAY_ORIGIN].num_components = 3; -+ function->params[CHIT_MISS_ARG_RAY_ORIGIN].bit_size = 32; -+ function->params[CHIT_MISS_ARG_RAY_ORIGIN].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); -+ function->params[CHIT_MISS_ARG_RAY_TMIN].num_components = 1; -+ function->params[CHIT_MISS_ARG_RAY_TMIN].bit_size = 32; -+ function->params[CHIT_MISS_ARG_RAY_TMIN].type = glsl_float_type(); -+ function->params[CHIT_MISS_ARG_RAY_DIRECTION].num_components = 3; -+ function->params[CHIT_MISS_ARG_RAY_DIRECTION].bit_size = 32; -+ function->params[CHIT_MISS_ARG_RAY_DIRECTION].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); -+ function->params[CHIT_MISS_ARG_RAY_TMAX].num_components = 1; -+ function->params[CHIT_MISS_ARG_RAY_TMAX].bit_size = 32; -+ function->params[CHIT_MISS_ARG_RAY_TMAX].type = glsl_float_type(); -+ function->params[CHIT_MISS_ARG_PRIMITIVE_ID].num_components = 1; -+ function->params[CHIT_MISS_ARG_PRIMITIVE_ID].bit_size = 32; -+ function->params[CHIT_MISS_ARG_PRIMITIVE_ID].type = glsl_uint_type(); -+ function->params[CHIT_MISS_ARG_INSTANCE_ADDR].num_components = 1; -+ function->params[CHIT_MISS_ARG_INSTANCE_ADDR].bit_size = 64; -+ function->params[CHIT_MISS_ARG_INSTANCE_ADDR].type = glsl_uint64_t_type(); -+ function->params[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS].num_components = 1; -+ function->params[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS].bit_size = 32; -+ function->params[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS].type = glsl_uint_type(); -+ function->params[CHIT_MISS_ARG_HIT_KIND].num_components = 1; -+ function->params[CHIT_MISS_ARG_HIT_KIND].bit_size = 32; -+ function->params[CHIT_MISS_ARG_HIT_KIND].type = glsl_uint_type(); -+ -+ function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL; -+ payload_base = CHIT_MISS_ARG_PAYLOAD_BASE; -+ break; -+ default: -+ unreachable("invalid RT stage"); -+ } -+ -+ if (payload_base != -1u) { -+ for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) { -+ function->params[payload_base + i].num_components = 1; -+ function->params[payload_base + i].bit_size = 32; -+ function->params[payload_base + i].is_return = true; -+ function->params[payload_base + i].type = glsl_uint_type(); -+ } -+ } -+ -+ /* Entrypoints can't have parameters. 
Consider RT stages as callable functions */ -+ function->is_exported = true; -+ function->is_entrypoint = false; -+} -+ - /* - * Global variables for an RT pipeline - */ -@@ -180,6 +419,8 @@ struct rt_variables { - nir_variable *shader_addr; - nir_variable *traversal_addr; - -+ nir_variable *sbt_descriptors; -+ - /* scratch offset of the argument area relative to stack_ptr */ - nir_variable *arg; - uint32_t payload_offset; -@@ -217,12 +458,19 @@ struct rt_variables { - nir_variable *ahit_terminate; - nir_variable *terminated; - -+ nir_variable **out_payload_storage; -+ unsigned payload_size; -+ -+ nir_function *trace_ray_func; -+ nir_function *chit_miss_func; -+ nir_function *callable_func; -+ - unsigned stack_size; - }; - - static struct rt_variables - create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipelineCreateFlags2KHR flags, -- bool monolithic) -+ unsigned max_payload_size, bool monolithic) - { - struct rt_variables vars = { - .device = device, -@@ -236,6 +484,8 @@ create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipe - vars.stack_ptr = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "stack_ptr"); - vars.shader_record_ptr = nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "shader_record_ptr"); - -+ vars.sbt_descriptors = nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "sbt_descriptors"); -+ - vars.launch_sizes[0] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "launch_size_x"); - vars.launch_sizes[1] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "launch_size_y"); - vars.launch_sizes[2] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "launch_size_z"); -@@ -269,6 +519,23 @@ create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipe - vars.ahit_terminate = nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "ahit_terminate"); - vars.terminated = nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "terminated"); - -+ if (max_payload_size) -+ vars.out_payload_storage = rzalloc_array_size(shader, DIV_ROUND_UP(max_payload_size, 4), sizeof(nir_variable *)); -+ vars.payload_size = max_payload_size; -+ for (unsigned i = 0; i < DIV_ROUND_UP(max_payload_size, 4); ++i) { -+ vars.out_payload_storage[i] = -+ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "out_payload_storage"); -+ } -+ -+ nir_function *trace_ray_func = nir_function_create(shader, "trace_ray_func"); -+ radv_nir_init_function_params(trace_ray_func, MESA_SHADER_INTERSECTION, max_payload_size); -+ vars.trace_ray_func = trace_ray_func; -+ nir_function *chit_miss_func = nir_function_create(shader, "chit_miss_func"); -+ radv_nir_init_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, max_payload_size); -+ vars.chit_miss_func = chit_miss_func; -+ nir_function *callable_func = nir_function_create(shader, "callable_func"); -+ radv_nir_init_function_params(callable_func, MESA_SHADER_CALLABLE, max_payload_size); -+ vars.callable_func = callable_func; - return vars; - } - -@@ -850,7 +1117,8 @@ insert_rt_case(nir_builder *b, nir_shader *shader, struct rt_variables *vars, ni - - nir_opt_dead_cf(shader); - -- struct rt_variables src_vars = create_rt_variables(shader, vars->device, vars->flags, vars->monolithic); -+ struct rt_variables src_vars = -+ create_rt_variables(shader, vars->device, vars->flags, vars->payload_size, vars->monolithic); - map_rt_variables(var_remap, &src_vars, vars); - 
- NIR_PASS_V(shader, lower_rt_instructions, &src_vars, false, NULL); -@@ -1723,7 +1991,7 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_ - b.shader->info.workgroup_size[0] = 8; - b.shader->info.workgroup_size[1] = pdev->rt_wave_size == 64 ? 8 : 4; - b.shader->info.shared_size = pdev->rt_wave_size * MAX_STACK_ENTRY_COUNT * sizeof(uint32_t); -- struct rt_variables vars = create_rt_variables(b.shader, device, create_flags, false); -+ struct rt_variables vars = create_rt_variables(b.shader, device, create_flags, false, 0); - - if (info->tmin.state == RADV_RT_CONST_ARG_STATE_VALID) - nir_store_var(&b, vars.tmin, nir_imm_int(&b, info->tmin.value), 0x1); -@@ -1902,7 +2170,7 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH - - const VkPipelineCreateFlagBits2KHR create_flags = vk_rt_pipeline_create_flags(pCreateInfo); - -- struct rt_variables vars = create_rt_variables(shader, device, create_flags, monolithic); -+ struct rt_variables vars = create_rt_variables(shader, device, create_flags, payload_size, monolithic); - - if (monolithic) - lower_rt_instructions_monolithic(shader, device, pipeline, info, pCreateInfo, payload_size, &vars); --- -GitLab - - -From 7c97b73c788dcab2347225073bac244aa8aea252 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 12:04:36 +0200 -Subject: [PATCH 23/71] radv/rt: Convert lower_rt_derefs to register payloads - -All payloads alias the same registers by the time RT functions get -called. In order to pretend that the payload variables (represented by -function_temp vars) are separate, payload values are copied to the -"global" payload variables (shader_temp variables) just before a shader -call, and copied from there immediately after the shader call. 
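A minimal sketch of that copy-in/copy-out around one call site (illustrative only; `call`, `payload` and `shared` are placeholder names, and the real pass below additionally rewrites deref modes and reuses already-cloned variables):

    /* 'shared' is the shader_temp variable all payloads alias,
     * 'payload' is the per-call function_temp clone handed to the call. */
    nir_builder b = nir_builder_at(nir_before_instr(call));
    nir_deref_instr *shared_deref = nir_build_deref_var(&b, shared);
    nir_deref_instr *payload_deref = nir_build_deref_var(&b, payload);
    nir_copy_deref(&b, payload_deref, shared_deref);  /* sync into the clone before the call */
    b.cursor = nir_after_instr(call);
    nir_copy_deref(&b, shared_deref, payload_deref);  /* and copy back out afterwards */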
---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 84 ++++++++++++++++++++----- - 1 file changed, 68 insertions(+), 16 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 165c7e18578e0..0ebb095f52e1c 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -126,6 +126,62 @@ radv_visit_inlined_shaders(nir_builder *b, nir_def *sbt_idx, bool can_have_null_ - free(cases); - } - -+static void -+lower_rt_deref_var(nir_shader *shader, nir_function_impl *impl, nir_instr *instr, struct hash_table *cloned_vars) -+{ -+ nir_deref_instr *deref = nir_instr_as_deref(instr); -+ nir_variable *var = deref->var; -+ struct hash_entry *entry = _mesa_hash_table_search(cloned_vars, var); -+ if (!(var->data.mode & nir_var_function_temp) && !entry) -+ return; -+ -+ hash_table_foreach (cloned_vars, cloned_entry) { -+ if (var == cloned_entry->data) -+ return; -+ } -+ -+ nir_variable *new_var; -+ if (entry) { -+ new_var = entry->data; -+ } else { -+ new_var = nir_variable_clone(var, shader); -+ _mesa_hash_table_insert(cloned_vars, var, new_var); -+ -+ exec_node_remove(&var->node); -+ var->data.mode = nir_var_shader_temp; -+ exec_list_push_tail(&shader->variables, &var->node); -+ -+ exec_list_push_tail(&impl->locals, &new_var->node); -+ } -+ -+ deref->modes = nir_var_shader_temp; -+ -+ nir_foreach_use_safe (use, nir_instr_def(instr)) { -+ if (nir_src_is_if(use)) -+ continue; -+ -+ nir_instr *parent = nir_src_parent_instr(use); -+ if (parent->type != nir_instr_type_intrinsic) -+ continue; -+ -+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(parent); -+ if (intrin->intrinsic != nir_intrinsic_trace_ray && intrin->intrinsic != nir_intrinsic_execute_callable && -+ intrin->intrinsic != nir_intrinsic_execute_closest_hit_amd && -+ intrin->intrinsic != nir_intrinsic_execute_miss_amd) -+ continue; -+ -+ nir_builder b = nir_builder_at(nir_before_instr(parent)); -+ nir_deref_instr *old_deref = nir_build_deref_var(&b, var); -+ nir_deref_instr *new_deref = nir_build_deref_var(&b, new_var); -+ -+ nir_copy_deref(&b, new_deref, old_deref); -+ b.cursor = nir_after_instr(parent); -+ nir_copy_deref(&b, old_deref, new_deref); -+ -+ nir_src_rewrite(use, nir_instr_def(&new_deref->instr)); -+ } -+} -+ - static bool - lower_rt_derefs(nir_shader *shader) - { -@@ -133,9 +189,7 @@ lower_rt_derefs(nir_shader *shader) - - bool progress = false; - -- nir_builder b = nir_builder_at(nir_before_impl(impl)); -- -- nir_def *arg_offset = nir_load_rt_arg_scratch_offset_amd(&b); -+ struct hash_table *cloned_vars = _mesa_pointer_hash_table_create(shader); - - nir_foreach_block (block, impl) { - nir_foreach_instr_safe (instr, block) { -@@ -143,17 +197,18 @@ lower_rt_derefs(nir_shader *shader) - continue; - - nir_deref_instr *deref = nir_instr_as_deref(instr); -- if (!nir_deref_mode_is(deref, nir_var_shader_call_data)) -+ if (!nir_deref_mode_is(deref, nir_var_function_temp)) - continue; - -- deref->modes = nir_var_function_temp; -- progress = true; -- - if (deref->deref_type == nir_deref_type_var) { -- b.cursor = nir_before_instr(&deref->instr); -- nir_deref_instr *replacement = -- nir_build_deref_cast(&b, arg_offset, nir_var_function_temp, deref->var->type, 0); -- nir_def_replace(&deref->def, &replacement->def); -+ lower_rt_deref_var(shader, impl, instr, cloned_vars); -+ progress = true; -+ } else { -+ assert(deref->deref_type != nir_deref_type_cast); -+ /* Parent modes might have changed, propagate change */ -+ nir_deref_instr *parent = 
nir_src_as_deref(deref->parent); -+ if (parent->modes != deref->modes) -+ deref->modes = parent->modes; - } - } - } -@@ -1139,12 +1194,9 @@ void - radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset) - { - if (!monolithic) { -- NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp | nir_var_shader_call_data, -- glsl_get_natural_size_align_bytes); -- - NIR_PASS(_, nir, lower_rt_derefs); -- -- NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset); -+ NIR_PASS(_, nir, nir_split_var_copies); -+ NIR_PASS(_, nir, nir_lower_var_copies); - } else { - NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_natural_size_align_bytes); - NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_shader_temp, glsl_get_natural_size_align_bytes); --- -GitLab - - -From c45e4fbee8cb3c930d13c5ce1c1478b68fdcbbb5 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 12:09:15 +0200 -Subject: [PATCH 24/71] radv/rt: Align radv_nir_lower_rt_io to new lowering - ---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 10 +++++----- - src/amd/vulkan/radv_shader.h | 2 +- - 2 files changed, 6 insertions(+), 6 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 0ebb095f52e1c..7708dd8809b79 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -1191,7 +1191,7 @@ insert_rt_case(nir_builder *b, nir_shader *shader, struct rt_variables *vars, ni - } - - void --radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset) -+radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset, uint32_t *payload_size) - { - if (!monolithic) { - NIR_PASS(_, nir, lower_rt_derefs); -@@ -1625,7 +1625,7 @@ radv_build_ahit_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g - radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir); - assert(nir_stage); - -- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); -+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); - - insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.any_hit_index); - ralloc_free(nir_stage); -@@ -1649,7 +1649,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g - radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->intersection_shader].nir); - assert(nir_stage); - -- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); -+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); - - nir_shader *any_hit_stage = NULL; - if (group->any_hit_shader != VK_SHADER_UNUSED_KHR) { -@@ -1657,7 +1657,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g - radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir); - assert(any_hit_stage); - -- radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset); -+ radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset, NULL); - - /* reserve stack size for any_hit before it is inlined */ - data->pipeline->stages[group->any_hit_shader].stack_size = any_hit_stage->scratch_size; -@@ -1701,7 +1701,7 @@ radv_build_recursive_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_trac - 
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->recursive_shader].nir); - assert(nir_stage); - -- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); -+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); - - insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.general_index); - ralloc_free(nir_stage); -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index 4ba7e36d16952..f6a0f35c23333 100644 ---- a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -516,7 +516,7 @@ radv_get_rt_shader_entrypoint(nir_shader *shader) - return NULL; - } - --void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset); -+void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset, uint32_t *payload_size); - - struct radv_ray_tracing_stage_info; - --- -GitLab - - -From 4b54715289586c84b393e264d99e85c327f614f6 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 12:10:31 +0200 -Subject: [PATCH 25/71] radv/rt: Include inlined shader scratch size in - traversal scratch - -When calls without tail-call optimization happen, the traversal shader -must spill, and spilled vars must be placed after shader scratch. ---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 7708dd8809b79..f29f91ce18178 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -1626,6 +1626,7 @@ radv_build_ahit_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g - assert(nir_stage); - - radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); -+ b->shader->scratch_size = MAX2(nir_stage->scratch_size, b->shader->scratch_size); - - insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.any_hit_index); - ralloc_free(nir_stage); -@@ -1661,10 +1662,12 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g - - /* reserve stack size for any_hit before it is inlined */ - data->pipeline->stages[group->any_hit_shader].stack_size = any_hit_stage->scratch_size; -+ b->shader->scratch_size = MAX2(any_hit_stage->scratch_size, b->shader->scratch_size); - - nir_lower_intersection_shader(nir_stage, any_hit_stage); - ralloc_free(any_hit_stage); - } -+ b->shader->scratch_size = MAX2(nir_stage->scratch_size, b->shader->scratch_size); - - insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.intersection_index); - ralloc_free(nir_stage); --- -GitLab - - -From a86319221ae3924ff785061af08f4ae16cc851e9 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 12:17:15 +0200 -Subject: [PATCH 26/71] radv/rt: Don't store vars->shader_record_ptr directly - in load_sbt_entry - -When calling functions, we don't want the new shader record to stick -beyond the function call, so only store it when not calling functions. 
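Sketch of the resulting caller-side pattern (mirrors the hunks below): load_sbt_entry() now returns the record address, and only call sites that actually want the new shader record to persist store it themselves:

    nir_def *record = load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_ANY_HIT_IDX);
    nir_store_var(b, inner_vars.shader_record_ptr, record, 0x1);  /* only where it must outlive the call */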
---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 20 ++++++++++++-------- - 1 file changed, 12 insertions(+), 8 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index f29f91ce18178..eeec13b0f539c 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -684,7 +684,7 @@ enum sbt_entry { - SBT_ANY_HIT_IDX = offsetof(struct radv_pipeline_group_handle, any_hit_index), - }; - --static void -+static nir_def * - load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_def *idx, enum sbt_type binding, - enum sbt_entry offset) - { -@@ -704,7 +704,7 @@ load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_def *idx, en - } - - nir_def *record_addr = nir_iadd_imm(b, addr, RADV_RT_HANDLE_SIZE - offset); -- nir_store_var(b, vars->shader_record_ptr, record_addr, 1); -+ return record_addr; - } - - struct radv_rt_shader_info { -@@ -987,7 +987,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) - nir_store_var(b, vars->instance_addr, intr->src[3].ssa, 0x1); - nir_store_var(b, vars->geometry_id_and_flags, intr->src[4].ssa, 0x1); - nir_store_var(b, vars->hit_kind, intr->src[5].ssa, 0x1); -- load_sbt_entry(b, vars, intr->src[0].ssa, SBT_HIT, SBT_RECURSIVE_PTR); -+ nir_def *record = load_sbt_entry(b, vars, intr->src[0].ssa, SBT_HIT, SBT_RECURSIVE_PTR); - - nir_def *should_return = - nir_test_mask(b, nir_load_var(b, vars->cull_mask_and_flags), SpvRayFlagsSkipClosestHitShaderKHRMask); -@@ -1011,7 +1011,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) - nir_store_var(b, vars->geometry_id_and_flags, undef, 0x1); - nir_store_var(b, vars->hit_kind, undef, 0x1); - nir_def *miss_index = nir_load_var(b, vars->miss_index); -- load_sbt_entry(b, vars, miss_index, SBT_MISS, SBT_RECURSIVE_PTR); -+ nir_def *record = load_sbt_entry(b, vars, miss_index, SBT_MISS, SBT_RECURSIVE_PTR); - - if (!(vars->flags & VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) { - /* In case of a NULL miss shader, do nothing and just return. 
*/ -@@ -1741,7 +1741,8 @@ handle_candidate_triangle(nir_builder *b, struct radv_triangle_intersection *int - nir_store_var(b, inner_vars.instance_addr, nir_load_var(b, data->trav_vars->instance_addr), 0x1); - nir_store_var(b, inner_vars.hit_kind, hit_kind, 0x1); - -- load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_ANY_HIT_IDX); -+ nir_def *record = load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_ANY_HIT_IDX); -+ nir_store_var(b, inner_vars.shader_record_ptr, record, 0x1); - - struct radv_rt_case_data case_data = { - .device = data->device, -@@ -1805,7 +1806,8 @@ handle_candidate_aabb(nir_builder *b, struct radv_leaf_intersection *intersectio - nir_store_var(b, inner_vars.instance_addr, nir_load_var(b, data->trav_vars->instance_addr), 0x1); - nir_store_var(b, inner_vars.opaque, intersection->opaque, 1); - -- load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_INTERSECTION_IDX); -+ nir_def *record = load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_INTERSECTION_IDX); -+ nir_store_var(b, inner_vars.shader_record_ptr, record, 0x1); - - nir_store_var(b, data->vars->ahit_accept, nir_imm_false(b), 0x1); - nir_store_var(b, data->vars->ahit_terminate, nir_imm_false(b), 0x1); -@@ -1979,7 +1981,8 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin - nir_push_if(b, nir_load_var(b, trav_vars.hit)); - { - if (monolithic) { -- load_sbt_entry(b, vars, nir_load_var(b, vars->idx), SBT_HIT, SBT_CLOSEST_HIT_IDX); -+ nir_def *record = load_sbt_entry(b, vars, nir_load_var(b, vars->idx), SBT_HIT, SBT_CLOSEST_HIT_IDX); -+ nir_store_var(b, vars->shader_record_ptr, record, 0x1); - - nir_def *should_return = - nir_test_mask(b, nir_load_var(b, vars->cull_mask_and_flags), SpvRayFlagsSkipClosestHitShaderKHRMask); -@@ -2011,7 +2014,8 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin - nir_push_else(b, NULL); - { - if (monolithic) { -- load_sbt_entry(b, vars, nir_load_var(b, vars->miss_index), SBT_MISS, SBT_GENERAL_IDX); -+ nir_def *record = load_sbt_entry(b, vars, nir_load_var(b, vars->miss_index), SBT_MISS, SBT_GENERAL_IDX); -+ nir_store_var(b, vars->shader_record_ptr, record, 0x1); - - struct radv_rt_case_data case_data = { - .device = device, --- -GitLab - - -From 07a8a0b29f5e3d5b969ec8164af8fdefd8ffc28a Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Thu, 3 Oct 2024 15:59:01 +0200 -Subject: [PATCH 27/71] radv/rt: Load SBT descriptor from NIR variables - ---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index eeec13b0f539c..2f13831d9d473 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -965,6 +965,10 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) - nir_pop_if(b, NULL); - break; - } -+ case nir_intrinsic_load_sbt_base_amd: { -+ ret = nir_load_var(b, vars->sbt_descriptors); -+ break; -+ } - case nir_intrinsic_load_sbt_offset_amd: { - ret = nir_load_var(b, vars->sbt_offset); - break; -@@ -2077,6 +2081,7 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_ - nir_store_var(&b, vars.cull_mask_and_flags, nir_load_cull_mask_and_flags_amd(&b), 0x1); - nir_store_var(&b, vars.origin, nir_load_ray_world_origin(&b), 0x7); - nir_store_var(&b, vars.direction, nir_load_ray_world_direction(&b), 0x7); -+ nir_store_var(&b, vars.sbt_descriptors, nir_load_sbt_base_amd(&b), 0x1); - nir_store_var(&b, 
vars.stack_ptr, nir_imm_int(&b, 0), 0x1); - - radv_build_traversal(device, pipeline, pCreateInfo, false, &b, &vars, false, info); --- -GitLab - - -From 565c4764726d6a68e785c019f49914b00b8930ed Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 12:21:29 +0200 -Subject: [PATCH 28/71] radv/rt: Use function calls for shader calls - -Don't call nir_lower_shader_calls anymore, but emit nir_call -instructions for trace_ray and friends. Also, switch from shader args -to parameters for most things, and change lowerings accordingly. ---- - src/amd/common/ac_shader_args.h | 16 - - src/amd/vulkan/nir/radv_nir_rt_shader.c | 487 +++++++++++++----------- - src/amd/vulkan/radv_pipeline_rt.c | 62 +-- - src/amd/vulkan/radv_shader.h | 7 +- - src/amd/vulkan/radv_shader_args.c | 20 +- - 5 files changed, 290 insertions(+), 302 deletions(-) - -diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h -index 62ac708c3d185..030a271e22ff3 100644 ---- a/src/amd/common/ac_shader_args.h -+++ b/src/amd/common/ac_shader_args.h -@@ -179,29 +179,13 @@ struct ac_shader_args { - - /* RT */ - struct { -- struct ac_arg uniform_shader_addr; - struct ac_arg sbt_descriptors; - struct ac_arg launch_sizes[3]; - struct ac_arg launch_size_addr; - struct ac_arg launch_ids[3]; - struct ac_arg dynamic_callable_stack_base; - struct ac_arg traversal_shader_addr; -- struct ac_arg shader_addr; -- struct ac_arg shader_record; - struct ac_arg payload_offset; -- struct ac_arg ray_origin; -- struct ac_arg ray_tmin; -- struct ac_arg ray_direction; -- struct ac_arg ray_tmax; -- struct ac_arg cull_mask_and_flags; -- struct ac_arg sbt_offset; -- struct ac_arg sbt_stride; -- struct ac_arg miss_index; -- struct ac_arg accel_struct; -- struct ac_arg primitive_id; -- struct ac_arg instance_addr; -- struct ac_arg geometry_id_and_flags; -- struct ac_arg hit_kind; - } rt; - }; - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 2f13831d9d473..7968cb36f5d87 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -688,7 +688,7 @@ static nir_def * - load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_def *idx, enum sbt_type binding, - enum sbt_entry offset) - { -- nir_def *desc_base_addr = nir_load_sbt_base_amd(b); -+ nir_def *desc_base_addr = nir_load_var(b, vars->sbt_descriptors); - - nir_def *desc = nir_pack_64_2x32(b, nir_load_smem_amd(b, 2, desc_base_addr, nir_imm_int(b, binding))); - -@@ -742,74 +742,58 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) - - nir_def *ret = NULL; - switch (intr->intrinsic) { -- case nir_intrinsic_rt_execute_callable: { -- uint32_t size = align(nir_intrinsic_stack_size(intr), 16); -- nir_def *ret_ptr = nir_load_resume_shader_address_amd(b, nir_intrinsic_call_idx(intr)); -- ret_ptr = nir_ior_imm(b, ret_ptr, radv_get_rt_priority(b->shader->info.stage)); -- -- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), size), 1); -- nir_store_scratch(b, ret_ptr, nir_load_var(b, vars->stack_ptr), .align_mul = 16); -- -- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), 16), 1); -- load_sbt_entry(b, vars, intr->src[0].ssa, SBT_CALLABLE, SBT_RECURSIVE_PTR); -- -- nir_store_var(b, vars->arg, nir_iadd_imm(b, intr->src[1].ssa, -size - 16), 1); -- -- vars->stack_size = MAX2(vars->stack_size, size + 16); -+ case nir_intrinsic_execute_callable: { -+ nir_def *record = load_sbt_entry(b, 
vars, intr->src[0].ssa, SBT_CALLABLE, SBT_RECURSIVE_PTR); -+ -+ unsigned param_count = RAYGEN_ARG_COUNT + DIV_ROUND_UP(vars->payload_size, 4); -+ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count); -+ args[RAYGEN_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2])); -+ args[RAYGEN_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2])); -+ args[RAYGEN_ARG_DESCRIPTORS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN || b->shader->info.stage == MESA_SHADER_CALLABLE ? RAYGEN_ARG_DESCRIPTORS : CHIT_MISS_ARG_DESCRIPTORS); -+ args[RAYGEN_ARG_PUSH_CONSTANTS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN || b->shader->info.stage == MESA_SHADER_CALLABLE ? RAYGEN_ARG_PUSH_CONSTANTS : CHIT_MISS_ARG_PUSH_CONSTANTS); -+ args[RAYGEN_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors); -+ args[RAYGEN_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr); -+ args[RAYGEN_ARG_SHADER_RECORD_PTR] = record; -+ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) { -+ args[RAYGEN_ARG_COUNT + i] = nir_instr_def(&nir_build_deref_var(b, vars->out_payload_storage[i])->instr); -+ } -+ nir_build_indirect_call(b, vars->callable_func, nir_load_var(b, vars->shader_addr), param_count, args); - break; - } -- case nir_intrinsic_rt_trace_ray: { -- uint32_t size = align(nir_intrinsic_stack_size(intr), 16); -- nir_def *ret_ptr = nir_load_resume_shader_address_amd(b, nir_intrinsic_call_idx(intr)); -- ret_ptr = nir_ior_imm(b, ret_ptr, radv_get_rt_priority(b->shader->info.stage)); -- -- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), size), 1); -- nir_store_scratch(b, ret_ptr, nir_load_var(b, vars->stack_ptr), .align_mul = 16); -- -- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), 16), 1); -- -- nir_store_var(b, vars->shader_addr, nir_load_var(b, vars->traversal_addr), 1); -- nir_store_var(b, vars->arg, nir_iadd_imm(b, intr->src[10].ssa, -size - 16), 1); -- -- vars->stack_size = MAX2(vars->stack_size, size + 16); -- -+ case nir_intrinsic_trace_ray: { -+ nir_def *undef = nir_undef(b, 1, 32); - /* Per the SPIR-V extension spec we have to ignore some bits for some arguments. 
*/ -- nir_store_var(b, vars->accel_struct, intr->src[0].ssa, 0x1); -- nir_store_var(b, vars->cull_mask_and_flags, nir_ior(b, nir_ishl_imm(b, intr->src[2].ssa, 24), intr->src[1].ssa), -- 0x1); -- nir_store_var(b, vars->sbt_offset, nir_iand_imm(b, intr->src[3].ssa, 0xf), 0x1); -- nir_store_var(b, vars->sbt_stride, nir_iand_imm(b, intr->src[4].ssa, 0xf), 0x1); -- nir_store_var(b, vars->miss_index, nir_iand_imm(b, intr->src[5].ssa, 0xffff), 0x1); -- nir_store_var(b, vars->origin, intr->src[6].ssa, 0x7); -- nir_store_var(b, vars->tmin, intr->src[7].ssa, 0x1); -- nir_store_var(b, vars->direction, intr->src[8].ssa, 0x7); -- nir_store_var(b, vars->tmax, intr->src[9].ssa, 0x1); -- break; -- } -- case nir_intrinsic_rt_resume: { -- uint32_t size = align(nir_intrinsic_stack_size(intr), 16); -- -- nir_store_var(b, vars->stack_ptr, nir_iadd_imm(b, nir_load_var(b, vars->stack_ptr), -size), 1); -- break; -- } -- case nir_intrinsic_rt_return_amd: { -- if (b->shader->info.stage == MESA_SHADER_RAYGEN) { -- nir_terminate(b); -- break; -+ nir_def *cull_mask_and_flags = nir_ior(b, nir_ishl_imm(b, intr->src[2].ssa, 24), intr->src[1].ssa); -+ -+ unsigned param_count = TRAVERSAL_ARG_PAYLOAD_BASE + DIV_ROUND_UP(vars->payload_size, 4); -+ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count); -+ args[TRAVERSAL_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2])); -+ args[TRAVERSAL_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2])); -+ args[TRAVERSAL_ARG_DESCRIPTORS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN ? RAYGEN_ARG_DESCRIPTORS : CHIT_MISS_ARG_DESCRIPTORS); -+ args[TRAVERSAL_ARG_PUSH_CONSTANTS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN ? 
RAYGEN_ARG_PUSH_CONSTANTS : CHIT_MISS_ARG_PUSH_CONSTANTS); -+ args[TRAVERSAL_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors); -+ args[TRAVERSAL_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr); -+ args[TRAVERSAL_ARG_SHADER_RECORD_PTR] = nir_load_var(b, vars->shader_record_ptr); -+ args[TRAVERSAL_ARG_ACCEL_STRUCT] = intr->src[0].ssa; -+ args[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS] = cull_mask_and_flags; -+ args[TRAVERSAL_ARG_SBT_OFFSET] = nir_iand_imm(b, intr->src[3].ssa, 0xf); -+ args[TRAVERSAL_ARG_SBT_STRIDE] = nir_iand_imm(b, intr->src[4].ssa, 0xf); -+ args[TRAVERSAL_ARG_MISS_INDEX] = nir_iand_imm(b, intr->src[5].ssa, 0xffff); -+ args[TRAVERSAL_ARG_RAY_ORIGIN] = intr->src[6].ssa; -+ args[TRAVERSAL_ARG_RAY_TMIN] = intr->src[7].ssa; -+ args[TRAVERSAL_ARG_RAY_DIRECTION] = intr->src[8].ssa; -+ args[TRAVERSAL_ARG_RAY_TMAX] = intr->src[9].ssa; -+ args[TRAVERSAL_ARG_PRIMITIVE_ID] = undef; -+ args[TRAVERSAL_ARG_INSTANCE_ADDR] = nir_undef(b, 1, 64); -+ args[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS] = undef; -+ args[TRAVERSAL_ARG_HIT_KIND] = undef; -+ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) { -+ args[TRAVERSAL_ARG_PAYLOAD_BASE + i] = -+ nir_instr_def(&nir_build_deref_var(b, vars->out_payload_storage[i])->instr); - } -- insert_rt_return(b, vars); -+ nir_build_indirect_call(b, vars->trace_ray_func, nir_load_var(b, vars->traversal_addr), param_count, args); - break; - } -- case nir_intrinsic_load_scratch: { -- if (data->late_lowering) -- nir_src_rewrite(&intr->src[0], nir_iadd_nuw(b, nir_load_var(b, vars->stack_ptr), intr->src[0].ssa)); -- return true; -- } -- case nir_intrinsic_store_scratch: { -- if (data->late_lowering) -- nir_src_rewrite(&intr->src[1], nir_iadd_nuw(b, nir_load_var(b, vars->stack_ptr), intr->src[1].ssa)); -- return true; -- } - case nir_intrinsic_load_shader_record_ptr: { - ret = nir_load_var(b, vars->shader_record_ptr); - break; -@@ -986,11 +970,6 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) - break; - } - case nir_intrinsic_execute_closest_hit_amd: { -- nir_store_var(b, vars->tmax, intr->src[1].ssa, 0x1); -- nir_store_var(b, vars->primitive_id, intr->src[2].ssa, 0x1); -- nir_store_var(b, vars->instance_addr, intr->src[3].ssa, 0x1); -- nir_store_var(b, vars->geometry_id_and_flags, intr->src[4].ssa, 0x1); -- nir_store_var(b, vars->hit_kind, intr->src[5].ssa, 0x1); - nir_def *record = load_sbt_entry(b, vars, intr->src[0].ssa, SBT_HIT, SBT_RECURSIVE_PTR); - - nir_def *should_return = -@@ -1002,28 +981,82 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) - - /* should_return is set if we had a hit but we won't be calling the closest hit - * shader and hence need to return immediately to the calling shader. 
*/ -- nir_push_if(b, should_return); -- insert_rt_return(b, vars); -+ nir_push_if(b, nir_inot(b, should_return)); -+ unsigned param_count = CHIT_MISS_ARG_PAYLOAD_BASE + DIV_ROUND_UP(vars->payload_size, 4); -+ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count); -+ args[CHIT_MISS_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2])); -+ args[CHIT_MISS_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2])); -+ args[CHIT_MISS_ARG_DESCRIPTORS] = nir_load_param(b, TRAVERSAL_ARG_DESCRIPTORS); -+ args[CHIT_MISS_ARG_PUSH_CONSTANTS] = nir_load_param(b, TRAVERSAL_ARG_PUSH_CONSTANTS); -+ args[CHIT_MISS_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors); -+ args[CHIT_MISS_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr); -+ args[CHIT_MISS_ARG_SHADER_RECORD_PTR] = record; -+ args[CHIT_MISS_ARG_ACCEL_STRUCT] = nir_load_var(b, vars->accel_struct); -+ args[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS] = nir_load_var(b, vars->cull_mask_and_flags); -+ args[CHIT_MISS_ARG_SBT_OFFSET] = nir_load_var(b, vars->sbt_offset); -+ args[CHIT_MISS_ARG_SBT_STRIDE] = nir_load_var(b, vars->sbt_stride); -+ args[CHIT_MISS_ARG_MISS_INDEX] = nir_load_var(b, vars->miss_index); -+ args[CHIT_MISS_ARG_RAY_ORIGIN] = nir_load_var(b, vars->origin); -+ args[CHIT_MISS_ARG_RAY_TMIN] = nir_load_var(b, vars->tmin); -+ args[CHIT_MISS_ARG_RAY_DIRECTION] = nir_load_var(b, vars->direction); -+ args[CHIT_MISS_ARG_RAY_TMAX] = intr->src[1].ssa; -+ args[CHIT_MISS_ARG_PRIMITIVE_ID] = intr->src[2].ssa; -+ args[CHIT_MISS_ARG_INSTANCE_ADDR] = intr->src[3].ssa; -+ args[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS] = intr->src[4].ssa; -+ args[CHIT_MISS_ARG_HIT_KIND] = intr->src[5].ssa; -+ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) { -+ args[CHIT_MISS_ARG_PAYLOAD_BASE + i] = -+ nir_instr_def(&nir_build_deref_cast(b, nir_load_param(b, TRAVERSAL_ARG_PAYLOAD_BASE + i), -+ nir_var_shader_call_data, glsl_uint_type(), 4) -+ ->instr); -+ } -+ nir_build_indirect_call(b, vars->chit_miss_func, nir_load_var(b, vars->shader_addr), param_count, args); - nir_pop_if(b, NULL); - break; - } - case nir_intrinsic_execute_miss_amd: { -- nir_store_var(b, vars->tmax, intr->src[0].ssa, 0x1); - nir_def *undef = nir_undef(b, 1, 32); -- nir_store_var(b, vars->primitive_id, undef, 0x1); -- nir_store_var(b, vars->instance_addr, nir_undef(b, 1, 64), 0x1); -- nir_store_var(b, vars->geometry_id_and_flags, undef, 0x1); -- nir_store_var(b, vars->hit_kind, undef, 0x1); - nir_def *miss_index = nir_load_var(b, vars->miss_index); - nir_def *record = load_sbt_entry(b, vars, miss_index, SBT_MISS, SBT_RECURSIVE_PTR); - - if (!(vars->flags & VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) { - /* In case of a NULL miss shader, do nothing and just return. 
*/ -- nir_push_if(b, nir_ieq_imm(b, nir_load_var(b, vars->shader_addr), 0)); -- insert_rt_return(b, vars); -- nir_pop_if(b, NULL); -+ nir_push_if(b, nir_ine_imm(b, nir_load_var(b, vars->shader_addr), 0)); - } - -+ unsigned param_count = CHIT_MISS_ARG_PAYLOAD_BASE + DIV_ROUND_UP(vars->payload_size, 4); -+ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count); -+ args[CHIT_MISS_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2])); -+ args[CHIT_MISS_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2])); -+ args[CHIT_MISS_ARG_DESCRIPTORS] = nir_load_param(b, TRAVERSAL_ARG_DESCRIPTORS); -+ args[CHIT_MISS_ARG_PUSH_CONSTANTS] = nir_load_param(b, TRAVERSAL_ARG_PUSH_CONSTANTS); -+ args[CHIT_MISS_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors); -+ args[CHIT_MISS_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr); -+ args[CHIT_MISS_ARG_SHADER_RECORD_PTR] = record; -+ args[CHIT_MISS_ARG_ACCEL_STRUCT] = nir_load_var(b, vars->accel_struct); -+ args[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS] = nir_load_var(b, vars->cull_mask_and_flags); -+ args[CHIT_MISS_ARG_SBT_OFFSET] = nir_load_var(b, vars->sbt_offset); -+ args[CHIT_MISS_ARG_SBT_STRIDE] = nir_load_var(b, vars->sbt_stride); -+ args[CHIT_MISS_ARG_MISS_INDEX] = nir_load_var(b, vars->miss_index); -+ args[CHIT_MISS_ARG_RAY_ORIGIN] = nir_load_var(b, vars->origin); -+ args[CHIT_MISS_ARG_RAY_TMIN] = nir_load_var(b, vars->tmin); -+ args[CHIT_MISS_ARG_RAY_DIRECTION] = nir_load_var(b, vars->direction); -+ args[CHIT_MISS_ARG_RAY_TMAX] = intr->src[0].ssa; -+ args[CHIT_MISS_ARG_PRIMITIVE_ID] = undef; -+ args[CHIT_MISS_ARG_INSTANCE_ADDR] = nir_undef(b, 1, 64); -+ args[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS] = undef; -+ args[CHIT_MISS_ARG_HIT_KIND] = undef; -+ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) { -+ args[CHIT_MISS_ARG_PAYLOAD_BASE + i] = -+ nir_instr_def(&nir_build_deref_cast(b, nir_load_param(b, TRAVERSAL_ARG_PAYLOAD_BASE + i), -+ nir_var_shader_call_data, glsl_uint_type(), 4) -+ ->instr); -+ } -+ nir_build_indirect_call(b, vars->chit_miss_func, nir_load_var(b, vars->shader_addr), param_count, args); -+ -+ if (!(vars->flags & VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) -+ nir_pop_if(b, NULL); -+ - break; - } - case nir_intrinsic_load_ray_triangle_vertex_positions: { -@@ -1032,6 +1065,14 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) - ret = radv_load_vertex_position(vars->device, b, instance_node_addr, primitive_id, nir_intrinsic_column(intr)); - break; - } -+ case nir_intrinsic_rt_trace_ray: -+ unreachable("nir_intrinsic_rt_trace_ray"); -+ case nir_intrinsic_rt_execute_callable: -+ unreachable("nir_intrinsic_rt_execute_callable"); -+ case nir_intrinsic_rt_resume: -+ unreachable("nir_intrinsic_rt_resume"); -+ case nir_intrinsic_rt_return_amd: -+ unreachable("nir_intrinsic_rt_return_amd"); - default: - return false; - } -@@ -1195,7 +1236,7 @@ insert_rt_case(nir_builder *b, nir_shader *shader, struct rt_variables *vars, ni - } - - void --radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset, uint32_t *payload_size) -+radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset) - { - if (!monolithic) { - NIR_PASS(_, nir, lower_rt_derefs); -@@ -1629,7 +1670,7 @@ radv_build_ahit_case(nir_builder *b, nir_def *sbt_idx, struct 
radv_ray_tracing_g - radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir); - assert(nir_stage); - -- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); -+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); - b->shader->scratch_size = MAX2(nir_stage->scratch_size, b->shader->scratch_size); - - insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.any_hit_index); -@@ -1654,7 +1695,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g - radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->intersection_shader].nir); - assert(nir_stage); - -- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); -+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); - - nir_shader *any_hit_stage = NULL; - if (group->any_hit_shader != VK_SHADER_UNUSED_KHR) { -@@ -1662,7 +1703,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g - radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir); - assert(any_hit_stage); - -- radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset, NULL); -+ radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset); - - /* reserve stack size for any_hit before it is inlined */ - data->pipeline->stages[group->any_hit_shader].stack_size = any_hit_stage->scratch_size; -@@ -1708,7 +1749,7 @@ radv_build_recursive_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_trac - radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->recursive_shader].nir); - assert(nir_stage); - -- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); -+ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); - - insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.general_index); - ralloc_free(nir_stage); -@@ -2175,10 +2216,23 @@ radv_count_hit_attrib_slots(nir_builder *b, nir_intrinsic_instr *instr, void *da - return false; - } - -+static bool -+radv_count_ray_payload_size(nir_builder *b, nir_intrinsic_instr *instr, void *data) -+{ -+ uint32_t *count = data; -+ if (instr->intrinsic == nir_intrinsic_load_incoming_ray_payload_amd || -+ instr->intrinsic == nir_intrinsic_load_outgoing_ray_payload_amd || -+ instr->intrinsic == nir_intrinsic_store_incoming_ray_payload_amd || -+ instr->intrinsic == nir_intrinsic_store_outgoing_ray_payload_amd) -+ *count = MAX2(*count, (nir_intrinsic_base(instr) + 1) * 4); -+ -+ return false; -+} -+ - static void - lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, - struct radv_ray_tracing_pipeline *pipeline, const struct radv_shader_info *info, -- const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, uint32_t payload_size, -+ const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, uint32_t *payload_size, - struct rt_variables *vars) - { - nir_function_impl *impl = radv_get_rt_shader_entrypoint(shader); -@@ -2195,6 +2249,7 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, - - uint32_t hit_attrib_count = 0; - nir_shader_intrinsics_pass(shader, radv_count_hit_attrib_slots, nir_metadata_all, &hit_attrib_count); -+ nir_shader_intrinsics_pass(shader, radv_count_ray_payload_size, nir_metadata_all, payload_size); - - /* Register storage for hit 
attributes */ - STACK_ARRAY(nir_variable *, hit_attribs, hit_attrib_count); -@@ -2203,10 +2258,10 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, - - nir_builder b = nir_builder_create(impl); - b.cursor = nir_before_impl(impl); -- nir_variable **payload_vars = rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4)); -+ nir_variable **payload_vars = rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(*payload_size, 4)); - nir_deref_instr **payload_storage = -- rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4)); -- for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) { -+ rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(*payload_size, 4)); -+ for (unsigned i = 0; i < DIV_ROUND_UP(*payload_size, 4); ++i) { - payload_vars[i] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "_payload"); - payload_storage[i] = nir_build_deref_var(&b, payload_vars[i]); - } -@@ -2215,26 +2270,28 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, - } - - static void --radv_store_arg(nir_builder *b, const struct radv_shader_args *args, const struct radv_ray_tracing_stage_info *info, -- struct ac_arg arg, nir_def *value) -+store_param_var(nir_builder *b, nir_variable *var, unsigned param_index, unsigned num_components, unsigned bit_size) - { -- /* Do not pass unused data to the next stage. */ -- if (!info || !BITSET_TEST(info->unused_args, arg.arg_index)) -- ac_nir_store_arg(b, &args->ac, arg, value); -+ if (param_index != -1u) -+ nir_store_var(b, var, nir_load_param(b, param_index), (1 << num_components) - 1); -+ else -+ nir_store_var(b, var, nir_undef(b, num_components, bit_size), (1 << num_components) - 1); - } - - void - radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, -- const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *stack_size, -- bool resume_shader, uint32_t payload_size, struct radv_device *device, -- struct radv_ray_tracing_pipeline *pipeline, bool monolithic, -- const struct radv_ray_tracing_stage_info *traversal_info) -+ const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *payload_size, -+ uint32_t *stack_size, struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline, -+ bool monolithic) - { - nir_function_impl *impl = nir_shader_get_entrypoint(shader); -+ nir_function *entrypoint_function = impl->function; -+ -+ radv_nir_init_function_params(entrypoint_function, shader->info.stage, *payload_size); - - const VkPipelineCreateFlagBits2KHR create_flags = vk_rt_pipeline_create_flags(pCreateInfo); - -- struct rt_variables vars = create_rt_variables(shader, device, create_flags, payload_size, monolithic); -+ struct rt_variables vars = create_rt_variables(shader, device, create_flags, *payload_size, monolithic); - - if (monolithic) - lower_rt_instructions_monolithic(shader, device, pipeline, info, pCreateInfo, payload_size, &vars); -@@ -2247,152 +2304,158 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH - vars.stack_size = MAX2(vars.stack_size, shader->scratch_size); - *stack_size = MAX2(*stack_size, vars.stack_size); - } -- shader->scratch_size = 0; - - NIR_PASS(_, shader, nir_lower_returns); - -- nir_cf_list list; -- nir_cf_extract(&list, nir_before_impl(impl), nir_after_impl(impl)); -+ unsigned shader_record_ptr_arg = -1u; -+ unsigned launch_id_arg = -1u; -+ unsigned 
launch_size_arg = -1u; -+ unsigned sbt_descriptors_arg = -1u; -+ unsigned traversal_addr_arg = -1u; -+ unsigned accel_struct_arg = -1u; -+ unsigned cull_mask_and_flags_arg = -1u; -+ unsigned sbt_offset_arg = -1u; -+ unsigned sbt_stride_arg = -1u; -+ unsigned miss_index_arg = -1u; -+ unsigned ray_origin_arg = -1u; -+ unsigned ray_tmin_arg = -1u; -+ unsigned ray_direction_arg = -1u; -+ unsigned ray_tmax_arg = -1u; -+ unsigned primitive_id_arg = -1u; -+ unsigned instance_addr_arg = -1u; -+ unsigned geometry_id_and_flags_arg = -1u; -+ unsigned hit_kind_arg = -1u; -+ unsigned in_payload_base_arg = -1u; -+ -+ switch (shader->info.stage) { -+ case MESA_SHADER_CALLABLE: -+ in_payload_base_arg = RAYGEN_ARG_COUNT; -+ shader_record_ptr_arg = RAYGEN_ARG_SHADER_RECORD_PTR; -+ launch_id_arg = RAYGEN_ARG_LAUNCH_ID; -+ launch_size_arg = RAYGEN_ARG_LAUNCH_SIZE; -+ sbt_descriptors_arg = RAYGEN_ARG_SBT_DESCRIPTORS; -+ traversal_addr_arg = RAYGEN_ARG_TRAVERSAL_ADDR; -+ break; -+ case MESA_SHADER_RAYGEN: -+ shader_record_ptr_arg = RAYGEN_ARG_SHADER_RECORD_PTR; -+ launch_id_arg = RAYGEN_ARG_LAUNCH_ID; -+ launch_size_arg = RAYGEN_ARG_LAUNCH_SIZE; -+ sbt_descriptors_arg = RAYGEN_ARG_SBT_DESCRIPTORS; -+ traversal_addr_arg = RAYGEN_ARG_TRAVERSAL_ADDR; -+ break; -+ case MESA_SHADER_INTERSECTION: -+ launch_id_arg = TRAVERSAL_ARG_LAUNCH_ID; -+ launch_size_arg = TRAVERSAL_ARG_LAUNCH_SIZE; -+ sbt_descriptors_arg = TRAVERSAL_ARG_SBT_DESCRIPTORS; -+ traversal_addr_arg = TRAVERSAL_ARG_TRAVERSAL_ADDR; -+ shader_record_ptr_arg = TRAVERSAL_ARG_SHADER_RECORD_PTR; -+ accel_struct_arg = TRAVERSAL_ARG_ACCEL_STRUCT; -+ cull_mask_and_flags_arg = TRAVERSAL_ARG_CULL_MASK_AND_FLAGS; -+ sbt_offset_arg = TRAVERSAL_ARG_SBT_OFFSET; -+ sbt_stride_arg = TRAVERSAL_ARG_SBT_STRIDE; -+ miss_index_arg = TRAVERSAL_ARG_MISS_INDEX; -+ ray_origin_arg = TRAVERSAL_ARG_RAY_ORIGIN; -+ ray_tmin_arg = TRAVERSAL_ARG_RAY_TMIN; -+ ray_direction_arg = TRAVERSAL_ARG_RAY_DIRECTION; -+ ray_tmax_arg = TRAVERSAL_ARG_RAY_TMAX; -+ in_payload_base_arg = TRAVERSAL_ARG_PAYLOAD_BASE; -+ break; -+ case MESA_SHADER_CLOSEST_HIT: -+ case MESA_SHADER_MISS: -+ launch_id_arg = CHIT_MISS_ARG_LAUNCH_ID; -+ launch_size_arg = CHIT_MISS_ARG_LAUNCH_SIZE; -+ sbt_descriptors_arg = CHIT_MISS_ARG_SBT_DESCRIPTORS; -+ traversal_addr_arg = CHIT_MISS_ARG_TRAVERSAL_ADDR; -+ shader_record_ptr_arg = CHIT_MISS_ARG_SHADER_RECORD_PTR; -+ accel_struct_arg = CHIT_MISS_ARG_ACCEL_STRUCT; -+ cull_mask_and_flags_arg = CHIT_MISS_ARG_CULL_MASK_AND_FLAGS; -+ sbt_offset_arg = CHIT_MISS_ARG_SBT_OFFSET; -+ sbt_stride_arg = CHIT_MISS_ARG_SBT_STRIDE; -+ miss_index_arg = CHIT_MISS_ARG_MISS_INDEX; -+ ray_origin_arg = CHIT_MISS_ARG_RAY_ORIGIN; -+ ray_tmin_arg = CHIT_MISS_ARG_RAY_TMIN; -+ ray_direction_arg = CHIT_MISS_ARG_RAY_DIRECTION; -+ ray_tmax_arg = CHIT_MISS_ARG_RAY_TMAX; -+ primitive_id_arg = CHIT_MISS_ARG_PRIMITIVE_ID; -+ instance_addr_arg = CHIT_MISS_ARG_INSTANCE_ADDR; -+ geometry_id_and_flags_arg = CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS; -+ hit_kind_arg = CHIT_MISS_ARG_HIT_KIND; -+ in_payload_base_arg = CHIT_MISS_ARG_PAYLOAD_BASE; -+ break; -+ default: -+ break; -+ } - - /* initialize variables */ - nir_builder b = nir_builder_at(nir_before_impl(impl)); - -- nir_def *descriptor_sets = ac_nir_load_arg(&b, &args->ac, args->descriptor_sets[0]); -- nir_def *push_constants = ac_nir_load_arg(&b, &args->ac, args->ac.push_constants); -- nir_def *sbt_descriptors = ac_nir_load_arg(&b, &args->ac, args->ac.rt.sbt_descriptors); -- - nir_def *launch_sizes[3]; -+ nir_def *launch_size_vec = nir_load_param(&b, 
launch_size_arg); - for (uint32_t i = 0; i < ARRAY_SIZE(launch_sizes); i++) { -- launch_sizes[i] = ac_nir_load_arg(&b, &args->ac, args->ac.rt.launch_sizes[i]); -+ launch_sizes[i] = nir_channel(&b, launch_size_vec, i); - nir_store_var(&b, vars.launch_sizes[i], launch_sizes[i], 1); - } - -- nir_def *scratch_offset = NULL; -- if (args->ac.scratch_offset.used) -- scratch_offset = ac_nir_load_arg(&b, &args->ac, args->ac.scratch_offset); -- nir_def *ring_offsets = NULL; -- if (args->ac.ring_offsets.used) -- ring_offsets = ac_nir_load_arg(&b, &args->ac, args->ac.ring_offsets); -- - nir_def *launch_ids[3]; -+ nir_def *launch_id_vec = nir_load_param(&b, launch_id_arg); - for (uint32_t i = 0; i < ARRAY_SIZE(launch_ids); i++) { -- launch_ids[i] = ac_nir_load_arg(&b, &args->ac, args->ac.rt.launch_ids[i]); -+ launch_ids[i] = nir_channel(&b, launch_id_vec, i); - nir_store_var(&b, vars.launch_ids[i], launch_ids[i], 1); - } - -- nir_def *traversal_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.traversal_shader_addr); -- nir_store_var(&b, vars.traversal_addr, nir_pack_64_2x32(&b, traversal_addr), 1); -+ nir_store_var(&b, vars.traversal_addr, nir_load_param(&b, traversal_addr_arg), 1); - -- nir_def *shader_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.shader_addr); -- shader_addr = nir_pack_64_2x32(&b, shader_addr); -- nir_store_var(&b, vars.shader_addr, shader_addr, 1); -+ nir_store_var(&b, vars.sbt_descriptors, nir_load_param(&b, sbt_descriptors_arg), 1); - -- nir_store_var(&b, vars.stack_ptr, ac_nir_load_arg(&b, &args->ac, args->ac.rt.dynamic_callable_stack_base), 1); -- nir_def *record_ptr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.shader_record); -- nir_store_var(&b, vars.shader_record_ptr, nir_pack_64_2x32(&b, record_ptr), 1); -- nir_store_var(&b, vars.arg, ac_nir_load_arg(&b, &args->ac, args->ac.rt.payload_offset), 1); -- -- nir_def *accel_struct = ac_nir_load_arg(&b, &args->ac, args->ac.rt.accel_struct); -- nir_store_var(&b, vars.accel_struct, nir_pack_64_2x32(&b, accel_struct), 1); -- nir_store_var(&b, vars.cull_mask_and_flags, ac_nir_load_arg(&b, &args->ac, args->ac.rt.cull_mask_and_flags), 1); -- nir_store_var(&b, vars.sbt_offset, ac_nir_load_arg(&b, &args->ac, args->ac.rt.sbt_offset), 1); -- nir_store_var(&b, vars.sbt_stride, ac_nir_load_arg(&b, &args->ac, args->ac.rt.sbt_stride), 1); -- nir_store_var(&b, vars.origin, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_origin), 0x7); -- nir_store_var(&b, vars.tmin, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_tmin), 1); -- nir_store_var(&b, vars.direction, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_direction), 0x7); -- nir_store_var(&b, vars.tmax, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_tmax), 1); -- -- if (traversal_info && traversal_info->miss_index.state == RADV_RT_CONST_ARG_STATE_VALID) -- nir_store_var(&b, vars.miss_index, nir_imm_int(&b, traversal_info->miss_index.value), 0x1); -- else -- nir_store_var(&b, vars.miss_index, ac_nir_load_arg(&b, &args->ac, args->ac.rt.miss_index), 0x1); -- -- nir_store_var(&b, vars.primitive_id, ac_nir_load_arg(&b, &args->ac, args->ac.rt.primitive_id), 1); -- nir_def *instance_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.instance_addr); -- nir_store_var(&b, vars.instance_addr, nir_pack_64_2x32(&b, instance_addr), 1); -- nir_store_var(&b, vars.geometry_id_and_flags, ac_nir_load_arg(&b, &args->ac, args->ac.rt.geometry_id_and_flags), 1); -- nir_store_var(&b, vars.hit_kind, ac_nir_load_arg(&b, &args->ac, args->ac.rt.hit_kind), 1); -- -- /* guard the shader, so that only the correct 
invocations execute it */ -- nir_if *shader_guard = NULL; -- if (shader->info.stage != MESA_SHADER_RAYGEN || resume_shader) { -- nir_def *uniform_shader_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.uniform_shader_addr); -- uniform_shader_addr = nir_pack_64_2x32(&b, uniform_shader_addr); -- uniform_shader_addr = nir_ior_imm(&b, uniform_shader_addr, radv_get_rt_priority(shader->info.stage)); -- -- shader_guard = nir_push_if(&b, nir_ieq(&b, uniform_shader_addr, shader_addr)); -- shader_guard->control = nir_selection_control_divergent_always_taken; -- } -- -- nir_cf_reinsert(&list, b.cursor); -- -- if (shader_guard) -- nir_pop_if(&b, shader_guard); -+ if (monolithic) { -+ nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 1); -+ -+ nir_store_var(&b, vars.arg, nir_imm_int(&b, 0), 1); -+ } -+ -+ store_param_var(&b, vars.shader_record_ptr, shader_record_ptr_arg, 1, 64); -+ store_param_var(&b, vars.accel_struct, accel_struct_arg, 1, 64); -+ store_param_var(&b, vars.cull_mask_and_flags, cull_mask_and_flags_arg, 1, 32); -+ store_param_var(&b, vars.sbt_offset, sbt_offset_arg, 1, 32); -+ store_param_var(&b, vars.sbt_stride, sbt_stride_arg, 1, 32); -+ store_param_var(&b, vars.miss_index, miss_index_arg, 1, 32); -+ store_param_var(&b, vars.origin, ray_origin_arg, 3, 32); -+ store_param_var(&b, vars.tmin, ray_tmin_arg, 1, 32); -+ store_param_var(&b, vars.direction, ray_direction_arg, 3, 32); -+ store_param_var(&b, vars.tmax, ray_tmax_arg, 1, 32); -+ store_param_var(&b, vars.primitive_id, primitive_id_arg, 1, 32); -+ store_param_var(&b, vars.instance_addr, instance_addr_arg, 1, 64); -+ store_param_var(&b, vars.geometry_id_and_flags, geometry_id_and_flags_arg, 1, 32); -+ store_param_var(&b, vars.hit_kind, hit_kind_arg, 1, 32); - - b.cursor = nir_after_impl(impl); - - if (monolithic) { - nir_terminate(&b); -- } else { -- /* select next shader */ -- shader_addr = nir_load_var(&b, vars.shader_addr); -- nir_def *next = select_next_shader(&b, shader_addr, info->wave_size); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.uniform_shader_addr, next); -- -- ac_nir_store_arg(&b, &args->ac, args->descriptor_sets[0], descriptor_sets); -- ac_nir_store_arg(&b, &args->ac, args->ac.push_constants, push_constants); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.sbt_descriptors, sbt_descriptors); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.traversal_shader_addr, traversal_addr); -- -- for (uint32_t i = 0; i < ARRAY_SIZE(launch_sizes); i++) { -- if (rt_info.uses_launch_size) -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.launch_sizes[i], launch_sizes[i]); -- else -- radv_store_arg(&b, args, traversal_info, args->ac.rt.launch_sizes[i], launch_sizes[i]); -- } -- -- if (scratch_offset) -- ac_nir_store_arg(&b, &args->ac, args->ac.scratch_offset, scratch_offset); -- if (ring_offsets) -- ac_nir_store_arg(&b, &args->ac, args->ac.ring_offsets, ring_offsets); -- -- for (uint32_t i = 0; i < ARRAY_SIZE(launch_ids); i++) { -- if (rt_info.uses_launch_id) -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.launch_ids[i], launch_ids[i]); -- else -- radv_store_arg(&b, args, traversal_info, args->ac.rt.launch_ids[i], launch_ids[i]); -- } -- -- /* store back all variables to registers */ -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.dynamic_callable_stack_base, nir_load_var(&b, vars.stack_ptr)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.shader_addr, shader_addr); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.shader_record, nir_load_var(&b, vars.shader_record_ptr)); -- radv_store_arg(&b, args, traversal_info, 
args->ac.rt.payload_offset, nir_load_var(&b, vars.arg)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.accel_struct, nir_load_var(&b, vars.accel_struct)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.cull_mask_and_flags, -- nir_load_var(&b, vars.cull_mask_and_flags)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.sbt_offset, nir_load_var(&b, vars.sbt_offset)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.sbt_stride, nir_load_var(&b, vars.sbt_stride)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.miss_index, nir_load_var(&b, vars.miss_index)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_origin, nir_load_var(&b, vars.origin)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_tmin, nir_load_var(&b, vars.tmin)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_direction, nir_load_var(&b, vars.direction)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_tmax, nir_load_var(&b, vars.tmax)); -- -- radv_store_arg(&b, args, traversal_info, args->ac.rt.primitive_id, nir_load_var(&b, vars.primitive_id)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.instance_addr, nir_load_var(&b, vars.instance_addr)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.geometry_id_and_flags, -- nir_load_var(&b, vars.geometry_id_and_flags)); -- radv_store_arg(&b, args, traversal_info, args->ac.rt.hit_kind, nir_load_var(&b, vars.hit_kind)); - } - - nir_metadata_preserve(impl, nir_metadata_none); - - /* cleanup passes */ -+ if (!monolithic) { -+ NIR_PASS_V(shader, radv_nir_lower_ray_payload_derefs, 0); -+ -+ b.cursor = nir_before_impl(impl); -+ nir_deref_instr **payload_in_storage = -+ rzalloc_array_size(shader, sizeof(nir_deref_instr *), DIV_ROUND_UP(*payload_size, 4)); -+ if (in_payload_base_arg != -1u) { -+ for (unsigned i = 0; i < DIV_ROUND_UP(*payload_size, 4); ++i) { -+ payload_in_storage[i] = nir_build_deref_cast(&b, nir_load_param(&b, in_payload_base_arg + i), -+ nir_var_shader_call_data, glsl_uint_type(), 4); -+ } -+ } -+ NIR_PASS_V(shader, lower_rt_storage, NULL, payload_in_storage, vars.out_payload_storage, info->wave_size); -+ -+ nir_remove_dead_derefs(shader); -+ nir_remove_dead_variables(shader, nir_var_function_temp | nir_var_shader_call_data, NULL); -+ } - NIR_PASS_V(shader, nir_lower_global_vars_to_local); - NIR_PASS_V(shader, nir_lower_vars_to_ssa); -- if (shader->info.stage == MESA_SHADER_CLOSEST_HIT || shader->info.stage == MESA_SHADER_INTERSECTION) -- NIR_PASS_V(shader, lower_hit_attribs, NULL, info->wave_size); - } - - static bool -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index c4feea4a6f95b..196f8aa23a032 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -368,7 +368,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags); - bool keep_statistic_info = radv_pipeline_capture_shader_stats(device, pipeline->base.base.create_flags); - -- radv_nir_lower_rt_io(stage->nir, monolithic, 0, payload_size); -+ radv_nir_lower_rt_io(stage->nir, monolithic, 0); - - /* Gather shader info. 
*/ - nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir)); -@@ -382,70 +382,30 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - stage->info.user_sgprs_locs = stage->args.user_sgprs_locs; - stage->info.inline_push_constant_mask = stage->args.ac.inline_push_const_mask; - -- /* Move ray tracing system values to the top that are set by rt_trace_ray -- * to prevent them from being overwritten by other rt_trace_ray calls. -- */ -- NIR_PASS_V(stage->nir, move_rt_instructions); -- -- uint32_t num_resume_shaders = 0; -- nir_shader **resume_shaders = NULL; -- -- if (stage->stage != MESA_SHADER_INTERSECTION && !monolithic) { -- nir_builder b = nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(stage->nir))); -- nir_rt_return_amd(&b); -- -- const nir_lower_shader_calls_options opts = { -- .address_format = nir_address_format_32bit_offset, -- .stack_alignment = 16, -- .localized_loads = true, -- .vectorizer_callback = ac_nir_mem_vectorize_callback, -- .vectorizer_data = &pdev->info.gfx_level, -- }; -- nir_lower_shader_calls(stage->nir, &opts, &resume_shaders, &num_resume_shaders, stage->nir); -- } -- -- unsigned num_shaders = num_resume_shaders + 1; -- nir_shader **shaders = ralloc_array(stage->nir, nir_shader *, num_shaders); -- if (!shaders) -- return VK_ERROR_OUT_OF_HOST_MEMORY; -- -- shaders[0] = stage->nir; -- for (uint32_t i = 0; i < num_resume_shaders; i++) -- shaders[i + 1] = resume_shaders[i]; -- - if (stage_info) - memset(stage_info->unused_args, 0xFF, sizeof(stage_info->unused_args)); - - /* Postprocess shader parts. */ -- for (uint32_t i = 0; i < num_shaders; i++) { -- struct radv_shader_stage temp_stage = *stage; -- temp_stage.nir = shaders[i]; -- radv_nir_lower_rt_abi(temp_stage.nir, pCreateInfo, &temp_stage.args, &stage->info, stack_size, i > 0, device, -- pipeline, monolithic, traversal_stage_info); -+ radv_nir_lower_rt_abi(stage->nir, pCreateInfo, &stage->args, &stage->info, payload_size, stack_size, device, -+ pipeline, monolithic); - -- /* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */ -- nir_shader_gather_info(temp_stage.nir, radv_get_rt_shader_entrypoint(temp_stage.nir)); -+ /* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */ -+ nir_shader_gather_info(stage->nir, radv_get_rt_shader_entrypoint(stage->nir)); - -- radv_optimize_nir(temp_stage.nir, stage->key.optimisations_disabled); -- radv_postprocess_nir(device, NULL, &temp_stage); -- -- if (stage_info) -- radv_gather_unused_args(stage_info, shaders[i]); -- } -+ radv_optimize_nir(stage->nir, stage->key.optimisations_disabled); -+ radv_postprocess_nir(device, NULL, stage); - -- bool dump_shader = radv_can_dump_shader(device, shaders[0], false); -+ bool dump_shader = radv_can_dump_shader(device, stage->nir, false); - bool replayable = - pipeline->base.base.create_flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR; - - if (dump_shader) { - simple_mtx_lock(&instance->shader_dump_mtx); -- for (uint32_t i = 0; i < num_shaders; i++) -- nir_print_shader(shaders[i], stderr); -+ nir_print_shader(stage->nir, stderr); - } - - /* Compile NIR shader to AMD assembly. 
*/ -- binary = -- radv_shader_nir_to_asm(device, stage, shaders, num_shaders, NULL, keep_executable_info, keep_statistic_info); -+ binary = radv_shader_nir_to_asm(device, stage, &stage->nir, 1, NULL, keep_executable_info, keep_statistic_info); - struct radv_shader *shader; - if (replay_block || replayable) { - VkResult result = radv_shader_create_uncached(device, binary, replayable, replay_block, &shader); -@@ -463,7 +423,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - if (stack_size) - *stack_size += DIV_ROUND_UP(shader->config.scratch_bytes_per_wave, shader->info.wave_size); - -- radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, binary, shader, shaders, num_shaders, -+ radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, binary, shader, &stage->nir, 1, - &stage->info); - - if (shader && keep_executable_info && stage->spirv.size) { -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index f6a0f35c23333..654ae528866d8 100644 ---- a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -516,15 +516,14 @@ radv_get_rt_shader_entrypoint(nir_shader *shader) - return NULL; - } - --void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset, uint32_t *payload_size); -+void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset); - - struct radv_ray_tracing_stage_info; - - void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, - const struct radv_shader_args *args, const struct radv_shader_info *info, -- uint32_t *stack_size, bool resume_shader, uint32_t payload_size, struct radv_device *device, -- struct radv_ray_tracing_pipeline *pipeline, bool monolithic, -- const struct radv_ray_tracing_stage_info *traversal_info); -+ uint32_t *payload_size, uint32_t *stack_size, struct radv_device *device, -+ struct radv_ray_tracing_pipeline *pipeline, bool monolithic); - - void radv_gather_unused_args(struct radv_ray_tracing_stage_info *info, nir_shader *nir); - -diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c -index 75f5a66444f91..e52fc48c33ebc 100644 ---- a/src/amd/vulkan/radv_shader_args.c -+++ b/src/amd/vulkan/radv_shader_args.c -@@ -316,7 +316,7 @@ radv_init_shader_args(const struct radv_device *device, gl_shader_stage stage, s - void - radv_declare_rt_shader_args(enum amd_gfx_level gfx_level, struct radv_shader_args *args) - { -- add_ud_arg(args, 2, AC_ARG_CONST_PTR, &args->ac.rt.uniform_shader_addr, AC_UD_SCRATCH_RING_OFFSETS); -+ add_ud_arg(args, 2, AC_ARG_CONST_PTR, &args->ac.ring_offsets, AC_UD_SCRATCH_RING_OFFSETS); - add_ud_arg(args, 1, AC_ARG_CONST_PTR_PTR, &args->descriptor_sets[0], AC_UD_INDIRECT_DESCRIPTOR_SETS); - ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, &args->ac.push_constants); - ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ac.rt.sbt_descriptors); -@@ -334,25 +334,8 @@ radv_declare_rt_shader_args(enum amd_gfx_level gfx_level, struct radv_shader_arg - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.launch_ids[i]); - - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.dynamic_callable_stack_base); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.shader_addr); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.shader_record); - - ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.payload_offset); -- 
ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_FLOAT, &args->ac.rt.ray_origin); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_FLOAT, &args->ac.rt.ray_direction); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.rt.ray_tmin); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.rt.ray_tmax); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.cull_mask_and_flags); -- -- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.accel_struct); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.sbt_offset); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.sbt_stride); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.miss_index); -- -- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.instance_addr); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.primitive_id); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.geometry_id_and_flags); -- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.hit_kind); - } - - static bool -@@ -548,7 +531,6 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics - radv_init_shader_args(device, stage, args); - - if (gl_shader_stage_is_rt(stage)) { -- radv_declare_rt_shader_args(gfx_level, args); - return; - } - --- -GitLab - - -From 7a6a16e551cf02df8e14d8b729584ca9d8bf5443 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 12:22:46 +0200 -Subject: [PATCH 29/71] radv/rt: Remove radv_gather_unused_args - -Not needed anymore. ---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 47 ------------------------- - src/amd/vulkan/radv_shader.h | 2 -- - 2 files changed, 49 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index 7968cb36f5d87..d0e43ebd406b7 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -2457,50 +2457,3 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH - NIR_PASS_V(shader, nir_lower_global_vars_to_local); - NIR_PASS_V(shader, nir_lower_vars_to_ssa); - } -- --static bool --radv_arg_def_is_unused(nir_def *def) --{ -- nir_foreach_use (use, def) { -- nir_instr *use_instr = nir_src_parent_instr(use); -- if (use_instr->type == nir_instr_type_intrinsic) { -- nir_intrinsic_instr *use_intr = nir_instr_as_intrinsic(use_instr); -- if (use_intr->intrinsic == nir_intrinsic_store_scalar_arg_amd || -- use_intr->intrinsic == nir_intrinsic_store_vector_arg_amd) -- continue; -- } else if (use_instr->type == nir_instr_type_phi) { -- nir_cf_node *prev_node = nir_cf_node_prev(&use_instr->block->cf_node); -- if (!prev_node) -- return false; -- -- nir_phi_instr *phi = nir_instr_as_phi(use_instr); -- if (radv_arg_def_is_unused(&phi->def)) -- continue; -- } -- -- return false; -- } -- -- return true; --} -- --static bool --radv_gather_unused_args_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data) --{ -- if (instr->intrinsic != nir_intrinsic_load_scalar_arg_amd && instr->intrinsic != nir_intrinsic_load_vector_arg_amd) -- return false; -- -- if (!radv_arg_def_is_unused(&instr->def)) { -- /* This arg is used for more than passing data to the next stage. 
*/ -- struct radv_ray_tracing_stage_info *info = data; -- BITSET_CLEAR(info->unused_args, nir_intrinsic_base(instr)); -- } -- -- return false; --} -- --void --radv_gather_unused_args(struct radv_ray_tracing_stage_info *info, nir_shader *nir) --{ -- nir_shader_intrinsics_pass(nir, radv_gather_unused_args_instr, nir_metadata_all, info); --} -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index 654ae528866d8..7dacf66a7a3fa 100644 ---- a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -525,8 +525,6 @@ void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateI - uint32_t *payload_size, uint32_t *stack_size, struct radv_device *device, - struct radv_ray_tracing_pipeline *pipeline, bool monolithic); - --void radv_gather_unused_args(struct radv_ray_tracing_stage_info *info, nir_shader *nir); -- - struct radv_shader_stage; - - nir_shader *radv_shader_spirv_to_nir(struct radv_device *device, const struct radv_shader_stage *stage, --- -GitLab - - -From c4aa21f8f03032e97d13aece927b62240986fd39 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 4 May 2024 17:51:17 +0200 -Subject: [PATCH 30/71] radv/rt: make radv_nir_init_rt_function_params public - ---- - src/amd/vulkan/nir/radv_nir_rt_shader.c | 10 +++++----- - src/amd/vulkan/radv_shader.h | 1 + - 2 files changed, 6 insertions(+), 5 deletions(-) - -diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c -index d0e43ebd406b7..aa9af1eeefd54 100644 ---- a/src/amd/vulkan/nir/radv_nir_rt_shader.c -+++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c -@@ -221,7 +221,7 @@ lower_rt_derefs(nir_shader *shader) - return progress; - } - --static void -+void - radv_nir_init_rt_function_params(nir_function *function, gl_shader_stage stage, unsigned payload_size) - { - unsigned payload_base = -1u; -@@ -583,13 +583,13 @@ create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipe - } - - nir_function *trace_ray_func = nir_function_create(shader, "trace_ray_func"); -- radv_nir_init_function_params(trace_ray_func, MESA_SHADER_INTERSECTION, max_payload_size); -+ radv_nir_init_rt_function_params(trace_ray_func, MESA_SHADER_INTERSECTION, max_payload_size); - vars.trace_ray_func = trace_ray_func; - nir_function *chit_miss_func = nir_function_create(shader, "chit_miss_func"); -- radv_nir_init_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, max_payload_size); -+ radv_nir_init_rt_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, max_payload_size); - vars.chit_miss_func = chit_miss_func; - nir_function *callable_func = nir_function_create(shader, "callable_func"); -- radv_nir_init_function_params(callable_func, MESA_SHADER_CALLABLE, max_payload_size); -+ radv_nir_init_rt_function_params(callable_func, MESA_SHADER_CALLABLE, max_payload_size); - vars.callable_func = callable_func; - return vars; - } -@@ -2287,7 +2287,7 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH - nir_function_impl *impl = nir_shader_get_entrypoint(shader); - nir_function *entrypoint_function = impl->function; - -- radv_nir_init_function_params(entrypoint_function, shader->info.stage, *payload_size); -+ radv_nir_init_rt_function_params(entrypoint_function, shader->info.stage, *payload_size); - - const VkPipelineCreateFlagBits2KHR create_flags = vk_rt_pipeline_create_flags(pCreateInfo); - -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index 7dacf66a7a3fa..10e062fb041b9 100644 ---- 
a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -520,6 +520,7 @@ void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_ - - struct radv_ray_tracing_stage_info; - -+void radv_nir_init_rt_function_params(nir_function *function, gl_shader_stage stage, unsigned payload_size); - void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, - const struct radv_shader_args *args, const struct radv_shader_info *info, - uint32_t *payload_size, uint32_t *stack_size, struct radv_device *device, --- -GitLab - - -From 98acf10bc32ec843f53497bc701a673777232c65 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Fri, 3 May 2024 17:36:43 +0200 -Subject: [PATCH 31/71] radv: Use call optimization - ---- - src/amd/vulkan/radv_pipeline.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c -index 82a5aac71437d..daaf4e9ba4f00 100644 ---- a/src/amd/vulkan/radv_pipeline.c -+++ b/src/amd/vulkan/radv_pipeline.c -@@ -643,6 +643,8 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat - * spilling. - */ - NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons); -+ -+ NIR_PASS(_, stage->nir, nir_minimize_call_live_states); - } - } - --- -GitLab - - -From 872b8a249c2fa92a5425c4476d7021d881d76990 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Thu, 28 Dec 2023 20:03:05 +0100 -Subject: [PATCH 32/71] aco: Add ABI and Pseudo CALL format - ---- - src/amd/compiler/aco_builder_h.py | 29 +++ - .../compiler/aco_instruction_selection.cpp | 23 ++ - src/amd/compiler/aco_ir.cpp | 1 + - src/amd/compiler/aco_ir.h | 235 +++++++++++++++++- - src/amd/compiler/aco_opcodes.py | 7 +- - src/amd/compiler/aco_register_allocation.cpp | 71 ------ - 6 files changed, 292 insertions(+), 74 deletions(-) - -diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py -index b1054bda76fd6..3d47be6101248 100644 ---- a/src/amd/compiler/aco_builder_h.py -+++ b/src/amd/compiler/aco_builder_h.py -@@ -567,6 +567,7 @@ formats = [("pseudo", [Format.PSEUDO], list(itertools.product(range(5), range(6) - ("branch", [Format.PSEUDO_BRANCH], itertools.product([1], [0, 1])), - ("barrier", [Format.PSEUDO_BARRIER], [(0, 0)]), - ("reduction", [Format.PSEUDO_REDUCTION], [(3, 3)]), -+ ("call", [Format.PSEUDO_CALL], [(0, 0)]), - ("vop1", [Format.VOP1], [(0, 0), (1, 1), (2, 2)]), - ("vop1_sdwa", [Format.VOP1, Format.SDWA], [(1, 1)]), - ("vop2", [Format.VOP2], itertools.product([1, 2], [2, 3])), -@@ -603,6 +604,7 @@ formats = [("pseudo", [Format.PSEUDO], list(itertools.product(range(5), range(6) - formats = [(f if len(f) == 5 else f + ('',)) for f in formats] - %>\\ - % for name, formats, shapes, extra_field_setup in formats: -+ % if shapes: - % for num_definitions, num_operands in shapes: - <% - args = ['aco_opcode opcode'] -@@ -655,6 +657,33 @@ formats = [(f if len(f) == 5 else f + ('',)) for f in formats] - - % endif - % endfor -+% else: -+ <% -+ args = ['aco_opcode opcode', 'aco::span definitions', 'aco::span operands' ] -+ for f in formats: -+ args += f.get_builder_field_decls() -+ %>\\ -+ -+ Result ${name}(${', '.join(args)}) -+ { -+ ${struct} *instr = create_instruction<${struct}>(opcode, (Format)(${'|'.join('(int)Format::%s' % f.name for f in formats)}), operands.size(), definitions.size()); -+ for (unsigned i = 0; i < definitions.size(); ++i) { -+ instr->definitions[i] = definitions[i]; -+ instr->definitions[i].setPrecise(is_precise); -+ 
instr->definitions[i].setNUW(is_nuw); -+ } -+ for (unsigned i = 0; i < operands.size(); ++i) -+ instr->operands[i] = operands[i]; -+ % for f in formats: -+ % for dest, field_name in zip(f.get_builder_field_dests(), f.get_builder_field_names()): -+ instr->${dest} = ${field_name}; -+ % endfor -+ ${f.get_builder_initialization(num_operands)} -+ % endfor -+ ${extra_field_setup} -+ return insert(instr); -+ } -+% endif - % endfor - }; - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index 30f0bdd1cb8f8..662b6cccc0abf 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -10537,6 +10537,29 @@ visit_jump(isel_context* ctx, nir_jump_instr* instr) - } - } - -+ABI -+make_abi(const ABI& base, Program* program) -+{ -+ ABI abi = base; -+ -+ unsigned sgpr_limit = program->dev.sgpr_limit; -+ /* GFX8- needs a scratch_rsrc that we need to keep around somewhere */ -+ if (program->gfx_level < GFX9) -+ sgpr_limit -= (align(sgpr_limit, 4) - sgpr_limit) + 4; -+ unsigned vgpr_limit = program->dev.vgpr_limit; -+ -+ abi.parameterSpace.sgpr.size = -+ std::min(abi.parameterSpace.sgpr.size, sgpr_limit - abi.parameterSpace.sgpr.lo()); -+ abi.parameterSpace.vgpr.size = -+ std::min(abi.parameterSpace.vgpr.size, vgpr_limit - (abi.parameterSpace.vgpr.lo() - 256)); -+ abi.clobberedRegs.sgpr.size = -+ std::min(abi.clobberedRegs.sgpr.size, sgpr_limit - abi.clobberedRegs.sgpr.lo()); -+ abi.clobberedRegs.vgpr.size = -+ std::min(abi.clobberedRegs.vgpr.size, vgpr_limit - (abi.clobberedRegs.vgpr.lo() - 256)); -+ -+ return abi; -+} -+ - void - visit_block(isel_context* ctx, nir_block* block) - { -diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp -index 2c0b17a82cae6..afa1364a83f59 100644 ---- a/src/amd/compiler/aco_ir.cpp -+++ b/src/amd/compiler/aco_ir.cpp -@@ -1541,6 +1541,7 @@ get_instr_data_size(Format format) - case Format::PSEUDO_BARRIER: return sizeof(Pseudo_barrier_instruction); - case Format::PSEUDO_REDUCTION: return sizeof(Pseudo_reduction_instruction); - case Format::PSEUDO_BRANCH: return sizeof(Pseudo_branch_instruction); -+ case Format::PSEUDO_CALL: return sizeof(Pseudo_call_instruction); - case Format::DS: return sizeof(DS_instruction); - case Format::FLAT: - case Format::GLOBAL: -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index d838b728e19ce..62661b8918a9e 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -441,6 +441,215 @@ static constexpr PhysReg exec_hi{127}; - static constexpr PhysReg pops_exiting_wave_id{239}; /* GFX9-GFX10.3 */ - static constexpr PhysReg scc{253}; - -+/* Iterator type for making PhysRegInterval compatible with range-based for */ -+struct PhysRegIterator { -+ using difference_type = int; -+ using value_type = unsigned; -+ using reference = const unsigned&; -+ using pointer = const unsigned*; -+ using iterator_category = std::bidirectional_iterator_tag; -+ -+ PhysReg reg; -+ -+ PhysReg operator*() const { return reg; } -+ -+ PhysRegIterator& operator++() -+ { -+ reg.reg_b += 4; -+ return *this; -+ } -+ -+ PhysRegIterator& operator--() -+ { -+ reg.reg_b -= 4; -+ return *this; -+ } -+ -+ bool operator==(PhysRegIterator oth) const { return reg == oth.reg; } -+ -+ bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; } -+ -+ bool operator<(PhysRegIterator oth) const { return reg < oth.reg; } -+}; -+ -+/* Half-open register interval used in "sliding window"-style for-loops */ -+struct 
PhysRegInterval { -+ PhysReg lo_; -+ unsigned size; -+ -+ /* Inclusive lower bound */ -+ PhysReg lo() const { return lo_; } -+ -+ /* Exclusive upper bound */ -+ PhysReg hi() const { return PhysReg{lo() + size}; } -+ -+ PhysRegInterval& operator+=(uint32_t stride) -+ { -+ lo_ = PhysReg{lo_.reg() + stride}; -+ return *this; -+ } -+ -+ bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; } -+ -+ /* Construct a half-open interval, excluding the end register */ -+ static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; } -+ -+ bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); } -+ -+ bool contains(const PhysRegInterval& needle) const -+ { -+ return needle.lo() >= lo() && needle.hi() <= hi(); -+ } -+ -+ PhysRegIterator begin() const { return {lo_}; } -+ -+ PhysRegIterator end() const { return {PhysReg{lo_ + size}}; } -+}; -+ -+inline bool -+intersects(const PhysRegInterval& a, const PhysRegInterval& b) -+{ -+ return a.hi() > b.lo() && b.hi() > a.lo(); -+} -+ -+struct GPRInterval { -+ PhysRegInterval sgpr; -+ PhysRegInterval vgpr; -+}; -+ -+struct ABI { -+ GPRInterval parameterSpace; -+ GPRInterval clobberedRegs; -+ -+ bool clobbersVCC; -+ bool clobbersSCC; -+}; -+ -+static constexpr ABI rtRaygenABI = { -+ .parameterSpace = -+ { -+ .sgpr = -+ { -+ .lo_ = PhysReg(0), -+ .size = 32, -+ }, -+ .vgpr = -+ { -+ .lo_ = PhysReg(256), -+ .size = 32, -+ }, -+ }, -+ .clobberedRegs = -+ { -+ .sgpr = -+ { -+ .lo_ = PhysReg(0), -+ .size = 108, -+ }, -+ .vgpr = -+ { -+ .lo_ = PhysReg(256), -+ .size = 128, -+ }, -+ }, -+ .clobbersVCC = true, -+ .clobbersSCC = true, -+}; -+ -+static constexpr ABI rtTraversalABI = { -+ .parameterSpace = -+ { -+ .sgpr = -+ { -+ .lo_ = PhysReg(0), -+ .size = 32, -+ }, -+ .vgpr = -+ { -+ .lo_ = PhysReg(256), -+ .size = 32, -+ }, -+ }, -+ .clobberedRegs = -+ { -+ /* TODO: maybe find better values */ -+ .sgpr = -+ { -+ .lo_ = PhysReg(0), -+ .size = 108, -+ }, -+ .vgpr = -+ { -+ .lo_ = PhysReg(256), -+ .size = 128, -+ }, -+ }, -+ .clobbersVCC = true, -+ .clobbersSCC = true, -+}; -+ -+static constexpr ABI rtAnyHitABI = { -+ .parameterSpace = -+ { -+ .sgpr = -+ { -+ .lo_ = PhysReg(0), -+ .size = 32, -+ }, -+ .vgpr = -+ { -+ .lo_ = PhysReg(256), -+ .size = 32, -+ }, -+ }, -+ .clobberedRegs = -+ { -+ .sgpr = -+ { -+ .lo_ = PhysReg(80), -+ .size = 16, -+ }, -+ .vgpr = -+ { -+ .lo_ = PhysReg(256 + 80), -+ .size = 32, -+ }, -+ }, -+ .clobbersVCC = true, -+ .clobbersSCC = true, -+}; -+ -+static constexpr ABI rtClosestHitMissABI = { -+ .parameterSpace = -+ { -+ .sgpr = -+ { -+ .lo_ = PhysReg(0), -+ .size = 32, -+ }, -+ .vgpr = -+ { -+ .lo_ = PhysReg(256), -+ .size = 32, -+ }, -+ }, -+ .clobberedRegs = -+ { -+ .sgpr = -+ { -+ .lo_ = PhysReg(0), -+ .size = 108, -+ }, -+ .vgpr = -+ { -+ .lo_ = PhysReg(256), -+ .size = 128, -+ }, -+ }, -+ .clobbersVCC = true, -+ .clobbersSCC = true, -+}; -+ - /** - * Operand Class - * Initially, each Operand refers to either -@@ -1095,6 +1304,7 @@ struct FLAT_instruction; - struct Pseudo_branch_instruction; - struct Pseudo_barrier_instruction; - struct Pseudo_reduction_instruction; -+struct Pseudo_call_instruction; - struct VALU_instruction; - struct VINTERP_inreg_instruction; - struct VINTRP_instruction; -@@ -1295,6 +1505,17 @@ struct Instruction { - return *(Pseudo_reduction_instruction*)this; - } - constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; } -+ Pseudo_call_instruction& call() noexcept -+ { -+ assert(isCall()); -+ return 
*(Pseudo_call_instruction*)this; -+ } -+ const Pseudo_call_instruction& call() const noexcept -+ { -+ assert(isCall()); -+ return *(Pseudo_call_instruction*)this; -+ } -+ constexpr bool isCall() const noexcept { return format == Format::PSEUDO_CALL; } - constexpr bool isVOP3P() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3P; } - VINTERP_inreg_instruction& vinterp_inreg() noexcept - { -@@ -1773,6 +1994,16 @@ struct Pseudo_reduction_instruction : public Instruction { - static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, - "Unexpected padding"); - -+struct Pseudo_call_instruction : public Instruction { -+ ABI abi; -+ /* -+ * Register demand that's exclusively used for blocking registers for ABI compatibility. -+ * Set by live var analysis. -+ */ -+ RegisterDemand blocked_abi_demand; -+}; -+static_assert(sizeof(Pseudo_call_instruction) == sizeof(Instruction) + 40, "Unexpected padding"); -+ - inline bool - Instruction::accessesLDS() const noexcept - { -@@ -1845,8 +2076,8 @@ memory_sync_info get_sync_info(const Instruction* instr); - inline bool - is_dead(const std::vector& uses, const Instruction* instr) - { -- if (instr->definitions.empty() || instr->isBranch() || instr->opcode == aco_opcode::p_startpgm || -- instr->opcode == aco_opcode::p_init_scratch || -+ if (instr->definitions.empty() || instr->isBranch() || instr->isCall() || -+ instr->opcode == aco_opcode::p_startpgm || instr->opcode == aco_opcode::p_init_scratch || - instr->opcode == aco_opcode::p_dual_src_export_gfx11) - return false; - -diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py -index 6e37ee6fad6c9..d828f1642658b 100644 ---- a/src/amd/compiler/aco_opcodes.py -+++ b/src/amd/compiler/aco_opcodes.py -@@ -50,6 +50,7 @@ class Format(IntEnum): - PSEUDO_BRANCH = auto() - PSEUDO_BARRIER = auto() - PSEUDO_REDUCTION = auto() -+ PSEUDO_CALL = auto() - # Scalar ALU & Control Formats - SOP1 = auto() - SOP2 = auto() -@@ -93,7 +94,7 @@ class Format(IntEnum): - return "salu" - elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]: - return "flatlike" -- elif self in [Format.PSEUDO_BRANCH, Format.PSEUDO_REDUCTION, Format.PSEUDO_BARRIER]: -+ elif self in [Format.PSEUDO_BRANCH, Format.PSEUDO_REDUCTION, Format.PSEUDO_BARRIER, Format.PSEUDO_CALL]: - return self.name.split("_")[-1].lower() - else: - return self.name.lower() -@@ -162,6 +163,8 @@ class Format(IntEnum): - elif self == Format.PSEUDO_BARRIER: - return [('memory_sync_info', 'sync', None), - ('sync_scope', 'exec_scope', 'scope_invocation')] -+ elif self == Format.PSEUDO_CALL: -+ return [('ABI', 'abi', None)] - elif self == Format.VINTRP: - return [('unsigned', 'attribute', None), - ('unsigned', 'component', None), -@@ -351,6 +354,8 @@ insn("p_cbranch_nz", format=Format.PSEUDO_BRANCH) - - insn("p_barrier", format=Format.PSEUDO_BARRIER) - -+insn("p_call", format=Format.PSEUDO_CALL) -+ - # Primitive Ordered Pixel Shading pseudo-instructions. 
- - # For querying whether the current wave can enter the ordered section on GFX9-10.3, doing -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index 3ce0680bf52d6..4d73525bd0660 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -74,37 +74,6 @@ struct assignment { - } - }; - --/* Iterator type for making PhysRegInterval compatible with range-based for */ --struct PhysRegIterator { -- using difference_type = int; -- using value_type = unsigned; -- using reference = const unsigned&; -- using pointer = const unsigned*; -- using iterator_category = std::bidirectional_iterator_tag; -- -- PhysReg reg; -- -- PhysReg operator*() const { return reg; } -- -- PhysRegIterator& operator++() -- { -- reg.reg_b += 4; -- return *this; -- } -- -- PhysRegIterator& operator--() -- { -- reg.reg_b -= 4; -- return *this; -- } -- -- bool operator==(PhysRegIterator oth) const { return reg == oth.reg; } -- -- bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; } -- -- bool operator<(PhysRegIterator oth) const { return reg < oth.reg; } --}; -- - struct vector_info { - vector_info() : is_weak(false), num_parts(0), parts(NULL) {} - vector_info(Instruction* instr, unsigned start = 0, bool weak = false) -@@ -162,46 +131,6 @@ struct ra_ctx { - } - }; - --/* Half-open register interval used in "sliding window"-style for-loops */ --struct PhysRegInterval { -- PhysReg lo_; -- unsigned size; -- -- /* Inclusive lower bound */ -- PhysReg lo() const { return lo_; } -- -- /* Exclusive upper bound */ -- PhysReg hi() const { return PhysReg{lo() + size}; } -- -- PhysRegInterval& operator+=(uint32_t stride) -- { -- lo_ = PhysReg{lo_.reg() + stride}; -- return *this; -- } -- -- bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; } -- -- /* Construct a half-open interval, excluding the end register */ -- static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; } -- -- bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); } -- -- bool contains(const PhysRegInterval& needle) const -- { -- return needle.lo() >= lo() && needle.hi() <= hi(); -- } -- -- PhysRegIterator begin() const { return {lo_}; } -- -- PhysRegIterator end() const { return {PhysReg{lo_ + size}}; } --}; -- --bool --intersects(const PhysRegInterval& a, const PhysRegInterval& b) --{ -- return a.hi() > b.lo() && b.hi() > a.lo(); --} -- - /* Gets the stride for full (non-subdword) registers */ - uint32_t - get_stride(RegClass rc) --- -GitLab - - -From 9d88284e83bab4a0ba20700dc3be48c646284a79 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Tue, 9 Apr 2024 08:08:07 +0200 -Subject: [PATCH 33/71] aco: Add pseudo instr to calculate a function callee's - stack pointer - ---- - src/amd/compiler/aco_lower_to_hw_instr.cpp | 14 ++++++++++++++ - src/amd/compiler/aco_opcodes.py | 2 ++ - 2 files changed, 16 insertions(+) - -diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp -index fa3c805f491b5..1e1737319c3f6 100644 ---- a/src/amd/compiler/aco_lower_to_hw_instr.cpp -+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp -@@ -2817,6 +2817,20 @@ lower_to_hw_instr(Program* program) - ((32 - 1) << 11) | shader_cycles_hi); - break; - } -+ case aco_opcode::p_callee_stack_ptr: { -+ unsigned caller_stack_size = -+ ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; -+ unsigned scratch_param_size = 
instr->operands[0].constantValue(); -+ unsigned callee_stack_start = caller_stack_size + scratch_param_size; -+ if (ctx.program->gfx_level < GFX9) -+ callee_stack_start *= ctx.program->wave_size; -+ if (instr->operands.size() < 2) -+ bld.sop1(aco_opcode::s_mov_b32, instr->definitions[0], -+ Operand::c32(callee_stack_start)); -+ else -+ bld.sop2(aco_opcode::s_add_u32, instr->definitions[0], Definition(scc, s1), -+ instr->operands[1], Operand::c32(callee_stack_start)); -+ } - default: break; - } - } else if (instr->isBranch()) { -diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py -index d828f1642658b..696a5a945b310 100644 ---- a/src/amd/compiler/aco_opcodes.py -+++ b/src/amd/compiler/aco_opcodes.py -@@ -331,6 +331,8 @@ insn("p_boolean_phi") - insn("p_as_uniform") - insn("p_unit_test") - -+insn("p_callee_stack_ptr") -+ - insn("p_create_vector") - insn("p_extract_vector") - insn("p_split_vector") --- -GitLab - - -From 0e07c86fd764126d0af3bfb2041d680e9367ee6e Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 22 Apr 2024 06:50:54 +0200 -Subject: [PATCH 34/71] aco: Add scratch stack pointer - -Function callees shouldn't overwrite caller's stacks. -Track where to write scratch data with a stack pointer. ---- - src/amd/compiler/aco_ir.h | 1 + - src/amd/compiler/aco_reindex_ssa.cpp | 1 + - 2 files changed, 2 insertions(+) - -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 62661b8918a9e..ef2a6a0255664 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2361,6 +2361,7 @@ public: - std::vector constant_data; - Temp private_segment_buffer; - Temp scratch_offset; -+ Temp stack_ptr = {}; - - uint16_t num_waves = 0; - uint16_t min_waves = 0; -diff --git a/src/amd/compiler/aco_reindex_ssa.cpp b/src/amd/compiler/aco_reindex_ssa.cpp -index 7c30e5b53656e..5e135a8ff83fe 100644 ---- a/src/amd/compiler/aco_reindex_ssa.cpp -+++ b/src/amd/compiler/aco_reindex_ssa.cpp -@@ -73,6 +73,7 @@ reindex_program(idx_ctx& ctx, Program* program) - program->private_segment_buffer.regClass()); - program->scratch_offset = - Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass()); -+ program->stack_ptr = Temp(ctx.renames[program->stack_ptr.id()], program->stack_ptr.regClass()); - program->temp_rc = ctx.temp_rc; - } - --- -GitLab - - -From e876db458963a92579827a04a21b1427c0442c72 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 22 Apr 2024 06:51:10 +0200 -Subject: [PATCH 35/71] aco/spill: Use scratch stack pointer - ---- - src/amd/compiler/aco_spill.cpp | 7 ++++++- - 1 file changed, 6 insertions(+), 1 deletion(-) - -diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp -index be45b0eda7632..2e30bf9e2783e 100644 ---- a/src/amd/compiler/aco_spill.cpp -+++ b/src/amd/compiler/aco_spill.cpp -@@ -1240,7 +1240,12 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, - } - - /* GFX9+ uses scratch_* instructions, which don't use a resource. 
*/ -- ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr)); -+ if (ctx.program->stack_ptr.id()) -+ ctx.scratch_rsrc = -+ offset_bld.sop2(aco_opcode::s_add_u32, offset_bld.def(s1), Definition(scc, s1), -+ Operand(ctx.program->stack_ptr), Operand::c32(saddr)); -+ else -+ ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr)); - } - } else { - if (ctx.scratch_rsrc == Temp()) --- -GitLab - - -From 968bea7283d902a01843297661c63ea802a67a04 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 4 May 2024 16:01:59 +0200 -Subject: [PATCH 36/71] nir: Allow forward-declaring nir_parameter - ---- - src/compiler/nir/nir.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h -index 10a592f4b87bb..dc6b15cd082b4 100644 ---- a/src/compiler/nir/nir.h -+++ b/src/compiler/nir/nir.h -@@ -3646,7 +3646,7 @@ nir_cf_list_is_empty_block(struct exec_list *cf_list) - return false; - } - --typedef struct { -+typedef struct nir_parameter { - uint8_t num_components; - uint8_t bit_size; - --- -GitLab - - -From e245c9553b06094af7afc232d8db158bd2e7b3d6 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 6 Mar 2024 13:27:17 +0100 -Subject: [PATCH 37/71] aco: Add call info - ---- - .../compiler/aco_instruction_selection.cpp | 80 +++++++++++++++++++ - src/amd/compiler/aco_instruction_selection.h | 32 ++++++++ - .../aco_instruction_selection_setup.cpp | 8 ++ - src/amd/compiler/aco_ir.h | 4 + - 4 files changed, 124 insertions(+) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index 662b6cccc0abf..0875d1c7a20f4 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -10,6 +10,7 @@ - #include "aco_builder.h" - #include "aco_interface.h" - #include "aco_ir.h" -+#include "aco_nir_call_attribs.h" - - #include "common/ac_descriptors.h" - #include "common/ac_gpu_info.h" -@@ -10560,6 +10561,85 @@ make_abi(const ABI& base, Program* program) - return abi; - } - -+struct callee_info -+get_callee_info(const ABI& abi, unsigned param_count, const nir_parameter* parameters, -+ Program* program) -+{ -+ struct callee_info info = {}; -+ info.param_infos.reserve(param_count); -+ -+ unsigned sgpr_reg_byte_offset = 0; -+ unsigned vgpr_reg_byte_offset = 0; -+ unsigned scratch_param_byte_offset = 0; -+ -+ Temp return_addr = program ? 
program->allocateTmp(s2) : Temp(); -+ Definition return_def = Definition(return_addr); -+ return_def.setPrecolored(abi.parameterSpace.sgpr.lo().advance(sgpr_reg_byte_offset)); -+ sgpr_reg_byte_offset += 8; -+ -+ info.return_address = parameter_info{ -+ .discardable = false, -+ .is_reg = true, -+ .def = return_def, -+ }; -+ -+ for (unsigned i = 0; i < param_count; ++i) { -+ unsigned* reg_byte_offset; -+ PhysRegInterval interval; -+ RegType type; -+ if (parameters[i].is_uniform) { -+ reg_byte_offset = &sgpr_reg_byte_offset; -+ interval = abi.parameterSpace.sgpr; -+ /* Explicitly reserve space for the stack pointer, which is allocated last */ -+ interval.size -= 1; -+ type = RegType::sgpr; -+ } else { -+ reg_byte_offset = &vgpr_reg_byte_offset; -+ interval = abi.parameterSpace.vgpr; -+ type = RegType::vgpr; -+ } -+ -+ unsigned byte_size = align(parameters[i].bit_size, 32) / 8 * parameters[i].num_components; -+ RegClass rc = RegClass(type, byte_size / 4); -+ PhysReg param_reg = interval.lo().advance(*reg_byte_offset); -+ -+ if (param_reg < interval.hi()) { -+ ++info.reg_param_count; -+ if (parameters[i].is_return) -+ ++info.reg_return_param_count; -+ Temp dst = program ? program->allocateTmp(rc) : Temp(); -+ Definition def = Definition(dst); -+ def.setPrecolored(param_reg); -+ *reg_byte_offset += byte_size; -+ info.param_infos.emplace_back(parameter_info{ -+ .discardable = !!(parameters[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE), -+ .is_reg = true, -+ .def = def, -+ }); -+ } else { -+ info.param_infos.emplace_back(parameter_info{ -+ .discardable = !!(parameters[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE), -+ .is_reg = false, -+ .scratch_offset = scratch_param_byte_offset, -+ }); -+ scratch_param_byte_offset += byte_size; -+ } -+ } -+ -+ Temp stack_ptr = program ? 
program->allocateTmp(s1) : Temp(); -+ Definition stack_def = Definition(stack_ptr); -+ stack_def.setPrecolored(abi.parameterSpace.sgpr.lo().advance(sgpr_reg_byte_offset)); -+ sgpr_reg_byte_offset += 4; -+ info.stack_ptr = parameter_info{ -+ .discardable = false, -+ .is_reg = true, -+ .def = stack_def, -+ }; -+ -+ info.scratch_param_size = scratch_param_byte_offset; -+ return info; -+} -+ - void - visit_block(isel_context* ctx, nir_block* block) - { -diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h -index d7464811def91..1682ed262f1e5 100644 ---- a/src/amd/compiler/aco_instruction_selection.h -+++ b/src/amd/compiler/aco_instruction_selection.h -@@ -35,6 +35,31 @@ struct shader_io_state { - } - }; - -+struct parameter_info { -+ bool discardable; -+ bool is_reg; -+ union { -+ Definition def; -+ unsigned scratch_offset; -+ }; -+}; -+ -+struct call_info { -+ nir_call_instr* nir_instr; -+ Instruction* aco_instr; -+ std::vector return_info; -+ unsigned scratch_param_size; -+}; -+ -+struct callee_info { -+ std::vector param_infos; -+ parameter_info return_address; -+ parameter_info stack_ptr; -+ unsigned reg_param_count = 0; -+ unsigned reg_return_param_count = 0; -+ unsigned scratch_param_size = 0; -+}; -+ - struct exec_info { - /* Set to false when loop_nest_depth==0 && parent_if.is_divergent==false */ - bool potentially_empty_discard = false; -@@ -111,6 +136,13 @@ struct isel_context { - uint32_t wqm_instruction_idx; - - BITSET_DECLARE(output_args, AC_MAX_ARGS); -+ -+ /* Function information */ -+ ABI callee_abi; -+ struct callee_info callee_info; -+ std::vector call_infos; -+ Temp next_divergent_pc; -+ Temp next_pc; - }; - - inline Temp -diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp -index 28708503c6b38..f1cd92aad5fd2 100644 ---- a/src/amd/compiler/aco_instruction_selection_setup.cpp -+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp -@@ -393,6 +393,8 @@ init_context(isel_context* ctx, nir_shader* shader) - ctx->program->allocateRange(impl->ssa_alloc); - RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; - -+ unsigned call_count = 0; -+ - /* TODO: make this recursive to improve compile times */ - bool done = false; - while (!done) { -@@ -699,12 +701,18 @@ init_context(isel_context* ctx, nir_shader* shader) - regclasses[phi->def.index] = rc; - break; - } -+ case nir_instr_type_call: { -+ ++call_count; -+ break; -+ } - default: break; - } - } - } - } - -+ ctx->call_infos.reserve(call_count); -+ - ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena; - ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr; - -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index ef2a6a0255664..920174ac50798 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2385,6 +2385,10 @@ public: - /* For shader part with previous shader part that has lds access. 
*/ - bool pending_lds_access = false; - -+ ABI callee_abi = {}; -+ unsigned short arg_sgpr_count; -+ unsigned short arg_vgpr_count; -+ - struct { - monotonic_buffer_resource memory; - /* live-in temps per block */ --- -GitLab - - -From 112032179e9758b2c24ab0184b3dd73ff34d7266 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sun, 21 Apr 2024 17:52:58 +0200 -Subject: [PATCH 38/71] aco/isel: Use stack pointer parameter in - load/store_scratch - ---- - .../compiler/aco_instruction_selection.cpp | 32 +++++++++++++++++-- - 1 file changed, 29 insertions(+), 3 deletions(-) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index 0875d1c7a20f4..f985685b1d524 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -7751,11 +7751,28 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) - if (ctx->program->gfx_level >= GFX9) { - if (nir_src_is_const(instr->src[0])) { - uint32_t max = ctx->program->dev.scratch_global_offset_max + 1; -- info.offset = -- bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max))); -+ if (ctx->callee_info.stack_ptr.is_reg) -+ info.offset = -+ bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), -+ Operand(ctx->callee_info.stack_ptr.def.getTemp()), -+ Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max))); -+ else -+ info.offset = bld.copy( -+ bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max))); - info.const_offset = nir_src_as_uint(instr->src[0]) % max; - } else { -- info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa)); -+ if (ctx->callee_info.stack_ptr.is_reg) { -+ Temp store_offset = get_ssa_temp(ctx, instr->src[0].ssa); -+ if (store_offset.type() == RegType::sgpr) -+ info.offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), -+ Operand(ctx->callee_info.stack_ptr.def.getTemp()), -+ Operand(store_offset)); -+ else -+ info.offset = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), -+ Operand(ctx->callee_info.stack_ptr.def.getTemp()), -+ Operand(store_offset)); -+ } else -+ info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa)); - } - EmitLoadParameters params = scratch_flat_load_params; - params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1; -@@ -7775,6 +7792,15 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) - Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); - Temp offset = get_ssa_temp(ctx, instr->src[1].ssa); - -+ if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9) { -+ if (offset.type() == RegType::sgpr) -+ offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), -+ Operand(ctx->callee_info.stack_ptr.def.getTemp()), Operand(offset)); -+ else -+ offset = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), -+ Operand(ctx->callee_info.stack_ptr.def.getTemp()), Operand(offset)); -+ } -+ - unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; - unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); - --- -GitLab - - -From b8e49a1b7325c6b46fa2bd27732047b213ef5bda Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 9 Mar 2024 11:15:43 +0100 -Subject: [PATCH 39/71] nir,aco: Add set_next_call_pc_amd intrinsic - -Used for lowering function calls ---- - src/amd/compiler/aco_instruction_selection.cpp | 5 +++++ - src/compiler/nir/nir_intrinsics.py | 2 ++ - 2 files changed, 7 insertions(+) - -diff 
--git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index f985685b1d524..d83801d8e35cc 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -9640,6 +9640,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) - bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)), - Operand::c32(nir_intrinsic_base(instr))); - break; -+ case nir_intrinsic_set_next_call_pc_amd: { -+ ctx->next_divergent_pc = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); -+ ctx->next_pc = get_ssa_temp(ctx, instr->src[1].ssa); -+ break; -+ } - default: - isel_err(&instr->instr, "Unimplemented intrinsic instr"); - abort(); -diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py -index 2a6de0c4b6f25..1e3efcf06446d 100644 ---- a/src/compiler/nir/nir_intrinsics.py -+++ b/src/compiler/nir/nir_intrinsics.py -@@ -2374,3 +2374,5 @@ intrinsic("enqueue_node_payloads", src_comp=[-1]) - - # Returns true if it has been called for every payload. - intrinsic("finalize_incoming_node_payload", src_comp=[-1], dest_comp=1) -+ -+intrinsic("set_next_call_pc_amd", src_comp=[1, 1], bit_sizes=[64]) --- -GitLab - - -From c8aec7b77ef0fd5e1bb36cbf06929fd75523b8ca Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 26 Feb 2024 12:20:26 +0100 -Subject: [PATCH 40/71] nir,aco: add call_return_adress sysval - ---- - src/amd/compiler/aco_instruction_selection.cpp | 5 +++++ - src/compiler/nir/nir_divergence_analysis.c | 1 + - src/compiler/nir/nir_intrinsics.py | 1 + - 3 files changed, 7 insertions(+) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index d83801d8e35cc..d0d0dc1b036df 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -9640,6 +9640,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) - bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)), - Operand::c32(nir_intrinsic_base(instr))); - break; -+ case nir_intrinsic_load_call_return_address_amd: { -+ bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), -+ Operand(ctx->callee_info.return_address.def.getTemp())); -+ break; -+ } - case nir_intrinsic_set_next_call_pc_amd: { - ctx->next_divergent_pc = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); - ctx->next_pc = get_ssa_temp(ctx, instr->src[1].ssa); -diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c -index 78943c897922f..2fc4eda71aeb0 100644 ---- a/src/compiler/nir/nir_divergence_analysis.c -+++ b/src/compiler/nir/nir_divergence_analysis.c -@@ -344,6 +344,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) - case nir_intrinsic_load_samples_log2_agx: - case nir_intrinsic_load_active_subgroup_count_agx: - case nir_intrinsic_load_constant_base_ptr: -+ case nir_intrinsic_load_call_return_address_amd: - is_divergent = false; - break; - -diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py -index 1e3efcf06446d..808ee31420ba0 100644 ---- a/src/compiler/nir/nir_intrinsics.py -+++ b/src/compiler/nir/nir_intrinsics.py -@@ -2375,4 +2375,5 @@ intrinsic("enqueue_node_payloads", src_comp=[-1]) - # Returns true if it has been called for every payload. 
- intrinsic("finalize_incoming_node_payload", src_comp=[-1], dest_comp=1) - -+system_value("call_return_address_amd", 1, bit_sizes=[64]) - intrinsic("set_next_call_pc_amd", src_comp=[1, 1], bit_sizes=[64]) --- -GitLab - - -From a61f79118bc11db5dbbc1ef19c521c834936a637 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sun, 7 Jan 2024 22:15:13 +0100 -Subject: [PATCH 41/71] radv/nir: Lower NIR function call ABI - ---- - src/amd/vulkan/meson.build | 1 + - src/amd/vulkan/nir/radv_nir.h | 4 + - src/amd/vulkan/nir/radv_nir_lower_call_abi.c | 433 +++++++++++++++++++ - src/amd/vulkan/radv_pipeline.c | 4 + - src/compiler/nir/nir_divergence_analysis.c | 1 + - src/compiler/nir/nir_intrinsics.py | 3 + - 6 files changed, 446 insertions(+) - create mode 100644 src/amd/vulkan/nir/radv_nir_lower_call_abi.c - -diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build -index 5976bef8b85cf..84414ad41b7c0 100644 ---- a/src/amd/vulkan/meson.build -+++ b/src/amd/vulkan/meson.build -@@ -66,6 +66,7 @@ libradv_files = files( - 'nir/radv_nir_apply_pipeline_layout.c', - 'nir/radv_nir_export_multiview.c', - 'nir/radv_nir_lower_abi.c', -+ 'nir/radv_nir_lower_call_abi.c', - 'nir/radv_nir_lower_cooperative_matrix.c', - 'nir/radv_nir_lower_fs_barycentric.c', - 'nir/radv_nir_lower_fs_intrinsics.c', -diff --git a/src/amd/vulkan/nir/radv_nir.h b/src/amd/vulkan/nir/radv_nir.h -index cd779d64e857c..e004de467ed3e 100644 ---- a/src/amd/vulkan/nir/radv_nir.h -+++ b/src/amd/vulkan/nir/radv_nir.h -@@ -90,6 +90,10 @@ typedef struct radv_nir_opt_tid_function_options { - - bool radv_nir_opt_tid_function(nir_shader *shader, const radv_nir_opt_tid_function_options *options); - -+void radv_nir_lower_callee_signature(nir_function *function, struct set *visited_funcs); -+ -+bool radv_nir_lower_call_abi(nir_shader *shader, unsigned wave_size); -+ - #ifdef __cplusplus - } - #endif -diff --git a/src/amd/vulkan/nir/radv_nir_lower_call_abi.c b/src/amd/vulkan/nir/radv_nir_lower_call_abi.c -new file mode 100644 -index 0000000000000..5f18f9aea0f28 ---- /dev/null -+++ b/src/amd/vulkan/nir/radv_nir_lower_call_abi.c -@@ -0,0 +1,433 @@ -+/* -+ * Copyright © 2023 Valve Corporation -+ * -+ * Permission is hereby granted, free of charge, to any person obtaining a -+ * copy of this software and associated documentation files (the "Software"), -+ * to deal in the Software without restriction, including without limitation -+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, -+ * and/or sell copies of the Software, and to permit persons to whom the -+ * Software is furnished to do so, subject to the following conditions: -+ * -+ * The above copyright notice and this permission notice (including the next -+ * paragraph) shall be included in all copies or substantial portions of the -+ * Software. -+ * -+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -+ * IN THE SOFTWARE. 
-+ */ -+ -+#include "aco_nir_call_attribs.h" -+#include "nir_builder.h" -+#include "radv_nir.h" -+ -+void -+radv_nir_lower_callee_signature(nir_function *function, struct set *visited_funcs) -+{ -+ if (visited_funcs) { -+ if (_mesa_set_search(visited_funcs, function)) -+ return; -+ _mesa_set_add(visited_funcs, function); -+ } -+ -+ nir_parameter *old_params = function->params; -+ unsigned old_num_params = function->num_params; -+ -+ function->num_params += 2; -+ function->params = rzalloc_array_size(function->shader, function->num_params, sizeof(nir_parameter)); -+ -+ memcpy(function->params + 2, old_params, old_num_params * sizeof(nir_parameter)); -+ -+ function->params[0].num_components = 1; -+ function->params[0].bit_size = 64; -+ function->params[1].num_components = 1; -+ function->params[1].bit_size = 64; -+ function->params[1].is_uniform = true; -+ -+ nir_function_impl *impl = function->impl; -+ -+ if (!impl) -+ return; -+ -+ nir_foreach_block (block, impl) { -+ nir_foreach_instr_safe (instr, block) { -+ if (instr->type != nir_instr_type_intrinsic) -+ continue; -+ -+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); -+ -+ if (intr->intrinsic == nir_intrinsic_load_param) -+ nir_intrinsic_set_param_idx(intr, nir_intrinsic_param_idx(intr) + 2); -+ } -+ } -+} -+ -+/* Checks if caller can call callee using tail calls. -+ * -+ * If the ABIs mismatch, we might need to insert move instructions to move return values from callee return registers to -+ * caller return registers after the call. In that case, tail-calls are impossible to do correctly. -+ */ -+static bool -+is_tail_call_compatible(nir_function *caller, nir_function *callee) -+{ -+ /* If the caller doesn't return at all, we don't need to care if return params are compatible */ -+ if (caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_NORETURN) -+ return true; -+ /* The same ABI can't mismatch */ -+ if ((caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == -+ (callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK)) -+ return true; -+ /* The recursive shader ABI and the traversal shader ABI are built so that return parameters occupy exactly -+ * the same registers, to allow tail calls from the traversal shader. 
*/ -+ if ((caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == ACO_NIR_CALL_ABI_TRAVERSAL && -+ (callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == ACO_NIR_CALL_ABI_RT_RECURSIVE) -+ return true; -+ return false; -+} -+ -+static void -+gather_tail_call_instrs_block(nir_function *caller, const struct nir_block *block, struct set *tail_calls) -+{ -+ nir_foreach_instr_reverse (instr, block) { -+ switch (instr->type) { -+ case nir_instr_type_phi: -+ case nir_instr_type_undef: -+ case nir_instr_type_load_const: -+ continue; -+ case nir_instr_type_alu: -+ if (!nir_op_is_vec_or_mov(nir_instr_as_alu(instr)->op)) -+ return; -+ continue; -+ case nir_instr_type_call: { -+ nir_call_instr *call = nir_instr_as_call(instr); -+ -+ if (!is_tail_call_compatible(caller, call->callee)) -+ return; -+ -+ for (unsigned i = 0; i < call->num_params; ++i) { -+ if (call->callee->params[i].is_return != caller->params[i].is_return) -+ return; -+ /* We can only do tail calls if the caller returns exactly the callee return values */ -+ if (caller->params[i].is_return) { -+ assert(call->params[i].ssa->parent_instr->type == nir_instr_type_deref); -+ nir_deref_instr *deref_root = nir_instr_as_deref(call->params[i].ssa->parent_instr); -+ while (nir_deref_instr_parent(deref_root)) -+ deref_root = nir_deref_instr_parent(deref_root); -+ -+ if (!deref_root->parent.ssa) -+ return; -+ if (deref_root->parent.ssa->parent_instr->type != nir_instr_type_intrinsic) -+ return; -+ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(deref_root->parent.ssa->parent_instr); -+ if (intrin->intrinsic != nir_intrinsic_load_param) -+ return; -+ if (nir_intrinsic_param_idx(intrin) != i) -+ return; -+ } -+ if (call->callee->params[i].is_uniform != caller->params[i].is_uniform) -+ return; -+ if (call->callee->params[i].bit_size != caller->params[i].bit_size) -+ return; -+ if (call->callee->params[i].num_components != caller->params[i].num_components) -+ return; -+ } -+ -+ _mesa_set_add(tail_calls, instr); -+ continue; -+ } -+ default: -+ return; -+ } -+ } -+ -+ set_foreach (block->predecessors, pred) { -+ gather_tail_call_instrs_block(caller, pred->key, tail_calls); -+ } -+} -+ -+struct lower_param_info { -+ /* */ -+ nir_def *load_param_def; -+ -+ nir_def *return_deref; -+ bool has_store; -+}; -+ -+static void -+check_param_uses_for_stores(nir_deref_instr *instr, struct lower_param_info *info) -+{ -+ nir_foreach_use (deref_use, &instr->def) { -+ nir_instr *use_instr = nir_src_parent_instr(deref_use); -+ if (use_instr->type == nir_instr_type_deref) -+ check_param_uses_for_stores(nir_instr_as_deref(use_instr), info); -+ else if ((use_instr->type == nir_instr_type_intrinsic && -+ nir_instr_as_intrinsic(use_instr)->intrinsic == nir_intrinsic_store_deref) || -+ use_instr->type == nir_instr_type_call) -+ info->has_store = true; -+ } -+} -+ -+static void -+rewrite_return_param_uses(nir_intrinsic_instr *intr, unsigned param_idx, struct lower_param_info *param_defs) -+{ -+ nir_foreach_use_safe (use, &intr->def) { -+ nir_instr *use_instr = nir_src_parent_instr(use); -+ assert(use_instr && use_instr->type == nir_instr_type_deref && -+ nir_instr_as_deref(use_instr)->deref_type == nir_deref_type_cast); -+ check_param_uses_for_stores(nir_instr_as_deref(use_instr), ¶m_defs[param_idx]); -+ nir_def_rewrite_uses(&nir_instr_as_deref(use_instr)->def, param_defs[param_idx].return_deref); -+ -+ nir_instr_remove(use_instr); -+ } -+} -+ -+static void -+lower_call_abi_for_callee(nir_function *function, unsigned wave_size, struct set 
*visited_funcs) -+{ -+ nir_function_impl *impl = function->impl; -+ -+ nir_builder b = nir_builder_create(impl); -+ b.cursor = nir_before_impl(impl); -+ -+ nir_variable *tail_call_pc = -+ nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint64_t_type(), "_tail_call_pc"); -+ nir_store_var(&b, tail_call_pc, nir_imm_int64(&b, 0), 0x1); -+ -+ struct set *tail_call_instrs = _mesa_set_create(b.shader, _mesa_hash_pointer, _mesa_key_pointer_equal); -+ gather_tail_call_instrs_block(function, nir_impl_last_block(impl), tail_call_instrs); -+ -+ radv_nir_lower_callee_signature(function, visited_funcs); -+ -+ /* guard the shader, so that only the correct invocations execute it */ -+ -+ nir_def *guard_condition = NULL; -+ nir_def *shader_addr; -+ nir_def *uniform_shader_addr; -+ if (function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL) { -+ nir_cf_list list; -+ nir_cf_extract(&list, nir_before_impl(impl), nir_after_impl(impl)); -+ -+ b.cursor = nir_before_impl(impl); -+ -+ shader_addr = nir_load_param(&b, 0); -+ uniform_shader_addr = nir_load_param(&b, 1); -+ -+ guard_condition = nir_ieq(&b, uniform_shader_addr, shader_addr); -+ nir_if *shader_guard = nir_push_if(&b, guard_condition); -+ shader_guard->control = nir_selection_control_divergent_always_taken; -+ nir_cf_reinsert(&list, b.cursor); -+ nir_pop_if(&b, shader_guard); -+ } else { -+ shader_addr = nir_load_param(&b, 0); -+ } -+ -+ b.cursor = nir_before_impl(impl); -+ struct lower_param_info *param_infos = ralloc_size(b.shader, function->num_params * sizeof(struct lower_param_info)); -+ nir_variable **param_vars = ralloc_size(b.shader, function->num_params * sizeof(nir_variable *)); -+ -+ for (unsigned i = 2; i < function->num_params; ++i) { -+ param_vars[i] = nir_local_variable_create(impl, function->params[i].type, "_param"); -+ unsigned num_components = glsl_get_vector_elements(function->params[i].type); -+ -+ if (function->params[i].is_return) { -+ assert(!glsl_type_is_array(function->params[i].type) && !glsl_type_is_struct(function->params[i].type)); -+ -+ function->params[i].bit_size = glsl_get_bit_size(function->params[i].type); -+ function->params[i].num_components = num_components; -+ -+ param_infos[i].return_deref = &nir_build_deref_var(&b, param_vars[i])->def; -+ } else { -+ param_infos[i].return_deref = NULL; -+ } -+ -+ param_infos[i].has_store = false; -+ param_infos[i].load_param_def = nir_load_param(&b, i); -+ nir_store_var(&b, param_vars[i], param_infos[i].load_param_def, (0x1 << num_components) - 1); -+ } -+ -+ unsigned max_tail_call_param = 0; -+ -+ nir_foreach_block (block, impl) { -+ bool progress; -+ do { -+ progress = false; -+ nir_foreach_instr_safe (instr, block) { -+ if (instr->type == nir_instr_type_call && _mesa_set_search(tail_call_instrs, instr)) { -+ nir_call_instr *call = nir_instr_as_call(instr); -+ b.cursor = nir_before_instr(instr); -+ -+ for (unsigned i = 0; i < call->num_params; ++i) { -+ if (call->callee->params[i].is_return) -+ nir_store_var(&b, param_vars[i + 2], -+ nir_load_deref(&b, nir_instr_as_deref(call->params[i].ssa->parent_instr)), -+ (0x1 << glsl_get_vector_elements(call->callee->params[i].type)) - 1); -+ else -+ nir_store_var(&b, param_vars[i + 2], call->params[i].ssa, -+ (0x1 << call->params[i].ssa->num_components) - 1); -+ param_infos[i + 2].has_store = true; -+ } -+ -+ nir_store_var(&b, tail_call_pc, call->indirect_callee.ssa, 0x1); -+ max_tail_call_param = MAX2(max_tail_call_param, call->num_params + 2); -+ -+ nir_instr_remove(instr); -+ -+ progress = true; -+ break; -+ } 
-+ -+ if (instr->type != nir_instr_type_intrinsic) -+ continue; -+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); -+ if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_param) { -+ unsigned param_idx = nir_intrinsic_param_idx(intr); -+ -+ if (param_idx >= 2 && &intr->def != param_infos[param_idx].load_param_def) { -+ if (function->params[param_idx].is_return) -+ rewrite_return_param_uses(intr, param_idx, param_infos); -+ else -+ nir_def_rewrite_uses(&intr->def, param_infos[param_idx].load_param_def); -+ nir_instr_remove(instr); -+ progress = true; -+ break; -+ } -+ } -+ } -+ } while (progress); -+ } -+ -+ b.cursor = nir_after_impl(impl); -+ -+ for (unsigned i = 2; i < function->num_params; ++i) { -+ if (param_infos[i].has_store) -+ nir_store_param_amd(&b, nir_load_var(&b, param_vars[i]), .param_idx = i); -+ } -+ -+ if (guard_condition) -+ shader_addr = nir_bcsel(&b, guard_condition, nir_load_var(&b, tail_call_pc), shader_addr); -+ else -+ shader_addr = nir_load_var(&b, tail_call_pc); -+ nir_def *ballot = nir_ballot(&b, 1, wave_size, nir_ine_imm(&b, shader_addr, 0)); -+ nir_def *ballot_addr = nir_read_invocation(&b, shader_addr, nir_find_lsb(&b, ballot)); -+ uniform_shader_addr = nir_bcsel(&b, nir_ieq_imm(&b, ballot, 0), nir_load_call_return_address_amd(&b), ballot_addr); -+ -+ if (!(function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_NORETURN)) { -+ nir_push_if(&b, nir_ieq_imm(&b, uniform_shader_addr, 0)); -+ nir_terminate(&b); -+ nir_pop_if(&b, NULL); -+ -+ nir_set_next_call_pc_amd(&b, shader_addr, uniform_shader_addr); -+ } -+} -+ -+static void -+lower_call_abi_for_call(nir_builder *b, nir_call_instr *call, unsigned *cur_call_idx, struct set *visited_funcs, -+ struct set *visited_calls) -+{ -+ unsigned call_idx = (*cur_call_idx)++; -+ -+ for (unsigned i = 0; i < call->num_params; ++i) { -+ unsigned callee_param_idx = i; -+ if (_mesa_set_search(visited_funcs, call->callee)) -+ callee_param_idx += 2; -+ -+ if (!call->callee->params[callee_param_idx].is_return) -+ continue; -+ -+ b->cursor = nir_before_instr(&call->instr); -+ -+ nir_src *old_src = &call->params[i]; -+ -+ assert(old_src->ssa->parent_instr->type == nir_instr_type_deref); -+ nir_deref_instr *param_deref = nir_instr_as_deref(old_src->ssa->parent_instr); -+ assert(param_deref->deref_type == nir_deref_type_var); -+ -+ nir_src_rewrite(old_src, nir_load_deref(b, param_deref)); -+ -+ b->cursor = nir_after_instr(&call->instr); -+ -+ unsigned num_components = glsl_get_vector_elements(param_deref->type); -+ -+ nir_store_deref( -+ b, param_deref, -+ nir_load_return_param_amd(b, num_components, glsl_base_type_get_bit_size(param_deref->type->base_type), -+ .call_idx = call_idx, .param_idx = i + 2), -+ (1u << num_components) - 1); -+ -+ assert(call->callee->params[callee_param_idx].bit_size == glsl_get_bit_size(param_deref->type)); -+ assert(call->callee->params[callee_param_idx].num_components == num_components); -+ } -+ -+ radv_nir_lower_callee_signature(call->callee, visited_funcs); -+ -+ b->cursor = nir_after_instr(&call->instr); -+ -+ nir_call_instr *new_call = nir_call_instr_create(b->shader, call->callee); -+ new_call->indirect_callee = nir_src_for_ssa(call->indirect_callee.ssa); -+ new_call->params[0] = nir_src_for_ssa(call->indirect_callee.ssa); -+ new_call->params[1] = nir_src_for_ssa(nir_read_first_invocation(b, call->indirect_callee.ssa)); -+ for (unsigned i = 2; i < new_call->num_params; ++i) -+ new_call->params[i] = nir_src_for_ssa(call->params[i - 2].ssa); -+ -+ nir_builder_instr_insert(b, 
&new_call->instr); -+ b->cursor = nir_after_instr(&new_call->instr); -+ _mesa_set_add(visited_calls, new_call); -+ -+ nir_instr_remove(&call->instr); -+} -+ -+static bool -+lower_call_abi_for_caller(nir_function_impl *impl, struct set *visited_funcs) -+{ -+ bool progress = false; -+ unsigned cur_call_idx = 0; -+ struct set *visited_calls = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); -+ -+ nir_foreach_block (block, impl) { -+ nir_foreach_instr_safe (instr, block) { -+ if (instr->type != nir_instr_type_call) -+ continue; -+ nir_call_instr *call = nir_instr_as_call(instr); -+ if (call->callee->impl) -+ continue; -+ if (_mesa_set_search(visited_calls, call)) -+ continue; -+ -+ nir_builder b = nir_builder_create(impl); -+ lower_call_abi_for_call(&b, call, &cur_call_idx, visited_funcs, visited_calls); -+ progress = true; -+ } -+ } -+ -+ _mesa_set_destroy(visited_calls, NULL); -+ -+ return progress; -+} -+ -+bool -+radv_nir_lower_call_abi(nir_shader *shader, unsigned wave_size) -+{ -+ struct set *visited_funcs = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); -+ -+ bool progress = false; -+ nir_foreach_function_with_impl (function, impl, shader) { -+ bool func_progress = false; -+ if (function->is_exported) { -+ lower_call_abi_for_callee(function, wave_size, visited_funcs); -+ func_progress = true; -+ } -+ func_progress |= lower_call_abi_for_caller(impl, visited_funcs); -+ -+ if (func_progress) -+ nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); -+ progress |= func_progress; -+ } -+ -+ _mesa_set_destroy(visited_funcs, NULL); -+ -+ return progress; -+} -diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c -index daaf4e9ba4f00..fc7195b5067ca 100644 ---- a/src/amd/vulkan/radv_pipeline.c -+++ b/src/amd/vulkan/radv_pipeline.c -@@ -575,6 +575,10 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat - stage->nir, io_to_mem || lowered_ngg || stage->stage == MESA_SHADER_COMPUTE || stage->stage == MESA_SHADER_TASK, - gfx_level >= GFX8); - -+ NIR_PASS(_, stage->nir, radv_nir_lower_call_abi, stage->info.wave_size); -+ NIR_PASS(_, stage->nir, nir_lower_global_vars_to_local); -+ NIR_PASS(_, stage->nir, nir_lower_vars_to_ssa); -+ - NIR_PASS(_, stage->nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64); - - if (stage->nir->info.bit_sizes_int & (8 | 16)) { -diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c -index 2fc4eda71aeb0..1f780f0621cac 100644 ---- a/src/compiler/nir/nir_divergence_analysis.c -+++ b/src/compiler/nir/nir_divergence_analysis.c -@@ -892,6 +892,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) - case nir_intrinsic_load_sample_mask: - case nir_intrinsic_quad_ballot_agx: - case nir_intrinsic_load_agx: -+ case nir_intrinsic_load_return_param_amd: - is_divergent = true; - break; - -diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py -index 808ee31420ba0..32ab9b8a6acb8 100644 ---- a/src/compiler/nir/nir_intrinsics.py -+++ b/src/compiler/nir/nir_intrinsics.py -@@ -2375,5 +2375,8 @@ intrinsic("enqueue_node_payloads", src_comp=[-1]) - # Returns true if it has been called for every payload. 
- intrinsic("finalize_incoming_node_payload", src_comp=[-1], dest_comp=1) - -+intrinsic("store_param_amd", src_comp=[-1], indices=[PARAM_IDX]) -+intrinsic("load_return_param_amd", dest_comp=0, indices=[CALL_IDX, PARAM_IDX]) -+ - system_value("call_return_address_amd", 1, bit_sizes=[64]) - intrinsic("set_next_call_pc_amd", src_comp=[1, 1], bit_sizes=[64]) --- -GitLab - - -From fbe63f63878376a556e9eab7999edab5f332f257 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sun, 7 Jan 2024 22:42:03 +0100 -Subject: [PATCH 42/71] aco: Compile all functions in RT shaders - ---- - .../compiler/aco_instruction_selection.cpp | 43 +- - .../aco_instruction_selection_setup.cpp | 639 +++++++++--------- - 2 files changed, 345 insertions(+), 337 deletions(-) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index d0d0dc1b036df..95baf3a302d0c 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -11891,30 +11891,35 @@ void - select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders, - const struct ac_shader_args* args) - { -+ bool first_block = true; - for (unsigned i = 0; i < shader_count; i++) { -- if (i) { -- ctx.block = ctx.program->create_and_insert_block(); -- ctx.block->kind = block_kind_top_level | block_kind_resume; -- } -+ nir_foreach_function_impl (impl, shaders[i]) { -+ if (!first_block) { -+ ctx.block = ctx.program->create_and_insert_block(); -+ ctx.block->kind = block_kind_top_level | block_kind_resume; -+ } -+ nir_shader* nir = shaders[i]; - -- nir_shader* nir = shaders[i]; -- init_context(&ctx, nir); -- setup_fp_mode(&ctx, nir); -+ init_context(&ctx, nir); -+ setup_fp_mode(&ctx, nir); - -- Instruction* startpgm = add_startpgm(&ctx); -- append_logical_start(ctx.block); -- split_arguments(&ctx, startpgm); -- visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body); -- append_logical_end(ctx.block); -- ctx.block->kind |= block_kind_uniform; -+ Instruction* startpgm = add_startpgm(&ctx); -+ append_logical_start(ctx.block); -+ split_arguments(&ctx, startpgm); -+ visit_cf_list(&ctx, &impl->body); -+ append_logical_end(ctx.block); -+ ctx.block->kind |= block_kind_uniform; - -- /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen -- * shader without shader calls. -- */ -- if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN) -- insert_rt_jump_next(ctx, args); -+ /* Fix output registers and jump to next shader. We can skip this when dealing with a -+ * raygen shader without shader calls. 
-+ */ -+ if ((shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN) && -+ impl == nir_shader_get_entrypoint(nir)) -+ insert_rt_jump_next(ctx, args); - -- cleanup_context(&ctx); -+ cleanup_context(&ctx); -+ first_block = false; -+ } - } - - ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val; -diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp -index f1cd92aad5fd2..600c63c8b9ce3 100644 ---- a/src/amd/compiler/aco_instruction_selection_setup.cpp -+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp -@@ -257,8 +257,8 @@ setup_nir(isel_context* ctx, nir_shader* nir) - nir_opt_dce(nir); - } - -- nir_function_impl* func = nir_shader_get_entrypoint(nir); -- nir_index_ssa_defs(func); -+ nir_foreach_function_impl (impl, nir) -+ nir_index_ssa_defs(impl); - } - - /* Returns true if we can skip uniformization of a merge phi. This makes the destination divergent, -@@ -349,7 +349,6 @@ skip_uniformize_merge_phi(nir_def* ssa, unsigned depth) - void - init_context(isel_context* ctx, nir_shader* shader) - { -- nir_function_impl* impl = nir_shader_get_entrypoint(shader); - ctx->shader = shader; - - /* Init NIR range analysis. */ -@@ -366,356 +365,359 @@ init_context(isel_context* ctx, nir_shader* shader) - - ac_nir_opt_shared_append(shader); - -- uint32_t options = -- shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs; -- nir_divergence_analysis_impl(impl, (nir_divergence_options)options); -- shader->info.divergence_analysis_run = true; -- if (nir_opt_uniform_atomics(shader, false)) { -- nir_lower_int64(shader); -+ nir_foreach_function_impl (impl, shader) { -+ uint32_t options = -+ shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs; - nir_divergence_analysis_impl(impl, (nir_divergence_options)options); -- } -+ shader->info.divergence_analysis_run = true; -+ if (nir_opt_uniform_atomics(shader, false)) { -+ nir_lower_int64(shader); -+ nir_divergence_analysis_impl(impl, (nir_divergence_options)options); -+ } - -- apply_nuw_to_offsets(ctx, impl); -+ apply_nuw_to_offsets(ctx, impl); - -- /* sanitize control flow */ -- sanitize_cf_list(impl, &impl->body); -- nir_metadata_preserve(impl, nir_metadata_none); -+ /* sanitize control flow */ -+ sanitize_cf_list(impl, &impl->body); -+ nir_metadata_preserve(impl, nir_metadata_none); - -- /* we'll need these for isel */ -- nir_metadata_require(impl, nir_metadata_block_index | nir_metadata_dominance); -+ /* we'll need these for isel */ -+ nir_metadata_require(impl, nir_metadata_block_index | nir_metadata_dominance); - -- if (ctx->options->dump_preoptir) { -- fprintf(stderr, "NIR shader before instruction selection:\n"); -- nir_print_shader(shader, stderr); -- } -+ if (ctx->options->dump_preoptir) { -+ fprintf(stderr, "NIR shader before instruction selection:\n"); -+ nir_print_shader(shader, stderr); -+ } - -- ctx->first_temp_id = ctx->program->peekAllocationId(); -- ctx->program->allocateRange(impl->ssa_alloc); -- RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; -- -- unsigned call_count = 0; -- -- /* TODO: make this recursive to improve compile times */ -- bool done = false; -- while (!done) { -- done = true; -- nir_foreach_block (block, impl) { -- nir_foreach_instr (instr, block) { -- switch (instr->type) { -- case nir_instr_type_alu: { -- nir_alu_instr* alu_instr = nir_instr_as_alu(instr); -- RegType type = alu_instr->def.divergent ? 
RegType::vgpr : RegType::sgpr; -- -- /* packed 16bit instructions have to be VGPR */ -- if (alu_instr->def.num_components == 2 && -- nir_op_infos[alu_instr->op].output_size == 0) -+ ctx->first_temp_id = ctx->program->peekAllocationId(); -+ ctx->program->allocateRange(impl->ssa_alloc); -+ RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; -+ -+ unsigned call_count = 0; -+ -+ /* TODO: make this recursive to improve compile times */ -+ bool done = false; -+ while (!done) { -+ done = true; -+ nir_foreach_block (block, impl) { -+ nir_foreach_instr (instr, block) { -+ switch (instr->type) { -+ case nir_instr_type_alu: { -+ nir_alu_instr* alu_instr = nir_instr_as_alu(instr); -+ RegType type = alu_instr->def.divergent ? RegType::vgpr : RegType::sgpr; -+ -+ /* packed 16bit instructions have to be VGPR */ -+ if (alu_instr->def.num_components == 2 && -+ nir_op_infos[alu_instr->op].output_size == 0) - type = RegType::vgpr; - -- switch (alu_instr->op) { -- case nir_op_f2i16: -- case nir_op_f2u16: -- case nir_op_f2i32: -- case nir_op_f2u32: -- case nir_op_b2i8: -- case nir_op_b2i16: -- case nir_op_b2i32: -- case nir_op_b2b32: -- case nir_op_b2f16: -- case nir_op_b2f32: -- case nir_op_mov: break; -- case nir_op_fmulz: -- case nir_op_ffmaz: -- case nir_op_f2f64: -- case nir_op_u2f64: -- case nir_op_i2f64: -- case nir_op_pack_unorm_2x16: -- case nir_op_pack_snorm_2x16: -- case nir_op_pack_uint_2x16: -- case nir_op_pack_sint_2x16: -- case nir_op_ldexp: -- case nir_op_frexp_sig: -- case nir_op_frexp_exp: -- case nir_op_cube_amd: -- case nir_op_msad_4x8: -- case nir_op_mqsad_4x8: -- case nir_op_udot_4x8_uadd: -- case nir_op_sdot_4x8_iadd: -- case nir_op_sudot_4x8_iadd: -- case nir_op_udot_4x8_uadd_sat: -- case nir_op_sdot_4x8_iadd_sat: -- case nir_op_sudot_4x8_iadd_sat: -- case nir_op_udot_2x16_uadd: -- case nir_op_sdot_2x16_iadd: -- case nir_op_udot_2x16_uadd_sat: -- case nir_op_sdot_2x16_iadd_sat: type = RegType::vgpr; break; -- case nir_op_fmul: -- case nir_op_ffma: -- case nir_op_fadd: -- case nir_op_fsub: -- case nir_op_fmax: -- case nir_op_fmin: -- case nir_op_fsat: -- case nir_op_fneg: -- case nir_op_fabs: -- case nir_op_fsign: -- case nir_op_i2f16: -- case nir_op_i2f32: -- case nir_op_u2f16: -- case nir_op_u2f32: -- case nir_op_f2f16: -- case nir_op_f2f16_rtz: -- case nir_op_f2f16_rtne: -- case nir_op_f2f32: -- case nir_op_fquantize2f16: -- case nir_op_ffract: -- case nir_op_ffloor: -- case nir_op_fceil: -- case nir_op_ftrunc: -- case nir_op_fround_even: -- case nir_op_frcp: -- case nir_op_frsq: -- case nir_op_fsqrt: -- case nir_op_fexp2: -- case nir_op_flog2: -- case nir_op_fsin_amd: -- case nir_op_fcos_amd: -- case nir_op_pack_half_2x16_rtz_split: -- case nir_op_pack_half_2x16_split: -- case nir_op_unpack_half_2x16_split_x: -- case nir_op_unpack_half_2x16_split_y: { -- if (ctx->program->gfx_level < GFX11_5 || -+ switch (alu_instr->op) { -+ case nir_op_f2i16: -+ case nir_op_f2u16: -+ case nir_op_f2i32: -+ case nir_op_f2u32: -+ case nir_op_b2i8: -+ case nir_op_b2i16: -+ case nir_op_b2i32: -+ case nir_op_b2b32: -+ case nir_op_b2f16: -+ case nir_op_b2f32: -+ case nir_op_mov: break; -+ case nir_op_fmulz: -+ case nir_op_ffmaz: -+ case nir_op_f2f64: -+ case nir_op_u2f64: -+ case nir_op_i2f64: -+ case nir_op_pack_unorm_2x16: -+ case nir_op_pack_snorm_2x16: -+ case nir_op_pack_uint_2x16: -+ case nir_op_pack_sint_2x16: -+ case nir_op_ldexp: -+ case nir_op_frexp_sig: -+ case nir_op_frexp_exp: -+ case nir_op_cube_amd: -+ case nir_op_msad_4x8: -+ case nir_op_mqsad_4x8: -+ case 
nir_op_udot_4x8_uadd: -+ case nir_op_sdot_4x8_iadd: -+ case nir_op_sudot_4x8_iadd: -+ case nir_op_udot_4x8_uadd_sat: -+ case nir_op_sdot_4x8_iadd_sat: -+ case nir_op_sudot_4x8_iadd_sat: -+ case nir_op_udot_2x16_uadd: -+ case nir_op_sdot_2x16_iadd: -+ case nir_op_udot_2x16_uadd_sat: -+ case nir_op_sdot_2x16_iadd_sat: type = RegType::vgpr; break; -+ case nir_op_fmul: -+ case nir_op_ffma: -+ case nir_op_fadd: -+ case nir_op_fsub: -+ case nir_op_fmax: -+ case nir_op_fmin: -+ case nir_op_fsat: -+ case nir_op_fneg: -+ case nir_op_fabs: -+ case nir_op_fsign: -+ case nir_op_i2f16: -+ case nir_op_i2f32: -+ case nir_op_u2f16: -+ case nir_op_u2f32: -+ case nir_op_f2f16: -+ case nir_op_f2f16_rtz: -+ case nir_op_f2f16_rtne: -+ case nir_op_f2f32: -+ case nir_op_fquantize2f16: -+ case nir_op_ffract: -+ case nir_op_ffloor: -+ case nir_op_fceil: -+ case nir_op_ftrunc: -+ case nir_op_fround_even: -+ case nir_op_frcp: -+ case nir_op_frsq: -+ case nir_op_fsqrt: -+ case nir_op_fexp2: -+ case nir_op_flog2: -+ case nir_op_fsin_amd: -+ case nir_op_fcos_amd: -+ case nir_op_pack_half_2x16_rtz_split: -+ case nir_op_pack_half_2x16_split: -+ case nir_op_unpack_half_2x16_split_x: -+ case nir_op_unpack_half_2x16_split_y: { -+ if (ctx->program->gfx_level < GFX11_5 || - alu_instr->src[0].src.ssa->bit_size > 32) { - type = RegType::vgpr; - break; - } -- FALLTHROUGH; -- } -- default: -- for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { -- if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) -- type = RegType::vgpr; -+ FALLTHROUGH;} -+ default: -+ for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { -+ if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) -+ type = RegType::vgpr; -+ } -+ break; - } -- break; -- } - -- RegClass rc = -- get_reg_class(ctx, type, alu_instr->def.num_components, alu_instr->def.bit_size); -- regclasses[alu_instr->def.index] = rc; -- break; -- } -- case nir_instr_type_load_const: { -- unsigned num_components = nir_instr_as_load_const(instr)->def.num_components; -- unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size; -- RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size); -- regclasses[nir_instr_as_load_const(instr)->def.index] = rc; -- break; -- } -- case nir_instr_type_intrinsic: { -- nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr); -- if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest) -+ RegClass rc = get_reg_class(ctx, type, alu_instr->def.num_components, -+ alu_instr->def.bit_size); -+ regclasses[alu_instr->def.index] = rc; - break; -- if (intrinsic->intrinsic == nir_intrinsic_strict_wqm_coord_amd) { -- regclasses[intrinsic->def.index] = -- RegClass::get(RegType::vgpr, intrinsic->def.num_components * 4 + -- nir_intrinsic_base(intrinsic)) -- .as_linear(); -+ } -+ case nir_instr_type_load_const: { -+ unsigned num_components = nir_instr_as_load_const(instr)->def.num_components; -+ unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size; -+ RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size); -+ regclasses[nir_instr_as_load_const(instr)->def.index] = rc; - break; - } -- RegType type = RegType::sgpr; -- switch (intrinsic->intrinsic) { -- case nir_intrinsic_load_push_constant: -- case nir_intrinsic_load_workgroup_id: -- case nir_intrinsic_load_num_workgroups: -- case nir_intrinsic_load_sbt_base_amd: -- case nir_intrinsic_load_subgroup_id: -- case nir_intrinsic_load_num_subgroups: -- case nir_intrinsic_load_first_vertex: -- case 
nir_intrinsic_load_base_instance: -- case nir_intrinsic_vote_all: -- case nir_intrinsic_vote_any: -- case nir_intrinsic_read_first_invocation: -- case nir_intrinsic_as_uniform: -- case nir_intrinsic_read_invocation: -- case nir_intrinsic_first_invocation: -- case nir_intrinsic_ballot: -- case nir_intrinsic_ballot_relaxed: -- case nir_intrinsic_bindless_image_samples: -- case nir_intrinsic_load_scalar_arg_amd: -- case nir_intrinsic_load_lds_ngg_scratch_base_amd: -- case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: -- case nir_intrinsic_load_smem_amd: -- case nir_intrinsic_unit_test_uniform_amd: type = RegType::sgpr; break; -- case nir_intrinsic_load_sample_id: -- case nir_intrinsic_load_input: -- case nir_intrinsic_load_per_primitive_input: -- case nir_intrinsic_load_output: -- case nir_intrinsic_load_input_vertex: -- case nir_intrinsic_load_per_vertex_input: -- case nir_intrinsic_load_per_vertex_output: -- case nir_intrinsic_load_vertex_id_zero_base: -- case nir_intrinsic_load_barycentric_sample: -- case nir_intrinsic_load_barycentric_pixel: -- case nir_intrinsic_load_barycentric_model: -- case nir_intrinsic_load_barycentric_centroid: -- case nir_intrinsic_load_barycentric_at_offset: -- case nir_intrinsic_load_interpolated_input: -- case nir_intrinsic_load_frag_coord: -- case nir_intrinsic_load_frag_shading_rate: -- case nir_intrinsic_load_sample_pos: -- case nir_intrinsic_load_local_invocation_id: -- case nir_intrinsic_load_local_invocation_index: -- case nir_intrinsic_load_subgroup_invocation: -- case nir_intrinsic_load_tess_coord: -- case nir_intrinsic_write_invocation_amd: -- case nir_intrinsic_mbcnt_amd: -- case nir_intrinsic_lane_permute_16_amd: -- case nir_intrinsic_dpp16_shift_amd: -- case nir_intrinsic_load_instance_id: -- case nir_intrinsic_ssbo_atomic: -- case nir_intrinsic_ssbo_atomic_swap: -- case nir_intrinsic_global_atomic_amd: -- case nir_intrinsic_global_atomic_swap_amd: -- case nir_intrinsic_bindless_image_atomic: -- case nir_intrinsic_bindless_image_atomic_swap: -- case nir_intrinsic_bindless_image_size: -- case nir_intrinsic_shared_atomic: -- case nir_intrinsic_shared_atomic_swap: -- case nir_intrinsic_load_scratch: -- case nir_intrinsic_load_invocation_id: -- case nir_intrinsic_load_primitive_id: -- case nir_intrinsic_load_typed_buffer_amd: -- case nir_intrinsic_load_buffer_amd: -- case nir_intrinsic_load_initial_edgeflags_amd: -- case nir_intrinsic_gds_atomic_add_amd: -- case nir_intrinsic_bvh64_intersect_ray_amd: -- case nir_intrinsic_load_vector_arg_amd: -- case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: -- case nir_intrinsic_cmat_muladd_amd: -- case nir_intrinsic_unit_test_divergent_amd: type = RegType::vgpr; break; -- case nir_intrinsic_load_shared: -- case nir_intrinsic_load_shared2_amd: -- /* When the result of these loads is only used by cross-lane instructions, -- * it is beneficial to use a VGPR destination. This is because this allows -- * to put the s_waitcnt further down, which decreases latency. 
-- */ -- if (only_used_by_cross_lane_instrs(&intrinsic->def)) { -- type = RegType::vgpr; -+ case nir_instr_type_intrinsic: { -+ nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr); -+ if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest) -+ break; -+ if (intrinsic->intrinsic == nir_intrinsic_strict_wqm_coord_amd) { -+ regclasses[intrinsic->def.index] = -+ RegClass::get(RegType::vgpr, intrinsic->def.num_components * 4 + -+ nir_intrinsic_base(intrinsic)) -+ .as_linear(); - break; - } -- FALLTHROUGH; -- case nir_intrinsic_shuffle: -- case nir_intrinsic_quad_broadcast: -- case nir_intrinsic_quad_swap_horizontal: -- case nir_intrinsic_quad_swap_vertical: -- case nir_intrinsic_quad_swap_diagonal: -- case nir_intrinsic_quad_swizzle_amd: -- case nir_intrinsic_masked_swizzle_amd: -- case nir_intrinsic_rotate: -- case nir_intrinsic_inclusive_scan: -- case nir_intrinsic_exclusive_scan: -- case nir_intrinsic_reduce: -- case nir_intrinsic_load_ubo: -- case nir_intrinsic_load_ssbo: -- case nir_intrinsic_load_global_amd: -- type = intrinsic->def.divergent ? RegType::vgpr : RegType::sgpr; -- break; -- case nir_intrinsic_ddx: -- case nir_intrinsic_ddy: -- case nir_intrinsic_ddx_fine: -- case nir_intrinsic_ddy_fine: -- case nir_intrinsic_ddx_coarse: -- case nir_intrinsic_ddy_coarse: -+ RegType type = RegType::sgpr; -+ switch (intrinsic->intrinsic) { -+ case nir_intrinsic_load_push_constant: -+ case nir_intrinsic_load_workgroup_id: -+ case nir_intrinsic_load_num_workgroups: -+ case nir_intrinsic_load_ray_launch_size: -+ case nir_intrinsic_load_sbt_base_amd: -+ case nir_intrinsic_load_subgroup_id: -+ case nir_intrinsic_load_num_subgroups: -+ case nir_intrinsic_load_first_vertex: -+ case nir_intrinsic_load_base_instance: -+ case nir_intrinsic_vote_all: -+ case nir_intrinsic_vote_any: -+ case nir_intrinsic_read_first_invocation: -+ case nir_intrinsic_as_uniform: -+ case nir_intrinsic_read_invocation: -+ case nir_intrinsic_first_invocation: -+ case nir_intrinsic_ballot: -+ case nir_intrinsic_ballot_relaxed: -+ case nir_intrinsic_bindless_image_samples: -+ case nir_intrinsic_load_scalar_arg_amd: -+ case nir_intrinsic_load_lds_ngg_scratch_base_amd: -+ case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: -+ case nir_intrinsic_load_smem_amd: -+ case nir_intrinsic_unit_test_uniform_amd: type = RegType::sgpr; break; -+ case nir_intrinsic_load_sample_id: -+ case nir_intrinsic_load_input: -+ case nir_intrinsic_load_per_primitive_input: -+ case nir_intrinsic_load_output: -+ case nir_intrinsic_load_input_vertex: -+ case nir_intrinsic_load_per_vertex_input: -+ case nir_intrinsic_load_per_vertex_output: -+ case nir_intrinsic_load_vertex_id_zero_base: -+ case nir_intrinsic_load_barycentric_sample: -+ case nir_intrinsic_load_barycentric_pixel: -+ case nir_intrinsic_load_barycentric_model: -+ case nir_intrinsic_load_barycentric_centroid: -+ case nir_intrinsic_load_barycentric_at_offset: -+ case nir_intrinsic_load_interpolated_input: -+ case nir_intrinsic_load_frag_coord: -+ case nir_intrinsic_load_frag_shading_rate: -+ case nir_intrinsic_load_sample_pos: -+ case nir_intrinsic_load_local_invocation_id: -+ case nir_intrinsic_load_local_invocation_index: -+ case nir_intrinsic_load_subgroup_invocation: -+ case nir_intrinsic_load_ray_launch_id: -+ case nir_intrinsic_load_tess_coord: -+ case nir_intrinsic_write_invocation_amd: -+ case nir_intrinsic_mbcnt_amd: -+ case nir_intrinsic_lane_permute_16_amd: -+ case nir_intrinsic_dpp16_shift_amd: -+ case nir_intrinsic_load_instance_id: -+ case nir_intrinsic_ssbo_atomic: -+ 
case nir_intrinsic_ssbo_atomic_swap: -+ case nir_intrinsic_global_atomic_amd: -+ case nir_intrinsic_global_atomic_swap_amd: -+ case nir_intrinsic_bindless_image_atomic: -+ case nir_intrinsic_bindless_image_atomic_swap: -+ case nir_intrinsic_bindless_image_size: -+ case nir_intrinsic_shared_atomic: -+ case nir_intrinsic_shared_atomic_swap: -+ case nir_intrinsic_load_scratch: -+ case nir_intrinsic_load_invocation_id: -+ case nir_intrinsic_load_primitive_id: -+ case nir_intrinsic_load_typed_buffer_amd: -+ case nir_intrinsic_load_buffer_amd: -+ case nir_intrinsic_load_initial_edgeflags_amd: -+ case nir_intrinsic_gds_atomic_add_amd: -+ case nir_intrinsic_bvh64_intersect_ray_amd: -+ case nir_intrinsic_load_vector_arg_amd: -+ case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: -+ case nir_intrinsic_cmat_muladd_amd: -+ case nir_intrinsic_unit_test_divergent_amd: type = RegType::vgpr; break; -+ case nir_intrinsic_load_shared: -+ case nir_intrinsic_load_shared2_amd: -+ /* When the result of these loads is only used by cross-lane instructions, -+ * it is beneficial to use a VGPR destination. This is because this allows -+ * to put the s_waitcnt further down, which decreases latency. -+ */ -+ if (only_used_by_cross_lane_instrs(&intrinsic->def)) { -+ type = RegType::vgpr; -+ break; -+ } -+ FALLTHROUGH; -+ case nir_intrinsic_shuffle: -+ case nir_intrinsic_quad_broadcast: -+ case nir_intrinsic_quad_swap_horizontal: -+ case nir_intrinsic_quad_swap_vertical: -+ case nir_intrinsic_quad_swap_diagonal: -+ case nir_intrinsic_quad_swizzle_amd: -+ case nir_intrinsic_masked_swizzle_amd: -+ case nir_intrinsic_rotate: -+ case nir_intrinsic_inclusive_scan: -+ case nir_intrinsic_exclusive_scan: -+ case nir_intrinsic_reduce: -+ case nir_intrinsic_load_ubo: -+ case nir_intrinsic_load_ssbo: -+ case nir_intrinsic_load_global_amd: -+ type = intrinsic->def.divergent ? RegType::vgpr : RegType::sgpr; -+ break; -+ case nir_intrinsic_ddx: -+ case nir_intrinsic_ddy: -+ case nir_intrinsic_ddx_fine: -+ case nir_intrinsic_ddy_fine: -+ case nir_intrinsic_ddx_coarse: -+ case nir_intrinsic_ddy_coarse: - type = RegType::vgpr; - break; -- case nir_intrinsic_load_view_index: -- type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr; -- break; -- default: -- for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; -- i++) { -- if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr) -- type = RegType::vgpr; -+ case nir_intrinsic_load_view_index: -+ type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr; -+ break; -+ default: -+ for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; -+ i++) { -+ if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr) -+ type = RegType::vgpr; -+ } -+ break; - } -+ RegClass rc = get_reg_class(ctx, type, intrinsic->def.num_components, -+ intrinsic->def.bit_size); -+ regclasses[intrinsic->def.index] = rc; - break; - } -- RegClass rc = -- get_reg_class(ctx, type, intrinsic->def.num_components, intrinsic->def.bit_size); -- regclasses[intrinsic->def.index] = rc; -- break; -- } -- case nir_instr_type_tex: { -- nir_tex_instr* tex = nir_instr_as_tex(instr); -- RegType type = tex->def.divergent ? RegType::vgpr : RegType::sgpr; -+ case nir_instr_type_tex: { -+ nir_tex_instr* tex = nir_instr_as_tex(instr); -+ RegType type = tex->def.divergent ? 
RegType::vgpr : RegType::sgpr; - -- if (tex->op == nir_texop_texture_samples) { -- assert(!tex->def.divergent); -- } -- -- RegClass rc = get_reg_class(ctx, type, tex->def.num_components, tex->def.bit_size); -- regclasses[tex->def.index] = rc; -- break; -- } -- case nir_instr_type_undef: { -- unsigned num_components = nir_instr_as_undef(instr)->def.num_components; -- unsigned bit_size = nir_instr_as_undef(instr)->def.bit_size; -- RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size); -- regclasses[nir_instr_as_undef(instr)->def.index] = rc; -- break; -- } -- case nir_instr_type_phi: { -- nir_phi_instr* phi = nir_instr_as_phi(instr); -- RegType type = RegType::sgpr; -- unsigned num_components = phi->def.num_components; -- assert((phi->def.bit_size != 1 || num_components == 1) && -- "Multiple components not supported on boolean phis."); -- -- if (phi->def.divergent) { -- type = RegType::vgpr; -- } else { -- bool vgpr_src = false; -- nir_foreach_phi_src (src, phi) -- vgpr_src |= regclasses[src->src.ssa->index].type() == RegType::vgpr; -+ if (tex->op == nir_texop_texture_samples) { -+ assert(!tex->def.divergent); -+ } - -- if (vgpr_src) { -+ RegClass rc = -+ get_reg_class(ctx, type, tex->def.num_components, tex->def.bit_size); -+ regclasses[tex->def.index] = rc; -+ break; -+ } -+ case nir_instr_type_undef: { -+ unsigned num_components = nir_instr_as_undef(instr)->def.num_components; -+ unsigned bit_size = nir_instr_as_undef(instr)->def.bit_size; -+ RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size); -+ regclasses[nir_instr_as_undef(instr)->def.index] = rc; -+ break; -+ } -+ case nir_instr_type_phi: { -+ nir_phi_instr* phi = nir_instr_as_phi(instr); -+ RegType type = RegType::sgpr; -+ unsigned num_components = phi->def.num_components; -+ assert((phi->def.bit_size != 1 || num_components == 1) && -+ "Multiple components not supported on boolean phis."); -+ -+ if (phi->def.divergent) { - type = RegType::vgpr; -+ } else { -+ bool vgpr_src = false; -+ nir_foreach_phi_src (src, phi) -+ vgpr_src |= regclasses[src->src.ssa->index].type() == RegType::vgpr; - -- /* This might be the case because of nir_divergence_ignore_undef_if_phi_srcs. */ -- bool divergent_merge = false; -- if (nir_cf_node_prev(&block->cf_node) && -- nir_cf_node_prev(&block->cf_node)->type == nir_cf_node_if) { -- nir_if* nif = nir_cf_node_as_if(nir_cf_node_prev(&block->cf_node)); -- divergent_merge = nir_src_is_divergent(&nif->condition); -- } -+ if (vgpr_src) { -+ type = RegType::vgpr; - -- /* In case of uniform phis after divergent merges, ensure that the dst is an -- * SGPR and does not contain undefined values for some invocations. -- */ -- if (divergent_merge && !skip_uniformize_merge_phi(&phi->def, 0)) -- type = RegType::sgpr; -+ /* This might be the case because of nir_divergence_ignore_undef_if_phi_srcs. */ -+ bool divergent_merge = false; -+ if (nir_cf_node_prev(&block->cf_node) && -+ nir_cf_node_prev(&block->cf_node)->type == nir_cf_node_if) { -+ nir_if* nif = nir_cf_node_as_if(nir_cf_node_prev(&block->cf_node)); -+ divergent_merge = nir_src_is_divergent(&nif->condition); -+ } -+ -+ /* In case of uniform phis after divergent merges, ensure that the dst is an -+ * SGPR and does not contain undefined values for some invocations. 
-+ */ -+ if (divergent_merge && !skip_uniformize_merge_phi(&phi->def, 0)) -+ type = RegType::sgpr; -+ } - } -- } - -- RegClass rc = get_reg_class(ctx, type, num_components, phi->def.bit_size); -- if (rc != regclasses[phi->def.index]) -- done = false; -- regclasses[phi->def.index] = rc; -- break; -- } -- case nir_instr_type_call: { -- ++call_count; -- break; -- } -- default: break; -+ RegClass rc = get_reg_class(ctx, type, num_components, phi->def.bit_size); -+ if (rc != regclasses[phi->def.index]) -+ done = false; -+ regclasses[phi->def.index] = rc; -+ break; -+ } -+ case nir_instr_type_call: { -+ ++call_count; -+ break; -+ } -+ default: break; -+ } - } - } - } -- } -- -- ctx->call_infos.reserve(call_count); - -- ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena; -- ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr; -+ ctx->call_infos.reserve(call_count); - -+ ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena; -+ ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr; -+ } - /* align and copy constant data */ - while (ctx->program->constant_data.size() % 4u) - ctx->program->constant_data.push_back(0); -@@ -794,7 +796,8 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c - - unsigned nir_num_blocks = 0; - for (unsigned i = 0; i < shader_count; i++) -- nir_num_blocks += nir_shader_get_entrypoint(shaders[i])->num_blocks; -+ nir_foreach_function_impl (impl, shaders[i]) -+ nir_num_blocks += impl->num_blocks; - ctx.program->blocks.reserve(nir_num_blocks * 2); - ctx.block = ctx.program->create_and_insert_block(); - ctx.block->kind = block_kind_top_level; --- -GitLab - - -From 5bfdc4d5da9fd66e98e3d04f0320719331a5bfaa Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 23 Mar 2024 11:20:58 +0100 -Subject: [PATCH 43/71] aco: Add param temps in startpgm - ---- - src/amd/compiler/aco_assembler.cpp | 3 ++- - .../compiler/aco_instruction_selection.cpp | 23 ++++++++++++++++++- - src/amd/compiler/aco_ir.h | 1 + - 3 files changed, 25 insertions(+), 2 deletions(-) - -diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp -index 9f50c3f59821b..9a774aec8621c 100644 ---- a/src/amd/compiler/aco_assembler.cpp -+++ b/src/amd/compiler/aco_assembler.cpp -@@ -1755,7 +1755,8 @@ emit_program(Program* program, std::vector& code, std::vectorconstant_data.data() + program->constant_data.size())); - - program->config->scratch_bytes_per_wave = -- align(program->config->scratch_bytes_per_wave, program->dev.scratch_alloc_granule); -+ align(program->config->scratch_bytes_per_wave + program->scratch_arg_size, -+ program->dev.scratch_alloc_granule); - - return exec_size; - } -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index 95baf3a302d0c..c44a7324d58e8 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -11557,8 +11557,12 @@ create_fs_end_for_epilog(isel_context* ctx) - } - - Instruction* --add_startpgm(struct isel_context* ctx) -+add_startpgm(struct isel_context* ctx, bool is_callee = false) - { -+ ctx->program->arg_sgpr_count = ctx->args->num_sgprs_used; -+ ctx->program->arg_vgpr_count = ctx->args->num_vgprs_used; -+ ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size; -+ - unsigned def_count = 0; - for (unsigned i = 0; i < ctx->args->arg_count; i++) { - if (ctx->args->args[i].skip) -@@ 
-11569,6 +11573,9 @@ add_startpgm(struct isel_context* ctx) - else - def_count++; - } -+ unsigned used_arg_count = def_count; -+ def_count += -+ ctx->callee_info.reg_param_count + (is_callee ? 2 : 0); /* parameters + return address */ - - if (ctx->stage.hw == AC_HW_COMPUTE_SHADER && ctx->program->gfx_level >= GFX12) - def_count += 3; -@@ -11634,6 +11641,20 @@ add_startpgm(struct isel_context* ctx) - ctx->workgroup_id[i] = ids[i].used ? Operand(get_arg(ctx, ids[i])) : Operand::zero(); - } - -+ if (is_callee) { -+ unsigned def_idx = used_arg_count; -+ -+ ctx->program->stack_ptr = ctx->callee_info.stack_ptr.def.getTemp(); -+ startpgm->definitions[def_idx++] = ctx->callee_info.stack_ptr.def; -+ startpgm->definitions[def_idx++] = ctx->callee_info.return_address.def; -+ -+ for (auto& info : ctx->callee_info.param_infos) { -+ if (!info.is_reg) -+ continue; -+ startpgm->definitions[def_idx++] = info.def; -+ } -+ } -+ - /* epilog has no scratch */ - if (ctx->args->scratch_offset.used) { - if (ctx->program->gfx_level < GFX9) { -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 920174ac50798..7989d27dfe75b 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2388,6 +2388,7 @@ public: - ABI callee_abi = {}; - unsigned short arg_sgpr_count; - unsigned short arg_vgpr_count; -+ unsigned scratch_arg_size = 0; - - struct { - monotonic_buffer_resource memory; --- -GitLab - - -From c2b0a99236c67af869bef06a2e3d2af329206ef7 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 6 Mar 2024 13:27:56 +0100 -Subject: [PATCH 44/71] aco: Implement call parameter intrinsics - ---- - .../compiler/aco_instruction_selection.cpp | 158 ++++++++++++++++++ - .../aco_instruction_selection_setup.cpp | 13 +- - 2 files changed, 170 insertions(+), 1 deletion(-) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index c44a7324d58e8..f3ec6fa04dd36 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -8341,6 +8341,107 @@ visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr) - emit_split_vector(ctx, dst, instr->def.num_components); - } - -+void -+load_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr, -+ unsigned scratch_param_size, Temp dst) -+{ -+ int32_t const_offset = param.scratch_offset - scratch_param_size; -+ unsigned byte_size = dst.bytes(); -+ if (ctx->program->gfx_level < GFX9) { -+ Temp scratch_rsrc = load_scratch_resource(ctx->program, bld, true, false); -+ -+ Temp soffset = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), -+ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr), -+ Operand::c32(-const_offset * ctx->program->wave_size)); -+ -+ aco_opcode op; -+ switch (byte_size) { -+ case 4: op = aco_opcode::buffer_load_dword; break; -+ case 8: op = aco_opcode::buffer_load_dwordx2; break; -+ case 12: op = aco_opcode::buffer_load_dwordx3; break; -+ case 16: op = aco_opcode::buffer_load_dwordx4; break; -+ default: unreachable("Unexpected param size"); -+ } -+ -+ Instruction* instr = -+ bld.mubuf(op, Definition(dst), scratch_rsrc, Operand(v1), soffset, 0, false); -+ instr->mubuf().sync = memory_sync_info(storage_scratch); -+ instr->mubuf().cache.value = ac_swizzled; -+ return; -+ } -+ -+ if (const_offset < ctx->program->dev.scratch_global_offset_min) { -+ stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), -+ stack_ptr == Temp() ? 
Operand::c32(0) : Operand(stack_ptr), -+ Operand::c32(const_offset)); -+ const_offset = 0; -+ } -+ -+ aco_opcode op; -+ switch (byte_size) { -+ case 4: op = aco_opcode::scratch_load_dword; break; -+ case 8: op = aco_opcode::scratch_load_dwordx2; break; -+ case 12: op = aco_opcode::scratch_load_dwordx3; break; -+ case 16: op = aco_opcode::scratch_load_dwordx4; break; -+ default: unreachable("Unexpected param size"); -+ } -+ -+ bld.scratch(op, Definition(dst), Operand(v1), -+ stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr), (int16_t)const_offset, -+ memory_sync_info(storage_scratch)); -+} -+ -+void -+store_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr, -+ unsigned scratch_param_size, Temp data) -+{ -+ int32_t const_offset = param.scratch_offset - scratch_param_size; -+ unsigned byte_size = data.bytes(); -+ if (ctx->program->gfx_level < GFX9) { -+ Temp scratch_rsrc = load_scratch_resource(ctx->program, bld, true, false); -+ -+ Temp soffset = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), -+ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr), -+ Operand::c32(-const_offset * ctx->program->wave_size)); -+ -+ assert(-const_offset * ctx->program->wave_size < 0x1ff00); -+ -+ aco_opcode op; -+ switch (byte_size) { -+ case 4: op = aco_opcode::buffer_store_dword; break; -+ case 8: op = aco_opcode::buffer_store_dwordx2; break; -+ case 12: op = aco_opcode::buffer_store_dwordx3; break; -+ case 16: op = aco_opcode::buffer_store_dwordx4; break; -+ default: unreachable("Unexpected param size"); -+ } -+ -+ Instruction* instr = -+ bld.mubuf(op, scratch_rsrc, Operand(v1), Operand(soffset), as_vgpr(bld, data), 0, false); -+ instr->mubuf().sync = memory_sync_info(storage_scratch); -+ instr->mubuf().cache.value = ac_swizzled; -+ return; -+ } -+ -+ if (const_offset < ctx->program->dev.scratch_global_offset_min) { -+ stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), -+ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr), -+ Operand::c32(const_offset)); -+ const_offset = 0; -+ } -+ -+ aco_opcode op; -+ switch (byte_size) { -+ case 4: op = aco_opcode::scratch_store_dword; break; -+ case 8: op = aco_opcode::scratch_store_dwordx2; break; -+ case 12: op = aco_opcode::scratch_store_dwordx3; break; -+ case 16: op = aco_opcode::scratch_store_dwordx4; break; -+ default: unreachable("Unexpected param size"); -+ } -+ -+ bld.scratch(op, Operand(v1), stack_ptr == Temp() ? 
Operand(s1) : Operand(stack_ptr), -+ as_vgpr(bld, data), (int16_t)const_offset, memory_sync_info(storage_scratch)); -+} -+ - void - visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) - { -@@ -9640,6 +9741,63 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) - bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)), - Operand::c32(nir_intrinsic_base(instr))); - break; -+ case nir_intrinsic_load_return_param_amd: { -+ call_info& info = ctx->call_infos[nir_intrinsic_call_idx(instr)]; -+ -+ assert(nir_intrinsic_param_idx(instr) < info.nir_instr->callee->num_params); -+ -+ unsigned index_in_return_params = 0u; -+ for (unsigned i = 0; i < info.nir_instr->callee->num_params; ++i) { -+ if (nir_intrinsic_param_idx(instr) == i) { -+ assert(info.nir_instr->callee->params[i].is_return); -+ break; -+ } -+ if (info.nir_instr->callee->params[i].is_return) { -+ ++index_in_return_params; -+ } -+ } -+ -+ if (info.return_info[index_in_return_params].is_reg) { -+ bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), -+ Operand(info.return_info[index_in_return_params].def.getTemp())); -+ } else { -+ Temp stack_ptr; -+ if (ctx->callee_info.stack_ptr.is_reg) -+ stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), -+ Operand::c32(info.scratch_param_size), -+ Operand(ctx->callee_info.stack_ptr.def.getTemp())); -+ else -+ stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), -+ Operand::c32(info.scratch_param_size)); -+ load_scratch_param(ctx, bld, info.return_info[index_in_return_params], stack_ptr, -+ info.scratch_param_size, get_ssa_temp(ctx, &instr->def)); -+ } -+ break; -+ } -+ case nir_intrinsic_load_param: { -+ const auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)]; -+ if (param.is_reg) -+ bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), Operand(param.def.getTemp())); -+ else -+ load_scratch_param( -+ ctx, bld, param, -+ ctx->callee_info.stack_ptr.is_reg ? ctx->callee_info.stack_ptr.def.getTemp() : Temp(), -+ ctx->callee_info.scratch_param_size, get_ssa_temp(ctx, &instr->def)); -+ break; -+ } -+ case nir_intrinsic_store_param_amd: { -+ auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)]; -+ if (param.is_reg) -+ param.def.setTemp(param.def.regClass().type() == RegType::vgpr -+ ? as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)) -+ : bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa))); -+ else -+ store_scratch_param( -+ ctx, bld, param, -+ ctx->callee_info.stack_ptr.is_reg ? ctx->callee_info.stack_ptr.def.getTemp() : Temp(), -+ ctx->callee_info.scratch_param_size, get_ssa_temp(ctx, instr->src[0].ssa)); -+ break; -+ } - case nir_intrinsic_load_call_return_address_amd: { - bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), - Operand(ctx->callee_info.return_address.def.getTemp())); -diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp -index 600c63c8b9ce3..1bdbe28ec17bd 100644 ---- a/src/amd/compiler/aco_instruction_selection_setup.cpp -+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp -@@ -5,12 +5,13 @@ - */ - - #include "aco_instruction_selection.h" -+#include "aco_nir_call_attribs.h" - - #include "common/ac_nir.h" - #include "common/sid.h" - --#include "nir_control_flow.h" - #include "nir_builder.h" -+#include "nir_control_flow.h" - - #include - -@@ -631,6 +632,16 @@ init_context(isel_context* ctx, nir_shader* shader) - case nir_intrinsic_load_view_index: - type = ctx->stage == fragment_fs ? 
RegType::vgpr : RegType::sgpr; - break; -+ case nir_intrinsic_load_return_param_amd: { -+ type = RegType::vgpr; -+ break; -+ } -+ case nir_intrinsic_load_param: { -+ nir_parameter* param = -+ &impl->function->params[nir_intrinsic_param_idx(intrinsic)]; -+ type = param->is_uniform ? RegType::sgpr : RegType::vgpr; -+ break; -+ } - default: - for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; - i++) { --- -GitLab - - -From 04c145740dcc48f05926edf8db90fc38b02bf2e5 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Thu, 6 Jun 2024 07:17:15 +0200 -Subject: [PATCH 45/71] aco: Add common utility to load scratch descriptor - -Also modifies the scratch descriptor to take the stack pointer into -account. ---- - .../compiler/aco_instruction_selection.cpp | 40 +-------- - src/amd/compiler/aco_scratch_rsrc.h | 82 +++++++++++++++++++ - src/amd/compiler/aco_spill.cpp | 54 +----------- - 3 files changed, 87 insertions(+), 89 deletions(-) - create mode 100644 src/amd/compiler/aco_scratch_rsrc.h - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index f3ec6fa04dd36..6ed8dd84c777f 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -11,6 +11,7 @@ - #include "aco_interface.h" - #include "aco_ir.h" - #include "aco_nir_call_attribs.h" -+#include "aco_scratch_rsrc.h" - - #include "common/ac_descriptors.h" - #include "common/ac_gpu_info.h" -@@ -7701,41 +7702,6 @@ visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr) - } - } - --Temp --get_scratch_resource(isel_context* ctx) --{ -- Builder bld(ctx->program, ctx->block); -- Temp scratch_addr = ctx->program->private_segment_buffer; -- if (!scratch_addr.bytes()) { -- Temp addr_lo = -- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); -- Temp addr_hi = -- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi)); -- scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); -- } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) { -- scratch_addr = -- bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero()); -- } -- -- struct ac_buffer_state ac_state = {0}; -- uint32_t desc[4]; -- -- ac_state.size = 0xffffffff; -- ac_state.format = PIPE_FORMAT_R32_FLOAT; -- for (int i = 0; i < 4; i++) -- ac_state.swizzle[i] = PIPE_SWIZZLE_0; -- /* older generations need element size = 4 bytes. element size removed in GFX9 */ -- ac_state.element_size = ctx->program->gfx_level <= GFX8 ? 1u : 0u; -- ac_state.index_stride = ctx->program->wave_size == 64 ? 
3u : 2u; -- ac_state.add_tid = true; -- ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW; -- -- ac_build_buffer_descriptor(ctx->program->gfx_level, &ac_state, desc); -- -- return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(desc[2]), -- Operand::c32(desc[3])); --} -- - void - visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) - { -@@ -7778,7 +7744,7 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) - params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1; - emit_load(ctx, bld, info, params); - } else { -- info.resource = get_scratch_resource(ctx); -+ info.resource = load_scratch_resource(ctx->program, bld, false, true); - info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa))); - info.soffset = ctx->program->scratch_offset; - emit_load(ctx, bld, info, scratch_mubuf_load_params); -@@ -7841,7 +7807,7 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) - memory_sync_info(storage_scratch, semantic_private)); - } - } else { -- Temp rsrc = get_scratch_resource(ctx); -+ Temp rsrc = load_scratch_resource(ctx->program, bld, false, true); - offset = as_vgpr(ctx, offset); - for (unsigned i = 0; i < write_count; i++) { - aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); -diff --git a/src/amd/compiler/aco_scratch_rsrc.h b/src/amd/compiler/aco_scratch_rsrc.h -new file mode 100644 -index 0000000000000..5b0af2bca46f0 ---- /dev/null -+++ b/src/amd/compiler/aco_scratch_rsrc.h -@@ -0,0 +1,82 @@ -+/* -+ * Copyright © 2024 Valve Corporation. -+ * -+ * SPDX-License-Identifier: MIT -+ */ -+ -+#include "aco_builder.h" -+#include "aco_ir.h" -+ -+#include "ac_descriptors.h" -+#include "amdgfxregs.h" -+ -+#ifndef ACO_SCRATCH_RSRC_H -+#define ACO_SCRATCH_RSRC_H -+ -+namespace aco { -+ -+inline Temp -+load_scratch_resource(Program* program, Builder& bld, bool apply_scratch_offset, -+ bool apply_stack_ptr) -+{ -+ Temp private_segment_buffer = program->private_segment_buffer; -+ if (!private_segment_buffer.bytes()) { -+ Temp addr_lo = -+ bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); -+ Temp addr_hi = -+ bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi)); -+ private_segment_buffer = -+ bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); -+ } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) { -+ private_segment_buffer = -+ bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero()); -+ } -+ -+ if ((apply_stack_ptr && program->stack_ptr != Temp()) || apply_scratch_offset) { -+ Temp addr_lo = bld.tmp(s1); -+ Temp addr_hi = bld.tmp(s1); -+ bld.pseudo(aco_opcode::p_split_vector, Definition(addr_lo), Definition(addr_hi), -+ private_segment_buffer); -+ -+ if (apply_stack_ptr && program->stack_ptr != Temp()) { -+ Temp carry = bld.tmp(s1); -+ addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo, -+ program->stack_ptr); -+ addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi, -+ Operand::c32(0), bld.scc(carry)); -+ } -+ -+ if (apply_scratch_offset) { -+ Temp carry = bld.tmp(s1); -+ addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo, -+ program->scratch_offset); -+ addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi, -+ Operand::c32(0), bld.scc(carry)); -+ } -+ -+ private_segment_buffer = -+ 
bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); -+ } -+ -+ struct ac_buffer_state ac_state = {0}; -+ uint32_t desc[4]; -+ -+ ac_state.size = 0xffffffff; -+ ac_state.format = PIPE_FORMAT_R32_FLOAT; -+ for (int i = 0; i < 4; i++) -+ ac_state.swizzle[i] = PIPE_SWIZZLE_0; -+ /* older generations need element size = 4 bytes. element size removed in GFX9 */ -+ ac_state.element_size = program->gfx_level <= GFX8 ? 1u : 0u; -+ ac_state.index_stride = program->wave_size == 64 ? 3u : 2u; -+ ac_state.add_tid = true; -+ ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW; -+ -+ ac_build_buffer_descriptor(program->gfx_level, &ac_state, desc); -+ -+ return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, -+ Operand::c32(desc[2]), Operand::c32(desc[3])); -+} -+ -+} // namespace aco -+ -+#endif // ACO_SCRATCH_RSRC_H -diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp -index 2e30bf9e2783e..c271cbcf01eb8 100644 ---- a/src/amd/compiler/aco_spill.cpp -+++ b/src/amd/compiler/aco_spill.cpp -@@ -7,6 +7,7 @@ - - #include "aco_builder.h" - #include "aco_ir.h" -+#include "aco_scratch_rsrc.h" - #include "aco_util.h" - - #include "common/ac_descriptors.h" -@@ -1134,57 +1135,6 @@ spill_block(spill_ctx& ctx, unsigned block_idx) - } - } - --Temp --load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset) --{ -- Temp private_segment_buffer = ctx.program->private_segment_buffer; -- if (!private_segment_buffer.bytes()) { -- Temp addr_lo = -- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); -- Temp addr_hi = -- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi)); -- private_segment_buffer = -- bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); -- } else if (ctx.program->stage.hw != AC_HW_COMPUTE_SHADER) { -- private_segment_buffer = -- bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero()); -- } -- -- if (apply_scratch_offset) { -- Temp addr_lo = bld.tmp(s1); -- Temp addr_hi = bld.tmp(s1); -- bld.pseudo(aco_opcode::p_split_vector, Definition(addr_lo), Definition(addr_hi), -- private_segment_buffer); -- -- Temp carry = bld.tmp(s1); -- addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo, -- ctx.program->scratch_offset); -- addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi, -- Operand::c32(0), bld.scc(carry)); -- -- private_segment_buffer = -- bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); -- } -- -- struct ac_buffer_state ac_state = {0}; -- uint32_t desc[4]; -- -- ac_state.size = 0xffffffff; -- ac_state.format = PIPE_FORMAT_R32_FLOAT; -- for (int i = 0; i < 4; i++) -- ac_state.swizzle[i] = PIPE_SWIZZLE_0; -- /* older generations need element size = 4 bytes. element size removed in GFX9 */ -- ac_state.element_size = ctx.program->gfx_level <= GFX8 ? 1u : 0u; -- ac_state.index_stride = ctx.program->wave_size == 64 ? 
3u : 2u; -- ac_state.add_tid = true; -- ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW; -- -- ac_build_buffer_descriptor(ctx.program->gfx_level, &ac_state, desc); -- -- return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, -- Operand::c32(desc[2]), Operand::c32(desc[3])); --} -- - void - setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, - std::vector>& instructions, uint32_t spill_slot, -@@ -1249,7 +1199,7 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, - } - } else { - if (ctx.scratch_rsrc == Temp()) -- ctx.scratch_rsrc = load_scratch_resource(ctx, rsrc_bld, overflow); -+ ctx.scratch_rsrc = load_scratch_resource(ctx.program, rsrc_bld, overflow, true); - - if (overflow) { - uint32_t soffset = --- -GitLab - - -From 912041711336f7e14a19439aeffd8a404990fd55 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 13 May 2024 06:14:32 +0200 -Subject: [PATCH 46/71] aco: Add Program::is_callee and set it for RT shaders - ---- - src/amd/compiler/aco_instruction_selection.cpp | 2 ++ - src/amd/compiler/aco_ir.h | 1 + - 2 files changed, 3 insertions(+) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index 6ed8dd84c777f..d3d15c9500d5e 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -12048,6 +12048,8 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c - init_context(&ctx, nir); - setup_fp_mode(&ctx, nir); - -+ ctx.program->is_callee = true; -+ - Instruction* startpgm = add_startpgm(&ctx); - append_logical_start(ctx.block); - split_arguments(&ctx, startpgm); -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 7989d27dfe75b..2bc7b91c81584 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2385,6 +2385,7 @@ public: - /* For shader part with previous shader part that has lds access. 
*/ - bool pending_lds_access = false; - -+ bool is_callee = false; - ABI callee_abi = {}; - unsigned short arg_sgpr_count; - unsigned short arg_vgpr_count; --- -GitLab - - -From 441ab8b850fb95ed9a8cfc7ae0fe0e258385fdaa Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 4 May 2024 17:54:14 +0200 -Subject: [PATCH 47/71] radv,aco: Use function call structure for RT programs - ---- - .../compiler/aco_instruction_selection.cpp | 208 ++++++++++++------ - src/amd/compiler/aco_interface.cpp | 7 +- - src/amd/compiler/aco_interface.h | 4 +- - src/amd/compiler/aco_ir.h | 4 +- - src/amd/vulkan/radv_pipeline_rt.c | 6 +- - src/amd/vulkan/radv_shader.c | 8 +- - src/amd/vulkan/radv_shader.h | 3 +- - 7 files changed, 165 insertions(+), 75 deletions(-) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index d3d15c9500d5e..901b9ca843eb1 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -12003,33 +12003,53 @@ merged_wave_info_to_mask(isel_context* ctx, unsigned i) - return lanecount_to_mask(ctx, count, i * 8u); - } - --static void --insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args) -+void -+insert_return(isel_context& ctx) - { -- unsigned src_count = 0; -- for (unsigned i = 0; i < ctx.args->arg_count; i++) -- src_count += !!BITSET_TEST(ctx.output_args, i); -- -+ unsigned return_param_count = 0; -+ for (auto& param_def : ctx.callee_info.param_infos) { -+ if (!param_def.is_reg || param_def.discardable) -+ continue; -+ ++return_param_count; -+ } -+ unsigned src_count = return_param_count + 2; -+ if (ctx.next_pc != Temp()) -+ src_count += ctx.args->arg_count; - Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0); - ctx.block->instructions.emplace_back(ret); - -- src_count = 0; -- for (unsigned i = 0; i < ctx.args->arg_count; i++) { -- if (!BITSET_TEST(ctx.output_args, i)) -- continue; -- -- enum ac_arg_regfile file = ctx.args->args[i].file; -- unsigned size = ctx.args->args[i].size; -- unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256); -- RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); -- Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg}) -- : Operand(PhysReg{reg}, type); -- ret->operands[src_count] = op; -- src_count++; -+ if (ctx.next_pc != Temp()) { -+ for (unsigned i = 0; i < ctx.args->arg_count; i++) { -+ enum ac_arg_regfile file = ctx.args->args[i].file; -+ unsigned size = ctx.args->args[i].size; -+ unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256); -+ RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); -+ Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg}) -+ : Operand(PhysReg{reg}, type); -+ ret->operands[i] = op; -+ } - } - -- Builder bld(ctx.program, ctx.block); -- bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr)); -+ unsigned def_idx = ctx.next_pc != Temp() ? 
ctx.args->arg_count : 0; -+ for (unsigned i = 0; i < ctx.callee_info.param_infos.size(); ++i) { -+ const auto& param_info = ctx.callee_info.param_infos[i]; -+ if (!param_info.is_reg || param_info.discardable) -+ continue; -+ Temp param_temp = param_info.def.getTemp(); -+ if (i == 0 && ctx.next_pc != Temp()) -+ param_temp = ctx.next_divergent_pc; -+ else if (i == 1 && ctx.next_pc != Temp()) -+ param_temp = ctx.next_pc; -+ Operand op = Operand(param_temp); -+ op.setPrecolored(param_info.def.physReg()); -+ ret->operands[def_idx++] = op; -+ } -+ Operand op = Operand(ctx.callee_info.return_address.def.getTemp()); -+ op.setPrecolored(ctx.callee_info.return_address.def.physReg()); -+ ret->operands[def_idx++] = op; -+ Operand stack_op = Operand(ctx.callee_info.stack_ptr.def.getTemp()); -+ stack_op.setPrecolored(ctx.callee_info.stack_ptr.def.physReg()); -+ ret->operands[def_idx++] = stack_op; - } - - void -@@ -12048,21 +12068,38 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c - init_context(&ctx, nir); - setup_fp_mode(&ctx, nir); - -+ ABI abi; -+ /* TODO: callable abi? */ -+ switch (shaders[i]->info.stage) { -+ case MESA_SHADER_RAYGEN: -+ case MESA_SHADER_CLOSEST_HIT: -+ case MESA_SHADER_MISS: -+ case MESA_SHADER_CALLABLE: abi = rtRaygenABI; break; -+ case MESA_SHADER_INTERSECTION: abi = rtTraversalABI; break; -+ case MESA_SHADER_ANY_HIT: abi = rtAnyHitABI; break; -+ default: unreachable("invalid RT shader stage"); -+ } -+ -+ ctx.callee_abi = make_abi(abi, ctx.program); -+ ctx.program->callee_abi = ctx.callee_abi; -+ ctx.callee_info = get_callee_info(ctx.callee_abi, impl->function->num_params, -+ impl->function->params, ctx.program); - ctx.program->is_callee = true; - -- Instruction* startpgm = add_startpgm(&ctx); -+ Instruction* startpgm = add_startpgm(&ctx, true); - append_logical_start(ctx.block); - split_arguments(&ctx, startpgm); - visit_cf_list(&ctx, &impl->body); - append_logical_end(ctx.block); - ctx.block->kind |= block_kind_uniform; - -- /* Fix output registers and jump to next shader. We can skip this when dealing with a -- * raygen shader without shader calls. 
-- */ -- if ((shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN) && -- impl == nir_shader_get_entrypoint(nir)) -- insert_rt_jump_next(ctx, args); -+ if (ctx.next_pc != Temp()) { -+ insert_return(ctx); -+ -+ Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc)); -+ } else { -+ Builder(ctx.program, ctx.block).sopp(aco_opcode::s_endpgm); -+ } - - cleanup_context(&ctx); - first_block = false; -@@ -12879,7 +12916,8 @@ calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args, - void - select_rt_prolog(Program* program, ac_shader_config* config, - const struct aco_compiler_options* options, const struct aco_shader_info* info, -- const struct ac_shader_args* in_args, const struct ac_shader_args* out_args) -+ const struct ac_shader_args* in_args, const struct ac_arg* descriptors, -+ unsigned raygen_param_count, nir_parameter* raygen_params) - { - init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode, - config); -@@ -12890,8 +12928,11 @@ select_rt_prolog(Program* program, ac_shader_config* config, - calc_min_waves(program); - Builder bld(program, block); - block->instructions.reserve(32); -- unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used); -- unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used); -+ unsigned num_sgprs = in_args->num_sgprs_used; -+ unsigned num_vgprs = in_args->num_vgprs_used; -+ -+ struct callee_info raygen_info = -+ get_callee_info(make_abi(rtRaygenABI, program), raygen_param_count, raygen_params, NULL); - - /* Inputs: - * Ring offsets: s[0-1] -@@ -12906,9 +12947,11 @@ select_rt_prolog(Program* program, ac_shader_config* config, - * Local invocation IDs: v[0-2] - */ - PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets); -+ PhysReg in_descriptors = get_arg_reg(in_args, *descriptors); -+ PhysReg in_push_constants = get_arg_reg(in_args, in_args->push_constants); - PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors); -+ PhysReg in_traversal_addr = get_arg_reg(in_args, in_args->rt.traversal_shader_addr); - PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr); -- PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base); - PhysReg in_wg_id_x; - PhysReg in_wg_id_y; - PhysReg in_wg_id_z; -@@ -12942,46 +12985,84 @@ select_rt_prolog(Program* program, ac_shader_config* config, - * Shader VA: v[4-5] - * Shader Record Ptr: v[6-7] - */ -- PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr); -- PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]); -- PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]); -- PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]); -+ assert(raygen_info.stack_ptr.is_reg); -+ assert(raygen_info.return_address.is_reg); -+ assert(raygen_info.param_infos[0].is_reg); -+ assert(raygen_info.param_infos[1].is_reg); -+ assert(raygen_info.param_infos[RAYGEN_ARG_LAUNCH_ID + 2].is_reg); -+ assert(raygen_info.param_infos[RAYGEN_ARG_LAUNCH_SIZE + 2].is_reg); -+ assert(raygen_info.param_infos[RAYGEN_ARG_DESCRIPTORS + 2].is_reg); -+ assert(raygen_info.param_infos[RAYGEN_ARG_PUSH_CONSTANTS + 2].is_reg); -+ assert(raygen_info.param_infos[RAYGEN_ARG_SBT_DESCRIPTORS + 2].is_reg); -+ assert(raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].is_reg); -+ assert(raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 
2].is_reg); -+ PhysReg out_stack_ptr_param = raygen_info.stack_ptr.def.physReg(); -+ PhysReg out_return_shader_addr = raygen_info.return_address.def.physReg(); -+ PhysReg out_divergent_shader_addr = raygen_info.param_infos[0].def.physReg(); -+ PhysReg out_uniform_shader_addr = raygen_info.param_infos[1].def.physReg(); -+ PhysReg out_launch_size_x = raygen_info.param_infos[RAYGEN_ARG_LAUNCH_SIZE + 2].def.physReg(); -+ PhysReg out_launch_size_y = out_launch_size_x.advance(4); -+ PhysReg out_launch_size_z = out_launch_size_y.advance(4); - PhysReg out_launch_ids[3]; -- for (unsigned i = 0; i < 3; i++) -- out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]); -- PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base); -- PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record); -+ out_launch_ids[0] = raygen_info.param_infos[RAYGEN_ARG_LAUNCH_ID + 2].def.physReg(); -+ for (unsigned i = 1; i < 3; i++) -+ out_launch_ids[i] = out_launch_ids[i - 1].advance(4); -+ PhysReg out_descriptors = raygen_info.param_infos[RAYGEN_ARG_DESCRIPTORS + 2].def.physReg(); -+ PhysReg out_push_constants = -+ raygen_info.param_infos[RAYGEN_ARG_PUSH_CONSTANTS + 2].def.physReg(); -+ PhysReg out_sbt_descriptors = -+ raygen_info.param_infos[RAYGEN_ARG_SBT_DESCRIPTORS + 2].def.physReg(); -+ PhysReg out_traversal_addr = -+ raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].def.physReg(); -+ PhysReg out_record_ptr = raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].def.physReg(); -+ -+ num_sgprs = std::max(num_sgprs, out_stack_ptr_param.reg()); -+ num_vgprs = std::max(num_vgprs, out_record_ptr.reg() - 256 + 2); - - /* Temporaries: */ - num_sgprs = align(num_sgprs, 2); -+ num_sgprs += 2; - PhysReg tmp_raygen_sbt = PhysReg{num_sgprs}; - num_sgprs += 2; -+ PhysReg tmp_launch_size_addr = PhysReg{num_sgprs}; -+ num_sgprs += 2; - PhysReg tmp_ring_offsets = PhysReg{num_sgprs}; - num_sgprs += 2; -+ PhysReg tmp_traversal_addr = PhysReg{num_sgprs}; -+ num_sgprs += 2; - PhysReg tmp_wg_id_x_times_size = PhysReg{num_sgprs}; - num_sgprs++; - - PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++}; - - /* Confirm some assumptions about register aliasing */ -- assert(in_ring_offsets == out_uniform_shader_addr); -- assert(get_arg_reg(in_args, in_args->push_constants) == -- get_arg_reg(out_args, out_args->push_constants)); -- assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) == -- get_arg_reg(out_args, out_args->rt.sbt_descriptors)); -- assert(in_launch_size_addr == out_launch_size_x); -- assert(in_stack_base == out_launch_size_z); -- assert(in_local_ids[0] == out_launch_ids[0]); -+ assert(in_descriptors == out_uniform_shader_addr); -+ assert(in_sbt_desc == out_launch_size_x); -+ assert(in_traversal_addr == out_launch_size_z); -+ assert(in_wg_id_x == out_traversal_addr); - - /* gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used); -+ assert(options->gfx_level >= GFX9 || -+ in_scratch_offset.reg() >= -+ raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR].def.physReg()); - - /* load raygen sbt */ - bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2), - Operand::c32(0u)); - -+ bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_launch_size_addr, s2), -+ Operand(in_launch_size_addr, s2)); -+ bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_traversal_addr, s2), -+ Operand(in_traversal_addr, s2)); -+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_descriptors, s1), Operand(in_descriptors, s1)); -+ 
bld.sop1(aco_opcode::s_mov_b32, Definition(out_push_constants, s1), -+ Operand(in_push_constants, s1)); -+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors, s1), Operand(in_sbt_desc, s1)); -+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors.advance(4), s1), -+ Operand(in_sbt_desc.advance(4), s1)); -+ - /* init scratch */ - if (options->gfx_level < GFX9) { - /* copy ring offsets to temporary location*/ -@@ -12992,18 +13073,15 @@ select_rt_prolog(Program* program, ac_shader_config* config, - Operand(in_scratch_offset, s1)); - } - -- /* set stack ptr */ -- bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1)); -- - /* load raygen address */ - bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2), - Operand(tmp_raygen_sbt, s2), Operand::c32(0u)); - - /* load ray launch sizes */ - bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1), -- Operand(in_launch_size_addr, s2), Operand::c32(8u)); -+ Operand(tmp_launch_size_addr, s2), Operand::c32(8u)); - bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2), -- Operand(in_launch_size_addr, s2), Operand::c32(0u)); -+ Operand(tmp_launch_size_addr, s2), Operand::c32(0u)); - - /* calculate ray launch ids */ - if (options->gfx_level >= GFX11) { -@@ -13059,6 +13137,11 @@ select_rt_prolog(Program* program, ac_shader_config* config, - Operand::c32(-1u), Operand(tmp_invocation_idx, v1)); - } - -+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr, s1), -+ Operand(tmp_traversal_addr, s1)); -+ bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr.advance(4), s1), -+ Operand(tmp_traversal_addr.advance(4), s1)); -+ - /* Make fixup operations a no-op if this is not a converted 2D dispatch. 
*/ - bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1), - Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1)); -@@ -13070,14 +13153,15 @@ select_rt_prolog(Program* program, ac_shader_config* config, - bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(), - Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm)); - -- if (options->gfx_level < GFX9) { -- /* write scratch/ring offsets to outputs, if needed */ -- bld.sop1(aco_opcode::s_mov_b32, -- Definition(get_arg_reg(out_args, out_args->scratch_offset), s1), -- Operand(in_scratch_offset, s1)); -- bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2), -- Operand(tmp_ring_offsets, s2)); -- } -+ if (program->gfx_level < GFX8) -+ bld.vop3(aco_opcode::v_lshr_b64, Definition(out_divergent_shader_addr, v2), -+ Operand(out_uniform_shader_addr, s2), Operand::c32(0)); -+ else -+ bld.vop3(aco_opcode::v_lshrrev_b64, Definition(out_divergent_shader_addr, v2), -+ Operand::c32(0), Operand(out_uniform_shader_addr, s2)); -+ bld.sop1(aco_opcode::s_mov_b64, Definition(out_return_shader_addr, s2), Operand::c32(0)); -+ -+ bld.sopk(aco_opcode::s_movk_i32, Definition(out_stack_ptr_param, s1), 0); - - /* jump to raygen */ - bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2)); -diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp -index 32a28908f90f0..5c7956caeedd4 100644 ---- a/src/amd/compiler/aco_interface.cpp -+++ b/src/amd/compiler/aco_interface.cpp -@@ -307,8 +307,8 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_ - void - aco_compile_rt_prolog(const struct aco_compiler_options* options, - const struct aco_shader_info* info, const struct ac_shader_args* in_args, -- const struct ac_shader_args* out_args, aco_callback* build_prolog, -- void** binary) -+ const struct ac_arg* descriptors, unsigned raygen_param_count, -+ nir_parameter* raygen_params, aco_callback* build_prolog, void** binary) - { - init(); - -@@ -319,7 +319,8 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options, - program->debug.func = NULL; - program->debug.private_data = NULL; - -- select_rt_prolog(program.get(), &config, options, info, in_args, out_args); -+ select_rt_prolog(program.get(), &config, options, info, in_args, descriptors, raygen_param_count, -+ raygen_params); - validate(program.get()); - insert_waitcnt(program.get()); - insert_NOPs(program.get()); -diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h -index 462727432a1ac..efc3172647183 100644 ---- a/src/amd/compiler/aco_interface.h -+++ b/src/amd/compiler/aco_interface.h -@@ -49,8 +49,8 @@ void aco_compile_shader(const struct aco_compiler_options* options, - - void aco_compile_rt_prolog(const struct aco_compiler_options* options, - const struct aco_shader_info* info, const struct ac_shader_args* in_args, -- const struct ac_shader_args* out_args, aco_callback* build_prolog, -- void** binary); -+ const struct ac_arg* descriptors, unsigned raygen_param_count, -+ nir_parameter* raygen_params, aco_callback* build_prolog, void** binary); - - void aco_compile_vs_prolog(const struct aco_compiler_options* options, - const struct aco_shader_info* info, -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 2bc7b91c81584..ccf2710d5453f 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -23,6 +23,7 @@ - #include - - typedef struct nir_shader nir_shader; -+typedef 
struct nir_parameter nir_parameter; - - namespace aco { - -@@ -2462,7 +2463,8 @@ void select_trap_handler_shader(Program* program, struct nir_shader* shader, - void select_rt_prolog(Program* program, ac_shader_config* config, - const struct aco_compiler_options* options, - const struct aco_shader_info* info, const struct ac_shader_args* in_args, -- const struct ac_shader_args* out_args); -+ const struct ac_arg* descriptors, unsigned raygen_param_count, -+ nir_parameter* raygen_params); - void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, - ac_shader_config* config, const struct aco_compiler_options* options, - const struct aco_shader_info* info, const struct ac_shader_args* args); -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index 196f8aa23a032..216eb1bb09f89 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -808,8 +808,12 @@ static void - compile_rt_prolog(struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline) - { - const struct radv_physical_device *pdev = radv_device_physical(device); -+ struct nir_function raygen_stub = {}; - -- pipeline->prolog = radv_create_rt_prolog(device); -+ /* Create a dummy function signature for raygen shaders in order to pass parameter info to the prolog */ -+ radv_nir_init_rt_function_params(&raygen_stub, MESA_SHADER_RAYGEN, 0); -+ radv_nir_lower_callee_signature(&raygen_stub, NULL); -+ pipeline->prolog = radv_create_rt_prolog(device, raygen_stub.num_params, raygen_stub.params); - - /* create combined config */ - struct ac_shader_config *config = &pipeline->prolog->config; -diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c -index e5aa2ff636e1f..60648b2405321 100644 ---- a/src/amd/vulkan/radv_shader.c -+++ b/src/amd/vulkan/radv_shader.c -@@ -3073,13 +3073,12 @@ radv_aco_build_shader_part(void **bin, uint32_t num_sgprs, uint32_t num_vgprs, c - } - - struct radv_shader * --radv_create_rt_prolog(struct radv_device *device) -+radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count, nir_parameter *raygen_params) - { - const struct radv_physical_device *pdev = radv_device_physical(device); - const struct radv_instance *instance = radv_physical_device_instance(pdev); - struct radv_shader *prolog; - struct radv_shader_args in_args = {0}; -- struct radv_shader_args out_args = {0}; - struct radv_nir_compiler_options options = {0}; - radv_fill_nir_compiler_options(&options, device, NULL, false, instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS, false, - radv_device_fault_detection_enabled(device), false); -@@ -3100,7 +3099,6 @@ radv_create_rt_prolog(struct radv_device *device) - info.cs.uses_block_id[i] = true; - - radv_declare_shader_args(device, NULL, &info, MESA_SHADER_COMPUTE, MESA_SHADER_NONE, &in_args); -- radv_declare_rt_shader_args(options.info->gfx_level, &out_args); - info.user_sgprs_locs = in_args.user_sgprs_locs; - - #if AMD_LLVM_AVAILABLE -@@ -3114,8 +3112,8 @@ radv_create_rt_prolog(struct radv_device *device) - struct aco_compiler_options ac_opts; - radv_aco_convert_shader_info(&ac_info, &info, &in_args, &device->cache_key, options.info->gfx_level); - radv_aco_convert_opts(&ac_opts, &options, &in_args, &stage_key); -- aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &out_args.ac, &radv_aco_build_shader_binary, -- (void **)&binary); -+ aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &in_args.descriptor_sets[0], raygen_param_count, raygen_params, -+ 
&radv_aco_build_shader_binary, (void **)&binary); - binary->info = info; - - radv_postprocess_binary_config(device, binary, &in_args); -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index 10e062fb041b9..5ee1ee40466cf 100644 ---- a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -576,7 +576,8 @@ void radv_free_shader_memory(struct radv_device *device, union radv_shader_arena - - struct radv_shader *radv_create_trap_handler_shader(struct radv_device *device); - --struct radv_shader *radv_create_rt_prolog(struct radv_device *device); -+struct radv_shader *radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count, -+ nir_parameter *raygen_params); - - struct radv_shader_part *radv_shader_part_create(struct radv_device *device, struct radv_shader_part_binary *binary, - unsigned wave_size); --- -GitLab - - -From 26d71a1077a1d0b29e4e426c5a83d0a04a7b18d6 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 13 May 2024 06:17:34 +0200 -Subject: [PATCH 48/71] aco/ssa_elimination: Don't remove exec writes for last - blocks of callee shaders - -The caller is going to use the exec mask written there. ---- - src/amd/compiler/aco_ssa_elimination.cpp | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp -index a1477244f51d9..e63dd63ad917c 100644 ---- a/src/amd/compiler/aco_ssa_elimination.cpp -+++ b/src/amd/compiler/aco_ssa_elimination.cpp -@@ -758,7 +758,8 @@ eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block) - /* Check if any successor needs the outgoing exec mask from the current block. */ - - bool exec_write_used; -- if (block.kind & block_kind_end_with_regs) { -+ if (block.kind & block_kind_end_with_regs || -+ (block.linear_succs.empty() && ctx.program->is_callee)) { - /* Last block of a program with succeed shader part should respect final exec write. */ - exec_write_used = true; - } else { --- -GitLab - - -From 6935d9d0a326ae77622e57057ee433faf3c33146 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 6 Mar 2024 14:53:39 +0100 -Subject: [PATCH 49/71] aco/isel: Handle calls - ---- - .../compiler/aco_instruction_selection.cpp | 130 ++++++++++++++++++ - 1 file changed, 130 insertions(+) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index 901b9ca843eb1..b926d357739a4 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -10800,6 +10800,135 @@ get_callee_info(const ABI& abi, unsigned param_count, const nir_parameter* param - return info; - } - -+void -+visit_call(isel_context* ctx, nir_call_instr* instr) -+{ -+ Builder bld(ctx->program, ctx->block); -+ -+ ABI abi; -+ /* TODO: callable abi? 
*/ -+ switch (instr->callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) { -+ case ACO_NIR_CALL_ABI_RT_RECURSIVE: abi = make_abi(rtRaygenABI, ctx->program); break; -+ case ACO_NIR_CALL_ABI_TRAVERSAL: abi = make_abi(rtTraversalABI, ctx->program); break; -+ case ACO_NIR_CALL_ABI_AHIT_ISEC: abi = make_abi(rtAnyHitABI, ctx->program); break; -+ default: unreachable("invalid abi"); -+ } -+ -+ struct callee_info info = -+ get_callee_info(abi, instr->callee->num_params, instr->callee->params, nullptr); -+ std::vector return_infos; -+ -+ Instruction* stack_instr; -+ Definition stack_ptr; -+ if (info.stack_ptr.is_reg) { -+ stack_instr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), -+ Operand::c32(info.scratch_param_size), -+ Operand(ctx->callee_info.stack_ptr.def.getTemp())); -+ stack_ptr = ctx->callee_info.stack_ptr.def; -+ } else { -+ stack_instr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), -+ Operand::c32(info.scratch_param_size)); -+ stack_ptr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1), Operand::c32(0)).def(0); -+ } -+ -+ for (unsigned i = 0; i < info.param_infos.size(); ++i) { -+ if (info.param_infos[i].is_reg) -+ continue; -+ -+ store_scratch_param(ctx, bld, info.param_infos[i], stack_instr->definitions[0].getTemp(), -+ info.scratch_param_size, get_ssa_temp(ctx, instr->params[i].ssa)); -+ } -+ -+ unsigned extra_def_count = 1; -+ -+ Temp vcc_backup; -+ if (ctx->program->dev.sgpr_limit <= vcc_hi.reg()) { -+ vcc_backup = bld.copy(bld.def(bld.lm), Operand(vcc, bld.lm)); -+ --extra_def_count; -+ } -+ -+ unsigned extra_param_count = 3; -+ if (ctx->program->gfx_level < GFX9) -+ ++extra_param_count; -+ -+ unsigned param_size = info.scratch_param_size; -+ if (ctx->program->gfx_level < GFX9) -+ param_size *= ctx->program->wave_size; -+ -+ Instruction* call_instr = -+ create_instruction(aco_opcode::p_call, Format::PSEUDO_CALL, -+ info.reg_param_count + ctx->args->arg_count + extra_param_count, -+ info.reg_return_param_count + extra_def_count); -+ call_instr->call().abi = abi; -+ call_instr->operands[0] = Operand(ctx->callee_info.return_address.def.getTemp(), -+ info.return_address.def.physReg()); -+ call_instr->operands[1] = Operand(stack_ptr.getTemp(), info.stack_ptr.def.physReg()); -+ call_instr->operands[2] = Operand::c32(param_size); -+ if (ctx->program->gfx_level < GFX9) { -+ call_instr->operands[info.reg_param_count + ctx->args->arg_count + 3] = -+ Operand(load_scratch_resource(ctx->program, bld, true, false)); -+ call_instr->operands[info.reg_param_count + ctx->args->arg_count + 3].setLateKill(true); -+ } -+ -+ unsigned reg_return_param_idx = 0; -+ for (unsigned i = 0; i < info.param_infos.size(); ++i) { -+ if (!info.param_infos[i].is_reg) { -+ if (instr->callee->params[i].is_return) { -+ return_infos.emplace_back(parameter_info{ -+ .is_reg = false, -+ .scratch_offset = info.param_infos[i].scratch_offset, -+ }); -+ } -+ continue; -+ } -+ -+ if (instr->callee->params[i].is_uniform) -+ call_instr->operands[i + 3] = Operand(get_ssa_temp(ctx, instr->params[i].ssa)); -+ else -+ call_instr->operands[i + 3] = -+ Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->params[i].ssa))); -+ -+ if (instr->callee->params[i].is_return) { -+ assert(!instr->callee->params[i].is_uniform); -+ Definition def = -+ bld.def(RegClass(RegType::vgpr, DIV_ROUND_UP(instr->callee->params[i].bit_size, 32)), -+ info.param_infos[i].def.physReg()); -+ call_instr->definitions[extra_def_count + reg_return_param_idx++] = def; -+ return_infos.emplace_back(parameter_info{ -+ .is_reg = true, -+ .def 
= def, -+ }); -+ } -+ -+ call_instr->operands[i + 3].setPrecolored(info.param_infos[i].def.physReg()); -+ } -+ -+ for (unsigned i = 0; i < ctx->args->arg_count; i++) { -+ enum ac_arg_regfile file = ctx->args->args[i].file; -+ unsigned size = ctx->args->args[i].size; -+ unsigned reg = ctx->args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256); -+ RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); -+ Operand op = ctx->arg_temps[i].id() ? Operand(ctx->arg_temps[i], PhysReg{reg}) -+ : Operand(PhysReg{reg}, type); -+ op.setLateKill(true); -+ call_instr->operands[info.reg_param_count + 3 + i] = op; -+ } -+ -+ if (ctx->program->dev.sgpr_limit <= vcc_hi.reg()) -+ bld.copy(bld.def(bld.lm, vcc), Operand(vcc_backup)); -+ else -+ call_instr->definitions[0] = bld.def(s2, vcc); -+ -+ ctx->block->instructions.emplace_back(static_cast(call_instr)); -+ -+ ctx->call_infos.emplace_back(call_info{ -+ .nir_instr = instr, -+ .aco_instr = call_instr, -+ .return_info = std::move(return_infos), -+ .scratch_param_size = info.scratch_param_size, -+ }); -+} -+ - void - visit_block(isel_context* ctx, nir_block* block) - { -@@ -10823,6 +10952,7 @@ visit_block(isel_context* ctx, nir_block* block) - case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break; - case nir_instr_type_deref: break; - case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break; -+ case nir_instr_type_call: visit_call(ctx, nir_instr_as_call(instr)); break; - default: isel_err(instr, "Unknown NIR instr type"); - } - } --- -GitLab - - -From 5a1503448739d2e2012bb0392711e3f6612df00f Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 6 Mar 2024 14:56:16 +0100 -Subject: [PATCH 50/71] aco/lower_to_hw_instr: Lower calls - ---- - src/amd/compiler/aco_lower_to_hw_instr.cpp | 10 ++++++++++ - 1 file changed, 10 insertions(+) - -diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp -index 1e1737319c3f6..c9a918d8a373f 100644 ---- a/src/amd/compiler/aco_lower_to_hw_instr.cpp -+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp -@@ -3017,6 +3017,16 @@ lower_to_hw_instr(Program* program) - } else if (instr->isMIMG() && instr->mimg().strict_wqm) { - lower_image_sample(&ctx, instr); - ctx.instructions.emplace_back(std::move(instr)); -+ } else if (instr->isCall()) { -+ PhysReg stack_reg = instr->operands[1].physReg(); -+ if (instr->operands[2].constantValue()) -+ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1), -+ Operand(stack_reg, s1), instr->operands[2]); -+ bld.sop1(aco_opcode::s_swappc_b64, Definition(instr->operands[0].physReg(), s2), -+ Operand(instr->operands[4].physReg(), s2)); -+ if (instr->operands[2].constantValue()) -+ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1), -+ Operand(stack_reg, s1), instr->operands[2]); - } else { - ctx.instructions.emplace_back(std::move(instr)); - } --- -GitLab - - -From 6a4e937529ba36e41712205f201a308e98c6a8c9 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 13 Mar 2024 10:59:52 +0100 -Subject: [PATCH 51/71] aco/live_var_analysis: Handle calls - ---- - src/amd/compiler/aco_live_var_analysis.cpp | 47 ++++++++++++++++++++++ - 1 file changed, 47 insertions(+) - -diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp -index a635c94496143..64814e983bb2e 100644 ---- a/src/amd/compiler/aco_live_var_analysis.cpp -+++ b/src/amd/compiler/aco_live_var_analysis.cpp -@@ -29,9 +29,46 @@ 
get_temp_register_demand(Instruction* instr, RegisterDemand& demand_before, Regi - demand_before += op.getTemp(); - } - } -+ -+ if (instr->isCall()) -+ demand_after += instr->call().blocked_abi_demand; - } - } - -+void -+compute_blocked_abi_demand(Program* program, unsigned linear_vgpr_demand, Pseudo_call_instruction& instr) -+{ -+ const unsigned max_vgpr = get_addr_vgpr_from_waves(program, program->min_waves); -+ /* Linear VGPRs can intersect with preserved VGPRs, we insert spill code for them in -+ * spill_preserved. -+ */ -+ unsigned preserved_vgprs = max_vgpr - (instr.abi.clobberedRegs.vgpr.hi() - 256); -+ linear_vgpr_demand -= std::min(preserved_vgprs, linear_vgpr_demand); -+ -+ unsigned preserved_vgpr_demand = -+ instr.abi.clobberedRegs.vgpr.size - -+ std::min(linear_vgpr_demand, instr.abi.clobberedRegs.vgpr.size); -+ unsigned preserved_sgpr_demand = instr.abi.clobberedRegs.sgpr.size; -+ -+ /* Don't count definitions contained in clobbered call regs twice */ -+ for (auto& definition : instr.definitions) { -+ if (definition.isTemp() && definition.isFixed()) { -+ auto def_regs = PhysRegInterval{PhysReg{definition.physReg().reg()}, definition.size()}; -+ for (auto reg : def_regs) { -+ if (instr.abi.clobberedRegs.sgpr.contains(reg)) -+ --preserved_sgpr_demand; -+ if (instr.abi.clobberedRegs.vgpr.contains(reg)) -+ --preserved_vgpr_demand; -+ } -+ } -+ } -+ if (instr.abi.clobberedRegs.sgpr.contains(instr.operands[1].physReg()) && -+ !instr.operands[1].isKill()) -+ --preserved_sgpr_demand; -+ -+ instr.blocked_abi_demand = RegisterDemand(preserved_vgpr_demand, preserved_sgpr_demand); -+} -+ - RegisterDemand - get_live_changes(Instruction* instr) - { -@@ -313,6 +350,16 @@ process_live_temps_per_block(live_ctx& ctx, Block* block) - } - } - -+ if (insn->isCall()) { -+ unsigned linear_vgpr_demand = 0; -+ for (unsigned t : live) { -+ if (ctx.program->temp_rc[t].is_linear_vgpr()) -+ linear_vgpr_demand += ctx.program->temp_rc[t].size(); -+ } -+ compute_blocked_abi_demand(ctx.program, linear_vgpr_demand, insn->call()); -+ insn->register_demand += insn->call().blocked_abi_demand; -+ } -+ - operand_demand += new_demand; - insn->register_demand.update(operand_demand); - block->register_demand.update(insn->register_demand); --- -GitLab - - -From 66c7c6cc5e167e8a763fe17520e575ad6cae7f50 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 23 Mar 2024 10:29:13 +0100 -Subject: [PATCH 52/71] aco/ra: add utility to block interval - ---- - src/amd/compiler/aco_register_allocation.cpp | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index 4d73525bd0660..9012a742bda33 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -264,6 +264,8 @@ public: - fill(start, rc.size(), 0xFFFFFFFF); - } - -+ void block(PhysRegInterval interval) { fill(interval.lo(), interval.size, 0xFFFFFFFF); } -+ - bool is_blocked(PhysReg start) const - { - if (regs[start] == 0xFFFFFFFF) --- -GitLab - - -From f2f3a2b63f646a30906c47bac0bb095618b12e9f Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 23 Mar 2024 10:31:35 +0100 -Subject: [PATCH 53/71] aco/ra: handle clobbered regions by calls - ---- - src/amd/compiler/aco_register_allocation.cpp | 53 ++++++++++++++++++++ - 1 file changed, 53 insertions(+) - -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index 9012a742bda33..68502a79476e2 100644 ---- 
a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -2104,6 +2104,12 @@ handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, - bool found = false; - for (auto reg : regs.second) { - PhysRegInterval range = {reg, ctx.program->temp_rc[regs.first].size()}; -+ if (instr->isCall()) { -+ if (intersects(instr->call().abi.clobberedRegs.vgpr, range)) -+ continue; -+ if (intersects(instr->call().abi.clobberedRegs.sgpr, range)) -+ continue; -+ } - bool intersects_with_def = false; - for (const auto& def : instr->definitions) { - if (!def.isTemp() || !def.isFixed()) -@@ -3142,6 +3148,49 @@ register_allocation(Program* program, ra_test_policy policy) - register_file.clear(op); - } - -+ if (instr->isCall()) { -+ /* create parallelcopy pair to move blocking vars */ -+ RegisterFile tmp_file = register_file; -+ std::vector vars = -+ collect_vars(ctx, tmp_file, instr->call().abi.clobberedRegs.sgpr); -+ std::vector vars2 = -+ collect_vars(ctx, tmp_file, instr->call().abi.clobberedRegs.vgpr); -+ -+ /* Allow linear VGPRs in the clobbered range, they are spilled in spill_preserved. */ -+ for (auto it = vars2.begin(); it != vars2.end();) { -+ if (program->temp_rc[*it].is_linear_vgpr()) { -+ it = vars2.erase(it); -+ tmp_file.block(ctx.assignments[*it].reg, program->temp_rc[*it]); -+ } else { -+ ++it; -+ } -+ } -+ for (auto it = vars.begin(); it != vars.end();) { -+ if (instr->operands[1].tempId() == *it) -+ it = vars.erase(it); -+ else -+ ++it; -+ } -+ -+ vars.insert(vars.end(), vars2.begin(), vars2.end()); -+ -+ tmp_file.fill_killed_operands(instr.get()); -+ tmp_file.block(instr->call().abi.clobberedRegs.sgpr); -+ tmp_file.block(instr->call().abi.clobberedRegs.vgpr); -+ -+ adjust_max_used_regs(ctx, RegClass::s1, -+ instr->call().abi.clobberedRegs.sgpr.hi().reg() - 1); -+ adjust_max_used_regs(ctx, RegClass::v1, -+ instr->call().abi.clobberedRegs.vgpr.hi().reg() - 1); -+ -+ ASSERTED bool success = false; -+ success = -+ get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, PhysRegInterval{}); -+ assert(success); -+ -+ update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops); -+ } -+ - optimize_encoding(ctx, register_file, instr); - - /* Handle definitions which must have the same register as an operand. 
-@@ -3171,6 +3220,10 @@ register_allocation(Program* program, ra_test_policy policy) - RegisterFile tmp_file(register_file); - /* re-enable the killed operands, so that we don't move the blocking vars there */ - tmp_file.fill_killed_operands(instr.get()); -+ if (instr->isCall()) { -+ tmp_file.block(instr->call().abi.clobberedRegs.sgpr); -+ tmp_file.block(instr->call().abi.clobberedRegs.vgpr); -+ } - - ASSERTED bool success = false; - success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs); --- -GitLab - - -From 04f918a810d1b5953922cf91c9ea068a3d6c54db Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Fri, 3 May 2024 17:37:04 +0200 -Subject: [PATCH 54/71] aco/insert_waitcnt: Insert waitcnts before s_swappc too - ---- - src/amd/compiler/aco_insert_waitcnt.cpp | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp -index e6263d6f191f2..510ed8613c41d 100644 ---- a/src/amd/compiler/aco_insert_waitcnt.cpp -+++ b/src/amd/compiler/aco_insert_waitcnt.cpp -@@ -344,6 +344,10 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf - force_waitcnt(ctx, imm); - } - -+ if (instr->opcode == aco_opcode::s_swappc_b64) -+ u_foreach_bit (i, (~counter_vs) & ctx.nonzero) -+ imm[i] = 0; -+ - /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the - * overlapping waves proceed into the ordered section. - */ --- -GitLab - - -From 35688a25c2e66aa5a8ddbe2c2700cf0fe0e7642b Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 13 May 2024 06:30:07 +0200 -Subject: [PATCH 55/71] aco/ra: Add utility to clear PhysRegInterval - ---- - src/amd/compiler/aco_register_allocation.cpp | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index 68502a79476e2..eb87bf111f5a8 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -266,6 +266,8 @@ public: - - void block(PhysRegInterval interval) { fill(interval.lo(), interval.size, 0xFFFFFFFF); } - -+ void clear(PhysRegInterval interval) { fill(interval.lo(), interval.size, 0); } -+ - bool is_blocked(PhysReg start) const - { - if (regs[start] == 0xFFFFFFFF) --- -GitLab - - -From 15ce5c3c90909b56b7c62d00d7e5022f4244140e Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 18 May 2024 10:19:58 +0200 -Subject: [PATCH 56/71] aco/util: Add aco::unordered_set - ---- - src/amd/compiler/aco_util.h | 9 +++++++++ - 1 file changed, 9 insertions(+) - -diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h -index 68a6c686408f0..0c5f9566bd213 100644 ---- a/src/amd/compiler/aco_util.h -+++ b/src/amd/compiler/aco_util.h -@@ -20,6 +20,7 @@ - #include - #include - #include -+ #include <unordered_set> - #include - - namespace aco { -@@ -390,6 +391,14 @@ template , class Pred = std::equ - using unordered_map = - std::unordered_map>>; - -+/* -+ * aco::unordered_set - alias for std::unordered_set with monotonic_allocator -+ * -+ * This template specialization mimics std::pmr::unordered_set. -+ */ -+template <class Key, class Hash = std::hash<Key>, class Pred = std::equal_to<Key>> -+using unordered_set = std::unordered_set<Key, Hash, Pred, aco::monotonic_allocator<Key>>; -+ - /* - * Cache-friendly set of 32-bit IDs with fast insert/erase/lookup and - * the ability to efficiently iterate over contained elements.
--- -GitLab - - -From be7080caa16a484d00a6213c284f91421bb9abb1 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 13 May 2024 06:23:55 +0200 -Subject: [PATCH 57/71] aco: Add pass for spilling call-related VGPRs - -Spills preserved VGPRs for callees and linear VGPRs added by the -spiller. ---- - .../compiler/aco_instruction_selection.cpp | 65 ++- - src/amd/compiler/aco_interface.cpp | 2 + - src/amd/compiler/aco_ir.h | 11 +- - src/amd/compiler/aco_opcodes.py | 3 + - src/amd/compiler/aco_opt_value_numbering.cpp | 3 +- - src/amd/compiler/aco_register_allocation.cpp | 62 +- - src/amd/compiler/aco_spill_preserved.cpp | 547 ++++++++++++++++++ - src/amd/compiler/meson.build | 1 + - 8 files changed, 670 insertions(+), 24 deletions(-) - create mode 100644 src/amd/compiler/aco_spill_preserved.cpp - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index b926d357739a4..deb97c1867667 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -106,9 +106,21 @@ append_logical_start(Block* b) - } - - static void --append_logical_end(Block* b) -+append_logical_end(isel_context* ctx) - { -- Builder(NULL, b).pseudo(aco_opcode::p_logical_end); -+ Builder bld(ctx->program, ctx->block); -+ -+ Operand stack_ptr_op; -+ if (ctx->program->gfx_level >= GFX9) -+ stack_ptr_op = Operand(ctx->callee_info.stack_ptr.def.getTemp()); -+ else -+ stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, true, true)); -+ stack_ptr_op.setLateKill(true); -+ if (ctx->program->is_callee) -+ bld.pseudo(aco_opcode::p_reload_preserved_vgpr, bld.def(s1), bld.def(bld.lm), -+ bld.def(s1, scc), stack_ptr_op); -+ -+ bld.pseudo(aco_opcode::p_logical_end); - } - - Temp -@@ -10485,7 +10497,7 @@ void - begin_loop(isel_context* ctx, loop_context* lc) - { - // TODO: we might want to wrap the loop around a branch if exec.potentially_empty=true -- append_logical_end(ctx->block); -+ append_logical_end(ctx); - ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; - Builder bld(ctx->program, ctx->block); - bld.branch(aco_opcode::p_branch, bld.def(s2)); -@@ -10543,7 +10555,7 @@ end_loop(isel_context* ctx, loop_context* lc) - if (!ctx->cf_info.has_branch) { - unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx; - Builder bld(ctx->program, ctx->block); -- append_logical_end(ctx->block); -+ append_logical_end(ctx); - - /* No need to check exec.potentially_empty_break/continue originating inside the loop. 
In the - * only case where it's possible at this point (divergent break after divergent continue), we -@@ -10610,7 +10622,7 @@ emit_loop_jump(isel_context* ctx, bool is_break) - { - Builder bld(ctx->program, ctx->block); - Block* logical_target; -- append_logical_end(ctx->block); -+ append_logical_end(ctx); - unsigned idx = ctx->block->index; - - if (is_break) { -@@ -11072,7 +11084,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond, - { - ic->cond = cond; - -- append_logical_end(ctx->block); -+ append_logical_end(ctx); - ctx->block->kind |= block_kind_branch; - - /* branch to linear then block */ -@@ -11118,7 +11130,7 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic, - nir_selection_control sel_ctrl = nir_selection_control_none) - { - Block* BB_then_logical = ctx->block; -- append_logical_end(BB_then_logical); -+ append_logical_end(ctx); - /* branch from logical then block to invert block */ - aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); -@@ -11177,7 +11189,7 @@ static void - end_divergent_if(isel_context* ctx, if_context* ic) - { - Block* BB_else_logical = ctx->block; -- append_logical_end(BB_else_logical); -+ append_logical_end(ctx); - - /* branch from logical else block to endif block */ - aco_ptr branch; -@@ -11222,7 +11234,7 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) - { - assert(cond.regClass() == s1); - -- append_logical_end(ctx->block); -+ append_logical_end(ctx); - ctx->block->kind |= block_kind_uniform; - - aco_ptr branch; -@@ -11257,7 +11269,7 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic) - Block* BB_then = ctx->block; - - if (!ctx->cf_info.has_branch) { -- append_logical_end(BB_then); -+ append_logical_end(ctx); - /* branch from then block to endif block */ - aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); -@@ -11291,7 +11303,7 @@ end_uniform_if(isel_context* ctx, if_context* ic) - Block* BB_else = ctx->block; - - if (!ctx->cf_info.has_branch) { -- append_logical_end(BB_else); -+ append_logical_end(ctx); - /* branch from then block to endif block */ - aco_ptr branch; - branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); -@@ -12217,13 +12229,34 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c - ctx.program->is_callee = true; - - Instruction* startpgm = add_startpgm(&ctx, true); -+ -+ Builder bld(ctx.program, ctx.block); -+ -+ Operand stack_ptr_op; -+ if (ctx.program->gfx_level >= GFX9) -+ stack_ptr_op = Operand(ctx.callee_info.stack_ptr.def.getTemp()); -+ else -+ stack_ptr_op = Operand(load_scratch_resource(ctx.program, bld, true, true)); -+ stack_ptr_op.setLateKill(true); -+ bld.pseudo(aco_opcode::p_spill_preserved_vgpr, bld.def(s1), bld.def(bld.lm), -+ bld.def(s1, scc), stack_ptr_op); -+ - append_logical_start(ctx.block); - split_arguments(&ctx, startpgm); - visit_cf_list(&ctx, &impl->body); -- append_logical_end(ctx.block); -+ append_logical_end(&ctx); - ctx.block->kind |= block_kind_uniform; - - if (ctx.next_pc != Temp()) { -+ bld = Builder(ctx.program, ctx.block); -+ if (ctx.program->gfx_level >= GFX9) -+ stack_ptr_op = Operand(ctx.callee_info.stack_ptr.def.getTemp()); -+ else -+ stack_ptr_op = Operand(load_scratch_resource(ctx.program, bld, true, true)); -+ stack_ptr_op.setLateKill(true); -+ bld.pseudo(aco_opcode::p_reload_preserved_vgpr, bld.def(s1), bld.def(bld.lm), -+ bld.def(s1, scc), stack_ptr_op); -+ - 
insert_return(ctx); - - Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc)); -@@ -12503,7 +12536,7 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons - if (need_endpgm) { - program->config->float_mode = program->blocks[0].fp_mode.val; - -- append_logical_end(ctx.block); -+ append_logical_end(&ctx); - ctx.block->kind |= block_kind_uniform; - - if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) || -@@ -12918,7 +12951,7 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade - - program->config->float_mode = program->blocks[0].fp_mode.val; - -- append_logical_end(ctx.block); -+ append_logical_end(&ctx); - ctx.block->kind |= block_kind_uniform; - bld.sopp(aco_opcode::s_endpgm); - -@@ -13864,7 +13897,7 @@ select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config, - - program->config->float_mode = program->blocks[0].fp_mode.val; - -- append_logical_end(ctx.block); -+ append_logical_end(&ctx); - ctx.block->kind |= block_kind_export_end; - bld.reset(ctx.block); - bld.sopp(aco_opcode::s_endpgm); -@@ -13900,7 +13933,7 @@ select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config, - - program->config->float_mode = program->blocks[0].fp_mode.val; - -- append_logical_end(ctx.block); -+ append_logical_end(&ctx); - - build_end_with_regs(&ctx, regs); - -diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp -index 5c7956caeedd4..921fc3894c694 100644 ---- a/src/amd/compiler/aco_interface.cpp -+++ b/src/amd/compiler/aco_interface.cpp -@@ -172,6 +172,8 @@ aco_postprocess_shader(const struct aco_compiler_options* options, - validate(program.get()); - } - -+ spill_preserved(program.get()); -+ - ssa_elimination(program.get()); - } - -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index ccf2710d5453f..e2101ae5162bc 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2079,7 +2079,9 @@ is_dead(const std::vector& uses, const Instruction* instr) - { - if (instr->definitions.empty() || instr->isBranch() || instr->isCall() || - instr->opcode == aco_opcode::p_startpgm || instr->opcode == aco_opcode::p_init_scratch || -- instr->opcode == aco_opcode::p_dual_src_export_gfx11) -+ instr->opcode == aco_opcode::p_dual_src_export_gfx11 || -+ instr->opcode == aco_opcode::p_spill_preserved_vgpr || -+ instr->opcode == aco_opcode::p_reload_preserved_vgpr) - return false; - - if (std::any_of(instr->definitions.begin(), instr->definitions.end(), -@@ -2492,6 +2494,7 @@ void setup_reduce_temp(Program* program); - void lower_to_cssa(Program* program); - void register_allocation(Program* program, ra_test_policy = {}); - void reindex_ssa(Program* program); -+void spill_preserved(Program* program); - void ssa_elimination(Program* program); - void lower_to_hw_instr(Program* program); - void schedule_program(Program* program); -@@ -2608,4 +2611,10 @@ extern const Info instr_info; - - } // namespace aco - -+namespace std { -+template <> struct hash { -+ size_t operator()(aco::PhysReg temp) const noexcept { return std::hash{}(temp.reg_b); } -+}; -+} // namespace std -+ - #endif /* ACO_IR_H */ -diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py -index 696a5a945b310..8d0b93a044270 100644 ---- a/src/amd/compiler/aco_opcodes.py -+++ b/src/amd/compiler/aco_opcodes.py -@@ -333,6 +333,9 @@ insn("p_unit_test") - - insn("p_callee_stack_ptr") - -+insn("p_spill_preserved_vgpr") -+insn("p_reload_preserved_vgpr") 
-+ - insn("p_create_vector") - insn("p_extract_vector") - insn("p_split_vector") -diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp -index d5be9e9302d66..a199216907a5e 100644 ---- a/src/amd/compiler/aco_opt_value_numbering.cpp -+++ b/src/amd/compiler/aco_opt_value_numbering.cpp -@@ -313,7 +313,8 @@ can_eliminate(aco_ptr& instr) - if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi || - instr->opcode == aco_opcode::p_linear_phi || - instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id || -- instr->definitions[0].isNoCSE()) -+ instr->definitions[0].isNoCSE() || instr->opcode == aco_opcode::p_spill_preserved_vgpr || -+ instr->opcode == aco_opcode::p_reload_preserved_vgpr) - return false; - - return true; -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index eb87bf111f5a8..88f40f894e79c 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -19,12 +19,6 @@ - #include - #include - --namespace std { --template <> struct hash { -- size_t operator()(aco::PhysReg temp) const noexcept { return std::hash{}(temp.reg_b); } --}; --} // namespace std -- - namespace aco { - namespace { - -@@ -2492,6 +2486,23 @@ init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_block, Block& - const IDSet& live_in = live_out_per_block[block.index]; - assert(block.index != 0 || live_in.empty()); - -+ /* Callee shaders only get a chance to spill preserved registers after p_startpgm. -+ * To make sure nothing uses these regs until we can spill them, block them here. -+ */ -+ if (block.index == 0 && ctx.program->is_callee) { -+ PhysRegInterval preserved_vgpr_lo = PhysRegInterval{ -+ .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count}, -+ .size = -+ ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256 - ctx.program->arg_vgpr_count, -+ }; -+ PhysRegInterval preserved_vgpr_hi = PhysRegInterval{ -+ .lo_ = ctx.program->callee_abi.clobberedRegs.vgpr.hi(), -+ .size = PhysReg{256u + ctx.vgpr_limit} - ctx.program->callee_abi.clobberedRegs.vgpr.hi(), -+ }; -+ register_file.block(preserved_vgpr_hi); -+ register_file.block(preserved_vgpr_lo); -+ } -+ - if (block.kind & block_kind_loop_header) { - ctx.loop_header.emplace_back(block.index); - /* already rename phis incoming value */ -@@ -3093,6 +3104,31 @@ register_allocation(Program* program, ra_test_policy policy) - instructions.emplace_back(std::move(instr)); - break; - } -+ if (instr->opcode == aco_opcode::p_reload_preserved_vgpr && block.linear_succs.empty()) { -+ PhysRegInterval preserved_vgpr_lo = PhysRegInterval{ -+ .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count}, -+ .size = ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256u - -+ ctx.program->arg_vgpr_count, -+ }; -+ PhysRegInterval preserved_vgpr_hi = PhysRegInterval{ -+ .lo_ = ctx.program->callee_abi.clobberedRegs.vgpr.hi(), -+ .size = -+ PhysReg{256u + ctx.vgpr_limit} - ctx.program->callee_abi.clobberedRegs.vgpr.hi(), -+ }; -+ std::vector vars = collect_vars(ctx, register_file, preserved_vgpr_lo); -+ std::vector vars2 = collect_vars(ctx, register_file, preserved_vgpr_hi); -+ vars.insert(vars.end(), vars2.begin(), vars2.end()); -+ -+ register_file.block(preserved_vgpr_lo); -+ register_file.block(preserved_vgpr_hi); -+ -+ ASSERTED bool success = false; -+ success = get_regs_for_copies(ctx, register_file, parallelcopy, vars, instr, -+ PhysRegInterval{}); -+ assert(success); -+ -+ update_renames(ctx, register_file, 
parallelcopy, instr, (UpdateRenames)0); -+ } - - assert(!is_phi(instr)); - -@@ -3397,6 +3433,20 @@ register_allocation(Program* program, ra_test_policy policy) - instr->format = asVOP3(instr->format); - } - -+ if (instr->opcode == aco_opcode::p_spill_preserved_vgpr) { -+ PhysRegInterval preserved_vgpr_lo = PhysRegInterval{ -+ .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count}, -+ .size = ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256u - -+ ctx.program->arg_vgpr_count, -+ }; -+ PhysRegInterval preserved_vgpr_hi = PhysRegInterval{ -+ .lo_ = ctx.program->callee_abi.clobberedRegs.vgpr.hi(), -+ .size = -+ PhysReg{256u + ctx.vgpr_limit} - ctx.program->callee_abi.clobberedRegs.vgpr.hi(), -+ }; -+ register_file.clear(preserved_vgpr_hi); -+ register_file.clear(preserved_vgpr_lo); -+ } - instructions.emplace_back(std::move(*instr_it)); - - } /* end for Instr */ -diff --git a/src/amd/compiler/aco_spill_preserved.cpp b/src/amd/compiler/aco_spill_preserved.cpp -new file mode 100644 -index 0000000000000..a6a6dd04c2d9f ---- /dev/null -+++ b/src/amd/compiler/aco_spill_preserved.cpp -@@ -0,0 +1,547 @@ -+/* -+ * Copyright © 2024 Valve Corporation -+ * -+ * SPDX-License-Identifier: MIT -+ */ -+ -+#include "aco_builder.h" -+#include "aco_ir.h" -+ -+#include -+#include -+ -+namespace aco { -+ -+struct postdom_info { -+ unsigned logical_imm_postdom; -+ unsigned linear_imm_postdom; -+}; -+ -+struct spill_preserved_ctx { -+ Program* program; -+ aco::monotonic_buffer_resource memory; -+ -+ aco::unordered_map preserved_spill_offsets; -+ aco::unordered_set preserved_regs; -+ aco::unordered_set preserved_linear_regs; -+ -+ aco::unordered_map> reg_block_uses; -+ std::vector dom_info; -+ -+ unsigned next_preserved_offset; -+ -+ explicit spill_preserved_ctx(Program* program_) -+ : program(program_), memory(), preserved_spill_offsets(memory), preserved_regs(memory), -+ preserved_linear_regs(memory), reg_block_uses(memory), -+ next_preserved_offset( -+ DIV_ROUND_UP(program_->config->scratch_bytes_per_wave, program_->wave_size)) -+ { -+ dom_info.resize(program->blocks.size(), {-1u, -1u}); -+ } -+}; -+ -+void -+add_instr(spill_preserved_ctx& ctx, unsigned block_index, bool seen_reload, -+ const aco_ptr& instr) -+{ -+ for (auto& def : instr->definitions) { -+ assert(def.isFixed()); -+ if (def.regClass().type() == RegType::sgpr) -+ continue; -+ /* Round down subdword registers to their base */ -+ PhysReg start_reg = PhysReg{def.physReg().reg()}; -+ for (auto reg : PhysRegInterval{start_reg, def.regClass().size()}) { -+ if (reg < 256u + ctx.program->arg_vgpr_count) -+ continue; -+ if (ctx.program->callee_abi.clobberedRegs.vgpr.contains(reg) && -+ !def.regClass().is_linear_vgpr()) -+ continue; -+ /* Don't count start_linear_vgpr without a copy as a use since the value doesn't matter. -+ * This allows us to move reloads a bit further up the CF. 
-+ */ -+ if (instr->opcode == aco_opcode::p_start_linear_vgpr && instr->operands.empty()) -+ continue; -+ -+ if (def.regClass().is_linear_vgpr()) -+ ctx.preserved_linear_regs.insert(reg); -+ else -+ ctx.preserved_regs.insert(reg); -+ -+ if (seen_reload) { -+ if (def.regClass().is_linear_vgpr()) -+ for (auto succ : ctx.program->blocks[block_index].linear_succs) -+ ctx.reg_block_uses[reg].emplace(succ); -+ else -+ for (auto succ : ctx.program->blocks[block_index].logical_succs) -+ ctx.reg_block_uses[reg].emplace(succ); -+ } else { -+ ctx.reg_block_uses[reg].emplace(block_index); -+ } -+ } -+ } -+ for (auto& op : instr->operands) { -+ assert(op.isFixed()); -+ if (op.regClass().type() == RegType::sgpr) -+ continue; -+ if (op.isConstant()) -+ continue; -+ /* Round down subdword registers to their base */ -+ PhysReg start_reg = PhysReg{op.physReg().reg()}; -+ for (auto reg : PhysRegInterval{start_reg, op.regClass().size()}) { -+ if (reg < 256u + ctx.program->arg_vgpr_count) -+ continue; -+ /* Don't count end_linear_vgpr as a use since the value doesn't matter. -+ * This allows us to move reloads a bit further up the CF. -+ */ -+ if (instr->opcode == aco_opcode::p_end_linear_vgpr) -+ continue; -+ if (ctx.program->callee_abi.clobberedRegs.vgpr.contains(reg) && -+ !op.regClass().is_linear_vgpr()) -+ continue; -+ if (op.regClass().is_linear_vgpr()) -+ ctx.preserved_linear_regs.insert(reg); -+ -+ if (seen_reload) { -+ if (op.regClass().is_linear_vgpr()) -+ for (auto succ : ctx.program->blocks[block_index].linear_succs) -+ ctx.reg_block_uses[reg].emplace(succ); -+ else -+ for (auto succ : ctx.program->blocks[block_index].logical_succs) -+ ctx.reg_block_uses[reg].emplace(succ); -+ } else { -+ ctx.reg_block_uses[reg].emplace(block_index); -+ } -+ } -+ } -+} -+ -+void -+spill_preserved(spill_preserved_ctx& ctx, PhysReg reg, std::vector>& spills, -+ std::vector>& lvgpr_spills) -+{ -+ unsigned offset; -+ -+ auto offset_iter = ctx.preserved_spill_offsets.find(reg); -+ if (offset_iter == ctx.preserved_spill_offsets.end()) { -+ offset = ctx.next_preserved_offset; -+ ctx.next_preserved_offset += 4; -+ ctx.preserved_spill_offsets.emplace(reg, offset); -+ } else { -+ offset = offset_iter->second; -+ } -+ -+ if (ctx.preserved_linear_regs.find(reg) != ctx.preserved_linear_regs.end()) -+ lvgpr_spills.emplace_back(reg, offset); -+ else -+ spills.emplace_back(reg, offset); -+} -+ -+void -+emit_spills_reloads_internal(spill_preserved_ctx& ctx, Builder& bld, -+ std::vector>& spills, PhysReg stack_reg, -+ PhysReg soffset, bool reload, bool linear, bool soffset_valid) -+{ -+ if (spills.empty()) -+ return; -+ -+ int end_offset = spills.back().second; -+ int start_offset = spills.front().second; -+ if (ctx.program->gfx_level >= GFX9) -+ assert(end_offset - start_offset < ctx.program->dev.scratch_global_offset_max); -+ -+ bool overflow = -+ end_offset > ctx.program->dev.scratch_global_offset_max || ctx.program->gfx_level < GFX9; -+ if (overflow) { -+ if (ctx.program->gfx_level >= GFX9) -+ bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), Definition(scc, s1), -+ Operand(stack_reg, s1), Operand::c32(start_offset)); -+ else if (soffset_valid) -+ bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), Definition(scc, s1), -+ Operand(soffset, s1), Operand::c32(start_offset * ctx.program->wave_size)); -+ else -+ bld.sop1(aco_opcode::s_mov_b32, Definition(soffset, s1), -+ Operand::c32(start_offset * ctx.program->wave_size)); -+ } -+ -+ Operand soffset_op; -+ if (ctx.program->gfx_level >= GFX9) -+ soffset_op = 
Operand(overflow ? soffset : stack_reg, s1); -+ else -+ soffset_op = soffset_valid || overflow ? Operand(soffset, s1) : Operand(sgpr_null, s1); -+ -+ for (const auto& spill : spills) { -+ if (ctx.program->gfx_level >= GFX9) { -+ if (reload) -+ bld.scratch(aco_opcode::scratch_load_dword, -+ Definition(spill.first, linear ? v1.as_linear() : v1), Operand(v1), -+ soffset_op, overflow ? spill.second - start_offset : spill.second, -+ memory_sync_info(storage_vgpr_spill, semantic_private)); -+ else -+ bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), soffset_op, -+ Operand(spill.first, linear ? v1.as_linear() : v1), -+ overflow ? spill.second - start_offset : spill.second, -+ memory_sync_info(storage_vgpr_spill, semantic_private)); -+ } else { -+ if (reload) { -+ Instruction* instr = bld.mubuf( -+ aco_opcode::buffer_load_dword, Definition(spill.first, linear ? v1.as_linear() : v1), -+ Operand(stack_reg, s4), Operand(v1), soffset_op, -+ overflow ? spill.second - start_offset : spill.second, false); -+ instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); -+ instr->mubuf().cache.value = ac_swizzled; -+ } else { -+ Instruction* instr = -+ bld.mubuf(aco_opcode::buffer_store_dword, Operand(stack_reg, s4), Operand(v1), -+ soffset_op, Operand(spill.first, linear ? v1.as_linear() : v1), -+ overflow ? spill.second - start_offset : spill.second, false); -+ instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); -+ instr->mubuf().cache.value = ac_swizzled; -+ } -+ } -+ } -+ -+ if (overflow && ctx.program->gfx_level < GFX9) -+ bld.sop2(aco_opcode::s_sub_i32, Definition(soffset, s1), Definition(scc, s1), -+ Operand(soffset, s1), Operand::c32(start_offset * ctx.program->wave_size)); -+} -+ -+void -+emit_spills_reloads(spill_preserved_ctx& ctx, std::vector>& instructions, -+ std::vector>::iterator& insert_point, -+ std::vector>& spills, -+ std::vector>& lvgpr_spills, bool reload) -+{ -+ auto spill_reload_compare = [](const auto& first, const auto& second) -+ { return first.second < second.second; }; -+ -+ std::sort(spills.begin(), spills.end(), spill_reload_compare); -+ std::sort(lvgpr_spills.begin(), lvgpr_spills.end(), spill_reload_compare); -+ -+ PhysReg stack_reg = (*insert_point)->operands[0].physReg(); -+ PhysReg soffset = (*insert_point)->definitions[0].physReg(); -+ PhysReg exec_backup = (*insert_point)->definitions[1].physReg(); -+ -+ std::vector> spill_instructions; -+ Builder bld(ctx.program, &spill_instructions); -+ -+ emit_spills_reloads_internal(ctx, bld, spills, stack_reg, soffset, reload, false, false); -+ if (!lvgpr_spills.empty()) { -+ bld.sop1(Builder::s_or_saveexec, Definition(exec_backup, bld.lm), Definition(scc, s1), -+ Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm)); -+ emit_spills_reloads_internal(ctx, bld, lvgpr_spills, stack_reg, soffset, reload, true, false); -+ bld.sop1(Builder::WaveSpecificOpcode::s_mov, Definition(exec, bld.lm), -+ Operand(exec_backup, bld.lm)); -+ } -+ -+ insert_point = instructions.erase(insert_point); -+ instructions.insert(insert_point, std::move_iterator(spill_instructions.begin()), -+ std::move_iterator(spill_instructions.end())); -+} -+ -+void -+init_block_info(spill_preserved_ctx& ctx) -+{ -+ unsigned cur_loop_header = -1u; -+ for (unsigned index = ctx.program->blocks.size() - 1; index < ctx.program->blocks.size();) { -+ const Block& block = ctx.program->blocks[index]; -+ -+ if (block.linear_succs.empty()) { -+ ctx.dom_info[index].logical_imm_postdom = block.index; -+ 
ctx.dom_info[index].linear_imm_postdom = block.index; -+ } else { -+ int new_logical_postdom = -1; -+ int new_linear_postdom = -1; -+ for (unsigned succ_idx : block.logical_succs) { -+ if ((int)ctx.dom_info[succ_idx].logical_imm_postdom == -1) { -+ assert(cur_loop_header == -1u || succ_idx >= cur_loop_header); -+ if (cur_loop_header == -1u) -+ cur_loop_header = succ_idx; -+ continue; -+ } -+ -+ if (new_logical_postdom == -1) { -+ new_logical_postdom = (int)succ_idx; -+ continue; -+ } -+ -+ while ((int)succ_idx != new_logical_postdom) { -+ if ((int)succ_idx < new_logical_postdom) -+ succ_idx = ctx.dom_info[succ_idx].logical_imm_postdom; -+ if ((int)succ_idx > new_logical_postdom) -+ new_logical_postdom = (int)ctx.dom_info[new_logical_postdom].logical_imm_postdom; -+ } -+ } -+ -+ for (unsigned succ_idx : block.linear_succs) { -+ if ((int)ctx.dom_info[succ_idx].linear_imm_postdom == -1) { -+ assert(cur_loop_header == -1u || succ_idx >= cur_loop_header); -+ if (cur_loop_header == -1u) -+ cur_loop_header = succ_idx; -+ continue; -+ } -+ -+ if (new_linear_postdom == -1) { -+ new_linear_postdom = (int)succ_idx; -+ continue; -+ } -+ -+ while ((int)succ_idx != new_linear_postdom) { -+ if ((int)succ_idx < new_linear_postdom) -+ succ_idx = ctx.dom_info[succ_idx].linear_imm_postdom; -+ if ((int)succ_idx > new_linear_postdom) -+ new_linear_postdom = (int)ctx.dom_info[new_linear_postdom].linear_imm_postdom; -+ } -+ } -+ -+ ctx.dom_info[index].logical_imm_postdom = new_logical_postdom; -+ ctx.dom_info[index].linear_imm_postdom = new_linear_postdom; -+ } -+ -+ bool seen_reload_vgpr = false; -+ for (auto& instr : block.instructions) { -+ if (instr->opcode == aco_opcode::p_reload_preserved_vgpr) { -+ seen_reload_vgpr = true; -+ continue; -+ } -+ -+ add_instr(ctx, index, seen_reload_vgpr, instr); -+ } -+ -+ /* Process predecessors of loop headers again, since post-dominance information of the header -+ * was not available the first time -+ */ -+ unsigned next_idx = index - 1; -+ if (index == cur_loop_header) { -+ assert(block.kind & block_kind_loop_header); -+ for (auto pred : block.logical_preds) -+ if (ctx.dom_info[pred].logical_imm_postdom == -1u) -+ next_idx = std::max(next_idx, pred); -+ for (auto pred : block.linear_preds) -+ if (ctx.dom_info[pred].linear_imm_postdom == -1u) -+ next_idx = std::max(next_idx, pred); -+ cur_loop_header = -1u; -+ } -+ index = next_idx; -+ } -+} -+ -+struct call_spill { -+ unsigned instr_idx; -+ std::vector> spills; -+}; -+ -+void -+emit_call_spills(spill_preserved_ctx& ctx) -+{ -+ std::set linear_vgprs; -+ std::unordered_map> block_call_spills; -+ -+ unsigned max_scratch_offset = ctx.next_preserved_offset; -+ -+ for (auto& block : ctx.program->blocks) { -+ for (auto it = block.instructions.begin(); it != block.instructions.end(); ++it) { -+ auto& instr = *it; -+ -+ if (instr->opcode == aco_opcode::p_call) { -+ unsigned scratch_offset = ctx.next_preserved_offset; -+ struct call_spill spill = { -+ .instr_idx = (unsigned)(it - block.instructions.begin()), -+ }; -+ for (auto& reg : linear_vgprs) { -+ if (!instr->call().abi.clobberedRegs.vgpr.contains(reg)) -+ continue; -+ spill.spills.emplace_back(reg, scratch_offset); -+ scratch_offset += 4; -+ } -+ max_scratch_offset = std::max(max_scratch_offset, scratch_offset); -+ -+ block_call_spills[block.index].emplace_back(std::move(spill)); -+ } else if (instr->opcode == aco_opcode::p_start_linear_vgpr) { -+ linear_vgprs.insert(instr->definitions[0].physReg()); -+ } else if (instr->opcode == aco_opcode::p_end_linear_vgpr) { -+ for 
(auto& op : instr->operands) -+ linear_vgprs.erase(op.physReg()); -+ } -+ } -+ } -+ -+ /* XXX: This should also be possible on GFX9, although small negative scratch offsets -+ * seem to hang the GPU, so disable it there now. -+ */ -+ if (ctx.program->gfx_level >= GFX10) -+ for (auto& block_calls : block_call_spills) -+ for (auto& call_spills : block_calls.second) -+ for (auto& spill : call_spills.spills) -+ spill.second -= max_scratch_offset; -+ -+ for (auto& block_calls : block_call_spills) { -+ for (unsigned i = 0; i < block_calls.second.size(); ++i) { -+ auto& block = ctx.program->blocks[block_calls.first]; -+ auto& call = block_calls.second[i]; -+ auto& instr = block.instructions[call.instr_idx]; -+ auto it = block.instructions.begin() + call.instr_idx; -+ unsigned num_inserted_instrs = 0; -+ -+ std::vector> spill_instructions; -+ Builder bld(ctx.program, &spill_instructions); -+ -+ PhysReg stack_reg = instr->operands[1].physReg(); -+ PhysReg soffset = PhysReg{UINT32_MAX}; -+ PhysReg scratch_rsrc = PhysReg{UINT32_MAX}; -+ if (ctx.program->gfx_level < GFX9) -+ scratch_rsrc = instr->operands.back().physReg(); -+ -+ if (ctx.program->gfx_level >= GFX10) { -+ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1), -+ Operand(stack_reg, s1), Operand::c32(max_scratch_offset)); -+ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, false, true, -+ false); -+ } else if (ctx.program->gfx_level == GFX9) { -+ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, false, true, -+ false); -+ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1), -+ Operand(stack_reg, s1), Operand::c32(max_scratch_offset)); -+ } else { -+ emit_spills_reloads_internal(ctx, bld, call.spills, scratch_rsrc, stack_reg, false, -+ true, true); -+ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1), -+ Operand(stack_reg, s1), -+ Operand::c32(max_scratch_offset * ctx.program->wave_size)); -+ } -+ -+ it = block.instructions.insert(it, std::move_iterator(spill_instructions.begin()), -+ std::move_iterator(spill_instructions.end())); -+ it += spill_instructions.size() + 1; -+ num_inserted_instrs += spill_instructions.size(); -+ -+ spill_instructions.clear(); -+ -+ if (ctx.program->gfx_level >= GFX10) { -+ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, true, true, -+ false); -+ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1), -+ Operand(stack_reg, s1), Operand::c32(max_scratch_offset)); -+ } else if (ctx.program->gfx_level == GFX9) { -+ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1), -+ Operand(stack_reg, s1), Operand::c32(max_scratch_offset)); -+ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, true, true, -+ false); -+ } else { -+ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1), -+ Operand(stack_reg, s1), -+ Operand::c32(max_scratch_offset * ctx.program->wave_size)); -+ emit_spills_reloads_internal(ctx, bld, call.spills, scratch_rsrc, stack_reg, true, true, -+ true); -+ } -+ -+ block.instructions.insert(it, std::move_iterator(spill_instructions.begin()), -+ std::move_iterator(spill_instructions.end())); -+ num_inserted_instrs += spill_instructions.size(); -+ -+ for (unsigned j = i + 1; j < block_calls.second.size(); ++j) -+ block_calls.second[j].instr_idx += num_inserted_instrs; -+ } -+ } -+ -+ ctx.next_preserved_offset = max_scratch_offset; -+} -+ -+void 
-+emit_preserved_spills(spill_preserved_ctx& ctx) -+{ -+ std::vector> spills; -+ std::vector> lvgpr_spills; -+ -+ for (auto reg : ctx.preserved_regs) -+ spill_preserved(ctx, reg, spills, lvgpr_spills); -+ for (auto reg : ctx.preserved_linear_regs) -+ spill_preserved(ctx, reg, spills, lvgpr_spills); -+ -+ auto start_instr = std::find_if(ctx.program->blocks.front().instructions.begin(), -+ ctx.program->blocks.front().instructions.end(), -+ [](const auto& instr) -+ { return instr->opcode == aco_opcode::p_spill_preserved_vgpr; }); -+ emit_spills_reloads(ctx, ctx.program->blocks.front().instructions, start_instr, spills, -+ lvgpr_spills, false); -+ -+ auto block_reloads = -+ std::vector>>(ctx.program->blocks.size()); -+ auto lvgpr_block_reloads = -+ std::vector>>(ctx.program->blocks.size()); -+ -+ for (auto it = ctx.reg_block_uses.begin(); it != ctx.reg_block_uses.end();) { -+ bool is_linear = ctx.preserved_linear_regs.find(it->first) != ctx.preserved_linear_regs.end(); -+ -+ if (!is_linear && ctx.preserved_regs.find(it->first) == ctx.preserved_regs.end()) { -+ it = ctx.reg_block_uses.erase(it); -+ continue; -+ } -+ -+ unsigned min_common_postdom = 0; -+ -+ for (auto succ_idx : it->second) { -+ while (succ_idx != min_common_postdom) { -+ if (min_common_postdom < succ_idx) { -+ min_common_postdom = is_linear -+ ? ctx.dom_info[min_common_postdom].linear_imm_postdom -+ : ctx.dom_info[min_common_postdom].logical_imm_postdom; -+ } else { -+ succ_idx = is_linear ? ctx.dom_info[succ_idx].linear_imm_postdom -+ : ctx.dom_info[succ_idx].logical_imm_postdom; -+ } -+ } -+ } -+ -+ while (std::find_if(ctx.program->blocks[min_common_postdom].instructions.rbegin(), -+ ctx.program->blocks[min_common_postdom].instructions.rend(), -+ [](const auto& instr) { -+ return instr->opcode == aco_opcode::p_reload_preserved_vgpr; -+ }) == ctx.program->blocks[min_common_postdom].instructions.rend()) -+ min_common_postdom = is_linear ? 
ctx.dom_info[min_common_postdom].linear_imm_postdom -+ : ctx.dom_info[min_common_postdom].logical_imm_postdom; -+ -+ if (is_linear) { -+ lvgpr_block_reloads[min_common_postdom].emplace_back( -+ it->first, ctx.preserved_spill_offsets[it->first]); -+ ctx.preserved_linear_regs.erase(it->first); -+ } else { -+ block_reloads[min_common_postdom].emplace_back(it->first, -+ ctx.preserved_spill_offsets[it->first]); -+ ctx.preserved_regs.erase(it->first); -+ } -+ -+ it = ctx.reg_block_uses.erase(it); -+ } -+ -+ for (unsigned i = 0; i < ctx.program->blocks.size(); ++i) { -+ auto instr_it = std::find_if( -+ ctx.program->blocks[i].instructions.rbegin(), ctx.program->blocks[i].instructions.rend(), -+ [](const auto& instr) { return instr->opcode == aco_opcode::p_reload_preserved_vgpr; }); -+ if (instr_it == ctx.program->blocks[i].instructions.rend()) { -+ assert(block_reloads[i].empty() && lvgpr_block_reloads[i].empty()); -+ continue; -+ } -+ auto end_instr = std::prev(instr_it.base()); -+ emit_spills_reloads(ctx, ctx.program->blocks[i].instructions, end_instr, block_reloads[i], -+ lvgpr_block_reloads[i], true); -+ } -+} -+ -+void -+spill_preserved(Program* program) -+{ -+ if (!program->is_callee) -+ return; -+ -+ spill_preserved_ctx ctx(program); -+ -+ init_block_info(ctx); -+ -+ if (!program->bypass_reg_preservation) -+ emit_preserved_spills(ctx); -+ -+ emit_call_spills(ctx); -+ -+ program->config->scratch_bytes_per_wave = ctx.next_preserved_offset * program->wave_size; -+} -+} // namespace aco -diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build -index b235f626f97af..38006e78543dc 100644 ---- a/src/amd/compiler/meson.build -+++ b/src/amd/compiler/meson.build -@@ -62,6 +62,7 @@ libaco_files = files( - 'aco_scheduler.cpp', - 'aco_scheduler_ilp.cpp', - 'aco_spill.cpp', -+ 'aco_spill_preserved.cpp', - 'aco_ssa_elimination.cpp', - 'aco_statistics.cpp', - 'aco_util.h', --- -GitLab - - -From 35220611d653ced3a7ed06565c71815e9d135b5e Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 13 May 2024 06:26:51 +0200 -Subject: [PATCH 58/71] aco: Add cur_reg_demand to Program - -For checking whether spilling of preserved SGPRs is needed. 
---- - src/amd/compiler/aco_ir.h | 1 + - src/amd/compiler/aco_live_var_analysis.cpp | 1 + - 2 files changed, 2 insertions(+) - -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index e2101ae5162bc..6f510fef17a04 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2345,6 +2345,7 @@ public: - std::vector blocks; - std::vector temp_rc = {s1}; - RegisterDemand max_reg_demand = RegisterDemand(); -+ RegisterDemand cur_reg_demand = RegisterDemand(); - ac_shader_config* config; - struct aco_shader_info info; - enum amd_gfx_level gfx_level; -diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp -index 64814e983bb2e..52561464b0e1e 100644 ---- a/src/amd/compiler/aco_live_var_analysis.cpp -+++ b/src/amd/compiler/aco_live_var_analysis.cpp -@@ -565,6 +565,7 @@ update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) - uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); - uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); - -+ program->cur_reg_demand = new_demand; - /* this won't compile, register pressure reduction necessary */ - if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) { - program->num_waves = 0; --- -GitLab - - -From 20e1d11ec9b648ecc2d41bd5974c91545880e7b8 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 13 May 2024 06:28:31 +0200 -Subject: [PATCH 59/71] aco: Spill callee-preserved SGPRs - ---- - src/amd/compiler/aco_opcodes.py | 2 + - src/amd/compiler/aco_register_allocation.cpp | 46 ++++++- - src/amd/compiler/aco_scheduler.cpp | 8 ++ - src/amd/compiler/aco_spill.cpp | 119 +++++++++++++++++-- - 4 files changed, 167 insertions(+), 8 deletions(-) - -diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py -index 8d0b93a044270..a2f0876838f92 100644 ---- a/src/amd/compiler/aco_opcodes.py -+++ b/src/amd/compiler/aco_opcodes.py -@@ -334,7 +334,9 @@ insn("p_unit_test") - insn("p_callee_stack_ptr") - - insn("p_spill_preserved_vgpr") -+insn("p_spill_preserved_sgpr") - insn("p_reload_preserved_vgpr") -+insn("p_reload_preserved_sgpr") - - insn("p_create_vector") - insn("p_extract_vector") -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index 88f40f894e79c..b8915e520e8e1 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -3054,11 +3054,35 @@ register_allocation(Program* program, ra_test_policy policy) - ra_ctx ctx(program, policy); - get_affinities(ctx); - -+ std::unordered_set blocked_sgpr; -+ if (ctx.program->is_callee) { -+ PhysRegInterval preserved_sgpr_lo = PhysRegInterval{ -+ .lo_ = PhysReg{ctx.program->arg_sgpr_count}, -+ .size = ctx.program->callee_abi.clobberedRegs.sgpr.lo() - ctx.program->arg_sgpr_count, -+ }; -+ PhysRegInterval preserved_sgpr_hi = PhysRegInterval{ -+ .lo_ = ctx.program->callee_abi.clobberedRegs.sgpr.hi(), -+ .size = PhysReg{ctx.sgpr_limit} - ctx.program->callee_abi.clobberedRegs.sgpr.hi(), -+ }; -+ for (auto reg : preserved_sgpr_lo) { -+ blocked_sgpr.insert(reg); -+ adjust_max_used_regs(ctx, RegClass::s1, reg); -+ } -+ for (auto reg : preserved_sgpr_hi) { -+ blocked_sgpr.insert(reg); -+ adjust_max_used_regs(ctx, RegClass::s1, reg); -+ } -+ } -+ - for (Block& block : program->blocks) { - ctx.block = █ - - /* initialize register file */ - RegisterFile register_file = init_reg_file(ctx, program->live.live_in, block); -+ for (auto& reg : blocked_sgpr) { 
-+ if (register_file.is_empty_or_blocked(reg)) -+ register_file.block(reg, s1); -+ } - ctx.war_hint.reset(); - ctx.rr_vgpr_it = {PhysReg{256}}; - ctx.rr_sgpr_it = {PhysReg{0}}; -@@ -3104,7 +3128,27 @@ register_allocation(Program* program, ra_test_policy policy) - instructions.emplace_back(std::move(instr)); - break; - } -- if (instr->opcode == aco_opcode::p_reload_preserved_vgpr && block.linear_succs.empty()) { -+ if (instr->opcode == aco_opcode::p_spill_preserved_sgpr) { -+ if (register_file.is_blocked(instr->operands[0].physReg())) -+ register_file.clear(instr->operands[0]); -+ blocked_sgpr.erase(instr->operands[0].physReg()); -+ continue; -+ } else if (instr->opcode == aco_opcode::p_reload_preserved_sgpr) { -+ blocked_sgpr.insert(instr->operands[0].physReg()); -+ std::vector vars = collect_vars( -+ ctx, register_file, {instr->operands[0].physReg(), instr->operands[0].size()}); -+ register_file.block(instr->operands[0].physReg(), instr->operands[0].regClass()); -+ ASSERTED bool success = false; -+ success = get_regs_for_copies(ctx, register_file, parallelcopy, vars, instr, -+ PhysRegInterval{}); -+ assert(success); -+ -+ update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); -+ register_file.block(instr->operands[0].physReg(), instr->operands[0].regClass()); -+ emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file); -+ continue; -+ } else if (instr->opcode == aco_opcode::p_reload_preserved_vgpr && -+ block.linear_succs.empty()) { - PhysRegInterval preserved_vgpr_lo = PhysRegInterval{ - .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count}, - .size = ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256u - -diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp -index 4115c0bf3d7cf..e6eb1e49a4021 100644 ---- a/src/amd/compiler/aco_scheduler.cpp -+++ b/src/amd/compiler/aco_scheduler.cpp -@@ -1266,6 +1266,14 @@ schedule_program(Program* program) - assert(ctx.num_waves > 0); - ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2), - int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))}; -+ /* If not all preserved SGPRs in callee shaders were spilled, don't try using them for -+ * scheduling. -+ */ -+ if (program->is_callee) { -+ ctx.mv.max_registers.sgpr = -+ std::max(std::min(ctx.mv.max_registers.sgpr, program->cur_reg_demand.sgpr), -+ (int16_t)program->callee_abi.clobberedRegs.sgpr.size); -+ } - - /* NGG culling shaders are very sensitive to position export scheduling. 
- * Schedule less aggressively when early primitive export is used, and -diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp -index c271cbcf01eb8..e143b51809570 100644 ---- a/src/amd/compiler/aco_spill.cpp -+++ b/src/amd/compiler/aco_spill.cpp -@@ -75,6 +75,8 @@ struct spill_ctx { - std::vector> spills_entry; - std::vector> spills_exit; - -+ std::vector preserved_spill_ids; -+ - std::vector processed; - std::vector loop; - -@@ -138,11 +140,27 @@ struct spill_ctx { - for (auto pair : loop.back().spills) - add_interference(spill_id, pair.second); - } -+ for (auto id : preserved_spill_ids) -+ add_interference(spill_id, id); - - spills[to_spill] = spill_id; - return spill_id; - } - -+ uint32_t add_preserved_spill(RegClass rc, -+ std::vector>& block_spills) -+ { -+ const uint32_t spill_id = allocate_spill_id(rc); -+ for (auto& spills : block_spills) -+ for (auto pair : spills) -+ add_interference(spill_id, pair.second); -+ for (auto id : preserved_spill_ids) -+ add_interference(spill_id, id); -+ preserved_spill_ids.push_back(spill_id); -+ -+ return spill_id; -+ } -+ - void add_interference(uint32_t first, uint32_t second) - { - if (interferences[first].first.type() != interferences[second].first.type()) -@@ -1461,6 +1479,8 @@ end_unused_spill_vgprs(spill_ctx& ctx, Block& block, std::vector& vgpr_spi - if (pair.first.type() == RegType::sgpr && ctx.is_reloaded[pair.second]) - is_used[slots[pair.second] / ctx.wave_size] = true; - } -+ for (auto preserved : ctx.preserved_spill_ids) -+ is_used[slots[preserved] / ctx.wave_size] = true; - - std::vector temps; - for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { -@@ -1635,6 +1655,13 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) - } - } - -+ if (!(*it)->definitions[0].isTemp()) { -+ auto id_it = std::find(ctx.preserved_spill_ids.begin(), -+ ctx.preserved_spill_ids.end(), spill_id); -+ assert(id_it != ctx.preserved_spill_ids.end()); -+ ctx.preserved_spill_ids.erase(id_it); -+ } -+ - /* reload sgpr: just add the vgpr temp to operands */ - Instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1); - reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); -@@ -1653,6 +1680,37 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) - ctx.program->config->scratch_bytes_per_wave += ctx.vgpr_spill_slots * 4 * ctx.program->wave_size; - } - -+void -+spill_reload_preserved_sgpr(spill_ctx& ctx, std::vector>& spill_instructions, -+ std::vector>& reload_instructions, PhysReg reg) -+{ -+ uint32_t spill_id = ctx.add_preserved_spill(RegClass::s1, ctx.spills_exit); -+ -+ aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; -+ spill->operands[0] = Operand(reg, RegClass::s1); -+ spill->operands[1] = Operand::c32(spill_id); -+ -+ aco_ptr unblock{ -+ create_instruction(aco_opcode::p_spill_preserved_sgpr, Format::PSEUDO, 1, 0)}; -+ unblock->operands[0] = Operand(reg, RegClass::s1); -+ -+ spill_instructions.emplace_back(std::move(spill)); -+ spill_instructions.emplace_back(std::move(unblock)); -+ -+ aco_ptr block{ -+ create_instruction(aco_opcode::p_reload_preserved_sgpr, Format::PSEUDO, 1, 0)}; -+ block->operands[0] = Operand(reg, RegClass::s1); -+ -+ aco_ptr reload{create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)}; -+ reload->operands[0] = Operand::c32(spill_id); -+ reload->definitions[0] = Definition(reg, RegClass::s1); -+ -+ reload_instructions.emplace_back(std::move(block)); -+ reload_instructions.emplace_back(std::move(reload)); -+ 
-+ ctx.is_reloaded[spill_id] = true; -+} -+ - } /* end namespace */ - - void -@@ -1663,8 +1721,16 @@ spill(Program* program) - - program->progress = CompilationProgress::after_spilling; - -+ const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); -+ const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); -+ uint16_t abi_sgpr_limit = -+ std::min((uint16_t)(program->callee_abi.clobberedRegs.sgpr.size + program->arg_sgpr_count), -+ sgpr_limit); -+ if (!program->is_callee) -+ abi_sgpr_limit = sgpr_limit; -+ - /* no spilling when register pressure is low enough */ -- if (program->num_waves > 0) -+ if (program->num_waves > 0 && program->cur_reg_demand.sgpr <= abi_sgpr_limit) - return; - - /* lower to CSSA before spilling to ensure correctness w.r.t. phis */ -@@ -1672,14 +1738,12 @@ spill(Program* program) - - /* calculate target register demand */ - const RegisterDemand demand = program->max_reg_demand; /* current max */ -- const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); -- const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); - uint16_t extra_vgprs = 0; - uint16_t extra_sgprs = 0; - - /* calculate extra VGPRs required for spilling SGPRs */ -- if (demand.sgpr > sgpr_limit) { -- unsigned sgpr_spills = demand.sgpr - sgpr_limit; -+ if (demand.sgpr > abi_sgpr_limit) { -+ unsigned sgpr_spills = demand.sgpr - abi_sgpr_limit; - extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1; - } - /* add extra SGPRs required for spilling VGPRs */ -@@ -1688,9 +1752,9 @@ spill(Program* program) - extra_sgprs = 1; /* SADDR */ - else - extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */ -- if (demand.sgpr + extra_sgprs > sgpr_limit) { -+ if (demand.sgpr + extra_sgprs > abi_sgpr_limit) { - /* re-calculate in case something has changed */ -- unsigned sgpr_spills = demand.sgpr + extra_sgprs - sgpr_limit; -+ unsigned sgpr_spills = demand.sgpr + extra_sgprs - abi_sgpr_limit; - extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1; - } - } -@@ -1702,10 +1766,51 @@ spill(Program* program) - gather_ssa_use_info(ctx); - get_rematerialize_info(ctx); - -+ /* Prepare spilling of preserved SGPRs. Don't insert the instructions yet so live info -+ * stays valid. 
-+ */ -+ std::vector> preserved_spill_instructions; -+ std::vector> preserved_reload_instructions; -+ if (demand.sgpr > abi_sgpr_limit && ctx.program->is_callee) { -+ ctx.preserved_spill_ids.reserve(demand.sgpr - abi_sgpr_limit); -+ -+ for (PhysReg reg = PhysReg{program->arg_sgpr_count}; -+ reg < program->callee_abi.clobberedRegs.sgpr.lo(); reg = reg.advance(4)) -+ spill_reload_preserved_sgpr(ctx, preserved_spill_instructions, -+ preserved_reload_instructions, reg); -+ -+ unsigned max_reg = -+ std::min((unsigned)program->cur_reg_demand.sgpr + extra_sgprs, (unsigned)sgpr_limit); -+ for (PhysReg reg = program->callee_abi.clobberedRegs.sgpr.hi(); reg < max_reg; -+ reg = reg.advance(4)) -+ spill_reload_preserved_sgpr(ctx, preserved_spill_instructions, -+ preserved_reload_instructions, reg); -+ } -+ - /* create spills and reloads */ - for (unsigned i = 0; i < program->blocks.size(); i++) - spill_block(ctx, i); - -+ if (!preserved_spill_instructions.empty()) { -+ auto spill_insert_point = std::find_if( -+ program->blocks.front().instructions.begin(), program->blocks.front().instructions.end(), -+ [](const auto& instr) { return instr->opcode == aco_opcode::p_spill_preserved_vgpr; }); -+ assert(spill_insert_point != program->blocks.front().instructions.end()); -+ -+ spill_insert_point = std::next(spill_insert_point); -+ program->blocks.front().instructions.insert( -+ spill_insert_point, std::move_iterator(preserved_spill_instructions.begin()), -+ std::move_iterator(preserved_spill_instructions.end())); -+ -+ auto reload_insert_point = std::find_if( -+ program->blocks.back().instructions.begin(), program->blocks.back().instructions.end(), -+ [](const auto& instr) { return instr->opcode == aco_opcode::p_reload_preserved_vgpr; }); -+ assert(reload_insert_point != program->blocks.back().instructions.end()); -+ program->blocks.back().instructions.insert( -+ reload_insert_point, std::move_iterator(preserved_reload_instructions.begin()), -+ std::move_iterator(preserved_reload_instructions.end())); -+ } -+ - /* assign spill slots and DCE rematerialized code */ - assign_spill_slots(ctx, extra_vgprs); - --- -GitLab - - -From 3f8defc2ff59734c6e9b2bdc2554fc4f30204a1a Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 13 May 2024 06:29:40 +0200 -Subject: [PATCH 60/71] aco/ra: Also consider blocked registers as not - containing temps - ---- - src/amd/compiler/aco_register_allocation.cpp | 12 ++++++++++-- - 1 file changed, 10 insertions(+), 2 deletions(-) - -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index b8915e520e8e1..ff8475e19014d 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -231,6 +231,14 @@ public: - return res; - } - -+ unsigned count_zero_or_blocked(PhysRegInterval reg_interval) const -+ { -+ unsigned res = 0; -+ for (PhysReg reg : reg_interval) -+ res += !regs[reg] || regs[reg] == 0xFFFFFFFF; -+ return res; -+ } -+ - /* Returns true if any of the bytes in the given range are allocated or blocked */ - bool test(PhysReg start, unsigned num_bytes) const - { -@@ -3501,8 +3509,8 @@ register_allocation(Program* program, ra_test_policy policy) - - ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false); - ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false); -- assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds); -- assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds); -+ 
assert(register_file.count_zero_or_blocked(vgpr_bounds) == ctx.vgpr_bounds); -+ assert(register_file.count_zero_or_blocked(sgpr_bounds) == ctx.sgpr_bounds); - } else if (should_compact_linear_vgprs(ctx, register_file)) { - aco_ptr br = std::move(instructions.back()); - instructions.pop_back(); --- -GitLab - - -From 475664aaa95eaf7cf58abef67f524a658363d379 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Mon, 13 May 2024 06:30:35 +0200 -Subject: [PATCH 61/71] aco/ra: Skip blocked regs in get_reg_impl - ---- - src/amd/compiler/aco_register_allocation.cpp | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index ff8475e19014d..aec47824719a9 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -1307,7 +1307,7 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, std::vector -Date: Mon, 13 May 2024 06:31:01 +0200 -Subject: [PATCH 62/71] aco/isel: Bypass reg preservation for noreturn shaders - ---- - src/amd/compiler/aco_instruction_selection.cpp | 1 + - src/amd/compiler/aco_ir.h | 1 + - 2 files changed, 2 insertions(+) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index deb97c1867667..6c98777b12689 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -12261,6 +12261,7 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c - - Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc)); - } else { -+ ctx.program->bypass_reg_preservation = true; - Builder(ctx.program, ctx.block).sopp(aco_opcode::s_endpgm); - } - -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 6f510fef17a04..2ab9eaa5d653c 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2390,6 +2390,7 @@ public: - bool pending_lds_access = false; - - bool is_callee = false; -+ bool bypass_reg_preservation = false; - ABI callee_abi = {}; - unsigned short arg_sgpr_count; - unsigned short arg_vgpr_count; --- -GitLab - - -From 8de0e68756db0eea3b7e332bf47b295863de41a1 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Fri, 31 May 2024 16:46:28 +0200 -Subject: [PATCH 63/71] aco/ra: Add separate counter for blocked registers - -We can't assume blocked registers are free in get_reg_impl, but -we don't want to pessimize register usage estimations either. 
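The split this patch introduces can be illustrated outside of ACO. A minimal standalone sketch (RegTracker and its members are invented stand-ins, not ACO's ra_ctx): registers that are merely blocked, e.g. by a call ABI, raise the bound the allocator must respect, but only registers that actually hold temporaries feed the usage estimate.

    #include <algorithm>
    #include <cstdint>

    // Invented stand-in for the allocator's bookkeeping, not ACO's ra_ctx.
    struct RegTracker {
       uint16_t max_used = 0;    // one past the highest register holding a live temporary
       uint16_t max_blocked = 0; // one past the highest register blocked (e.g. ABI-clobbered)

       void note_used(uint16_t reg) { max_used = std::max<uint16_t>(max_used, reg + 1); }
       void note_blocked(uint16_t reg) { max_blocked = std::max<uint16_t>(max_blocked, reg + 1); }

       // allocation has to stay clear of both used and blocked registers
       uint16_t alloc_bound() const { return std::max(max_used, max_blocked); }
       // usage estimates (occupancy) only count registers that really hold values
       uint16_t usage_estimate() const { return max_used; }
    };

Keeping the two maxima separate is what lets the following hunks stop reporting ABI-clobbered ranges as "used".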
---- - src/amd/compiler/aco_register_allocation.cpp | 25 ++++++++++++++++---- - 1 file changed, 21 insertions(+), 4 deletions(-) - -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index aec47824719a9..5b4b50652006e 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -97,6 +97,8 @@ struct ra_ctx { - aco_ptr phi_dummy; - uint16_t max_used_sgpr = 0; - uint16_t max_used_vgpr = 0; -+ uint16_t max_blocked_sgpr = 0; -+ uint16_t max_blocked_vgpr = 0; - uint16_t sgpr_limit; - uint16_t vgpr_limit; - std::bitset<512> war_hint; -@@ -765,6 +767,21 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) - } - } - -+void -+adjust_max_blocked_regs(ra_ctx& ctx, RegType type, unsigned reg) -+{ -+ uint16_t max_addressible_sgpr = ctx.sgpr_limit; -+ if (type == RegType::vgpr) { -+ assert(reg >= 256); -+ uint16_t hi = reg - 256 - 1; -+ assert(hi <= 255); -+ ctx.max_blocked_vgpr = std::max(ctx.max_blocked_vgpr, hi); -+ } else if (reg <= max_addressible_sgpr) { -+ uint16_t hi = reg - 1; -+ ctx.max_blocked_sgpr = std::max(ctx.max_blocked_sgpr, std::min(hi, max_addressible_sgpr)); -+ } -+} -+ - enum UpdateRenames { - rename_not_killed_ops = 0x1, - }; -@@ -3268,10 +3285,10 @@ register_allocation(Program* program, ra_test_policy policy) - tmp_file.block(instr->call().abi.clobberedRegs.sgpr); - tmp_file.block(instr->call().abi.clobberedRegs.vgpr); - -- adjust_max_used_regs(ctx, RegClass::s1, -- instr->call().abi.clobberedRegs.sgpr.hi().reg() - 1); -- adjust_max_used_regs(ctx, RegClass::v1, -- instr->call().abi.clobberedRegs.vgpr.hi().reg() - 1); -+ adjust_max_blocked_regs(ctx, RegType::sgpr, -+ instr->call().abi.clobberedRegs.sgpr.hi().reg()); -+ adjust_max_blocked_regs(ctx, RegType::vgpr, -+ instr->call().abi.clobberedRegs.vgpr.hi().reg()); - - ASSERTED bool success = false; - success = --- -GitLab - - -From ffb65b8b229cab1e36a6334344088aa9f0928d3a Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 1 Jun 2024 11:50:04 +0200 -Subject: [PATCH 64/71] aco/spill: Don't spill scratch_rsrc-related temps - -These temps are used to create the scratch_rsrc. Spilling them will -never benefit anything, because assign_spill_slots will insert code -that keeps them live. Since the spiller assumes all spilled variables -to be dead, this can cause more variables being live than intended and -spilling to fail. 
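As a reading aid, here is a minimal sketch of the filter this patch adds, with invented stand-in types (Temp and Program below are not the real ACO classes): temps that feed scratch_rsrc creation are simply never considered as spill candidates.

    // Invented stand-ins; the real pass compares aco::Temp objects against spill_ctx's program.
    struct Temp {
       unsigned id = 0;
       bool operator==(const Temp& other) const { return id == other.id; }
    };
    struct Program {
       Temp stack_ptr, scratch_offset, private_segment_buffer;
    };

    // assign_spill_slots keeps these temps live to build the scratch resource, so spilling
    // them never reduces pressure and only breaks the spiller's "spilled == dead" assumption.
    bool may_spill(const Program& program, const Temp& var)
    {
       return !(var == program.stack_ptr || var == program.scratch_offset ||
                var == program.private_segment_buffer);
    }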
---- - src/amd/compiler/aco_spill.cpp | 10 +++++++++- - 1 file changed, 9 insertions(+), 1 deletion(-) - -diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp -index e143b51809570..b36a15b68e553 100644 ---- a/src/amd/compiler/aco_spill.cpp -+++ b/src/amd/compiler/aco_spill.cpp -@@ -371,6 +371,9 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) - if (var.type() != type || ctx.spills_entry[block_idx].count(var) || - var.regClass().is_linear_vgpr()) - continue; -+ if (var == ctx.program->stack_ptr || var == ctx.program->scratch_offset || -+ var == ctx.program->private_segment_buffer) -+ continue; - - unsigned can_remat = ctx.remat.count(var); - if (can_remat > remat || (can_remat == remat && ctx.ssa_infos[t].score() > score)) { -@@ -415,7 +418,8 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) - continue; - Temp var = phi->definitions[0].getTemp(); - if (var.type() == type && !ctx.spills_entry[block_idx].count(var) && -- ctx.ssa_infos[var.id()].score() > score) { -+ ctx.ssa_infos[var.id()].score() > score && var != ctx.program->stack_ptr && -+ var != ctx.program->scratch_offset && var != ctx.program->private_segment_buffer) { - to_spill = var; - score = ctx.ssa_infos[var.id()].score(); - } -@@ -965,6 +969,10 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - - if (can_rematerialize > do_rematerialize || loop_variable > avoid_respill || - ctx.ssa_infos[t].score() > score) { -+ if (var == ctx.program->stack_ptr || var == ctx.program->scratch_offset || -+ var == ctx.program->private_segment_buffer) -+ continue; -+ - unsigned cur_operand_idx = -1u; - bool can_spill = true; - for (auto it = instr->operands.begin(); it != instr->operands.end(); ++it) { --- -GitLab - - -From 524d5f329cc352e8049ef573a728d47f2f6741e3 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Wed, 5 Jun 2024 11:06:32 +0200 -Subject: [PATCH 65/71] aco/spill: Ignore extra VGPRs/SGPRs for calls - -For VGPRs, we make sure they're spilled in the spill_preserved pass. -For SGPRs, we make sure to reinitialize scratch_rsrc after calls. 
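A sketch of the effect on the pressure check, using a simplified model of RegisterDemand (the types and the helper below are illustrative assumptions, not the spiller's real code): at a call site the target pressure is allowed to grow by the extra spill registers, because those are covered by preserved-register spilling and by re-initializing scratch_rsrc after the call.

    #include <cstdint>

    // Simplified model of ACO's RegisterDemand, not the real struct.
    struct Demand {
       int16_t vgpr = 0, sgpr = 0;
       bool exceeds(const Demand& o) const { return vgpr > o.vgpr || sgpr > o.sgpr; }
    };

    // Registers reserved for spill code don't have to stay free right at a call,
    // so the spill loop compares against a raised limit there.
    bool must_spill_here(const Demand& demand, const Demand& target,
                         const Demand& extra, bool at_call)
    {
       Demand limit = target;
       if (at_call) {
          limit.vgpr = int16_t(limit.vgpr + extra.vgpr);
          limit.sgpr = int16_t(limit.sgpr + extra.sgpr);
       }
       return demand.exceeds(limit);
    }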
---- - src/amd/compiler/aco_spill.cpp | 20 +++++++++++++++----- - 1 file changed, 15 insertions(+), 5 deletions(-) - -diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp -index b36a15b68e553..943a3788a15c2 100644 ---- a/src/amd/compiler/aco_spill.cpp -+++ b/src/amd/compiler/aco_spill.cpp -@@ -88,17 +88,20 @@ struct spill_ctx { - std::set unused_remats; - unsigned wave_size; - -+ RegisterDemand extra_demand; -+ - unsigned sgpr_spill_slots; - unsigned vgpr_spill_slots; - Temp scratch_rsrc; - -- spill_ctx(const RegisterDemand target_pressure_, Program* program_) -+ spill_ctx(const RegisterDemand target_pressure_, RegisterDemand extra_demand_, Program* program_) - : target_pressure(target_pressure_), program(program_), memory(), - renames(program->blocks.size(), aco::map(memory)), - spills_entry(program->blocks.size(), aco::unordered_map(memory)), - spills_exit(program->blocks.size(), aco::unordered_map(memory)), - processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()), -- remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0) -+ remat(memory), wave_size(program->wave_size), extra_demand(extra_demand_), -+ sgpr_spill_slots(0), vgpr_spill_slots(0) - {} - - void add_affinity(uint32_t first, uint32_t second) -@@ -943,8 +946,14 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - RegisterDemand new_demand = instr->register_demand; - std::optional live_changes; - -+ RegisterDemand ignored_regs = {}; -+ -+ /* We spill linear VGPRs for calls in spill_preserved */ -+ if (instr->isCall() || (!instructions.empty() && instructions.back()->isCall())) -+ ignored_regs += ctx.extra_demand; -+ - /* if reg pressure is too high, spill variable with furthest next use */ -- while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) { -+ while ((new_demand - spilled_registers).exceeds(ctx.target_pressure + ignored_regs)) { - float score = 0.0; - Temp to_spill = Temp(); - unsigned operand_idx = -1u; -@@ -953,7 +962,8 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - unsigned avoid_respill = 0; - - RegType type = RegType::sgpr; -- if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) -+ if (new_demand.vgpr - spilled_registers.vgpr > -+ (ctx.target_pressure.vgpr + ignored_regs.vgpr)) - type = RegType::vgpr; - - for (unsigned t : ctx.program->live.live_in[block_idx]) { -@@ -1770,7 +1780,7 @@ spill(Program* program) - const RegisterDemand target(vgpr_limit - extra_vgprs, sgpr_limit - extra_sgprs); - - /* initialize ctx */ -- spill_ctx ctx(target, program); -+ spill_ctx ctx(target, RegisterDemand(extra_vgprs, extra_sgprs), program); - gather_ssa_use_info(ctx); - get_rematerialize_info(ctx); - --- -GitLab - - -From 9bedff4e6eef064be53aaa64c14cb40318e311b9 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 1 Jun 2024 16:38:24 +0200 -Subject: [PATCH 66/71] aco: Add and set block->contains_call - ---- - src/amd/compiler/aco_instruction_selection.cpp | 1 + - src/amd/compiler/aco_ir.h | 1 + - 2 files changed, 2 insertions(+) - -diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp -index 6c98777b12689..fae8d57479bb8 100644 ---- a/src/amd/compiler/aco_instruction_selection.cpp -+++ b/src/amd/compiler/aco_instruction_selection.cpp -@@ -10939,6 +10939,7 @@ visit_call(isel_context* ctx, nir_call_instr* instr) - .return_info = std::move(return_infos), - .scratch_param_size = 
info.scratch_param_size, - }); -+ ctx->block->contains_call = true; - } - - void -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 2ab9eaa5d653c..14f2c07eda7a8 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2219,6 +2219,7 @@ struct Block { - /* this information is needed for predecessors to blocks with phis when - * moving out of ssa */ - bool scc_live_out = false; -+ bool contains_call = true; - - Block() : index(0) {} - }; --- -GitLab - - -From ca4c18e7be750667c68229346bba989d28255ceb Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 1 Jun 2024 12:00:48 +0200 -Subject: [PATCH 67/71] aco/spill: Reset scratch_rsrc on calls - ---- - src/amd/compiler/aco_spill.cpp | 46 ++++++++++++++++++++++++++++------ - 1 file changed, 39 insertions(+), 7 deletions(-) - -diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp -index 943a3788a15c2..61ffd57b497f9 100644 ---- a/src/amd/compiler/aco_spill.cpp -+++ b/src/amd/compiler/aco_spill.cpp -@@ -93,6 +93,7 @@ struct spill_ctx { - unsigned sgpr_spill_slots; - unsigned vgpr_spill_slots; - Temp scratch_rsrc; -+ unsigned scratch_rsrc_block = -1u; - - spill_ctx(const RegisterDemand target_pressure_, RegisterDemand extra_demand_, Program* program_) - : target_pressure(target_pressure_), program(program_), memory(), -@@ -1192,19 +1193,28 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, - bool overflow = (ctx.vgpr_spill_slots - 1) * 4 > offset_range; - - Builder rsrc_bld(ctx.program); -+ unsigned bld_block = block.index; - if (block.kind & block_kind_top_level) { - rsrc_bld.reset(&instructions); - } else if (ctx.scratch_rsrc == Temp() && (!overflow || ctx.program->gfx_level < GFX9)) { - Block* tl_block = █ -- while (!(tl_block->kind & block_kind_top_level)) -+ while (!(tl_block->kind & block_kind_top_level) && -+ std::find_if(tl_block->instructions.begin(), tl_block->instructions.end(), -+ [](auto& instr) -+ { return !instr || instr->isCall(); }) == tl_block->instructions.end()) - tl_block = &ctx.program->blocks[tl_block->linear_idom]; - - /* find p_logical_end */ -- std::vector>& prev_instructions = tl_block->instructions; -- unsigned idx = prev_instructions.size() - 1; -- while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end) -- idx--; -- rsrc_bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx)); -+ if (tl_block->kind & block_kind_top_level) { -+ std::vector>& prev_instructions = tl_block->instructions; -+ unsigned idx = prev_instructions.size() - 1; -+ while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end) -+ idx--; -+ rsrc_bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx)); -+ bld_block = tl_block->index; -+ } else { -+ rsrc_bld.reset(&instructions); -+ } - } - - /* If spilling overflows the constant offset range at any point, we need to emit the soffset -@@ -1232,10 +1242,13 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, - Operand(ctx.program->stack_ptr), Operand::c32(saddr)); - else - ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr)); -+ ctx.scratch_rsrc_block = bld_block; - } - } else { -- if (ctx.scratch_rsrc == Temp()) -+ if (ctx.scratch_rsrc == Temp()) { - ctx.scratch_rsrc = load_scratch_resource(ctx.program, rsrc_bld, overflow, true); -+ ctx.scratch_rsrc_block = bld_block; -+ } - - if (overflow) { - uint32_t soffset = -@@ -1571,6 +1584,22 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) - unsigned last_top_level_block_idx = 0; - for (Block& 
block : ctx.program->blocks) { - -+ if (ctx.scratch_rsrc_block < ctx.program->blocks.size() && -+ !(ctx.program->blocks[ctx.scratch_rsrc_block].kind & block_kind_top_level)) -+ ctx.scratch_rsrc = Temp(); -+ -+ if (block.kind & block_kind_loop_header) { -+ for (unsigned index = block.index; -+ index < ctx.program->blocks.size() && -+ ctx.program->blocks[index].loop_nest_depth >= block.loop_nest_depth; -+ ++index) { -+ if (ctx.program->blocks[index].contains_call) { -+ ctx.scratch_rsrc = Temp(); -+ break; -+ } -+ } -+ } -+ - if (block.kind & block_kind_top_level) { - last_top_level_block_idx = block.index; - -@@ -1588,6 +1617,9 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) - Builder bld(ctx.program, &instructions); - for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { - -+ if ((*it)->isCall()) -+ ctx.scratch_rsrc = Temp(); -+ - if ((*it)->opcode == aco_opcode::p_spill) { - uint32_t spill_id = (*it)->operands[1].constantValue(); - --- -GitLab - - -From b90eeacb2aa89b4d33315cc3e49c13611710d945 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Thu, 6 Jun 2024 08:08:02 +0200 -Subject: [PATCH 68/71] radv: Re-enable RT pipelines on GFX9+ - ---- - src/amd/vulkan/radv_physical_device.c | 6 +----- - 1 file changed, 1 insertion(+), 5 deletions(-) - -diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c -index 98826470d4d60..382195e70a77e 100644 ---- a/src/amd/vulkan/radv_physical_device.c -+++ b/src/amd/vulkan/radv_physical_device.c -@@ -111,14 +111,10 @@ radv_filter_minmax_enabled(const struct radv_physical_device *pdev) - bool - radv_enable_rt(const struct radv_physical_device *pdev, bool rt_pipelines) - { -- /* Temporarily under construction! */ -- if (rt_pipelines) -- return false; -- - if (pdev->info.gfx_level < GFX10_3 && !radv_emulate_rt(pdev)) - return false; - -- if (rt_pipelines && pdev->use_llvm) -+ if (rt_pipelines && (pdev->use_llvm || pdev->info.gfx_level < GFX9)) - return false; - - return true; --- -GitLab - - -From c73f158059b287185f612d3ea1e1ef8bcc46f58b Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Thu, 6 Jun 2024 08:03:43 +0200 -Subject: [PATCH 69/71] aco: Add separate register usage tracking for - ABI-preserved regs - -If a shader uses fewer registers than are preserved by an ABI, we'll -want to set the register demand to the actual register usage instead of -the demand set by preserved call registers. 
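The demand split can be sketched standalone (Demand and Instr below are invented simplifications of ACO's RegisterDemand and Instruction): per block, the "real" demand skips registers that a call only blocks for its ABI, so wave-count estimates follow what the shader itself uses.

    #include <algorithm>
    #include <vector>

    // Simplified models, not the real ACO types.
    struct Demand {
       int vgpr = 0, sgpr = 0;
       void update(const Demand& o) { vgpr = std::max(vgpr, o.vgpr); sgpr = std::max(sgpr, o.sgpr); }
       Demand minus(const Demand& o) const { return {vgpr - o.vgpr, sgpr - o.sgpr}; }
    };
    struct Instr {
       Demand register_demand;
       bool is_call = false;
       Demand blocked_abi_demand; // registers reserved only because of the call ABI
    };

    // "Real" demand per block: at calls, subtract what is merely blocked, not used.
    Demand real_block_demand(const std::vector<Instr>& instrs)
    {
       Demand real{};
       for (const Instr& i : instrs)
          real.update(i.is_call ? i.register_demand.minus(i.blocked_abi_demand)
                                : i.register_demand);
       return real;
    }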
- -Totals from 11 (0.01% of 81072) affected shaders: -MaxWaves: 120 -> 176 (+46.67%) -Instrs: 9493 -> 9516 (+0.24%) -CodeSize: 54868 -> 55012 (+0.26%); split: -0.03%, +0.29% -VGPRs: 1088 -> 640 (-41.18%) -Latency: 140184 -> 141125 (+0.67%); split: -0.06%, +0.73% -InvThroughput: 38824 -> 35752 (-7.91%); split: -7.93%, +0.02% -VClause: 256 -> 262 (+2.34%) -SClause: 129 -> 136 (+5.43%) -Copies: 1379 -> 1402 (+1.67%); split: -0.15%, +1.81% -VALU: 6386 -> 6405 (+0.30%); split: -0.03%, +0.33% -SALU: 968 -> 972 (+0.41%) -VMEM: 1028 -> 1030 (+0.19%) ---- - src/amd/compiler/aco_ir.h | 7 +++- - src/amd/compiler/aco_live_var_analysis.cpp | 24 +++++++---- - src/amd/compiler/aco_lower_to_cssa.cpp | 10 ++++- - src/amd/compiler/aco_register_allocation.cpp | 22 +++++----- - src/amd/compiler/aco_scheduler.cpp | 43 +++++++++++++++++++- - src/amd/compiler/aco_spill.cpp | 4 +- - 6 files changed, 84 insertions(+), 26 deletions(-) - -diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h -index 14f2c07eda7a8..92b21a8b4ed6a 100644 ---- a/src/amd/compiler/aco_ir.h -+++ b/src/amd/compiler/aco_ir.h -@@ -2346,7 +2346,7 @@ public: - std::vector blocks; - std::vector temp_rc = {s1}; - RegisterDemand max_reg_demand = RegisterDemand(); -- RegisterDemand cur_reg_demand = RegisterDemand(); -+ RegisterDemand max_real_reg_demand = RegisterDemand(); - ac_shader_config* config; - struct aco_shader_info info; - enum amd_gfx_level gfx_level; -@@ -2485,7 +2485,8 @@ void select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config, - void lower_phis(Program* program); - void lower_subdword(Program* program); - void calc_min_waves(Program* program); --void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); -+void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand, -+ const RegisterDemand new_real_demand); - void live_var_analysis(Program* program); - std::vector dead_code_analysis(Program* program); - void dominator_tree(Program* program); -@@ -2561,6 +2562,8 @@ int get_op_fixed_to_def(Instruction* instr); - RegisterDemand get_live_changes(Instruction* instr); - RegisterDemand get_temp_registers(Instruction* instr); - RegisterDemand get_temp_reg_changes(Instruction* instr); -+void compute_blocked_abi_demand(Program* program, unsigned linear_vgpr_demand, -+ Pseudo_call_instruction& instr); - - /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */ - uint16_t get_extra_sgprs(Program* program); -diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp -index 52561464b0e1e..9d6284b38e0a3 100644 ---- a/src/amd/compiler/aco_live_var_analysis.cpp -+++ b/src/amd/compiler/aco_live_var_analysis.cpp -@@ -207,6 +207,7 @@ void - process_live_temps_per_block(live_ctx& ctx, Block* block) - { - RegisterDemand new_demand; -+ RegisterDemand real_block_demand; - block->register_demand = RegisterDemand(); - IDSet live = compute_live_out(ctx, block); - -@@ -363,6 +364,10 @@ process_live_temps_per_block(live_ctx& ctx, Block* block) - operand_demand += new_demand; - insn->register_demand.update(operand_demand); - block->register_demand.update(insn->register_demand); -+ if (insn->isCall()) -+ real_block_demand.update(insn->register_demand - insn->call().blocked_abi_demand); -+ else -+ real_block_demand.update(insn->register_demand); - } - - /* handle phi definitions */ -@@ -419,6 +424,7 @@ process_live_temps_per_block(live_ctx& ctx, Block* block) - block->live_in_demand = new_demand; - 
block->live_in_demand.sgpr += 2; /* Add 2 SGPRs for potential long-jumps. */ - block->register_demand.update(block->live_in_demand); -+ ctx.program->max_real_reg_demand.update(real_block_demand); - ctx.program->max_reg_demand.update(block->register_demand); - ctx.handled_once = std::min(ctx.handled_once, block->index); - -@@ -559,29 +565,30 @@ max_suitable_waves(Program* program, uint16_t waves) - } - - void --update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) -+update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand, -+ const RegisterDemand new_real_demand) - { - assert(program->min_waves >= 1); - uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); - uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); - -- program->cur_reg_demand = new_demand; -+ program->max_reg_demand = new_demand; - /* this won't compile, register pressure reduction necessary */ - if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) { - program->num_waves = 0; -- program->max_reg_demand = new_demand; - } else { -- program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); -+ program->num_waves = -+ program->dev.physical_sgprs / get_sgpr_alloc(program, new_real_demand.sgpr); - uint16_t vgpr_demand = -- get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2; -+ get_vgpr_alloc(program, new_real_demand.vgpr) + program->config->num_shared_vgprs / 2; - program->num_waves = - std::min(program->num_waves, program->dev.physical_vgprs / vgpr_demand); - program->num_waves = std::min(program->num_waves, program->dev.max_waves_per_simd); - - /* Adjust for LDS and workgroup multiples and calculate max_reg_demand */ - program->num_waves = max_suitable_waves(program, program->num_waves); -- program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves); -- program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves); -+ program->max_real_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves); -+ program->max_real_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves); - } - } - -@@ -592,6 +599,7 @@ live_var_analysis(Program* program) - program->live.memory.release(); - program->live.live_in.resize(program->blocks.size(), IDSet(program->live.memory)); - program->max_reg_demand = RegisterDemand(); -+ program->max_real_reg_demand = RegisterDemand(); - program->needs_vcc = program->gfx_level >= GFX10; - - live_ctx ctx; -@@ -607,7 +615,7 @@ live_var_analysis(Program* program) - - /* calculate the program's register demand and number of waves */ - if (program->progress < CompilationProgress::after_ra) -- update_vgpr_sgpr_demand(program, program->max_reg_demand); -+ update_vgpr_sgpr_demand(program, program->max_reg_demand, program->max_real_reg_demand); - } - - } // namespace aco -diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp -index 4268e21d820d2..237aaa01f4bc7 100644 ---- a/src/amd/compiler/aco_lower_to_cssa.cpp -+++ b/src/amd/compiler/aco_lower_to_cssa.cpp -@@ -519,6 +519,7 @@ emit_parallelcopies(cssa_ctx& ctx) - } - - RegisterDemand new_demand; -+ RegisterDemand real_new_demand; - for (Block& block : ctx.program->blocks) { - /* Finally, rename coalesced phi operands */ - for (aco_ptr& phi : block.instructions) { -@@ -538,13 +539,18 @@ emit_parallelcopies(cssa_ctx& ctx) - - /* Resummarize the block's register demand */ - block.register_demand = 
block.live_in_demand; -- for (const aco_ptr& instr : block.instructions) -+ for (const aco_ptr& instr : block.instructions) { - block.register_demand.update(instr->register_demand); -+ if (instr->isCall()) -+ real_new_demand.update(instr->register_demand - instr->call().blocked_abi_demand); -+ else -+ real_new_demand.update(instr->register_demand); -+ } - new_demand.update(block.register_demand); - } - - /* Update max_reg_demand and num_waves */ -- update_vgpr_sgpr_demand(ctx.program, new_demand); -+ update_vgpr_sgpr_demand(ctx.program, new_demand, real_new_demand); - - assert(renames.empty()); - } -diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp -index 5b4b50652006e..e0d6f6bfeaf5a 100644 ---- a/src/amd/compiler/aco_register_allocation.cpp -+++ b/src/amd/compiler/aco_register_allocation.cpp -@@ -121,8 +121,8 @@ struct ra_ctx { - sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); - vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); - -- sgpr_bounds = program->max_reg_demand.sgpr; -- vgpr_bounds = program->max_reg_demand.vgpr; -+ sgpr_bounds = program->max_real_reg_demand.sgpr; -+ vgpr_bounds = program->max_real_reg_demand.vgpr; - num_linear_vgprs = 0; - } - }; -@@ -1426,16 +1426,18 @@ increase_register_file(ra_ctx& ctx, RegClass rc) - { - if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 && - ctx.vgpr_bounds < ctx.vgpr_limit) { -+ RegisterDemand new_demand = -+ RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_real_reg_demand.sgpr); - /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */ -- update_vgpr_sgpr_demand( -- ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr)); -+ update_vgpr_sgpr_demand(ctx.program, new_demand, new_demand); - -- ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr; -- } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) { -- update_vgpr_sgpr_demand( -- ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1)); -+ ctx.vgpr_bounds = ctx.program->max_real_reg_demand.vgpr; -+ } else if (rc.type() == RegType::sgpr && ctx.program->max_real_reg_demand.sgpr < ctx.sgpr_limit) { -+ RegisterDemand new_demand = -+ RegisterDemand(ctx.program->max_real_reg_demand.vgpr, ctx.sgpr_bounds + 1); -+ update_vgpr_sgpr_demand(ctx.program, new_demand, new_demand); - -- ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr; -+ ctx.sgpr_bounds = ctx.program->max_real_reg_demand.sgpr; - } else { - return false; - } -@@ -2049,7 +2051,7 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) - ; - if (reg < 0) { - reg = ctx.max_used_sgpr + 1; -- for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++) -+ for (; reg < ctx.program->max_real_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++) - ; - } - -diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp -index e6eb1e49a4021..438e45def661c 100644 ---- a/src/amd/compiler/aco_scheduler.cpp -+++ b/src/amd/compiler/aco_scheduler.cpp -@@ -1271,7 +1271,7 @@ schedule_program(Program* program) - */ - if (program->is_callee) { - ctx.mv.max_registers.sgpr = -- std::max(std::min(ctx.mv.max_registers.sgpr, program->cur_reg_demand.sgpr), -+ std::max(std::min(ctx.mv.max_registers.sgpr, program->max_reg_demand.sgpr), - (int16_t)program->callee_abi.clobberedRegs.sgpr.size); - } - -@@ -1291,10 +1291,49 @@ schedule_program(Program* program) - - /* 
update max_reg_demand and num_waves */ - RegisterDemand new_demand; -+ RegisterDemand real_new_demand; - for (Block& block : program->blocks) { - new_demand.update(block.register_demand); -+ if (block.contains_call) { -+ unsigned linear_vgpr_demand = 0; -+ for (auto t : program->live.live_in[block.index]) -+ if (program->temp_rc[t].is_linear_vgpr()) -+ linear_vgpr_demand += program->temp_rc[t].size(); -+ -+ for (unsigned i = block.instructions.size() - 1; i < block.instructions.size(); --i) { -+ Instruction* instr = block.instructions[i].get(); -+ -+ for (auto& def : instr->definitions) { -+ if (def.regClass().is_linear_vgpr() && !def.isKill()) -+ linear_vgpr_demand -= def.size(); -+ } -+ for (auto& op : instr->operands) { -+ if (op.regClass().is_linear_vgpr() && op.isFirstKill()) -+ linear_vgpr_demand += op.size(); -+ } -+ -+ if (!block.instructions[i]->isCall()) { -+ real_new_demand.update(block.instructions[i]->register_demand); -+ continue; -+ } -+ -+ compute_blocked_abi_demand(program, linear_vgpr_demand, instr->call()); -+ -+ const unsigned max_vgpr = get_addr_vgpr_from_waves(program, program->min_waves); -+ const unsigned max_sgpr = get_addr_sgpr_from_waves(program, program->min_waves); -+ -+ if (instr->call().abi.clobberedRegs.vgpr.hi() == PhysReg{256 + max_vgpr} && -+ instr->call().abi.clobberedRegs.sgpr.hi() == PhysReg{max_sgpr}) -+ real_new_demand.update(block.instructions[i]->register_demand - -+ instr->call().blocked_abi_demand); -+ else -+ real_new_demand.update(block.instructions[i]->register_demand); -+ } -+ } else { -+ real_new_demand.update(block.register_demand); -+ } - } -- update_vgpr_sgpr_demand(program, new_demand); -+ update_vgpr_sgpr_demand(program, new_demand, real_new_demand); - - /* Validate live variable information */ - if (!validate_live_vars(program)) -diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp -index 61ffd57b497f9..2ebe7c28fa8fd 100644 ---- a/src/amd/compiler/aco_spill.cpp -+++ b/src/amd/compiler/aco_spill.cpp -@@ -1780,7 +1780,7 @@ spill(Program* program) - abi_sgpr_limit = sgpr_limit; - - /* no spilling when register pressure is low enough */ -- if (program->num_waves > 0 && program->cur_reg_demand.sgpr <= abi_sgpr_limit) -+ if (program->num_waves > 0 && program->max_reg_demand.sgpr <= abi_sgpr_limit) - return; - - /* lower to CSSA before spilling to ensure correctness w.r.t. phis */ -@@ -1830,7 +1830,7 @@ spill(Program* program) - preserved_reload_instructions, reg); - - unsigned max_reg = -- std::min((unsigned)program->cur_reg_demand.sgpr + extra_sgprs, (unsigned)sgpr_limit); -+ std::min((unsigned)program->max_reg_demand.sgpr + extra_sgprs, (unsigned)sgpr_limit); - for (PhysReg reg = program->callee_abi.clobberedRegs.sgpr.hi(); reg < max_reg; - reg = reg.advance(4)) - spill_reload_preserved_sgpr(ctx, preserved_spill_instructions, --- -GitLab - - -From 450c3456e89dd5d8604128482be7768eebda4b1e Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Tue, 4 Jun 2024 15:08:48 +0200 -Subject: [PATCH 70/71] aco/spill: Restore registers spilled by call - immediately - -Makes for better latency hiding if we're not short on registers -otherwise. 
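A rough standalone sketch of the reload policy (the types and names below are invented simplifications, not the spiller's real data structures): temps that were pushed out only because of a call are pulled back in at the first instruction after it, as long as the pressure target still holds.

    #include <vector>

    // Simplified stand-ins for the spiller's bookkeeping.
    struct Demand {
       int vgpr = 0, sgpr = 0;
       bool exceeds(const Demand& o) const { return vgpr > o.vgpr || sgpr > o.sgpr; }
    };
    struct Temp { unsigned id = 0; int vgprs = 1; };

    // Reload call-spilled temps right after the call while there is register headroom.
    void reload_after_call(std::vector<Temp>& call_spills, const Demand& demand,
                           Demand& spilled, const Demand& target, std::vector<Temp>& reloads)
    {
       while (!call_spills.empty() &&
              !Demand{demand.vgpr - spilled.vgpr, demand.sgpr - spilled.sgpr}.exceeds(target)) {
          Temp t = call_spills.back();
          call_spills.pop_back();
          reloads.push_back(t);
          spilled.vgpr -= t.vgprs; // this temp counts as live again
       }
       call_spills.clear();
    }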
- -Totals from 7 (0.01% of 81072) affected shaders: -Instrs: 9084 -> 8980 (-1.14%) -CodeSize: 52564 -> 51976 (-1.12%) -SpillSGPRs: 244 -> 248 (+1.64%); split: -3.28%, +4.92% -SpillVGPRs: 360 -> 367 (+1.94%) -Latency: 138989 -> 135669 (-2.39%); split: -2.49%, +0.10% -InvThroughput: 35120 -> 35301 (+0.52%); split: -0.06%, +0.57% -VClause: 258 -> 241 (-6.59%) -SClause: 116 -> 117 (+0.86%) -Copies: 1290 -> 1311 (+1.63%) -Branches: 131 -> 119 (-9.16%) -VALU: 6125 -> 6143 (+0.29%); split: -0.20%, +0.49% -SALU: 920 -> 913 (-0.76%); split: -0.98%, +0.22% -VMEM: 1026 -> 989 (-3.61%) ---- - src/amd/compiler/aco_spill.cpp | 21 +++++++++++++++++++++ - 1 file changed, 21 insertions(+) - -diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp -index 2ebe7c28fa8fd..dea810ce42cf4 100644 ---- a/src/amd/compiler/aco_spill.cpp -+++ b/src/amd/compiler/aco_spill.cpp -@@ -908,6 +908,8 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - - auto& current_spills = ctx.spills_exit[block_idx]; - -+ std::vector call_spills; -+ - while (idx < block->instructions.size()) { - aco_ptr& instr = block->instructions[idx]; - -@@ -922,6 +924,22 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - - std::map> reloads; - -+ if (!call_spills.empty()) { -+ RegisterDemand demand = instr->register_demand; -+ while (!(demand - spilled_registers).exceeds(ctx.target_pressure) && -+ !call_spills.empty()) { -+ Temp old_tmp = call_spills.back(); -+ call_spills.pop_back(); -+ -+ Temp new_tmp = ctx.program->allocateTmp(ctx.program->temp_rc[old_tmp.id()]); -+ ctx.renames[block_idx][old_tmp] = new_tmp; -+ reloads[old_tmp] = std::make_pair(new_tmp, current_spills[old_tmp]); -+ current_spills.erase(old_tmp); -+ spilled_registers -= new_tmp; -+ } -+ call_spills.clear(); -+ } -+ - /* rename and reload operands */ - for (Operand& op : instr->operands) { - if (!op.isTemp()) -@@ -1051,6 +1069,9 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s - } - - uint32_t spill_id = ctx.add_to_spills(to_spill, current_spills); -+ if (instr->isCall()) -+ call_spills.emplace_back(to_spill); -+ - /* add interferences with reloads */ - for (std::pair>& pair : reloads) - ctx.add_interference(spill_id, pair.second.second); --- -GitLab - - -From ceea1b8cab549bf5b79c51c54c6f995a5fa79a62 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Tue, 4 Jun 2024 15:12:21 +0200 -Subject: [PATCH 71/71] aco/vn: Don't combine expressions across calls - -This increases live state across calls, which in turn increases spilling -and makes for slower shaders overall. 
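The mechanism can be sketched independently of ACO (ExprKey and its hash are invented illustrations; the real pass folds a per-block call counter into pass_flags instead): the call counter becomes part of the value-numbering key, so two otherwise identical expressions separated by a call never compare equal and the later one is not replaced by the earlier, pre-call value.

    #include <cstdint>
    #include <functional>

    // Invented illustration of a value-numbering key that is call-aware.
    struct ExprKey {
       uint32_t opcode = 0;
       uint32_t operand_hash = 0;
       uint32_t call_index = 0; // incremented every time a call instruction is seen

       bool operator==(const ExprKey& o) const
       {
          return opcode == o.opcode && operand_hash == o.operand_hash && call_index == o.call_index;
       }
    };

    struct ExprKeyHash {
       size_t operator()(const ExprKey& k) const
       {
          uint64_t v = (uint64_t(k.call_index) << 40) ^ (uint64_t(k.opcode) << 32) ^ k.operand_hash;
          return std::hash<uint64_t>{}(v);
       }
    };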
- -Totals from 7 (0.01% of 81072) affected shaders: -Instrs: 8980 -> 8955 (-0.28%); split: -0.88%, +0.60% -CodeSize: 51976 -> 51684 (-0.56%); split: -1.02%, +0.46% -SpillSGPRs: 248 -> 244 (-1.61%); split: -3.63%, +2.02% -SpillVGPRs: 367 -> 365 (-0.54%); split: -1.09%, +0.54% -Scratch: 32768 -> 31744 (-3.12%) -Latency: 135669 -> 128720 (-5.12%); split: -5.13%, +0.01% -InvThroughput: 35301 -> 34783 (-1.47%); split: -1.51%, +0.05% -VClause: 241 -> 242 (+0.41%) -SClause: 117 -> 120 (+2.56%) -Copies: 1311 -> 1338 (+2.06%); split: -0.69%, +2.75% -PreSGPRs: 899 -> 895 (-0.44%); split: -1.56%, +1.11% -PreVGPRs: 1103 -> 1099 (-0.36%) -VALU: 6143 -> 6098 (-0.73%); split: -1.22%, +0.49% -SALU: 913 -> 933 (+2.19%); split: -0.11%, +2.30% -VMEM: 989 -> 967 (-2.22%) -SMEM: 201 -> 214 (+6.47%) ---- - src/amd/compiler/aco_opt_value_numbering.cpp | 24 ++++++++++++++++++++ - 1 file changed, 24 insertions(+) - -diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp -index a199216907a5e..c35dbdaa5fcb7 100644 ---- a/src/amd/compiler/aco_opt_value_numbering.cpp -+++ b/src/amd/compiler/aco_opt_value_numbering.cpp -@@ -43,6 +43,8 @@ struct InstrHash { - for (const Operand& op : instr->operands) - hash = murmur_32_scramble(hash, op.constantValue()); - -+ hash = murmur_32_scramble(hash, instr->pass_flags >> 16); -+ - size_t data_size = get_instr_data_size(instr->format); - - /* skip format, opcode and pass_flags and op/def spans */ -@@ -240,6 +242,9 @@ struct vn_ctx { - expr_set expr_values; - aco::unordered_map renames; - -+ /* For each block, a counter of how many calls were encountered in the linear/logical CFG. */ -+ std::vector> call_indices; -+ - /* The exec id should be the same on the same level of control flow depth. - * Together with the check for dominator relations, it is safe to assume - * that the same exec_id also means the same execution mask. -@@ -254,6 +259,7 @@ struct vn_ctx { - for (Block& block : program->blocks) - size += block.instructions.size(); - expr_values.reserve(size); -+ call_indices.resize(program->blocks.size(), {0, 0}); - } - }; - -@@ -341,6 +347,13 @@ process_block(vn_ctx& ctx, Block& block) - std::vector> new_instructions; - new_instructions.reserve(block.instructions.size()); - -+ uint32_t linear_call_idx = 0; -+ uint32_t logical_call_idx = 0; -+ for (auto index : block.linear_preds) -+ linear_call_idx = std::max(linear_call_idx, ctx.call_indices[index].first); -+ for (auto index : block.logical_preds) -+ logical_call_idx = std::max(logical_call_idx, ctx.call_indices[index].second); -+ - for (aco_ptr& instr : block.instructions) { - /* first, rename operands */ - for (Operand& op : instr->operands) { -@@ -354,6 +367,10 @@ process_block(vn_ctx& ctx, Block& block) - if (instr->opcode == aco_opcode::p_discard_if || - instr->opcode == aco_opcode::p_demote_to_helper || instr->opcode == aco_opcode::p_end_wqm) - ctx.exec_id++; -+ if (instr->isCall()) { -+ ++linear_call_idx; -+ ++logical_call_idx; -+ } - - /* simple copy-propagation through renaming */ - bool copy_instr = -@@ -370,7 +387,12 @@ process_block(vn_ctx& ctx, Block& block) - continue; - } - -+ bool use_linear_call_idx = -+ std::any_of(instr->definitions.begin(), instr->definitions.end(), -+ [](const auto& def) { return def.regClass().is_linear(); }); -+ - instr->pass_flags = ctx.exec_id; -+ instr->pass_flags |= (use_linear_call_idx ? 
linear_call_idx : logical_call_idx) << 16; - std::pair res = ctx.expr_values.emplace(instr.get(), block.index); - - /* if there was already an expression with the same value number */ -@@ -409,6 +431,8 @@ process_block(vn_ctx& ctx, Block& block) - } - } - -+ ctx.call_indices[block.index] = {linear_call_idx, logical_call_idx}; -+ - block.instructions = std::move(new_instructions); - } - --- -GitLab -