diff --git a/patches/rtperf.diff b/patches/rtperf.diff new file mode 100644 index 0000000..f102864 --- /dev/null +++ b/patches/rtperf.diff @@ -0,0 +1,11451 @@ +From dc44099798c94c194dedcb107e7aadee0d4c8e0b Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Tue, 4 Jun 2024 15:09:20 +0200 +Subject: [PATCH 01/71] !29536 + +--- + src/amd/compiler/aco_interface.cpp | 2 + + src/amd/compiler/aco_ir.h | 1 + + src/amd/compiler/aco_vectorize_spills.cpp | 253 ++++++++++++++++++++++ + src/amd/compiler/meson.build | 1 + + 4 files changed, 257 insertions(+) + create mode 100644 src/amd/compiler/aco_vectorize_spills.cpp + +diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp +index bc24b31a5bb6a..32a28908f90f0 100644 +--- a/src/amd/compiler/aco_interface.cpp ++++ b/src/amd/compiler/aco_interface.cpp +@@ -152,6 +152,8 @@ aco_postprocess_shader(const struct aco_compiler_options* options, + schedule_program(program.get()); + validate(program.get()); + ++ vectorize_spills(program.get()); ++ + /* Register Allocation */ + register_allocation(program.get()); + +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 7542c1e0db143..96bc3c540e0bf 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2263,6 +2263,7 @@ void combine_delay_alu(Program* program); + bool dealloc_vgprs(Program* program); + void insert_NOPs(Program* program); + void form_hard_clauses(Program* program); ++void vectorize_spills(Program* program); + unsigned emit_program(Program* program, std::vector& code, + std::vector* symbols = NULL, bool append_endpgm = true); + /** +diff --git a/src/amd/compiler/aco_vectorize_spills.cpp b/src/amd/compiler/aco_vectorize_spills.cpp +new file mode 100644 +index 0000000000000..b989306b5a3c2 +--- /dev/null ++++ b/src/amd/compiler/aco_vectorize_spills.cpp +@@ -0,0 +1,253 @@ ++/* ++ * Copyright © 2024 Valve Corporation ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "aco_builder.h" ++#include "aco_ir.h" ++ ++#include ++ ++namespace aco { ++ ++struct vectorize_ctx { ++ std::vector> instrs_to_vectorize; ++ ++ std::vector> vectors; ++ std::vector> vectorized_instrs; ++ ++ std::vector component_idxs; ++ ++ std::unordered_set killed_soffset_ids; ++ std::unordered_set seen_soffset_ids; ++ ++ std::vector>::iterator insert_point; ++ Block* block; ++ Program* program; ++}; ++ ++void ++vectorize_and_insert(vectorize_ctx& ctx, bool store) ++{ ++ std::sort(ctx.instrs_to_vectorize.begin(), ctx.instrs_to_vectorize.end(), ++ [](const auto& one, const auto& other) ++ { return one->scratch().offset < other->scratch().offset; }); ++ ++ Builder instr_bld(ctx.program, &ctx.vectorized_instrs); ++ ++ for (unsigned i = 0; i < ctx.instrs_to_vectorize.size(); ++i) { ++ ctx.component_idxs.push_back(i); ++ for (auto j = i + 1; j < ctx.instrs_to_vectorize.size(); ++j) { ++ const auto& component = ctx.instrs_to_vectorize[ctx.component_idxs.back()]; ++ const auto& instr = ctx.instrs_to_vectorize[j]; ++ /* skip stores with unrelated soffset */ ++ if (instr->operands[1].tempId() != component->operands[1].tempId()) ++ continue; ++ int16_t next_offset; ++ if (store) ++ next_offset = component->scratch().offset + (int16_t)component->operands[2].bytes(); ++ else ++ next_offset = component->scratch().offset + (int16_t)component->definitions[0].bytes(); ++ ++ /* there's a gap, can't vectorize across it */ ++ if (instr->scratch().offset > next_offset) ++ break; ++ /* XXX: Hitting this means there are intersecting stores. This shouldn't happen! 
*/ ++ if (instr->scratch().offset != next_offset) ++ break; ++ ++ if (instr->operands[1].isKill()) ++ ctx.killed_soffset_ids.insert(instr->operands[1].tempId()); ++ ++ ctx.component_idxs.push_back(j); ++ } ++ ++ if (ctx.component_idxs.empty()) ++ continue; ++ ++ size_t comp_idx = 0; ++ while (comp_idx < ctx.component_idxs.size()) { ++ size_t vector_size = 4; ++ while (vector_size > ctx.component_idxs.size() - comp_idx) ++ vector_size >>= 1; ++ ++ auto& first_component = ctx.instrs_to_vectorize[ctx.component_idxs[comp_idx]]; ++ ++ if (vector_size == 1) { ++ ctx.vectorized_instrs.emplace_back(std::move(first_component)); ++ ++comp_idx; ++ continue; ++ } ++ ++ if (store) { ++ Temp vec_tmp = ctx.program->allocateTmp(RegClass(RegType::vgpr, vector_size)); ++ Instruction* vec = ++ create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, vector_size, 1); ++ for (unsigned c = 0; c < vector_size; ++c) { ++ auto& component = ctx.instrs_to_vectorize[ctx.component_idxs[comp_idx + c]]; ++ vec->operands[c] = component->operands[2]; ++ } ++ vec->definitions[0] = Definition(vec_tmp); ++ ctx.vectors.emplace_back(vec); ++ ++ aco_opcode opcode; ++ switch (vector_size) { ++ case 4: opcode = aco_opcode::scratch_store_dwordx4; break; ++ case 2: opcode = aco_opcode::scratch_store_dwordx2; break; ++ default: unreachable("invalid vector size"); ++ } ++ ++ Operand vec_op = Operand(vec_tmp); ++ vec_op.setFirstKill(true); ++ instr_bld.scratch(opcode, Operand(v1), first_component->operands[1], vec_op, ++ first_component->scratch().offset, first_component->scratch().sync); ++ } else { ++ Temp vec_tmp = ctx.program->allocateTmp(RegClass(RegType::vgpr, vector_size)); ++ ++ aco_opcode opcode; ++ switch (vector_size) { ++ case 4: opcode = aco_opcode::scratch_load_dwordx4; break; ++ case 2: opcode = aco_opcode::scratch_load_dwordx2; break; ++ default: unreachable("invalid vector size"); ++ } ++ ++ instr_bld.scratch(opcode, Definition(vec_tmp), Operand(v1), ++ first_component->operands[1], first_component->scratch().offset, ++ first_component->scratch().sync); ++ ++ Instruction* vec = ++ create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, vector_size); ++ for (unsigned c = 0; c < vector_size; ++c) { ++ auto& component = ctx.instrs_to_vectorize[ctx.component_idxs[comp_idx + c]]; ++ vec->definitions[c] = component->definitions[0]; ++ } ++ vec->operands[0] = Operand(vec_tmp); ++ vec->operands[0].setFirstKill(true); ++ ctx.vectors.emplace_back(vec); ++ } ++ comp_idx += vector_size; ++ } ++ ++ for (unsigned j = 0; j < ctx.component_idxs.size(); ++j) { ++ auto idx = ctx.component_idxs[j]; ++ ctx.instrs_to_vectorize.erase(ctx.instrs_to_vectorize.begin() + (idx - j)); ++ } ++ /* Adjust for deleted instruction */ ++ --i; ++ ++ ctx.component_idxs.clear(); ++ } ++ ++ for (auto it = ctx.vectorized_instrs.rbegin(); it != ctx.vectorized_instrs.rend(); ++it) { ++ auto soffset_id = (*it)->operands[1].tempId(); ++ if (ctx.seen_soffset_ids.find(soffset_id) == ctx.seen_soffset_ids.end()) { ++ if (ctx.killed_soffset_ids.find(soffset_id) != ctx.killed_soffset_ids.end()) ++ (*it)->operands[1].setFirstKill(true); ++ ctx.seen_soffset_ids.insert(soffset_id); ++ } ++ } ++ ++ if (store) { ++ ctx.insert_point = ++ ctx.block->instructions.insert(ctx.insert_point, std::move_iterator(ctx.vectors.begin()), ++ std::move_iterator(ctx.vectors.end())); ++ ctx.insert_point += ctx.vectors.size(); ++ ctx.insert_point = ctx.block->instructions.insert( ++ ctx.insert_point, std::move_iterator(ctx.vectorized_instrs.rbegin()), ++ 
std::move_iterator(ctx.vectorized_instrs.rend())); ++ ctx.insert_point += ctx.vectorized_instrs.size(); ++ } else { ++ ctx.insert_point = ctx.block->instructions.insert( ++ ctx.insert_point, std::move_iterator(ctx.vectorized_instrs.rbegin()), ++ std::move_iterator(ctx.vectorized_instrs.rend())); ++ ctx.insert_point += ctx.vectorized_instrs.size(); ++ ctx.insert_point = ++ ctx.block->instructions.insert(ctx.insert_point, std::move_iterator(ctx.vectors.begin()), ++ std::move_iterator(ctx.vectors.end())); ++ ctx.insert_point += ctx.vectors.size(); ++ } ++ ++ ctx.vectors.clear(); ++ ctx.vectorized_instrs.clear(); ++ ctx.instrs_to_vectorize.clear(); ++ ctx.seen_soffset_ids.clear(); ++ ctx.killed_soffset_ids.clear(); ++} ++ ++void ++vectorize_spills(Program* program) ++{ ++ vectorize_ctx ctx; ++ ctx.program = program; ++ aco::monotonic_buffer_resource memory; ++ ++ for (auto& block : program->blocks) { ++ ctx.block = █ ++ IDSet conflicting_temps(memory); ++ ++ /* Try vectorizing stores */ ++ for (auto it = block.instructions.begin(); it != block.instructions.end();) { ++ bool vectorize_now = !(*it)->isVMEM() && it != block.instructions.begin(); ++ ++ /* Only look for stores that kill their operand. We can move/combine these with other ++ * instructions without affecting register demand. ++ */ ++ if ((*it)->opcode == aco_opcode::scratch_store_dword && (*it)->operands[2].isKill() && ++ !(*it)->operands[2].regClass().is_subdword()) { ++ if (conflicting_temps.count((*it)->operands[2].tempId())) { ++ vectorize_now = true; ++ --it; ++ } else { ++ bool first = ctx.instrs_to_vectorize.empty(); ++ ctx.instrs_to_vectorize.emplace_back(std::move(*it)); ++ it = block.instructions.erase(it); ++ if (first) ++ ctx.insert_point = it; ++ continue; ++ } ++ } ++ ++ if (vectorize_now) { ++ auto clause_size = it - ctx.insert_point; ++ vectorize_and_insert(ctx, true); ++ it = ctx.insert_point + clause_size; ++ conflicting_temps = IDSet(memory); ++ } else { ++ for (auto& def : (*it)->definitions) ++ if (def.isTemp()) ++ conflicting_temps.insert(def.tempId()); ++ } ++ ++it; ++ } ++ /* Try vectorizing loads */ ++ for (auto it = block.instructions.begin(); it != block.instructions.end();) { ++ bool vectorize_now = !(*it)->isVMEM() && it != block.instructions.begin(); ++ for (auto& op : (*it)->operands) { ++ if (op.isTemp() && conflicting_temps.count(op.tempId())) { ++ vectorize_now = true; ++ --it; ++ } ++ } ++ ++ /* Loads that kill their definition are dead and shouldn't appear with spilling */ ++ if (!vectorize_now && (*it)->opcode == aco_opcode::scratch_load_dword && ++ !(*it)->definitions[0].isKill() && !(*it)->definitions[0].regClass().is_subdword()) { ++ ctx.instrs_to_vectorize.emplace_back(std::move(*it)); ++ conflicting_temps.insert((*it)->definitions[0].tempId()); ++ it = block.instructions.erase(it); ++ continue; ++ } ++ ++ if (vectorize_now) { ++ ctx.insert_point = it; ++ vectorize_and_insert(ctx, false); ++ it = ctx.insert_point; ++ conflicting_temps = IDSet(memory); ++ } ++ ++it; ++ } ++ } ++} ++ ++} // namespace aco +diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build +index ae2d6a41b793a..b235f626f97af 100644 +--- a/src/amd/compiler/meson.build ++++ b/src/amd/compiler/meson.build +@@ -66,6 +66,7 @@ libaco_files = files( + 'aco_statistics.cpp', + 'aco_util.h', + 'aco_validate.cpp', ++ 'aco_vectorize_spills.cpp', + ) + + cpp_args_aco = cpp.get_supported_arguments(['-fno-exceptions', '-fno-rtti', '-Wimplicit-fallthrough', '-Wshadow']) +-- +GitLab + + +From 
8123a30fc5553bbf237833fbb7a5b39ce677664d Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 25 Mar 2024 16:52:45 +0100 +Subject: [PATCH 02/71] !29576 + +--- + src/amd/compiler/aco_ir.h | 1 + + src/amd/compiler/aco_register_allocation.cpp | 316 +++++++++++-------- + 2 files changed, 193 insertions(+), 124 deletions(-) + +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 96bc3c540e0bf..8a501797092ed 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -742,6 +742,7 @@ public: + isPrecolored_ = isFixed_; + } + ++ + constexpr bool isConstant() const noexcept { return isConstant_; } + + constexpr bool isLiteral() const noexcept { return isConstant() && reg_ == 255; } +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index 7ff35c079e2ed..fc62487627fad 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -14,8 +14,17 @@ + #include + #include + #include ++#include ++#include ++#include + #include + ++namespace std { ++template <> struct hash { ++ size_t operator()(aco::PhysReg temp) const noexcept { return std::hash{}(temp.reg_b); } ++}; ++} // namespace std ++ + namespace aco { + namespace { + +@@ -29,6 +38,19 @@ void add_subdword_operand(ra_ctx& ctx, aco_ptr& instr, unsigned idx + void add_subdword_definition(Program* program, aco_ptr& instr, PhysReg reg, + bool allow_16bit_write); + ++struct parallelcopy { ++ constexpr parallelcopy() : skip_renaming(false) {} ++ constexpr parallelcopy(Operand op_, Definition def_) : op(op_), def(def_), skip_renaming(false) ++ {} ++ constexpr parallelcopy(Operand op_, Definition def_, bool skip_renaming_) ++ : op(op_), def(def_), skip_renaming(skip_renaming_) ++ {} ++ ++ Operand op; ++ Definition def; ++ bool skip_renaming; ++}; ++ + struct assignment { + PhysReg reg; + RegClass rc; +@@ -270,7 +292,11 @@ public: + std::array regs; + std::map> subdword_regs; + +- const uint32_t& operator[](PhysReg index) const { return regs[index]; } ++ const uint32_t& operator[](PhysReg index) const ++ { ++ assert(index.reg() < 512); ++ return regs[index]; ++ } + + uint32_t& operator[](PhysReg index) { return regs[index]; } + +@@ -357,7 +383,12 @@ public: + } + } + +- void clear(Operand op) { clear(op.physReg(), op.regClass()); } ++ void clear(Operand op) ++ { ++ if (op.isTemp() && get_id(op.physReg()) && !is_blocked(op.physReg())) ++ assert(get_id(op.physReg()) == op.tempId()); ++ clear(op.physReg(), op.regClass()); ++ } + + void fill(Definition def) + { +@@ -805,22 +836,21 @@ enum UpdateRenames { + MESA_DEFINE_CPP_ENUM_BITFIELD_OPERATORS(UpdateRenames); + + void +-update_renames(ra_ctx& ctx, RegisterFile& reg_file, +- std::vector>& parallelcopies, ++update_renames(ra_ctx& ctx, RegisterFile& reg_file, std::vector& parallelcopies, + aco_ptr& instr, UpdateRenames flags) + { + /* clear operands */ +- for (std::pair& copy : parallelcopies) { ++ for (parallelcopy& copy : parallelcopies) { + /* the definitions with id are not from this function and already handled */ +- if (copy.second.isTemp()) ++ if (copy.def.isTemp()) + continue; +- reg_file.clear(copy.first); ++ reg_file.clear(copy.op); + } + + /* allocate id's and rename operands: this is done transparently here */ + auto it = parallelcopies.begin(); + while (it != parallelcopies.end()) { +- if (it->second.isTemp()) { ++ if (it->def.isTemp()) { + ++it; + continue; + } +@@ -828,9 +858,9 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, + /* check if we 
moved a definition: change the register and remove copy */ + bool is_def = false; + for (Definition& def : instr->definitions) { +- if (def.isTemp() && def.getTemp() == it->first.getTemp()) { ++ if (def.isTemp() && def.getTemp() == it->op.getTemp()) { + // FIXME: ensure that the definition can use this reg +- def.setFixed(it->second.physReg()); ++ def.setFixed(it->def.physReg()); + reg_file.fill(def); + ctx.assignments[def.tempId()].reg = def.physReg(); + it = parallelcopies.erase(it); +@@ -842,34 +872,52 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, + continue; + + /* check if we moved another parallelcopy definition */ +- for (std::pair& other : parallelcopies) { +- if (!other.second.isTemp()) ++ for (parallelcopy& other : parallelcopies) { ++ if (!other.def.isTemp()) + continue; +- if (it->first.getTemp() == other.second.getTemp()) { +- other.second.setFixed(it->second.physReg()); +- ctx.assignments[other.second.tempId()].reg = other.second.physReg(); +- it = parallelcopies.erase(it); +- is_def = true; ++ if (it->op.getTemp() == other.def.getTemp()) { ++ bool other_ensures_precoloring = false; ++ + /* check if we moved an operand, again */ + bool fill = true; + for (Operand& op : instr->operands) { +- if (op.isTemp() && op.tempId() == other.second.tempId()) { +- // FIXME: ensure that the operand can use this reg +- op.setFixed(other.second.physReg()); +- fill = !op.isKillBeforeDef(); +- } ++ if (!op.isTemp() || op.tempId() != other.def.tempId()) ++ continue; ++ bool isKillBeforeDef = op.isFirstKillBeforeDef(); ++ fill = !isKillBeforeDef; ++ ++ if (other.def.physReg() == op.physReg() && op.isPrecolored()) ++ other_ensures_precoloring = true; ++ else ++ op.setFixed(it->def.physReg()); ++ break; ++ } ++ ++ Definition fill_def; ++ ++ if (other_ensures_precoloring) { ++ it->op = other.op; ++ ctx.assignments[other.op.tempId()].reg = it->def.physReg(); ++ fill_def = it->def; ++ } else { ++ other.def.setFixed(it->def.physReg()); ++ ctx.assignments[other.def.tempId()].reg = other.def.physReg(); ++ it = parallelcopies.erase(it); ++ fill_def = other.def; + } ++ is_def = true; ++ + if (fill) +- reg_file.fill(other.second); ++ reg_file.fill(fill_def); + break; + } + } + if (is_def) + continue; + +- std::pair& copy = *it; +- copy.second.setTemp(ctx.program->allocateTmp(copy.second.regClass())); +- ctx.assignments.emplace_back(copy.second.physReg(), copy.second.regClass()); ++ parallelcopy& copy = *it; ++ copy.def.setTemp(ctx.program->allocateTmp(copy.def.regClass())); ++ ctx.assignments.emplace_back(copy.def.physReg(), copy.def.regClass()); + assert(ctx.assignments.size() == ctx.program->peekAllocationId()); + + /* check if we moved an operand */ +@@ -879,19 +927,19 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, + Operand& op = instr->operands[i]; + if (!op.isTemp()) + continue; +- if (op.tempId() == copy.first.tempId()) { ++ if (op.tempId() == copy.op.tempId()) { + /* only rename precolored operands if the copy-location matches */ +- bool omit_renaming = op.isPrecolored() && op.physReg() != copy.second.physReg(); ++ bool omit_renaming = op.isPrecolored() && op.physReg() != copy.def.physReg(); + + /* Omit renaming in some cases for p_create_vector in order to avoid + * unnecessary shuffle code. */ + if (!(flags & rename_not_killed_ops) && !op.isKillBeforeDef()) { + omit_renaming = true; +- for (std::pair& pc : parallelcopies) { +- PhysReg def_reg = pc.second.physReg(); +- omit_renaming &= def_reg > copy.first.physReg() +- ? 
(copy.first.physReg() + copy.first.size() <= def_reg.reg()) +- : (def_reg + pc.second.size() <= copy.first.physReg().reg()); ++ for (parallelcopy& pc : parallelcopies) { ++ PhysReg def_reg = pc.def.physReg(); ++ omit_renaming &= def_reg > copy.op.physReg() ++ ? (copy.op.physReg() + copy.op.size() <= def_reg.reg()) ++ : (def_reg + pc.def.size() <= copy.op.physReg().reg()); + } + } + +@@ -905,8 +953,8 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, + if (omit_renaming) + continue; + +- op.setTemp(copy.second.getTemp()); +- op.setFixed(copy.second.physReg()); ++ op.setTemp(copy.def.getTemp()); ++ op.setFixed(copy.def.physReg()); + + fill = !op.isKillBeforeDef() || op.isPrecolored(); + } +@@ -914,7 +962,7 @@ update_renames(ra_ctx& ctx, RegisterFile& reg_file, + + /* Apply changes to register file. */ + if (fill) +- reg_file.fill(copy.second); ++ reg_file.fill(copy.def); + + ++it; + } +@@ -1050,7 +1098,7 @@ collect_vars(ra_ctx& ctx, RegisterFile& reg_file, const PhysRegInterval reg_inte + + std::optional + get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file, +- std::vector>& parallelcopies, ++ std::vector& parallelcopies, + aco_ptr& instr, const PhysRegInterval def_reg, + DefInfo info, unsigned id) + { +@@ -1102,8 +1150,7 @@ get_reg_for_create_vector_copy(ra_ctx& ctx, RegisterFile& reg_file, + } + + bool +-get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, +- std::vector>& parallelcopies, ++get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, std::vector& parallelcopies, + const std::vector& vars, aco_ptr& instr, + const PhysRegInterval def_reg) + { +@@ -1253,9 +1300,8 @@ get_regs_for_copies(ra_ctx& ctx, RegisterFile& reg_file, + } + + std::optional +-get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, +- std::vector>& parallelcopies, const DefInfo& info, +- aco_ptr& instr) ++get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, std::vector& parallelcopies, ++ const DefInfo& info, aco_ptr& instr) + { + const PhysRegInterval& bounds = info.bounds; + uint32_t size = info.size; +@@ -1381,7 +1427,7 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, + if (!is_phi(instr) && instr->opcode != aco_opcode::p_create_vector) + tmp_file.fill_killed_operands(instr.get()); + +- std::vector> pc; ++ std::vector pc; + if (!get_regs_for_copies(ctx, tmp_file, pc, vars, instr, best_win)) + return {}; + +@@ -1460,11 +1506,13 @@ struct IDAndInfo { + }; + + void +-add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val) ++add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val, bool add_to_ctx = true) + { +- ctx.renames[ctx.block->index][orig_val.id()] = new_val; + ctx.orig_names.emplace(new_val.id(), orig_val); +- ctx.assignments[orig_val.id()].renamed = true; ++ if (add_to_ctx) { ++ ctx.renames[ctx.block->index][orig_val.id()] = new_val; ++ ctx.assignments[orig_val.id()].renamed = true; ++ } + } + + /* Reallocates vars by sorting them and placing each variable after the previous +@@ -1473,7 +1521,7 @@ add_rename(ra_ctx& ctx, Temp orig_val, Temp new_val) + */ + PhysReg + compact_relocate_vars(ra_ctx& ctx, const std::vector& vars, +- std::vector>& parallelcopies, PhysReg start) ++ std::vector& parallelcopies, PhysReg start) + { + /* This function assumes RegisterDemand/live_var_analysis rounds up sub-dword + * temporary sizes to dwords. 
+@@ -1624,7 +1672,7 @@ get_reg_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, aco_ptr>& parallelcopies) ++ std::vector& parallelcopies) + { + PhysRegInterval linear_vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, true); + int zeros = reg_file.count_zero(linear_vgpr_bounds); +@@ -1650,7 +1698,7 @@ compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file, + */ + PhysReg + alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr& instr, +- std::vector>& parallelcopies) ++ std::vector& parallelcopies) + { + assert(instr->opcode == aco_opcode::p_start_linear_vgpr); + assert(instr->definitions.size() == 1 && instr->definitions[0].bytes() % 4 == 0); +@@ -1683,7 +1731,7 @@ alloc_linear_vgpr(ra_ctx& ctx, const RegisterFile& reg_file, aco_ptr> pc; ++ std::vector pc; + if (!ctx.policy.skip_optimistic_path && + get_regs_for_copies(ctx, tmp_file, pc, blocking_vars, instr, reg_win)) { + parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); +@@ -1734,7 +1782,7 @@ should_compact_linear_vgprs(ra_ctx& ctx, const RegisterFile& reg_file) + + PhysReg + get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, +- std::vector>& parallelcopies, aco_ptr& instr, ++ std::vector& parallelcopies, aco_ptr& instr, + int operand_index = -1) + { + auto split_vec = ctx.split_vectors.find(temp.id()); +@@ -1808,7 +1856,7 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, + return *res; + + /* try compacting the linear vgprs to make more space */ +- std::vector> pc; ++ std::vector pc; + if (info.rc.type() == RegType::vgpr && (ctx.block->kind & block_kind_top_level) && + compact_linear_vgprs(ctx, reg_file, pc)) { + parallelcopies.insert(parallelcopies.end(), pc.begin(), pc.end()); +@@ -1816,8 +1864,8 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, + /* We don't need to fill the copy definitions in because we don't care about the linear VGPR + * space here. */ + RegisterFile tmp_file(reg_file); +- for (std::pair& copy : pc) +- tmp_file.clear(copy.first); ++ for (parallelcopy& copy : pc) ++ tmp_file.clear(copy.op); + + return get_reg(ctx, tmp_file, temp, parallelcopies, instr, operand_index); + } +@@ -1875,8 +1923,7 @@ get_reg(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, + + PhysReg + get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, +- std::vector>& parallelcopies, +- aco_ptr& instr) ++ std::vector& parallelcopies, aco_ptr& instr) + { + RegClass rc = temp.regClass(); + /* create_vector instructions have different costs w.r.t. 
register coalescing */ +@@ -1993,7 +2040,7 @@ get_reg_create_vector(ra_ctx& ctx, const RegisterFile& reg_file, Temp temp, + std::vector vars = collect_vars(ctx, tmp_file, PhysRegInterval{best_pos, size}); + + bool success = false; +- std::vector> pc; ++ std::vector pc; + success = get_regs_for_copies(ctx, tmp_file, pc, vars, instr, PhysRegInterval{best_pos, size}); + + if (!success) { +@@ -2084,59 +2131,81 @@ operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr& instr, unsign + + void + handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, +- std::vector>& parallelcopy, +- aco_ptr& instr) ++ std::vector& parallelcopy, aco_ptr& instr) + { + assert(instr->operands.size() <= 128); + assert(parallelcopy.empty()); + + RegisterFile tmp_file(register_file); ++ std::unordered_map> temp_regs; ++ std::vector blocking_vars; + +- BITSET_DECLARE(mask, 128) = {0}; +- +- for (unsigned i = 0; i < instr->operands.size(); i++) { +- Operand& op = instr->operands[i]; +- +- if (!op.isPrecolored()) ++ for (auto it = instr->operands.begin(); it != instr->operands.end(); ++it) { ++ if (!it->isPrecolored()) + continue; + +- assert(op.isTemp()); +- PhysReg src = ctx.assignments[op.tempId()].reg; +- adjust_max_used_regs(ctx, op.regClass(), op.physReg()); ++ assert(it->isTemp()); ++ adjust_max_used_regs(ctx, it->regClass(), it->physReg()); ++ PhysReg src = ctx.assignments[it->tempId()].reg; ++ temp_regs[it->tempId()].emplace(it->physReg()); + +- if (op.physReg() == src) { +- tmp_file.block(op.physReg(), op.regClass()); +- continue; ++ if (src == it->physReg()) { ++ tmp_file.block(it->physReg(), it->regClass()); ++ } else { ++ /* clear from register_file so fixed operands are not collected be collect_vars() */ ++ if (!tmp_file.is_blocked(src)) ++ tmp_file.clear(src, it->regClass()); // TODO: try to avoid moving block vars to src + } + + /* An instruction can have at most one operand precolored to the same register. 
*/ + assert(std::none_of(parallelcopy.begin(), parallelcopy.end(), +- [&](auto copy) { return copy.second.physReg() == op.physReg(); })); ++ [&](auto copy) { return copy.def.physReg() == it->physReg(); })); ++ } ++ ++ for (auto& regs : temp_regs) { ++ PhysReg src = ctx.assignments[regs.first].reg; + +- /* clear from register_file so fixed operands are not collected be collect_vars() */ +- tmp_file.clear(src, op.regClass()); // TODO: try to avoid moving block vars to src ++ PhysReg live_reg = *regs.second.begin(); ++ if (regs.second.size() > 1) { ++ bool found = false; ++ for (auto reg : regs.second) { ++ PhysRegInterval range = {reg, ctx.program->temp_rc[regs.first].size()}; ++ bool intersects_with_def = false; ++ for (const auto& def : instr->definitions) { ++ if (!def.isTemp() || !def.isFixed()) ++ continue; ++ PhysRegInterval def_range = {def.physReg(), def.regClass().size()}; ++ if (intersects(def_range, range)) { ++ intersects_with_def = true; ++ break; ++ } ++ } ++ if (intersects_with_def) ++ continue; + +- BITSET_SET(mask, i); ++ if (!found || reg == src) { ++ live_reg = reg; ++ found = true; ++ if (reg == src) ++ break; ++ } ++ } ++ } + +- Operand pc_op(instr->operands[i].getTemp()); +- pc_op.setFixed(src); +- Definition pc_def = Definition(op.physReg(), pc_op.regClass()); +- parallelcopy.emplace_back(pc_op, pc_def); +- } ++ RegClass rc = ctx.program->temp_rc[regs.first]; + +- if (BITSET_IS_EMPTY(mask)) +- return; ++ for (auto reg : regs.second) { ++ if (reg == src) ++ continue; + +- unsigned i; +- std::vector blocking_vars; +- BITSET_FOREACH_SET (i, mask, instr->operands.size()) { +- Operand& op = instr->operands[i]; +- PhysRegInterval target{op.physReg(), op.size()}; +- std::vector blocking_vars2 = collect_vars(ctx, tmp_file, target); +- blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end()); ++ Definition copy_def = Definition(reg, rc); ++ parallelcopy.emplace_back(Operand(Temp(regs.first, rc), src), copy_def, reg != live_reg); + +- /* prevent get_regs_for_copies() from using these registers */ +- tmp_file.block(op.physReg(), op.regClass()); ++ PhysRegInterval target{reg, rc.size()}; ++ std::vector blocking_vars2 = collect_vars(ctx, tmp_file, target); ++ blocking_vars.insert(blocking_vars.end(), blocking_vars2.begin(), blocking_vars2.end()); ++ tmp_file.block(reg, rc); ++ } + } + + get_regs_for_copies(ctx, tmp_file, parallelcopy, blocking_vars, instr, PhysRegInterval()); +@@ -2145,8 +2214,8 @@ handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, + + void + get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, +- std::vector>& parallelcopy, +- aco_ptr& instr, Operand& operand, unsigned operand_index) ++ std::vector& parallelcopy, aco_ptr& instr, ++ Operand& operand, unsigned operand_index) + { + /* clear the operand in case it's only a stride mismatch */ + PhysReg src = ctx.assignments[operand.tempId()].reg; +@@ -2166,45 +2235,44 @@ get_reg_phi(ra_ctx& ctx, IDSet& live_in, RegisterFile& register_file, + std::vector>& instructions, Block& block, + aco_ptr& phi, Temp tmp) + { +- std::vector> parallelcopy; ++ std::vector parallelcopy; + PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, phi); + update_renames(ctx, register_file, parallelcopy, phi, rename_not_killed_ops); + + /* process parallelcopy */ +- for (std::pair pc : parallelcopy) { ++ for (struct parallelcopy pc : parallelcopy) { + /* see if it's a copy from a different phi */ + // TODO: prefer moving some previous phis over live-ins + // TODO: somehow prevent 
phis fixed before the RA from being updated (shouldn't be a + // problem in practice since they can only be fixed to exec) + Instruction* prev_phi = NULL; + for (auto phi_it = instructions.begin(); phi_it != instructions.end(); ++phi_it) { +- if ((*phi_it)->definitions[0].tempId() == pc.first.tempId()) ++ if ((*phi_it)->definitions[0].tempId() == pc.op.tempId()) + prev_phi = phi_it->get(); + } + if (prev_phi) { + /* if so, just update that phi's register */ +- prev_phi->definitions[0].setFixed(pc.second.physReg()); ++ prev_phi->definitions[0].setFixed(pc.def.physReg()); + register_file.fill(prev_phi->definitions[0]); +- ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.second.physReg(), +- pc.second.regClass()}; ++ ctx.assignments[prev_phi->definitions[0].tempId()] = {pc.def.physReg(), pc.def.regClass()}; + continue; + } + + /* rename */ +- auto orig_it = ctx.orig_names.find(pc.first.tempId()); +- Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.first.getTemp(); +- add_rename(ctx, orig, pc.second.getTemp()); ++ auto orig_it = ctx.orig_names.find(pc.op.tempId()); ++ Temp orig = orig_it != ctx.orig_names.end() ? orig_it->second : pc.op.getTemp(); ++ add_rename(ctx, orig, pc.def.getTemp()); + + /* otherwise, this is a live-in and we need to create a new phi + * to move it in this block's predecessors */ + aco_opcode opcode = +- pc.first.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; ++ pc.op.getTemp().is_linear() ? aco_opcode::p_linear_phi : aco_opcode::p_phi; + Block::edge_vec& preds = +- pc.first.getTemp().is_linear() ? block.linear_preds : block.logical_preds; ++ pc.op.getTemp().is_linear() ? block.linear_preds : block.logical_preds; + aco_ptr new_phi{create_instruction(opcode, Format::PSEUDO, preds.size(), 1)}; +- new_phi->definitions[0] = pc.second; ++ new_phi->definitions[0] = pc.def; + for (unsigned i = 0; i < preds.size(); i++) +- new_phi->operands[i] = Operand(pc.first); ++ new_phi->operands[i] = Operand(pc.op); + instructions.emplace_back(std::move(new_phi)); + + /* Remove from live_in, because handle_loop_phis() would re-create this phi later if this is +@@ -2916,7 +2984,7 @@ optimize_encoding(ra_ctx& ctx, RegisterFile& register_file, aco_ptr + } + + void +-emit_parallel_copy_internal(ra_ctx& ctx, std::vector>& parallelcopy, ++emit_parallel_copy_internal(ra_ctx& ctx, std::vector& parallelcopy, + aco_ptr& instr, + std::vector>& instructions, bool temp_in_scc, + RegisterFile& register_file) +@@ -2931,31 +2999,31 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vectoroperands[i] = parallelcopy[i].first; +- pc->definitions[i] = parallelcopy[i].second; ++ pc->operands[i] = parallelcopy[i].op; ++ pc->definitions[i] = parallelcopy[i].def; + assert(pc->operands[i].size() == pc->definitions[i].size()); + + /* it might happen that the operand is already renamed. we have to restore the + * original name. */ + auto it = ctx.orig_names.find(pc->operands[i].tempId()); + Temp orig = it != ctx.orig_names.end() ? 
it->second : pc->operands[i].getTemp(); +- add_rename(ctx, orig, pc->definitions[i].getTemp()); ++ add_rename(ctx, orig, pc->definitions[i].getTemp(), !parallelcopy[i].skip_renaming); + } + + if (temp_in_scc && (sgpr_operands_alias_defs || linear_vgpr)) { +@@ -2982,18 +3050,18 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector>& parallelcopy, ++emit_parallel_copy(ra_ctx& ctx, std::vector& parallelcopy, + aco_ptr& instr, std::vector>& instructions, + bool temp_in_scc, RegisterFile& register_file) + { + if (parallelcopy.empty()) + return; + +- std::vector> linear_vgpr; ++ std::vector linear_vgpr; + if (ctx.num_linear_vgprs) { + unsigned next = 0; + for (unsigned i = 0; i < parallelcopy.size(); i++) { +- if (parallelcopy[i].first.regClass().is_linear_vgpr()) { ++ if (parallelcopy[i].def.regClass().is_linear_vgpr()) { + linear_vgpr.push_back(parallelcopy[i]); + continue; + } +@@ -3063,7 +3131,7 @@ register_allocation(Program* program, ra_test_policy policy) + auto instr_it = std::find_if(block.instructions.begin(), block.instructions.end(), NonPhi); + for (; instr_it != block.instructions.end(); ++instr_it) { + aco_ptr& instr = *instr_it; +- std::vector> parallelcopy; ++ std::vector parallelcopy; + bool temp_in_scc = register_file[scc]; + + if (instr->opcode == aco_opcode::p_branch) { +@@ -3084,7 +3152,6 @@ register_allocation(Program* program, ra_test_policy policy) + /* rename operands */ + operand.setTemp(read_variable(ctx, operand.getTemp(), block.index)); + assert(ctx.assignments[operand.tempId()].assigned); +- + fixed |= + operand.isPrecolored() && ctx.assignments[operand.tempId()].reg != operand.physReg(); + } +@@ -3101,8 +3168,9 @@ register_allocation(Program* program, ra_test_policy policy) + } + } + +- if (fixed) ++ if (fixed) { + handle_fixed_operands(ctx, register_file, parallelcopy, instr); ++ } + + for (unsigned i = 0; i < instr->operands.size(); ++i) { + auto& operand = instr->operands[i]; +@@ -3347,7 +3415,7 @@ register_allocation(Program* program, ra_test_policy policy) + bool temp_in_scc = + register_file[scc] || (!br->operands.empty() && br->operands[0].physReg() == scc); + +- std::vector> parallelcopy; ++ std::vector parallelcopy; + compact_linear_vgprs(ctx, register_file, parallelcopy); + update_renames(ctx, register_file, parallelcopy, br, rename_not_killed_ops); + emit_parallel_copy_internal(ctx, parallelcopy, br, instructions, temp_in_scc, register_file); +-- +GitLab + + +From 51acc061a662fc8fcc1e257a12346474af5912d6 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 24 Jun 2024 16:48:43 +0200 +Subject: [PATCH 03/71] !29730 + +--- + src/amd/compiler/aco_ir.h | 1 + + src/amd/compiler/aco_live_var_analysis.cpp | 50 +++++--- + src/amd/compiler/aco_spill.cpp | 133 ++++++++++++++++++--- + 3 files changed, 151 insertions(+), 33 deletions(-) + +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 8a501797092ed..d838b728e19ce 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2314,6 +2314,7 @@ int get_op_fixed_to_def(Instruction* instr); + /* utilities for dealing with register demand */ + RegisterDemand get_live_changes(Instruction* instr); + RegisterDemand get_temp_registers(Instruction* instr); ++RegisterDemand get_temp_reg_changes(Instruction* instr); + + /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */ + uint16_t get_extra_sgprs(Program* program); +diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp +index 
8744258a1b9aa..a635c94496143 100644 +--- a/src/amd/compiler/aco_live_var_analysis.cpp ++++ b/src/amd/compiler/aco_live_var_analysis.cpp +@@ -9,6 +9,29 @@ + + namespace aco { + ++namespace { ++void ++get_temp_register_demand(Instruction* instr, RegisterDemand& demand_before, RegisterDemand& demand_after) ++{ ++ for (Definition def : instr->definitions) { ++ if (def.isKill()) ++ demand_after += def.getTemp(); ++ else if (def.isTemp()) ++ demand_before -= def.getTemp(); ++ } ++ ++ for (Operand op : instr->operands) { ++ if (op.isFirstKill() || op.isCopyKill()) { ++ demand_before += op.getTemp(); ++ if (op.isLateKill()) ++ demand_after += op.getTemp(); ++ } else if (op.isClobbered() && !op.isKill()) { ++ demand_before += op.getTemp(); ++ } ++ } ++} ++} ++ + RegisterDemand + get_live_changes(Instruction* instr) + { +@@ -34,27 +57,22 @@ get_temp_registers(Instruction* instr) + RegisterDemand demand_before; + RegisterDemand demand_after; + +- for (Definition def : instr->definitions) { +- if (def.isKill()) +- demand_after += def.getTemp(); +- else if (def.isTemp()) +- demand_before -= def.getTemp(); +- } +- +- for (Operand op : instr->operands) { +- if (op.isFirstKill() || op.isCopyKill()) { +- demand_before += op.getTemp(); +- if (op.isLateKill()) +- demand_after += op.getTemp(); +- } else if (op.isClobbered() && !op.isKill()) { +- demand_before += op.getTemp(); +- } +- } ++ get_temp_register_demand(instr, demand_before, demand_after); + + demand_after.update(demand_before); + return demand_after; + } + ++RegisterDemand get_temp_reg_changes(Instruction* instr) ++{ ++ RegisterDemand demand_before; ++ RegisterDemand demand_after; ++ ++ get_temp_register_demand(instr, demand_before, demand_after); ++ ++ return demand_after - demand_before; ++} ++ + namespace { + + struct live_ctx { +diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp +index ae7ae16e3298b..be45b0eda7632 100644 +--- a/src/amd/compiler/aco_spill.cpp ++++ b/src/amd/compiler/aco_spill.cpp +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -909,7 +910,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + /* the Operand is spilled: add it to reloads */ + Temp new_tmp = ctx.program->allocateTmp(op.regClass()); + ctx.renames[block_idx][op.getTemp()] = new_tmp; +- reloads[new_tmp] = std::make_pair(op.getTemp(), current_spills[op.getTemp()]); ++ reloads[op.getTemp()] = std::make_pair(new_tmp, current_spills[op.getTemp()]); + current_spills.erase(op.getTemp()); + spilled_registers -= new_tmp; + } +@@ -917,13 +918,17 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + /* check if register demand is low enough during and after the current instruction */ + if (block->register_demand.exceeds(ctx.target_pressure)) { + RegisterDemand new_demand = instr->register_demand; ++ std::optional live_changes; + + /* if reg pressure is too high, spill variable with furthest next use */ + while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) { + float score = 0.0; + Temp to_spill = Temp(); ++ unsigned operand_idx = -1u; ++ unsigned respill_slot = -1u; + unsigned do_rematerialize = 0; + unsigned avoid_respill = 0; ++ + RegType type = RegType::sgpr; + if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) + type = RegType::vgpr; +@@ -941,24 +946,68 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + + if (can_rematerialize > do_rematerialize || 
loop_variable > avoid_respill || + ctx.ssa_infos[t].score() > score) { +- /* Don't spill operands */ +- if (std::any_of(instr->operands.begin(), instr->operands.end(), +- [&](Operand& op) { return op.isTemp() && op.getTemp() == var; })) ++ unsigned cur_operand_idx = -1u; ++ bool can_spill = true; ++ for (auto it = instr->operands.begin(); it != instr->operands.end(); ++it) { ++ if (!it->isTemp() || it->getTemp() != var) ++ continue; ++ ++ /* Vectors with size >4 require a p_split_vector. When spilling an operand, ++ * the p_split_vector cannot kill the vector (because it's also an operand ++ * to the current instruction) and will therefore increase register demand ++ * instead of helping reduce it. ++ */ ++ if (it->regClass().size() > 4) { ++ can_spill = false; ++ break; ++ } ++ ++ if (!live_changes) ++ live_changes = get_temp_reg_changes(instr.get()); ++ ++ /* Don't spill operands if killing operands won't help with register pressure */ ++ if ((type == RegType::sgpr && live_changes->sgpr < (int16_t)it->size()) || ++ (type == RegType::vgpr && live_changes->vgpr < (int16_t)it->size())) { ++ can_spill = false; ++ break; ++ } ++ ++ cur_operand_idx = it - instr->operands.begin(); ++ if (it->isLateKill() || it->isKill()) ++ can_spill = false; ++ break; ++ } ++ if (!can_spill) + continue; + ++ bool is_spilled_operand = reloads.count(var); ++ + to_spill = var; + score = ctx.ssa_infos[t].score(); + do_rematerialize = can_rematerialize; +- avoid_respill = loop_variable; ++ avoid_respill = loop_variable || is_spilled_operand; ++ operand_idx = cur_operand_idx; ++ ++ /* This variable is spilled at the loop-header of the current loop. ++ * Re-use the spill-slot in order to avoid an extra store. ++ */ ++ if (loop_variable) ++ respill_slot = ctx.loop.back().spills[var]; ++ else if (is_spilled_operand) ++ respill_slot = reloads[var].second; + } + } + assert(to_spill != Temp()); + +- if (avoid_respill) { +- /* This variable is spilled at the loop-header of the current loop. +- * Re-use the spill-slot in order to avoid an extra store. ++ if (operand_idx != -1u) { ++ /* We might not be able to spill all operands. Keep live_changes up-to-date so we ++ * stop when we spilled every operand we can. 
+ */ +- current_spills[to_spill] = ctx.loop.back().spills[to_spill]; ++ *live_changes -= instr->operands[operand_idx].getTemp(); ++ } ++ ++ if (avoid_respill) { ++ current_spills[to_spill] = respill_slot; + spilled_registers += to_spill; + continue; + } +@@ -1007,7 +1056,7 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + /* add reloads and instruction to new instructions */ + for (std::pair>& pair : reloads) { + aco_ptr reload = +- do_reload(ctx, pair.second.first, pair.first, pair.second.second); ++ do_reload(ctx, pair.first, pair.second.first, pair.second.second); + instructions.emplace_back(std::move(reload)); + } + instructions.emplace_back(std::move(instr)); +@@ -1227,7 +1276,7 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector>& inst + assert(temp.type() == RegType::vgpr && !temp.is_linear()); + + Builder bld(ctx.program, &instructions); +- if (temp.size() > 1) { ++ if (temp.size() > 4) { + Instruction* split{ + create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())}; + split->operands[0] = Operand(temp); +@@ -1246,11 +1295,36 @@ spill_vgpr(spill_ctx& ctx, Block& block, std::vector>& inst + instr->mubuf().cache.value = ac_swizzled; + } + } +- } else if (ctx.program->gfx_level >= GFX9) { +- bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), ctx.scratch_rsrc, temp, offset, ++ return; ++ } ++ ++ aco_opcode opcode; ++ switch (temp.size()) { ++ case 4: ++ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dwordx4 ++ : aco_opcode::buffer_store_dwordx4; ++ break; ++ case 3: ++ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dwordx3 ++ : aco_opcode::buffer_store_dwordx3; ++ break; ++ case 2: ++ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dwordx2 ++ : aco_opcode::buffer_store_dwordx2; ++ break; ++ case 1: ++ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_store_dword ++ : aco_opcode::buffer_store_dword; ++ break; ++ default: ++ unreachable("Unhandled vector size!\n"); ++ } ++ ++ if (ctx.program->gfx_level >= GFX9) { ++ bld.scratch(opcode, Operand(v1), ctx.scratch_rsrc, temp, offset, + memory_sync_info(storage_vgpr_spill, semantic_private)); + } else { +- Instruction* instr = bld.mubuf(aco_opcode::buffer_store_dword, ctx.scratch_rsrc, Operand(v1), ++ Instruction* instr = bld.mubuf(opcode, ctx.scratch_rsrc, Operand(v1), + scratch_offset, temp, offset, false); + instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); + instr->mubuf().cache.value = ac_swizzled; +@@ -1291,11 +1365,36 @@ reload_vgpr(spill_ctx& ctx, Block& block, std::vector>& ins + } + } + bld.insert(vec); +- } else if (ctx.program->gfx_level >= GFX9) { +- bld.scratch(aco_opcode::scratch_load_dword, def, Operand(v1), ctx.scratch_rsrc, offset, ++ return; ++ } ++ ++ aco_opcode opcode; ++ switch (def.size()) { ++ case 4: ++ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dwordx4 ++ : aco_opcode::buffer_load_dwordx4; ++ break; ++ case 3: ++ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dwordx3 ++ : aco_opcode::buffer_load_dwordx3; ++ break; ++ case 2: ++ opcode = ctx.program->gfx_level >= GFX9 ? aco_opcode::scratch_load_dwordx2 ++ : aco_opcode::buffer_load_dwordx2; ++ break; ++ case 1: ++ opcode = ctx.program->gfx_level >= GFX9 ? 
aco_opcode::scratch_load_dword ++ : aco_opcode::buffer_load_dword; ++ break; ++ default: ++ unreachable("Unhandled vector size!\n"); ++ } ++ ++ if (ctx.program->gfx_level >= GFX9) { ++ bld.scratch(opcode, def, Operand(v1), ctx.scratch_rsrc, offset, + memory_sync_info(storage_vgpr_spill, semantic_private)); + } else { +- Instruction* instr = bld.mubuf(aco_opcode::buffer_load_dword, def, ctx.scratch_rsrc, ++ Instruction* instr = bld.mubuf(opcode, def, ctx.scratch_rsrc, + Operand(v1), scratch_offset, offset, false); + instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); + instr->mubuf().cache.value = ac_swizzled; +-- +GitLab + + +From a0276e8120c286a81006d1636f5e5e552c807d69 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 17 Jun 2024 12:55:48 +0200 +Subject: [PATCH 04/71] !29577 + +--- + src/compiler/nir/meson.build | 1 + + src/compiler/nir/nir.c | 7 +- + src/compiler/nir/nir.h | 35 ++- + src/compiler/nir/nir_builder.h | 22 ++ + src/compiler/nir/nir_clone.c | 1 + + src/compiler/nir/nir_divergence_analysis.c | 31 ++- + src/compiler/nir/nir_functions.c | 5 +- + src/compiler/nir/nir_gather_info.c | 6 +- + src/compiler/nir/nir_inline_helpers.h | 2 + + src/compiler/nir/nir_lower_memory_model.c | 33 +-- + src/compiler/nir/nir_metadata.c | 13 ++ + src/compiler/nir/nir_opt_call.c | 259 +++++++++++++++++++++ + src/compiler/nir/nir_print.c | 7 + + src/compiler/nir/nir_serialize.c | 11 + + src/compiler/nir/nir_sweep.c | 9 - + src/compiler/nir/nir_validate.c | 5 + + src/compiler/spirv/vtn_cfg.c | 3 + + 17 files changed, 410 insertions(+), 40 deletions(-) + create mode 100644 src/compiler/nir/nir_opt_call.c + +diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build +index 514f5e0e1b7a1..2df6b28d73b39 100644 +--- a/src/compiler/nir/meson.build ++++ b/src/compiler/nir/meson.build +@@ -219,6 +219,7 @@ files_libnir = files( + 'nir_normalize_cubemap_coords.c', + 'nir_opt_access.c', + 'nir_opt_barriers.c', ++ 'nir_opt_call.c', + 'nir_opt_combine_stores.c', + 'nir_opt_comparison_pre.c', + 'nir_opt_conditional_discard.c', +diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c +index 513fd04f36f99..0b2736e4a0308 100644 +--- a/src/compiler/nir/nir.c ++++ b/src/compiler/nir/nir.c +@@ -502,6 +502,7 @@ nir_function_create(nir_shader *shader, const char *name) + func->is_preamble = false; + func->dont_inline = false; + func->should_inline = false; ++ func->driver_attributes = 0; + func->is_subroutine = false; + func->is_tmp_globals_wrapper = false; + func->subroutine_index = 0; +@@ -1584,8 +1585,8 @@ nir_def_rewrite_uses_src(nir_def *def, nir_src new_src) + nir_def_rewrite_uses(def, new_src.ssa); + } + +-static bool +-is_instr_between(nir_instr *start, nir_instr *end, nir_instr *between) ++bool ++nir_instr_is_between(nir_instr *start, nir_instr *end, nir_instr *between) + { + assert(start->block == end->block); + +@@ -1629,7 +1630,7 @@ nir_def_rewrite_uses_after(nir_def *def, nir_def *new_ssa, + * not be dominated by after_me is if it is between def and after_me in + * the instruction list. 
+ */ +- if (is_instr_between(def->parent_instr, after_me, nir_src_parent_instr(use_src))) ++ if (nir_instr_is_between(def->parent_instr, after_me, nir_src_parent_instr(use_src))) + continue; + } + +diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h +index 7a781b7fefb4e..10a592f4b87bb 100644 +--- a/src/compiler/nir/nir.h ++++ b/src/compiler/nir/nir.h +@@ -1915,6 +1915,10 @@ typedef struct { + nir_instr instr; + + struct nir_function *callee; ++ /* If this function call is indirect, the function pointer to call. ++ * Otherwise, null initialized. ++ */ ++ nir_src indirect_callee; + + unsigned num_params; + nir_src params[]; +@@ -3646,13 +3650,28 @@ typedef struct { + uint8_t num_components; + uint8_t bit_size; + +- /* True if this paramater is actually the function return variable */ ++ /* True if this parameter is a deref used for returning values */ + bool is_return; + + bool implicit_conversion_prohibited; + ++ /* True if this parameter is not divergent. This is inverted to make ++ * parameters divergent by default unless explicitly specified ++ * otherwise. ++ */ ++ bool is_uniform; ++ + nir_variable_mode mode; + ++ /* Drivers may optionally stash flags here describing the parameter. ++ * For example, this might encode whether the driver expects the value ++ * to be uniform or divergent, if the driver handles divergent parameters ++ * differently from uniform ones. ++ * ++ * NIR will preserve this value but does not interpret it in any way. ++ */ ++ uint32_t driver_attributes; ++ + /* The type of the function param */ + const struct glsl_type *type; + } nir_parameter; +@@ -3675,6 +3694,14 @@ typedef struct nir_function { + */ + nir_function_impl *impl; + ++ /* Drivers may optionally stash flags here describing the function call. ++ * For example, this might encode the ABI used for the call if a driver ++ * supports multiple ABIs. ++ * ++ * NIR will preserve this value but does not interpret it in any way. 
++ */ ++ uint32_t driver_attributes; ++ + bool is_entrypoint; + /* from SPIR-V linkage, only for libraries */ + bool is_exported; +@@ -5053,6 +5080,8 @@ void nir_instr_clear_src(nir_instr *instr, nir_src *src); + + void nir_instr_move_src(nir_instr *dest_instr, nir_src *dest, nir_src *src); + ++bool nir_instr_is_between(nir_instr *start, nir_instr *end, nir_instr *between); ++ + void nir_def_init(nir_instr *instr, nir_def *def, + unsigned num_components, unsigned bit_size); + static inline void +@@ -6789,6 +6818,10 @@ bool nir_opt_combine_barriers(nir_shader *shader, + void *data); + bool nir_opt_barrier_modes(nir_shader *shader); + ++typedef bool (*can_remat_cb)(nir_instr *instr); ++ ++bool nir_minimize_call_live_states(nir_shader *shader); ++ + bool nir_opt_combine_stores(nir_shader *shader, nir_variable_mode modes); + + bool nir_copy_prop_impl(nir_function_impl *impl); +diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h +index 5e07f588b4a5b..97a12e8c9ffc4 100644 +--- a/src/compiler/nir/nir_builder.h ++++ b/src/compiler/nir/nir_builder.h +@@ -2218,6 +2218,22 @@ nir_build_call(nir_builder *build, nir_function *func, size_t count, + nir_builder_instr_insert(build, &call->instr); + } + ++static inline void ++nir_build_indirect_call(nir_builder *build, nir_function *func, nir_def *callee, ++ size_t count, nir_def **args) ++{ ++ assert(count == func->num_params && "parameter count must match"); ++ assert(!func->impl && "cannot call directly defined functions indirectly"); ++ nir_call_instr *call = nir_call_instr_create(build->shader, func); ++ ++ for (unsigned i = 0; i < func->num_params; ++i) { ++ call->params[i] = nir_src_for_ssa(args[i]); ++ } ++ call->indirect_callee = nir_src_for_ssa(callee); ++ ++ nir_builder_instr_insert(build, &call->instr); ++} ++ + static inline void + nir_discard(nir_builder *build) + { +@@ -2251,6 +2267,12 @@ nir_build_string(nir_builder *build, const char *value); + nir_build_call(build, func, ARRAY_SIZE(args), args); \ + } while (0) + ++#define nir_call_indirect(build, func, callee, ...) 
\ ++ do { \ ++ nir_def *_args[] = { __VA_ARGS__ }; \ ++ nir_build_indirect_call(build, func, callee, ARRAY_SIZE(_args), _args); \ ++ } while (0) ++ + nir_def * + nir_compare_func(nir_builder *b, enum compare_func func, + nir_def *src0, nir_def *src1); +diff --git a/src/compiler/nir/nir_clone.c b/src/compiler/nir/nir_clone.c +index a8359fcd8da76..0bfd9623686ec 100644 +--- a/src/compiler/nir/nir_clone.c ++++ b/src/compiler/nir/nir_clone.c +@@ -714,6 +714,7 @@ nir_function_clone(nir_shader *ns, const nir_function *fxn) + nfxn->should_inline = fxn->should_inline; + nfxn->dont_inline = fxn->dont_inline; + nfxn->is_subroutine = fxn->is_subroutine; ++ nfxn->driver_attributes = fxn->driver_attributes; + nfxn->is_tmp_globals_wrapper = fxn->is_tmp_globals_wrapper; + nfxn->num_subroutine_types = fxn->num_subroutine_types; + nfxn->subroutine_index = fxn->subroutine_index; +diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c +index 7c1b94ea81eb2..183b3bc81820d 100644 +--- a/src/compiler/nir/nir_divergence_analysis.c ++++ b/src/compiler/nir/nir_divergence_analysis.c +@@ -39,6 +39,7 @@ + struct divergence_state { + const gl_shader_stage stage; + nir_shader *shader; ++ nir_function_impl *impl; + nir_divergence_options options; + nir_loop *loop; + +@@ -713,11 +714,15 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) + src_divergent(instr->src[1], state); + break; + ++ case nir_intrinsic_load_param: ++ is_divergent = ++ !state->impl->function->params[nir_intrinsic_param_idx(instr)].is_uniform; ++ break; ++ + /* Intrinsics which are always divergent */ + case nir_intrinsic_inverse_ballot: + case nir_intrinsic_load_color0: + case nir_intrinsic_load_color1: +- case nir_intrinsic_load_param: + case nir_intrinsic_load_sample_id: + case nir_intrinsic_load_sample_id_no_per_sample: + case nir_intrinsic_load_sample_mask_in: +@@ -1089,8 +1094,9 @@ instr_is_loop_invariant(nir_instr *instr, struct divergence_state *state) + case nir_instr_type_deref: + case nir_instr_type_tex: + return nir_foreach_src(instr, src_invariant, state->loop); +- case nir_instr_type_phi: + case nir_instr_type_call: ++ return false; ++ case nir_instr_type_phi: + case nir_instr_type_parallel_copy: + default: + unreachable("NIR divergence analysis: Unsupported instruction type."); +@@ -1115,9 +1121,10 @@ update_instr_divergence(nir_instr *instr, struct divergence_state *state) + return visit_deref(state->shader, nir_instr_as_deref(instr), state); + case nir_instr_type_debug_info: + return false; ++ case nir_instr_type_call: ++ return false; + case nir_instr_type_jump: + case nir_instr_type_phi: +- case nir_instr_type_call: + case nir_instr_type_parallel_copy: + default: + unreachable("NIR divergence analysis: Unsupported instruction type."); +@@ -1405,6 +1412,7 @@ nir_divergence_analysis_impl(nir_function_impl *impl, nir_divergence_options opt + struct divergence_state state = { + .stage = impl->function->shader->info.stage, + .shader = impl->function->shader, ++ .impl = impl, + .options = options, + .loop = NULL, + .divergent_loop_cf = false, +@@ -1422,8 +1430,10 @@ void + nir_divergence_analysis(nir_shader *shader) + { + shader->info.divergence_analysis_run = true; +- nir_divergence_analysis_impl(nir_shader_get_entrypoint(shader), +- shader->options->divergence_analysis_options); ++ nir_foreach_function_impl(impl, shader) { ++ nir_divergence_analysis_impl(impl, ++ shader->options->divergence_analysis_options); ++ } + } + + /* Compute divergence between vertices of 
the same primitive. This uses +@@ -1444,10 +1454,13 @@ nir_vertex_divergence_analysis(nir_shader *shader) + .first_visit = true, + }; + +- nir_metadata_require(nir_shader_get_entrypoint(shader), +- nir_metadata_block_index); +- visit_cf_list(&nir_shader_get_entrypoint(shader)->body, &state); +- nir_metadata_preserve(nir_shader_get_entrypoint(shader), nir_metadata_all); ++ nir_foreach_function_impl(impl, shader) { ++ state.first_visit = true; ++ state.impl = impl; ++ nir_metadata_require(impl, nir_metadata_block_index); ++ visit_cf_list(&impl->body, &state); ++ nir_metadata_preserve(impl, nir_metadata_all); ++ } + } + + bool +diff --git a/src/compiler/nir/nir_functions.c b/src/compiler/nir/nir_functions.c +index 3ad986f697905..355161cf1b40d 100644 +--- a/src/compiler/nir/nir_functions.c ++++ b/src/compiler/nir/nir_functions.c +@@ -194,7 +194,10 @@ static bool inline_functions_pass(nir_builder *b, + return false; + + nir_call_instr *call = nir_instr_as_call(instr); +- assert(call->callee->impl); ++ if (!call->callee->impl) ++ return false; ++ ++ assert(!call->indirect_callee.ssa); + + if (b->shader->options->driver_functions && + b->shader->info.stage == MESA_SHADER_KERNEL) { +diff --git a/src/compiler/nir/nir_gather_info.c b/src/compiler/nir/nir_gather_info.c +index a5932cf3b3082..9af452acfb546 100644 +--- a/src/compiler/nir/nir_gather_info.c ++++ b/src/compiler/nir/nir_gather_info.c +@@ -954,8 +954,10 @@ gather_func_info(nir_function_impl *func, nir_shader *shader, + nir_call_instr *call = nir_instr_as_call(instr); + nir_function_impl *impl = call->callee->impl; + +- assert(impl || !"nir_shader_gather_info only works with linked shaders"); +- gather_func_info(impl, shader, visited_funcs, dead_ctx); ++ if (!call->indirect_callee.ssa) ++ assert(impl || !"nir_shader_gather_info only works with linked shaders"); ++ if (impl) ++ gather_func_info(impl, shader, visited_funcs, dead_ctx); + break; + } + default: +diff --git a/src/compiler/nir/nir_inline_helpers.h b/src/compiler/nir/nir_inline_helpers.h +index 8f3994f5353d6..17f2581cceee1 100644 +--- a/src/compiler/nir/nir_inline_helpers.h ++++ b/src/compiler/nir/nir_inline_helpers.h +@@ -107,6 +107,8 @@ nir_foreach_src(nir_instr *instr, nir_foreach_src_cb cb, void *state) + } + case nir_instr_type_call: { + nir_call_instr *call = nir_instr_as_call(instr); ++ if (call->indirect_callee.ssa && !_nir_visit_src(&call->indirect_callee, cb, state)) ++ return false; + for (unsigned i = 0; i < call->num_params; i++) { + if (!_nir_visit_src(&call->params[i], cb, state)) + return false; +diff --git a/src/compiler/nir/nir_lower_memory_model.c b/src/compiler/nir/nir_lower_memory_model.c +index 95d9f4e9526dc..c797eae8a4eb5 100644 +--- a/src/compiler/nir/nir_lower_memory_model.c ++++ b/src/compiler/nir/nir_lower_memory_model.c +@@ -229,21 +229,24 @@ nir_lower_memory_model(nir_shader *shader) + { + bool progress = false; + +- nir_function_impl *impl = nir_shader_get_entrypoint(shader); +- struct exec_list *cf_list = &impl->body; +- +- uint32_t modes = 0; +- foreach_list_typed(nir_cf_node, cf_node, node, cf_list) +- progress |= lower_make_visible(cf_node, &modes); +- +- modes = 0; +- foreach_list_typed_reverse(nir_cf_node, cf_node, node, cf_list) +- progress |= lower_make_available(cf_node, &modes); +- +- if (progress) +- nir_metadata_preserve(impl, nir_metadata_control_flow); +- else +- nir_metadata_preserve(impl, nir_metadata_all); ++ nir_foreach_function_impl(impl, shader) { ++ bool impl_progress = false; ++ struct exec_list *cf_list = &impl->body; ++ ++ 
uint32_t modes = 0; ++ foreach_list_typed(nir_cf_node, cf_node, node, cf_list) ++ impl_progress |= lower_make_visible(cf_node, &modes); ++ ++ modes = 0; ++ foreach_list_typed_reverse(nir_cf_node, cf_node, node, cf_list) ++ impl_progress |= lower_make_available(cf_node, &modes); ++ ++ if (impl_progress) ++ nir_metadata_preserve(impl, nir_metadata_control_flow); ++ else ++ nir_metadata_preserve(impl, nir_metadata_all); ++ progress |= impl_progress; ++ } + + return progress; + } +diff --git a/src/compiler/nir/nir_metadata.c b/src/compiler/nir/nir_metadata.c +index e0085991bbc06..29e2ceaa499d1 100644 +--- a/src/compiler/nir/nir_metadata.c ++++ b/src/compiler/nir/nir_metadata.c +@@ -61,6 +61,19 @@ nir_metadata_require(nir_function_impl *impl, nir_metadata required, ...) + void + nir_metadata_preserve(nir_function_impl *impl, nir_metadata preserved) + { ++ /* If we discard valid liveness information, immediately free the ++ * liveness information for each block. For large shaders, it can ++ * consume a huge amount of memory, and it's usually not immediately ++ * needed after dirtying. ++ */ ++ if ((impl->valid_metadata & ~preserved) & nir_metadata_live_defs) { ++ nir_foreach_block(block, impl) { ++ ralloc_free(block->live_in); ++ ralloc_free(block->live_out); ++ block->live_in = block->live_out = NULL; ++ } ++ } ++ + impl->valid_metadata &= preserved; + } + +diff --git a/src/compiler/nir/nir_opt_call.c b/src/compiler/nir/nir_opt_call.c +new file mode 100644 +index 0000000000000..421f78096042a +--- /dev/null ++++ b/src/compiler/nir/nir_opt_call.c +@@ -0,0 +1,259 @@ ++/* ++ * Copyright © 2024 Valve Corporation ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "nir.h" ++#include "nir_builder.h" ++#include "nir_phi_builder.h" ++ ++struct call_liveness_entry { ++ struct list_head list; ++ nir_call_instr *instr; ++ const BITSET_WORD *live_set; ++}; ++ ++static bool ++can_remat_instr(nir_instr *instr) ++{ ++ switch (instr->type) { ++ case nir_instr_type_alu: ++ case nir_instr_type_load_const: ++ case nir_instr_type_undef: ++ return true; ++ case nir_instr_type_intrinsic: ++ switch (nir_instr_as_intrinsic(instr)->intrinsic) { ++ case nir_intrinsic_load_ray_launch_id: ++ case nir_intrinsic_load_ray_launch_size: ++ case nir_intrinsic_vulkan_resource_index: ++ case nir_intrinsic_vulkan_resource_reindex: ++ case nir_intrinsic_load_vulkan_descriptor: ++ case nir_intrinsic_load_push_constant: ++ case nir_intrinsic_load_global_constant: ++ case nir_intrinsic_load_smem_amd: ++ case nir_intrinsic_load_scalar_arg_amd: ++ case nir_intrinsic_load_vector_arg_amd: ++ return true; ++ default: ++ return false; ++ } ++ default: ++ return false; ++ } ++} ++ ++static void ++remat_ssa_def(nir_builder *b, nir_def *def, struct hash_table *remap_table, ++ struct hash_table *phi_value_table, struct nir_phi_builder *phi_builder, ++ BITSET_WORD *def_blocks) ++{ ++ memset(def_blocks, 0, BITSET_WORDS(b->impl->num_blocks) * sizeof(BITSET_WORD)); ++ BITSET_SET(def_blocks, def->parent_instr->block->index); ++ BITSET_SET(def_blocks, nir_cursor_current_block(b->cursor)->index); ++ struct nir_phi_builder_value *val = nir_phi_builder_add_value(phi_builder, def->num_components, def->bit_size, def_blocks); ++ _mesa_hash_table_insert(phi_value_table, def, val); ++ ++ nir_instr *clone = nir_instr_clone_deep(b->shader, def->parent_instr, remap_table); ++ nir_builder_instr_insert(b, clone); ++ nir_def *new_def = nir_instr_def(clone); ++ ++ _mesa_hash_table_insert(remap_table, def, new_def); ++ if (nir_cursor_current_block(b->cursor)->index 
!= def->parent_instr->block->index) ++ nir_phi_builder_value_set_block_def(val, def->parent_instr->block, def); ++ nir_phi_builder_value_set_block_def(val, nir_cursor_current_block(b->cursor), new_def); ++} ++ ++struct remat_chain_check_data { ++ struct hash_table *remap_table; ++ unsigned chain_length; ++}; ++ ++static bool ++can_remat_chain(nir_src *src, void *data) ++{ ++ struct remat_chain_check_data *check_data = data; ++ ++ if (_mesa_hash_table_search(check_data->remap_table, src->ssa)) ++ return true; ++ ++ if (!can_remat_instr(src->ssa->parent_instr)) ++ return false; ++ ++ if (check_data->chain_length++ >= 16) ++ return false; ++ ++ return nir_foreach_src(src->ssa->parent_instr, can_remat_chain, check_data); ++} ++ ++struct remat_chain_data { ++ nir_builder *b; ++ struct hash_table *remap_table; ++ struct hash_table *phi_value_table; ++ struct nir_phi_builder *phi_builder; ++ BITSET_WORD *def_blocks; ++}; ++ ++static bool ++do_remat_chain(nir_src *src, void *data) ++{ ++ struct remat_chain_data *remat_data = data; ++ ++ if (_mesa_hash_table_search(remat_data->remap_table, src->ssa)) ++ return true; ++ ++ nir_foreach_src(src->ssa->parent_instr, do_remat_chain, remat_data); ++ ++ remat_ssa_def(remat_data->b, src->ssa, remat_data->remap_table, remat_data->phi_value_table, remat_data->phi_builder, remat_data->def_blocks); ++ return true; ++} ++ ++struct src_rewrite_ctx { ++ struct hash_table *phi_value_table; ++ nir_instr *instr; ++}; ++ ++static bool ++rewrite_instr_src_from_phi_builder(nir_src *src, void *data) ++{ ++ struct src_rewrite_ctx *ctx = data; ++ ++ if (nir_src_is_const(*src)) { ++ nir_builder b = nir_builder_at(nir_before_instr(ctx->instr)); ++ nir_src_rewrite(src, nir_build_imm(&b, src->ssa->num_components, src->ssa->bit_size, nir_src_as_const_value(*src))); ++ return true; ++ } ++ ++ struct hash_entry *entry = _mesa_hash_table_search(ctx->phi_value_table, src->ssa); ++ if (!entry) ++ return true; ++ ++ nir_block *block = nir_src_parent_instr(src)->block; ++ nir_def *new_def = nir_phi_builder_value_get_block_def(entry->data, block); ++ ++ bool can_rewrite = true; ++ if (new_def->parent_instr->block == block && new_def->index != UINT32_MAX) ++ can_rewrite = nir_src_parent_instr(src) != nir_block_first_instr(block) && ++ !nir_instr_is_between(nir_block_first_instr(block), ++ new_def->parent_instr, ++ nir_src_parent_instr(src)); ++ ++ if (can_rewrite) ++ nir_src_rewrite(src, new_def); ++ return true; ++} ++ ++static bool ++nir_minimize_call_live_states_impl(nir_function_impl *impl) ++{ ++ nir_metadata_require(impl, nir_metadata_block_index | nir_metadata_live_defs | nir_metadata_dominance); ++ bool progress = false; ++ void *mem_ctx = ralloc_context(NULL); ++ ++ struct list_head call_list; ++ list_inithead(&call_list); ++ unsigned num_defs = impl->ssa_alloc; ++ ++ nir_def **rematerializable = rzalloc_array_size(mem_ctx, sizeof(nir_def *), num_defs); ++ ++ nir_foreach_block(block, impl) { ++ nir_foreach_instr(instr, block) { ++ nir_def *def = nir_instr_def(instr); ++ if (def && ++ can_remat_instr(instr)) { ++ rematerializable[def->index] = def; ++ } ++ ++ if (instr->type != nir_instr_type_call) ++ continue; ++ nir_call_instr *call = nir_instr_as_call(instr); ++ if (!call->indirect_callee.ssa) ++ continue; ++ ++ struct call_liveness_entry *entry = ralloc_size(mem_ctx, sizeof(struct call_liveness_entry)); ++ entry->instr = call; ++ entry->live_set = nir_get_live_defs(nir_after_instr(instr), mem_ctx); ++ list_addtail(&entry->list, &call_list); ++ } ++ } ++ ++ const unsigned 
block_words = BITSET_WORDS(impl->num_blocks); ++ BITSET_WORD *def_blocks = ralloc_array(mem_ctx, BITSET_WORD, block_words); ++ ++ list_for_each_entry(struct call_liveness_entry, entry, &call_list, list) { ++ unsigned i; ++ ++ nir_builder b = nir_builder_at(nir_after_instr(&entry->instr->instr)); ++ ++ struct nir_phi_builder *builder = nir_phi_builder_create(impl); ++ struct hash_table *phi_value_table = ++ _mesa_pointer_hash_table_create(mem_ctx); ++ struct hash_table *remap_table = ++ _mesa_pointer_hash_table_create(mem_ctx); ++ ++ BITSET_FOREACH_SET(i, entry->live_set, num_defs) { ++ if (!rematerializable[i] || _mesa_hash_table_search(remap_table, rematerializable[i])) ++ continue; ++ ++ progress = true; ++ assert(!_mesa_hash_table_search(phi_value_table, rematerializable[i])); ++ ++ struct remat_chain_check_data check_data = { ++ .remap_table = remap_table, ++ .chain_length = 1, ++ }; ++ ++ if (!nir_foreach_src(rematerializable[i]->parent_instr, can_remat_chain, &check_data)) ++ continue; ++ ++ struct remat_chain_data remat_data = { ++ .b = &b, ++ .remap_table = remap_table, ++ .phi_value_table = phi_value_table, ++ .phi_builder = builder, ++ .def_blocks = def_blocks, ++ }; ++ ++ nir_foreach_src(rematerializable[i]->parent_instr, do_remat_chain, &remat_data); ++ ++ remat_ssa_def(&b, rematerializable[i], remap_table, phi_value_table, builder, def_blocks); ++ } ++ _mesa_hash_table_destroy(remap_table, NULL); ++ ++ nir_foreach_block(block, impl) { ++ nir_foreach_instr(instr, block) { ++ if (instr->type == nir_instr_type_phi) ++ continue; ++ ++ struct src_rewrite_ctx ctx = { ++ .phi_value_table = phi_value_table, ++ .instr = instr, ++ }; ++ nir_foreach_src(instr, rewrite_instr_src_from_phi_builder, &ctx); ++ } ++ } ++ ++ nir_phi_builder_finish(builder); ++ _mesa_hash_table_destroy(phi_value_table, NULL); ++ } ++ ++ ralloc_free(mem_ctx); ++ ++ nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); ++ return progress; ++} ++ ++/* Tries to rematerialize as many live vars as possible after calls. ++ * Note: nir_opt_cse will undo any rematerializations done by this pass, ++ * so it shouldn't be run afterward. 
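(Aside on the pass above, not part of the patch series: a plain-C sketch of what rematerializing after a call buys. external_call() is a made-up stand-in for an RT function call; the real pass works on NIR defs, cloning them with nir_instr_clone_deep() and rewriting post-call uses through the phi builder.)

    void external_call(void);

    static int keeps_value_live(int base, int idx)
    {
       int addr = base * 64 + idx;  /* live across the call, so the backend must spill it */
       external_call();
       return addr;
    }

    static int rematerializes(int base, int idx)
    {
       external_call();
       int addr = base * 64 + idx;  /* recomputed after the call; only base and idx stay live */
       return addr;
    }

Since a later nir_opt_cse would treat the recomputation as redundant and fold it back into the pre-call value, the note above about not running CSE afterwards is what keeps the transform effective.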
++ */ ++bool ++nir_minimize_call_live_states(nir_shader *shader) ++{ ++ bool progress = false; ++ ++ nir_foreach_function_impl(impl, shader) { ++ progress |= nir_minimize_call_live_states_impl(impl); ++ } ++ ++ return progress; ++} +\ No newline at end of file +diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c +index 41f3eae83fe7d..ff90a20320268 100644 +--- a/src/compiler/nir/nir_print.c ++++ b/src/compiler/nir/nir_print.c +@@ -1884,7 +1884,14 @@ print_call_instr(nir_call_instr *instr, print_state *state) + + print_no_dest_padding(state); + ++ bool indirect = instr->indirect_callee.ssa; ++ + fprintf(fp, "call %s ", instr->callee->name); ++ if (indirect) { ++ fprintf(fp, "(indirect "); ++ print_src(&instr->indirect_callee, state, nir_type_invalid); ++ fprintf(fp, ") "); ++ } + + for (unsigned i = 0; i < instr->num_params; i++) { + if (i != 0) +diff --git a/src/compiler/nir/nir_serialize.c b/src/compiler/nir/nir_serialize.c +index 2735683dd083f..ad77c88a47840 100644 +--- a/src/compiler/nir/nir_serialize.c ++++ b/src/compiler/nir/nir_serialize.c +@@ -1975,6 +1975,8 @@ write_function(write_ctx *ctx, const nir_function *fxn) + if (fxn->name) + blob_write_string(ctx->blob, fxn->name); + ++ blob_write_uint32(ctx->blob, fxn->driver_attributes); ++ + blob_write_uint32(ctx->blob, fxn->subroutine_index); + blob_write_uint32(ctx->blob, fxn->num_subroutine_types); + for (unsigned i = 0; i < fxn->num_subroutine_types; i++) { +@@ -1988,9 +1990,14 @@ write_function(write_ctx *ctx, const nir_function *fxn) + uint32_t val = + ((uint32_t)fxn->params[i].num_components) | + ((uint32_t)fxn->params[i].bit_size) << 8; ++ if (fxn->params[i].is_return) ++ val |= (1u << 16); ++ if (fxn->params[i].is_uniform) ++ val |= (1u << 17); + blob_write_uint32(ctx->blob, val); + encode_type_to_blob(ctx->blob, fxn->params[i].type); + blob_write_uint32(ctx->blob, encode_deref_modes(fxn->params[i].mode)); ++ blob_write_uint32(ctx->blob, fxn->params[i].driver_attributes); + } + + /* At first glance, it looks like we should write the function_impl here. +@@ -2010,6 +2017,7 @@ read_function(read_ctx *ctx) + + nir_function *fxn = nir_function_create(ctx->nir, name); + ++ fxn->driver_attributes = blob_read_uint32(ctx->blob); + fxn->subroutine_index = blob_read_uint32(ctx->blob); + fxn->num_subroutine_types = blob_read_uint32(ctx->blob); + for (unsigned i = 0; i < fxn->num_subroutine_types; i++) { +@@ -2024,8 +2032,11 @@ read_function(read_ctx *ctx) + uint32_t val = blob_read_uint32(ctx->blob); + fxn->params[i].num_components = val & 0xff; + fxn->params[i].bit_size = (val >> 8) & 0xff; ++ fxn->params[i].is_return = val & (1u << 16); ++ fxn->params[i].is_uniform = val & (1u << 17); + fxn->params[i].type = decode_type_from_blob(ctx->blob); + fxn->params[i].mode = decode_deref_modes(blob_read_uint32(ctx->blob)); ++ fxn->params[i].driver_attributes = blob_read_uint32(ctx->blob); + } + + fxn->is_entrypoint = flags & 0x1; +diff --git a/src/compiler/nir/nir_sweep.c b/src/compiler/nir/nir_sweep.c +index 9acd60a60b875..009343c3cf957 100644 +--- a/src/compiler/nir/nir_sweep.c ++++ b/src/compiler/nir/nir_sweep.c +@@ -47,15 +47,6 @@ sweep_block(nir_shader *nir, nir_block *block) + { + ralloc_steal(nir, block); + +- /* sweep_impl will mark all metadata invalid. We can safely release all of +- * this here. 
+- */ +- ralloc_free(block->live_in); +- block->live_in = NULL; +- +- ralloc_free(block->live_out); +- block->live_out = NULL; +- + nir_foreach_instr(instr, block) { + gc_mark_live(nir->gctx, instr); + +diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c +index ee2c9cd32c4aa..1f712962556d9 100644 +--- a/src/compiler/nir/nir_validate.c ++++ b/src/compiler/nir/nir_validate.c +@@ -966,6 +966,11 @@ validate_call_instr(nir_call_instr *instr, validate_state *state) + { + validate_assert(state, instr->num_params == instr->callee->num_params); + ++ if (instr->indirect_callee.ssa) { ++ validate_assert(state, !instr->callee->impl); ++ validate_src(&instr->indirect_callee, state); ++ } ++ + for (unsigned i = 0; i < instr->num_params; i++) { + validate_sized_src(&instr->params[i], state, + instr->callee->params[i].bit_size, +diff --git a/src/compiler/spirv/vtn_cfg.c b/src/compiler/spirv/vtn_cfg.c +index e1b9d21ecfc10..e2afb3f8eaaa9 100644 +--- a/src/compiler/spirv/vtn_cfg.c ++++ b/src/compiler/spirv/vtn_cfg.c +@@ -55,6 +55,7 @@ glsl_type_add_to_function_params(const struct glsl_type *type, + func->params[(*param_idx)++] = (nir_parameter) { + .num_components = glsl_get_vector_elements(type), + .bit_size = glsl_get_bit_size(type), ++ .type = type, + }; + } else if (glsl_type_is_array_or_matrix(type)) { + unsigned elems = glsl_get_length(type); +@@ -290,6 +291,8 @@ vtn_cfg_handle_prepass_instruction(struct vtn_builder *b, SpvOp opcode, + func->params[idx++] = (nir_parameter) { + .num_components = nir_address_format_num_components(addr_format), + .bit_size = nir_address_format_bit_size(addr_format), ++ .is_return = true, ++ .type = func_type->return_type->type, + }; + } + +-- +GitLab + + +From 4c4b5a7e7b853d0ddcde5436d58cfa43c310d401 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Thu, 3 Oct 2024 15:58:19 +0200 +Subject: [PATCH 05/71] aco/lower_to_hw_instr: Also consider operand alignment + requirements + +--- + src/amd/compiler/aco_lower_to_hw_instr.cpp | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp +index 0e18aa66069f8..fa3c805f491b5 100644 +--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp ++++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp +@@ -1191,16 +1191,17 @@ split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op, + if ((ctx->program->gfx_level < GFX10 || ctx->program->gfx_level >= GFX11) && + src.def.regClass().type() == RegType::vgpr) + max_size = MIN2(max_size, 4); +- unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16; ++ unsigned max_def_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16; ++ unsigned max_op_align = src.op.regClass().type() == RegType::vgpr ? 
4 : 16; + + /* make sure the size is a power of two and reg % bytes == 0 */ + unsigned bytes = 1; + for (; bytes <= max_size; bytes *= 2) { + unsigned next = bytes * 2u; +- bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 && ++ bool can_increase = def_reg.reg_b % MIN2(next, max_def_align) == 0 && + offset + next <= src.bytes && next <= max_size; + if (!src.op.isConstant() && can_increase) +- can_increase = op_reg.reg_b % MIN2(next, max_align) == 0; ++ can_increase = op_reg.reg_b % MIN2(next, max_op_align) == 0; + for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++) + can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0); + if (!can_increase) +-- +GitLab + + +From 325296b50ec3a85b9400189aec2b65b4c18bc40d Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Thu, 3 Oct 2024 15:58:45 +0200 +Subject: [PATCH 06/71] aco/ra: Disallow unaligned SGPR assignment + +--- + src/amd/compiler/aco_register_allocation.cpp | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index fc62487627fad..a8068b0da316a 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -2115,6 +2115,9 @@ operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr& instr, unsign + return false; + } + ++ if (rc.type() == RegType::sgpr && reg.reg() % rc.size()) ++ return false; ++ + switch (instr->format) { + case Format::SMEM: + return reg != scc && reg != exec && +-- +GitLab + + +From 50d5f59160434a154a93d2c8db9eca0a27551416 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Fri, 4 Oct 2024 07:20:12 +0200 +Subject: [PATCH 07/71] aco/ra: Fix SGPR parallelcopy operands straddling + 64-reg boundary + +--- + src/amd/compiler/aco_register_allocation.cpp | 18 +++++++++++++++--- + 1 file changed, 15 insertions(+), 3 deletions(-) + +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index a8068b0da316a..3ce0680bf52d6 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -3009,12 +3009,24 @@ emit_parallel_copy_internal(ra_ctx& ctx, std::vector& parallelcopy + if (!sgpr_operands_alias_defs) { + unsigned reg = parallelcopy[i].op.physReg().reg(); + unsigned size = parallelcopy[i].op.getTemp().size(); +- sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); ++ if ((reg + size) / 64u == reg / 64u) { ++ sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); ++ } else { ++ sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, 64u - (reg % 64u)); ++ sgpr_operands[(reg + size) / 64u] |= u_bit_consecutive64(0, (reg + size) % 64u); ++ } + + reg = parallelcopy[i].def.physReg().reg(); + size = parallelcopy[i].def.getTemp().size(); +- if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) +- sgpr_operands_alias_defs = true; ++ if ((reg + size) / 64u == reg / 64u) { ++ if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) ++ sgpr_operands_alias_defs = true; ++ } else { ++ if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, 64u - (reg % 64u))) ++ sgpr_operands_alias_defs = true; ++ if (sgpr_operands[(reg + size) / 64u] & u_bit_consecutive64(0, (reg + size) % 64u)) ++ sgpr_operands_alias_defs = true; ++ } + } + } + +-- +GitLab + + +From 0d80a9a6eb1d317727688914ad8f612dc7bace13 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 17 Jun 2024 13:13:21 +0200 +Subject: 
[PATCH 08/71] radv: Gather info for all functions + +--- + src/amd/vulkan/radv_pipeline.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c +index a9df9b6b8aea3..82a5aac71437d 100644 +--- a/src/amd/vulkan/radv_pipeline.c ++++ b/src/amd/vulkan/radv_pipeline.c +@@ -431,7 +431,9 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat + NIR_PASS(_, stage->nir, nir_opt_constant_folding); + + /* Gather info again, to update whether 8/16-bit are used. */ +- nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir)); ++ nir_foreach_function_impl (impl, stage->nir) ++ if (impl->function->is_entrypoint || impl->function->is_exported) ++ nir_shader_gather_info(stage->nir, impl); + } + } + +-- +GitLab + + +From 5e1e7090670cf7db02ea16a86790104a008c8813 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:27:06 +0200 +Subject: [PATCH 09/71] nir/intrinsics: Add incoming/outgoing payload + load/store instructions + +With RT function calls, these are going to get lowered to: +- load/store_param (incoming payload) +- load/store_var (outgoing payload) +--- + src/compiler/nir/nir_intrinsics.py | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py +index 31af10c320ba8..798e961c0c8e3 100644 +--- a/src/compiler/nir/nir_intrinsics.py ++++ b/src/compiler/nir/nir_intrinsics.py +@@ -1703,6 +1703,10 @@ intrinsic("execute_miss_amd", src_comp=[1]) + # BASE=dword index + intrinsic("load_hit_attrib_amd", dest_comp=1, bit_sizes=[32], indices=[BASE]) + intrinsic("store_hit_attrib_amd", src_comp=[1], indices=[BASE]) ++intrinsic("load_incoming_ray_payload_amd", dest_comp=1, bit_sizes=[32], indices=[BASE]) ++intrinsic("store_incoming_ray_payload_amd", src_comp=[1], indices=[BASE]) ++intrinsic("load_outgoing_ray_payload_amd", dest_comp=1, bit_sizes=[32], indices=[BASE]) ++intrinsic("store_outgoing_ray_payload_amd", src_comp=[1], indices=[BASE]) + + # Load forced VRS rates. + intrinsic("load_force_vrs_rates_amd", dest_comp=1, bit_sizes=[32], flags=[CAN_ELIMINATE, CAN_REORDER]) +-- +GitLab + + +From 47aae01aefb03df60f1ca9e6c80f17b76a83f031 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Thu, 6 Jun 2024 08:07:34 +0200 +Subject: [PATCH 10/71] radv: Temporarily disable RT pipelines + +--- + src/amd/vulkan/radv_physical_device.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c +index 5022ead6c9d76..98826470d4d60 100644 +--- a/src/amd/vulkan/radv_physical_device.c ++++ b/src/amd/vulkan/radv_physical_device.c +@@ -111,6 +111,10 @@ radv_filter_minmax_enabled(const struct radv_physical_device *pdev) + bool + radv_enable_rt(const struct radv_physical_device *pdev, bool rt_pipelines) + { ++ /* Temporarily under construction! */ ++ if (rt_pipelines) ++ return false; ++ + if (pdev->info.gfx_level < GFX10_3 && !radv_emulate_rt(pdev)) + return false; + +-- +GitLab + + +From e268331ef1d7dcd0bb7642286f358ce7ccd50a5c Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:28:21 +0200 +Subject: [PATCH 11/71] nir: Remove + nir_intrinsic_load_rt_arg_scratch_offset_amd + +Not needed anymore. 
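(Aside on patch 09 above, not part of the patch series: the commit message's lowering plan, pictured in plain C with made-up names. The incoming payload acts like a parameter of the called shader, which is why it lowers to load/store_param, while the outgoing payload is a local of the calling shader, hence load/store_var.)

    #include <stdint.h>

    struct payload { uint32_t hit_color; };

    static void closest_hit(struct payload *incoming)  /* incoming payload -> load/store_param */
    {
       incoming->hit_color = 0x00ff00u;
    }

    static void raygen(void)
    {
       struct payload outgoing = { 0 };                 /* outgoing payload -> load/store_var */
       closest_hit(&outgoing);                          /* stands in for the RT function call */
    }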
+--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 11 ----------- + src/amd/vulkan/radv_pipeline_rt.c | 1 - + src/compiler/nir/nir_divergence_analysis.c | 1 - + src/compiler/nir/nir_intrinsics.py | 3 --- + 4 files changed, 16 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 8dcd853aa724d..9224c169319fc 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -488,10 +488,6 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) + nir_src_rewrite(&intr->src[1], nir_iadd_nuw(b, nir_load_var(b, vars->stack_ptr), intr->src[1].ssa)); + return true; + } +- case nir_intrinsic_load_rt_arg_scratch_offset_amd: { +- ret = nir_load_var(b, vars->arg); +- break; +- } + case nir_intrinsic_load_shader_record_ptr: { + ret = nir_load_var(b, vars->shader_record_ptr); + break; +@@ -1086,12 +1082,6 @@ lower_any_hit_for_intersection(nir_shader *any_hit) + b->cursor = nir_before_instr(instr); + nir_src_rewrite(&intrin->src[1], nir_iadd_nuw(b, scratch_offset, intrin->src[1].ssa)); + break; +- case nir_intrinsic_load_rt_arg_scratch_offset_amd: +- b->cursor = nir_after_instr(instr); +- nir_def *arg_offset = nir_isub(b, &intrin->def, scratch_offset); +- nir_def_rewrite_uses_after(&intrin->def, arg_offset, arg_offset->parent_instr); +- break; +- + default: + break; + } +@@ -1732,7 +1722,6 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_ + nir_store_var(&b, vars.cull_mask_and_flags, nir_load_cull_mask_and_flags_amd(&b), 0x1); + nir_store_var(&b, vars.origin, nir_load_ray_world_origin(&b), 0x7); + nir_store_var(&b, vars.direction, nir_load_ray_world_direction(&b), 0x7); +- nir_store_var(&b, vars.arg, nir_load_rt_arg_scratch_offset_amd(&b), 0x1); + nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 0x1); + + radv_build_traversal(device, pipeline, pCreateInfo, false, &b, &vars, false, info); +diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c +index 8d9ba4d6047a6..11acaa74dfc54 100644 +--- a/src/amd/vulkan/radv_pipeline_rt.c ++++ b/src/amd/vulkan/radv_pipeline_rt.c +@@ -318,7 +318,6 @@ should_move_rt_instruction(nir_intrinsic_instr *instr) + switch (instr->intrinsic) { + case nir_intrinsic_load_hit_attrib_amd: + return nir_intrinsic_base(instr) < RADV_MAX_HIT_ATTRIB_DWORDS; +- case nir_intrinsic_load_rt_arg_scratch_offset_amd: + case nir_intrinsic_load_ray_flags: + case nir_intrinsic_load_ray_object_origin: + case nir_intrinsic_load_ray_world_origin: +diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c +index 183b3bc81820d..78943c897922f 100644 +--- a/src/compiler/nir/nir_divergence_analysis.c ++++ b/src/compiler/nir/nir_divergence_analysis.c +@@ -835,7 +835,6 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) + case nir_intrinsic_load_packed_passthrough_primitive_amd: + case nir_intrinsic_load_initial_edgeflags_amd: + case nir_intrinsic_gds_atomic_add_amd: +- case nir_intrinsic_load_rt_arg_scratch_offset_amd: + case nir_intrinsic_load_intersection_opaque_amd: + case nir_intrinsic_load_vector_arg_amd: + case nir_intrinsic_load_btd_stack_id_intel: +diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py +index 798e961c0c8e3..2a6de0c4b6f25 100644 +--- a/src/compiler/nir/nir_intrinsics.py ++++ b/src/compiler/nir/nir_intrinsics.py +@@ -1673,9 +1673,6 @@ intrinsic("bvh64_intersect_ray_amd", [4, 2, 1, 3, 3, 3], 4, 
flags=[CAN_ELIMINATE + # Return of a callable in raytracing pipelines + intrinsic("rt_return_amd") + +-# offset into scratch for the input callable data in a raytracing pipeline. +-system_value("rt_arg_scratch_offset_amd", 1) +- + # Whether to call the anyhit shader for an intersection in an intersection shader. + system_value("intersection_opaque_amd", 1, bit_sizes=[1]) + +-- +GitLab + + +From 8100ae695c5322e10227619b5e1b6027c2b35a02 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:31:55 +0200 +Subject: [PATCH 12/71] radv/rt: Remove RT priorities + +They have been been useful for ensuring reconvergence, but RT function +calls ensure that on their own now. +--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 37 ------------------------- + src/amd/vulkan/radv_cmd_buffer.c | 2 +- + src/amd/vulkan/radv_pipeline_rt.c | 2 +- + src/amd/vulkan/radv_shader.h | 27 ------------------ + 4 files changed, 2 insertions(+), 66 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 9224c169319fc..3f50c7297baae 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -1843,43 +1843,6 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, + lower_hit_attribs(shader, hit_attribs, 0); + } + +-/** Select the next shader based on priorities: +- * +- * Detect the priority of the shader stage by the lowest bits in the address (low to high): +- * - Raygen - idx 0 +- * - Traversal - idx 1 +- * - Closest Hit / Miss - idx 2 +- * - Callable - idx 3 +- * +- * +- * This gives us the following priorities: +- * Raygen : Callable > > Traversal > Raygen +- * Traversal : > Chit / Miss > > Raygen +- * CHit / Miss : Callable > Chit / Miss > Traversal > Raygen +- * Callable : Callable > Chit / Miss > > Raygen +- */ +-static nir_def * +-select_next_shader(nir_builder *b, nir_def *shader_addr, unsigned wave_size) +-{ +- gl_shader_stage stage = b->shader->info.stage; +- nir_def *prio = nir_iand_imm(b, shader_addr, radv_rt_priority_mask); +- nir_def *ballot = nir_ballot(b, 1, wave_size, nir_imm_bool(b, true)); +- nir_def *ballot_traversal = nir_ballot(b, 1, wave_size, nir_ieq_imm(b, prio, radv_rt_priority_traversal)); +- nir_def *ballot_hit_miss = nir_ballot(b, 1, wave_size, nir_ieq_imm(b, prio, radv_rt_priority_hit_miss)); +- nir_def *ballot_callable = nir_ballot(b, 1, wave_size, nir_ieq_imm(b, prio, radv_rt_priority_callable)); +- +- if (stage != MESA_SHADER_CALLABLE && stage != MESA_SHADER_INTERSECTION) +- ballot = nir_bcsel(b, nir_ine_imm(b, ballot_traversal, 0), ballot_traversal, ballot); +- if (stage != MESA_SHADER_RAYGEN) +- ballot = nir_bcsel(b, nir_ine_imm(b, ballot_hit_miss, 0), ballot_hit_miss, ballot); +- if (stage != MESA_SHADER_INTERSECTION) +- ballot = nir_bcsel(b, nir_ine_imm(b, ballot_callable, 0), ballot_callable, ballot); +- +- nir_def *lsb = nir_find_lsb(b, ballot); +- nir_def *next = nir_read_invocation(b, shader_addr, lsb); +- return nir_iand_imm(b, next, ~radv_rt_priority_mask); +-} +- + static void + radv_store_arg(nir_builder *b, const struct radv_shader_args *args, const struct radv_ray_tracing_stage_info *info, + struct ac_arg arg, nir_def *value) +diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c +index d205cebbda64c..96bda7c3cf639 100644 +--- a/src/amd/vulkan/radv_cmd_buffer.c ++++ b/src/amd/vulkan/radv_cmd_buffer.c +@@ -7551,7 +7551,7 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer, struct 
radv_compu + const uint32_t traversal_shader_addr_offset = radv_get_user_sgpr_loc(rt_prolog, AC_UD_CS_TRAVERSAL_SHADER_ADDR); + struct radv_shader *traversal_shader = cmd_buffer->state.shaders[MESA_SHADER_INTERSECTION]; + if (traversal_shader_addr_offset && traversal_shader) { +- uint64_t traversal_va = traversal_shader->va | radv_rt_priority_traversal; ++ uint64_t traversal_va = traversal_shader->va; + radv_emit_shader_pointer(device, cmd_buffer->cs, traversal_shader_addr_offset, traversal_va, true); + } + } +diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c +index 11acaa74dfc54..32a1cba1269f3 100644 +--- a/src/amd/vulkan/radv_pipeline_rt.c ++++ b/src/amd/vulkan/radv_pipeline_rt.c +@@ -1138,7 +1138,7 @@ radv_rt_pipeline_create(VkDevice _device, VkPipelineCache _cache, const VkRayTra + if (pipeline->groups[i].recursive_shader != VK_SHADER_UNUSED_KHR) { + struct radv_shader *shader = pipeline->stages[pipeline->groups[i].recursive_shader].shader; + if (shader) +- pipeline->groups[i].handle.recursive_shader_ptr = shader->va | radv_get_rt_priority(shader->info.stage); ++ pipeline->groups[i].handle.recursive_shader_ptr = shader->va; + } + } + +diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h +index 300358a346dbb..968ebbe6d4af4 100644 +--- a/src/amd/vulkan/radv_shader.h ++++ b/src/amd/vulkan/radv_shader.h +@@ -682,33 +682,6 @@ nir_shader *radv_build_traversal_shader(struct radv_device *device, struct radv_ + const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + struct radv_ray_tracing_stage_info *info); + +-enum radv_rt_priority { +- radv_rt_priority_raygen = 0, +- radv_rt_priority_traversal = 1, +- radv_rt_priority_hit_miss = 2, +- radv_rt_priority_callable = 3, +- radv_rt_priority_mask = 0x3, +-}; +- +-static inline enum radv_rt_priority +-radv_get_rt_priority(gl_shader_stage stage) +-{ +- switch (stage) { +- case MESA_SHADER_RAYGEN: +- return radv_rt_priority_raygen; +- case MESA_SHADER_INTERSECTION: +- case MESA_SHADER_ANY_HIT: +- return radv_rt_priority_traversal; +- case MESA_SHADER_CLOSEST_HIT: +- case MESA_SHADER_MISS: +- return radv_rt_priority_hit_miss; +- case MESA_SHADER_CALLABLE: +- return radv_rt_priority_callable; +- default: +- unreachable("Unimplemented RT shader stage."); +- } +-} +- + struct radv_shader_layout; + enum radv_pipeline_type; + +-- +GitLab + + +From 8849cf03b0c29eb6b864a4056195ca7dc9f53a68 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:39:33 +0200 +Subject: [PATCH 13/71] radv/rt: Refactor radv_nir_lower_rt_vars + +Now we can use it on load/store instruction. Will be used for lowering +payloads to load/store_*_payload instructions. 
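(Aside on the refactor below, not part of the patch series: the load/store callbacks keep the existing sub-dword handling, where every payload or hit-attribute slot is one 32-bit dword and narrower accesses become read-modify-writes of that dword. A plain-C model of the 16-bit store path, with made-up names; the shift assumes the NIR convention that component 0 of a packed 32-bit value is its low half.)

    #include <stdint.h>

    static uint32_t slots[64];                 /* stands in for whatever load_cb/store_cb address */

    static void store_u16(unsigned byte_offset, uint16_t value)
    {
       unsigned base = byte_offset / 4;        /* dword slot, i.e. the intrinsic's .base index */
       unsigned half = (byte_offset % 4) / 2;  /* which 16-bit half of that slot */
       uint32_t prev = slots[base];            /* load_cb(b, base) */
       uint32_t mask = 0xffffu << (16u * half);
       slots[base] = (prev & ~mask) | ((uint32_t)value << (16u * half));  /* store_cb(b, ..., base) */
    }

8-bit and 1-bit accesses follow the same read-modify-write pattern at byte and boolean granularity, and 64-bit values are simply split across two consecutive dword slots (base and base + 1).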
+--- + .../nir/radv_nir_lower_hit_attrib_derefs.c | 93 ++++++++++++++----- + 1 file changed, 70 insertions(+), 23 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c +index 38e14dd4015fc..9db157dd4baf0 100644 +--- a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c ++++ b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c +@@ -10,13 +10,19 @@ + #include "radv_constants.h" + #include "radv_nir.h" + ++typedef nir_def *(*load_intrin_cb)(nir_builder *b, unsigned base); ++typedef void (*store_intrin_cb)(nir_builder *b, nir_def *val, unsigned base); ++ + struct lower_hit_attrib_deref_args { + nir_variable_mode mode; + uint32_t base_offset; ++ ++ load_intrin_cb load_cb; ++ store_intrin_cb store_cb; + }; + + static bool +-lower_hit_attrib_deref(nir_builder *b, nir_instr *instr, void *data) ++lower_rt_var_deref(nir_builder *b, nir_instr *instr, void *data) + { + if (instr->type != nir_instr_type_intrinsic) + return false; +@@ -48,19 +54,16 @@ lower_hit_attrib_deref(nir_builder *b, nir_instr *instr, void *data) + uint32_t comp_offset = offset % 4; + + if (bit_size == 64) { +- components[comp] = nir_pack_64_2x32_split(b, nir_load_hit_attrib_amd(b, .base = base), +- nir_load_hit_attrib_amd(b, .base = base + 1)); ++ components[comp] = nir_pack_64_2x32_split(b, args->load_cb(b, base), args->load_cb(b, base + 1)); + } else if (bit_size == 32) { +- components[comp] = nir_load_hit_attrib_amd(b, .base = base); ++ components[comp] = args->load_cb(b, base); + } else if (bit_size == 16) { +- components[comp] = +- nir_channel(b, nir_unpack_32_2x16(b, nir_load_hit_attrib_amd(b, .base = base)), comp_offset / 2); ++ components[comp] = nir_channel(b, nir_unpack_32_2x16(b, args->load_cb(b, base)), comp_offset / 2); + } else if (bit_size == 8) { +- components[comp] = +- nir_channel(b, nir_unpack_bits(b, nir_load_hit_attrib_amd(b, .base = base), 8), comp_offset); ++ components[comp] = nir_channel(b, nir_unpack_bits(b, args->load_cb(b, base), 8), comp_offset); + } else { + assert(bit_size == 1); +- components[comp] = nir_i2b(b, nir_load_hit_attrib_amd(b, .base = base)); ++ components[comp] = nir_i2b(b, args->load_cb(b, base)); + } + } + +@@ -78,25 +81,25 @@ lower_hit_attrib_deref(nir_builder *b, nir_instr *instr, void *data) + nir_def *component = nir_channel(b, value, comp); + + if (bit_size == 64) { +- nir_store_hit_attrib_amd(b, nir_unpack_64_2x32_split_x(b, component), .base = base); +- nir_store_hit_attrib_amd(b, nir_unpack_64_2x32_split_y(b, component), .base = base + 1); ++ args->store_cb(b, nir_unpack_64_2x32_split_x(b, component), base); ++ args->store_cb(b, nir_unpack_64_2x32_split_y(b, component), base + 1); + } else if (bit_size == 32) { +- nir_store_hit_attrib_amd(b, component, .base = base); ++ args->store_cb(b, component, base); + } else if (bit_size == 16) { +- nir_def *prev = nir_unpack_32_2x16(b, nir_load_hit_attrib_amd(b, .base = base)); ++ nir_def *prev = nir_unpack_32_2x16(b, args->load_cb(b, base)); + nir_def *components[2]; + for (uint32_t word = 0; word < 2; word++) + components[word] = (word == comp_offset / 2) ? 
nir_channel(b, value, comp) : nir_channel(b, prev, word); +- nir_store_hit_attrib_amd(b, nir_pack_32_2x16(b, nir_vec(b, components, 2)), .base = base); ++ args->store_cb(b, nir_pack_32_2x16(b, nir_vec(b, components, 2)), base); + } else if (bit_size == 8) { +- nir_def *prev = nir_unpack_bits(b, nir_load_hit_attrib_amd(b, .base = base), 8); ++ nir_def *prev = nir_unpack_bits(b, args->load_cb(b, base), 8); + nir_def *components[4]; + for (uint32_t byte = 0; byte < 4; byte++) + components[byte] = (byte == comp_offset) ? nir_channel(b, value, comp) : nir_channel(b, prev, byte); +- nir_store_hit_attrib_amd(b, nir_pack_32_4x8(b, nir_vec(b, components, 4)), .base = base); ++ args->store_cb(b, nir_pack_32_4x8(b, nir_vec(b, components, 4)), base); + } else { + assert(bit_size == 1); +- nir_store_hit_attrib_amd(b, nir_b2i32(b, component), .base = base); ++ args->store_cb(b, nir_b2i32(b, component), base); + } + } + } +@@ -123,13 +126,14 @@ radv_lower_payload_arg_to_offset(nir_builder *b, nir_intrinsic_instr *instr, voi + } + + static bool +-radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, uint32_t base_offset) ++radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, load_intrin_cb load_cb, store_intrin_cb store_cb, ++ uint32_t base_offset) + { + bool progress = false; + + progress |= nir_lower_indirect_derefs(shader, mode, UINT32_MAX); + +- progress |= nir_lower_vars_to_explicit_types(shader, mode, glsl_get_natural_size_align_bytes); ++ NIR_PASS(_, shader, nir_lower_vars_to_explicit_types, mode, glsl_get_natural_size_align_bytes); + + if (shader->info.stage == MESA_SHADER_RAYGEN && mode == nir_var_function_temp) + progress |= nir_shader_intrinsics_pass(shader, radv_lower_payload_arg_to_offset, nir_metadata_control_flow, NULL); +@@ -137,9 +141,11 @@ radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, uint32_t base + struct lower_hit_attrib_deref_args args = { + .mode = mode, + .base_offset = base_offset, ++ .load_cb = load_cb, ++ .store_cb = store_cb, + }; + +- progress |= nir_shader_instructions_pass(shader, lower_hit_attrib_deref, nir_metadata_control_flow, &args); ++ progress |= nir_shader_instructions_pass(shader, lower_rt_var_deref, nir_metadata_control_flow, &args); + + if (progress) { + nir_remove_dead_derefs(shader); +@@ -149,16 +155,57 @@ radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, uint32_t base + return progress; + } + ++static nir_def * ++load_hit_attrib_cb(nir_builder *b, unsigned base) ++{ ++ return nir_load_hit_attrib_amd(b, .base = base); ++} ++ ++static void ++store_hit_attrib_cb(nir_builder *b, nir_def *val, unsigned base) ++{ ++ nir_store_hit_attrib_amd(b, val, .base = base); ++} ++ + bool + radv_nir_lower_hit_attrib_derefs(nir_shader *shader) + { +- return radv_nir_lower_rt_vars(shader, nir_var_ray_hit_attrib, 0); ++ bool progress = false; ++ progress |= nir_lower_vars_to_explicit_types(shader, nir_var_ray_hit_attrib, glsl_get_natural_size_align_bytes); ++ progress |= radv_nir_lower_rt_vars(shader, nir_var_ray_hit_attrib, load_hit_attrib_cb, store_hit_attrib_cb, 0); ++ return progress; ++} ++ ++static nir_def * ++load_incoming_payload_cb(nir_builder *b, unsigned base) ++{ ++ return nir_load_incoming_ray_payload_amd(b, .base = base); ++} ++ ++static void ++store_incoming_payload_cb(nir_builder *b, nir_def *val, unsigned base) ++{ ++ nir_store_incoming_ray_payload_amd(b, val, .base = base); ++} ++ ++static nir_def * ++load_outgoing_payload_cb(nir_builder *b, unsigned base) ++{ ++ return 
nir_load_outgoing_ray_payload_amd(b, .base = base); ++} ++ ++static void ++store_outgoing_payload_cb(nir_builder *b, nir_def *val, unsigned base) ++{ ++ nir_store_outgoing_ray_payload_amd(b, val, .base = base); + } + + bool + radv_nir_lower_ray_payload_derefs(nir_shader *shader, uint32_t offset) + { +- bool progress = radv_nir_lower_rt_vars(shader, nir_var_function_temp, RADV_MAX_HIT_ATTRIB_SIZE + offset); +- progress |= radv_nir_lower_rt_vars(shader, nir_var_shader_call_data, RADV_MAX_HIT_ATTRIB_SIZE + offset); ++ bool progress = radv_nir_lower_rt_vars(shader, nir_var_function_temp, load_outgoing_payload_cb, ++ store_outgoing_payload_cb, offset); ++ progress |= radv_nir_lower_rt_vars(shader, nir_var_shader_call_data, load_incoming_payload_cb, ++ store_incoming_payload_cb, offset); + return progress; + } +-- +GitLab + + +From 7c120680691e255437116f3219d1d4684d28a180 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:46:28 +0200 +Subject: [PATCH 14/71] radv/rt: Pass maximum payload size to + radv_rt_nir_to_asm + +--- + src/amd/vulkan/radv_pipeline_rt.c | 27 ++++++++++++++++++++++----- + 1 file changed, 22 insertions(+), 5 deletions(-) + +diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c +index 32a1cba1269f3..0de6d1281b932 100644 +--- a/src/amd/vulkan/radv_pipeline_rt.c ++++ b/src/amd/vulkan/radv_pipeline_rt.c +@@ -356,7 +356,7 @@ move_rt_instructions(nir_shader *shader) + static VkResult + radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, + const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, struct radv_ray_tracing_pipeline *pipeline, +- bool monolithic, struct radv_shader_stage *stage, uint32_t *stack_size, ++ bool monolithic, struct radv_shader_stage *stage, uint32_t *payload_size, uint32_t *stack_size, + struct radv_ray_tracing_stage_info *stage_info, + const struct radv_ray_tracing_stage_info *traversal_stage_info, + struct radv_serialized_shader_arena_block *replay_block, struct radv_shader **out_shader) +@@ -368,7 +368,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, + bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags); + bool keep_statistic_info = radv_pipeline_capture_shader_stats(device, pipeline->base.base.create_flags); + +- radv_nir_lower_rt_io(stage->nir, monolithic, 0); ++ radv_nir_lower_rt_io(stage->nir, monolithic, 0, payload_size); + + /* Gather shader info. 
*/ + nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir)); +@@ -586,6 +586,10 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca + if (!stages) + return VK_ERROR_OUT_OF_HOST_MEMORY; + ++ uint32_t payload_size = 0; ++ if (pCreateInfo->pLibraryInterface) ++ payload_size = pCreateInfo->pLibraryInterface->maxPipelineRayPayloadSize; ++ + bool library = pipeline->base.base.create_flags & VK_PIPELINE_CREATE_2_LIBRARY_BIT_KHR; + + bool monolithic = !library; +@@ -605,6 +609,19 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca + + NIR_PASS(_, stage->nir, radv_nir_lower_hit_attrib_derefs); + ++ nir_foreach_variable_with_modes (var, stage->nir, nir_var_shader_call_data) { ++ unsigned size, alignment; ++ glsl_get_natural_size_align_bytes(var->type, &size, &alignment); ++ payload_size = MAX2(payload_size, size); ++ } ++ nir_foreach_function_impl (impl, stage->nir) { ++ nir_foreach_variable_in_list (var, &impl->locals) { ++ unsigned size, alignment; ++ glsl_get_natural_size_align_bytes(var->type, &size, &alignment); ++ payload_size = MAX2(payload_size, size); ++ } ++ } ++ + rt_stages[i].info = radv_gather_ray_tracing_stage_info(stage->nir); + + stage->feedback.duration = os_time_get_nano() - stage_start; +@@ -670,8 +687,8 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca + + bool monolithic_raygen = monolithic && stage->stage == MESA_SHADER_RAYGEN; + +- result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, monolithic_raygen, stage, &stack_size, +- &rt_stages[idx].info, NULL, replay_block, &rt_stages[idx].shader); ++ result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, monolithic_raygen, stage, &payload_size, ++ &stack_size, &rt_stages[idx].info, NULL, replay_block, &rt_stages[idx].shader); + if (result != VK_SUCCESS) + goto cleanup; + +@@ -728,7 +745,7 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca + .key = stage_keys[MESA_SHADER_INTERSECTION], + }; + radv_shader_layout_init(pipeline_layout, MESA_SHADER_INTERSECTION, &traversal_stage.layout); +- result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, NULL, NULL, ++ result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, &payload_size, NULL, NULL, + &traversal_info, NULL, &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]); + ralloc_free(traversal_nir); + +-- +GitLab + + +From d7b329a6c5625895e7e020ee948d2c0b9c9e9329 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:47:46 +0200 +Subject: [PATCH 15/71] radv/rt: Track traversal shader stack size + +--- + src/amd/vulkan/radv_pipeline_rt.c | 14 ++++++++------ + src/amd/vulkan/radv_pipeline_rt.h | 1 + + 2 files changed, 9 insertions(+), 6 deletions(-) + +diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c +index 0de6d1281b932..3c848361f13e3 100644 +--- a/src/amd/vulkan/radv_pipeline_rt.c ++++ b/src/amd/vulkan/radv_pipeline_rt.c +@@ -745,8 +745,9 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca + .key = stage_keys[MESA_SHADER_INTERSECTION], + }; + radv_shader_layout_init(pipeline_layout, MESA_SHADER_INTERSECTION, &traversal_stage.layout); +- result = radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, &payload_size, NULL, NULL, +- &traversal_info, NULL, &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]); ++ result = 
radv_rt_nir_to_asm(device, cache, pCreateInfo, pipeline, false, &traversal_stage, &payload_size, ++ &pipeline->traversal_stack_size, NULL, &traversal_info, NULL, ++ &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]); + ralloc_free(traversal_nir); + + cleanup: +@@ -807,10 +808,11 @@ compute_rt_stack_size(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, stru + unreachable("Invalid stage type in RT shader"); + } + } +- pipeline->stack_size = +- raygen_size + +- MIN2(pCreateInfo->maxPipelineRayRecursionDepth, 1) * MAX2(chit_miss_size, intersection_size + any_hit_size) + +- MAX2(0, (int)(pCreateInfo->maxPipelineRayRecursionDepth) - 1) * chit_miss_size + 2 * callable_size; ++ pipeline->stack_size = raygen_size + ++ MIN2(pCreateInfo->maxPipelineRayRecursionDepth, 1) * ++ (chit_miss_size + intersection_size + any_hit_size + pipeline->traversal_stack_size) + ++ MAX2(0, (int)(pCreateInfo->maxPipelineRayRecursionDepth) - 1) * chit_miss_size + ++ 2 * callable_size; + } + + static void +diff --git a/src/amd/vulkan/radv_pipeline_rt.h b/src/amd/vulkan/radv_pipeline_rt.h +index 99c0067325923..acfe978924a17 100644 +--- a/src/amd/vulkan/radv_pipeline_rt.h ++++ b/src/amd/vulkan/radv_pipeline_rt.h +@@ -26,6 +26,7 @@ struct radv_ray_tracing_pipeline { + unsigned group_count; + + uint32_t stack_size; ++ uint32_t traversal_stack_size; + + /* set if any shaders from this pipeline require robustness2 in the merged traversal shader */ + bool traversal_storage_robustness2 : 1; +-- +GitLab + + +From a48ee7d583587d09cf042045f5ae89d01a17f4ad Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:48:48 +0200 +Subject: [PATCH 16/71] radv/rt: Set stack size to scratch_bytes_per_wave + +--- + src/amd/vulkan/radv_pipeline_rt.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c +index 3c848361f13e3..c86e292a36244 100644 +--- a/src/amd/vulkan/radv_pipeline_rt.c ++++ b/src/amd/vulkan/radv_pipeline_rt.c +@@ -460,6 +460,9 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, + shader = radv_shader_create(device, cache, binary, keep_executable_info || dump_shader); + + if (shader) { ++ if (stack_size) ++ *stack_size += DIV_ROUND_UP(shader->config.scratch_bytes_per_wave, shader->info.wave_size); ++ + radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, binary, shader, shaders, num_shaders, + &stage->info); + +-- +GitLab + + +From 4af66a35fb348043880ebb4c46893bfd6bebb7fc Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 12:15:20 +0200 +Subject: [PATCH 17/71] radv/rt: Use radv_get_rt_shader_entrypoint instead of + nir_shader_get_entrypoint + +--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 2 +- + src/amd/vulkan/radv_pipeline_rt.c | 2 +- + src/amd/vulkan/radv_shader.h | 9 +++++++++ + 3 files changed, 11 insertions(+), 2 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 3f50c7297baae..931c8c3e10ab1 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -1610,7 +1610,7 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin + radv_build_end_trace_token(b, vars, original_tmax, nir_load_var(b, trav_vars.hit), + nir_load_var(b, iteration_instance_count)); + +- nir_metadata_preserve(nir_shader_get_entrypoint(b->shader), nir_metadata_none); ++ nir_metadata_preserve(radv_get_rt_shader_entrypoint(b->shader), 
nir_metadata_none); + radv_nir_lower_hit_attrib_derefs(b->shader); + + /* Register storage for hit attributes */ +diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c +index c86e292a36244..c4feea4a6f95b 100644 +--- a/src/amd/vulkan/radv_pipeline_rt.c ++++ b/src/amd/vulkan/radv_pipeline_rt.c +@@ -424,7 +424,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, + pipeline, monolithic, traversal_stage_info); + + /* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */ +- nir_shader_gather_info(temp_stage.nir, nir_shader_get_entrypoint(temp_stage.nir)); ++ nir_shader_gather_info(temp_stage.nir, radv_get_rt_shader_entrypoint(temp_stage.nir)); + + radv_optimize_nir(temp_stage.nir, stage->key.optimisations_disabled); + radv_postprocess_nir(device, NULL, &temp_stage); +diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h +index 968ebbe6d4af4..36ad1d0dd8bf9 100644 +--- a/src/amd/vulkan/radv_shader.h ++++ b/src/amd/vulkan/radv_shader.h +@@ -507,6 +507,15 @@ struct radv_shader_stage; + void radv_optimize_nir(struct nir_shader *shader, bool optimize_conservatively); + void radv_optimize_nir_algebraic(nir_shader *shader, bool opt_offsets, bool opt_mqsad); + ++static inline nir_function_impl * ++radv_get_rt_shader_entrypoint(nir_shader *shader) ++{ ++ nir_foreach_function_impl (impl, shader) ++ if (impl->function->is_entrypoint || impl->function->is_exported) ++ return impl; ++ return NULL; ++} ++ + void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset); + + struct radv_ray_tracing_stage_info; +-- +GitLab + + +From 38ac43cce19772daf5b566eee5128805a90e75a7 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Fri, 4 Oct 2024 05:48:26 +0200 +Subject: [PATCH 18/71] radv/rt: Only lower vars to explicit types for + monolithic shaders + +--- + src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c | 2 -- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 2 ++ + 2 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c +index 9db157dd4baf0..7efcad3675c6b 100644 +--- a/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c ++++ b/src/amd/vulkan/nir/radv_nir_lower_hit_attrib_derefs.c +@@ -133,8 +133,6 @@ radv_nir_lower_rt_vars(nir_shader *shader, nir_variable_mode mode, load_intrin_c + + progress |= nir_lower_indirect_derefs(shader, mode, UINT32_MAX); + +- NIR_PASS(_, shader, nir_lower_vars_to_explicit_types, mode, glsl_get_natural_size_align_bytes); +- + if (shader->info.stage == MESA_SHADER_RAYGEN && mode == nir_var_function_temp) + progress |= nir_shader_intrinsics_pass(shader, radv_lower_payload_arg_to_offset, nir_metadata_control_flow, NULL); + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 931c8c3e10ab1..c2b0e99f74129 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -851,6 +851,8 @@ radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset) + + NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset); + } else { ++ NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_natural_size_align_bytes); ++ NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_shader_temp, glsl_get_natural_size_align_bytes); + NIR_PASS(_, nir, radv_nir_lower_ray_payload_derefs, payload_offset); 
+ } + } +-- +GitLab + + +From c75c5ab22d84c3e168f0879aca26412d0d6d3668 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:54:05 +0200 +Subject: [PATCH 19/71] radv/rt: Lower monolithic ray payload load/store + instructions + +--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 98 +++++++++++++++++-------- + src/amd/vulkan/radv_shader.h | 2 +- + 2 files changed, 69 insertions(+), 31 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index c2b0e99f74129..061c58d45949f 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -731,12 +731,13 @@ lower_rt_instructions(nir_shader *shader, struct rt_variables *vars, bool late_l + nir_shader_instructions_pass(shader, radv_lower_rt_instruction, nir_metadata_none, &data); + } + +-/* Lowers hit attributes to registers or shared memory. If hit_attribs is NULL, attributes are ++/* Lowers RT I/O vars to registers or shared memory. If hit_attribs is NULL, attributes are + * lowered to shared memory. */ + static void +-lower_hit_attribs(nir_shader *shader, nir_variable **hit_attribs, uint32_t workgroup_size) ++lower_rt_storage(nir_shader *shader, nir_variable **hit_attribs, nir_deref_instr **payload_in, ++ nir_variable **payload_out, uint32_t workgroup_size) + { +- nir_function_impl *impl = nir_shader_get_entrypoint(shader); ++ nir_function_impl *impl = radv_get_rt_shader_entrypoint(shader); + + nir_foreach_variable_with_modes (attrib, shader, nir_var_ray_hit_attrib) + attrib->data.mode = nir_var_shader_temp; +@@ -750,29 +751,55 @@ lower_hit_attribs(nir_shader *shader, nir_variable **hit_attribs, uint32_t workg + + nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); + if (intrin->intrinsic != nir_intrinsic_load_hit_attrib_amd && +- intrin->intrinsic != nir_intrinsic_store_hit_attrib_amd) ++ intrin->intrinsic != nir_intrinsic_store_hit_attrib_amd && ++ intrin->intrinsic != nir_intrinsic_load_incoming_ray_payload_amd && ++ intrin->intrinsic != nir_intrinsic_store_incoming_ray_payload_amd && ++ intrin->intrinsic != nir_intrinsic_load_outgoing_ray_payload_amd && ++ intrin->intrinsic != nir_intrinsic_store_outgoing_ray_payload_amd) + continue; + + b.cursor = nir_after_instr(instr); + +- nir_def *offset; +- if (!hit_attribs) +- offset = nir_imul_imm( +- &b, nir_iadd_imm(&b, nir_load_local_invocation_index(&b), nir_intrinsic_base(intrin) * workgroup_size), +- sizeof(uint32_t)); +- +- if (intrin->intrinsic == nir_intrinsic_load_hit_attrib_amd) { +- nir_def *ret; +- if (hit_attribs) +- ret = nir_load_var(&b, hit_attribs[nir_intrinsic_base(intrin)]); ++ if (intrin->intrinsic == nir_intrinsic_load_hit_attrib_amd || ++ intrin->intrinsic == nir_intrinsic_store_hit_attrib_amd) { ++ nir_def *offset; ++ if (!hit_attribs) ++ offset = nir_imul_imm( ++ &b, ++ nir_iadd_imm(&b, nir_load_local_invocation_index(&b), nir_intrinsic_base(intrin) * workgroup_size), ++ sizeof(uint32_t)); ++ ++ if (intrin->intrinsic == nir_intrinsic_load_hit_attrib_amd) { ++ nir_def *ret; ++ if (hit_attribs) ++ ret = nir_load_var(&b, hit_attribs[nir_intrinsic_base(intrin)]); ++ else ++ ret = nir_load_shared(&b, 1, 32, offset, .base = 0, .align_mul = 4); ++ nir_def_rewrite_uses(nir_instr_def(instr), ret); ++ } else { ++ if (hit_attribs) ++ nir_store_var(&b, hit_attribs[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1); ++ else ++ nir_store_shared(&b, intrin->src->ssa, offset, .base = 0, .align_mul = 4); ++ } ++ } else if (intrin->intrinsic == 
nir_intrinsic_load_incoming_ray_payload_amd || ++ intrin->intrinsic == nir_intrinsic_store_incoming_ray_payload_amd) { ++ if (!payload_in) ++ continue; ++ if (intrin->intrinsic == nir_intrinsic_load_incoming_ray_payload_amd) ++ nir_def_rewrite_uses(nir_instr_def(instr), nir_load_deref(&b, payload_in[nir_intrinsic_base(intrin)])); + else +- ret = nir_load_shared(&b, 1, 32, offset, .base = 0, .align_mul = 4); +- nir_def_rewrite_uses(nir_instr_def(instr), ret); +- } else { +- if (hit_attribs) +- nir_store_var(&b, hit_attribs[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1); ++ nir_store_deref(&b, payload_in[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1); ++ } else if (intrin->intrinsic == nir_intrinsic_load_outgoing_ray_payload_amd || ++ intrin->intrinsic == nir_intrinsic_store_outgoing_ray_payload_amd) { ++ if (!payload_out) ++ continue; ++ if (intrin->intrinsic == nir_intrinsic_load_outgoing_ray_payload_amd) ++ nir_def_rewrite_uses(nir_instr_def(instr), nir_load_var(&b, payload_out[nir_intrinsic_base(intrin)])); + else +- nir_store_shared(&b, intrin->src->ssa, offset, .base = 0, .align_mul = 4); ++ nir_store_var(&b, payload_out[nir_intrinsic_base(intrin)], intrin->src->ssa, 0x1); ++ } else { ++ continue; + } + nir_instr_remove(instr); + } +@@ -1620,10 +1647,9 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin + + if (!monolithic) { + for (uint32_t i = 0; i < ARRAY_SIZE(hit_attribs); i++) +- hit_attribs[i] = +- nir_local_variable_create(nir_shader_get_entrypoint(b->shader), glsl_uint_type(), "ahit_attrib"); ++ hit_attribs[i] = nir_variable_create(b->shader, nir_var_shader_temp, glsl_uint_type(), "ahit_attrib"); + +- lower_hit_attribs(b->shader, hit_attribs, pdev->rt_wave_size); ++ lower_rt_storage(b->shader, hit_attribs, NULL, NULL, pdev->rt_wave_size); + } + + /* Initialize follow-up shader. 
*/ +@@ -1819,10 +1845,11 @@ radv_count_hit_attrib_slots(nir_builder *b, nir_intrinsic_instr *instr, void *da + + static void + lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, +- struct radv_ray_tracing_pipeline *pipeline, +- const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, struct rt_variables *vars) ++ struct radv_ray_tracing_pipeline *pipeline, const struct radv_shader_info *info, ++ const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, uint32_t payload_size, ++ struct rt_variables *vars) + { +- nir_function_impl *impl = nir_shader_get_entrypoint(shader); ++ nir_function_impl *impl = radv_get_rt_shader_entrypoint(shader); + + struct lower_rt_instruction_monolithic_state state = { + .device = device, +@@ -1842,7 +1869,17 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, + for (uint32_t i = 0; i < hit_attrib_count; i++) + hit_attribs[i] = nir_local_variable_create(impl, glsl_uint_type(), "ahit_attrib"); + +- lower_hit_attribs(shader, hit_attribs, 0); ++ nir_builder b = nir_builder_create(impl); ++ b.cursor = nir_before_impl(impl); ++ nir_variable **payload_vars = rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4)); ++ nir_deref_instr **payload_storage = ++ rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4)); ++ for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) { ++ payload_vars[i] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "_payload"); ++ payload_storage[i] = nir_build_deref_var(&b, payload_vars[i]); ++ } ++ ++ lower_rt_storage(shader, hit_attribs, payload_storage, payload_vars, info->wave_size); + } + + static void +@@ -1857,8 +1894,9 @@ radv_store_arg(nir_builder *b, const struct radv_shader_args *args, const struct + void + radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *stack_size, +- bool resume_shader, struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline, +- bool monolithic, const struct radv_ray_tracing_stage_info *traversal_info) ++ bool resume_shader, uint32_t payload_size, struct radv_device *device, ++ struct radv_ray_tracing_pipeline *pipeline, bool monolithic, ++ const struct radv_ray_tracing_stage_info *traversal_info) + { + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + +@@ -1867,7 +1905,7 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH + struct rt_variables vars = create_rt_variables(shader, device, create_flags, monolithic); + + if (monolithic) +- lower_rt_instructions_monolithic(shader, device, pipeline, pCreateInfo, &vars); ++ lower_rt_instructions_monolithic(shader, device, pipeline, info, pCreateInfo, payload_size, &vars); + + struct radv_rt_shader_info rt_info = {0}; + +diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h +index 36ad1d0dd8bf9..4ba7e36d16952 100644 +--- a/src/amd/vulkan/radv_shader.h ++++ b/src/amd/vulkan/radv_shader.h +@@ -522,7 +522,7 @@ struct radv_ray_tracing_stage_info; + + void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + const struct radv_shader_args *args, const struct radv_shader_info *info, +- uint32_t *stack_size, bool resume_shader, struct radv_device *device, ++ uint32_t *stack_size, bool resume_shader, uint32_t payload_size, struct radv_device *device, + struct radv_ray_tracing_pipeline *pipeline, bool 
monolithic, + const struct radv_ray_tracing_stage_info *traversal_info); + +-- +GitLab + + +From 1ef679cac11353eba65d518f0728747550d40926 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 17 Jun 2024 13:02:28 +0200 +Subject: [PATCH 20/71] aco: Add function call attributes + +ACO needs RADV to set certain attributes on NIR functions to help with +compilation of function calls. +--- + src/amd/compiler/aco_nir_call_attribs.h | 29 +++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + create mode 100644 src/amd/compiler/aco_nir_call_attribs.h + +diff --git a/src/amd/compiler/aco_nir_call_attribs.h b/src/amd/compiler/aco_nir_call_attribs.h +new file mode 100644 +index 0000000000000..33dc011914cd9 +--- /dev/null ++++ b/src/amd/compiler/aco_nir_call_attribs.h +@@ -0,0 +1,29 @@ ++/* ++ * Copyright © 2024 Valve Corporation ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#ifndef ACO_NIR_CALL_ATTRIBS_H ++#define ACO_NIR_CALL_ATTRIBS_H ++ ++enum aco_nir_call_abi { ++ ACO_NIR_CALL_ABI_RT_RECURSIVE, ++ ACO_NIR_CALL_ABI_TRAVERSAL, ++ ACO_NIR_CALL_ABI_AHIT_ISEC, ++}; ++ ++enum aco_nir_function_attribs { ++ ACO_NIR_FUNCTION_ATTRIB_ABI_MASK = 0x7F, ++ /* Different lanes can have different values for the function pointer to call */ ++ ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL = 0x1 << 7, ++ /* Function will never return */ ++ ACO_NIR_FUNCTION_ATTRIB_NORETURN = 0x2 << 7, ++}; ++ ++enum aco_nir_parameter_attribs { ++ /* Parameter value is not used by any callee and does not need to be preserved */ ++ ACO_NIR_PARAM_ATTRIB_DISCARDABLE = 0x1, ++}; ++ ++#endif /* ACO_NIR_CALL_ATTRIBS_H */ +-- +GitLab + + +From 10abf8a72b902de027999226432bca4621cde2de Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Thu, 3 Oct 2024 12:34:25 +0200 +Subject: [PATCH 21/71] radv/rt: Lower descriptor loads to param loads + +--- + .../nir/radv_nir_apply_pipeline_layout.c | 46 +++++++++++++++++-- + 1 file changed, 42 insertions(+), 4 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c b/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c +index fd67c3eb18f5e..991cc31eadafd 100644 +--- a/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c ++++ b/src/amd/vulkan/nir/radv_nir_apply_pipeline_layout.c +@@ -5,6 +5,7 @@ + */ + #include "ac_descriptors.h" + #include "ac_shader_util.h" ++#include "aco_nir_call_attribs.h" + #include "nir.h" + #include "nir_builder.h" + #include "radv_descriptor_set.h" +@@ -34,6 +35,42 @@ get_scalar_arg(nir_builder *b, unsigned size, struct ac_arg arg) + return nir_load_scalar_arg_amd(b, size, .base = arg.arg_index); + } + ++static nir_def * ++get_indirect_descriptors_addr(nir_builder *b, apply_layout_state *state) ++{ ++ switch (b->shader->info.stage) { ++ case MESA_SHADER_RAYGEN: ++ case MESA_SHADER_CALLABLE: ++ return nir_load_param(b, RAYGEN_ARG_DESCRIPTORS); ++ case MESA_SHADER_INTERSECTION: ++ return nir_load_param(b, TRAVERSAL_ARG_DESCRIPTORS); ++ case MESA_SHADER_CLOSEST_HIT: ++ case MESA_SHADER_MISS: ++ return nir_load_param(b, CHIT_MISS_ARG_DESCRIPTORS); ++ default: ++ assert(!gl_shader_stage_is_rt(b->shader->info.stage)); ++ return get_scalar_arg(b, 1, state->args->descriptor_sets[0]); ++ } ++} ++ ++static nir_def * ++get_indirect_push_constants_addr(nir_builder *b, apply_layout_state *state) ++{ ++ switch (b->shader->info.stage) { ++ case MESA_SHADER_RAYGEN: ++ case MESA_SHADER_CALLABLE: ++ return nir_load_param(b, RAYGEN_ARG_PUSH_CONSTANTS); ++ case MESA_SHADER_INTERSECTION: ++ return nir_load_param(b, TRAVERSAL_ARG_PUSH_CONSTANTS); ++ case 
MESA_SHADER_CLOSEST_HIT: ++ case MESA_SHADER_MISS: ++ return nir_load_param(b, CHIT_MISS_ARG_PUSH_CONSTANTS); ++ default: ++ assert(!gl_shader_stage_is_rt(b->shader->info.stage)); ++ return get_scalar_arg(b, 1, state->args->ac.push_constants); ++ } ++} ++ + static nir_def * + convert_pointer_to_64_bit(nir_builder *b, apply_layout_state *state, nir_def *ptr) + { +@@ -44,8 +81,9 @@ static nir_def * + load_desc_ptr(nir_builder *b, apply_layout_state *state, unsigned set) + { + const struct radv_userdata_locations *user_sgprs_locs = &state->info->user_sgprs_locs; +- if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) { +- nir_def *addr = get_scalar_arg(b, 1, state->args->descriptor_sets[0]); ++ if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1 || ++ gl_shader_stage_is_rt(b->shader->info.stage)) { ++ nir_def *addr = get_indirect_descriptors_addr(b, state); + addr = convert_pointer_to_64_bit(b, state, addr); + return nir_load_smem_amd(b, 1, addr, nir_imm_int(b, set * 4)); + } +@@ -67,7 +105,7 @@ visit_vulkan_resource_index(nir_builder *b, apply_layout_state *state, nir_intri + if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || + layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { + unsigned idx = state->layout->set[desc_set].dynamic_offset_start + layout->binding[binding].dynamic_offset_offset; +- set_ptr = get_scalar_arg(b, 1, state->args->ac.push_constants); ++ set_ptr = get_indirect_push_constants_addr(b, state); + offset = state->layout->push_constant_size + idx * 16; + stride = 16; + } else { +@@ -379,7 +417,7 @@ load_push_constant(nir_builder *b, apply_layout_state *state, nir_intrinsic_inst + } + + if (!offset) { +- addr = get_scalar_arg(b, 1, state->args->ac.push_constants); ++ addr = get_indirect_push_constants_addr(b, state); + addr = convert_pointer_to_64_bit(b, state, addr); + offset = nir_iadd_imm_nuw(b, intrin->src[0].ssa, base); + } +-- +GitLab + + +From 41079fe63f7877dacb9fd3d8dc67740ed100439e Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:56:09 +0200 +Subject: [PATCH 22/71] radv/rt: Create RT functions to call + +--- + src/amd/compiler/aco_nir_call_attribs.h | 59 +++++ + src/amd/vulkan/nir/radv_nir_rt_shader.c | 276 +++++++++++++++++++++++- + 2 files changed, 331 insertions(+), 4 deletions(-) + +diff --git a/src/amd/compiler/aco_nir_call_attribs.h b/src/amd/compiler/aco_nir_call_attribs.h +index 33dc011914cd9..a879c51ebb3c2 100644 +--- a/src/amd/compiler/aco_nir_call_attribs.h ++++ b/src/amd/compiler/aco_nir_call_attribs.h +@@ -26,4 +26,63 @@ enum aco_nir_parameter_attribs { + ACO_NIR_PARAM_ATTRIB_DISCARDABLE = 0x1, + }; + ++enum aco_nir_raygen_function_arg { ++ RAYGEN_ARG_LAUNCH_ID = 0, ++ RAYGEN_ARG_LAUNCH_SIZE, ++ RAYGEN_ARG_DESCRIPTORS, ++ RAYGEN_ARG_PUSH_CONSTANTS, ++ RAYGEN_ARG_SBT_DESCRIPTORS, ++ RAYGEN_ARG_TRAVERSAL_ADDR, ++ RAYGEN_ARG_SHADER_RECORD_PTR, ++ RAYGEN_ARG_COUNT, ++}; ++ ++enum aco_nir_traversal_function_arg { ++ TRAVERSAL_ARG_LAUNCH_ID = 0, ++ TRAVERSAL_ARG_LAUNCH_SIZE, ++ TRAVERSAL_ARG_DESCRIPTORS, ++ TRAVERSAL_ARG_PUSH_CONSTANTS, ++ TRAVERSAL_ARG_SBT_DESCRIPTORS, ++ TRAVERSAL_ARG_TRAVERSAL_ADDR, ++ TRAVERSAL_ARG_SHADER_RECORD_PTR, ++ TRAVERSAL_ARG_ACCEL_STRUCT, ++ TRAVERSAL_ARG_CULL_MASK_AND_FLAGS, ++ TRAVERSAL_ARG_SBT_OFFSET, ++ TRAVERSAL_ARG_SBT_STRIDE, ++ TRAVERSAL_ARG_MISS_INDEX, ++ TRAVERSAL_ARG_RAY_ORIGIN, ++ TRAVERSAL_ARG_RAY_TMIN, ++ TRAVERSAL_ARG_RAY_DIRECTION, ++ TRAVERSAL_ARG_RAY_TMAX, ++ 
TRAVERSAL_ARG_PRIMITIVE_ID, ++ TRAVERSAL_ARG_INSTANCE_ADDR, ++ TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS, ++ TRAVERSAL_ARG_HIT_KIND, ++ TRAVERSAL_ARG_PAYLOAD_BASE, ++}; ++ ++enum aco_nir_chit_miss_function_arg { ++ CHIT_MISS_ARG_LAUNCH_ID = 0, ++ CHIT_MISS_ARG_LAUNCH_SIZE, ++ CHIT_MISS_ARG_DESCRIPTORS, ++ CHIT_MISS_ARG_PUSH_CONSTANTS, ++ CHIT_MISS_ARG_SBT_DESCRIPTORS, ++ CHIT_MISS_ARG_TRAVERSAL_ADDR, ++ CHIT_MISS_ARG_SHADER_RECORD_PTR, ++ CHIT_MISS_ARG_ACCEL_STRUCT, ++ CHIT_MISS_ARG_CULL_MASK_AND_FLAGS, ++ CHIT_MISS_ARG_SBT_OFFSET, ++ CHIT_MISS_ARG_SBT_STRIDE, ++ CHIT_MISS_ARG_MISS_INDEX, ++ CHIT_MISS_ARG_RAY_ORIGIN, ++ CHIT_MISS_ARG_RAY_TMIN, ++ CHIT_MISS_ARG_RAY_DIRECTION, ++ CHIT_MISS_ARG_RAY_TMAX, ++ CHIT_MISS_ARG_PRIMITIVE_ID, ++ CHIT_MISS_ARG_INSTANCE_ADDR, ++ CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS, ++ CHIT_MISS_ARG_HIT_KIND, ++ CHIT_MISS_ARG_PAYLOAD_BASE, ++}; ++ + #endif /* ACO_NIR_CALL_ATTRIBS_H */ +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 061c58d45949f..165c7e18578e0 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -16,6 +16,8 @@ + #include "radv_pipeline_rt.h" + #include "radv_shader.h" + ++#include "aco_nir_call_attribs.h" ++ + #include "vk_pipeline.h" + + /* Traversal stack size. This stack is put in LDS and experimentally 16 entries results in best +@@ -164,6 +166,243 @@ lower_rt_derefs(nir_shader *shader) + return progress; + } + ++static void ++radv_nir_init_rt_function_params(nir_function *function, gl_shader_stage stage, unsigned payload_size) ++{ ++ unsigned payload_base = -1u; ++ ++ switch (stage) { ++ case MESA_SHADER_RAYGEN: ++ function->num_params = RAYGEN_ARG_COUNT; ++ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params); ++ function->params[RAYGEN_ARG_LAUNCH_ID].num_components = 3; ++ function->params[RAYGEN_ARG_LAUNCH_ID].bit_size = 32; ++ function->params[RAYGEN_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3); ++ function->params[RAYGEN_ARG_LAUNCH_SIZE].num_components = 3; ++ function->params[RAYGEN_ARG_LAUNCH_SIZE].bit_size = 32; ++ function->params[RAYGEN_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3); ++ function->params[RAYGEN_ARG_LAUNCH_SIZE].is_uniform = true; ++ function->params[RAYGEN_ARG_DESCRIPTORS].num_components = 1; ++ function->params[RAYGEN_ARG_DESCRIPTORS].bit_size = 32; ++ function->params[RAYGEN_ARG_DESCRIPTORS].type = glsl_uint_type(); ++ function->params[RAYGEN_ARG_DESCRIPTORS].is_uniform = true; ++ function->params[RAYGEN_ARG_PUSH_CONSTANTS].num_components = 1; ++ function->params[RAYGEN_ARG_PUSH_CONSTANTS].bit_size = 32; ++ function->params[RAYGEN_ARG_PUSH_CONSTANTS].type = glsl_uint_type(); ++ function->params[RAYGEN_ARG_PUSH_CONSTANTS].is_uniform = true; ++ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].num_components = 1; ++ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].bit_size = 64; ++ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type(); ++ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].is_uniform = true; ++ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].num_components = 1; ++ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].bit_size = 64; ++ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type(); ++ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].is_uniform = true; ++ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].num_components = 1; ++ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].bit_size = 64; ++ 
function->params[RAYGEN_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type(); ++ function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_NORETURN; ++ break; ++ case MESA_SHADER_CALLABLE: ++ function->num_params = RAYGEN_ARG_COUNT + DIV_ROUND_UP(payload_size, 4); ++ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params); ++ function->params[RAYGEN_ARG_LAUNCH_ID].num_components = 3; ++ function->params[RAYGEN_ARG_LAUNCH_ID].bit_size = 32; ++ function->params[RAYGEN_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3); ++ function->params[RAYGEN_ARG_LAUNCH_SIZE].num_components = 3; ++ function->params[RAYGEN_ARG_LAUNCH_SIZE].bit_size = 32; ++ function->params[RAYGEN_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3); ++ function->params[RAYGEN_ARG_LAUNCH_SIZE].is_uniform = true; ++ function->params[RAYGEN_ARG_DESCRIPTORS].num_components = 1; ++ function->params[RAYGEN_ARG_DESCRIPTORS].bit_size = 32; ++ function->params[RAYGEN_ARG_DESCRIPTORS].type = glsl_uint_type(); ++ function->params[RAYGEN_ARG_DESCRIPTORS].is_uniform = true; ++ function->params[RAYGEN_ARG_PUSH_CONSTANTS].num_components = 1; ++ function->params[RAYGEN_ARG_PUSH_CONSTANTS].bit_size = 32; ++ function->params[RAYGEN_ARG_PUSH_CONSTANTS].type = glsl_uint_type(); ++ function->params[RAYGEN_ARG_PUSH_CONSTANTS].is_uniform = true; ++ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].num_components = 1; ++ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].bit_size = 64; ++ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type(); ++ function->params[RAYGEN_ARG_SBT_DESCRIPTORS].is_uniform = true; ++ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].num_components = 1; ++ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].bit_size = 64; ++ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type(); ++ function->params[RAYGEN_ARG_TRAVERSAL_ADDR].is_uniform = true; ++ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].num_components = 1; ++ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].bit_size = 64; ++ function->params[RAYGEN_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type(); ++ ++ function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL; ++ payload_base = RAYGEN_ARG_COUNT; ++ break; ++ case MESA_SHADER_INTERSECTION: ++ function->num_params = TRAVERSAL_ARG_PAYLOAD_BASE + DIV_ROUND_UP(payload_size, 4); ++ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params); ++ function->params[TRAVERSAL_ARG_LAUNCH_ID].num_components = 3; ++ function->params[TRAVERSAL_ARG_LAUNCH_ID].bit_size = 32; ++ function->params[TRAVERSAL_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3); ++ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].num_components = 3; ++ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].bit_size = 32; ++ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3); ++ function->params[TRAVERSAL_ARG_LAUNCH_SIZE].is_uniform = true; ++ function->params[TRAVERSAL_ARG_DESCRIPTORS].num_components = 1; ++ function->params[TRAVERSAL_ARG_DESCRIPTORS].bit_size = 32; ++ function->params[TRAVERSAL_ARG_DESCRIPTORS].type = glsl_uint_type(); ++ function->params[TRAVERSAL_ARG_DESCRIPTORS].is_uniform = true; ++ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].num_components = 1; ++ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].bit_size = 32; ++ function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].type = glsl_uint_type(); ++ 
function->params[TRAVERSAL_ARG_PUSH_CONSTANTS].is_uniform = true; ++ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].num_components = 1; ++ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].bit_size = 64; ++ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type(); ++ function->params[TRAVERSAL_ARG_SBT_DESCRIPTORS].is_uniform = true; ++ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].num_components = 1; ++ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].bit_size = 64; ++ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type(); ++ function->params[TRAVERSAL_ARG_TRAVERSAL_ADDR].is_uniform = true; ++ function->params[TRAVERSAL_ARG_SHADER_RECORD_PTR].num_components = 1; ++ function->params[TRAVERSAL_ARG_SHADER_RECORD_PTR].bit_size = 64; ++ function->params[TRAVERSAL_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type(); ++ function->params[TRAVERSAL_ARG_ACCEL_STRUCT].num_components = 1; ++ function->params[TRAVERSAL_ARG_ACCEL_STRUCT].bit_size = 64; ++ function->params[TRAVERSAL_ARG_ACCEL_STRUCT].type = glsl_uint64_t_type(); ++ function->params[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS].num_components = 1; ++ function->params[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS].bit_size = 32; ++ function->params[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS].type = glsl_uint_type(); ++ function->params[TRAVERSAL_ARG_SBT_OFFSET].num_components = 1; ++ function->params[TRAVERSAL_ARG_SBT_OFFSET].bit_size = 32; ++ function->params[TRAVERSAL_ARG_SBT_OFFSET].type = glsl_uint_type(); ++ function->params[TRAVERSAL_ARG_SBT_STRIDE].num_components = 1; ++ function->params[TRAVERSAL_ARG_SBT_STRIDE].bit_size = 32; ++ function->params[TRAVERSAL_ARG_SBT_STRIDE].type = glsl_uint_type(); ++ function->params[TRAVERSAL_ARG_MISS_INDEX].num_components = 1; ++ function->params[TRAVERSAL_ARG_MISS_INDEX].bit_size = 32; ++ function->params[TRAVERSAL_ARG_MISS_INDEX].type = glsl_uint_type(); ++ function->params[TRAVERSAL_ARG_RAY_ORIGIN].num_components = 3; ++ function->params[TRAVERSAL_ARG_RAY_ORIGIN].bit_size = 32; ++ function->params[TRAVERSAL_ARG_RAY_ORIGIN].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); ++ function->params[TRAVERSAL_ARG_RAY_TMIN].num_components = 1; ++ function->params[TRAVERSAL_ARG_RAY_TMIN].bit_size = 32; ++ function->params[TRAVERSAL_ARG_RAY_TMIN].type = glsl_float_type(); ++ function->params[TRAVERSAL_ARG_RAY_DIRECTION].num_components = 3; ++ function->params[TRAVERSAL_ARG_RAY_DIRECTION].bit_size = 32; ++ function->params[TRAVERSAL_ARG_RAY_DIRECTION].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); ++ function->params[TRAVERSAL_ARG_RAY_TMAX].num_components = 1; ++ function->params[TRAVERSAL_ARG_RAY_TMAX].bit_size = 32; ++ function->params[TRAVERSAL_ARG_RAY_TMAX].type = glsl_float_type(); ++ function->params[TRAVERSAL_ARG_PRIMITIVE_ID].num_components = 1; ++ function->params[TRAVERSAL_ARG_PRIMITIVE_ID].bit_size = 32; ++ function->params[TRAVERSAL_ARG_PRIMITIVE_ID].type = glsl_uint_type(); ++ function->params[TRAVERSAL_ARG_INSTANCE_ADDR].num_components = 1; ++ function->params[TRAVERSAL_ARG_INSTANCE_ADDR].bit_size = 64; ++ function->params[TRAVERSAL_ARG_INSTANCE_ADDR].type = glsl_uint64_t_type(); ++ function->params[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS].num_components = 1; ++ function->params[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS].bit_size = 32; ++ function->params[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS].type = glsl_uint_type(); ++ function->params[TRAVERSAL_ARG_HIT_KIND].num_components = 1; ++ function->params[TRAVERSAL_ARG_HIT_KIND].bit_size = 32; ++ function->params[TRAVERSAL_ARG_HIT_KIND].type = glsl_uint_type(); 
++ ++ function->driver_attributes = ACO_NIR_CALL_ABI_TRAVERSAL; ++ payload_base = TRAVERSAL_ARG_PAYLOAD_BASE; ++ break; ++ case MESA_SHADER_CLOSEST_HIT: ++ case MESA_SHADER_MISS: ++ function->num_params = CHIT_MISS_ARG_PAYLOAD_BASE + DIV_ROUND_UP(payload_size, 4); ++ function->params = rzalloc_array_size(function->shader, sizeof(nir_parameter), function->num_params); ++ function->params[CHIT_MISS_ARG_LAUNCH_ID].num_components = 3; ++ function->params[CHIT_MISS_ARG_LAUNCH_ID].bit_size = 32; ++ function->params[CHIT_MISS_ARG_LAUNCH_ID].type = glsl_vector_type(GLSL_TYPE_UINT, 3); ++ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].num_components = 3; ++ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].bit_size = 32; ++ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].type = glsl_vector_type(GLSL_TYPE_UINT, 3); ++ function->params[CHIT_MISS_ARG_LAUNCH_SIZE].is_uniform = true; ++ function->params[CHIT_MISS_ARG_DESCRIPTORS].num_components = 1; ++ function->params[CHIT_MISS_ARG_DESCRIPTORS].bit_size = 32; ++ function->params[CHIT_MISS_ARG_DESCRIPTORS].type = glsl_uint_type(); ++ function->params[CHIT_MISS_ARG_DESCRIPTORS].is_uniform = true; ++ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].num_components = 1; ++ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].bit_size = 32; ++ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].type = glsl_uint_type(); ++ function->params[CHIT_MISS_ARG_PUSH_CONSTANTS].is_uniform = true; ++ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].num_components = 1; ++ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].bit_size = 64; ++ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].type = glsl_uint64_t_type(); ++ function->params[CHIT_MISS_ARG_SBT_DESCRIPTORS].is_uniform = true; ++ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].num_components = 1; ++ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].bit_size = 64; ++ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].type = glsl_uint64_t_type(); ++ function->params[CHIT_MISS_ARG_TRAVERSAL_ADDR].is_uniform = true; ++ function->params[CHIT_MISS_ARG_SHADER_RECORD_PTR].num_components = 1; ++ function->params[CHIT_MISS_ARG_SHADER_RECORD_PTR].bit_size = 64; ++ function->params[CHIT_MISS_ARG_SHADER_RECORD_PTR].type = glsl_uint64_t_type(); ++ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].num_components = 1; ++ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].bit_size = 64; ++ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE; ++ function->params[CHIT_MISS_ARG_ACCEL_STRUCT].type = glsl_uint64_t_type(); ++ function->params[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS].num_components = 1; ++ function->params[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS].bit_size = 32; ++ function->params[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS].type = glsl_uint_type(); ++ function->params[CHIT_MISS_ARG_SBT_OFFSET].num_components = 1; ++ function->params[CHIT_MISS_ARG_SBT_OFFSET].bit_size = 32; ++ function->params[CHIT_MISS_ARG_SBT_OFFSET].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE; ++ function->params[CHIT_MISS_ARG_SBT_OFFSET].type = glsl_uint_type(); ++ function->params[CHIT_MISS_ARG_SBT_STRIDE].num_components = 1; ++ function->params[CHIT_MISS_ARG_SBT_STRIDE].bit_size = 32; ++ function->params[CHIT_MISS_ARG_SBT_STRIDE].driver_attributes = ACO_NIR_PARAM_ATTRIB_DISCARDABLE; ++ function->params[CHIT_MISS_ARG_SBT_STRIDE].type = glsl_uint_type(); ++ function->params[CHIT_MISS_ARG_MISS_INDEX].num_components = 1; ++ function->params[CHIT_MISS_ARG_MISS_INDEX].bit_size = 32; ++ function->params[CHIT_MISS_ARG_MISS_INDEX].driver_attributes = 
ACO_NIR_PARAM_ATTRIB_DISCARDABLE; ++ function->params[CHIT_MISS_ARG_MISS_INDEX].type = glsl_uint_type(); ++ function->params[CHIT_MISS_ARG_RAY_ORIGIN].num_components = 3; ++ function->params[CHIT_MISS_ARG_RAY_ORIGIN].bit_size = 32; ++ function->params[CHIT_MISS_ARG_RAY_ORIGIN].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); ++ function->params[CHIT_MISS_ARG_RAY_TMIN].num_components = 1; ++ function->params[CHIT_MISS_ARG_RAY_TMIN].bit_size = 32; ++ function->params[CHIT_MISS_ARG_RAY_TMIN].type = glsl_float_type(); ++ function->params[CHIT_MISS_ARG_RAY_DIRECTION].num_components = 3; ++ function->params[CHIT_MISS_ARG_RAY_DIRECTION].bit_size = 32; ++ function->params[CHIT_MISS_ARG_RAY_DIRECTION].type = glsl_vector_type(GLSL_TYPE_FLOAT, 3); ++ function->params[CHIT_MISS_ARG_RAY_TMAX].num_components = 1; ++ function->params[CHIT_MISS_ARG_RAY_TMAX].bit_size = 32; ++ function->params[CHIT_MISS_ARG_RAY_TMAX].type = glsl_float_type(); ++ function->params[CHIT_MISS_ARG_PRIMITIVE_ID].num_components = 1; ++ function->params[CHIT_MISS_ARG_PRIMITIVE_ID].bit_size = 32; ++ function->params[CHIT_MISS_ARG_PRIMITIVE_ID].type = glsl_uint_type(); ++ function->params[CHIT_MISS_ARG_INSTANCE_ADDR].num_components = 1; ++ function->params[CHIT_MISS_ARG_INSTANCE_ADDR].bit_size = 64; ++ function->params[CHIT_MISS_ARG_INSTANCE_ADDR].type = glsl_uint64_t_type(); ++ function->params[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS].num_components = 1; ++ function->params[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS].bit_size = 32; ++ function->params[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS].type = glsl_uint_type(); ++ function->params[CHIT_MISS_ARG_HIT_KIND].num_components = 1; ++ function->params[CHIT_MISS_ARG_HIT_KIND].bit_size = 32; ++ function->params[CHIT_MISS_ARG_HIT_KIND].type = glsl_uint_type(); ++ ++ function->driver_attributes = ACO_NIR_CALL_ABI_RT_RECURSIVE | ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL; ++ payload_base = CHIT_MISS_ARG_PAYLOAD_BASE; ++ break; ++ default: ++ unreachable("invalid RT stage"); ++ } ++ ++ if (payload_base != -1u) { ++ for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) { ++ function->params[payload_base + i].num_components = 1; ++ function->params[payload_base + i].bit_size = 32; ++ function->params[payload_base + i].is_return = true; ++ function->params[payload_base + i].type = glsl_uint_type(); ++ } ++ } ++ ++ /* Entrypoints can't have parameters. 
Consider RT stages as callable functions */ ++ function->is_exported = true; ++ function->is_entrypoint = false; ++} ++ + /* + * Global variables for an RT pipeline + */ +@@ -180,6 +419,8 @@ struct rt_variables { + nir_variable *shader_addr; + nir_variable *traversal_addr; + ++ nir_variable *sbt_descriptors; ++ + /* scratch offset of the argument area relative to stack_ptr */ + nir_variable *arg; + uint32_t payload_offset; +@@ -217,12 +458,19 @@ struct rt_variables { + nir_variable *ahit_terminate; + nir_variable *terminated; + ++ nir_variable **out_payload_storage; ++ unsigned payload_size; ++ ++ nir_function *trace_ray_func; ++ nir_function *chit_miss_func; ++ nir_function *callable_func; ++ + unsigned stack_size; + }; + + static struct rt_variables + create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipelineCreateFlags2KHR flags, +- bool monolithic) ++ unsigned max_payload_size, bool monolithic) + { + struct rt_variables vars = { + .device = device, +@@ -236,6 +484,8 @@ create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipe + vars.stack_ptr = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "stack_ptr"); + vars.shader_record_ptr = nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "shader_record_ptr"); + ++ vars.sbt_descriptors = nir_variable_create(shader, nir_var_shader_temp, glsl_uint64_t_type(), "sbt_descriptors"); ++ + vars.launch_sizes[0] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "launch_size_x"); + vars.launch_sizes[1] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "launch_size_y"); + vars.launch_sizes[2] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "launch_size_z"); +@@ -269,6 +519,23 @@ create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipe + vars.ahit_terminate = nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "ahit_terminate"); + vars.terminated = nir_variable_create(shader, nir_var_shader_temp, glsl_bool_type(), "terminated"); + ++ if (max_payload_size) ++ vars.out_payload_storage = rzalloc_array_size(shader, DIV_ROUND_UP(max_payload_size, 4), sizeof(nir_variable *)); ++ vars.payload_size = max_payload_size; ++ for (unsigned i = 0; i < DIV_ROUND_UP(max_payload_size, 4); ++i) { ++ vars.out_payload_storage[i] = ++ nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "out_payload_storage"); ++ } ++ ++ nir_function *trace_ray_func = nir_function_create(shader, "trace_ray_func"); ++ radv_nir_init_function_params(trace_ray_func, MESA_SHADER_INTERSECTION, max_payload_size); ++ vars.trace_ray_func = trace_ray_func; ++ nir_function *chit_miss_func = nir_function_create(shader, "chit_miss_func"); ++ radv_nir_init_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, max_payload_size); ++ vars.chit_miss_func = chit_miss_func; ++ nir_function *callable_func = nir_function_create(shader, "callable_func"); ++ radv_nir_init_function_params(callable_func, MESA_SHADER_CALLABLE, max_payload_size); ++ vars.callable_func = callable_func; + return vars; + } + +@@ -850,7 +1117,8 @@ insert_rt_case(nir_builder *b, nir_shader *shader, struct rt_variables *vars, ni + + nir_opt_dead_cf(shader); + +- struct rt_variables src_vars = create_rt_variables(shader, vars->device, vars->flags, vars->monolithic); ++ struct rt_variables src_vars = ++ create_rt_variables(shader, vars->device, vars->flags, vars->payload_size, vars->monolithic); + map_rt_variables(var_remap, &src_vars, vars); + 
+ NIR_PASS_V(shader, lower_rt_instructions, &src_vars, false, NULL); +@@ -1723,7 +1991,7 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_ + b.shader->info.workgroup_size[0] = 8; + b.shader->info.workgroup_size[1] = pdev->rt_wave_size == 64 ? 8 : 4; + b.shader->info.shared_size = pdev->rt_wave_size * MAX_STACK_ENTRY_COUNT * sizeof(uint32_t); +- struct rt_variables vars = create_rt_variables(b.shader, device, create_flags, false); ++ struct rt_variables vars = create_rt_variables(b.shader, device, create_flags, false, 0); + + if (info->tmin.state == RADV_RT_CONST_ARG_STATE_VALID) + nir_store_var(&b, vars.tmin, nir_imm_int(&b, info->tmin.value), 0x1); +@@ -1902,7 +2170,7 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH + + const VkPipelineCreateFlagBits2KHR create_flags = vk_rt_pipeline_create_flags(pCreateInfo); + +- struct rt_variables vars = create_rt_variables(shader, device, create_flags, monolithic); ++ struct rt_variables vars = create_rt_variables(shader, device, create_flags, payload_size, monolithic); + + if (monolithic) + lower_rt_instructions_monolithic(shader, device, pipeline, info, pCreateInfo, payload_size, &vars); +-- +GitLab + + +From 7c97b73c788dcab2347225073bac244aa8aea252 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 12:04:36 +0200 +Subject: [PATCH 23/71] radv/rt: Convert lower_rt_derefs to register payloads + +All payloads alias the same registers by the time RT functions get +called. In order to pretend that the payload variables (represented by +function_temp vars) are separate, payload values are copied to the +"global" payload variables (shader_temp variables) just before a shader +call, and copied from there immediately after the shader call. 
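+
+For illustration, the copies the pass wraps around each shader call look
+roughly like this (a condensed sketch of lower_rt_deref_var() from the hunk
+below, not new code; "call_instr" and "use" are placeholders for the
+trace_ray/execute_* intrinsic and its payload-deref source, and the
+bookkeeping around cloned_vars is omitted):
+
+    nir_builder b = nir_builder_at(nir_before_instr(call_instr));
+    nir_deref_instr *old_deref = nir_build_deref_var(&b, var);     /* original payload var, now shader_temp */
+    nir_deref_instr *new_deref = nir_build_deref_var(&b, new_var); /* function_temp clone handed to the call */
+    nir_copy_deref(&b, new_deref, old_deref);  /* keep both views in sync before the call */
+    b.cursor = nir_after_instr(call_instr);
+    nir_copy_deref(&b, old_deref, new_deref);  /* and resync them after the call returns */
+    nir_src_rewrite(use, nir_instr_def(&new_deref->instr));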
+--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 84 ++++++++++++++++++++----- + 1 file changed, 68 insertions(+), 16 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 165c7e18578e0..0ebb095f52e1c 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -126,6 +126,62 @@ radv_visit_inlined_shaders(nir_builder *b, nir_def *sbt_idx, bool can_have_null_ + free(cases); + } + ++static void ++lower_rt_deref_var(nir_shader *shader, nir_function_impl *impl, nir_instr *instr, struct hash_table *cloned_vars) ++{ ++ nir_deref_instr *deref = nir_instr_as_deref(instr); ++ nir_variable *var = deref->var; ++ struct hash_entry *entry = _mesa_hash_table_search(cloned_vars, var); ++ if (!(var->data.mode & nir_var_function_temp) && !entry) ++ return; ++ ++ hash_table_foreach (cloned_vars, cloned_entry) { ++ if (var == cloned_entry->data) ++ return; ++ } ++ ++ nir_variable *new_var; ++ if (entry) { ++ new_var = entry->data; ++ } else { ++ new_var = nir_variable_clone(var, shader); ++ _mesa_hash_table_insert(cloned_vars, var, new_var); ++ ++ exec_node_remove(&var->node); ++ var->data.mode = nir_var_shader_temp; ++ exec_list_push_tail(&shader->variables, &var->node); ++ ++ exec_list_push_tail(&impl->locals, &new_var->node); ++ } ++ ++ deref->modes = nir_var_shader_temp; ++ ++ nir_foreach_use_safe (use, nir_instr_def(instr)) { ++ if (nir_src_is_if(use)) ++ continue; ++ ++ nir_instr *parent = nir_src_parent_instr(use); ++ if (parent->type != nir_instr_type_intrinsic) ++ continue; ++ ++ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(parent); ++ if (intrin->intrinsic != nir_intrinsic_trace_ray && intrin->intrinsic != nir_intrinsic_execute_callable && ++ intrin->intrinsic != nir_intrinsic_execute_closest_hit_amd && ++ intrin->intrinsic != nir_intrinsic_execute_miss_amd) ++ continue; ++ ++ nir_builder b = nir_builder_at(nir_before_instr(parent)); ++ nir_deref_instr *old_deref = nir_build_deref_var(&b, var); ++ nir_deref_instr *new_deref = nir_build_deref_var(&b, new_var); ++ ++ nir_copy_deref(&b, new_deref, old_deref); ++ b.cursor = nir_after_instr(parent); ++ nir_copy_deref(&b, old_deref, new_deref); ++ ++ nir_src_rewrite(use, nir_instr_def(&new_deref->instr)); ++ } ++} ++ + static bool + lower_rt_derefs(nir_shader *shader) + { +@@ -133,9 +189,7 @@ lower_rt_derefs(nir_shader *shader) + + bool progress = false; + +- nir_builder b = nir_builder_at(nir_before_impl(impl)); +- +- nir_def *arg_offset = nir_load_rt_arg_scratch_offset_amd(&b); ++ struct hash_table *cloned_vars = _mesa_pointer_hash_table_create(shader); + + nir_foreach_block (block, impl) { + nir_foreach_instr_safe (instr, block) { +@@ -143,17 +197,18 @@ lower_rt_derefs(nir_shader *shader) + continue; + + nir_deref_instr *deref = nir_instr_as_deref(instr); +- if (!nir_deref_mode_is(deref, nir_var_shader_call_data)) ++ if (!nir_deref_mode_is(deref, nir_var_function_temp)) + continue; + +- deref->modes = nir_var_function_temp; +- progress = true; +- + if (deref->deref_type == nir_deref_type_var) { +- b.cursor = nir_before_instr(&deref->instr); +- nir_deref_instr *replacement = +- nir_build_deref_cast(&b, arg_offset, nir_var_function_temp, deref->var->type, 0); +- nir_def_replace(&deref->def, &replacement->def); ++ lower_rt_deref_var(shader, impl, instr, cloned_vars); ++ progress = true; ++ } else { ++ assert(deref->deref_type != nir_deref_type_cast); ++ /* Parent modes might have changed, propagate change */ ++ nir_deref_instr *parent = 
nir_src_as_deref(deref->parent); ++ if (parent->modes != deref->modes) ++ deref->modes = parent->modes; + } + } + } +@@ -1139,12 +1194,9 @@ void + radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset) + { + if (!monolithic) { +- NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp | nir_var_shader_call_data, +- glsl_get_natural_size_align_bytes); +- + NIR_PASS(_, nir, lower_rt_derefs); +- +- NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_function_temp, nir_address_format_32bit_offset); ++ NIR_PASS(_, nir, nir_split_var_copies); ++ NIR_PASS(_, nir, nir_lower_var_copies); + } else { + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_function_temp, glsl_get_natural_size_align_bytes); + NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_shader_temp, glsl_get_natural_size_align_bytes); +-- +GitLab + + +From c45e4fbee8cb3c930d13c5ce1c1478b68fdcbbb5 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 12:09:15 +0200 +Subject: [PATCH 24/71] radv/rt: Align radv_nir_lower_rt_io to new lowering + +--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 10 +++++----- + src/amd/vulkan/radv_shader.h | 2 +- + 2 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 0ebb095f52e1c..7708dd8809b79 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -1191,7 +1191,7 @@ insert_rt_case(nir_builder *b, nir_shader *shader, struct rt_variables *vars, ni + } + + void +-radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset) ++radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset, uint32_t *payload_size) + { + if (!monolithic) { + NIR_PASS(_, nir, lower_rt_derefs); +@@ -1625,7 +1625,7 @@ radv_build_ahit_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g + radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir); + assert(nir_stage); + +- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); ++ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); + + insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.any_hit_index); + ralloc_free(nir_stage); +@@ -1649,7 +1649,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g + radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->intersection_shader].nir); + assert(nir_stage); + +- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); ++ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); + + nir_shader *any_hit_stage = NULL; + if (group->any_hit_shader != VK_SHADER_UNUSED_KHR) { +@@ -1657,7 +1657,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g + radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir); + assert(any_hit_stage); + +- radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset); ++ radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset, NULL); + + /* reserve stack size for any_hit before it is inlined */ + data->pipeline->stages[group->any_hit_shader].stack_size = any_hit_stage->scratch_size; +@@ -1701,7 +1701,7 @@ radv_build_recursive_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_trac + 
radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->recursive_shader].nir); + assert(nir_stage); + +- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); ++ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); + + insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.general_index); + ralloc_free(nir_stage); +diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h +index 4ba7e36d16952..f6a0f35c23333 100644 +--- a/src/amd/vulkan/radv_shader.h ++++ b/src/amd/vulkan/radv_shader.h +@@ -516,7 +516,7 @@ radv_get_rt_shader_entrypoint(nir_shader *shader) + return NULL; + } + +-void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset); ++void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset, uint32_t *payload_size); + + struct radv_ray_tracing_stage_info; + +-- +GitLab + + +From 4b54715289586c84b393e264d99e85c327f614f6 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 12:10:31 +0200 +Subject: [PATCH 25/71] radv/rt: Include inlined shader scratch size in + traversal scratch + +When calls without tail-call optimization happen, the traversal shader +must spill, and spilled vars must be placed after shader scratch. +--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 7708dd8809b79..f29f91ce18178 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -1626,6 +1626,7 @@ radv_build_ahit_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g + assert(nir_stage); + + radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); ++ b->shader->scratch_size = MAX2(nir_stage->scratch_size, b->shader->scratch_size); + + insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.any_hit_index); + ralloc_free(nir_stage); +@@ -1661,10 +1662,12 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g + + /* reserve stack size for any_hit before it is inlined */ + data->pipeline->stages[group->any_hit_shader].stack_size = any_hit_stage->scratch_size; ++ b->shader->scratch_size = MAX2(any_hit_stage->scratch_size, b->shader->scratch_size); + + nir_lower_intersection_shader(nir_stage, any_hit_stage); + ralloc_free(any_hit_stage); + } ++ b->shader->scratch_size = MAX2(nir_stage->scratch_size, b->shader->scratch_size); + + insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.intersection_index); + ralloc_free(nir_stage); +-- +GitLab + + +From a86319221ae3924ff785061af08f4ae16cc851e9 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 12:17:15 +0200 +Subject: [PATCH 26/71] radv/rt: Don't store vars->shader_record_ptr directly + in load_sbt_entry + +When calling functions, we don't want the new shader record to stick +beyond the function call, so only store it when not calling functions. 
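+
+As a condensed illustration of the resulting caller pattern (pieced together
+from the hunks below and from the later function-call lowering in patch 28;
+a sketch, not a new code path):
+
+    /* load_sbt_entry() now returns the record address instead of storing it. */
+    nir_def *record = load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_ANY_HIT_IDX);
+
+    /* Inlined/monolithic callers still want it as the current shader record: */
+    nir_store_var(b, inner_vars.shader_record_ptr, record, 0x1);
+
+    /* The function-call lowering instead passes it as the callee's own
+     * SHADER_RECORD_PTR parameter, leaving the caller's record untouched: */
+    args[CHIT_MISS_ARG_SHADER_RECORD_PTR] = record;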
+--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index f29f91ce18178..eeec13b0f539c 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -684,7 +684,7 @@ enum sbt_entry { + SBT_ANY_HIT_IDX = offsetof(struct radv_pipeline_group_handle, any_hit_index), + }; + +-static void ++static nir_def * + load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_def *idx, enum sbt_type binding, + enum sbt_entry offset) + { +@@ -704,7 +704,7 @@ load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_def *idx, en + } + + nir_def *record_addr = nir_iadd_imm(b, addr, RADV_RT_HANDLE_SIZE - offset); +- nir_store_var(b, vars->shader_record_ptr, record_addr, 1); ++ return record_addr; + } + + struct radv_rt_shader_info { +@@ -987,7 +987,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) + nir_store_var(b, vars->instance_addr, intr->src[3].ssa, 0x1); + nir_store_var(b, vars->geometry_id_and_flags, intr->src[4].ssa, 0x1); + nir_store_var(b, vars->hit_kind, intr->src[5].ssa, 0x1); +- load_sbt_entry(b, vars, intr->src[0].ssa, SBT_HIT, SBT_RECURSIVE_PTR); ++ nir_def *record = load_sbt_entry(b, vars, intr->src[0].ssa, SBT_HIT, SBT_RECURSIVE_PTR); + + nir_def *should_return = + nir_test_mask(b, nir_load_var(b, vars->cull_mask_and_flags), SpvRayFlagsSkipClosestHitShaderKHRMask); +@@ -1011,7 +1011,7 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) + nir_store_var(b, vars->geometry_id_and_flags, undef, 0x1); + nir_store_var(b, vars->hit_kind, undef, 0x1); + nir_def *miss_index = nir_load_var(b, vars->miss_index); +- load_sbt_entry(b, vars, miss_index, SBT_MISS, SBT_RECURSIVE_PTR); ++ nir_def *record = load_sbt_entry(b, vars, miss_index, SBT_MISS, SBT_RECURSIVE_PTR); + + if (!(vars->flags & VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) { + /* In case of a NULL miss shader, do nothing and just return. 
*/ +@@ -1741,7 +1741,8 @@ handle_candidate_triangle(nir_builder *b, struct radv_triangle_intersection *int + nir_store_var(b, inner_vars.instance_addr, nir_load_var(b, data->trav_vars->instance_addr), 0x1); + nir_store_var(b, inner_vars.hit_kind, hit_kind, 0x1); + +- load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_ANY_HIT_IDX); ++ nir_def *record = load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_ANY_HIT_IDX); ++ nir_store_var(b, inner_vars.shader_record_ptr, record, 0x1); + + struct radv_rt_case_data case_data = { + .device = data->device, +@@ -1805,7 +1806,8 @@ handle_candidate_aabb(nir_builder *b, struct radv_leaf_intersection *intersectio + nir_store_var(b, inner_vars.instance_addr, nir_load_var(b, data->trav_vars->instance_addr), 0x1); + nir_store_var(b, inner_vars.opaque, intersection->opaque, 1); + +- load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_INTERSECTION_IDX); ++ nir_def *record = load_sbt_entry(b, &inner_vars, sbt_idx, SBT_HIT, SBT_INTERSECTION_IDX); ++ nir_store_var(b, inner_vars.shader_record_ptr, record, 0x1); + + nir_store_var(b, data->vars->ahit_accept, nir_imm_false(b), 0x1); + nir_store_var(b, data->vars->ahit_terminate, nir_imm_false(b), 0x1); +@@ -1979,7 +1981,8 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin + nir_push_if(b, nir_load_var(b, trav_vars.hit)); + { + if (monolithic) { +- load_sbt_entry(b, vars, nir_load_var(b, vars->idx), SBT_HIT, SBT_CLOSEST_HIT_IDX); ++ nir_def *record = load_sbt_entry(b, vars, nir_load_var(b, vars->idx), SBT_HIT, SBT_CLOSEST_HIT_IDX); ++ nir_store_var(b, vars->shader_record_ptr, record, 0x1); + + nir_def *should_return = + nir_test_mask(b, nir_load_var(b, vars->cull_mask_and_flags), SpvRayFlagsSkipClosestHitShaderKHRMask); +@@ -2011,7 +2014,8 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin + nir_push_else(b, NULL); + { + if (monolithic) { +- load_sbt_entry(b, vars, nir_load_var(b, vars->miss_index), SBT_MISS, SBT_GENERAL_IDX); ++ nir_def *record = load_sbt_entry(b, vars, nir_load_var(b, vars->miss_index), SBT_MISS, SBT_GENERAL_IDX); ++ nir_store_var(b, vars->shader_record_ptr, record, 0x1); + + struct radv_rt_case_data case_data = { + .device = device, +-- +GitLab + + +From 07a8a0b29f5e3d5b969ec8164af8fdefd8ffc28a Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Thu, 3 Oct 2024 15:59:01 +0200 +Subject: [PATCH 27/71] radv/rt: Load SBT descriptor from NIR variables + +--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index eeec13b0f539c..2f13831d9d473 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -965,6 +965,10 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) + nir_pop_if(b, NULL); + break; + } ++ case nir_intrinsic_load_sbt_base_amd: { ++ ret = nir_load_var(b, vars->sbt_descriptors); ++ break; ++ } + case nir_intrinsic_load_sbt_offset_amd: { + ret = nir_load_var(b, vars->sbt_offset); + break; +@@ -2077,6 +2081,7 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_ + nir_store_var(&b, vars.cull_mask_and_flags, nir_load_cull_mask_and_flags_amd(&b), 0x1); + nir_store_var(&b, vars.origin, nir_load_ray_world_origin(&b), 0x7); + nir_store_var(&b, vars.direction, nir_load_ray_world_direction(&b), 0x7); ++ nir_store_var(&b, vars.sbt_descriptors, nir_load_sbt_base_amd(&b), 0x1); + nir_store_var(&b, 
vars.stack_ptr, nir_imm_int(&b, 0), 0x1); + + radv_build_traversal(device, pipeline, pCreateInfo, false, &b, &vars, false, info); +-- +GitLab + + +From 565c4764726d6a68e785c019f49914b00b8930ed Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 12:21:29 +0200 +Subject: [PATCH 28/71] radv/rt: Use function calls for shader calls + +Don't call nir_lower_shader_calls anymore, but emit nir_call +instructions for trace_ray and friends. Also, switch from shader args +to parameters for most things, and change lowerings accordingly. +--- + src/amd/common/ac_shader_args.h | 16 - + src/amd/vulkan/nir/radv_nir_rt_shader.c | 487 +++++++++++++----------- + src/amd/vulkan/radv_pipeline_rt.c | 62 +-- + src/amd/vulkan/radv_shader.h | 7 +- + src/amd/vulkan/radv_shader_args.c | 20 +- + 5 files changed, 290 insertions(+), 302 deletions(-) + +diff --git a/src/amd/common/ac_shader_args.h b/src/amd/common/ac_shader_args.h +index 62ac708c3d185..030a271e22ff3 100644 +--- a/src/amd/common/ac_shader_args.h ++++ b/src/amd/common/ac_shader_args.h +@@ -179,29 +179,13 @@ struct ac_shader_args { + + /* RT */ + struct { +- struct ac_arg uniform_shader_addr; + struct ac_arg sbt_descriptors; + struct ac_arg launch_sizes[3]; + struct ac_arg launch_size_addr; + struct ac_arg launch_ids[3]; + struct ac_arg dynamic_callable_stack_base; + struct ac_arg traversal_shader_addr; +- struct ac_arg shader_addr; +- struct ac_arg shader_record; + struct ac_arg payload_offset; +- struct ac_arg ray_origin; +- struct ac_arg ray_tmin; +- struct ac_arg ray_direction; +- struct ac_arg ray_tmax; +- struct ac_arg cull_mask_and_flags; +- struct ac_arg sbt_offset; +- struct ac_arg sbt_stride; +- struct ac_arg miss_index; +- struct ac_arg accel_struct; +- struct ac_arg primitive_id; +- struct ac_arg instance_addr; +- struct ac_arg geometry_id_and_flags; +- struct ac_arg hit_kind; + } rt; + }; + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 2f13831d9d473..7968cb36f5d87 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -688,7 +688,7 @@ static nir_def * + load_sbt_entry(nir_builder *b, const struct rt_variables *vars, nir_def *idx, enum sbt_type binding, + enum sbt_entry offset) + { +- nir_def *desc_base_addr = nir_load_sbt_base_amd(b); ++ nir_def *desc_base_addr = nir_load_var(b, vars->sbt_descriptors); + + nir_def *desc = nir_pack_64_2x32(b, nir_load_smem_amd(b, 2, desc_base_addr, nir_imm_int(b, binding))); + +@@ -742,74 +742,58 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) + + nir_def *ret = NULL; + switch (intr->intrinsic) { +- case nir_intrinsic_rt_execute_callable: { +- uint32_t size = align(nir_intrinsic_stack_size(intr), 16); +- nir_def *ret_ptr = nir_load_resume_shader_address_amd(b, nir_intrinsic_call_idx(intr)); +- ret_ptr = nir_ior_imm(b, ret_ptr, radv_get_rt_priority(b->shader->info.stage)); +- +- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), size), 1); +- nir_store_scratch(b, ret_ptr, nir_load_var(b, vars->stack_ptr), .align_mul = 16); +- +- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), 16), 1); +- load_sbt_entry(b, vars, intr->src[0].ssa, SBT_CALLABLE, SBT_RECURSIVE_PTR); +- +- nir_store_var(b, vars->arg, nir_iadd_imm(b, intr->src[1].ssa, -size - 16), 1); +- +- vars->stack_size = MAX2(vars->stack_size, size + 16); ++ case nir_intrinsic_execute_callable: { ++ nir_def *record = load_sbt_entry(b, 
vars, intr->src[0].ssa, SBT_CALLABLE, SBT_RECURSIVE_PTR); ++ ++ unsigned param_count = RAYGEN_ARG_COUNT + DIV_ROUND_UP(vars->payload_size, 4); ++ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count); ++ args[RAYGEN_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2])); ++ args[RAYGEN_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2])); ++ args[RAYGEN_ARG_DESCRIPTORS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN || b->shader->info.stage == MESA_SHADER_CALLABLE ? RAYGEN_ARG_DESCRIPTORS : CHIT_MISS_ARG_DESCRIPTORS); ++ args[RAYGEN_ARG_PUSH_CONSTANTS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN || b->shader->info.stage == MESA_SHADER_CALLABLE ? RAYGEN_ARG_PUSH_CONSTANTS : CHIT_MISS_ARG_PUSH_CONSTANTS); ++ args[RAYGEN_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors); ++ args[RAYGEN_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr); ++ args[RAYGEN_ARG_SHADER_RECORD_PTR] = record; ++ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) { ++ args[RAYGEN_ARG_COUNT + i] = nir_instr_def(&nir_build_deref_var(b, vars->out_payload_storage[i])->instr); ++ } ++ nir_build_indirect_call(b, vars->callable_func, nir_load_var(b, vars->shader_addr), param_count, args); + break; + } +- case nir_intrinsic_rt_trace_ray: { +- uint32_t size = align(nir_intrinsic_stack_size(intr), 16); +- nir_def *ret_ptr = nir_load_resume_shader_address_amd(b, nir_intrinsic_call_idx(intr)); +- ret_ptr = nir_ior_imm(b, ret_ptr, radv_get_rt_priority(b->shader->info.stage)); +- +- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), size), 1); +- nir_store_scratch(b, ret_ptr, nir_load_var(b, vars->stack_ptr), .align_mul = 16); +- +- nir_store_var(b, vars->stack_ptr, nir_iadd_imm_nuw(b, nir_load_var(b, vars->stack_ptr), 16), 1); +- +- nir_store_var(b, vars->shader_addr, nir_load_var(b, vars->traversal_addr), 1); +- nir_store_var(b, vars->arg, nir_iadd_imm(b, intr->src[10].ssa, -size - 16), 1); +- +- vars->stack_size = MAX2(vars->stack_size, size + 16); +- ++ case nir_intrinsic_trace_ray: { ++ nir_def *undef = nir_undef(b, 1, 32); + /* Per the SPIR-V extension spec we have to ignore some bits for some arguments. 
*/ +- nir_store_var(b, vars->accel_struct, intr->src[0].ssa, 0x1); +- nir_store_var(b, vars->cull_mask_and_flags, nir_ior(b, nir_ishl_imm(b, intr->src[2].ssa, 24), intr->src[1].ssa), +- 0x1); +- nir_store_var(b, vars->sbt_offset, nir_iand_imm(b, intr->src[3].ssa, 0xf), 0x1); +- nir_store_var(b, vars->sbt_stride, nir_iand_imm(b, intr->src[4].ssa, 0xf), 0x1); +- nir_store_var(b, vars->miss_index, nir_iand_imm(b, intr->src[5].ssa, 0xffff), 0x1); +- nir_store_var(b, vars->origin, intr->src[6].ssa, 0x7); +- nir_store_var(b, vars->tmin, intr->src[7].ssa, 0x1); +- nir_store_var(b, vars->direction, intr->src[8].ssa, 0x7); +- nir_store_var(b, vars->tmax, intr->src[9].ssa, 0x1); +- break; +- } +- case nir_intrinsic_rt_resume: { +- uint32_t size = align(nir_intrinsic_stack_size(intr), 16); +- +- nir_store_var(b, vars->stack_ptr, nir_iadd_imm(b, nir_load_var(b, vars->stack_ptr), -size), 1); +- break; +- } +- case nir_intrinsic_rt_return_amd: { +- if (b->shader->info.stage == MESA_SHADER_RAYGEN) { +- nir_terminate(b); +- break; ++ nir_def *cull_mask_and_flags = nir_ior(b, nir_ishl_imm(b, intr->src[2].ssa, 24), intr->src[1].ssa); ++ ++ unsigned param_count = TRAVERSAL_ARG_PAYLOAD_BASE + DIV_ROUND_UP(vars->payload_size, 4); ++ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count); ++ args[TRAVERSAL_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2])); ++ args[TRAVERSAL_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2])); ++ args[TRAVERSAL_ARG_DESCRIPTORS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN ? RAYGEN_ARG_DESCRIPTORS : CHIT_MISS_ARG_DESCRIPTORS); ++ args[TRAVERSAL_ARG_PUSH_CONSTANTS] = nir_load_param(b, b->shader->info.stage == MESA_SHADER_RAYGEN ? 
RAYGEN_ARG_PUSH_CONSTANTS : CHIT_MISS_ARG_PUSH_CONSTANTS); ++ args[TRAVERSAL_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors); ++ args[TRAVERSAL_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr); ++ args[TRAVERSAL_ARG_SHADER_RECORD_PTR] = nir_load_var(b, vars->shader_record_ptr); ++ args[TRAVERSAL_ARG_ACCEL_STRUCT] = intr->src[0].ssa; ++ args[TRAVERSAL_ARG_CULL_MASK_AND_FLAGS] = cull_mask_and_flags; ++ args[TRAVERSAL_ARG_SBT_OFFSET] = nir_iand_imm(b, intr->src[3].ssa, 0xf); ++ args[TRAVERSAL_ARG_SBT_STRIDE] = nir_iand_imm(b, intr->src[4].ssa, 0xf); ++ args[TRAVERSAL_ARG_MISS_INDEX] = nir_iand_imm(b, intr->src[5].ssa, 0xffff); ++ args[TRAVERSAL_ARG_RAY_ORIGIN] = intr->src[6].ssa; ++ args[TRAVERSAL_ARG_RAY_TMIN] = intr->src[7].ssa; ++ args[TRAVERSAL_ARG_RAY_DIRECTION] = intr->src[8].ssa; ++ args[TRAVERSAL_ARG_RAY_TMAX] = intr->src[9].ssa; ++ args[TRAVERSAL_ARG_PRIMITIVE_ID] = undef; ++ args[TRAVERSAL_ARG_INSTANCE_ADDR] = nir_undef(b, 1, 64); ++ args[TRAVERSAL_ARG_GEOMETRY_ID_AND_FLAGS] = undef; ++ args[TRAVERSAL_ARG_HIT_KIND] = undef; ++ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) { ++ args[TRAVERSAL_ARG_PAYLOAD_BASE + i] = ++ nir_instr_def(&nir_build_deref_var(b, vars->out_payload_storage[i])->instr); + } +- insert_rt_return(b, vars); ++ nir_build_indirect_call(b, vars->trace_ray_func, nir_load_var(b, vars->traversal_addr), param_count, args); + break; + } +- case nir_intrinsic_load_scratch: { +- if (data->late_lowering) +- nir_src_rewrite(&intr->src[0], nir_iadd_nuw(b, nir_load_var(b, vars->stack_ptr), intr->src[0].ssa)); +- return true; +- } +- case nir_intrinsic_store_scratch: { +- if (data->late_lowering) +- nir_src_rewrite(&intr->src[1], nir_iadd_nuw(b, nir_load_var(b, vars->stack_ptr), intr->src[1].ssa)); +- return true; +- } + case nir_intrinsic_load_shader_record_ptr: { + ret = nir_load_var(b, vars->shader_record_ptr); + break; +@@ -986,11 +970,6 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) + break; + } + case nir_intrinsic_execute_closest_hit_amd: { +- nir_store_var(b, vars->tmax, intr->src[1].ssa, 0x1); +- nir_store_var(b, vars->primitive_id, intr->src[2].ssa, 0x1); +- nir_store_var(b, vars->instance_addr, intr->src[3].ssa, 0x1); +- nir_store_var(b, vars->geometry_id_and_flags, intr->src[4].ssa, 0x1); +- nir_store_var(b, vars->hit_kind, intr->src[5].ssa, 0x1); + nir_def *record = load_sbt_entry(b, vars, intr->src[0].ssa, SBT_HIT, SBT_RECURSIVE_PTR); + + nir_def *should_return = +@@ -1002,28 +981,82 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) + + /* should_return is set if we had a hit but we won't be calling the closest hit + * shader and hence need to return immediately to the calling shader. 
*/ +- nir_push_if(b, should_return); +- insert_rt_return(b, vars); ++ nir_push_if(b, nir_inot(b, should_return)); ++ unsigned param_count = CHIT_MISS_ARG_PAYLOAD_BASE + DIV_ROUND_UP(vars->payload_size, 4); ++ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count); ++ args[CHIT_MISS_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2])); ++ args[CHIT_MISS_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2])); ++ args[CHIT_MISS_ARG_DESCRIPTORS] = nir_load_param(b, TRAVERSAL_ARG_DESCRIPTORS); ++ args[CHIT_MISS_ARG_PUSH_CONSTANTS] = nir_load_param(b, TRAVERSAL_ARG_PUSH_CONSTANTS); ++ args[CHIT_MISS_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors); ++ args[CHIT_MISS_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr); ++ args[CHIT_MISS_ARG_SHADER_RECORD_PTR] = record; ++ args[CHIT_MISS_ARG_ACCEL_STRUCT] = nir_load_var(b, vars->accel_struct); ++ args[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS] = nir_load_var(b, vars->cull_mask_and_flags); ++ args[CHIT_MISS_ARG_SBT_OFFSET] = nir_load_var(b, vars->sbt_offset); ++ args[CHIT_MISS_ARG_SBT_STRIDE] = nir_load_var(b, vars->sbt_stride); ++ args[CHIT_MISS_ARG_MISS_INDEX] = nir_load_var(b, vars->miss_index); ++ args[CHIT_MISS_ARG_RAY_ORIGIN] = nir_load_var(b, vars->origin); ++ args[CHIT_MISS_ARG_RAY_TMIN] = nir_load_var(b, vars->tmin); ++ args[CHIT_MISS_ARG_RAY_DIRECTION] = nir_load_var(b, vars->direction); ++ args[CHIT_MISS_ARG_RAY_TMAX] = intr->src[1].ssa; ++ args[CHIT_MISS_ARG_PRIMITIVE_ID] = intr->src[2].ssa; ++ args[CHIT_MISS_ARG_INSTANCE_ADDR] = intr->src[3].ssa; ++ args[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS] = intr->src[4].ssa; ++ args[CHIT_MISS_ARG_HIT_KIND] = intr->src[5].ssa; ++ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) { ++ args[CHIT_MISS_ARG_PAYLOAD_BASE + i] = ++ nir_instr_def(&nir_build_deref_cast(b, nir_load_param(b, TRAVERSAL_ARG_PAYLOAD_BASE + i), ++ nir_var_shader_call_data, glsl_uint_type(), 4) ++ ->instr); ++ } ++ nir_build_indirect_call(b, vars->chit_miss_func, nir_load_var(b, vars->shader_addr), param_count, args); + nir_pop_if(b, NULL); + break; + } + case nir_intrinsic_execute_miss_amd: { +- nir_store_var(b, vars->tmax, intr->src[0].ssa, 0x1); + nir_def *undef = nir_undef(b, 1, 32); +- nir_store_var(b, vars->primitive_id, undef, 0x1); +- nir_store_var(b, vars->instance_addr, nir_undef(b, 1, 64), 0x1); +- nir_store_var(b, vars->geometry_id_and_flags, undef, 0x1); +- nir_store_var(b, vars->hit_kind, undef, 0x1); + nir_def *miss_index = nir_load_var(b, vars->miss_index); + nir_def *record = load_sbt_entry(b, vars, miss_index, SBT_MISS, SBT_RECURSIVE_PTR); + + if (!(vars->flags & VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) { + /* In case of a NULL miss shader, do nothing and just return. 
*/ +- nir_push_if(b, nir_ieq_imm(b, nir_load_var(b, vars->shader_addr), 0)); +- insert_rt_return(b, vars); +- nir_pop_if(b, NULL); ++ nir_push_if(b, nir_ine_imm(b, nir_load_var(b, vars->shader_addr), 0)); + } + ++ unsigned param_count = CHIT_MISS_ARG_PAYLOAD_BASE + DIV_ROUND_UP(vars->payload_size, 4); ++ nir_def **args = rzalloc_array_size(b->shader, sizeof(nir_def *), param_count); ++ args[CHIT_MISS_ARG_LAUNCH_ID] = nir_vec3(b, nir_load_var(b, vars->launch_ids[0]), nir_load_var(b, vars->launch_ids[1]), nir_load_var(b, vars->launch_ids[2])); ++ args[CHIT_MISS_ARG_LAUNCH_SIZE] = nir_vec3(b, nir_load_var(b, vars->launch_sizes[0]), nir_load_var(b, vars->launch_sizes[1]), nir_load_var(b, vars->launch_sizes[2])); ++ args[CHIT_MISS_ARG_DESCRIPTORS] = nir_load_param(b, TRAVERSAL_ARG_DESCRIPTORS); ++ args[CHIT_MISS_ARG_PUSH_CONSTANTS] = nir_load_param(b, TRAVERSAL_ARG_PUSH_CONSTANTS); ++ args[CHIT_MISS_ARG_SBT_DESCRIPTORS] = nir_load_var(b, vars->sbt_descriptors); ++ args[CHIT_MISS_ARG_TRAVERSAL_ADDR] = nir_load_var(b, vars->traversal_addr); ++ args[CHIT_MISS_ARG_SHADER_RECORD_PTR] = record; ++ args[CHIT_MISS_ARG_ACCEL_STRUCT] = nir_load_var(b, vars->accel_struct); ++ args[CHIT_MISS_ARG_CULL_MASK_AND_FLAGS] = nir_load_var(b, vars->cull_mask_and_flags); ++ args[CHIT_MISS_ARG_SBT_OFFSET] = nir_load_var(b, vars->sbt_offset); ++ args[CHIT_MISS_ARG_SBT_STRIDE] = nir_load_var(b, vars->sbt_stride); ++ args[CHIT_MISS_ARG_MISS_INDEX] = nir_load_var(b, vars->miss_index); ++ args[CHIT_MISS_ARG_RAY_ORIGIN] = nir_load_var(b, vars->origin); ++ args[CHIT_MISS_ARG_RAY_TMIN] = nir_load_var(b, vars->tmin); ++ args[CHIT_MISS_ARG_RAY_DIRECTION] = nir_load_var(b, vars->direction); ++ args[CHIT_MISS_ARG_RAY_TMAX] = intr->src[0].ssa; ++ args[CHIT_MISS_ARG_PRIMITIVE_ID] = undef; ++ args[CHIT_MISS_ARG_INSTANCE_ADDR] = nir_undef(b, 1, 64); ++ args[CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS] = undef; ++ args[CHIT_MISS_ARG_HIT_KIND] = undef; ++ for (unsigned i = 0; i < DIV_ROUND_UP(vars->payload_size, 4); ++i) { ++ args[CHIT_MISS_ARG_PAYLOAD_BASE + i] = ++ nir_instr_def(&nir_build_deref_cast(b, nir_load_param(b, TRAVERSAL_ARG_PAYLOAD_BASE + i), ++ nir_var_shader_call_data, glsl_uint_type(), 4) ++ ->instr); ++ } ++ nir_build_indirect_call(b, vars->chit_miss_func, nir_load_var(b, vars->shader_addr), param_count, args); ++ ++ if (!(vars->flags & VK_PIPELINE_CREATE_2_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) ++ nir_pop_if(b, NULL); ++ + break; + } + case nir_intrinsic_load_ray_triangle_vertex_positions: { +@@ -1032,6 +1065,14 @@ radv_lower_rt_instruction(nir_builder *b, nir_instr *instr, void *_data) + ret = radv_load_vertex_position(vars->device, b, instance_node_addr, primitive_id, nir_intrinsic_column(intr)); + break; + } ++ case nir_intrinsic_rt_trace_ray: ++ unreachable("nir_intrinsic_rt_trace_ray"); ++ case nir_intrinsic_rt_execute_callable: ++ unreachable("nir_intrinsic_rt_execute_callable"); ++ case nir_intrinsic_rt_resume: ++ unreachable("nir_intrinsic_rt_resume"); ++ case nir_intrinsic_rt_return_amd: ++ unreachable("nir_intrinsic_rt_return_amd"); + default: + return false; + } +@@ -1195,7 +1236,7 @@ insert_rt_case(nir_builder *b, nir_shader *shader, struct rt_variables *vars, ni + } + + void +-radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset, uint32_t *payload_size) ++radv_nir_lower_rt_io(nir_shader *nir, bool monolithic, uint32_t payload_offset) + { + if (!monolithic) { + NIR_PASS(_, nir, lower_rt_derefs); +@@ -1629,7 +1670,7 @@ radv_build_ahit_case(nir_builder *b, nir_def *sbt_idx, struct 
radv_ray_tracing_g + radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir); + assert(nir_stage); + +- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); ++ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); + b->shader->scratch_size = MAX2(nir_stage->scratch_size, b->shader->scratch_size); + + insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.any_hit_index); +@@ -1654,7 +1695,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g + radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->intersection_shader].nir); + assert(nir_stage); + +- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); ++ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); + + nir_shader *any_hit_stage = NULL; + if (group->any_hit_shader != VK_SHADER_UNUSED_KHR) { +@@ -1662,7 +1703,7 @@ radv_build_isec_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_tracing_g + radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->any_hit_shader].nir); + assert(any_hit_stage); + +- radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset, NULL); ++ radv_nir_lower_rt_io(any_hit_stage, data->vars->monolithic, data->vars->payload_offset); + + /* reserve stack size for any_hit before it is inlined */ + data->pipeline->stages[group->any_hit_shader].stack_size = any_hit_stage->scratch_size; +@@ -1708,7 +1749,7 @@ radv_build_recursive_case(nir_builder *b, nir_def *sbt_idx, struct radv_ray_trac + radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[group->recursive_shader].nir); + assert(nir_stage); + +- radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset, NULL); ++ radv_nir_lower_rt_io(nir_stage, data->vars->monolithic, data->vars->payload_offset); + + insert_rt_case(b, nir_stage, data->vars, sbt_idx, group->handle.general_index); + ralloc_free(nir_stage); +@@ -2175,10 +2216,23 @@ radv_count_hit_attrib_slots(nir_builder *b, nir_intrinsic_instr *instr, void *da + return false; + } + ++static bool ++radv_count_ray_payload_size(nir_builder *b, nir_intrinsic_instr *instr, void *data) ++{ ++ uint32_t *count = data; ++ if (instr->intrinsic == nir_intrinsic_load_incoming_ray_payload_amd || ++ instr->intrinsic == nir_intrinsic_load_outgoing_ray_payload_amd || ++ instr->intrinsic == nir_intrinsic_store_incoming_ray_payload_amd || ++ instr->intrinsic == nir_intrinsic_store_outgoing_ray_payload_amd) ++ *count = MAX2(*count, (nir_intrinsic_base(instr) + 1) * 4); ++ ++ return false; ++} ++ + static void + lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, + struct radv_ray_tracing_pipeline *pipeline, const struct radv_shader_info *info, +- const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, uint32_t payload_size, ++ const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, uint32_t *payload_size, + struct rt_variables *vars) + { + nir_function_impl *impl = radv_get_rt_shader_entrypoint(shader); +@@ -2195,6 +2249,7 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, + + uint32_t hit_attrib_count = 0; + nir_shader_intrinsics_pass(shader, radv_count_hit_attrib_slots, nir_metadata_all, &hit_attrib_count); ++ nir_shader_intrinsics_pass(shader, radv_count_ray_payload_size, nir_metadata_all, payload_size); + + /* Register storage for hit 
attributes */ + STACK_ARRAY(nir_variable *, hit_attribs, hit_attrib_count); +@@ -2203,10 +2258,10 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, + + nir_builder b = nir_builder_create(impl); + b.cursor = nir_before_impl(impl); +- nir_variable **payload_vars = rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4)); ++ nir_variable **payload_vars = rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(*payload_size, 4)); + nir_deref_instr **payload_storage = +- rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(payload_size, 4)); +- for (unsigned i = 0; i < DIV_ROUND_UP(payload_size, 4); ++i) { ++ rzalloc_array_size(shader, sizeof(nir_variable *), DIV_ROUND_UP(*payload_size, 4)); ++ for (unsigned i = 0; i < DIV_ROUND_UP(*payload_size, 4); ++i) { + payload_vars[i] = nir_variable_create(shader, nir_var_shader_temp, glsl_uint_type(), "_payload"); + payload_storage[i] = nir_build_deref_var(&b, payload_vars[i]); + } +@@ -2215,26 +2270,28 @@ lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, + } + + static void +-radv_store_arg(nir_builder *b, const struct radv_shader_args *args, const struct radv_ray_tracing_stage_info *info, +- struct ac_arg arg, nir_def *value) ++store_param_var(nir_builder *b, nir_variable *var, unsigned param_index, unsigned num_components, unsigned bit_size) + { +- /* Do not pass unused data to the next stage. */ +- if (!info || !BITSET_TEST(info->unused_args, arg.arg_index)) +- ac_nir_store_arg(b, &args->ac, arg, value); ++ if (param_index != -1u) ++ nir_store_var(b, var, nir_load_param(b, param_index), (1 << num_components) - 1); ++ else ++ nir_store_var(b, var, nir_undef(b, num_components, bit_size), (1 << num_components) - 1); + } + + void + radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, +- const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *stack_size, +- bool resume_shader, uint32_t payload_size, struct radv_device *device, +- struct radv_ray_tracing_pipeline *pipeline, bool monolithic, +- const struct radv_ray_tracing_stage_info *traversal_info) ++ const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *payload_size, ++ uint32_t *stack_size, struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline, ++ bool monolithic) + { + nir_function_impl *impl = nir_shader_get_entrypoint(shader); ++ nir_function *entrypoint_function = impl->function; ++ ++ radv_nir_init_function_params(entrypoint_function, shader->info.stage, *payload_size); + + const VkPipelineCreateFlagBits2KHR create_flags = vk_rt_pipeline_create_flags(pCreateInfo); + +- struct rt_variables vars = create_rt_variables(shader, device, create_flags, payload_size, monolithic); ++ struct rt_variables vars = create_rt_variables(shader, device, create_flags, *payload_size, monolithic); + + if (monolithic) + lower_rt_instructions_monolithic(shader, device, pipeline, info, pCreateInfo, payload_size, &vars); +@@ -2247,152 +2304,158 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH + vars.stack_size = MAX2(vars.stack_size, shader->scratch_size); + *stack_size = MAX2(*stack_size, vars.stack_size); + } +- shader->scratch_size = 0; + + NIR_PASS(_, shader, nir_lower_returns); + +- nir_cf_list list; +- nir_cf_extract(&list, nir_before_impl(impl), nir_after_impl(impl)); ++ unsigned shader_record_ptr_arg = -1u; ++ unsigned launch_id_arg = -1u; ++ unsigned 
launch_size_arg = -1u; ++ unsigned sbt_descriptors_arg = -1u; ++ unsigned traversal_addr_arg = -1u; ++ unsigned accel_struct_arg = -1u; ++ unsigned cull_mask_and_flags_arg = -1u; ++ unsigned sbt_offset_arg = -1u; ++ unsigned sbt_stride_arg = -1u; ++ unsigned miss_index_arg = -1u; ++ unsigned ray_origin_arg = -1u; ++ unsigned ray_tmin_arg = -1u; ++ unsigned ray_direction_arg = -1u; ++ unsigned ray_tmax_arg = -1u; ++ unsigned primitive_id_arg = -1u; ++ unsigned instance_addr_arg = -1u; ++ unsigned geometry_id_and_flags_arg = -1u; ++ unsigned hit_kind_arg = -1u; ++ unsigned in_payload_base_arg = -1u; ++ ++ switch (shader->info.stage) { ++ case MESA_SHADER_CALLABLE: ++ in_payload_base_arg = RAYGEN_ARG_COUNT; ++ shader_record_ptr_arg = RAYGEN_ARG_SHADER_RECORD_PTR; ++ launch_id_arg = RAYGEN_ARG_LAUNCH_ID; ++ launch_size_arg = RAYGEN_ARG_LAUNCH_SIZE; ++ sbt_descriptors_arg = RAYGEN_ARG_SBT_DESCRIPTORS; ++ traversal_addr_arg = RAYGEN_ARG_TRAVERSAL_ADDR; ++ break; ++ case MESA_SHADER_RAYGEN: ++ shader_record_ptr_arg = RAYGEN_ARG_SHADER_RECORD_PTR; ++ launch_id_arg = RAYGEN_ARG_LAUNCH_ID; ++ launch_size_arg = RAYGEN_ARG_LAUNCH_SIZE; ++ sbt_descriptors_arg = RAYGEN_ARG_SBT_DESCRIPTORS; ++ traversal_addr_arg = RAYGEN_ARG_TRAVERSAL_ADDR; ++ break; ++ case MESA_SHADER_INTERSECTION: ++ launch_id_arg = TRAVERSAL_ARG_LAUNCH_ID; ++ launch_size_arg = TRAVERSAL_ARG_LAUNCH_SIZE; ++ sbt_descriptors_arg = TRAVERSAL_ARG_SBT_DESCRIPTORS; ++ traversal_addr_arg = TRAVERSAL_ARG_TRAVERSAL_ADDR; ++ shader_record_ptr_arg = TRAVERSAL_ARG_SHADER_RECORD_PTR; ++ accel_struct_arg = TRAVERSAL_ARG_ACCEL_STRUCT; ++ cull_mask_and_flags_arg = TRAVERSAL_ARG_CULL_MASK_AND_FLAGS; ++ sbt_offset_arg = TRAVERSAL_ARG_SBT_OFFSET; ++ sbt_stride_arg = TRAVERSAL_ARG_SBT_STRIDE; ++ miss_index_arg = TRAVERSAL_ARG_MISS_INDEX; ++ ray_origin_arg = TRAVERSAL_ARG_RAY_ORIGIN; ++ ray_tmin_arg = TRAVERSAL_ARG_RAY_TMIN; ++ ray_direction_arg = TRAVERSAL_ARG_RAY_DIRECTION; ++ ray_tmax_arg = TRAVERSAL_ARG_RAY_TMAX; ++ in_payload_base_arg = TRAVERSAL_ARG_PAYLOAD_BASE; ++ break; ++ case MESA_SHADER_CLOSEST_HIT: ++ case MESA_SHADER_MISS: ++ launch_id_arg = CHIT_MISS_ARG_LAUNCH_ID; ++ launch_size_arg = CHIT_MISS_ARG_LAUNCH_SIZE; ++ sbt_descriptors_arg = CHIT_MISS_ARG_SBT_DESCRIPTORS; ++ traversal_addr_arg = CHIT_MISS_ARG_TRAVERSAL_ADDR; ++ shader_record_ptr_arg = CHIT_MISS_ARG_SHADER_RECORD_PTR; ++ accel_struct_arg = CHIT_MISS_ARG_ACCEL_STRUCT; ++ cull_mask_and_flags_arg = CHIT_MISS_ARG_CULL_MASK_AND_FLAGS; ++ sbt_offset_arg = CHIT_MISS_ARG_SBT_OFFSET; ++ sbt_stride_arg = CHIT_MISS_ARG_SBT_STRIDE; ++ miss_index_arg = CHIT_MISS_ARG_MISS_INDEX; ++ ray_origin_arg = CHIT_MISS_ARG_RAY_ORIGIN; ++ ray_tmin_arg = CHIT_MISS_ARG_RAY_TMIN; ++ ray_direction_arg = CHIT_MISS_ARG_RAY_DIRECTION; ++ ray_tmax_arg = CHIT_MISS_ARG_RAY_TMAX; ++ primitive_id_arg = CHIT_MISS_ARG_PRIMITIVE_ID; ++ instance_addr_arg = CHIT_MISS_ARG_INSTANCE_ADDR; ++ geometry_id_and_flags_arg = CHIT_MISS_ARG_GEOMETRY_ID_AND_FLAGS; ++ hit_kind_arg = CHIT_MISS_ARG_HIT_KIND; ++ in_payload_base_arg = CHIT_MISS_ARG_PAYLOAD_BASE; ++ break; ++ default: ++ break; ++ } + + /* initialize variables */ + nir_builder b = nir_builder_at(nir_before_impl(impl)); + +- nir_def *descriptor_sets = ac_nir_load_arg(&b, &args->ac, args->descriptor_sets[0]); +- nir_def *push_constants = ac_nir_load_arg(&b, &args->ac, args->ac.push_constants); +- nir_def *sbt_descriptors = ac_nir_load_arg(&b, &args->ac, args->ac.rt.sbt_descriptors); +- + nir_def *launch_sizes[3]; ++ nir_def *launch_size_vec = nir_load_param(&b, 
launch_size_arg); + for (uint32_t i = 0; i < ARRAY_SIZE(launch_sizes); i++) { +- launch_sizes[i] = ac_nir_load_arg(&b, &args->ac, args->ac.rt.launch_sizes[i]); ++ launch_sizes[i] = nir_channel(&b, launch_size_vec, i); + nir_store_var(&b, vars.launch_sizes[i], launch_sizes[i], 1); + } + +- nir_def *scratch_offset = NULL; +- if (args->ac.scratch_offset.used) +- scratch_offset = ac_nir_load_arg(&b, &args->ac, args->ac.scratch_offset); +- nir_def *ring_offsets = NULL; +- if (args->ac.ring_offsets.used) +- ring_offsets = ac_nir_load_arg(&b, &args->ac, args->ac.ring_offsets); +- + nir_def *launch_ids[3]; ++ nir_def *launch_id_vec = nir_load_param(&b, launch_id_arg); + for (uint32_t i = 0; i < ARRAY_SIZE(launch_ids); i++) { +- launch_ids[i] = ac_nir_load_arg(&b, &args->ac, args->ac.rt.launch_ids[i]); ++ launch_ids[i] = nir_channel(&b, launch_id_vec, i); + nir_store_var(&b, vars.launch_ids[i], launch_ids[i], 1); + } + +- nir_def *traversal_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.traversal_shader_addr); +- nir_store_var(&b, vars.traversal_addr, nir_pack_64_2x32(&b, traversal_addr), 1); ++ nir_store_var(&b, vars.traversal_addr, nir_load_param(&b, traversal_addr_arg), 1); + +- nir_def *shader_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.shader_addr); +- shader_addr = nir_pack_64_2x32(&b, shader_addr); +- nir_store_var(&b, vars.shader_addr, shader_addr, 1); ++ nir_store_var(&b, vars.sbt_descriptors, nir_load_param(&b, sbt_descriptors_arg), 1); + +- nir_store_var(&b, vars.stack_ptr, ac_nir_load_arg(&b, &args->ac, args->ac.rt.dynamic_callable_stack_base), 1); +- nir_def *record_ptr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.shader_record); +- nir_store_var(&b, vars.shader_record_ptr, nir_pack_64_2x32(&b, record_ptr), 1); +- nir_store_var(&b, vars.arg, ac_nir_load_arg(&b, &args->ac, args->ac.rt.payload_offset), 1); +- +- nir_def *accel_struct = ac_nir_load_arg(&b, &args->ac, args->ac.rt.accel_struct); +- nir_store_var(&b, vars.accel_struct, nir_pack_64_2x32(&b, accel_struct), 1); +- nir_store_var(&b, vars.cull_mask_and_flags, ac_nir_load_arg(&b, &args->ac, args->ac.rt.cull_mask_and_flags), 1); +- nir_store_var(&b, vars.sbt_offset, ac_nir_load_arg(&b, &args->ac, args->ac.rt.sbt_offset), 1); +- nir_store_var(&b, vars.sbt_stride, ac_nir_load_arg(&b, &args->ac, args->ac.rt.sbt_stride), 1); +- nir_store_var(&b, vars.origin, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_origin), 0x7); +- nir_store_var(&b, vars.tmin, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_tmin), 1); +- nir_store_var(&b, vars.direction, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_direction), 0x7); +- nir_store_var(&b, vars.tmax, ac_nir_load_arg(&b, &args->ac, args->ac.rt.ray_tmax), 1); +- +- if (traversal_info && traversal_info->miss_index.state == RADV_RT_CONST_ARG_STATE_VALID) +- nir_store_var(&b, vars.miss_index, nir_imm_int(&b, traversal_info->miss_index.value), 0x1); +- else +- nir_store_var(&b, vars.miss_index, ac_nir_load_arg(&b, &args->ac, args->ac.rt.miss_index), 0x1); +- +- nir_store_var(&b, vars.primitive_id, ac_nir_load_arg(&b, &args->ac, args->ac.rt.primitive_id), 1); +- nir_def *instance_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.instance_addr); +- nir_store_var(&b, vars.instance_addr, nir_pack_64_2x32(&b, instance_addr), 1); +- nir_store_var(&b, vars.geometry_id_and_flags, ac_nir_load_arg(&b, &args->ac, args->ac.rt.geometry_id_and_flags), 1); +- nir_store_var(&b, vars.hit_kind, ac_nir_load_arg(&b, &args->ac, args->ac.rt.hit_kind), 1); +- +- /* guard the shader, so that only the correct 
invocations execute it */ +- nir_if *shader_guard = NULL; +- if (shader->info.stage != MESA_SHADER_RAYGEN || resume_shader) { +- nir_def *uniform_shader_addr = ac_nir_load_arg(&b, &args->ac, args->ac.rt.uniform_shader_addr); +- uniform_shader_addr = nir_pack_64_2x32(&b, uniform_shader_addr); +- uniform_shader_addr = nir_ior_imm(&b, uniform_shader_addr, radv_get_rt_priority(shader->info.stage)); +- +- shader_guard = nir_push_if(&b, nir_ieq(&b, uniform_shader_addr, shader_addr)); +- shader_guard->control = nir_selection_control_divergent_always_taken; +- } +- +- nir_cf_reinsert(&list, b.cursor); +- +- if (shader_guard) +- nir_pop_if(&b, shader_guard); ++ if (monolithic) { ++ nir_store_var(&b, vars.stack_ptr, nir_imm_int(&b, 0), 1); ++ ++ nir_store_var(&b, vars.arg, nir_imm_int(&b, 0), 1); ++ } ++ ++ store_param_var(&b, vars.shader_record_ptr, shader_record_ptr_arg, 1, 64); ++ store_param_var(&b, vars.accel_struct, accel_struct_arg, 1, 64); ++ store_param_var(&b, vars.cull_mask_and_flags, cull_mask_and_flags_arg, 1, 32); ++ store_param_var(&b, vars.sbt_offset, sbt_offset_arg, 1, 32); ++ store_param_var(&b, vars.sbt_stride, sbt_stride_arg, 1, 32); ++ store_param_var(&b, vars.miss_index, miss_index_arg, 1, 32); ++ store_param_var(&b, vars.origin, ray_origin_arg, 3, 32); ++ store_param_var(&b, vars.tmin, ray_tmin_arg, 1, 32); ++ store_param_var(&b, vars.direction, ray_direction_arg, 3, 32); ++ store_param_var(&b, vars.tmax, ray_tmax_arg, 1, 32); ++ store_param_var(&b, vars.primitive_id, primitive_id_arg, 1, 32); ++ store_param_var(&b, vars.instance_addr, instance_addr_arg, 1, 64); ++ store_param_var(&b, vars.geometry_id_and_flags, geometry_id_and_flags_arg, 1, 32); ++ store_param_var(&b, vars.hit_kind, hit_kind_arg, 1, 32); + + b.cursor = nir_after_impl(impl); + + if (monolithic) { + nir_terminate(&b); +- } else { +- /* select next shader */ +- shader_addr = nir_load_var(&b, vars.shader_addr); +- nir_def *next = select_next_shader(&b, shader_addr, info->wave_size); +- ac_nir_store_arg(&b, &args->ac, args->ac.rt.uniform_shader_addr, next); +- +- ac_nir_store_arg(&b, &args->ac, args->descriptor_sets[0], descriptor_sets); +- ac_nir_store_arg(&b, &args->ac, args->ac.push_constants, push_constants); +- ac_nir_store_arg(&b, &args->ac, args->ac.rt.sbt_descriptors, sbt_descriptors); +- ac_nir_store_arg(&b, &args->ac, args->ac.rt.traversal_shader_addr, traversal_addr); +- +- for (uint32_t i = 0; i < ARRAY_SIZE(launch_sizes); i++) { +- if (rt_info.uses_launch_size) +- ac_nir_store_arg(&b, &args->ac, args->ac.rt.launch_sizes[i], launch_sizes[i]); +- else +- radv_store_arg(&b, args, traversal_info, args->ac.rt.launch_sizes[i], launch_sizes[i]); +- } +- +- if (scratch_offset) +- ac_nir_store_arg(&b, &args->ac, args->ac.scratch_offset, scratch_offset); +- if (ring_offsets) +- ac_nir_store_arg(&b, &args->ac, args->ac.ring_offsets, ring_offsets); +- +- for (uint32_t i = 0; i < ARRAY_SIZE(launch_ids); i++) { +- if (rt_info.uses_launch_id) +- ac_nir_store_arg(&b, &args->ac, args->ac.rt.launch_ids[i], launch_ids[i]); +- else +- radv_store_arg(&b, args, traversal_info, args->ac.rt.launch_ids[i], launch_ids[i]); +- } +- +- /* store back all variables to registers */ +- ac_nir_store_arg(&b, &args->ac, args->ac.rt.dynamic_callable_stack_base, nir_load_var(&b, vars.stack_ptr)); +- ac_nir_store_arg(&b, &args->ac, args->ac.rt.shader_addr, shader_addr); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.shader_record, nir_load_var(&b, vars.shader_record_ptr)); +- radv_store_arg(&b, args, traversal_info, 
args->ac.rt.payload_offset, nir_load_var(&b, vars.arg)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.accel_struct, nir_load_var(&b, vars.accel_struct)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.cull_mask_and_flags, +- nir_load_var(&b, vars.cull_mask_and_flags)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.sbt_offset, nir_load_var(&b, vars.sbt_offset)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.sbt_stride, nir_load_var(&b, vars.sbt_stride)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.miss_index, nir_load_var(&b, vars.miss_index)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_origin, nir_load_var(&b, vars.origin)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_tmin, nir_load_var(&b, vars.tmin)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_direction, nir_load_var(&b, vars.direction)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.ray_tmax, nir_load_var(&b, vars.tmax)); +- +- radv_store_arg(&b, args, traversal_info, args->ac.rt.primitive_id, nir_load_var(&b, vars.primitive_id)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.instance_addr, nir_load_var(&b, vars.instance_addr)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.geometry_id_and_flags, +- nir_load_var(&b, vars.geometry_id_and_flags)); +- radv_store_arg(&b, args, traversal_info, args->ac.rt.hit_kind, nir_load_var(&b, vars.hit_kind)); + } + + nir_metadata_preserve(impl, nir_metadata_none); + + /* cleanup passes */ ++ if (!monolithic) { ++ NIR_PASS_V(shader, radv_nir_lower_ray_payload_derefs, 0); ++ ++ b.cursor = nir_before_impl(impl); ++ nir_deref_instr **payload_in_storage = ++ rzalloc_array_size(shader, sizeof(nir_deref_instr *), DIV_ROUND_UP(*payload_size, 4)); ++ if (in_payload_base_arg != -1u) { ++ for (unsigned i = 0; i < DIV_ROUND_UP(*payload_size, 4); ++i) { ++ payload_in_storage[i] = nir_build_deref_cast(&b, nir_load_param(&b, in_payload_base_arg + i), ++ nir_var_shader_call_data, glsl_uint_type(), 4); ++ } ++ } ++ NIR_PASS_V(shader, lower_rt_storage, NULL, payload_in_storage, vars.out_payload_storage, info->wave_size); ++ ++ nir_remove_dead_derefs(shader); ++ nir_remove_dead_variables(shader, nir_var_function_temp | nir_var_shader_call_data, NULL); ++ } + NIR_PASS_V(shader, nir_lower_global_vars_to_local); + NIR_PASS_V(shader, nir_lower_vars_to_ssa); +- if (shader->info.stage == MESA_SHADER_CLOSEST_HIT || shader->info.stage == MESA_SHADER_INTERSECTION) +- NIR_PASS_V(shader, lower_hit_attribs, NULL, info->wave_size); + } + + static bool +diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c +index c4feea4a6f95b..196f8aa23a032 100644 +--- a/src/amd/vulkan/radv_pipeline_rt.c ++++ b/src/amd/vulkan/radv_pipeline_rt.c +@@ -368,7 +368,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, + bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags); + bool keep_statistic_info = radv_pipeline_capture_shader_stats(device, pipeline->base.base.create_flags); + +- radv_nir_lower_rt_io(stage->nir, monolithic, 0, payload_size); ++ radv_nir_lower_rt_io(stage->nir, monolithic, 0); + + /* Gather shader info. 
*/ + nir_shader_gather_info(stage->nir, nir_shader_get_entrypoint(stage->nir)); +@@ -382,70 +382,30 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, + stage->info.user_sgprs_locs = stage->args.user_sgprs_locs; + stage->info.inline_push_constant_mask = stage->args.ac.inline_push_const_mask; + +- /* Move ray tracing system values to the top that are set by rt_trace_ray +- * to prevent them from being overwritten by other rt_trace_ray calls. +- */ +- NIR_PASS_V(stage->nir, move_rt_instructions); +- +- uint32_t num_resume_shaders = 0; +- nir_shader **resume_shaders = NULL; +- +- if (stage->stage != MESA_SHADER_INTERSECTION && !monolithic) { +- nir_builder b = nir_builder_at(nir_after_impl(nir_shader_get_entrypoint(stage->nir))); +- nir_rt_return_amd(&b); +- +- const nir_lower_shader_calls_options opts = { +- .address_format = nir_address_format_32bit_offset, +- .stack_alignment = 16, +- .localized_loads = true, +- .vectorizer_callback = ac_nir_mem_vectorize_callback, +- .vectorizer_data = &pdev->info.gfx_level, +- }; +- nir_lower_shader_calls(stage->nir, &opts, &resume_shaders, &num_resume_shaders, stage->nir); +- } +- +- unsigned num_shaders = num_resume_shaders + 1; +- nir_shader **shaders = ralloc_array(stage->nir, nir_shader *, num_shaders); +- if (!shaders) +- return VK_ERROR_OUT_OF_HOST_MEMORY; +- +- shaders[0] = stage->nir; +- for (uint32_t i = 0; i < num_resume_shaders; i++) +- shaders[i + 1] = resume_shaders[i]; +- + if (stage_info) + memset(stage_info->unused_args, 0xFF, sizeof(stage_info->unused_args)); + + /* Postprocess shader parts. */ +- for (uint32_t i = 0; i < num_shaders; i++) { +- struct radv_shader_stage temp_stage = *stage; +- temp_stage.nir = shaders[i]; +- radv_nir_lower_rt_abi(temp_stage.nir, pCreateInfo, &temp_stage.args, &stage->info, stack_size, i > 0, device, +- pipeline, monolithic, traversal_stage_info); ++ radv_nir_lower_rt_abi(stage->nir, pCreateInfo, &stage->args, &stage->info, payload_size, stack_size, device, ++ pipeline, monolithic); + +- /* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */ +- nir_shader_gather_info(temp_stage.nir, radv_get_rt_shader_entrypoint(temp_stage.nir)); ++ /* Info might be out-of-date after inlining in radv_nir_lower_rt_abi(). */ ++ nir_shader_gather_info(stage->nir, radv_get_rt_shader_entrypoint(stage->nir)); + +- radv_optimize_nir(temp_stage.nir, stage->key.optimisations_disabled); +- radv_postprocess_nir(device, NULL, &temp_stage); +- +- if (stage_info) +- radv_gather_unused_args(stage_info, shaders[i]); +- } ++ radv_optimize_nir(stage->nir, stage->key.optimisations_disabled); ++ radv_postprocess_nir(device, NULL, stage); + +- bool dump_shader = radv_can_dump_shader(device, shaders[0], false); ++ bool dump_shader = radv_can_dump_shader(device, stage->nir, false); + bool replayable = + pipeline->base.base.create_flags & VK_PIPELINE_CREATE_2_RAY_TRACING_SHADER_GROUP_HANDLE_CAPTURE_REPLAY_BIT_KHR; + + if (dump_shader) { + simple_mtx_lock(&instance->shader_dump_mtx); +- for (uint32_t i = 0; i < num_shaders; i++) +- nir_print_shader(shaders[i], stderr); ++ nir_print_shader(stage->nir, stderr); + } + + /* Compile NIR shader to AMD assembly. 
*/ +- binary = +- radv_shader_nir_to_asm(device, stage, shaders, num_shaders, NULL, keep_executable_info, keep_statistic_info); ++ binary = radv_shader_nir_to_asm(device, stage, &stage->nir, 1, NULL, keep_executable_info, keep_statistic_info); + struct radv_shader *shader; + if (replay_block || replayable) { + VkResult result = radv_shader_create_uncached(device, binary, replayable, replay_block, &shader); +@@ -463,7 +423,7 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, + if (stack_size) + *stack_size += DIV_ROUND_UP(shader->config.scratch_bytes_per_wave, shader->info.wave_size); + +- radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, binary, shader, shaders, num_shaders, ++ radv_shader_generate_debug_info(device, dump_shader, keep_executable_info, binary, shader, &stage->nir, 1, + &stage->info); + + if (shader && keep_executable_info && stage->spirv.size) { +diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h +index f6a0f35c23333..654ae528866d8 100644 +--- a/src/amd/vulkan/radv_shader.h ++++ b/src/amd/vulkan/radv_shader.h +@@ -516,15 +516,14 @@ radv_get_rt_shader_entrypoint(nir_shader *shader) + return NULL; + } + +-void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset, uint32_t *payload_size); ++void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_offset); + + struct radv_ray_tracing_stage_info; + + void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + const struct radv_shader_args *args, const struct radv_shader_info *info, +- uint32_t *stack_size, bool resume_shader, uint32_t payload_size, struct radv_device *device, +- struct radv_ray_tracing_pipeline *pipeline, bool monolithic, +- const struct radv_ray_tracing_stage_info *traversal_info); ++ uint32_t *payload_size, uint32_t *stack_size, struct radv_device *device, ++ struct radv_ray_tracing_pipeline *pipeline, bool monolithic); + + void radv_gather_unused_args(struct radv_ray_tracing_stage_info *info, nir_shader *nir); + +diff --git a/src/amd/vulkan/radv_shader_args.c b/src/amd/vulkan/radv_shader_args.c +index 75f5a66444f91..e52fc48c33ebc 100644 +--- a/src/amd/vulkan/radv_shader_args.c ++++ b/src/amd/vulkan/radv_shader_args.c +@@ -316,7 +316,7 @@ radv_init_shader_args(const struct radv_device *device, gl_shader_stage stage, s + void + radv_declare_rt_shader_args(enum amd_gfx_level gfx_level, struct radv_shader_args *args) + { +- add_ud_arg(args, 2, AC_ARG_CONST_PTR, &args->ac.rt.uniform_shader_addr, AC_UD_SCRATCH_RING_OFFSETS); ++ add_ud_arg(args, 2, AC_ARG_CONST_PTR, &args->ac.ring_offsets, AC_UD_SCRATCH_RING_OFFSETS); + add_ud_arg(args, 1, AC_ARG_CONST_PTR_PTR, &args->descriptor_sets[0], AC_UD_INDIRECT_DESCRIPTOR_SETS); + ac_add_arg(&args->ac, AC_ARG_SGPR, 1, AC_ARG_CONST_PTR, &args->ac.push_constants); + ac_add_arg(&args->ac, AC_ARG_SGPR, 2, AC_ARG_CONST_DESC_PTR, &args->ac.rt.sbt_descriptors); +@@ -334,25 +334,8 @@ radv_declare_rt_shader_args(enum amd_gfx_level gfx_level, struct radv_shader_arg + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.launch_ids[i]); + + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.dynamic_callable_stack_base); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.shader_addr); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.shader_record); + + ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.payload_offset); +- 
ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_FLOAT, &args->ac.rt.ray_origin); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 3, AC_ARG_FLOAT, &args->ac.rt.ray_direction); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.rt.ray_tmin); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_FLOAT, &args->ac.rt.ray_tmax); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.cull_mask_and_flags); +- +- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.accel_struct); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.sbt_offset); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.sbt_stride); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.miss_index); +- +- ac_add_arg(&args->ac, AC_ARG_VGPR, 2, AC_ARG_CONST_PTR, &args->ac.rt.instance_addr); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.primitive_id); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.geometry_id_and_flags); +- ac_add_arg(&args->ac, AC_ARG_VGPR, 1, AC_ARG_INT, &args->ac.rt.hit_kind); + } + + static bool +@@ -548,7 +531,6 @@ declare_shader_args(const struct radv_device *device, const struct radv_graphics + radv_init_shader_args(device, stage, args); + + if (gl_shader_stage_is_rt(stage)) { +- radv_declare_rt_shader_args(gfx_level, args); + return; + } + +-- +GitLab + + +From 7a6a16e551cf02df8e14d8b729584ca9d8bf5443 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 12:22:46 +0200 +Subject: [PATCH 29/71] radv/rt: Remove radv_gather_unused_args + +Not needed anymore. +--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 47 ------------------------- + src/amd/vulkan/radv_shader.h | 2 -- + 2 files changed, 49 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index 7968cb36f5d87..d0e43ebd406b7 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -2457,50 +2457,3 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH + NIR_PASS_V(shader, nir_lower_global_vars_to_local); + NIR_PASS_V(shader, nir_lower_vars_to_ssa); + } +- +-static bool +-radv_arg_def_is_unused(nir_def *def) +-{ +- nir_foreach_use (use, def) { +- nir_instr *use_instr = nir_src_parent_instr(use); +- if (use_instr->type == nir_instr_type_intrinsic) { +- nir_intrinsic_instr *use_intr = nir_instr_as_intrinsic(use_instr); +- if (use_intr->intrinsic == nir_intrinsic_store_scalar_arg_amd || +- use_intr->intrinsic == nir_intrinsic_store_vector_arg_amd) +- continue; +- } else if (use_instr->type == nir_instr_type_phi) { +- nir_cf_node *prev_node = nir_cf_node_prev(&use_instr->block->cf_node); +- if (!prev_node) +- return false; +- +- nir_phi_instr *phi = nir_instr_as_phi(use_instr); +- if (radv_arg_def_is_unused(&phi->def)) +- continue; +- } +- +- return false; +- } +- +- return true; +-} +- +-static bool +-radv_gather_unused_args_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data) +-{ +- if (instr->intrinsic != nir_intrinsic_load_scalar_arg_amd && instr->intrinsic != nir_intrinsic_load_vector_arg_amd) +- return false; +- +- if (!radv_arg_def_is_unused(&instr->def)) { +- /* This arg is used for more than passing data to the next stage. 
*/ +- struct radv_ray_tracing_stage_info *info = data; +- BITSET_CLEAR(info->unused_args, nir_intrinsic_base(instr)); +- } +- +- return false; +-} +- +-void +-radv_gather_unused_args(struct radv_ray_tracing_stage_info *info, nir_shader *nir) +-{ +- nir_shader_intrinsics_pass(nir, radv_gather_unused_args_instr, nir_metadata_all, info); +-} +diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h +index 654ae528866d8..7dacf66a7a3fa 100644 +--- a/src/amd/vulkan/radv_shader.h ++++ b/src/amd/vulkan/radv_shader.h +@@ -525,8 +525,6 @@ void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateI + uint32_t *payload_size, uint32_t *stack_size, struct radv_device *device, + struct radv_ray_tracing_pipeline *pipeline, bool monolithic); + +-void radv_gather_unused_args(struct radv_ray_tracing_stage_info *info, nir_shader *nir); +- + struct radv_shader_stage; + + nir_shader *radv_shader_spirv_to_nir(struct radv_device *device, const struct radv_shader_stage *stage, +-- +GitLab + + +From c4aa21f8f03032e97d13aece927b62240986fd39 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 4 May 2024 17:51:17 +0200 +Subject: [PATCH 30/71] radv/rt: make radv_nir_init_rt_function_params public + +--- + src/amd/vulkan/nir/radv_nir_rt_shader.c | 10 +++++----- + src/amd/vulkan/radv_shader.h | 1 + + 2 files changed, 6 insertions(+), 5 deletions(-) + +diff --git a/src/amd/vulkan/nir/radv_nir_rt_shader.c b/src/amd/vulkan/nir/radv_nir_rt_shader.c +index d0e43ebd406b7..aa9af1eeefd54 100644 +--- a/src/amd/vulkan/nir/radv_nir_rt_shader.c ++++ b/src/amd/vulkan/nir/radv_nir_rt_shader.c +@@ -221,7 +221,7 @@ lower_rt_derefs(nir_shader *shader) + return progress; + } + +-static void ++void + radv_nir_init_rt_function_params(nir_function *function, gl_shader_stage stage, unsigned payload_size) + { + unsigned payload_base = -1u; +@@ -583,13 +583,13 @@ create_rt_variables(nir_shader *shader, struct radv_device *device, const VkPipe + } + + nir_function *trace_ray_func = nir_function_create(shader, "trace_ray_func"); +- radv_nir_init_function_params(trace_ray_func, MESA_SHADER_INTERSECTION, max_payload_size); ++ radv_nir_init_rt_function_params(trace_ray_func, MESA_SHADER_INTERSECTION, max_payload_size); + vars.trace_ray_func = trace_ray_func; + nir_function *chit_miss_func = nir_function_create(shader, "chit_miss_func"); +- radv_nir_init_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, max_payload_size); ++ radv_nir_init_rt_function_params(chit_miss_func, MESA_SHADER_CLOSEST_HIT, max_payload_size); + vars.chit_miss_func = chit_miss_func; + nir_function *callable_func = nir_function_create(shader, "callable_func"); +- radv_nir_init_function_params(callable_func, MESA_SHADER_CALLABLE, max_payload_size); ++ radv_nir_init_rt_function_params(callable_func, MESA_SHADER_CALLABLE, max_payload_size); + vars.callable_func = callable_func; + return vars; + } +@@ -2287,7 +2287,7 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH + nir_function_impl *impl = nir_shader_get_entrypoint(shader); + nir_function *entrypoint_function = impl->function; + +- radv_nir_init_function_params(entrypoint_function, shader->info.stage, *payload_size); ++ radv_nir_init_rt_function_params(entrypoint_function, shader->info.stage, *payload_size); + + const VkPipelineCreateFlagBits2KHR create_flags = vk_rt_pipeline_create_flags(pCreateInfo); + +diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h +index 7dacf66a7a3fa..10e062fb041b9 100644 +--- 
a/src/amd/vulkan/radv_shader.h ++++ b/src/amd/vulkan/radv_shader.h +@@ -520,6 +520,7 @@ void radv_nir_lower_rt_io(nir_shader *shader, bool monolithic, uint32_t payload_ + + struct radv_ray_tracing_stage_info; + ++void radv_nir_init_rt_function_params(nir_function *function, gl_shader_stage stage, unsigned payload_size); + void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, + const struct radv_shader_args *args, const struct radv_shader_info *info, + uint32_t *payload_size, uint32_t *stack_size, struct radv_device *device, +-- +GitLab + + +From 98acf10bc32ec843f53497bc701a673777232c65 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Fri, 3 May 2024 17:36:43 +0200 +Subject: [PATCH 31/71] radv: Use call optimization + +--- + src/amd/vulkan/radv_pipeline.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c +index 82a5aac71437d..daaf4e9ba4f00 100644 +--- a/src/amd/vulkan/radv_pipeline.c ++++ b/src/amd/vulkan/radv_pipeline.c +@@ -643,6 +643,8 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat + * spilling. + */ + NIR_PASS(_, stage->nir, nir_opt_move, nir_move_comparisons); ++ ++ NIR_PASS(_, stage->nir, nir_minimize_call_live_states); + } + } + +-- +GitLab + + +From 872b8a249c2fa92a5425c4476d7021d881d76990 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Thu, 28 Dec 2023 20:03:05 +0100 +Subject: [PATCH 32/71] aco: Add ABI and Pseudo CALL format + +--- + src/amd/compiler/aco_builder_h.py | 29 +++ + .../compiler/aco_instruction_selection.cpp | 23 ++ + src/amd/compiler/aco_ir.cpp | 1 + + src/amd/compiler/aco_ir.h | 235 +++++++++++++++++- + src/amd/compiler/aco_opcodes.py | 7 +- + src/amd/compiler/aco_register_allocation.cpp | 71 ------ + 6 files changed, 292 insertions(+), 74 deletions(-) + +diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py +index b1054bda76fd6..3d47be6101248 100644 +--- a/src/amd/compiler/aco_builder_h.py ++++ b/src/amd/compiler/aco_builder_h.py +@@ -567,6 +567,7 @@ formats = [("pseudo", [Format.PSEUDO], list(itertools.product(range(5), range(6) + ("branch", [Format.PSEUDO_BRANCH], itertools.product([1], [0, 1])), + ("barrier", [Format.PSEUDO_BARRIER], [(0, 0)]), + ("reduction", [Format.PSEUDO_REDUCTION], [(3, 3)]), ++ ("call", [Format.PSEUDO_CALL], [(0, 0)]), + ("vop1", [Format.VOP1], [(0, 0), (1, 1), (2, 2)]), + ("vop1_sdwa", [Format.VOP1, Format.SDWA], [(1, 1)]), + ("vop2", [Format.VOP2], itertools.product([1, 2], [2, 3])), +@@ -603,6 +604,7 @@ formats = [("pseudo", [Format.PSEUDO], list(itertools.product(range(5), range(6) + formats = [(f if len(f) == 5 else f + ('',)) for f in formats] + %>\\ + % for name, formats, shapes, extra_field_setup in formats: ++ % if shapes: + % for num_definitions, num_operands in shapes: + <% + args = ['aco_opcode opcode'] +@@ -655,6 +657,33 @@ formats = [(f if len(f) == 5 else f + ('',)) for f in formats] + + % endif + % endfor ++% else: ++ <% ++ args = ['aco_opcode opcode', 'aco::span definitions', 'aco::span operands' ] ++ for f in formats: ++ args += f.get_builder_field_decls() ++ %>\\ ++ ++ Result ${name}(${', '.join(args)}) ++ { ++ ${struct} *instr = create_instruction<${struct}>(opcode, (Format)(${'|'.join('(int)Format::%s' % f.name for f in formats)}), operands.size(), definitions.size()); ++ for (unsigned i = 0; i < definitions.size(); ++i) { ++ instr->definitions[i] = definitions[i]; ++ instr->definitions[i].setPrecise(is_precise); ++ 
instr->definitions[i].setNUW(is_nuw); ++ } ++ for (unsigned i = 0; i < operands.size(); ++i) ++ instr->operands[i] = operands[i]; ++ % for f in formats: ++ % for dest, field_name in zip(f.get_builder_field_dests(), f.get_builder_field_names()): ++ instr->${dest} = ${field_name}; ++ % endfor ++ ${f.get_builder_initialization(num_operands)} ++ % endfor ++ ${extra_field_setup} ++ return insert(instr); ++ } ++% endif + % endfor + }; + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index 30f0bdd1cb8f8..662b6cccc0abf 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -10537,6 +10537,29 @@ visit_jump(isel_context* ctx, nir_jump_instr* instr) + } + } + ++ABI ++make_abi(const ABI& base, Program* program) ++{ ++ ABI abi = base; ++ ++ unsigned sgpr_limit = program->dev.sgpr_limit; ++ /* GFX8- needs a scratch_rsrc that we need to keep around somewhere */ ++ if (program->gfx_level < GFX9) ++ sgpr_limit -= (align(sgpr_limit, 4) - sgpr_limit) + 4; ++ unsigned vgpr_limit = program->dev.vgpr_limit; ++ ++ abi.parameterSpace.sgpr.size = ++ std::min(abi.parameterSpace.sgpr.size, sgpr_limit - abi.parameterSpace.sgpr.lo()); ++ abi.parameterSpace.vgpr.size = ++ std::min(abi.parameterSpace.vgpr.size, vgpr_limit - (abi.parameterSpace.vgpr.lo() - 256)); ++ abi.clobberedRegs.sgpr.size = ++ std::min(abi.clobberedRegs.sgpr.size, sgpr_limit - abi.clobberedRegs.sgpr.lo()); ++ abi.clobberedRegs.vgpr.size = ++ std::min(abi.clobberedRegs.vgpr.size, vgpr_limit - (abi.clobberedRegs.vgpr.lo() - 256)); ++ ++ return abi; ++} ++ + void + visit_block(isel_context* ctx, nir_block* block) + { +diff --git a/src/amd/compiler/aco_ir.cpp b/src/amd/compiler/aco_ir.cpp +index 2c0b17a82cae6..afa1364a83f59 100644 +--- a/src/amd/compiler/aco_ir.cpp ++++ b/src/amd/compiler/aco_ir.cpp +@@ -1541,6 +1541,7 @@ get_instr_data_size(Format format) + case Format::PSEUDO_BARRIER: return sizeof(Pseudo_barrier_instruction); + case Format::PSEUDO_REDUCTION: return sizeof(Pseudo_reduction_instruction); + case Format::PSEUDO_BRANCH: return sizeof(Pseudo_branch_instruction); ++ case Format::PSEUDO_CALL: return sizeof(Pseudo_call_instruction); + case Format::DS: return sizeof(DS_instruction); + case Format::FLAT: + case Format::GLOBAL: +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index d838b728e19ce..62661b8918a9e 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -441,6 +441,215 @@ static constexpr PhysReg exec_hi{127}; + static constexpr PhysReg pops_exiting_wave_id{239}; /* GFX9-GFX10.3 */ + static constexpr PhysReg scc{253}; + ++/* Iterator type for making PhysRegInterval compatible with range-based for */ ++struct PhysRegIterator { ++ using difference_type = int; ++ using value_type = unsigned; ++ using reference = const unsigned&; ++ using pointer = const unsigned*; ++ using iterator_category = std::bidirectional_iterator_tag; ++ ++ PhysReg reg; ++ ++ PhysReg operator*() const { return reg; } ++ ++ PhysRegIterator& operator++() ++ { ++ reg.reg_b += 4; ++ return *this; ++ } ++ ++ PhysRegIterator& operator--() ++ { ++ reg.reg_b -= 4; ++ return *this; ++ } ++ ++ bool operator==(PhysRegIterator oth) const { return reg == oth.reg; } ++ ++ bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; } ++ ++ bool operator<(PhysRegIterator oth) const { return reg < oth.reg; } ++}; ++ ++/* Half-open register interval used in "sliding window"-style for-loops */ ++struct 
PhysRegInterval { ++ PhysReg lo_; ++ unsigned size; ++ ++ /* Inclusive lower bound */ ++ PhysReg lo() const { return lo_; } ++ ++ /* Exclusive upper bound */ ++ PhysReg hi() const { return PhysReg{lo() + size}; } ++ ++ PhysRegInterval& operator+=(uint32_t stride) ++ { ++ lo_ = PhysReg{lo_.reg() + stride}; ++ return *this; ++ } ++ ++ bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; } ++ ++ /* Construct a half-open interval, excluding the end register */ ++ static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; } ++ ++ bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); } ++ ++ bool contains(const PhysRegInterval& needle) const ++ { ++ return needle.lo() >= lo() && needle.hi() <= hi(); ++ } ++ ++ PhysRegIterator begin() const { return {lo_}; } ++ ++ PhysRegIterator end() const { return {PhysReg{lo_ + size}}; } ++}; ++ ++inline bool ++intersects(const PhysRegInterval& a, const PhysRegInterval& b) ++{ ++ return a.hi() > b.lo() && b.hi() > a.lo(); ++} ++ ++struct GPRInterval { ++ PhysRegInterval sgpr; ++ PhysRegInterval vgpr; ++}; ++ ++struct ABI { ++ GPRInterval parameterSpace; ++ GPRInterval clobberedRegs; ++ ++ bool clobbersVCC; ++ bool clobbersSCC; ++}; ++ ++static constexpr ABI rtRaygenABI = { ++ .parameterSpace = ++ { ++ .sgpr = ++ { ++ .lo_ = PhysReg(0), ++ .size = 32, ++ }, ++ .vgpr = ++ { ++ .lo_ = PhysReg(256), ++ .size = 32, ++ }, ++ }, ++ .clobberedRegs = ++ { ++ .sgpr = ++ { ++ .lo_ = PhysReg(0), ++ .size = 108, ++ }, ++ .vgpr = ++ { ++ .lo_ = PhysReg(256), ++ .size = 128, ++ }, ++ }, ++ .clobbersVCC = true, ++ .clobbersSCC = true, ++}; ++ ++static constexpr ABI rtTraversalABI = { ++ .parameterSpace = ++ { ++ .sgpr = ++ { ++ .lo_ = PhysReg(0), ++ .size = 32, ++ }, ++ .vgpr = ++ { ++ .lo_ = PhysReg(256), ++ .size = 32, ++ }, ++ }, ++ .clobberedRegs = ++ { ++ /* TODO: maybe find better values */ ++ .sgpr = ++ { ++ .lo_ = PhysReg(0), ++ .size = 108, ++ }, ++ .vgpr = ++ { ++ .lo_ = PhysReg(256), ++ .size = 128, ++ }, ++ }, ++ .clobbersVCC = true, ++ .clobbersSCC = true, ++}; ++ ++static constexpr ABI rtAnyHitABI = { ++ .parameterSpace = ++ { ++ .sgpr = ++ { ++ .lo_ = PhysReg(0), ++ .size = 32, ++ }, ++ .vgpr = ++ { ++ .lo_ = PhysReg(256), ++ .size = 32, ++ }, ++ }, ++ .clobberedRegs = ++ { ++ .sgpr = ++ { ++ .lo_ = PhysReg(80), ++ .size = 16, ++ }, ++ .vgpr = ++ { ++ .lo_ = PhysReg(256 + 80), ++ .size = 32, ++ }, ++ }, ++ .clobbersVCC = true, ++ .clobbersSCC = true, ++}; ++ ++static constexpr ABI rtClosestHitMissABI = { ++ .parameterSpace = ++ { ++ .sgpr = ++ { ++ .lo_ = PhysReg(0), ++ .size = 32, ++ }, ++ .vgpr = ++ { ++ .lo_ = PhysReg(256), ++ .size = 32, ++ }, ++ }, ++ .clobberedRegs = ++ { ++ .sgpr = ++ { ++ .lo_ = PhysReg(0), ++ .size = 108, ++ }, ++ .vgpr = ++ { ++ .lo_ = PhysReg(256), ++ .size = 128, ++ }, ++ }, ++ .clobbersVCC = true, ++ .clobbersSCC = true, ++}; ++ + /** + * Operand Class + * Initially, each Operand refers to either +@@ -1095,6 +1304,7 @@ struct FLAT_instruction; + struct Pseudo_branch_instruction; + struct Pseudo_barrier_instruction; + struct Pseudo_reduction_instruction; ++struct Pseudo_call_instruction; + struct VALU_instruction; + struct VINTERP_inreg_instruction; + struct VINTRP_instruction; +@@ -1295,6 +1505,17 @@ struct Instruction { + return *(Pseudo_reduction_instruction*)this; + } + constexpr bool isReduction() const noexcept { return format == Format::PSEUDO_REDUCTION; } ++ Pseudo_call_instruction& call() noexcept ++ { ++ assert(isCall()); ++ return 
*(Pseudo_call_instruction*)this; ++ } ++ const Pseudo_call_instruction& call() const noexcept ++ { ++ assert(isCall()); ++ return *(Pseudo_call_instruction*)this; ++ } ++ constexpr bool isCall() const noexcept { return format == Format::PSEUDO_CALL; } + constexpr bool isVOP3P() const noexcept { return (uint16_t)format & (uint16_t)Format::VOP3P; } + VINTERP_inreg_instruction& vinterp_inreg() noexcept + { +@@ -1773,6 +1994,16 @@ struct Pseudo_reduction_instruction : public Instruction { + static_assert(sizeof(Pseudo_reduction_instruction) == sizeof(Instruction) + 4, + "Unexpected padding"); + ++struct Pseudo_call_instruction : public Instruction { ++ ABI abi; ++ /* ++ * Register demand that's exclusively used for blocking registers for ABI compatibility. ++ * Set by live var analysis. ++ */ ++ RegisterDemand blocked_abi_demand; ++}; ++static_assert(sizeof(Pseudo_call_instruction) == sizeof(Instruction) + 40, "Unexpected padding"); ++ + inline bool + Instruction::accessesLDS() const noexcept + { +@@ -1845,8 +2076,8 @@ memory_sync_info get_sync_info(const Instruction* instr); + inline bool + is_dead(const std::vector& uses, const Instruction* instr) + { +- if (instr->definitions.empty() || instr->isBranch() || instr->opcode == aco_opcode::p_startpgm || +- instr->opcode == aco_opcode::p_init_scratch || ++ if (instr->definitions.empty() || instr->isBranch() || instr->isCall() || ++ instr->opcode == aco_opcode::p_startpgm || instr->opcode == aco_opcode::p_init_scratch || + instr->opcode == aco_opcode::p_dual_src_export_gfx11) + return false; + +diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py +index 6e37ee6fad6c9..d828f1642658b 100644 +--- a/src/amd/compiler/aco_opcodes.py ++++ b/src/amd/compiler/aco_opcodes.py +@@ -50,6 +50,7 @@ class Format(IntEnum): + PSEUDO_BRANCH = auto() + PSEUDO_BARRIER = auto() + PSEUDO_REDUCTION = auto() ++ PSEUDO_CALL = auto() + # Scalar ALU & Control Formats + SOP1 = auto() + SOP2 = auto() +@@ -93,7 +94,7 @@ class Format(IntEnum): + return "salu" + elif self in [Format.FLAT, Format.GLOBAL, Format.SCRATCH]: + return "flatlike" +- elif self in [Format.PSEUDO_BRANCH, Format.PSEUDO_REDUCTION, Format.PSEUDO_BARRIER]: ++ elif self in [Format.PSEUDO_BRANCH, Format.PSEUDO_REDUCTION, Format.PSEUDO_BARRIER, Format.PSEUDO_CALL]: + return self.name.split("_")[-1].lower() + else: + return self.name.lower() +@@ -162,6 +163,8 @@ class Format(IntEnum): + elif self == Format.PSEUDO_BARRIER: + return [('memory_sync_info', 'sync', None), + ('sync_scope', 'exec_scope', 'scope_invocation')] ++ elif self == Format.PSEUDO_CALL: ++ return [('ABI', 'abi', None)] + elif self == Format.VINTRP: + return [('unsigned', 'attribute', None), + ('unsigned', 'component', None), +@@ -351,6 +354,8 @@ insn("p_cbranch_nz", format=Format.PSEUDO_BRANCH) + + insn("p_barrier", format=Format.PSEUDO_BARRIER) + ++insn("p_call", format=Format.PSEUDO_CALL) ++ + # Primitive Ordered Pixel Shading pseudo-instructions. 
+ + # For querying whether the current wave can enter the ordered section on GFX9-10.3, doing +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index 3ce0680bf52d6..4d73525bd0660 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -74,37 +74,6 @@ struct assignment { + } + }; + +-/* Iterator type for making PhysRegInterval compatible with range-based for */ +-struct PhysRegIterator { +- using difference_type = int; +- using value_type = unsigned; +- using reference = const unsigned&; +- using pointer = const unsigned*; +- using iterator_category = std::bidirectional_iterator_tag; +- +- PhysReg reg; +- +- PhysReg operator*() const { return reg; } +- +- PhysRegIterator& operator++() +- { +- reg.reg_b += 4; +- return *this; +- } +- +- PhysRegIterator& operator--() +- { +- reg.reg_b -= 4; +- return *this; +- } +- +- bool operator==(PhysRegIterator oth) const { return reg == oth.reg; } +- +- bool operator!=(PhysRegIterator oth) const { return reg != oth.reg; } +- +- bool operator<(PhysRegIterator oth) const { return reg < oth.reg; } +-}; +- + struct vector_info { + vector_info() : is_weak(false), num_parts(0), parts(NULL) {} + vector_info(Instruction* instr, unsigned start = 0, bool weak = false) +@@ -162,46 +131,6 @@ struct ra_ctx { + } + }; + +-/* Half-open register interval used in "sliding window"-style for-loops */ +-struct PhysRegInterval { +- PhysReg lo_; +- unsigned size; +- +- /* Inclusive lower bound */ +- PhysReg lo() const { return lo_; } +- +- /* Exclusive upper bound */ +- PhysReg hi() const { return PhysReg{lo() + size}; } +- +- PhysRegInterval& operator+=(uint32_t stride) +- { +- lo_ = PhysReg{lo_.reg() + stride}; +- return *this; +- } +- +- bool operator!=(const PhysRegInterval& oth) const { return lo_ != oth.lo_ || size != oth.size; } +- +- /* Construct a half-open interval, excluding the end register */ +- static PhysRegInterval from_until(PhysReg first, PhysReg end) { return {first, end - first}; } +- +- bool contains(PhysReg reg) const { return lo() <= reg && reg < hi(); } +- +- bool contains(const PhysRegInterval& needle) const +- { +- return needle.lo() >= lo() && needle.hi() <= hi(); +- } +- +- PhysRegIterator begin() const { return {lo_}; } +- +- PhysRegIterator end() const { return {PhysReg{lo_ + size}}; } +-}; +- +-bool +-intersects(const PhysRegInterval& a, const PhysRegInterval& b) +-{ +- return a.hi() > b.lo() && b.hi() > a.lo(); +-} +- + /* Gets the stride for full (non-subdword) registers */ + uint32_t + get_stride(RegClass rc) +-- +GitLab + + +From 9d88284e83bab4a0ba20700dc3be48c646284a79 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Tue, 9 Apr 2024 08:08:07 +0200 +Subject: [PATCH 33/71] aco: Add pseudo instr to calculate a function callee's + stack pointer + +--- + src/amd/compiler/aco_lower_to_hw_instr.cpp | 14 ++++++++++++++ + src/amd/compiler/aco_opcodes.py | 2 ++ + 2 files changed, 16 insertions(+) + +diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp +index fa3c805f491b5..1e1737319c3f6 100644 +--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp ++++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp +@@ -2817,6 +2817,20 @@ lower_to_hw_instr(Program* program) + ((32 - 1) << 11) | shader_cycles_hi); + break; + } ++ case aco_opcode::p_callee_stack_ptr: { ++ unsigned caller_stack_size = ++ ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; ++ unsigned scratch_param_size = 
instr->operands[0].constantValue(); ++ unsigned callee_stack_start = caller_stack_size + scratch_param_size; ++ if (ctx.program->gfx_level < GFX9) ++ callee_stack_start *= ctx.program->wave_size; ++ if (instr->operands.size() < 2) ++ bld.sop1(aco_opcode::s_mov_b32, instr->definitions[0], ++ Operand::c32(callee_stack_start)); ++ else ++ bld.sop2(aco_opcode::s_add_u32, instr->definitions[0], Definition(scc, s1), ++ instr->operands[1], Operand::c32(callee_stack_start)); ++ } + default: break; + } + } else if (instr->isBranch()) { +diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py +index d828f1642658b..696a5a945b310 100644 +--- a/src/amd/compiler/aco_opcodes.py ++++ b/src/amd/compiler/aco_opcodes.py +@@ -331,6 +331,8 @@ insn("p_boolean_phi") + insn("p_as_uniform") + insn("p_unit_test") + ++insn("p_callee_stack_ptr") ++ + insn("p_create_vector") + insn("p_extract_vector") + insn("p_split_vector") +-- +GitLab + + +From 0e07c86fd764126d0af3bfb2041d680e9367ee6e Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 22 Apr 2024 06:50:54 +0200 +Subject: [PATCH 34/71] aco: Add scratch stack pointer + +Function callees shouldn't overwrite caller's stacks. +Track where to write scratch data with a stack pointer. +--- + src/amd/compiler/aco_ir.h | 1 + + src/amd/compiler/aco_reindex_ssa.cpp | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 62661b8918a9e..ef2a6a0255664 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2361,6 +2361,7 @@ public: + std::vector constant_data; + Temp private_segment_buffer; + Temp scratch_offset; ++ Temp stack_ptr = {}; + + uint16_t num_waves = 0; + uint16_t min_waves = 0; +diff --git a/src/amd/compiler/aco_reindex_ssa.cpp b/src/amd/compiler/aco_reindex_ssa.cpp +index 7c30e5b53656e..5e135a8ff83fe 100644 +--- a/src/amd/compiler/aco_reindex_ssa.cpp ++++ b/src/amd/compiler/aco_reindex_ssa.cpp +@@ -73,6 +73,7 @@ reindex_program(idx_ctx& ctx, Program* program) + program->private_segment_buffer.regClass()); + program->scratch_offset = + Temp(ctx.renames[program->scratch_offset.id()], program->scratch_offset.regClass()); ++ program->stack_ptr = Temp(ctx.renames[program->stack_ptr.id()], program->stack_ptr.regClass()); + program->temp_rc = ctx.temp_rc; + } + +-- +GitLab + + +From e876db458963a92579827a04a21b1427c0442c72 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 22 Apr 2024 06:51:10 +0200 +Subject: [PATCH 35/71] aco/spill: Use scratch stack pointer + +--- + src/amd/compiler/aco_spill.cpp | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp +index be45b0eda7632..2e30bf9e2783e 100644 +--- a/src/amd/compiler/aco_spill.cpp ++++ b/src/amd/compiler/aco_spill.cpp +@@ -1240,7 +1240,12 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, + } + + /* GFX9+ uses scratch_* instructions, which don't use a resource. 
*/ +- ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr)); ++ if (ctx.program->stack_ptr.id()) ++ ctx.scratch_rsrc = ++ offset_bld.sop2(aco_opcode::s_add_u32, offset_bld.def(s1), Definition(scc, s1), ++ Operand(ctx.program->stack_ptr), Operand::c32(saddr)); ++ else ++ ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr)); + } + } else { + if (ctx.scratch_rsrc == Temp()) +-- +GitLab + + +From 968bea7283d902a01843297661c63ea802a67a04 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 4 May 2024 16:01:59 +0200 +Subject: [PATCH 36/71] nir: Allow forward-declaring nir_parameter + +--- + src/compiler/nir/nir.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h +index 10a592f4b87bb..dc6b15cd082b4 100644 +--- a/src/compiler/nir/nir.h ++++ b/src/compiler/nir/nir.h +@@ -3646,7 +3646,7 @@ nir_cf_list_is_empty_block(struct exec_list *cf_list) + return false; + } + +-typedef struct { ++typedef struct nir_parameter { + uint8_t num_components; + uint8_t bit_size; + +-- +GitLab + + +From e245c9553b06094af7afc232d8db158bd2e7b3d6 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 6 Mar 2024 13:27:17 +0100 +Subject: [PATCH 37/71] aco: Add call info + +--- + .../compiler/aco_instruction_selection.cpp | 80 +++++++++++++++++++ + src/amd/compiler/aco_instruction_selection.h | 32 ++++++++ + .../aco_instruction_selection_setup.cpp | 8 ++ + src/amd/compiler/aco_ir.h | 4 + + 4 files changed, 124 insertions(+) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index 662b6cccc0abf..0875d1c7a20f4 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -10,6 +10,7 @@ + #include "aco_builder.h" + #include "aco_interface.h" + #include "aco_ir.h" ++#include "aco_nir_call_attribs.h" + + #include "common/ac_descriptors.h" + #include "common/ac_gpu_info.h" +@@ -10560,6 +10561,85 @@ make_abi(const ABI& base, Program* program) + return abi; + } + ++struct callee_info ++get_callee_info(const ABI& abi, unsigned param_count, const nir_parameter* parameters, ++ Program* program) ++{ ++ struct callee_info info = {}; ++ info.param_infos.reserve(param_count); ++ ++ unsigned sgpr_reg_byte_offset = 0; ++ unsigned vgpr_reg_byte_offset = 0; ++ unsigned scratch_param_byte_offset = 0; ++ ++ Temp return_addr = program ? 
program->allocateTmp(s2) : Temp(); ++ Definition return_def = Definition(return_addr); ++ return_def.setPrecolored(abi.parameterSpace.sgpr.lo().advance(sgpr_reg_byte_offset)); ++ sgpr_reg_byte_offset += 8; ++ ++ info.return_address = parameter_info{ ++ .discardable = false, ++ .is_reg = true, ++ .def = return_def, ++ }; ++ ++ for (unsigned i = 0; i < param_count; ++i) { ++ unsigned* reg_byte_offset; ++ PhysRegInterval interval; ++ RegType type; ++ if (parameters[i].is_uniform) { ++ reg_byte_offset = &sgpr_reg_byte_offset; ++ interval = abi.parameterSpace.sgpr; ++ /* Explicitly reserve space for the stack pointer, which is allocated last */ ++ interval.size -= 1; ++ type = RegType::sgpr; ++ } else { ++ reg_byte_offset = &vgpr_reg_byte_offset; ++ interval = abi.parameterSpace.vgpr; ++ type = RegType::vgpr; ++ } ++ ++ unsigned byte_size = align(parameters[i].bit_size, 32) / 8 * parameters[i].num_components; ++ RegClass rc = RegClass(type, byte_size / 4); ++ PhysReg param_reg = interval.lo().advance(*reg_byte_offset); ++ ++ if (param_reg < interval.hi()) { ++ ++info.reg_param_count; ++ if (parameters[i].is_return) ++ ++info.reg_return_param_count; ++ Temp dst = program ? program->allocateTmp(rc) : Temp(); ++ Definition def = Definition(dst); ++ def.setPrecolored(param_reg); ++ *reg_byte_offset += byte_size; ++ info.param_infos.emplace_back(parameter_info{ ++ .discardable = !!(parameters[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE), ++ .is_reg = true, ++ .def = def, ++ }); ++ } else { ++ info.param_infos.emplace_back(parameter_info{ ++ .discardable = !!(parameters[i].driver_attributes & ACO_NIR_PARAM_ATTRIB_DISCARDABLE), ++ .is_reg = false, ++ .scratch_offset = scratch_param_byte_offset, ++ }); ++ scratch_param_byte_offset += byte_size; ++ } ++ } ++ ++ Temp stack_ptr = program ? 
program->allocateTmp(s1) : Temp(); ++ Definition stack_def = Definition(stack_ptr); ++ stack_def.setPrecolored(abi.parameterSpace.sgpr.lo().advance(sgpr_reg_byte_offset)); ++ sgpr_reg_byte_offset += 4; ++ info.stack_ptr = parameter_info{ ++ .discardable = false, ++ .is_reg = true, ++ .def = stack_def, ++ }; ++ ++ info.scratch_param_size = scratch_param_byte_offset; ++ return info; ++} ++ + void + visit_block(isel_context* ctx, nir_block* block) + { +diff --git a/src/amd/compiler/aco_instruction_selection.h b/src/amd/compiler/aco_instruction_selection.h +index d7464811def91..1682ed262f1e5 100644 +--- a/src/amd/compiler/aco_instruction_selection.h ++++ b/src/amd/compiler/aco_instruction_selection.h +@@ -35,6 +35,31 @@ struct shader_io_state { + } + }; + ++struct parameter_info { ++ bool discardable; ++ bool is_reg; ++ union { ++ Definition def; ++ unsigned scratch_offset; ++ }; ++}; ++ ++struct call_info { ++ nir_call_instr* nir_instr; ++ Instruction* aco_instr; ++ std::vector return_info; ++ unsigned scratch_param_size; ++}; ++ ++struct callee_info { ++ std::vector param_infos; ++ parameter_info return_address; ++ parameter_info stack_ptr; ++ unsigned reg_param_count = 0; ++ unsigned reg_return_param_count = 0; ++ unsigned scratch_param_size = 0; ++}; ++ + struct exec_info { + /* Set to false when loop_nest_depth==0 && parent_if.is_divergent==false */ + bool potentially_empty_discard = false; +@@ -111,6 +136,13 @@ struct isel_context { + uint32_t wqm_instruction_idx; + + BITSET_DECLARE(output_args, AC_MAX_ARGS); ++ ++ /* Function information */ ++ ABI callee_abi; ++ struct callee_info callee_info; ++ std::vector call_infos; ++ Temp next_divergent_pc; ++ Temp next_pc; + }; + + inline Temp +diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp +index 28708503c6b38..f1cd92aad5fd2 100644 +--- a/src/amd/compiler/aco_instruction_selection_setup.cpp ++++ b/src/amd/compiler/aco_instruction_selection_setup.cpp +@@ -393,6 +393,8 @@ init_context(isel_context* ctx, nir_shader* shader) + ctx->program->allocateRange(impl->ssa_alloc); + RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; + ++ unsigned call_count = 0; ++ + /* TODO: make this recursive to improve compile times */ + bool done = false; + while (!done) { +@@ -699,12 +701,18 @@ init_context(isel_context* ctx, nir_shader* shader) + regclasses[phi->def.index] = rc; + break; + } ++ case nir_instr_type_call: { ++ ++call_count; ++ break; ++ } + default: break; + } + } + } + } + ++ ctx->call_infos.reserve(call_count); ++ + ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena; + ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr; + +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index ef2a6a0255664..920174ac50798 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2385,6 +2385,10 @@ public: + /* For shader part with previous shader part that has lds access. 
*/ + bool pending_lds_access = false; + ++ ABI callee_abi = {}; ++ unsigned short arg_sgpr_count; ++ unsigned short arg_vgpr_count; ++ + struct { + monotonic_buffer_resource memory; + /* live-in temps per block */ +-- +GitLab + + +From 112032179e9758b2c24ab0184b3dd73ff34d7266 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sun, 21 Apr 2024 17:52:58 +0200 +Subject: [PATCH 38/71] aco/isel: Use stack pointer parameter in + load/store_scratch + +--- + .../compiler/aco_instruction_selection.cpp | 32 +++++++++++++++++-- + 1 file changed, 29 insertions(+), 3 deletions(-) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index 0875d1c7a20f4..f985685b1d524 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -7751,11 +7751,28 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) + if (ctx->program->gfx_level >= GFX9) { + if (nir_src_is_const(instr->src[0])) { + uint32_t max = ctx->program->dev.scratch_global_offset_max + 1; +- info.offset = +- bld.copy(bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max))); ++ if (ctx->callee_info.stack_ptr.is_reg) ++ info.offset = ++ bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), ++ Operand(ctx->callee_info.stack_ptr.def.getTemp()), ++ Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max))); ++ else ++ info.offset = bld.copy( ++ bld.def(s1), Operand::c32(ROUND_DOWN_TO(nir_src_as_uint(instr->src[0]), max))); + info.const_offset = nir_src_as_uint(instr->src[0]) % max; + } else { +- info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa)); ++ if (ctx->callee_info.stack_ptr.is_reg) { ++ Temp store_offset = get_ssa_temp(ctx, instr->src[0].ssa); ++ if (store_offset.type() == RegType::sgpr) ++ info.offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), ++ Operand(ctx->callee_info.stack_ptr.def.getTemp()), ++ Operand(store_offset)); ++ else ++ info.offset = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), ++ Operand(ctx->callee_info.stack_ptr.def.getTemp()), ++ Operand(store_offset)); ++ } else ++ info.offset = Operand(get_ssa_temp(ctx, instr->src[0].ssa)); + } + EmitLoadParameters params = scratch_flat_load_params; + params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1; +@@ -7775,6 +7792,15 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) + Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + Temp offset = get_ssa_temp(ctx, instr->src[1].ssa); + ++ if (ctx->callee_info.stack_ptr.is_reg && ctx->program->gfx_level >= GFX9) { ++ if (offset.type() == RegType::sgpr) ++ offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), ++ Operand(ctx->callee_info.stack_ptr.def.getTemp()), Operand(offset)); ++ else ++ offset = bld.vop2(aco_opcode::v_add_u32, bld.def(v1), ++ Operand(ctx->callee_info.stack_ptr.def.getTemp()), Operand(offset)); ++ } ++ + unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; + unsigned writemask = util_widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); + +-- +GitLab + + +From b8e49a1b7325c6b46fa2bd27732047b213ef5bda Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 9 Mar 2024 11:15:43 +0100 +Subject: [PATCH 39/71] nir,aco: Add set_next_call_pc_amd intrinsic + +Used for lowering function calls +--- + src/amd/compiler/aco_instruction_selection.cpp | 5 +++++ + src/compiler/nir/nir_intrinsics.py | 2 ++ + 2 files changed, 7 insertions(+) + +diff 
--git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index f985685b1d524..d83801d8e35cc 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -9640,6 +9640,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) + bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)), + Operand::c32(nir_intrinsic_base(instr))); + break; ++ case nir_intrinsic_set_next_call_pc_amd: { ++ ctx->next_divergent_pc = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); ++ ctx->next_pc = get_ssa_temp(ctx, instr->src[1].ssa); ++ break; ++ } + default: + isel_err(&instr->instr, "Unimplemented intrinsic instr"); + abort(); +diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py +index 2a6de0c4b6f25..1e3efcf06446d 100644 +--- a/src/compiler/nir/nir_intrinsics.py ++++ b/src/compiler/nir/nir_intrinsics.py +@@ -2374,3 +2374,5 @@ intrinsic("enqueue_node_payloads", src_comp=[-1]) + + # Returns true if it has been called for every payload. + intrinsic("finalize_incoming_node_payload", src_comp=[-1], dest_comp=1) ++ ++intrinsic("set_next_call_pc_amd", src_comp=[1, 1], bit_sizes=[64]) +-- +GitLab + + +From c8aec7b77ef0fd5e1bb36cbf06929fd75523b8ca Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 26 Feb 2024 12:20:26 +0100 +Subject: [PATCH 40/71] nir,aco: add call_return_adress sysval + +--- + src/amd/compiler/aco_instruction_selection.cpp | 5 +++++ + src/compiler/nir/nir_divergence_analysis.c | 1 + + src/compiler/nir/nir_intrinsics.py | 1 + + 3 files changed, 7 insertions(+) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index d83801d8e35cc..d0d0dc1b036df 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -9640,6 +9640,11 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) + bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)), + Operand::c32(nir_intrinsic_base(instr))); + break; ++ case nir_intrinsic_load_call_return_address_amd: { ++ bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), ++ Operand(ctx->callee_info.return_address.def.getTemp())); ++ break; ++ } + case nir_intrinsic_set_next_call_pc_amd: { + ctx->next_divergent_pc = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); + ctx->next_pc = get_ssa_temp(ctx, instr->src[1].ssa); +diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c +index 78943c897922f..2fc4eda71aeb0 100644 +--- a/src/compiler/nir/nir_divergence_analysis.c ++++ b/src/compiler/nir/nir_divergence_analysis.c +@@ -344,6 +344,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) + case nir_intrinsic_load_samples_log2_agx: + case nir_intrinsic_load_active_subgroup_count_agx: + case nir_intrinsic_load_constant_base_ptr: ++ case nir_intrinsic_load_call_return_address_amd: + is_divergent = false; + break; + +diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py +index 1e3efcf06446d..808ee31420ba0 100644 +--- a/src/compiler/nir/nir_intrinsics.py ++++ b/src/compiler/nir/nir_intrinsics.py +@@ -2375,4 +2375,5 @@ intrinsic("enqueue_node_payloads", src_comp=[-1]) + # Returns true if it has been called for every payload. 
+ intrinsic("finalize_incoming_node_payload", src_comp=[-1], dest_comp=1) + ++system_value("call_return_address_amd", 1, bit_sizes=[64]) + intrinsic("set_next_call_pc_amd", src_comp=[1, 1], bit_sizes=[64]) +-- +GitLab + + +From a61f79118bc11db5dbbc1ef19c521c834936a637 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sun, 7 Jan 2024 22:15:13 +0100 +Subject: [PATCH 41/71] radv/nir: Lower NIR function call ABI + +--- + src/amd/vulkan/meson.build | 1 + + src/amd/vulkan/nir/radv_nir.h | 4 + + src/amd/vulkan/nir/radv_nir_lower_call_abi.c | 433 +++++++++++++++++++ + src/amd/vulkan/radv_pipeline.c | 4 + + src/compiler/nir/nir_divergence_analysis.c | 1 + + src/compiler/nir/nir_intrinsics.py | 3 + + 6 files changed, 446 insertions(+) + create mode 100644 src/amd/vulkan/nir/radv_nir_lower_call_abi.c + +diff --git a/src/amd/vulkan/meson.build b/src/amd/vulkan/meson.build +index 5976bef8b85cf..84414ad41b7c0 100644 +--- a/src/amd/vulkan/meson.build ++++ b/src/amd/vulkan/meson.build +@@ -66,6 +66,7 @@ libradv_files = files( + 'nir/radv_nir_apply_pipeline_layout.c', + 'nir/radv_nir_export_multiview.c', + 'nir/radv_nir_lower_abi.c', ++ 'nir/radv_nir_lower_call_abi.c', + 'nir/radv_nir_lower_cooperative_matrix.c', + 'nir/radv_nir_lower_fs_barycentric.c', + 'nir/radv_nir_lower_fs_intrinsics.c', +diff --git a/src/amd/vulkan/nir/radv_nir.h b/src/amd/vulkan/nir/radv_nir.h +index cd779d64e857c..e004de467ed3e 100644 +--- a/src/amd/vulkan/nir/radv_nir.h ++++ b/src/amd/vulkan/nir/radv_nir.h +@@ -90,6 +90,10 @@ typedef struct radv_nir_opt_tid_function_options { + + bool radv_nir_opt_tid_function(nir_shader *shader, const radv_nir_opt_tid_function_options *options); + ++void radv_nir_lower_callee_signature(nir_function *function, struct set *visited_funcs); ++ ++bool radv_nir_lower_call_abi(nir_shader *shader, unsigned wave_size); ++ + #ifdef __cplusplus + } + #endif +diff --git a/src/amd/vulkan/nir/radv_nir_lower_call_abi.c b/src/amd/vulkan/nir/radv_nir_lower_call_abi.c +new file mode 100644 +index 0000000000000..5f18f9aea0f28 +--- /dev/null ++++ b/src/amd/vulkan/nir/radv_nir_lower_call_abi.c +@@ -0,0 +1,433 @@ ++/* ++ * Copyright © 2023 Valve Corporation ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a ++ * copy of this software and associated documentation files (the "Software"), ++ * to deal in the Software without restriction, including without limitation ++ * the rights to use, copy, modify, merge, publish, distribute, sublicense, ++ * and/or sell copies of the Software, and to permit persons to whom the ++ * Software is furnished to do so, subject to the following conditions: ++ * ++ * The above copyright notice and this permission notice (including the next ++ * paragraph) shall be included in all copies or substantial portions of the ++ * Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL ++ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include "aco_nir_call_attribs.h" ++#include "nir_builder.h" ++#include "radv_nir.h" ++ ++void ++radv_nir_lower_callee_signature(nir_function *function, struct set *visited_funcs) ++{ ++ if (visited_funcs) { ++ if (_mesa_set_search(visited_funcs, function)) ++ return; ++ _mesa_set_add(visited_funcs, function); ++ } ++ ++ nir_parameter *old_params = function->params; ++ unsigned old_num_params = function->num_params; ++ ++ function->num_params += 2; ++ function->params = rzalloc_array_size(function->shader, function->num_params, sizeof(nir_parameter)); ++ ++ memcpy(function->params + 2, old_params, old_num_params * sizeof(nir_parameter)); ++ ++ function->params[0].num_components = 1; ++ function->params[0].bit_size = 64; ++ function->params[1].num_components = 1; ++ function->params[1].bit_size = 64; ++ function->params[1].is_uniform = true; ++ ++ nir_function_impl *impl = function->impl; ++ ++ if (!impl) ++ return; ++ ++ nir_foreach_block (block, impl) { ++ nir_foreach_instr_safe (instr, block) { ++ if (instr->type != nir_instr_type_intrinsic) ++ continue; ++ ++ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); ++ ++ if (intr->intrinsic == nir_intrinsic_load_param) ++ nir_intrinsic_set_param_idx(intr, nir_intrinsic_param_idx(intr) + 2); ++ } ++ } ++} ++ ++/* Checks if caller can call callee using tail calls. ++ * ++ * If the ABIs mismatch, we might need to insert move instructions to move return values from callee return registers to ++ * caller return registers after the call. In that case, tail-calls are impossible to do correctly. ++ */ ++static bool ++is_tail_call_compatible(nir_function *caller, nir_function *callee) ++{ ++ /* If the caller doesn't return at all, we don't need to care if return params are compatible */ ++ if (caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_NORETURN) ++ return true; ++ /* The same ABI can't mismatch */ ++ if ((caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == ++ (callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK)) ++ return true; ++ /* The recursive shader ABI and the traversal shader ABI are built so that return parameters occupy exactly ++ * the same registers, to allow tail calls from the traversal shader. 
*/ ++ if ((caller->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == ACO_NIR_CALL_ABI_TRAVERSAL && ++ (callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) == ACO_NIR_CALL_ABI_RT_RECURSIVE) ++ return true; ++ return false; ++} ++ ++static void ++gather_tail_call_instrs_block(nir_function *caller, const struct nir_block *block, struct set *tail_calls) ++{ ++ nir_foreach_instr_reverse (instr, block) { ++ switch (instr->type) { ++ case nir_instr_type_phi: ++ case nir_instr_type_undef: ++ case nir_instr_type_load_const: ++ continue; ++ case nir_instr_type_alu: ++ if (!nir_op_is_vec_or_mov(nir_instr_as_alu(instr)->op)) ++ return; ++ continue; ++ case nir_instr_type_call: { ++ nir_call_instr *call = nir_instr_as_call(instr); ++ ++ if (!is_tail_call_compatible(caller, call->callee)) ++ return; ++ ++ for (unsigned i = 0; i < call->num_params; ++i) { ++ if (call->callee->params[i].is_return != caller->params[i].is_return) ++ return; ++ /* We can only do tail calls if the caller returns exactly the callee return values */ ++ if (caller->params[i].is_return) { ++ assert(call->params[i].ssa->parent_instr->type == nir_instr_type_deref); ++ nir_deref_instr *deref_root = nir_instr_as_deref(call->params[i].ssa->parent_instr); ++ while (nir_deref_instr_parent(deref_root)) ++ deref_root = nir_deref_instr_parent(deref_root); ++ ++ if (!deref_root->parent.ssa) ++ return; ++ if (deref_root->parent.ssa->parent_instr->type != nir_instr_type_intrinsic) ++ return; ++ nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(deref_root->parent.ssa->parent_instr); ++ if (intrin->intrinsic != nir_intrinsic_load_param) ++ return; ++ if (nir_intrinsic_param_idx(intrin) != i) ++ return; ++ } ++ if (call->callee->params[i].is_uniform != caller->params[i].is_uniform) ++ return; ++ if (call->callee->params[i].bit_size != caller->params[i].bit_size) ++ return; ++ if (call->callee->params[i].num_components != caller->params[i].num_components) ++ return; ++ } ++ ++ _mesa_set_add(tail_calls, instr); ++ continue; ++ } ++ default: ++ return; ++ } ++ } ++ ++ set_foreach (block->predecessors, pred) { ++ gather_tail_call_instrs_block(caller, pred->key, tail_calls); ++ } ++} ++ ++struct lower_param_info { ++ /* */ ++ nir_def *load_param_def; ++ ++ nir_def *return_deref; ++ bool has_store; ++}; ++ ++static void ++check_param_uses_for_stores(nir_deref_instr *instr, struct lower_param_info *info) ++{ ++ nir_foreach_use (deref_use, &instr->def) { ++ nir_instr *use_instr = nir_src_parent_instr(deref_use); ++ if (use_instr->type == nir_instr_type_deref) ++ check_param_uses_for_stores(nir_instr_as_deref(use_instr), info); ++ else if ((use_instr->type == nir_instr_type_intrinsic && ++ nir_instr_as_intrinsic(use_instr)->intrinsic == nir_intrinsic_store_deref) || ++ use_instr->type == nir_instr_type_call) ++ info->has_store = true; ++ } ++} ++ ++static void ++rewrite_return_param_uses(nir_intrinsic_instr *intr, unsigned param_idx, struct lower_param_info *param_defs) ++{ ++ nir_foreach_use_safe (use, &intr->def) { ++ nir_instr *use_instr = nir_src_parent_instr(use); ++ assert(use_instr && use_instr->type == nir_instr_type_deref && ++ nir_instr_as_deref(use_instr)->deref_type == nir_deref_type_cast); ++ check_param_uses_for_stores(nir_instr_as_deref(use_instr), ¶m_defs[param_idx]); ++ nir_def_rewrite_uses(&nir_instr_as_deref(use_instr)->def, param_defs[param_idx].return_deref); ++ ++ nir_instr_remove(use_instr); ++ } ++} ++ ++static void ++lower_call_abi_for_callee(nir_function *function, unsigned wave_size, struct set 
*visited_funcs) ++{ ++ nir_function_impl *impl = function->impl; ++ ++ nir_builder b = nir_builder_create(impl); ++ b.cursor = nir_before_impl(impl); ++ ++ nir_variable *tail_call_pc = ++ nir_variable_create(b.shader, nir_var_shader_temp, glsl_uint64_t_type(), "_tail_call_pc"); ++ nir_store_var(&b, tail_call_pc, nir_imm_int64(&b, 0), 0x1); ++ ++ struct set *tail_call_instrs = _mesa_set_create(b.shader, _mesa_hash_pointer, _mesa_key_pointer_equal); ++ gather_tail_call_instrs_block(function, nir_impl_last_block(impl), tail_call_instrs); ++ ++ radv_nir_lower_callee_signature(function, visited_funcs); ++ ++ /* guard the shader, so that only the correct invocations execute it */ ++ ++ nir_def *guard_condition = NULL; ++ nir_def *shader_addr; ++ nir_def *uniform_shader_addr; ++ if (function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_DIVERGENT_CALL) { ++ nir_cf_list list; ++ nir_cf_extract(&list, nir_before_impl(impl), nir_after_impl(impl)); ++ ++ b.cursor = nir_before_impl(impl); ++ ++ shader_addr = nir_load_param(&b, 0); ++ uniform_shader_addr = nir_load_param(&b, 1); ++ ++ guard_condition = nir_ieq(&b, uniform_shader_addr, shader_addr); ++ nir_if *shader_guard = nir_push_if(&b, guard_condition); ++ shader_guard->control = nir_selection_control_divergent_always_taken; ++ nir_cf_reinsert(&list, b.cursor); ++ nir_pop_if(&b, shader_guard); ++ } else { ++ shader_addr = nir_load_param(&b, 0); ++ } ++ ++ b.cursor = nir_before_impl(impl); ++ struct lower_param_info *param_infos = ralloc_size(b.shader, function->num_params * sizeof(struct lower_param_info)); ++ nir_variable **param_vars = ralloc_size(b.shader, function->num_params * sizeof(nir_variable *)); ++ ++ for (unsigned i = 2; i < function->num_params; ++i) { ++ param_vars[i] = nir_local_variable_create(impl, function->params[i].type, "_param"); ++ unsigned num_components = glsl_get_vector_elements(function->params[i].type); ++ ++ if (function->params[i].is_return) { ++ assert(!glsl_type_is_array(function->params[i].type) && !glsl_type_is_struct(function->params[i].type)); ++ ++ function->params[i].bit_size = glsl_get_bit_size(function->params[i].type); ++ function->params[i].num_components = num_components; ++ ++ param_infos[i].return_deref = &nir_build_deref_var(&b, param_vars[i])->def; ++ } else { ++ param_infos[i].return_deref = NULL; ++ } ++ ++ param_infos[i].has_store = false; ++ param_infos[i].load_param_def = nir_load_param(&b, i); ++ nir_store_var(&b, param_vars[i], param_infos[i].load_param_def, (0x1 << num_components) - 1); ++ } ++ ++ unsigned max_tail_call_param = 0; ++ ++ nir_foreach_block (block, impl) { ++ bool progress; ++ do { ++ progress = false; ++ nir_foreach_instr_safe (instr, block) { ++ if (instr->type == nir_instr_type_call && _mesa_set_search(tail_call_instrs, instr)) { ++ nir_call_instr *call = nir_instr_as_call(instr); ++ b.cursor = nir_before_instr(instr); ++ ++ for (unsigned i = 0; i < call->num_params; ++i) { ++ if (call->callee->params[i].is_return) ++ nir_store_var(&b, param_vars[i + 2], ++ nir_load_deref(&b, nir_instr_as_deref(call->params[i].ssa->parent_instr)), ++ (0x1 << glsl_get_vector_elements(call->callee->params[i].type)) - 1); ++ else ++ nir_store_var(&b, param_vars[i + 2], call->params[i].ssa, ++ (0x1 << call->params[i].ssa->num_components) - 1); ++ param_infos[i + 2].has_store = true; ++ } ++ ++ nir_store_var(&b, tail_call_pc, call->indirect_callee.ssa, 0x1); ++ max_tail_call_param = MAX2(max_tail_call_param, call->num_params + 2); ++ ++ nir_instr_remove(instr); ++ ++ progress = true; ++ break; ++ } 
++ ++ if (instr->type != nir_instr_type_intrinsic) ++ continue; ++ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); ++ if (nir_instr_as_intrinsic(instr)->intrinsic == nir_intrinsic_load_param) { ++ unsigned param_idx = nir_intrinsic_param_idx(intr); ++ ++ if (param_idx >= 2 && &intr->def != param_infos[param_idx].load_param_def) { ++ if (function->params[param_idx].is_return) ++ rewrite_return_param_uses(intr, param_idx, param_infos); ++ else ++ nir_def_rewrite_uses(&intr->def, param_infos[param_idx].load_param_def); ++ nir_instr_remove(instr); ++ progress = true; ++ break; ++ } ++ } ++ } ++ } while (progress); ++ } ++ ++ b.cursor = nir_after_impl(impl); ++ ++ for (unsigned i = 2; i < function->num_params; ++i) { ++ if (param_infos[i].has_store) ++ nir_store_param_amd(&b, nir_load_var(&b, param_vars[i]), .param_idx = i); ++ } ++ ++ if (guard_condition) ++ shader_addr = nir_bcsel(&b, guard_condition, nir_load_var(&b, tail_call_pc), shader_addr); ++ else ++ shader_addr = nir_load_var(&b, tail_call_pc); ++ nir_def *ballot = nir_ballot(&b, 1, wave_size, nir_ine_imm(&b, shader_addr, 0)); ++ nir_def *ballot_addr = nir_read_invocation(&b, shader_addr, nir_find_lsb(&b, ballot)); ++ uniform_shader_addr = nir_bcsel(&b, nir_ieq_imm(&b, ballot, 0), nir_load_call_return_address_amd(&b), ballot_addr); ++ ++ if (!(function->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_NORETURN)) { ++ nir_push_if(&b, nir_ieq_imm(&b, uniform_shader_addr, 0)); ++ nir_terminate(&b); ++ nir_pop_if(&b, NULL); ++ ++ nir_set_next_call_pc_amd(&b, shader_addr, uniform_shader_addr); ++ } ++} ++ ++static void ++lower_call_abi_for_call(nir_builder *b, nir_call_instr *call, unsigned *cur_call_idx, struct set *visited_funcs, ++ struct set *visited_calls) ++{ ++ unsigned call_idx = (*cur_call_idx)++; ++ ++ for (unsigned i = 0; i < call->num_params; ++i) { ++ unsigned callee_param_idx = i; ++ if (_mesa_set_search(visited_funcs, call->callee)) ++ callee_param_idx += 2; ++ ++ if (!call->callee->params[callee_param_idx].is_return) ++ continue; ++ ++ b->cursor = nir_before_instr(&call->instr); ++ ++ nir_src *old_src = &call->params[i]; ++ ++ assert(old_src->ssa->parent_instr->type == nir_instr_type_deref); ++ nir_deref_instr *param_deref = nir_instr_as_deref(old_src->ssa->parent_instr); ++ assert(param_deref->deref_type == nir_deref_type_var); ++ ++ nir_src_rewrite(old_src, nir_load_deref(b, param_deref)); ++ ++ b->cursor = nir_after_instr(&call->instr); ++ ++ unsigned num_components = glsl_get_vector_elements(param_deref->type); ++ ++ nir_store_deref( ++ b, param_deref, ++ nir_load_return_param_amd(b, num_components, glsl_base_type_get_bit_size(param_deref->type->base_type), ++ .call_idx = call_idx, .param_idx = i + 2), ++ (1u << num_components) - 1); ++ ++ assert(call->callee->params[callee_param_idx].bit_size == glsl_get_bit_size(param_deref->type)); ++ assert(call->callee->params[callee_param_idx].num_components == num_components); ++ } ++ ++ radv_nir_lower_callee_signature(call->callee, visited_funcs); ++ ++ b->cursor = nir_after_instr(&call->instr); ++ ++ nir_call_instr *new_call = nir_call_instr_create(b->shader, call->callee); ++ new_call->indirect_callee = nir_src_for_ssa(call->indirect_callee.ssa); ++ new_call->params[0] = nir_src_for_ssa(call->indirect_callee.ssa); ++ new_call->params[1] = nir_src_for_ssa(nir_read_first_invocation(b, call->indirect_callee.ssa)); ++ for (unsigned i = 2; i < new_call->num_params; ++i) ++ new_call->params[i] = nir_src_for_ssa(call->params[i - 2].ssa); ++ ++ nir_builder_instr_insert(b, 
&new_call->instr); ++ b->cursor = nir_after_instr(&new_call->instr); ++ _mesa_set_add(visited_calls, new_call); ++ ++ nir_instr_remove(&call->instr); ++} ++ ++static bool ++lower_call_abi_for_caller(nir_function_impl *impl, struct set *visited_funcs) ++{ ++ bool progress = false; ++ unsigned cur_call_idx = 0; ++ struct set *visited_calls = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); ++ ++ nir_foreach_block (block, impl) { ++ nir_foreach_instr_safe (instr, block) { ++ if (instr->type != nir_instr_type_call) ++ continue; ++ nir_call_instr *call = nir_instr_as_call(instr); ++ if (call->callee->impl) ++ continue; ++ if (_mesa_set_search(visited_calls, call)) ++ continue; ++ ++ nir_builder b = nir_builder_create(impl); ++ lower_call_abi_for_call(&b, call, &cur_call_idx, visited_funcs, visited_calls); ++ progress = true; ++ } ++ } ++ ++ _mesa_set_destroy(visited_calls, NULL); ++ ++ return progress; ++} ++ ++bool ++radv_nir_lower_call_abi(nir_shader *shader, unsigned wave_size) ++{ ++ struct set *visited_funcs = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); ++ ++ bool progress = false; ++ nir_foreach_function_with_impl (function, impl, shader) { ++ bool func_progress = false; ++ if (function->is_exported) { ++ lower_call_abi_for_callee(function, wave_size, visited_funcs); ++ func_progress = true; ++ } ++ func_progress |= lower_call_abi_for_caller(impl, visited_funcs); ++ ++ if (func_progress) ++ nir_metadata_preserve(impl, nir_metadata_block_index | nir_metadata_dominance); ++ progress |= func_progress; ++ } ++ ++ _mesa_set_destroy(visited_funcs, NULL); ++ ++ return progress; ++} +diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c +index daaf4e9ba4f00..fc7195b5067ca 100644 +--- a/src/amd/vulkan/radv_pipeline.c ++++ b/src/amd/vulkan/radv_pipeline.c +@@ -575,6 +575,10 @@ radv_postprocess_nir(struct radv_device *device, const struct radv_graphics_stat + stage->nir, io_to_mem || lowered_ngg || stage->stage == MESA_SHADER_COMPUTE || stage->stage == MESA_SHADER_TASK, + gfx_level >= GFX8); + ++ NIR_PASS(_, stage->nir, radv_nir_lower_call_abi, stage->info.wave_size); ++ NIR_PASS(_, stage->nir, nir_lower_global_vars_to_local); ++ NIR_PASS(_, stage->nir, nir_lower_vars_to_ssa); ++ + NIR_PASS(_, stage->nir, nir_lower_fp16_casts, nir_lower_fp16_split_fp64); + + if (stage->nir->info.bit_sizes_int & (8 | 16)) { +diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c +index 2fc4eda71aeb0..1f780f0621cac 100644 +--- a/src/compiler/nir/nir_divergence_analysis.c ++++ b/src/compiler/nir/nir_divergence_analysis.c +@@ -892,6 +892,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) + case nir_intrinsic_load_sample_mask: + case nir_intrinsic_quad_ballot_agx: + case nir_intrinsic_load_agx: ++ case nir_intrinsic_load_return_param_amd: + is_divergent = true; + break; + +diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py +index 808ee31420ba0..32ab9b8a6acb8 100644 +--- a/src/compiler/nir/nir_intrinsics.py ++++ b/src/compiler/nir/nir_intrinsics.py +@@ -2375,5 +2375,8 @@ intrinsic("enqueue_node_payloads", src_comp=[-1]) + # Returns true if it has been called for every payload. 
+ intrinsic("finalize_incoming_node_payload", src_comp=[-1], dest_comp=1) + ++intrinsic("store_param_amd", src_comp=[-1], indices=[PARAM_IDX]) ++intrinsic("load_return_param_amd", dest_comp=0, indices=[CALL_IDX, PARAM_IDX]) ++ + system_value("call_return_address_amd", 1, bit_sizes=[64]) + intrinsic("set_next_call_pc_amd", src_comp=[1, 1], bit_sizes=[64]) +-- +GitLab + + +From fbe63f63878376a556e9eab7999edab5f332f257 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sun, 7 Jan 2024 22:42:03 +0100 +Subject: [PATCH 42/71] aco: Compile all functions in RT shaders + +--- + .../compiler/aco_instruction_selection.cpp | 43 +- + .../aco_instruction_selection_setup.cpp | 639 +++++++++--------- + 2 files changed, 345 insertions(+), 337 deletions(-) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index d0d0dc1b036df..95baf3a302d0c 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -11891,30 +11891,35 @@ void + select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* const* shaders, + const struct ac_shader_args* args) + { ++ bool first_block = true; + for (unsigned i = 0; i < shader_count; i++) { +- if (i) { +- ctx.block = ctx.program->create_and_insert_block(); +- ctx.block->kind = block_kind_top_level | block_kind_resume; +- } ++ nir_foreach_function_impl (impl, shaders[i]) { ++ if (!first_block) { ++ ctx.block = ctx.program->create_and_insert_block(); ++ ctx.block->kind = block_kind_top_level | block_kind_resume; ++ } ++ nir_shader* nir = shaders[i]; + +- nir_shader* nir = shaders[i]; +- init_context(&ctx, nir); +- setup_fp_mode(&ctx, nir); ++ init_context(&ctx, nir); ++ setup_fp_mode(&ctx, nir); + +- Instruction* startpgm = add_startpgm(&ctx); +- append_logical_start(ctx.block); +- split_arguments(&ctx, startpgm); +- visit_cf_list(&ctx, &nir_shader_get_entrypoint(nir)->body); +- append_logical_end(ctx.block); +- ctx.block->kind |= block_kind_uniform; ++ Instruction* startpgm = add_startpgm(&ctx); ++ append_logical_start(ctx.block); ++ split_arguments(&ctx, startpgm); ++ visit_cf_list(&ctx, &impl->body); ++ append_logical_end(ctx.block); ++ ctx.block->kind |= block_kind_uniform; + +- /* Fix output registers and jump to next shader. We can skip this when dealing with a raygen +- * shader without shader calls. +- */ +- if (shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN) +- insert_rt_jump_next(ctx, args); ++ /* Fix output registers and jump to next shader. We can skip this when dealing with a ++ * raygen shader without shader calls. 
++ */ ++ if ((shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN) && ++ impl == nir_shader_get_entrypoint(nir)) ++ insert_rt_jump_next(ctx, args); + +- cleanup_context(&ctx); ++ cleanup_context(&ctx); ++ first_block = false; ++ } + } + + ctx.program->config->float_mode = ctx.program->blocks[0].fp_mode.val; +diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp +index f1cd92aad5fd2..600c63c8b9ce3 100644 +--- a/src/amd/compiler/aco_instruction_selection_setup.cpp ++++ b/src/amd/compiler/aco_instruction_selection_setup.cpp +@@ -257,8 +257,8 @@ setup_nir(isel_context* ctx, nir_shader* nir) + nir_opt_dce(nir); + } + +- nir_function_impl* func = nir_shader_get_entrypoint(nir); +- nir_index_ssa_defs(func); ++ nir_foreach_function_impl (impl, nir) ++ nir_index_ssa_defs(impl); + } + + /* Returns true if we can skip uniformization of a merge phi. This makes the destination divergent, +@@ -349,7 +349,6 @@ skip_uniformize_merge_phi(nir_def* ssa, unsigned depth) + void + init_context(isel_context* ctx, nir_shader* shader) + { +- nir_function_impl* impl = nir_shader_get_entrypoint(shader); + ctx->shader = shader; + + /* Init NIR range analysis. */ +@@ -366,356 +365,359 @@ init_context(isel_context* ctx, nir_shader* shader) + + ac_nir_opt_shared_append(shader); + +- uint32_t options = +- shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs; +- nir_divergence_analysis_impl(impl, (nir_divergence_options)options); +- shader->info.divergence_analysis_run = true; +- if (nir_opt_uniform_atomics(shader, false)) { +- nir_lower_int64(shader); ++ nir_foreach_function_impl (impl, shader) { ++ uint32_t options = ++ shader->options->divergence_analysis_options | nir_divergence_ignore_undef_if_phi_srcs; + nir_divergence_analysis_impl(impl, (nir_divergence_options)options); +- } ++ shader->info.divergence_analysis_run = true; ++ if (nir_opt_uniform_atomics(shader, false)) { ++ nir_lower_int64(shader); ++ nir_divergence_analysis_impl(impl, (nir_divergence_options)options); ++ } + +- apply_nuw_to_offsets(ctx, impl); ++ apply_nuw_to_offsets(ctx, impl); + +- /* sanitize control flow */ +- sanitize_cf_list(impl, &impl->body); +- nir_metadata_preserve(impl, nir_metadata_none); ++ /* sanitize control flow */ ++ sanitize_cf_list(impl, &impl->body); ++ nir_metadata_preserve(impl, nir_metadata_none); + +- /* we'll need these for isel */ +- nir_metadata_require(impl, nir_metadata_block_index | nir_metadata_dominance); ++ /* we'll need these for isel */ ++ nir_metadata_require(impl, nir_metadata_block_index | nir_metadata_dominance); + +- if (ctx->options->dump_preoptir) { +- fprintf(stderr, "NIR shader before instruction selection:\n"); +- nir_print_shader(shader, stderr); +- } ++ if (ctx->options->dump_preoptir) { ++ fprintf(stderr, "NIR shader before instruction selection:\n"); ++ nir_print_shader(shader, stderr); ++ } + +- ctx->first_temp_id = ctx->program->peekAllocationId(); +- ctx->program->allocateRange(impl->ssa_alloc); +- RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; +- +- unsigned call_count = 0; +- +- /* TODO: make this recursive to improve compile times */ +- bool done = false; +- while (!done) { +- done = true; +- nir_foreach_block (block, impl) { +- nir_foreach_instr (instr, block) { +- switch (instr->type) { +- case nir_instr_type_alu: { +- nir_alu_instr* alu_instr = nir_instr_as_alu(instr); +- RegType type = alu_instr->def.divergent ? 
RegType::vgpr : RegType::sgpr; +- +- /* packed 16bit instructions have to be VGPR */ +- if (alu_instr->def.num_components == 2 && +- nir_op_infos[alu_instr->op].output_size == 0) ++ ctx->first_temp_id = ctx->program->peekAllocationId(); ++ ctx->program->allocateRange(impl->ssa_alloc); ++ RegClass* regclasses = ctx->program->temp_rc.data() + ctx->first_temp_id; ++ ++ unsigned call_count = 0; ++ ++ /* TODO: make this recursive to improve compile times */ ++ bool done = false; ++ while (!done) { ++ done = true; ++ nir_foreach_block (block, impl) { ++ nir_foreach_instr (instr, block) { ++ switch (instr->type) { ++ case nir_instr_type_alu: { ++ nir_alu_instr* alu_instr = nir_instr_as_alu(instr); ++ RegType type = alu_instr->def.divergent ? RegType::vgpr : RegType::sgpr; ++ ++ /* packed 16bit instructions have to be VGPR */ ++ if (alu_instr->def.num_components == 2 && ++ nir_op_infos[alu_instr->op].output_size == 0) + type = RegType::vgpr; + +- switch (alu_instr->op) { +- case nir_op_f2i16: +- case nir_op_f2u16: +- case nir_op_f2i32: +- case nir_op_f2u32: +- case nir_op_b2i8: +- case nir_op_b2i16: +- case nir_op_b2i32: +- case nir_op_b2b32: +- case nir_op_b2f16: +- case nir_op_b2f32: +- case nir_op_mov: break; +- case nir_op_fmulz: +- case nir_op_ffmaz: +- case nir_op_f2f64: +- case nir_op_u2f64: +- case nir_op_i2f64: +- case nir_op_pack_unorm_2x16: +- case nir_op_pack_snorm_2x16: +- case nir_op_pack_uint_2x16: +- case nir_op_pack_sint_2x16: +- case nir_op_ldexp: +- case nir_op_frexp_sig: +- case nir_op_frexp_exp: +- case nir_op_cube_amd: +- case nir_op_msad_4x8: +- case nir_op_mqsad_4x8: +- case nir_op_udot_4x8_uadd: +- case nir_op_sdot_4x8_iadd: +- case nir_op_sudot_4x8_iadd: +- case nir_op_udot_4x8_uadd_sat: +- case nir_op_sdot_4x8_iadd_sat: +- case nir_op_sudot_4x8_iadd_sat: +- case nir_op_udot_2x16_uadd: +- case nir_op_sdot_2x16_iadd: +- case nir_op_udot_2x16_uadd_sat: +- case nir_op_sdot_2x16_iadd_sat: type = RegType::vgpr; break; +- case nir_op_fmul: +- case nir_op_ffma: +- case nir_op_fadd: +- case nir_op_fsub: +- case nir_op_fmax: +- case nir_op_fmin: +- case nir_op_fsat: +- case nir_op_fneg: +- case nir_op_fabs: +- case nir_op_fsign: +- case nir_op_i2f16: +- case nir_op_i2f32: +- case nir_op_u2f16: +- case nir_op_u2f32: +- case nir_op_f2f16: +- case nir_op_f2f16_rtz: +- case nir_op_f2f16_rtne: +- case nir_op_f2f32: +- case nir_op_fquantize2f16: +- case nir_op_ffract: +- case nir_op_ffloor: +- case nir_op_fceil: +- case nir_op_ftrunc: +- case nir_op_fround_even: +- case nir_op_frcp: +- case nir_op_frsq: +- case nir_op_fsqrt: +- case nir_op_fexp2: +- case nir_op_flog2: +- case nir_op_fsin_amd: +- case nir_op_fcos_amd: +- case nir_op_pack_half_2x16_rtz_split: +- case nir_op_pack_half_2x16_split: +- case nir_op_unpack_half_2x16_split_x: +- case nir_op_unpack_half_2x16_split_y: { +- if (ctx->program->gfx_level < GFX11_5 || ++ switch (alu_instr->op) { ++ case nir_op_f2i16: ++ case nir_op_f2u16: ++ case nir_op_f2i32: ++ case nir_op_f2u32: ++ case nir_op_b2i8: ++ case nir_op_b2i16: ++ case nir_op_b2i32: ++ case nir_op_b2b32: ++ case nir_op_b2f16: ++ case nir_op_b2f32: ++ case nir_op_mov: break; ++ case nir_op_fmulz: ++ case nir_op_ffmaz: ++ case nir_op_f2f64: ++ case nir_op_u2f64: ++ case nir_op_i2f64: ++ case nir_op_pack_unorm_2x16: ++ case nir_op_pack_snorm_2x16: ++ case nir_op_pack_uint_2x16: ++ case nir_op_pack_sint_2x16: ++ case nir_op_ldexp: ++ case nir_op_frexp_sig: ++ case nir_op_frexp_exp: ++ case nir_op_cube_amd: ++ case nir_op_msad_4x8: ++ case nir_op_mqsad_4x8: ++ case 
nir_op_udot_4x8_uadd: ++ case nir_op_sdot_4x8_iadd: ++ case nir_op_sudot_4x8_iadd: ++ case nir_op_udot_4x8_uadd_sat: ++ case nir_op_sdot_4x8_iadd_sat: ++ case nir_op_sudot_4x8_iadd_sat: ++ case nir_op_udot_2x16_uadd: ++ case nir_op_sdot_2x16_iadd: ++ case nir_op_udot_2x16_uadd_sat: ++ case nir_op_sdot_2x16_iadd_sat: type = RegType::vgpr; break; ++ case nir_op_fmul: ++ case nir_op_ffma: ++ case nir_op_fadd: ++ case nir_op_fsub: ++ case nir_op_fmax: ++ case nir_op_fmin: ++ case nir_op_fsat: ++ case nir_op_fneg: ++ case nir_op_fabs: ++ case nir_op_fsign: ++ case nir_op_i2f16: ++ case nir_op_i2f32: ++ case nir_op_u2f16: ++ case nir_op_u2f32: ++ case nir_op_f2f16: ++ case nir_op_f2f16_rtz: ++ case nir_op_f2f16_rtne: ++ case nir_op_f2f32: ++ case nir_op_fquantize2f16: ++ case nir_op_ffract: ++ case nir_op_ffloor: ++ case nir_op_fceil: ++ case nir_op_ftrunc: ++ case nir_op_fround_even: ++ case nir_op_frcp: ++ case nir_op_frsq: ++ case nir_op_fsqrt: ++ case nir_op_fexp2: ++ case nir_op_flog2: ++ case nir_op_fsin_amd: ++ case nir_op_fcos_amd: ++ case nir_op_pack_half_2x16_rtz_split: ++ case nir_op_pack_half_2x16_split: ++ case nir_op_unpack_half_2x16_split_x: ++ case nir_op_unpack_half_2x16_split_y: { ++ if (ctx->program->gfx_level < GFX11_5 || + alu_instr->src[0].src.ssa->bit_size > 32) { + type = RegType::vgpr; + break; + } +- FALLTHROUGH; +- } +- default: +- for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { +- if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) +- type = RegType::vgpr; ++ FALLTHROUGH;} ++ default: ++ for (unsigned i = 0; i < nir_op_infos[alu_instr->op].num_inputs; i++) { ++ if (regclasses[alu_instr->src[i].src.ssa->index].type() == RegType::vgpr) ++ type = RegType::vgpr; ++ } ++ break; + } +- break; +- } + +- RegClass rc = +- get_reg_class(ctx, type, alu_instr->def.num_components, alu_instr->def.bit_size); +- regclasses[alu_instr->def.index] = rc; +- break; +- } +- case nir_instr_type_load_const: { +- unsigned num_components = nir_instr_as_load_const(instr)->def.num_components; +- unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size; +- RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size); +- regclasses[nir_instr_as_load_const(instr)->def.index] = rc; +- break; +- } +- case nir_instr_type_intrinsic: { +- nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr); +- if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest) ++ RegClass rc = get_reg_class(ctx, type, alu_instr->def.num_components, ++ alu_instr->def.bit_size); ++ regclasses[alu_instr->def.index] = rc; + break; +- if (intrinsic->intrinsic == nir_intrinsic_strict_wqm_coord_amd) { +- regclasses[intrinsic->def.index] = +- RegClass::get(RegType::vgpr, intrinsic->def.num_components * 4 + +- nir_intrinsic_base(intrinsic)) +- .as_linear(); ++ } ++ case nir_instr_type_load_const: { ++ unsigned num_components = nir_instr_as_load_const(instr)->def.num_components; ++ unsigned bit_size = nir_instr_as_load_const(instr)->def.bit_size; ++ RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size); ++ regclasses[nir_instr_as_load_const(instr)->def.index] = rc; + break; + } +- RegType type = RegType::sgpr; +- switch (intrinsic->intrinsic) { +- case nir_intrinsic_load_push_constant: +- case nir_intrinsic_load_workgroup_id: +- case nir_intrinsic_load_num_workgroups: +- case nir_intrinsic_load_sbt_base_amd: +- case nir_intrinsic_load_subgroup_id: +- case nir_intrinsic_load_num_subgroups: +- case nir_intrinsic_load_first_vertex: +- case 
nir_intrinsic_load_base_instance: +- case nir_intrinsic_vote_all: +- case nir_intrinsic_vote_any: +- case nir_intrinsic_read_first_invocation: +- case nir_intrinsic_as_uniform: +- case nir_intrinsic_read_invocation: +- case nir_intrinsic_first_invocation: +- case nir_intrinsic_ballot: +- case nir_intrinsic_ballot_relaxed: +- case nir_intrinsic_bindless_image_samples: +- case nir_intrinsic_load_scalar_arg_amd: +- case nir_intrinsic_load_lds_ngg_scratch_base_amd: +- case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: +- case nir_intrinsic_load_smem_amd: +- case nir_intrinsic_unit_test_uniform_amd: type = RegType::sgpr; break; +- case nir_intrinsic_load_sample_id: +- case nir_intrinsic_load_input: +- case nir_intrinsic_load_per_primitive_input: +- case nir_intrinsic_load_output: +- case nir_intrinsic_load_input_vertex: +- case nir_intrinsic_load_per_vertex_input: +- case nir_intrinsic_load_per_vertex_output: +- case nir_intrinsic_load_vertex_id_zero_base: +- case nir_intrinsic_load_barycentric_sample: +- case nir_intrinsic_load_barycentric_pixel: +- case nir_intrinsic_load_barycentric_model: +- case nir_intrinsic_load_barycentric_centroid: +- case nir_intrinsic_load_barycentric_at_offset: +- case nir_intrinsic_load_interpolated_input: +- case nir_intrinsic_load_frag_coord: +- case nir_intrinsic_load_frag_shading_rate: +- case nir_intrinsic_load_sample_pos: +- case nir_intrinsic_load_local_invocation_id: +- case nir_intrinsic_load_local_invocation_index: +- case nir_intrinsic_load_subgroup_invocation: +- case nir_intrinsic_load_tess_coord: +- case nir_intrinsic_write_invocation_amd: +- case nir_intrinsic_mbcnt_amd: +- case nir_intrinsic_lane_permute_16_amd: +- case nir_intrinsic_dpp16_shift_amd: +- case nir_intrinsic_load_instance_id: +- case nir_intrinsic_ssbo_atomic: +- case nir_intrinsic_ssbo_atomic_swap: +- case nir_intrinsic_global_atomic_amd: +- case nir_intrinsic_global_atomic_swap_amd: +- case nir_intrinsic_bindless_image_atomic: +- case nir_intrinsic_bindless_image_atomic_swap: +- case nir_intrinsic_bindless_image_size: +- case nir_intrinsic_shared_atomic: +- case nir_intrinsic_shared_atomic_swap: +- case nir_intrinsic_load_scratch: +- case nir_intrinsic_load_invocation_id: +- case nir_intrinsic_load_primitive_id: +- case nir_intrinsic_load_typed_buffer_amd: +- case nir_intrinsic_load_buffer_amd: +- case nir_intrinsic_load_initial_edgeflags_amd: +- case nir_intrinsic_gds_atomic_add_amd: +- case nir_intrinsic_bvh64_intersect_ray_amd: +- case nir_intrinsic_load_vector_arg_amd: +- case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: +- case nir_intrinsic_cmat_muladd_amd: +- case nir_intrinsic_unit_test_divergent_amd: type = RegType::vgpr; break; +- case nir_intrinsic_load_shared: +- case nir_intrinsic_load_shared2_amd: +- /* When the result of these loads is only used by cross-lane instructions, +- * it is beneficial to use a VGPR destination. This is because this allows +- * to put the s_waitcnt further down, which decreases latency. 
+- */ +- if (only_used_by_cross_lane_instrs(&intrinsic->def)) { +- type = RegType::vgpr; ++ case nir_instr_type_intrinsic: { ++ nir_intrinsic_instr* intrinsic = nir_instr_as_intrinsic(instr); ++ if (!nir_intrinsic_infos[intrinsic->intrinsic].has_dest) ++ break; ++ if (intrinsic->intrinsic == nir_intrinsic_strict_wqm_coord_amd) { ++ regclasses[intrinsic->def.index] = ++ RegClass::get(RegType::vgpr, intrinsic->def.num_components * 4 + ++ nir_intrinsic_base(intrinsic)) ++ .as_linear(); + break; + } +- FALLTHROUGH; +- case nir_intrinsic_shuffle: +- case nir_intrinsic_quad_broadcast: +- case nir_intrinsic_quad_swap_horizontal: +- case nir_intrinsic_quad_swap_vertical: +- case nir_intrinsic_quad_swap_diagonal: +- case nir_intrinsic_quad_swizzle_amd: +- case nir_intrinsic_masked_swizzle_amd: +- case nir_intrinsic_rotate: +- case nir_intrinsic_inclusive_scan: +- case nir_intrinsic_exclusive_scan: +- case nir_intrinsic_reduce: +- case nir_intrinsic_load_ubo: +- case nir_intrinsic_load_ssbo: +- case nir_intrinsic_load_global_amd: +- type = intrinsic->def.divergent ? RegType::vgpr : RegType::sgpr; +- break; +- case nir_intrinsic_ddx: +- case nir_intrinsic_ddy: +- case nir_intrinsic_ddx_fine: +- case nir_intrinsic_ddy_fine: +- case nir_intrinsic_ddx_coarse: +- case nir_intrinsic_ddy_coarse: ++ RegType type = RegType::sgpr; ++ switch (intrinsic->intrinsic) { ++ case nir_intrinsic_load_push_constant: ++ case nir_intrinsic_load_workgroup_id: ++ case nir_intrinsic_load_num_workgroups: ++ case nir_intrinsic_load_ray_launch_size: ++ case nir_intrinsic_load_sbt_base_amd: ++ case nir_intrinsic_load_subgroup_id: ++ case nir_intrinsic_load_num_subgroups: ++ case nir_intrinsic_load_first_vertex: ++ case nir_intrinsic_load_base_instance: ++ case nir_intrinsic_vote_all: ++ case nir_intrinsic_vote_any: ++ case nir_intrinsic_read_first_invocation: ++ case nir_intrinsic_as_uniform: ++ case nir_intrinsic_read_invocation: ++ case nir_intrinsic_first_invocation: ++ case nir_intrinsic_ballot: ++ case nir_intrinsic_ballot_relaxed: ++ case nir_intrinsic_bindless_image_samples: ++ case nir_intrinsic_load_scalar_arg_amd: ++ case nir_intrinsic_load_lds_ngg_scratch_base_amd: ++ case nir_intrinsic_load_lds_ngg_gs_out_vertex_base_amd: ++ case nir_intrinsic_load_smem_amd: ++ case nir_intrinsic_unit_test_uniform_amd: type = RegType::sgpr; break; ++ case nir_intrinsic_load_sample_id: ++ case nir_intrinsic_load_input: ++ case nir_intrinsic_load_per_primitive_input: ++ case nir_intrinsic_load_output: ++ case nir_intrinsic_load_input_vertex: ++ case nir_intrinsic_load_per_vertex_input: ++ case nir_intrinsic_load_per_vertex_output: ++ case nir_intrinsic_load_vertex_id_zero_base: ++ case nir_intrinsic_load_barycentric_sample: ++ case nir_intrinsic_load_barycentric_pixel: ++ case nir_intrinsic_load_barycentric_model: ++ case nir_intrinsic_load_barycentric_centroid: ++ case nir_intrinsic_load_barycentric_at_offset: ++ case nir_intrinsic_load_interpolated_input: ++ case nir_intrinsic_load_frag_coord: ++ case nir_intrinsic_load_frag_shading_rate: ++ case nir_intrinsic_load_sample_pos: ++ case nir_intrinsic_load_local_invocation_id: ++ case nir_intrinsic_load_local_invocation_index: ++ case nir_intrinsic_load_subgroup_invocation: ++ case nir_intrinsic_load_ray_launch_id: ++ case nir_intrinsic_load_tess_coord: ++ case nir_intrinsic_write_invocation_amd: ++ case nir_intrinsic_mbcnt_amd: ++ case nir_intrinsic_lane_permute_16_amd: ++ case nir_intrinsic_dpp16_shift_amd: ++ case nir_intrinsic_load_instance_id: ++ case nir_intrinsic_ssbo_atomic: ++ 
case nir_intrinsic_ssbo_atomic_swap: ++ case nir_intrinsic_global_atomic_amd: ++ case nir_intrinsic_global_atomic_swap_amd: ++ case nir_intrinsic_bindless_image_atomic: ++ case nir_intrinsic_bindless_image_atomic_swap: ++ case nir_intrinsic_bindless_image_size: ++ case nir_intrinsic_shared_atomic: ++ case nir_intrinsic_shared_atomic_swap: ++ case nir_intrinsic_load_scratch: ++ case nir_intrinsic_load_invocation_id: ++ case nir_intrinsic_load_primitive_id: ++ case nir_intrinsic_load_typed_buffer_amd: ++ case nir_intrinsic_load_buffer_amd: ++ case nir_intrinsic_load_initial_edgeflags_amd: ++ case nir_intrinsic_gds_atomic_add_amd: ++ case nir_intrinsic_bvh64_intersect_ray_amd: ++ case nir_intrinsic_load_vector_arg_amd: ++ case nir_intrinsic_ordered_xfb_counter_add_gfx11_amd: ++ case nir_intrinsic_cmat_muladd_amd: ++ case nir_intrinsic_unit_test_divergent_amd: type = RegType::vgpr; break; ++ case nir_intrinsic_load_shared: ++ case nir_intrinsic_load_shared2_amd: ++ /* When the result of these loads is only used by cross-lane instructions, ++ * it is beneficial to use a VGPR destination. This is because this allows ++ * to put the s_waitcnt further down, which decreases latency. ++ */ ++ if (only_used_by_cross_lane_instrs(&intrinsic->def)) { ++ type = RegType::vgpr; ++ break; ++ } ++ FALLTHROUGH; ++ case nir_intrinsic_shuffle: ++ case nir_intrinsic_quad_broadcast: ++ case nir_intrinsic_quad_swap_horizontal: ++ case nir_intrinsic_quad_swap_vertical: ++ case nir_intrinsic_quad_swap_diagonal: ++ case nir_intrinsic_quad_swizzle_amd: ++ case nir_intrinsic_masked_swizzle_amd: ++ case nir_intrinsic_rotate: ++ case nir_intrinsic_inclusive_scan: ++ case nir_intrinsic_exclusive_scan: ++ case nir_intrinsic_reduce: ++ case nir_intrinsic_load_ubo: ++ case nir_intrinsic_load_ssbo: ++ case nir_intrinsic_load_global_amd: ++ type = intrinsic->def.divergent ? RegType::vgpr : RegType::sgpr; ++ break; ++ case nir_intrinsic_ddx: ++ case nir_intrinsic_ddy: ++ case nir_intrinsic_ddx_fine: ++ case nir_intrinsic_ddy_fine: ++ case nir_intrinsic_ddx_coarse: ++ case nir_intrinsic_ddy_coarse: + type = RegType::vgpr; + break; +- case nir_intrinsic_load_view_index: +- type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr; +- break; +- default: +- for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; +- i++) { +- if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr) +- type = RegType::vgpr; ++ case nir_intrinsic_load_view_index: ++ type = ctx->stage == fragment_fs ? RegType::vgpr : RegType::sgpr; ++ break; ++ default: ++ for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; ++ i++) { ++ if (regclasses[intrinsic->src[i].ssa->index].type() == RegType::vgpr) ++ type = RegType::vgpr; ++ } ++ break; + } ++ RegClass rc = get_reg_class(ctx, type, intrinsic->def.num_components, ++ intrinsic->def.bit_size); ++ regclasses[intrinsic->def.index] = rc; + break; + } +- RegClass rc = +- get_reg_class(ctx, type, intrinsic->def.num_components, intrinsic->def.bit_size); +- regclasses[intrinsic->def.index] = rc; +- break; +- } +- case nir_instr_type_tex: { +- nir_tex_instr* tex = nir_instr_as_tex(instr); +- RegType type = tex->def.divergent ? RegType::vgpr : RegType::sgpr; ++ case nir_instr_type_tex: { ++ nir_tex_instr* tex = nir_instr_as_tex(instr); ++ RegType type = tex->def.divergent ? 
RegType::vgpr : RegType::sgpr; + +- if (tex->op == nir_texop_texture_samples) { +- assert(!tex->def.divergent); +- } +- +- RegClass rc = get_reg_class(ctx, type, tex->def.num_components, tex->def.bit_size); +- regclasses[tex->def.index] = rc; +- break; +- } +- case nir_instr_type_undef: { +- unsigned num_components = nir_instr_as_undef(instr)->def.num_components; +- unsigned bit_size = nir_instr_as_undef(instr)->def.bit_size; +- RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size); +- regclasses[nir_instr_as_undef(instr)->def.index] = rc; +- break; +- } +- case nir_instr_type_phi: { +- nir_phi_instr* phi = nir_instr_as_phi(instr); +- RegType type = RegType::sgpr; +- unsigned num_components = phi->def.num_components; +- assert((phi->def.bit_size != 1 || num_components == 1) && +- "Multiple components not supported on boolean phis."); +- +- if (phi->def.divergent) { +- type = RegType::vgpr; +- } else { +- bool vgpr_src = false; +- nir_foreach_phi_src (src, phi) +- vgpr_src |= regclasses[src->src.ssa->index].type() == RegType::vgpr; ++ if (tex->op == nir_texop_texture_samples) { ++ assert(!tex->def.divergent); ++ } + +- if (vgpr_src) { ++ RegClass rc = ++ get_reg_class(ctx, type, tex->def.num_components, tex->def.bit_size); ++ regclasses[tex->def.index] = rc; ++ break; ++ } ++ case nir_instr_type_undef: { ++ unsigned num_components = nir_instr_as_undef(instr)->def.num_components; ++ unsigned bit_size = nir_instr_as_undef(instr)->def.bit_size; ++ RegClass rc = get_reg_class(ctx, RegType::sgpr, num_components, bit_size); ++ regclasses[nir_instr_as_undef(instr)->def.index] = rc; ++ break; ++ } ++ case nir_instr_type_phi: { ++ nir_phi_instr* phi = nir_instr_as_phi(instr); ++ RegType type = RegType::sgpr; ++ unsigned num_components = phi->def.num_components; ++ assert((phi->def.bit_size != 1 || num_components == 1) && ++ "Multiple components not supported on boolean phis."); ++ ++ if (phi->def.divergent) { + type = RegType::vgpr; ++ } else { ++ bool vgpr_src = false; ++ nir_foreach_phi_src (src, phi) ++ vgpr_src |= regclasses[src->src.ssa->index].type() == RegType::vgpr; + +- /* This might be the case because of nir_divergence_ignore_undef_if_phi_srcs. */ +- bool divergent_merge = false; +- if (nir_cf_node_prev(&block->cf_node) && +- nir_cf_node_prev(&block->cf_node)->type == nir_cf_node_if) { +- nir_if* nif = nir_cf_node_as_if(nir_cf_node_prev(&block->cf_node)); +- divergent_merge = nir_src_is_divergent(&nif->condition); +- } ++ if (vgpr_src) { ++ type = RegType::vgpr; + +- /* In case of uniform phis after divergent merges, ensure that the dst is an +- * SGPR and does not contain undefined values for some invocations. +- */ +- if (divergent_merge && !skip_uniformize_merge_phi(&phi->def, 0)) +- type = RegType::sgpr; ++ /* This might be the case because of nir_divergence_ignore_undef_if_phi_srcs. */ ++ bool divergent_merge = false; ++ if (nir_cf_node_prev(&block->cf_node) && ++ nir_cf_node_prev(&block->cf_node)->type == nir_cf_node_if) { ++ nir_if* nif = nir_cf_node_as_if(nir_cf_node_prev(&block->cf_node)); ++ divergent_merge = nir_src_is_divergent(&nif->condition); ++ } ++ ++ /* In case of uniform phis after divergent merges, ensure that the dst is an ++ * SGPR and does not contain undefined values for some invocations. 
++ */ ++ if (divergent_merge && !skip_uniformize_merge_phi(&phi->def, 0)) ++ type = RegType::sgpr; ++ } + } +- } + +- RegClass rc = get_reg_class(ctx, type, num_components, phi->def.bit_size); +- if (rc != regclasses[phi->def.index]) +- done = false; +- regclasses[phi->def.index] = rc; +- break; +- } +- case nir_instr_type_call: { +- ++call_count; +- break; +- } +- default: break; ++ RegClass rc = get_reg_class(ctx, type, num_components, phi->def.bit_size); ++ if (rc != regclasses[phi->def.index]) ++ done = false; ++ regclasses[phi->def.index] = rc; ++ break; ++ } ++ case nir_instr_type_call: { ++ ++call_count; ++ break; ++ } ++ default: break; ++ } + } + } + } +- } +- +- ctx->call_infos.reserve(call_count); + +- ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena; +- ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr; ++ ctx->call_infos.reserve(call_count); + ++ ctx->program->config->spi_ps_input_ena = ctx->program->info.ps.spi_ps_input_ena; ++ ctx->program->config->spi_ps_input_addr = ctx->program->info.ps.spi_ps_input_addr; ++ } + /* align and copy constant data */ + while (ctx->program->constant_data.size() % 4u) + ctx->program->constant_data.push_back(0); +@@ -794,7 +796,8 @@ setup_isel_context(Program* program, unsigned shader_count, struct nir_shader* c + + unsigned nir_num_blocks = 0; + for (unsigned i = 0; i < shader_count; i++) +- nir_num_blocks += nir_shader_get_entrypoint(shaders[i])->num_blocks; ++ nir_foreach_function_impl (impl, shaders[i]) ++ nir_num_blocks += impl->num_blocks; + ctx.program->blocks.reserve(nir_num_blocks * 2); + ctx.block = ctx.program->create_and_insert_block(); + ctx.block->kind = block_kind_top_level; +-- +GitLab + + +From 5bfdc4d5da9fd66e98e3d04f0320719331a5bfaa Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 23 Mar 2024 11:20:58 +0100 +Subject: [PATCH 43/71] aco: Add param temps in startpgm + +--- + src/amd/compiler/aco_assembler.cpp | 3 ++- + .../compiler/aco_instruction_selection.cpp | 23 ++++++++++++++++++- + src/amd/compiler/aco_ir.h | 1 + + 3 files changed, 25 insertions(+), 2 deletions(-) + +diff --git a/src/amd/compiler/aco_assembler.cpp b/src/amd/compiler/aco_assembler.cpp +index 9f50c3f59821b..9a774aec8621c 100644 +--- a/src/amd/compiler/aco_assembler.cpp ++++ b/src/amd/compiler/aco_assembler.cpp +@@ -1755,7 +1755,8 @@ emit_program(Program* program, std::vector& code, std::vectorconstant_data.data() + program->constant_data.size())); + + program->config->scratch_bytes_per_wave = +- align(program->config->scratch_bytes_per_wave, program->dev.scratch_alloc_granule); ++ align(program->config->scratch_bytes_per_wave + program->scratch_arg_size, ++ program->dev.scratch_alloc_granule); + + return exec_size; + } +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index 95baf3a302d0c..c44a7324d58e8 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -11557,8 +11557,12 @@ create_fs_end_for_epilog(isel_context* ctx) + } + + Instruction* +-add_startpgm(struct isel_context* ctx) ++add_startpgm(struct isel_context* ctx, bool is_callee = false) + { ++ ctx->program->arg_sgpr_count = ctx->args->num_sgprs_used; ++ ctx->program->arg_vgpr_count = ctx->args->num_vgprs_used; ++ ctx->program->scratch_arg_size += ctx->callee_info.scratch_param_size; ++ + unsigned def_count = 0; + for (unsigned i = 0; i < ctx->args->arg_count; i++) { + if (ctx->args->args[i].skip) +@@ 
-11569,6 +11573,9 @@ add_startpgm(struct isel_context* ctx) + else + def_count++; + } ++ unsigned used_arg_count = def_count; ++ def_count += ++ ctx->callee_info.reg_param_count + (is_callee ? 2 : 0); /* parameters + return address */ + + if (ctx->stage.hw == AC_HW_COMPUTE_SHADER && ctx->program->gfx_level >= GFX12) + def_count += 3; +@@ -11634,6 +11641,20 @@ add_startpgm(struct isel_context* ctx) + ctx->workgroup_id[i] = ids[i].used ? Operand(get_arg(ctx, ids[i])) : Operand::zero(); + } + ++ if (is_callee) { ++ unsigned def_idx = used_arg_count; ++ ++ ctx->program->stack_ptr = ctx->callee_info.stack_ptr.def.getTemp(); ++ startpgm->definitions[def_idx++] = ctx->callee_info.stack_ptr.def; ++ startpgm->definitions[def_idx++] = ctx->callee_info.return_address.def; ++ ++ for (auto& info : ctx->callee_info.param_infos) { ++ if (!info.is_reg) ++ continue; ++ startpgm->definitions[def_idx++] = info.def; ++ } ++ } ++ + /* epilog has no scratch */ + if (ctx->args->scratch_offset.used) { + if (ctx->program->gfx_level < GFX9) { +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 920174ac50798..7989d27dfe75b 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2388,6 +2388,7 @@ public: + ABI callee_abi = {}; + unsigned short arg_sgpr_count; + unsigned short arg_vgpr_count; ++ unsigned scratch_arg_size = 0; + + struct { + monotonic_buffer_resource memory; +-- +GitLab + + +From c2b0a99236c67af869bef06a2e3d2af329206ef7 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 6 Mar 2024 13:27:56 +0100 +Subject: [PATCH 44/71] aco: Implement call parameter intrinsics + +--- + .../compiler/aco_instruction_selection.cpp | 158 ++++++++++++++++++ + .../aco_instruction_selection_setup.cpp | 13 +- + 2 files changed, 170 insertions(+), 1 deletion(-) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index c44a7324d58e8..f3ec6fa04dd36 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -8341,6 +8341,107 @@ visit_cmat_muladd(isel_context* ctx, nir_intrinsic_instr* instr) + emit_split_vector(ctx, dst, instr->def.num_components); + } + ++void ++load_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr, ++ unsigned scratch_param_size, Temp dst) ++{ ++ int32_t const_offset = param.scratch_offset - scratch_param_size; ++ unsigned byte_size = dst.bytes(); ++ if (ctx->program->gfx_level < GFX9) { ++ Temp scratch_rsrc = load_scratch_resource(ctx->program, bld, true, false); ++ ++ Temp soffset = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), ++ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr), ++ Operand::c32(-const_offset * ctx->program->wave_size)); ++ ++ aco_opcode op; ++ switch (byte_size) { ++ case 4: op = aco_opcode::buffer_load_dword; break; ++ case 8: op = aco_opcode::buffer_load_dwordx2; break; ++ case 12: op = aco_opcode::buffer_load_dwordx3; break; ++ case 16: op = aco_opcode::buffer_load_dwordx4; break; ++ default: unreachable("Unexpected param size"); ++ } ++ ++ Instruction* instr = ++ bld.mubuf(op, Definition(dst), scratch_rsrc, Operand(v1), soffset, 0, false); ++ instr->mubuf().sync = memory_sync_info(storage_scratch); ++ instr->mubuf().cache.value = ac_swizzled; ++ return; ++ } ++ ++ if (const_offset < ctx->program->dev.scratch_global_offset_min) { ++ stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), ++ stack_ptr == Temp() ? 
Operand::c32(0) : Operand(stack_ptr), ++ Operand::c32(const_offset)); ++ const_offset = 0; ++ } ++ ++ aco_opcode op; ++ switch (byte_size) { ++ case 4: op = aco_opcode::scratch_load_dword; break; ++ case 8: op = aco_opcode::scratch_load_dwordx2; break; ++ case 12: op = aco_opcode::scratch_load_dwordx3; break; ++ case 16: op = aco_opcode::scratch_load_dwordx4; break; ++ default: unreachable("Unexpected param size"); ++ } ++ ++ bld.scratch(op, Definition(dst), Operand(v1), ++ stack_ptr == Temp() ? Operand(s1) : Operand(stack_ptr), (int16_t)const_offset, ++ memory_sync_info(storage_scratch)); ++} ++ ++void ++store_scratch_param(isel_context* ctx, Builder& bld, const parameter_info& param, Temp stack_ptr, ++ unsigned scratch_param_size, Temp data) ++{ ++ int32_t const_offset = param.scratch_offset - scratch_param_size; ++ unsigned byte_size = data.bytes(); ++ if (ctx->program->gfx_level < GFX9) { ++ Temp scratch_rsrc = load_scratch_resource(ctx->program, bld, true, false); ++ ++ Temp soffset = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), ++ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr), ++ Operand::c32(-const_offset * ctx->program->wave_size)); ++ ++ assert(-const_offset * ctx->program->wave_size < 0x1ff00); ++ ++ aco_opcode op; ++ switch (byte_size) { ++ case 4: op = aco_opcode::buffer_store_dword; break; ++ case 8: op = aco_opcode::buffer_store_dwordx2; break; ++ case 12: op = aco_opcode::buffer_store_dwordx3; break; ++ case 16: op = aco_opcode::buffer_store_dwordx4; break; ++ default: unreachable("Unexpected param size"); ++ } ++ ++ Instruction* instr = ++ bld.mubuf(op, scratch_rsrc, Operand(v1), Operand(soffset), as_vgpr(bld, data), 0, false); ++ instr->mubuf().sync = memory_sync_info(storage_scratch); ++ instr->mubuf().cache.value = ac_swizzled; ++ return; ++ } ++ ++ if (const_offset < ctx->program->dev.scratch_global_offset_min) { ++ stack_ptr = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), ++ stack_ptr == Temp() ? Operand::c32(0) : Operand(stack_ptr), ++ Operand::c32(const_offset)); ++ const_offset = 0; ++ } ++ ++ aco_opcode op; ++ switch (byte_size) { ++ case 4: op = aco_opcode::scratch_store_dword; break; ++ case 8: op = aco_opcode::scratch_store_dwordx2; break; ++ case 12: op = aco_opcode::scratch_store_dwordx3; break; ++ case 16: op = aco_opcode::scratch_store_dwordx4; break; ++ default: unreachable("Unexpected param size"); ++ } ++ ++ bld.scratch(op, Operand(v1), stack_ptr == Temp() ? 
Operand(s1) : Operand(stack_ptr), ++ as_vgpr(bld, data), (int16_t)const_offset, memory_sync_info(storage_scratch)); ++} ++ + void + visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) + { +@@ -9640,6 +9741,63 @@ visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) + bld.pseudo(aco_opcode::p_unit_test, Definition(get_ssa_temp(ctx, &instr->def)), + Operand::c32(nir_intrinsic_base(instr))); + break; ++ case nir_intrinsic_load_return_param_amd: { ++ call_info& info = ctx->call_infos[nir_intrinsic_call_idx(instr)]; ++ ++ assert(nir_intrinsic_param_idx(instr) < info.nir_instr->callee->num_params); ++ ++ unsigned index_in_return_params = 0u; ++ for (unsigned i = 0; i < info.nir_instr->callee->num_params; ++i) { ++ if (nir_intrinsic_param_idx(instr) == i) { ++ assert(info.nir_instr->callee->params[i].is_return); ++ break; ++ } ++ if (info.nir_instr->callee->params[i].is_return) { ++ ++index_in_return_params; ++ } ++ } ++ ++ if (info.return_info[index_in_return_params].is_reg) { ++ bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), ++ Operand(info.return_info[index_in_return_params].def.getTemp())); ++ } else { ++ Temp stack_ptr; ++ if (ctx->callee_info.stack_ptr.is_reg) ++ stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), ++ Operand::c32(info.scratch_param_size), ++ Operand(ctx->callee_info.stack_ptr.def.getTemp())); ++ else ++ stack_ptr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), ++ Operand::c32(info.scratch_param_size)); ++ load_scratch_param(ctx, bld, info.return_info[index_in_return_params], stack_ptr, ++ info.scratch_param_size, get_ssa_temp(ctx, &instr->def)); ++ } ++ break; ++ } ++ case nir_intrinsic_load_param: { ++ const auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)]; ++ if (param.is_reg) ++ bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), Operand(param.def.getTemp())); ++ else ++ load_scratch_param( ++ ctx, bld, param, ++ ctx->callee_info.stack_ptr.is_reg ? ctx->callee_info.stack_ptr.def.getTemp() : Temp(), ++ ctx->callee_info.scratch_param_size, get_ssa_temp(ctx, &instr->def)); ++ break; ++ } ++ case nir_intrinsic_store_param_amd: { ++ auto& param = ctx->callee_info.param_infos[nir_intrinsic_param_idx(instr)]; ++ if (param.is_reg) ++ param.def.setTemp(param.def.regClass().type() == RegType::vgpr ++ ? as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)) ++ : bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa))); ++ else ++ store_scratch_param( ++ ctx, bld, param, ++ ctx->callee_info.stack_ptr.is_reg ? ctx->callee_info.stack_ptr.def.getTemp() : Temp(), ++ ctx->callee_info.scratch_param_size, get_ssa_temp(ctx, instr->src[0].ssa)); ++ break; ++ } + case nir_intrinsic_load_call_return_address_amd: { + bld.copy(Definition(get_ssa_temp(ctx, &instr->def)), + Operand(ctx->callee_info.return_address.def.getTemp())); +diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp +index 600c63c8b9ce3..1bdbe28ec17bd 100644 +--- a/src/amd/compiler/aco_instruction_selection_setup.cpp ++++ b/src/amd/compiler/aco_instruction_selection_setup.cpp +@@ -5,12 +5,13 @@ + */ + + #include "aco_instruction_selection.h" ++#include "aco_nir_call_attribs.h" + + #include "common/ac_nir.h" + #include "common/sid.h" + +-#include "nir_control_flow.h" + #include "nir_builder.h" ++#include "nir_control_flow.h" + + #include + +@@ -631,6 +632,16 @@ init_context(isel_context* ctx, nir_shader* shader) + case nir_intrinsic_load_view_index: + type = ctx->stage == fragment_fs ? 
RegType::vgpr : RegType::sgpr; + break; ++ case nir_intrinsic_load_return_param_amd: { ++ type = RegType::vgpr; ++ break; ++ } ++ case nir_intrinsic_load_param: { ++ nir_parameter* param = ++ &impl->function->params[nir_intrinsic_param_idx(intrinsic)]; ++ type = param->is_uniform ? RegType::sgpr : RegType::vgpr; ++ break; ++ } + default: + for (unsigned i = 0; i < nir_intrinsic_infos[intrinsic->intrinsic].num_srcs; + i++) { +-- +GitLab + + +From 04c145740dcc48f05926edf8db90fc38b02bf2e5 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Thu, 6 Jun 2024 07:17:15 +0200 +Subject: [PATCH 45/71] aco: Add common utility to load scratch descriptor + +Also modifies the scratch descriptor to take the stack pointer into +account. +--- + .../compiler/aco_instruction_selection.cpp | 40 +-------- + src/amd/compiler/aco_scratch_rsrc.h | 82 +++++++++++++++++++ + src/amd/compiler/aco_spill.cpp | 54 +----------- + 3 files changed, 87 insertions(+), 89 deletions(-) + create mode 100644 src/amd/compiler/aco_scratch_rsrc.h + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index f3ec6fa04dd36..6ed8dd84c777f 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -11,6 +11,7 @@ + #include "aco_interface.h" + #include "aco_ir.h" + #include "aco_nir_call_attribs.h" ++#include "aco_scratch_rsrc.h" + + #include "common/ac_descriptors.h" + #include "common/ac_gpu_info.h" +@@ -7701,41 +7702,6 @@ visit_access_shared2_amd(isel_context* ctx, nir_intrinsic_instr* instr) + } + } + +-Temp +-get_scratch_resource(isel_context* ctx) +-{ +- Builder bld(ctx->program, ctx->block); +- Temp scratch_addr = ctx->program->private_segment_buffer; +- if (!scratch_addr.bytes()) { +- Temp addr_lo = +- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); +- Temp addr_hi = +- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi)); +- scratch_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); +- } else if (ctx->stage.hw != AC_HW_COMPUTE_SHADER) { +- scratch_addr = +- bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero()); +- } +- +- struct ac_buffer_state ac_state = {0}; +- uint32_t desc[4]; +- +- ac_state.size = 0xffffffff; +- ac_state.format = PIPE_FORMAT_R32_FLOAT; +- for (int i = 0; i < 4; i++) +- ac_state.swizzle[i] = PIPE_SWIZZLE_0; +- /* older generations need element size = 4 bytes. element size removed in GFX9 */ +- ac_state.element_size = ctx->program->gfx_level <= GFX8 ? 1u : 0u; +- ac_state.index_stride = ctx->program->wave_size == 64 ? 
3u : 2u; +- ac_state.add_tid = true; +- ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW; +- +- ac_build_buffer_descriptor(ctx->program->gfx_level, &ac_state, desc); +- +- return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(desc[2]), +- Operand::c32(desc[3])); +-} +- + void + visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) + { +@@ -7778,7 +7744,7 @@ visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) + params.max_const_offset_plus_one = ctx->program->dev.scratch_global_offset_max + 1; + emit_load(ctx, bld, info, params); + } else { +- info.resource = get_scratch_resource(ctx); ++ info.resource = load_scratch_resource(ctx->program, bld, false, true); + info.offset = Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa))); + info.soffset = ctx->program->scratch_offset; + emit_load(ctx, bld, info, scratch_mubuf_load_params); +@@ -7841,7 +7807,7 @@ visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) + memory_sync_info(storage_scratch, semantic_private)); + } + } else { +- Temp rsrc = get_scratch_resource(ctx); ++ Temp rsrc = load_scratch_resource(ctx->program, bld, false, true); + offset = as_vgpr(ctx, offset); + for (unsigned i = 0; i < write_count; i++) { + aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); +diff --git a/src/amd/compiler/aco_scratch_rsrc.h b/src/amd/compiler/aco_scratch_rsrc.h +new file mode 100644 +index 0000000000000..5b0af2bca46f0 +--- /dev/null ++++ b/src/amd/compiler/aco_scratch_rsrc.h +@@ -0,0 +1,82 @@ ++/* ++ * Copyright © 2024 Valve Corporation. ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "aco_builder.h" ++#include "aco_ir.h" ++ ++#include "ac_descriptors.h" ++#include "amdgfxregs.h" ++ ++#ifndef ACO_SCRATCH_RSRC_H ++#define ACO_SCRATCH_RSRC_H ++ ++namespace aco { ++ ++inline Temp ++load_scratch_resource(Program* program, Builder& bld, bool apply_scratch_offset, ++ bool apply_stack_ptr) ++{ ++ Temp private_segment_buffer = program->private_segment_buffer; ++ if (!private_segment_buffer.bytes()) { ++ Temp addr_lo = ++ bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); ++ Temp addr_hi = ++ bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi)); ++ private_segment_buffer = ++ bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); ++ } else if (program->stage.hw != AC_HW_COMPUTE_SHADER) { ++ private_segment_buffer = ++ bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero()); ++ } ++ ++ if ((apply_stack_ptr && program->stack_ptr != Temp()) || apply_scratch_offset) { ++ Temp addr_lo = bld.tmp(s1); ++ Temp addr_hi = bld.tmp(s1); ++ bld.pseudo(aco_opcode::p_split_vector, Definition(addr_lo), Definition(addr_hi), ++ private_segment_buffer); ++ ++ if (apply_stack_ptr && program->stack_ptr != Temp()) { ++ Temp carry = bld.tmp(s1); ++ addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo, ++ program->stack_ptr); ++ addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi, ++ Operand::c32(0), bld.scc(carry)); ++ } ++ ++ if (apply_scratch_offset) { ++ Temp carry = bld.tmp(s1); ++ addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo, ++ program->scratch_offset); ++ addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi, ++ Operand::c32(0), bld.scc(carry)); ++ } ++ ++ private_segment_buffer = ++ 
bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); ++ } ++ ++ struct ac_buffer_state ac_state = {0}; ++ uint32_t desc[4]; ++ ++ ac_state.size = 0xffffffff; ++ ac_state.format = PIPE_FORMAT_R32_FLOAT; ++ for (int i = 0; i < 4; i++) ++ ac_state.swizzle[i] = PIPE_SWIZZLE_0; ++ /* older generations need element size = 4 bytes. element size removed in GFX9 */ ++ ac_state.element_size = program->gfx_level <= GFX8 ? 1u : 0u; ++ ac_state.index_stride = program->wave_size == 64 ? 3u : 2u; ++ ac_state.add_tid = true; ++ ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW; ++ ++ ac_build_buffer_descriptor(program->gfx_level, &ac_state, desc); ++ ++ return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, ++ Operand::c32(desc[2]), Operand::c32(desc[3])); ++} ++ ++} // namespace aco ++ ++#endif // ACO_SCRATCH_RSRC_H +diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp +index 2e30bf9e2783e..c271cbcf01eb8 100644 +--- a/src/amd/compiler/aco_spill.cpp ++++ b/src/amd/compiler/aco_spill.cpp +@@ -7,6 +7,7 @@ + + #include "aco_builder.h" + #include "aco_ir.h" ++#include "aco_scratch_rsrc.h" + #include "aco_util.h" + + #include "common/ac_descriptors.h" +@@ -1134,57 +1135,6 @@ spill_block(spill_ctx& ctx, unsigned block_idx) + } + } + +-Temp +-load_scratch_resource(spill_ctx& ctx, Builder& bld, bool apply_scratch_offset) +-{ +- Temp private_segment_buffer = ctx.program->private_segment_buffer; +- if (!private_segment_buffer.bytes()) { +- Temp addr_lo = +- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_lo)); +- Temp addr_hi = +- bld.sop1(aco_opcode::p_load_symbol, bld.def(s1), Operand::c32(aco_symbol_scratch_addr_hi)); +- private_segment_buffer = +- bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); +- } else if (ctx.program->stage.hw != AC_HW_COMPUTE_SHADER) { +- private_segment_buffer = +- bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, Operand::zero()); +- } +- +- if (apply_scratch_offset) { +- Temp addr_lo = bld.tmp(s1); +- Temp addr_hi = bld.tmp(s1); +- bld.pseudo(aco_opcode::p_split_vector, Definition(addr_lo), Definition(addr_hi), +- private_segment_buffer); +- +- Temp carry = bld.tmp(s1); +- addr_lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), addr_lo, +- ctx.program->scratch_offset); +- addr_hi = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), addr_hi, +- Operand::c32(0), bld.scc(carry)); +- +- private_segment_buffer = +- bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), addr_lo, addr_hi); +- } +- +- struct ac_buffer_state ac_state = {0}; +- uint32_t desc[4]; +- +- ac_state.size = 0xffffffff; +- ac_state.format = PIPE_FORMAT_R32_FLOAT; +- for (int i = 0; i < 4; i++) +- ac_state.swizzle[i] = PIPE_SWIZZLE_0; +- /* older generations need element size = 4 bytes. element size removed in GFX9 */ +- ac_state.element_size = ctx.program->gfx_level <= GFX8 ? 1u : 0u; +- ac_state.index_stride = ctx.program->wave_size == 64 ? 
3u : 2u; +- ac_state.add_tid = true; +- ac_state.gfx10_oob_select = V_008F0C_OOB_SELECT_RAW; +- +- ac_build_buffer_descriptor(ctx.program->gfx_level, &ac_state, desc); +- +- return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, +- Operand::c32(desc[2]), Operand::c32(desc[3])); +-} +- + void + setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, + std::vector>& instructions, uint32_t spill_slot, +@@ -1249,7 +1199,7 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, + } + } else { + if (ctx.scratch_rsrc == Temp()) +- ctx.scratch_rsrc = load_scratch_resource(ctx, rsrc_bld, overflow); ++ ctx.scratch_rsrc = load_scratch_resource(ctx.program, rsrc_bld, overflow, true); + + if (overflow) { + uint32_t soffset = +-- +GitLab + + +From 912041711336f7e14a19439aeffd8a404990fd55 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 13 May 2024 06:14:32 +0200 +Subject: [PATCH 46/71] aco: Add Program::is_callee and set it for RT shaders + +--- + src/amd/compiler/aco_instruction_selection.cpp | 2 ++ + src/amd/compiler/aco_ir.h | 1 + + 2 files changed, 3 insertions(+) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index 6ed8dd84c777f..d3d15c9500d5e 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -12048,6 +12048,8 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c + init_context(&ctx, nir); + setup_fp_mode(&ctx, nir); + ++ ctx.program->is_callee = true; ++ + Instruction* startpgm = add_startpgm(&ctx); + append_logical_start(ctx.block); + split_arguments(&ctx, startpgm); +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 7989d27dfe75b..2bc7b91c81584 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2385,6 +2385,7 @@ public: + /* For shader part with previous shader part that has lds access. 
*/ + bool pending_lds_access = false; + ++ bool is_callee = false; + ABI callee_abi = {}; + unsigned short arg_sgpr_count; + unsigned short arg_vgpr_count; +-- +GitLab + + +From 441ab8b850fb95ed9a8cfc7ae0fe0e258385fdaa Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 4 May 2024 17:54:14 +0200 +Subject: [PATCH 47/71] radv,aco: Use function call structure for RT programs + +--- + .../compiler/aco_instruction_selection.cpp | 208 ++++++++++++------ + src/amd/compiler/aco_interface.cpp | 7 +- + src/amd/compiler/aco_interface.h | 4 +- + src/amd/compiler/aco_ir.h | 4 +- + src/amd/vulkan/radv_pipeline_rt.c | 6 +- + src/amd/vulkan/radv_shader.c | 8 +- + src/amd/vulkan/radv_shader.h | 3 +- + 7 files changed, 165 insertions(+), 75 deletions(-) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index d3d15c9500d5e..901b9ca843eb1 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -12003,33 +12003,53 @@ merged_wave_info_to_mask(isel_context* ctx, unsigned i) + return lanecount_to_mask(ctx, count, i * 8u); + } + +-static void +-insert_rt_jump_next(isel_context& ctx, const struct ac_shader_args* args) ++void ++insert_return(isel_context& ctx) + { +- unsigned src_count = 0; +- for (unsigned i = 0; i < ctx.args->arg_count; i++) +- src_count += !!BITSET_TEST(ctx.output_args, i); +- ++ unsigned return_param_count = 0; ++ for (auto& param_def : ctx.callee_info.param_infos) { ++ if (!param_def.is_reg || param_def.discardable) ++ continue; ++ ++return_param_count; ++ } ++ unsigned src_count = return_param_count + 2; ++ if (ctx.next_pc != Temp()) ++ src_count += ctx.args->arg_count; + Instruction* ret = create_instruction(aco_opcode::p_return, Format::PSEUDO, src_count, 0); + ctx.block->instructions.emplace_back(ret); + +- src_count = 0; +- for (unsigned i = 0; i < ctx.args->arg_count; i++) { +- if (!BITSET_TEST(ctx.output_args, i)) +- continue; +- +- enum ac_arg_regfile file = ctx.args->args[i].file; +- unsigned size = ctx.args->args[i].size; +- unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256); +- RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); +- Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg}) +- : Operand(PhysReg{reg}, type); +- ret->operands[src_count] = op; +- src_count++; ++ if (ctx.next_pc != Temp()) { ++ for (unsigned i = 0; i < ctx.args->arg_count; i++) { ++ enum ac_arg_regfile file = ctx.args->args[i].file; ++ unsigned size = ctx.args->args[i].size; ++ unsigned reg = ctx.args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256); ++ RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); ++ Operand op = ctx.arg_temps[i].id() ? Operand(ctx.arg_temps[i], PhysReg{reg}) ++ : Operand(PhysReg{reg}, type); ++ ret->operands[i] = op; ++ } + } + +- Builder bld(ctx.program, ctx.block); +- bld.sop1(aco_opcode::s_setpc_b64, get_arg(&ctx, ctx.args->rt.uniform_shader_addr)); ++ unsigned def_idx = ctx.next_pc != Temp() ? 
ctx.args->arg_count : 0; ++ for (unsigned i = 0; i < ctx.callee_info.param_infos.size(); ++i) { ++ const auto& param_info = ctx.callee_info.param_infos[i]; ++ if (!param_info.is_reg || param_info.discardable) ++ continue; ++ Temp param_temp = param_info.def.getTemp(); ++ if (i == 0 && ctx.next_pc != Temp()) ++ param_temp = ctx.next_divergent_pc; ++ else if (i == 1 && ctx.next_pc != Temp()) ++ param_temp = ctx.next_pc; ++ Operand op = Operand(param_temp); ++ op.setPrecolored(param_info.def.physReg()); ++ ret->operands[def_idx++] = op; ++ } ++ Operand op = Operand(ctx.callee_info.return_address.def.getTemp()); ++ op.setPrecolored(ctx.callee_info.return_address.def.physReg()); ++ ret->operands[def_idx++] = op; ++ Operand stack_op = Operand(ctx.callee_info.stack_ptr.def.getTemp()); ++ stack_op.setPrecolored(ctx.callee_info.stack_ptr.def.physReg()); ++ ret->operands[def_idx++] = stack_op; + } + + void +@@ -12048,21 +12068,38 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c + init_context(&ctx, nir); + setup_fp_mode(&ctx, nir); + ++ ABI abi; ++ /* TODO: callable abi? */ ++ switch (shaders[i]->info.stage) { ++ case MESA_SHADER_RAYGEN: ++ case MESA_SHADER_CLOSEST_HIT: ++ case MESA_SHADER_MISS: ++ case MESA_SHADER_CALLABLE: abi = rtRaygenABI; break; ++ case MESA_SHADER_INTERSECTION: abi = rtTraversalABI; break; ++ case MESA_SHADER_ANY_HIT: abi = rtAnyHitABI; break; ++ default: unreachable("invalid RT shader stage"); ++ } ++ ++ ctx.callee_abi = make_abi(abi, ctx.program); ++ ctx.program->callee_abi = ctx.callee_abi; ++ ctx.callee_info = get_callee_info(ctx.callee_abi, impl->function->num_params, ++ impl->function->params, ctx.program); + ctx.program->is_callee = true; + +- Instruction* startpgm = add_startpgm(&ctx); ++ Instruction* startpgm = add_startpgm(&ctx, true); + append_logical_start(ctx.block); + split_arguments(&ctx, startpgm); + visit_cf_list(&ctx, &impl->body); + append_logical_end(ctx.block); + ctx.block->kind |= block_kind_uniform; + +- /* Fix output registers and jump to next shader. We can skip this when dealing with a +- * raygen shader without shader calls. 
+- */ +- if ((shader_count > 1 || shaders[i]->info.stage != MESA_SHADER_RAYGEN) && +- impl == nir_shader_get_entrypoint(nir)) +- insert_rt_jump_next(ctx, args); ++ if (ctx.next_pc != Temp()) { ++ insert_return(ctx); ++ ++ Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc)); ++ } else { ++ Builder(ctx.program, ctx.block).sopp(aco_opcode::s_endpgm); ++ } + + cleanup_context(&ctx); + first_block = false; +@@ -12879,7 +12916,8 @@ calc_nontrivial_instance_id(Builder& bld, const struct ac_shader_args* args, + void + select_rt_prolog(Program* program, ac_shader_config* config, + const struct aco_compiler_options* options, const struct aco_shader_info* info, +- const struct ac_shader_args* in_args, const struct ac_shader_args* out_args) ++ const struct ac_shader_args* in_args, const struct ac_arg* descriptors, ++ unsigned raygen_param_count, nir_parameter* raygen_params) + { + init_program(program, compute_cs, info, options->gfx_level, options->family, options->wgp_mode, + config); +@@ -12890,8 +12928,11 @@ select_rt_prolog(Program* program, ac_shader_config* config, + calc_min_waves(program); + Builder bld(program, block); + block->instructions.reserve(32); +- unsigned num_sgprs = MAX2(in_args->num_sgprs_used, out_args->num_sgprs_used); +- unsigned num_vgprs = MAX2(in_args->num_vgprs_used, out_args->num_vgprs_used); ++ unsigned num_sgprs = in_args->num_sgprs_used; ++ unsigned num_vgprs = in_args->num_vgprs_used; ++ ++ struct callee_info raygen_info = ++ get_callee_info(make_abi(rtRaygenABI, program), raygen_param_count, raygen_params, NULL); + + /* Inputs: + * Ring offsets: s[0-1] +@@ -12906,9 +12947,11 @@ select_rt_prolog(Program* program, ac_shader_config* config, + * Local invocation IDs: v[0-2] + */ + PhysReg in_ring_offsets = get_arg_reg(in_args, in_args->ring_offsets); ++ PhysReg in_descriptors = get_arg_reg(in_args, *descriptors); ++ PhysReg in_push_constants = get_arg_reg(in_args, in_args->push_constants); + PhysReg in_sbt_desc = get_arg_reg(in_args, in_args->rt.sbt_descriptors); ++ PhysReg in_traversal_addr = get_arg_reg(in_args, in_args->rt.traversal_shader_addr); + PhysReg in_launch_size_addr = get_arg_reg(in_args, in_args->rt.launch_size_addr); +- PhysReg in_stack_base = get_arg_reg(in_args, in_args->rt.dynamic_callable_stack_base); + PhysReg in_wg_id_x; + PhysReg in_wg_id_y; + PhysReg in_wg_id_z; +@@ -12942,46 +12985,84 @@ select_rt_prolog(Program* program, ac_shader_config* config, + * Shader VA: v[4-5] + * Shader Record Ptr: v[6-7] + */ +- PhysReg out_uniform_shader_addr = get_arg_reg(out_args, out_args->rt.uniform_shader_addr); +- PhysReg out_launch_size_x = get_arg_reg(out_args, out_args->rt.launch_sizes[0]); +- PhysReg out_launch_size_y = get_arg_reg(out_args, out_args->rt.launch_sizes[1]); +- PhysReg out_launch_size_z = get_arg_reg(out_args, out_args->rt.launch_sizes[2]); ++ assert(raygen_info.stack_ptr.is_reg); ++ assert(raygen_info.return_address.is_reg); ++ assert(raygen_info.param_infos[0].is_reg); ++ assert(raygen_info.param_infos[1].is_reg); ++ assert(raygen_info.param_infos[RAYGEN_ARG_LAUNCH_ID + 2].is_reg); ++ assert(raygen_info.param_infos[RAYGEN_ARG_LAUNCH_SIZE + 2].is_reg); ++ assert(raygen_info.param_infos[RAYGEN_ARG_DESCRIPTORS + 2].is_reg); ++ assert(raygen_info.param_infos[RAYGEN_ARG_PUSH_CONSTANTS + 2].is_reg); ++ assert(raygen_info.param_infos[RAYGEN_ARG_SBT_DESCRIPTORS + 2].is_reg); ++ assert(raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].is_reg); ++ assert(raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 
2].is_reg); ++ PhysReg out_stack_ptr_param = raygen_info.stack_ptr.def.physReg(); ++ PhysReg out_return_shader_addr = raygen_info.return_address.def.physReg(); ++ PhysReg out_divergent_shader_addr = raygen_info.param_infos[0].def.physReg(); ++ PhysReg out_uniform_shader_addr = raygen_info.param_infos[1].def.physReg(); ++ PhysReg out_launch_size_x = raygen_info.param_infos[RAYGEN_ARG_LAUNCH_SIZE + 2].def.physReg(); ++ PhysReg out_launch_size_y = out_launch_size_x.advance(4); ++ PhysReg out_launch_size_z = out_launch_size_y.advance(4); + PhysReg out_launch_ids[3]; +- for (unsigned i = 0; i < 3; i++) +- out_launch_ids[i] = get_arg_reg(out_args, out_args->rt.launch_ids[i]); +- PhysReg out_stack_ptr = get_arg_reg(out_args, out_args->rt.dynamic_callable_stack_base); +- PhysReg out_record_ptr = get_arg_reg(out_args, out_args->rt.shader_record); ++ out_launch_ids[0] = raygen_info.param_infos[RAYGEN_ARG_LAUNCH_ID + 2].def.physReg(); ++ for (unsigned i = 1; i < 3; i++) ++ out_launch_ids[i] = out_launch_ids[i - 1].advance(4); ++ PhysReg out_descriptors = raygen_info.param_infos[RAYGEN_ARG_DESCRIPTORS + 2].def.physReg(); ++ PhysReg out_push_constants = ++ raygen_info.param_infos[RAYGEN_ARG_PUSH_CONSTANTS + 2].def.physReg(); ++ PhysReg out_sbt_descriptors = ++ raygen_info.param_infos[RAYGEN_ARG_SBT_DESCRIPTORS + 2].def.physReg(); ++ PhysReg out_traversal_addr = ++ raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR + 2].def.physReg(); ++ PhysReg out_record_ptr = raygen_info.param_infos[RAYGEN_ARG_SHADER_RECORD_PTR + 2].def.physReg(); ++ ++ num_sgprs = std::max(num_sgprs, out_stack_ptr_param.reg()); ++ num_vgprs = std::max(num_vgprs, out_record_ptr.reg() - 256 + 2); + + /* Temporaries: */ + num_sgprs = align(num_sgprs, 2); ++ num_sgprs += 2; + PhysReg tmp_raygen_sbt = PhysReg{num_sgprs}; + num_sgprs += 2; ++ PhysReg tmp_launch_size_addr = PhysReg{num_sgprs}; ++ num_sgprs += 2; + PhysReg tmp_ring_offsets = PhysReg{num_sgprs}; + num_sgprs += 2; ++ PhysReg tmp_traversal_addr = PhysReg{num_sgprs}; ++ num_sgprs += 2; + PhysReg tmp_wg_id_x_times_size = PhysReg{num_sgprs}; + num_sgprs++; + + PhysReg tmp_invocation_idx = PhysReg{256 + num_vgprs++}; + + /* Confirm some assumptions about register aliasing */ +- assert(in_ring_offsets == out_uniform_shader_addr); +- assert(get_arg_reg(in_args, in_args->push_constants) == +- get_arg_reg(out_args, out_args->push_constants)); +- assert(get_arg_reg(in_args, in_args->rt.sbt_descriptors) == +- get_arg_reg(out_args, out_args->rt.sbt_descriptors)); +- assert(in_launch_size_addr == out_launch_size_x); +- assert(in_stack_base == out_launch_size_z); +- assert(in_local_ids[0] == out_launch_ids[0]); ++ assert(in_descriptors == out_uniform_shader_addr); ++ assert(in_sbt_desc == out_launch_size_x); ++ assert(in_traversal_addr == out_launch_size_z); ++ assert(in_wg_id_x == out_traversal_addr); + + /* gfx_level >= GFX9 || in_scratch_offset.reg() >= out_args->num_sgprs_used); ++ assert(options->gfx_level >= GFX9 || ++ in_scratch_offset.reg() >= ++ raygen_info.param_infos[RAYGEN_ARG_TRAVERSAL_ADDR].def.physReg()); + + /* load raygen sbt */ + bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_raygen_sbt, s2), Operand(in_sbt_desc, s2), + Operand::c32(0u)); + ++ bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_launch_size_addr, s2), ++ Operand(in_launch_size_addr, s2)); ++ bld.sop1(aco_opcode::s_mov_b64, Definition(tmp_traversal_addr, s2), ++ Operand(in_traversal_addr, s2)); ++ bld.sop1(aco_opcode::s_mov_b32, Definition(out_descriptors, s1), Operand(in_descriptors, s1)); ++ 
bld.sop1(aco_opcode::s_mov_b32, Definition(out_push_constants, s1), ++ Operand(in_push_constants, s1)); ++ bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors, s1), Operand(in_sbt_desc, s1)); ++ bld.sop1(aco_opcode::s_mov_b32, Definition(out_sbt_descriptors.advance(4), s1), ++ Operand(in_sbt_desc.advance(4), s1)); ++ + /* init scratch */ + if (options->gfx_level < GFX9) { + /* copy ring offsets to temporary location*/ +@@ -12992,18 +13073,15 @@ select_rt_prolog(Program* program, ac_shader_config* config, + Operand(in_scratch_offset, s1)); + } + +- /* set stack ptr */ +- bld.vop1(aco_opcode::v_mov_b32, Definition(out_stack_ptr, v1), Operand(in_stack_base, s1)); +- + /* load raygen address */ + bld.smem(aco_opcode::s_load_dwordx2, Definition(out_uniform_shader_addr, s2), + Operand(tmp_raygen_sbt, s2), Operand::c32(0u)); + + /* load ray launch sizes */ + bld.smem(aco_opcode::s_load_dword, Definition(out_launch_size_z, s1), +- Operand(in_launch_size_addr, s2), Operand::c32(8u)); ++ Operand(tmp_launch_size_addr, s2), Operand::c32(8u)); + bld.smem(aco_opcode::s_load_dwordx2, Definition(out_launch_size_x, s2), +- Operand(in_launch_size_addr, s2), Operand::c32(0u)); ++ Operand(tmp_launch_size_addr, s2), Operand::c32(0u)); + + /* calculate ray launch ids */ + if (options->gfx_level >= GFX11) { +@@ -13059,6 +13137,11 @@ select_rt_prolog(Program* program, ac_shader_config* config, + Operand::c32(-1u), Operand(tmp_invocation_idx, v1)); + } + ++ bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr, s1), ++ Operand(tmp_traversal_addr, s1)); ++ bld.sop1(aco_opcode::s_mov_b32, Definition(out_traversal_addr.advance(4), s1), ++ Operand(tmp_traversal_addr.advance(4), s1)); ++ + /* Make fixup operations a no-op if this is not a converted 2D dispatch. 
*/ + bld.sopc(aco_opcode::s_cmp_lg_u32, Definition(scc, s1), + Operand::c32(ACO_RT_CONVERTED_2D_LAUNCH_SIZE), Operand(out_launch_size_y, s1)); +@@ -13070,14 +13153,15 @@ select_rt_prolog(Program* program, ac_shader_config* config, + bld.vop2(aco_opcode::v_cndmask_b32, Definition(out_launch_ids[1], v1), Operand::zero(), + Operand(out_launch_ids[1], v1), Operand(vcc, bld.lm)); + +- if (options->gfx_level < GFX9) { +- /* write scratch/ring offsets to outputs, if needed */ +- bld.sop1(aco_opcode::s_mov_b32, +- Definition(get_arg_reg(out_args, out_args->scratch_offset), s1), +- Operand(in_scratch_offset, s1)); +- bld.sop1(aco_opcode::s_mov_b64, Definition(get_arg_reg(out_args, out_args->ring_offsets), s2), +- Operand(tmp_ring_offsets, s2)); +- } ++ if (program->gfx_level < GFX8) ++ bld.vop3(aco_opcode::v_lshr_b64, Definition(out_divergent_shader_addr, v2), ++ Operand(out_uniform_shader_addr, s2), Operand::c32(0)); ++ else ++ bld.vop3(aco_opcode::v_lshrrev_b64, Definition(out_divergent_shader_addr, v2), ++ Operand::c32(0), Operand(out_uniform_shader_addr, s2)); ++ bld.sop1(aco_opcode::s_mov_b64, Definition(out_return_shader_addr, s2), Operand::c32(0)); ++ ++ bld.sopk(aco_opcode::s_movk_i32, Definition(out_stack_ptr_param, s1), 0); + + /* jump to raygen */ + bld.sop1(aco_opcode::s_setpc_b64, Operand(out_uniform_shader_addr, s2)); +diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp +index 32a28908f90f0..5c7956caeedd4 100644 +--- a/src/amd/compiler/aco_interface.cpp ++++ b/src/amd/compiler/aco_interface.cpp +@@ -307,8 +307,8 @@ aco_compile_shader(const struct aco_compiler_options* options, const struct aco_ + void + aco_compile_rt_prolog(const struct aco_compiler_options* options, + const struct aco_shader_info* info, const struct ac_shader_args* in_args, +- const struct ac_shader_args* out_args, aco_callback* build_prolog, +- void** binary) ++ const struct ac_arg* descriptors, unsigned raygen_param_count, ++ nir_parameter* raygen_params, aco_callback* build_prolog, void** binary) + { + init(); + +@@ -319,7 +319,8 @@ aco_compile_rt_prolog(const struct aco_compiler_options* options, + program->debug.func = NULL; + program->debug.private_data = NULL; + +- select_rt_prolog(program.get(), &config, options, info, in_args, out_args); ++ select_rt_prolog(program.get(), &config, options, info, in_args, descriptors, raygen_param_count, ++ raygen_params); + validate(program.get()); + insert_waitcnt(program.get()); + insert_NOPs(program.get()); +diff --git a/src/amd/compiler/aco_interface.h b/src/amd/compiler/aco_interface.h +index 462727432a1ac..efc3172647183 100644 +--- a/src/amd/compiler/aco_interface.h ++++ b/src/amd/compiler/aco_interface.h +@@ -49,8 +49,8 @@ void aco_compile_shader(const struct aco_compiler_options* options, + + void aco_compile_rt_prolog(const struct aco_compiler_options* options, + const struct aco_shader_info* info, const struct ac_shader_args* in_args, +- const struct ac_shader_args* out_args, aco_callback* build_prolog, +- void** binary); ++ const struct ac_arg* descriptors, unsigned raygen_param_count, ++ nir_parameter* raygen_params, aco_callback* build_prolog, void** binary); + + void aco_compile_vs_prolog(const struct aco_compiler_options* options, + const struct aco_shader_info* info, +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 2bc7b91c81584..ccf2710d5453f 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -23,6 +23,7 @@ + #include + + typedef struct nir_shader nir_shader; ++typedef 
struct nir_parameter nir_parameter; + + namespace aco { + +@@ -2462,7 +2463,8 @@ void select_trap_handler_shader(Program* program, struct nir_shader* shader, + void select_rt_prolog(Program* program, ac_shader_config* config, + const struct aco_compiler_options* options, + const struct aco_shader_info* info, const struct ac_shader_args* in_args, +- const struct ac_shader_args* out_args); ++ const struct ac_arg* descriptors, unsigned raygen_param_count, ++ nir_parameter* raygen_params); + void select_vs_prolog(Program* program, const struct aco_vs_prolog_info* pinfo, + ac_shader_config* config, const struct aco_compiler_options* options, + const struct aco_shader_info* info, const struct ac_shader_args* args); +diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c +index 196f8aa23a032..216eb1bb09f89 100644 +--- a/src/amd/vulkan/radv_pipeline_rt.c ++++ b/src/amd/vulkan/radv_pipeline_rt.c +@@ -808,8 +808,12 @@ static void + compile_rt_prolog(struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline) + { + const struct radv_physical_device *pdev = radv_device_physical(device); ++ struct nir_function raygen_stub = {}; + +- pipeline->prolog = radv_create_rt_prolog(device); ++ /* Create a dummy function signature for raygen shaders in order to pass parameter info to the prolog */ ++ radv_nir_init_rt_function_params(&raygen_stub, MESA_SHADER_RAYGEN, 0); ++ radv_nir_lower_callee_signature(&raygen_stub, NULL); ++ pipeline->prolog = radv_create_rt_prolog(device, raygen_stub.num_params, raygen_stub.params); + + /* create combined config */ + struct ac_shader_config *config = &pipeline->prolog->config; +diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c +index e5aa2ff636e1f..60648b2405321 100644 +--- a/src/amd/vulkan/radv_shader.c ++++ b/src/amd/vulkan/radv_shader.c +@@ -3073,13 +3073,12 @@ radv_aco_build_shader_part(void **bin, uint32_t num_sgprs, uint32_t num_vgprs, c + } + + struct radv_shader * +-radv_create_rt_prolog(struct radv_device *device) ++radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count, nir_parameter *raygen_params) + { + const struct radv_physical_device *pdev = radv_device_physical(device); + const struct radv_instance *instance = radv_physical_device_instance(pdev); + struct radv_shader *prolog; + struct radv_shader_args in_args = {0}; +- struct radv_shader_args out_args = {0}; + struct radv_nir_compiler_options options = {0}; + radv_fill_nir_compiler_options(&options, device, NULL, false, instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS, false, + radv_device_fault_detection_enabled(device), false); +@@ -3100,7 +3099,6 @@ radv_create_rt_prolog(struct radv_device *device) + info.cs.uses_block_id[i] = true; + + radv_declare_shader_args(device, NULL, &info, MESA_SHADER_COMPUTE, MESA_SHADER_NONE, &in_args); +- radv_declare_rt_shader_args(options.info->gfx_level, &out_args); + info.user_sgprs_locs = in_args.user_sgprs_locs; + + #if AMD_LLVM_AVAILABLE +@@ -3114,8 +3112,8 @@ radv_create_rt_prolog(struct radv_device *device) + struct aco_compiler_options ac_opts; + radv_aco_convert_shader_info(&ac_info, &info, &in_args, &device->cache_key, options.info->gfx_level); + radv_aco_convert_opts(&ac_opts, &options, &in_args, &stage_key); +- aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &out_args.ac, &radv_aco_build_shader_binary, +- (void **)&binary); ++ aco_compile_rt_prolog(&ac_opts, &ac_info, &in_args.ac, &in_args.descriptor_sets[0], raygen_param_count, raygen_params, ++ 
&radv_aco_build_shader_binary, (void **)&binary); + binary->info = info; + + radv_postprocess_binary_config(device, binary, &in_args); +diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h +index 10e062fb041b9..5ee1ee40466cf 100644 +--- a/src/amd/vulkan/radv_shader.h ++++ b/src/amd/vulkan/radv_shader.h +@@ -576,7 +576,8 @@ void radv_free_shader_memory(struct radv_device *device, union radv_shader_arena + + struct radv_shader *radv_create_trap_handler_shader(struct radv_device *device); + +-struct radv_shader *radv_create_rt_prolog(struct radv_device *device); ++struct radv_shader *radv_create_rt_prolog(struct radv_device *device, unsigned raygen_param_count, ++ nir_parameter *raygen_params); + + struct radv_shader_part *radv_shader_part_create(struct radv_device *device, struct radv_shader_part_binary *binary, + unsigned wave_size); +-- +GitLab + + +From 26d71a1077a1d0b29e4e426c5a83d0a04a7b18d6 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 13 May 2024 06:17:34 +0200 +Subject: [PATCH 48/71] aco/ssa_elimination: Don't remove exec writes for last + blocks of callee shaders + +The caller is going to use the exec mask written there. +--- + src/amd/compiler/aco_ssa_elimination.cpp | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/src/amd/compiler/aco_ssa_elimination.cpp b/src/amd/compiler/aco_ssa_elimination.cpp +index a1477244f51d9..e63dd63ad917c 100644 +--- a/src/amd/compiler/aco_ssa_elimination.cpp ++++ b/src/amd/compiler/aco_ssa_elimination.cpp +@@ -758,7 +758,8 @@ eliminate_useless_exec_writes_in_block(ssa_elimination_ctx& ctx, Block& block) + /* Check if any successor needs the outgoing exec mask from the current block. */ + + bool exec_write_used; +- if (block.kind & block_kind_end_with_regs) { ++ if (block.kind & block_kind_end_with_regs || ++ (block.linear_succs.empty() && ctx.program->is_callee)) { + /* Last block of a program with succeed shader part should respect final exec write. */ + exec_write_used = true; + } else { +-- +GitLab + + +From 6935d9d0a326ae77622e57057ee433faf3c33146 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 6 Mar 2024 14:53:39 +0100 +Subject: [PATCH 49/71] aco/isel: Handle calls + +--- + .../compiler/aco_instruction_selection.cpp | 130 ++++++++++++++++++ + 1 file changed, 130 insertions(+) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index 901b9ca843eb1..b926d357739a4 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -10800,6 +10800,135 @@ get_callee_info(const ABI& abi, unsigned param_count, const nir_parameter* param + return info; + } + ++void ++visit_call(isel_context* ctx, nir_call_instr* instr) ++{ ++ Builder bld(ctx->program, ctx->block); ++ ++ ABI abi; ++ /* TODO: callable abi? 
*/ ++ switch (instr->callee->driver_attributes & ACO_NIR_FUNCTION_ATTRIB_ABI_MASK) { ++ case ACO_NIR_CALL_ABI_RT_RECURSIVE: abi = make_abi(rtRaygenABI, ctx->program); break; ++ case ACO_NIR_CALL_ABI_TRAVERSAL: abi = make_abi(rtTraversalABI, ctx->program); break; ++ case ACO_NIR_CALL_ABI_AHIT_ISEC: abi = make_abi(rtAnyHitABI, ctx->program); break; ++ default: unreachable("invalid abi"); ++ } ++ ++ struct callee_info info = ++ get_callee_info(abi, instr->callee->num_params, instr->callee->params, nullptr); ++ std::vector return_infos; ++ ++ Instruction* stack_instr; ++ Definition stack_ptr; ++ if (info.stack_ptr.is_reg) { ++ stack_instr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), ++ Operand::c32(info.scratch_param_size), ++ Operand(ctx->callee_info.stack_ptr.def.getTemp())); ++ stack_ptr = ctx->callee_info.stack_ptr.def; ++ } else { ++ stack_instr = bld.pseudo(aco_opcode::p_callee_stack_ptr, bld.def(s1), ++ Operand::c32(info.scratch_param_size)); ++ stack_ptr = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(s1), Operand::c32(0)).def(0); ++ } ++ ++ for (unsigned i = 0; i < info.param_infos.size(); ++i) { ++ if (info.param_infos[i].is_reg) ++ continue; ++ ++ store_scratch_param(ctx, bld, info.param_infos[i], stack_instr->definitions[0].getTemp(), ++ info.scratch_param_size, get_ssa_temp(ctx, instr->params[i].ssa)); ++ } ++ ++ unsigned extra_def_count = 1; ++ ++ Temp vcc_backup; ++ if (ctx->program->dev.sgpr_limit <= vcc_hi.reg()) { ++ vcc_backup = bld.copy(bld.def(bld.lm), Operand(vcc, bld.lm)); ++ --extra_def_count; ++ } ++ ++ unsigned extra_param_count = 3; ++ if (ctx->program->gfx_level < GFX9) ++ ++extra_param_count; ++ ++ unsigned param_size = info.scratch_param_size; ++ if (ctx->program->gfx_level < GFX9) ++ param_size *= ctx->program->wave_size; ++ ++ Instruction* call_instr = ++ create_instruction(aco_opcode::p_call, Format::PSEUDO_CALL, ++ info.reg_param_count + ctx->args->arg_count + extra_param_count, ++ info.reg_return_param_count + extra_def_count); ++ call_instr->call().abi = abi; ++ call_instr->operands[0] = Operand(ctx->callee_info.return_address.def.getTemp(), ++ info.return_address.def.physReg()); ++ call_instr->operands[1] = Operand(stack_ptr.getTemp(), info.stack_ptr.def.physReg()); ++ call_instr->operands[2] = Operand::c32(param_size); ++ if (ctx->program->gfx_level < GFX9) { ++ call_instr->operands[info.reg_param_count + ctx->args->arg_count + 3] = ++ Operand(load_scratch_resource(ctx->program, bld, true, false)); ++ call_instr->operands[info.reg_param_count + ctx->args->arg_count + 3].setLateKill(true); ++ } ++ ++ unsigned reg_return_param_idx = 0; ++ for (unsigned i = 0; i < info.param_infos.size(); ++i) { ++ if (!info.param_infos[i].is_reg) { ++ if (instr->callee->params[i].is_return) { ++ return_infos.emplace_back(parameter_info{ ++ .is_reg = false, ++ .scratch_offset = info.param_infos[i].scratch_offset, ++ }); ++ } ++ continue; ++ } ++ ++ if (instr->callee->params[i].is_uniform) ++ call_instr->operands[i + 3] = Operand(get_ssa_temp(ctx, instr->params[i].ssa)); ++ else ++ call_instr->operands[i + 3] = ++ Operand(as_vgpr(ctx, get_ssa_temp(ctx, instr->params[i].ssa))); ++ ++ if (instr->callee->params[i].is_return) { ++ assert(!instr->callee->params[i].is_uniform); ++ Definition def = ++ bld.def(RegClass(RegType::vgpr, DIV_ROUND_UP(instr->callee->params[i].bit_size, 32)), ++ info.param_infos[i].def.physReg()); ++ call_instr->definitions[extra_def_count + reg_return_param_idx++] = def; ++ return_infos.emplace_back(parameter_info{ ++ .is_reg = true, ++ .def 
= def, ++ }); ++ } ++ ++ call_instr->operands[i + 3].setPrecolored(info.param_infos[i].def.physReg()); ++ } ++ ++ for (unsigned i = 0; i < ctx->args->arg_count; i++) { ++ enum ac_arg_regfile file = ctx->args->args[i].file; ++ unsigned size = ctx->args->args[i].size; ++ unsigned reg = ctx->args->args[i].offset + (file == AC_ARG_SGPR ? 0 : 256); ++ RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); ++ Operand op = ctx->arg_temps[i].id() ? Operand(ctx->arg_temps[i], PhysReg{reg}) ++ : Operand(PhysReg{reg}, type); ++ op.setLateKill(true); ++ call_instr->operands[info.reg_param_count + 3 + i] = op; ++ } ++ ++ if (ctx->program->dev.sgpr_limit <= vcc_hi.reg()) ++ bld.copy(bld.def(bld.lm, vcc), Operand(vcc_backup)); ++ else ++ call_instr->definitions[0] = bld.def(s2, vcc); ++ ++ ctx->block->instructions.emplace_back(static_cast(call_instr)); ++ ++ ctx->call_infos.emplace_back(call_info{ ++ .nir_instr = instr, ++ .aco_instr = call_instr, ++ .return_info = std::move(return_infos), ++ .scratch_param_size = info.scratch_param_size, ++ }); ++} ++ + void + visit_block(isel_context* ctx, nir_block* block) + { +@@ -10823,6 +10952,7 @@ visit_block(isel_context* ctx, nir_block* block) + case nir_instr_type_undef: visit_undef(ctx, nir_instr_as_undef(instr)); break; + case nir_instr_type_deref: break; + case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break; ++ case nir_instr_type_call: visit_call(ctx, nir_instr_as_call(instr)); break; + default: isel_err(instr, "Unknown NIR instr type"); + } + } +-- +GitLab + + +From 5a1503448739d2e2012bb0392711e3f6612df00f Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 6 Mar 2024 14:56:16 +0100 +Subject: [PATCH 50/71] aco/lower_to_hw_instr: Lower calls + +--- + src/amd/compiler/aco_lower_to_hw_instr.cpp | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp +index 1e1737319c3f6..c9a918d8a373f 100644 +--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp ++++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp +@@ -3017,6 +3017,16 @@ lower_to_hw_instr(Program* program) + } else if (instr->isMIMG() && instr->mimg().strict_wqm) { + lower_image_sample(&ctx, instr); + ctx.instructions.emplace_back(std::move(instr)); ++ } else if (instr->isCall()) { ++ PhysReg stack_reg = instr->operands[1].physReg(); ++ if (instr->operands[2].constantValue()) ++ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1), ++ Operand(stack_reg, s1), instr->operands[2]); ++ bld.sop1(aco_opcode::s_swappc_b64, Definition(instr->operands[0].physReg(), s2), ++ Operand(instr->operands[4].physReg(), s2)); ++ if (instr->operands[2].constantValue()) ++ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1), ++ Operand(stack_reg, s1), instr->operands[2]); + } else { + ctx.instructions.emplace_back(std::move(instr)); + } +-- +GitLab + + +From 6a4e937529ba36e41712205f201a308e98c6a8c9 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 13 Mar 2024 10:59:52 +0100 +Subject: [PATCH 51/71] aco/live_var_analysis: Handle calls + +--- + src/amd/compiler/aco_live_var_analysis.cpp | 47 ++++++++++++++++++++++ + 1 file changed, 47 insertions(+) + +diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp +index a635c94496143..64814e983bb2e 100644 +--- a/src/amd/compiler/aco_live_var_analysis.cpp ++++ b/src/amd/compiler/aco_live_var_analysis.cpp +@@ -29,9 +29,46 @@ 
get_temp_register_demand(Instruction* instr, RegisterDemand& demand_before, Regi + demand_before += op.getTemp(); + } + } ++ ++ if (instr->isCall()) ++ demand_after += instr->call().blocked_abi_demand; + } + } + ++void ++compute_blocked_abi_demand(Program* program, unsigned linear_vgpr_demand, Pseudo_call_instruction& instr) ++{ ++ const unsigned max_vgpr = get_addr_vgpr_from_waves(program, program->min_waves); ++ /* Linear VGPRs can intersect with preserved VGPRs, we insert spill code for them in ++ * spill_preserved. ++ */ ++ unsigned preserved_vgprs = max_vgpr - (instr.abi.clobberedRegs.vgpr.hi() - 256); ++ linear_vgpr_demand -= std::min(preserved_vgprs, linear_vgpr_demand); ++ ++ unsigned preserved_vgpr_demand = ++ instr.abi.clobberedRegs.vgpr.size - ++ std::min(linear_vgpr_demand, instr.abi.clobberedRegs.vgpr.size); ++ unsigned preserved_sgpr_demand = instr.abi.clobberedRegs.sgpr.size; ++ ++ /* Don't count definitions contained in clobbered call regs twice */ ++ for (auto& definition : instr.definitions) { ++ if (definition.isTemp() && definition.isFixed()) { ++ auto def_regs = PhysRegInterval{PhysReg{definition.physReg().reg()}, definition.size()}; ++ for (auto reg : def_regs) { ++ if (instr.abi.clobberedRegs.sgpr.contains(reg)) ++ --preserved_sgpr_demand; ++ if (instr.abi.clobberedRegs.vgpr.contains(reg)) ++ --preserved_vgpr_demand; ++ } ++ } ++ } ++ if (instr.abi.clobberedRegs.sgpr.contains(instr.operands[1].physReg()) && ++ !instr.operands[1].isKill()) ++ --preserved_sgpr_demand; ++ ++ instr.blocked_abi_demand = RegisterDemand(preserved_vgpr_demand, preserved_sgpr_demand); ++} ++ + RegisterDemand + get_live_changes(Instruction* instr) + { +@@ -313,6 +350,16 @@ process_live_temps_per_block(live_ctx& ctx, Block* block) + } + } + ++ if (insn->isCall()) { ++ unsigned linear_vgpr_demand = 0; ++ for (unsigned t : live) { ++ if (ctx.program->temp_rc[t].is_linear_vgpr()) ++ linear_vgpr_demand += ctx.program->temp_rc[t].size(); ++ } ++ compute_blocked_abi_demand(ctx.program, linear_vgpr_demand, insn->call()); ++ insn->register_demand += insn->call().blocked_abi_demand; ++ } ++ + operand_demand += new_demand; + insn->register_demand.update(operand_demand); + block->register_demand.update(insn->register_demand); +-- +GitLab + + +From 66c7c6cc5e167e8a763fe17520e575ad6cae7f50 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 23 Mar 2024 10:29:13 +0100 +Subject: [PATCH 52/71] aco/ra: add utility to block interval + +--- + src/amd/compiler/aco_register_allocation.cpp | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index 4d73525bd0660..9012a742bda33 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -264,6 +264,8 @@ public: + fill(start, rc.size(), 0xFFFFFFFF); + } + ++ void block(PhysRegInterval interval) { fill(interval.lo(), interval.size, 0xFFFFFFFF); } ++ + bool is_blocked(PhysReg start) const + { + if (regs[start] == 0xFFFFFFFF) +-- +GitLab + + +From f2f3a2b63f646a30906c47bac0bb095618b12e9f Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 23 Mar 2024 10:31:35 +0100 +Subject: [PATCH 53/71] aco/ra: handle clobbered regions by calls + +--- + src/amd/compiler/aco_register_allocation.cpp | 53 ++++++++++++++++++++ + 1 file changed, 53 insertions(+) + +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index 9012a742bda33..68502a79476e2 100644 +--- 
a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -2104,6 +2104,12 @@ handle_fixed_operands(ra_ctx& ctx, RegisterFile& register_file, + bool found = false; + for (auto reg : regs.second) { + PhysRegInterval range = {reg, ctx.program->temp_rc[regs.first].size()}; ++ if (instr->isCall()) { ++ if (intersects(instr->call().abi.clobberedRegs.vgpr, range)) ++ continue; ++ if (intersects(instr->call().abi.clobberedRegs.sgpr, range)) ++ continue; ++ } + bool intersects_with_def = false; + for (const auto& def : instr->definitions) { + if (!def.isTemp() || !def.isFixed()) +@@ -3142,6 +3148,49 @@ register_allocation(Program* program, ra_test_policy policy) + register_file.clear(op); + } + ++ if (instr->isCall()) { ++ /* create parallelcopy pair to move blocking vars */ ++ RegisterFile tmp_file = register_file; ++ std::vector vars = ++ collect_vars(ctx, tmp_file, instr->call().abi.clobberedRegs.sgpr); ++ std::vector vars2 = ++ collect_vars(ctx, tmp_file, instr->call().abi.clobberedRegs.vgpr); ++ ++ /* Allow linear VGPRs in the clobbered range, they are spilled in spill_preserved. */ ++ for (auto it = vars2.begin(); it != vars2.end();) { ++ if (program->temp_rc[*it].is_linear_vgpr()) { ++ it = vars2.erase(it); ++ tmp_file.block(ctx.assignments[*it].reg, program->temp_rc[*it]); ++ } else { ++ ++it; ++ } ++ } ++ for (auto it = vars.begin(); it != vars.end();) { ++ if (instr->operands[1].tempId() == *it) ++ it = vars.erase(it); ++ else ++ ++it; ++ } ++ ++ vars.insert(vars.end(), vars2.begin(), vars2.end()); ++ ++ tmp_file.fill_killed_operands(instr.get()); ++ tmp_file.block(instr->call().abi.clobberedRegs.sgpr); ++ tmp_file.block(instr->call().abi.clobberedRegs.vgpr); ++ ++ adjust_max_used_regs(ctx, RegClass::s1, ++ instr->call().abi.clobberedRegs.sgpr.hi().reg() - 1); ++ adjust_max_used_regs(ctx, RegClass::v1, ++ instr->call().abi.clobberedRegs.vgpr.hi().reg() - 1); ++ ++ ASSERTED bool success = false; ++ success = ++ get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, PhysRegInterval{}); ++ assert(success); ++ ++ update_renames(ctx, register_file, parallelcopy, instr, rename_not_killed_ops); ++ } ++ + optimize_encoding(ctx, register_file, instr); + + /* Handle definitions which must have the same register as an operand. 
+@@ -3171,6 +3220,10 @@ register_allocation(Program* program, ra_test_policy policy) + RegisterFile tmp_file(register_file); + /* re-enable the killed operands, so that we don't move the blocking vars there */ + tmp_file.fill_killed_operands(instr.get()); ++ if (instr->isCall()) { ++ tmp_file.block(instr->call().abi.clobberedRegs.sgpr); ++ tmp_file.block(instr->call().abi.clobberedRegs.vgpr); ++ } + + ASSERTED bool success = false; + success = get_regs_for_copies(ctx, tmp_file, parallelcopy, vars, instr, def_regs); +-- +GitLab + + +From 04f918a810d1b5953922cf91c9ea068a3d6c54db Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Fri, 3 May 2024 17:37:04 +0200 +Subject: [PATCH 54/71] aco/insert_waitcnt: Insert waitcnts before s_swappc too + +--- + src/amd/compiler/aco_insert_waitcnt.cpp | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/src/amd/compiler/aco_insert_waitcnt.cpp b/src/amd/compiler/aco_insert_waitcnt.cpp +index e6263d6f191f2..510ed8613c41d 100644 +--- a/src/amd/compiler/aco_insert_waitcnt.cpp ++++ b/src/amd/compiler/aco_insert_waitcnt.cpp +@@ -344,6 +344,10 @@ kill(wait_imm& imm, Instruction* instr, wait_ctx& ctx, memory_sync_info sync_inf + force_waitcnt(ctx, imm); + } + ++ if (instr->opcode == aco_opcode::s_swappc_b64) ++ u_foreach_bit (i, (~counter_vs) & ctx.nonzero) ++ imm[i] = 0; ++ + /* Make sure POPS coherent memory accesses have reached the L2 cache before letting the + * overlapping waves proceed into the ordered section. + */ +-- +GitLab + + +From 35688a25c2e66aa5a8ddbe2c2700cf0fe0e7642b Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 13 May 2024 06:30:07 +0200 +Subject: [PATCH 55/71] aco/ra: Add utility to clear PhysRegInterval + +--- + src/amd/compiler/aco_register_allocation.cpp | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index 68502a79476e2..eb87bf111f5a8 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -266,6 +266,8 @@ public: + + void block(PhysRegInterval interval) { fill(interval.lo(), interval.size, 0xFFFFFFFF); } + ++ void clear(PhysRegInterval interval) { fill(interval.lo(), interval.size, 0); } ++ + bool is_blocked(PhysReg start) const + { + if (regs[start] == 0xFFFFFFFF) +-- +GitLab + + +From 15ce5c3c90909b56b7c62d00d7e5022f4244140e Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 18 May 2024 10:19:58 +0200 +Subject: [PATCH 56/71] aco/util: Add aco::unordered_set + +--- + src/amd/compiler/aco_util.h | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/src/amd/compiler/aco_util.h b/src/amd/compiler/aco_util.h +index 68a6c686408f0..0c5f9566bd213 100644 +--- a/src/amd/compiler/aco_util.h ++++ b/src/amd/compiler/aco_util.h +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include + + namespace aco { +@@ -390,6 +391,14 @@ template , class Pred = std::equ + using unordered_map = + std::unordered_map>>; + ++/* ++ * aco::unordered_set - alias for std::unordered_map with monotonic_allocator ++ * ++ * This template specialization mimics std::pmr::unordered_set. ++ */ ++template , class Pred = std::equal_to> ++using unordered_set = std::unordered_set>; ++ + /* + * Cache-friendly set of 32-bit IDs with fast insert/erase/lookup and + * the ability to efficiently iterate over contained elements. 
+-- +GitLab + + +From be7080caa16a484d00a6213c284f91421bb9abb1 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 13 May 2024 06:23:55 +0200 +Subject: [PATCH 57/71] aco: Add pass for spilling call-related VGPRs + +Spills preserved VGPRs for callees and linear VGPRs added by the +spiller. +--- + .../compiler/aco_instruction_selection.cpp | 65 ++- + src/amd/compiler/aco_interface.cpp | 2 + + src/amd/compiler/aco_ir.h | 11 +- + src/amd/compiler/aco_opcodes.py | 3 + + src/amd/compiler/aco_opt_value_numbering.cpp | 3 +- + src/amd/compiler/aco_register_allocation.cpp | 62 +- + src/amd/compiler/aco_spill_preserved.cpp | 547 ++++++++++++++++++ + src/amd/compiler/meson.build | 1 + + 8 files changed, 670 insertions(+), 24 deletions(-) + create mode 100644 src/amd/compiler/aco_spill_preserved.cpp + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index b926d357739a4..deb97c1867667 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -106,9 +106,21 @@ append_logical_start(Block* b) + } + + static void +-append_logical_end(Block* b) ++append_logical_end(isel_context* ctx) + { +- Builder(NULL, b).pseudo(aco_opcode::p_logical_end); ++ Builder bld(ctx->program, ctx->block); ++ ++ Operand stack_ptr_op; ++ if (ctx->program->gfx_level >= GFX9) ++ stack_ptr_op = Operand(ctx->callee_info.stack_ptr.def.getTemp()); ++ else ++ stack_ptr_op = Operand(load_scratch_resource(ctx->program, bld, true, true)); ++ stack_ptr_op.setLateKill(true); ++ if (ctx->program->is_callee) ++ bld.pseudo(aco_opcode::p_reload_preserved_vgpr, bld.def(s1), bld.def(bld.lm), ++ bld.def(s1, scc), stack_ptr_op); ++ ++ bld.pseudo(aco_opcode::p_logical_end); + } + + Temp +@@ -10485,7 +10497,7 @@ void + begin_loop(isel_context* ctx, loop_context* lc) + { + // TODO: we might want to wrap the loop around a branch if exec.potentially_empty=true +- append_logical_end(ctx->block); ++ append_logical_end(ctx); + ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; + Builder bld(ctx->program, ctx->block); + bld.branch(aco_opcode::p_branch, bld.def(s2)); +@@ -10543,7 +10555,7 @@ end_loop(isel_context* ctx, loop_context* lc) + if (!ctx->cf_info.has_branch) { + unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx; + Builder bld(ctx->program, ctx->block); +- append_logical_end(ctx->block); ++ append_logical_end(ctx); + + /* No need to check exec.potentially_empty_break/continue originating inside the loop. 
In the + * only case where it's possible at this point (divergent break after divergent continue), we +@@ -10610,7 +10622,7 @@ emit_loop_jump(isel_context* ctx, bool is_break) + { + Builder bld(ctx->program, ctx->block); + Block* logical_target; +- append_logical_end(ctx->block); ++ append_logical_end(ctx); + unsigned idx = ctx->block->index; + + if (is_break) { +@@ -11072,7 +11084,7 @@ begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond, + { + ic->cond = cond; + +- append_logical_end(ctx->block); ++ append_logical_end(ctx); + ctx->block->kind |= block_kind_branch; + + /* branch to linear then block */ +@@ -11118,7 +11130,7 @@ begin_divergent_if_else(isel_context* ctx, if_context* ic, + nir_selection_control sel_ctrl = nir_selection_control_none) + { + Block* BB_then_logical = ctx->block; +- append_logical_end(BB_then_logical); ++ append_logical_end(ctx); + /* branch from logical then block to invert block */ + aco_ptr branch; + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); +@@ -11177,7 +11189,7 @@ static void + end_divergent_if(isel_context* ctx, if_context* ic) + { + Block* BB_else_logical = ctx->block; +- append_logical_end(BB_else_logical); ++ append_logical_end(ctx); + + /* branch from logical else block to endif block */ + aco_ptr branch; +@@ -11222,7 +11234,7 @@ begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond) + { + assert(cond.regClass() == s1); + +- append_logical_end(ctx->block); ++ append_logical_end(ctx); + ctx->block->kind |= block_kind_uniform; + + aco_ptr branch; +@@ -11257,7 +11269,7 @@ begin_uniform_if_else(isel_context* ctx, if_context* ic) + Block* BB_then = ctx->block; + + if (!ctx->cf_info.has_branch) { +- append_logical_end(BB_then); ++ append_logical_end(ctx); + /* branch from then block to endif block */ + aco_ptr branch; + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); +@@ -11291,7 +11303,7 @@ end_uniform_if(isel_context* ctx, if_context* ic) + Block* BB_else = ctx->block; + + if (!ctx->cf_info.has_branch) { +- append_logical_end(BB_else); ++ append_logical_end(ctx); + /* branch from then block to endif block */ + aco_ptr branch; + branch.reset(create_instruction(aco_opcode::p_branch, Format::PSEUDO_BRANCH, 0, 1)); +@@ -12217,13 +12229,34 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c + ctx.program->is_callee = true; + + Instruction* startpgm = add_startpgm(&ctx, true); ++ ++ Builder bld(ctx.program, ctx.block); ++ ++ Operand stack_ptr_op; ++ if (ctx.program->gfx_level >= GFX9) ++ stack_ptr_op = Operand(ctx.callee_info.stack_ptr.def.getTemp()); ++ else ++ stack_ptr_op = Operand(load_scratch_resource(ctx.program, bld, true, true)); ++ stack_ptr_op.setLateKill(true); ++ bld.pseudo(aco_opcode::p_spill_preserved_vgpr, bld.def(s1), bld.def(bld.lm), ++ bld.def(s1, scc), stack_ptr_op); ++ + append_logical_start(ctx.block); + split_arguments(&ctx, startpgm); + visit_cf_list(&ctx, &impl->body); +- append_logical_end(ctx.block); ++ append_logical_end(&ctx); + ctx.block->kind |= block_kind_uniform; + + if (ctx.next_pc != Temp()) { ++ bld = Builder(ctx.program, ctx.block); ++ if (ctx.program->gfx_level >= GFX9) ++ stack_ptr_op = Operand(ctx.callee_info.stack_ptr.def.getTemp()); ++ else ++ stack_ptr_op = Operand(load_scratch_resource(ctx.program, bld, true, true)); ++ stack_ptr_op.setLateKill(true); ++ bld.pseudo(aco_opcode::p_reload_preserved_vgpr, bld.def(s1), bld.def(bld.lm), ++ bld.def(s1, scc), stack_ptr_op); ++ + 
insert_return(ctx); + + Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc)); +@@ -12503,7 +12536,7 @@ select_shader(isel_context& ctx, nir_shader* nir, const bool need_startpgm, cons + if (need_endpgm) { + program->config->float_mode = program->blocks[0].fp_mode.val; + +- append_logical_end(ctx.block); ++ append_logical_end(&ctx); + ctx.block->kind |= block_kind_uniform; + + if ((!program->info.ps.has_epilog && !is_first_stage_of_merged_shader) || +@@ -12918,7 +12951,7 @@ select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shade + + program->config->float_mode = program->blocks[0].fp_mode.val; + +- append_logical_end(ctx.block); ++ append_logical_end(&ctx); + ctx.block->kind |= block_kind_uniform; + bld.sopp(aco_opcode::s_endpgm); + +@@ -13864,7 +13897,7 @@ select_ps_epilog(Program* program, void* pinfo, ac_shader_config* config, + + program->config->float_mode = program->blocks[0].fp_mode.val; + +- append_logical_end(ctx.block); ++ append_logical_end(&ctx); + ctx.block->kind |= block_kind_export_end; + bld.reset(ctx.block); + bld.sopp(aco_opcode::s_endpgm); +@@ -13900,7 +13933,7 @@ select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config, + + program->config->float_mode = program->blocks[0].fp_mode.val; + +- append_logical_end(ctx.block); ++ append_logical_end(&ctx); + + build_end_with_regs(&ctx, regs); + +diff --git a/src/amd/compiler/aco_interface.cpp b/src/amd/compiler/aco_interface.cpp +index 5c7956caeedd4..921fc3894c694 100644 +--- a/src/amd/compiler/aco_interface.cpp ++++ b/src/amd/compiler/aco_interface.cpp +@@ -172,6 +172,8 @@ aco_postprocess_shader(const struct aco_compiler_options* options, + validate(program.get()); + } + ++ spill_preserved(program.get()); ++ + ssa_elimination(program.get()); + } + +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index ccf2710d5453f..e2101ae5162bc 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2079,7 +2079,9 @@ is_dead(const std::vector& uses, const Instruction* instr) + { + if (instr->definitions.empty() || instr->isBranch() || instr->isCall() || + instr->opcode == aco_opcode::p_startpgm || instr->opcode == aco_opcode::p_init_scratch || +- instr->opcode == aco_opcode::p_dual_src_export_gfx11) ++ instr->opcode == aco_opcode::p_dual_src_export_gfx11 || ++ instr->opcode == aco_opcode::p_spill_preserved_vgpr || ++ instr->opcode == aco_opcode::p_reload_preserved_vgpr) + return false; + + if (std::any_of(instr->definitions.begin(), instr->definitions.end(), +@@ -2492,6 +2494,7 @@ void setup_reduce_temp(Program* program); + void lower_to_cssa(Program* program); + void register_allocation(Program* program, ra_test_policy = {}); + void reindex_ssa(Program* program); ++void spill_preserved(Program* program); + void ssa_elimination(Program* program); + void lower_to_hw_instr(Program* program); + void schedule_program(Program* program); +@@ -2608,4 +2611,10 @@ extern const Info instr_info; + + } // namespace aco + ++namespace std { ++template <> struct hash { ++ size_t operator()(aco::PhysReg temp) const noexcept { return std::hash{}(temp.reg_b); } ++}; ++} // namespace std ++ + #endif /* ACO_IR_H */ +diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py +index 696a5a945b310..8d0b93a044270 100644 +--- a/src/amd/compiler/aco_opcodes.py ++++ b/src/amd/compiler/aco_opcodes.py +@@ -333,6 +333,9 @@ insn("p_unit_test") + + insn("p_callee_stack_ptr") + ++insn("p_spill_preserved_vgpr") ++insn("p_reload_preserved_vgpr") 
++ + insn("p_create_vector") + insn("p_extract_vector") + insn("p_split_vector") +diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp +index d5be9e9302d66..a199216907a5e 100644 +--- a/src/amd/compiler/aco_opt_value_numbering.cpp ++++ b/src/amd/compiler/aco_opt_value_numbering.cpp +@@ -313,7 +313,8 @@ can_eliminate(aco_ptr& instr) + if (instr->definitions.empty() || instr->opcode == aco_opcode::p_phi || + instr->opcode == aco_opcode::p_linear_phi || + instr->opcode == aco_opcode::p_pops_gfx9_add_exiting_wave_id || +- instr->definitions[0].isNoCSE()) ++ instr->definitions[0].isNoCSE() || instr->opcode == aco_opcode::p_spill_preserved_vgpr || ++ instr->opcode == aco_opcode::p_reload_preserved_vgpr) + return false; + + return true; +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index eb87bf111f5a8..88f40f894e79c 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -19,12 +19,6 @@ + #include + #include + +-namespace std { +-template <> struct hash { +- size_t operator()(aco::PhysReg temp) const noexcept { return std::hash{}(temp.reg_b); } +-}; +-} // namespace std +- + namespace aco { + namespace { + +@@ -2492,6 +2486,23 @@ init_reg_file(ra_ctx& ctx, const std::vector& live_out_per_block, Block& + const IDSet& live_in = live_out_per_block[block.index]; + assert(block.index != 0 || live_in.empty()); + ++ /* Callee shaders only get a chance to spill preserved registers after p_startpgm. ++ * To make sure nothing uses these regs until we can spill them, block them here. ++ */ ++ if (block.index == 0 && ctx.program->is_callee) { ++ PhysRegInterval preserved_vgpr_lo = PhysRegInterval{ ++ .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count}, ++ .size = ++ ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256 - ctx.program->arg_vgpr_count, ++ }; ++ PhysRegInterval preserved_vgpr_hi = PhysRegInterval{ ++ .lo_ = ctx.program->callee_abi.clobberedRegs.vgpr.hi(), ++ .size = PhysReg{256u + ctx.vgpr_limit} - ctx.program->callee_abi.clobberedRegs.vgpr.hi(), ++ }; ++ register_file.block(preserved_vgpr_hi); ++ register_file.block(preserved_vgpr_lo); ++ } ++ + if (block.kind & block_kind_loop_header) { + ctx.loop_header.emplace_back(block.index); + /* already rename phis incoming value */ +@@ -3093,6 +3104,31 @@ register_allocation(Program* program, ra_test_policy policy) + instructions.emplace_back(std::move(instr)); + break; + } ++ if (instr->opcode == aco_opcode::p_reload_preserved_vgpr && block.linear_succs.empty()) { ++ PhysRegInterval preserved_vgpr_lo = PhysRegInterval{ ++ .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count}, ++ .size = ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256u - ++ ctx.program->arg_vgpr_count, ++ }; ++ PhysRegInterval preserved_vgpr_hi = PhysRegInterval{ ++ .lo_ = ctx.program->callee_abi.clobberedRegs.vgpr.hi(), ++ .size = ++ PhysReg{256u + ctx.vgpr_limit} - ctx.program->callee_abi.clobberedRegs.vgpr.hi(), ++ }; ++ std::vector vars = collect_vars(ctx, register_file, preserved_vgpr_lo); ++ std::vector vars2 = collect_vars(ctx, register_file, preserved_vgpr_hi); ++ vars.insert(vars.end(), vars2.begin(), vars2.end()); ++ ++ register_file.block(preserved_vgpr_lo); ++ register_file.block(preserved_vgpr_hi); ++ ++ ASSERTED bool success = false; ++ success = get_regs_for_copies(ctx, register_file, parallelcopy, vars, instr, ++ PhysRegInterval{}); ++ assert(success); ++ ++ update_renames(ctx, register_file, 
parallelcopy, instr, (UpdateRenames)0); ++ } + + assert(!is_phi(instr)); + +@@ -3397,6 +3433,20 @@ register_allocation(Program* program, ra_test_policy policy) + instr->format = asVOP3(instr->format); + } + ++ if (instr->opcode == aco_opcode::p_spill_preserved_vgpr) { ++ PhysRegInterval preserved_vgpr_lo = PhysRegInterval{ ++ .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count}, ++ .size = ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256u - ++ ctx.program->arg_vgpr_count, ++ }; ++ PhysRegInterval preserved_vgpr_hi = PhysRegInterval{ ++ .lo_ = ctx.program->callee_abi.clobberedRegs.vgpr.hi(), ++ .size = ++ PhysReg{256u + ctx.vgpr_limit} - ctx.program->callee_abi.clobberedRegs.vgpr.hi(), ++ }; ++ register_file.clear(preserved_vgpr_hi); ++ register_file.clear(preserved_vgpr_lo); ++ } + instructions.emplace_back(std::move(*instr_it)); + + } /* end for Instr */ +diff --git a/src/amd/compiler/aco_spill_preserved.cpp b/src/amd/compiler/aco_spill_preserved.cpp +new file mode 100644 +index 0000000000000..a6a6dd04c2d9f +--- /dev/null ++++ b/src/amd/compiler/aco_spill_preserved.cpp +@@ -0,0 +1,547 @@ ++/* ++ * Copyright © 2024 Valve Corporation ++ * ++ * SPDX-License-Identifier: MIT ++ */ ++ ++#include "aco_builder.h" ++#include "aco_ir.h" ++ ++#include ++#include ++ ++namespace aco { ++ ++struct postdom_info { ++ unsigned logical_imm_postdom; ++ unsigned linear_imm_postdom; ++}; ++ ++struct spill_preserved_ctx { ++ Program* program; ++ aco::monotonic_buffer_resource memory; ++ ++ aco::unordered_map preserved_spill_offsets; ++ aco::unordered_set preserved_regs; ++ aco::unordered_set preserved_linear_regs; ++ ++ aco::unordered_map> reg_block_uses; ++ std::vector dom_info; ++ ++ unsigned next_preserved_offset; ++ ++ explicit spill_preserved_ctx(Program* program_) ++ : program(program_), memory(), preserved_spill_offsets(memory), preserved_regs(memory), ++ preserved_linear_regs(memory), reg_block_uses(memory), ++ next_preserved_offset( ++ DIV_ROUND_UP(program_->config->scratch_bytes_per_wave, program_->wave_size)) ++ { ++ dom_info.resize(program->blocks.size(), {-1u, -1u}); ++ } ++}; ++ ++void ++add_instr(spill_preserved_ctx& ctx, unsigned block_index, bool seen_reload, ++ const aco_ptr& instr) ++{ ++ for (auto& def : instr->definitions) { ++ assert(def.isFixed()); ++ if (def.regClass().type() == RegType::sgpr) ++ continue; ++ /* Round down subdword registers to their base */ ++ PhysReg start_reg = PhysReg{def.physReg().reg()}; ++ for (auto reg : PhysRegInterval{start_reg, def.regClass().size()}) { ++ if (reg < 256u + ctx.program->arg_vgpr_count) ++ continue; ++ if (ctx.program->callee_abi.clobberedRegs.vgpr.contains(reg) && ++ !def.regClass().is_linear_vgpr()) ++ continue; ++ /* Don't count start_linear_vgpr without a copy as a use since the value doesn't matter. ++ * This allows us to move reloads a bit further up the CF. 
++ */ ++ if (instr->opcode == aco_opcode::p_start_linear_vgpr && instr->operands.empty()) ++ continue; ++ ++ if (def.regClass().is_linear_vgpr()) ++ ctx.preserved_linear_regs.insert(reg); ++ else ++ ctx.preserved_regs.insert(reg); ++ ++ if (seen_reload) { ++ if (def.regClass().is_linear_vgpr()) ++ for (auto succ : ctx.program->blocks[block_index].linear_succs) ++ ctx.reg_block_uses[reg].emplace(succ); ++ else ++ for (auto succ : ctx.program->blocks[block_index].logical_succs) ++ ctx.reg_block_uses[reg].emplace(succ); ++ } else { ++ ctx.reg_block_uses[reg].emplace(block_index); ++ } ++ } ++ } ++ for (auto& op : instr->operands) { ++ assert(op.isFixed()); ++ if (op.regClass().type() == RegType::sgpr) ++ continue; ++ if (op.isConstant()) ++ continue; ++ /* Round down subdword registers to their base */ ++ PhysReg start_reg = PhysReg{op.physReg().reg()}; ++ for (auto reg : PhysRegInterval{start_reg, op.regClass().size()}) { ++ if (reg < 256u + ctx.program->arg_vgpr_count) ++ continue; ++ /* Don't count end_linear_vgpr as a use since the value doesn't matter. ++ * This allows us to move reloads a bit further up the CF. ++ */ ++ if (instr->opcode == aco_opcode::p_end_linear_vgpr) ++ continue; ++ if (ctx.program->callee_abi.clobberedRegs.vgpr.contains(reg) && ++ !op.regClass().is_linear_vgpr()) ++ continue; ++ if (op.regClass().is_linear_vgpr()) ++ ctx.preserved_linear_regs.insert(reg); ++ ++ if (seen_reload) { ++ if (op.regClass().is_linear_vgpr()) ++ for (auto succ : ctx.program->blocks[block_index].linear_succs) ++ ctx.reg_block_uses[reg].emplace(succ); ++ else ++ for (auto succ : ctx.program->blocks[block_index].logical_succs) ++ ctx.reg_block_uses[reg].emplace(succ); ++ } else { ++ ctx.reg_block_uses[reg].emplace(block_index); ++ } ++ } ++ } ++} ++ ++void ++spill_preserved(spill_preserved_ctx& ctx, PhysReg reg, std::vector>& spills, ++ std::vector>& lvgpr_spills) ++{ ++ unsigned offset; ++ ++ auto offset_iter = ctx.preserved_spill_offsets.find(reg); ++ if (offset_iter == ctx.preserved_spill_offsets.end()) { ++ offset = ctx.next_preserved_offset; ++ ctx.next_preserved_offset += 4; ++ ctx.preserved_spill_offsets.emplace(reg, offset); ++ } else { ++ offset = offset_iter->second; ++ } ++ ++ if (ctx.preserved_linear_regs.find(reg) != ctx.preserved_linear_regs.end()) ++ lvgpr_spills.emplace_back(reg, offset); ++ else ++ spills.emplace_back(reg, offset); ++} ++ ++void ++emit_spills_reloads_internal(spill_preserved_ctx& ctx, Builder& bld, ++ std::vector>& spills, PhysReg stack_reg, ++ PhysReg soffset, bool reload, bool linear, bool soffset_valid) ++{ ++ if (spills.empty()) ++ return; ++ ++ int end_offset = spills.back().second; ++ int start_offset = spills.front().second; ++ if (ctx.program->gfx_level >= GFX9) ++ assert(end_offset - start_offset < ctx.program->dev.scratch_global_offset_max); ++ ++ bool overflow = ++ end_offset > ctx.program->dev.scratch_global_offset_max || ctx.program->gfx_level < GFX9; ++ if (overflow) { ++ if (ctx.program->gfx_level >= GFX9) ++ bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), Definition(scc, s1), ++ Operand(stack_reg, s1), Operand::c32(start_offset)); ++ else if (soffset_valid) ++ bld.sop2(aco_opcode::s_add_u32, Definition(soffset, s1), Definition(scc, s1), ++ Operand(soffset, s1), Operand::c32(start_offset * ctx.program->wave_size)); ++ else ++ bld.sop1(aco_opcode::s_mov_b32, Definition(soffset, s1), ++ Operand::c32(start_offset * ctx.program->wave_size)); ++ } ++ ++ Operand soffset_op; ++ if (ctx.program->gfx_level >= GFX9) ++ soffset_op = 
Operand(overflow ? soffset : stack_reg, s1); ++ else ++ soffset_op = soffset_valid || overflow ? Operand(soffset, s1) : Operand(sgpr_null, s1); ++ ++ for (const auto& spill : spills) { ++ if (ctx.program->gfx_level >= GFX9) { ++ if (reload) ++ bld.scratch(aco_opcode::scratch_load_dword, ++ Definition(spill.first, linear ? v1.as_linear() : v1), Operand(v1), ++ soffset_op, overflow ? spill.second - start_offset : spill.second, ++ memory_sync_info(storage_vgpr_spill, semantic_private)); ++ else ++ bld.scratch(aco_opcode::scratch_store_dword, Operand(v1), soffset_op, ++ Operand(spill.first, linear ? v1.as_linear() : v1), ++ overflow ? spill.second - start_offset : spill.second, ++ memory_sync_info(storage_vgpr_spill, semantic_private)); ++ } else { ++ if (reload) { ++ Instruction* instr = bld.mubuf( ++ aco_opcode::buffer_load_dword, Definition(spill.first, linear ? v1.as_linear() : v1), ++ Operand(stack_reg, s4), Operand(v1), soffset_op, ++ overflow ? spill.second - start_offset : spill.second, false); ++ instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); ++ instr->mubuf().cache.value = ac_swizzled; ++ } else { ++ Instruction* instr = ++ bld.mubuf(aco_opcode::buffer_store_dword, Operand(stack_reg, s4), Operand(v1), ++ soffset_op, Operand(spill.first, linear ? v1.as_linear() : v1), ++ overflow ? spill.second - start_offset : spill.second, false); ++ instr->mubuf().sync = memory_sync_info(storage_vgpr_spill, semantic_private); ++ instr->mubuf().cache.value = ac_swizzled; ++ } ++ } ++ } ++ ++ if (overflow && ctx.program->gfx_level < GFX9) ++ bld.sop2(aco_opcode::s_sub_i32, Definition(soffset, s1), Definition(scc, s1), ++ Operand(soffset, s1), Operand::c32(start_offset * ctx.program->wave_size)); ++} ++ ++void ++emit_spills_reloads(spill_preserved_ctx& ctx, std::vector>& instructions, ++ std::vector>::iterator& insert_point, ++ std::vector>& spills, ++ std::vector>& lvgpr_spills, bool reload) ++{ ++ auto spill_reload_compare = [](const auto& first, const auto& second) ++ { return first.second < second.second; }; ++ ++ std::sort(spills.begin(), spills.end(), spill_reload_compare); ++ std::sort(lvgpr_spills.begin(), lvgpr_spills.end(), spill_reload_compare); ++ ++ PhysReg stack_reg = (*insert_point)->operands[0].physReg(); ++ PhysReg soffset = (*insert_point)->definitions[0].physReg(); ++ PhysReg exec_backup = (*insert_point)->definitions[1].physReg(); ++ ++ std::vector> spill_instructions; ++ Builder bld(ctx.program, &spill_instructions); ++ ++ emit_spills_reloads_internal(ctx, bld, spills, stack_reg, soffset, reload, false, false); ++ if (!lvgpr_spills.empty()) { ++ bld.sop1(Builder::s_or_saveexec, Definition(exec_backup, bld.lm), Definition(scc, s1), ++ Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm)); ++ emit_spills_reloads_internal(ctx, bld, lvgpr_spills, stack_reg, soffset, reload, true, false); ++ bld.sop1(Builder::WaveSpecificOpcode::s_mov, Definition(exec, bld.lm), ++ Operand(exec_backup, bld.lm)); ++ } ++ ++ insert_point = instructions.erase(insert_point); ++ instructions.insert(insert_point, std::move_iterator(spill_instructions.begin()), ++ std::move_iterator(spill_instructions.end())); ++} ++ ++void ++init_block_info(spill_preserved_ctx& ctx) ++{ ++ unsigned cur_loop_header = -1u; ++ for (unsigned index = ctx.program->blocks.size() - 1; index < ctx.program->blocks.size();) { ++ const Block& block = ctx.program->blocks[index]; ++ ++ if (block.linear_succs.empty()) { ++ ctx.dom_info[index].logical_imm_postdom = block.index; ++ 
ctx.dom_info[index].linear_imm_postdom = block.index; ++ } else { ++ int new_logical_postdom = -1; ++ int new_linear_postdom = -1; ++ for (unsigned succ_idx : block.logical_succs) { ++ if ((int)ctx.dom_info[succ_idx].logical_imm_postdom == -1) { ++ assert(cur_loop_header == -1u || succ_idx >= cur_loop_header); ++ if (cur_loop_header == -1u) ++ cur_loop_header = succ_idx; ++ continue; ++ } ++ ++ if (new_logical_postdom == -1) { ++ new_logical_postdom = (int)succ_idx; ++ continue; ++ } ++ ++ while ((int)succ_idx != new_logical_postdom) { ++ if ((int)succ_idx < new_logical_postdom) ++ succ_idx = ctx.dom_info[succ_idx].logical_imm_postdom; ++ if ((int)succ_idx > new_logical_postdom) ++ new_logical_postdom = (int)ctx.dom_info[new_logical_postdom].logical_imm_postdom; ++ } ++ } ++ ++ for (unsigned succ_idx : block.linear_succs) { ++ if ((int)ctx.dom_info[succ_idx].linear_imm_postdom == -1) { ++ assert(cur_loop_header == -1u || succ_idx >= cur_loop_header); ++ if (cur_loop_header == -1u) ++ cur_loop_header = succ_idx; ++ continue; ++ } ++ ++ if (new_linear_postdom == -1) { ++ new_linear_postdom = (int)succ_idx; ++ continue; ++ } ++ ++ while ((int)succ_idx != new_linear_postdom) { ++ if ((int)succ_idx < new_linear_postdom) ++ succ_idx = ctx.dom_info[succ_idx].linear_imm_postdom; ++ if ((int)succ_idx > new_linear_postdom) ++ new_linear_postdom = (int)ctx.dom_info[new_linear_postdom].linear_imm_postdom; ++ } ++ } ++ ++ ctx.dom_info[index].logical_imm_postdom = new_logical_postdom; ++ ctx.dom_info[index].linear_imm_postdom = new_linear_postdom; ++ } ++ ++ bool seen_reload_vgpr = false; ++ for (auto& instr : block.instructions) { ++ if (instr->opcode == aco_opcode::p_reload_preserved_vgpr) { ++ seen_reload_vgpr = true; ++ continue; ++ } ++ ++ add_instr(ctx, index, seen_reload_vgpr, instr); ++ } ++ ++ /* Process predecessors of loop headers again, since post-dominance information of the header ++ * was not available the first time ++ */ ++ unsigned next_idx = index - 1; ++ if (index == cur_loop_header) { ++ assert(block.kind & block_kind_loop_header); ++ for (auto pred : block.logical_preds) ++ if (ctx.dom_info[pred].logical_imm_postdom == -1u) ++ next_idx = std::max(next_idx, pred); ++ for (auto pred : block.linear_preds) ++ if (ctx.dom_info[pred].linear_imm_postdom == -1u) ++ next_idx = std::max(next_idx, pred); ++ cur_loop_header = -1u; ++ } ++ index = next_idx; ++ } ++} ++ ++struct call_spill { ++ unsigned instr_idx; ++ std::vector> spills; ++}; ++ ++void ++emit_call_spills(spill_preserved_ctx& ctx) ++{ ++ std::set linear_vgprs; ++ std::unordered_map> block_call_spills; ++ ++ unsigned max_scratch_offset = ctx.next_preserved_offset; ++ ++ for (auto& block : ctx.program->blocks) { ++ for (auto it = block.instructions.begin(); it != block.instructions.end(); ++it) { ++ auto& instr = *it; ++ ++ if (instr->opcode == aco_opcode::p_call) { ++ unsigned scratch_offset = ctx.next_preserved_offset; ++ struct call_spill spill = { ++ .instr_idx = (unsigned)(it - block.instructions.begin()), ++ }; ++ for (auto& reg : linear_vgprs) { ++ if (!instr->call().abi.clobberedRegs.vgpr.contains(reg)) ++ continue; ++ spill.spills.emplace_back(reg, scratch_offset); ++ scratch_offset += 4; ++ } ++ max_scratch_offset = std::max(max_scratch_offset, scratch_offset); ++ ++ block_call_spills[block.index].emplace_back(std::move(spill)); ++ } else if (instr->opcode == aco_opcode::p_start_linear_vgpr) { ++ linear_vgprs.insert(instr->definitions[0].physReg()); ++ } else if (instr->opcode == aco_opcode::p_end_linear_vgpr) { ++ for 
(auto& op : instr->operands) ++ linear_vgprs.erase(op.physReg()); ++ } ++ } ++ } ++ ++ /* XXX: This should also be possible on GFX9, although small negative scratch offsets ++ * seem to hang the GPU, so disable it there now. ++ */ ++ if (ctx.program->gfx_level >= GFX10) ++ for (auto& block_calls : block_call_spills) ++ for (auto& call_spills : block_calls.second) ++ for (auto& spill : call_spills.spills) ++ spill.second -= max_scratch_offset; ++ ++ for (auto& block_calls : block_call_spills) { ++ for (unsigned i = 0; i < block_calls.second.size(); ++i) { ++ auto& block = ctx.program->blocks[block_calls.first]; ++ auto& call = block_calls.second[i]; ++ auto& instr = block.instructions[call.instr_idx]; ++ auto it = block.instructions.begin() + call.instr_idx; ++ unsigned num_inserted_instrs = 0; ++ ++ std::vector> spill_instructions; ++ Builder bld(ctx.program, &spill_instructions); ++ ++ PhysReg stack_reg = instr->operands[1].physReg(); ++ PhysReg soffset = PhysReg{UINT32_MAX}; ++ PhysReg scratch_rsrc = PhysReg{UINT32_MAX}; ++ if (ctx.program->gfx_level < GFX9) ++ scratch_rsrc = instr->operands.back().physReg(); ++ ++ if (ctx.program->gfx_level >= GFX10) { ++ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1), ++ Operand(stack_reg, s1), Operand::c32(max_scratch_offset)); ++ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, false, true, ++ false); ++ } else if (ctx.program->gfx_level == GFX9) { ++ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, false, true, ++ false); ++ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1), ++ Operand(stack_reg, s1), Operand::c32(max_scratch_offset)); ++ } else { ++ emit_spills_reloads_internal(ctx, bld, call.spills, scratch_rsrc, stack_reg, false, ++ true, true); ++ bld.sop2(aco_opcode::s_add_u32, Definition(stack_reg, s1), Definition(scc, s1), ++ Operand(stack_reg, s1), ++ Operand::c32(max_scratch_offset * ctx.program->wave_size)); ++ } ++ ++ it = block.instructions.insert(it, std::move_iterator(spill_instructions.begin()), ++ std::move_iterator(spill_instructions.end())); ++ it += spill_instructions.size() + 1; ++ num_inserted_instrs += spill_instructions.size(); ++ ++ spill_instructions.clear(); ++ ++ if (ctx.program->gfx_level >= GFX10) { ++ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, true, true, ++ false); ++ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1), ++ Operand(stack_reg, s1), Operand::c32(max_scratch_offset)); ++ } else if (ctx.program->gfx_level == GFX9) { ++ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1), ++ Operand(stack_reg, s1), Operand::c32(max_scratch_offset)); ++ emit_spills_reloads_internal(ctx, bld, call.spills, stack_reg, soffset, true, true, ++ false); ++ } else { ++ bld.sop2(aco_opcode::s_sub_u32, Definition(stack_reg, s1), Definition(scc, s1), ++ Operand(stack_reg, s1), ++ Operand::c32(max_scratch_offset * ctx.program->wave_size)); ++ emit_spills_reloads_internal(ctx, bld, call.spills, scratch_rsrc, stack_reg, true, true, ++ true); ++ } ++ ++ block.instructions.insert(it, std::move_iterator(spill_instructions.begin()), ++ std::move_iterator(spill_instructions.end())); ++ num_inserted_instrs += spill_instructions.size(); ++ ++ for (unsigned j = i + 1; j < block_calls.second.size(); ++j) ++ block_calls.second[j].instr_idx += num_inserted_instrs; ++ } ++ } ++ ++ ctx.next_preserved_offset = max_scratch_offset; ++} ++ ++void 
++emit_preserved_spills(spill_preserved_ctx& ctx) ++{ ++ std::vector> spills; ++ std::vector> lvgpr_spills; ++ ++ for (auto reg : ctx.preserved_regs) ++ spill_preserved(ctx, reg, spills, lvgpr_spills); ++ for (auto reg : ctx.preserved_linear_regs) ++ spill_preserved(ctx, reg, spills, lvgpr_spills); ++ ++ auto start_instr = std::find_if(ctx.program->blocks.front().instructions.begin(), ++ ctx.program->blocks.front().instructions.end(), ++ [](const auto& instr) ++ { return instr->opcode == aco_opcode::p_spill_preserved_vgpr; }); ++ emit_spills_reloads(ctx, ctx.program->blocks.front().instructions, start_instr, spills, ++ lvgpr_spills, false); ++ ++ auto block_reloads = ++ std::vector>>(ctx.program->blocks.size()); ++ auto lvgpr_block_reloads = ++ std::vector>>(ctx.program->blocks.size()); ++ ++ for (auto it = ctx.reg_block_uses.begin(); it != ctx.reg_block_uses.end();) { ++ bool is_linear = ctx.preserved_linear_regs.find(it->first) != ctx.preserved_linear_regs.end(); ++ ++ if (!is_linear && ctx.preserved_regs.find(it->first) == ctx.preserved_regs.end()) { ++ it = ctx.reg_block_uses.erase(it); ++ continue; ++ } ++ ++ unsigned min_common_postdom = 0; ++ ++ for (auto succ_idx : it->second) { ++ while (succ_idx != min_common_postdom) { ++ if (min_common_postdom < succ_idx) { ++ min_common_postdom = is_linear ++ ? ctx.dom_info[min_common_postdom].linear_imm_postdom ++ : ctx.dom_info[min_common_postdom].logical_imm_postdom; ++ } else { ++ succ_idx = is_linear ? ctx.dom_info[succ_idx].linear_imm_postdom ++ : ctx.dom_info[succ_idx].logical_imm_postdom; ++ } ++ } ++ } ++ ++ while (std::find_if(ctx.program->blocks[min_common_postdom].instructions.rbegin(), ++ ctx.program->blocks[min_common_postdom].instructions.rend(), ++ [](const auto& instr) { ++ return instr->opcode == aco_opcode::p_reload_preserved_vgpr; ++ }) == ctx.program->blocks[min_common_postdom].instructions.rend()) ++ min_common_postdom = is_linear ? 
ctx.dom_info[min_common_postdom].linear_imm_postdom ++ : ctx.dom_info[min_common_postdom].logical_imm_postdom; ++ ++ if (is_linear) { ++ lvgpr_block_reloads[min_common_postdom].emplace_back( ++ it->first, ctx.preserved_spill_offsets[it->first]); ++ ctx.preserved_linear_regs.erase(it->first); ++ } else { ++ block_reloads[min_common_postdom].emplace_back(it->first, ++ ctx.preserved_spill_offsets[it->first]); ++ ctx.preserved_regs.erase(it->first); ++ } ++ ++ it = ctx.reg_block_uses.erase(it); ++ } ++ ++ for (unsigned i = 0; i < ctx.program->blocks.size(); ++i) { ++ auto instr_it = std::find_if( ++ ctx.program->blocks[i].instructions.rbegin(), ctx.program->blocks[i].instructions.rend(), ++ [](const auto& instr) { return instr->opcode == aco_opcode::p_reload_preserved_vgpr; }); ++ if (instr_it == ctx.program->blocks[i].instructions.rend()) { ++ assert(block_reloads[i].empty() && lvgpr_block_reloads[i].empty()); ++ continue; ++ } ++ auto end_instr = std::prev(instr_it.base()); ++ emit_spills_reloads(ctx, ctx.program->blocks[i].instructions, end_instr, block_reloads[i], ++ lvgpr_block_reloads[i], true); ++ } ++} ++ ++void ++spill_preserved(Program* program) ++{ ++ if (!program->is_callee) ++ return; ++ ++ spill_preserved_ctx ctx(program); ++ ++ init_block_info(ctx); ++ ++ if (!program->bypass_reg_preservation) ++ emit_preserved_spills(ctx); ++ ++ emit_call_spills(ctx); ++ ++ program->config->scratch_bytes_per_wave = ctx.next_preserved_offset * program->wave_size; ++} ++} // namespace aco +diff --git a/src/amd/compiler/meson.build b/src/amd/compiler/meson.build +index b235f626f97af..38006e78543dc 100644 +--- a/src/amd/compiler/meson.build ++++ b/src/amd/compiler/meson.build +@@ -62,6 +62,7 @@ libaco_files = files( + 'aco_scheduler.cpp', + 'aco_scheduler_ilp.cpp', + 'aco_spill.cpp', ++ 'aco_spill_preserved.cpp', + 'aco_ssa_elimination.cpp', + 'aco_statistics.cpp', + 'aco_util.h', +-- +GitLab + + +From 35220611d653ced3a7ed06565c71815e9d135b5e Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 13 May 2024 06:26:51 +0200 +Subject: [PATCH 58/71] aco: Add cur_reg_demand to Program + +For checking whether spilling of preserved SGPRs is needed. 
+--- + src/amd/compiler/aco_ir.h | 1 + + src/amd/compiler/aco_live_var_analysis.cpp | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index e2101ae5162bc..6f510fef17a04 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2345,6 +2345,7 @@ public: + std::vector blocks; + std::vector temp_rc = {s1}; + RegisterDemand max_reg_demand = RegisterDemand(); ++ RegisterDemand cur_reg_demand = RegisterDemand(); + ac_shader_config* config; + struct aco_shader_info info; + enum amd_gfx_level gfx_level; +diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp +index 64814e983bb2e..52561464b0e1e 100644 +--- a/src/amd/compiler/aco_live_var_analysis.cpp ++++ b/src/amd/compiler/aco_live_var_analysis.cpp +@@ -565,6 +565,7 @@ update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) + uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + ++ program->cur_reg_demand = new_demand; + /* this won't compile, register pressure reduction necessary */ + if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) { + program->num_waves = 0; +-- +GitLab + + +From 20e1d11ec9b648ecc2d41bd5974c91545880e7b8 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 13 May 2024 06:28:31 +0200 +Subject: [PATCH 59/71] aco: Spill callee-preserved SGPRs + +--- + src/amd/compiler/aco_opcodes.py | 2 + + src/amd/compiler/aco_register_allocation.cpp | 46 ++++++- + src/amd/compiler/aco_scheduler.cpp | 8 ++ + src/amd/compiler/aco_spill.cpp | 119 +++++++++++++++++-- + 4 files changed, 167 insertions(+), 8 deletions(-) + +diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py +index 8d0b93a044270..a2f0876838f92 100644 +--- a/src/amd/compiler/aco_opcodes.py ++++ b/src/amd/compiler/aco_opcodes.py +@@ -334,7 +334,9 @@ insn("p_unit_test") + insn("p_callee_stack_ptr") + + insn("p_spill_preserved_vgpr") ++insn("p_spill_preserved_sgpr") + insn("p_reload_preserved_vgpr") ++insn("p_reload_preserved_sgpr") + + insn("p_create_vector") + insn("p_extract_vector") +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index 88f40f894e79c..b8915e520e8e1 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -3054,11 +3054,35 @@ register_allocation(Program* program, ra_test_policy policy) + ra_ctx ctx(program, policy); + get_affinities(ctx); + ++ std::unordered_set blocked_sgpr; ++ if (ctx.program->is_callee) { ++ PhysRegInterval preserved_sgpr_lo = PhysRegInterval{ ++ .lo_ = PhysReg{ctx.program->arg_sgpr_count}, ++ .size = ctx.program->callee_abi.clobberedRegs.sgpr.lo() - ctx.program->arg_sgpr_count, ++ }; ++ PhysRegInterval preserved_sgpr_hi = PhysRegInterval{ ++ .lo_ = ctx.program->callee_abi.clobberedRegs.sgpr.hi(), ++ .size = PhysReg{ctx.sgpr_limit} - ctx.program->callee_abi.clobberedRegs.sgpr.hi(), ++ }; ++ for (auto reg : preserved_sgpr_lo) { ++ blocked_sgpr.insert(reg); ++ adjust_max_used_regs(ctx, RegClass::s1, reg); ++ } ++ for (auto reg : preserved_sgpr_hi) { ++ blocked_sgpr.insert(reg); ++ adjust_max_used_regs(ctx, RegClass::s1, reg); ++ } ++ } ++ + for (Block& block : program->blocks) { + ctx.block = █ + + /* initialize register file */ + RegisterFile register_file = init_reg_file(ctx, program->live.live_in, block); ++ for (auto& reg : blocked_sgpr) { 
++ if (register_file.is_empty_or_blocked(reg)) ++ register_file.block(reg, s1); ++ } + ctx.war_hint.reset(); + ctx.rr_vgpr_it = {PhysReg{256}}; + ctx.rr_sgpr_it = {PhysReg{0}}; +@@ -3104,7 +3128,27 @@ register_allocation(Program* program, ra_test_policy policy) + instructions.emplace_back(std::move(instr)); + break; + } +- if (instr->opcode == aco_opcode::p_reload_preserved_vgpr && block.linear_succs.empty()) { ++ if (instr->opcode == aco_opcode::p_spill_preserved_sgpr) { ++ if (register_file.is_blocked(instr->operands[0].physReg())) ++ register_file.clear(instr->operands[0]); ++ blocked_sgpr.erase(instr->operands[0].physReg()); ++ continue; ++ } else if (instr->opcode == aco_opcode::p_reload_preserved_sgpr) { ++ blocked_sgpr.insert(instr->operands[0].physReg()); ++ std::vector vars = collect_vars( ++ ctx, register_file, {instr->operands[0].physReg(), instr->operands[0].size()}); ++ register_file.block(instr->operands[0].physReg(), instr->operands[0].regClass()); ++ ASSERTED bool success = false; ++ success = get_regs_for_copies(ctx, register_file, parallelcopy, vars, instr, ++ PhysRegInterval{}); ++ assert(success); ++ ++ update_renames(ctx, register_file, parallelcopy, instr, (UpdateRenames)0); ++ register_file.block(instr->operands[0].physReg(), instr->operands[0].regClass()); ++ emit_parallel_copy(ctx, parallelcopy, instr, instructions, temp_in_scc, register_file); ++ continue; ++ } else if (instr->opcode == aco_opcode::p_reload_preserved_vgpr && ++ block.linear_succs.empty()) { + PhysRegInterval preserved_vgpr_lo = PhysRegInterval{ + .lo_ = PhysReg{256u + ctx.program->arg_vgpr_count}, + .size = ctx.program->callee_abi.clobberedRegs.vgpr.lo() - 256u - +diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp +index 4115c0bf3d7cf..e6eb1e49a4021 100644 +--- a/src/amd/compiler/aco_scheduler.cpp ++++ b/src/amd/compiler/aco_scheduler.cpp +@@ -1266,6 +1266,14 @@ schedule_program(Program* program) + assert(ctx.num_waves > 0); + ctx.mv.max_registers = {int16_t(get_addr_vgpr_from_waves(program, ctx.num_waves * wave_fac) - 2), + int16_t(get_addr_sgpr_from_waves(program, ctx.num_waves * wave_fac))}; ++ /* If not all preserved SGPRs in callee shaders were spilled, don't try using them for ++ * scheduling. ++ */ ++ if (program->is_callee) { ++ ctx.mv.max_registers.sgpr = ++ std::max(std::min(ctx.mv.max_registers.sgpr, program->cur_reg_demand.sgpr), ++ (int16_t)program->callee_abi.clobberedRegs.sgpr.size); ++ } + + /* NGG culling shaders are very sensitive to position export scheduling. 
+ * Schedule less aggressively when early primitive export is used, and +diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp +index c271cbcf01eb8..e143b51809570 100644 +--- a/src/amd/compiler/aco_spill.cpp ++++ b/src/amd/compiler/aco_spill.cpp +@@ -75,6 +75,8 @@ struct spill_ctx { + std::vector> spills_entry; + std::vector> spills_exit; + ++ std::vector preserved_spill_ids; ++ + std::vector processed; + std::vector loop; + +@@ -138,11 +140,27 @@ struct spill_ctx { + for (auto pair : loop.back().spills) + add_interference(spill_id, pair.second); + } ++ for (auto id : preserved_spill_ids) ++ add_interference(spill_id, id); + + spills[to_spill] = spill_id; + return spill_id; + } + ++ uint32_t add_preserved_spill(RegClass rc, ++ std::vector>& block_spills) ++ { ++ const uint32_t spill_id = allocate_spill_id(rc); ++ for (auto& spills : block_spills) ++ for (auto pair : spills) ++ add_interference(spill_id, pair.second); ++ for (auto id : preserved_spill_ids) ++ add_interference(spill_id, id); ++ preserved_spill_ids.push_back(spill_id); ++ ++ return spill_id; ++ } ++ + void add_interference(uint32_t first, uint32_t second) + { + if (interferences[first].first.type() != interferences[second].first.type()) +@@ -1461,6 +1479,8 @@ end_unused_spill_vgprs(spill_ctx& ctx, Block& block, std::vector& vgpr_spi + if (pair.first.type() == RegType::sgpr && ctx.is_reloaded[pair.second]) + is_used[slots[pair.second] / ctx.wave_size] = true; + } ++ for (auto preserved : ctx.preserved_spill_ids) ++ is_used[slots[preserved] / ctx.wave_size] = true; + + std::vector temps; + for (unsigned i = 0; i < vgpr_spill_temps.size(); i++) { +@@ -1635,6 +1655,13 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) + } + } + ++ if (!(*it)->definitions[0].isTemp()) { ++ auto id_it = std::find(ctx.preserved_spill_ids.begin(), ++ ctx.preserved_spill_ids.end(), spill_id); ++ assert(id_it != ctx.preserved_spill_ids.end()); ++ ctx.preserved_spill_ids.erase(id_it); ++ } ++ + /* reload sgpr: just add the vgpr temp to operands */ + Instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1); + reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); +@@ -1653,6 +1680,37 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) + ctx.program->config->scratch_bytes_per_wave += ctx.vgpr_spill_slots * 4 * ctx.program->wave_size; + } + ++void ++spill_reload_preserved_sgpr(spill_ctx& ctx, std::vector>& spill_instructions, ++ std::vector>& reload_instructions, PhysReg reg) ++{ ++ uint32_t spill_id = ctx.add_preserved_spill(RegClass::s1, ctx.spills_exit); ++ ++ aco_ptr spill{create_instruction(aco_opcode::p_spill, Format::PSEUDO, 2, 0)}; ++ spill->operands[0] = Operand(reg, RegClass::s1); ++ spill->operands[1] = Operand::c32(spill_id); ++ ++ aco_ptr unblock{ ++ create_instruction(aco_opcode::p_spill_preserved_sgpr, Format::PSEUDO, 1, 0)}; ++ unblock->operands[0] = Operand(reg, RegClass::s1); ++ ++ spill_instructions.emplace_back(std::move(spill)); ++ spill_instructions.emplace_back(std::move(unblock)); ++ ++ aco_ptr block{ ++ create_instruction(aco_opcode::p_reload_preserved_sgpr, Format::PSEUDO, 1, 0)}; ++ block->operands[0] = Operand(reg, RegClass::s1); ++ ++ aco_ptr reload{create_instruction(aco_opcode::p_reload, Format::PSEUDO, 1, 1)}; ++ reload->operands[0] = Operand::c32(spill_id); ++ reload->definitions[0] = Definition(reg, RegClass::s1); ++ ++ reload_instructions.emplace_back(std::move(block)); ++ reload_instructions.emplace_back(std::move(reload)); ++ 
++ ctx.is_reloaded[spill_id] = true; ++} ++ + } /* end namespace */ + + void +@@ -1663,8 +1721,16 @@ spill(Program* program) + + program->progress = CompilationProgress::after_spilling; + ++ const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); ++ const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); ++ uint16_t abi_sgpr_limit = ++ std::min((uint16_t)(program->callee_abi.clobberedRegs.sgpr.size + program->arg_sgpr_count), ++ sgpr_limit); ++ if (!program->is_callee) ++ abi_sgpr_limit = sgpr_limit; ++ + /* no spilling when register pressure is low enough */ +- if (program->num_waves > 0) ++ if (program->num_waves > 0 && program->cur_reg_demand.sgpr <= abi_sgpr_limit) + return; + + /* lower to CSSA before spilling to ensure correctness w.r.t. phis */ +@@ -1672,14 +1738,12 @@ spill(Program* program) + + /* calculate target register demand */ + const RegisterDemand demand = program->max_reg_demand; /* current max */ +- const uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); +- const uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + uint16_t extra_vgprs = 0; + uint16_t extra_sgprs = 0; + + /* calculate extra VGPRs required for spilling SGPRs */ +- if (demand.sgpr > sgpr_limit) { +- unsigned sgpr_spills = demand.sgpr - sgpr_limit; ++ if (demand.sgpr > abi_sgpr_limit) { ++ unsigned sgpr_spills = demand.sgpr - abi_sgpr_limit; + extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1; + } + /* add extra SGPRs required for spilling VGPRs */ +@@ -1688,9 +1752,9 @@ spill(Program* program) + extra_sgprs = 1; /* SADDR */ + else + extra_sgprs = 5; /* scratch_resource (s4) + scratch_offset (s1) */ +- if (demand.sgpr + extra_sgprs > sgpr_limit) { ++ if (demand.sgpr + extra_sgprs > abi_sgpr_limit) { + /* re-calculate in case something has changed */ +- unsigned sgpr_spills = demand.sgpr + extra_sgprs - sgpr_limit; ++ unsigned sgpr_spills = demand.sgpr + extra_sgprs - abi_sgpr_limit; + extra_vgprs = DIV_ROUND_UP(sgpr_spills * 2, program->wave_size) + 1; + } + } +@@ -1702,10 +1766,51 @@ spill(Program* program) + gather_ssa_use_info(ctx); + get_rematerialize_info(ctx); + ++ /* Prepare spilling of preserved SGPRs. Don't insert the instructions yet so live info ++ * stays valid. 
++ */ ++ std::vector> preserved_spill_instructions; ++ std::vector> preserved_reload_instructions; ++ if (demand.sgpr > abi_sgpr_limit && ctx.program->is_callee) { ++ ctx.preserved_spill_ids.reserve(demand.sgpr - abi_sgpr_limit); ++ ++ for (PhysReg reg = PhysReg{program->arg_sgpr_count}; ++ reg < program->callee_abi.clobberedRegs.sgpr.lo(); reg = reg.advance(4)) ++ spill_reload_preserved_sgpr(ctx, preserved_spill_instructions, ++ preserved_reload_instructions, reg); ++ ++ unsigned max_reg = ++ std::min((unsigned)program->cur_reg_demand.sgpr + extra_sgprs, (unsigned)sgpr_limit); ++ for (PhysReg reg = program->callee_abi.clobberedRegs.sgpr.hi(); reg < max_reg; ++ reg = reg.advance(4)) ++ spill_reload_preserved_sgpr(ctx, preserved_spill_instructions, ++ preserved_reload_instructions, reg); ++ } ++ + /* create spills and reloads */ + for (unsigned i = 0; i < program->blocks.size(); i++) + spill_block(ctx, i); + ++ if (!preserved_spill_instructions.empty()) { ++ auto spill_insert_point = std::find_if( ++ program->blocks.front().instructions.begin(), program->blocks.front().instructions.end(), ++ [](const auto& instr) { return instr->opcode == aco_opcode::p_spill_preserved_vgpr; }); ++ assert(spill_insert_point != program->blocks.front().instructions.end()); ++ ++ spill_insert_point = std::next(spill_insert_point); ++ program->blocks.front().instructions.insert( ++ spill_insert_point, std::move_iterator(preserved_spill_instructions.begin()), ++ std::move_iterator(preserved_spill_instructions.end())); ++ ++ auto reload_insert_point = std::find_if( ++ program->blocks.back().instructions.begin(), program->blocks.back().instructions.end(), ++ [](const auto& instr) { return instr->opcode == aco_opcode::p_reload_preserved_vgpr; }); ++ assert(reload_insert_point != program->blocks.back().instructions.end()); ++ program->blocks.back().instructions.insert( ++ reload_insert_point, std::move_iterator(preserved_reload_instructions.begin()), ++ std::move_iterator(preserved_reload_instructions.end())); ++ } ++ + /* assign spill slots and DCE rematerialized code */ + assign_spill_slots(ctx, extra_vgprs); + +-- +GitLab + + +From 3f8defc2ff59734c6e9b2bdc2554fc4f30204a1a Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 13 May 2024 06:29:40 +0200 +Subject: [PATCH 60/71] aco/ra: Also consider blocked registers as not + containing temps + +--- + src/amd/compiler/aco_register_allocation.cpp | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index b8915e520e8e1..ff8475e19014d 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -231,6 +231,14 @@ public: + return res; + } + ++ unsigned count_zero_or_blocked(PhysRegInterval reg_interval) const ++ { ++ unsigned res = 0; ++ for (PhysReg reg : reg_interval) ++ res += !regs[reg] || regs[reg] == 0xFFFFFFFF; ++ return res; ++ } ++ + /* Returns true if any of the bytes in the given range are allocated or blocked */ + bool test(PhysReg start, unsigned num_bytes) const + { +@@ -3501,8 +3509,8 @@ register_allocation(Program* program, ra_test_policy policy) + + ASSERTED PhysRegInterval vgpr_bounds = get_reg_bounds(ctx, RegType::vgpr, false); + ASSERTED PhysRegInterval sgpr_bounds = get_reg_bounds(ctx, RegType::sgpr, false); +- assert(register_file.count_zero(vgpr_bounds) == ctx.vgpr_bounds); +- assert(register_file.count_zero(sgpr_bounds) == ctx.sgpr_bounds); ++ 
assert(register_file.count_zero_or_blocked(vgpr_bounds) == ctx.vgpr_bounds); ++ assert(register_file.count_zero_or_blocked(sgpr_bounds) == ctx.sgpr_bounds); + } else if (should_compact_linear_vgprs(ctx, register_file)) { + aco_ptr br = std::move(instructions.back()); + instructions.pop_back(); +-- +GitLab + + +From 475664aaa95eaf7cf58abef67f524a658363d379 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Mon, 13 May 2024 06:30:35 +0200 +Subject: [PATCH 61/71] aco/ra: Skip blocked regs in get_reg_impl + +--- + src/amd/compiler/aco_register_allocation.cpp | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index ff8475e19014d..aec47824719a9 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -1307,7 +1307,7 @@ get_reg_impl(ra_ctx& ctx, const RegisterFile& reg_file, std::vector +Date: Mon, 13 May 2024 06:31:01 +0200 +Subject: [PATCH 62/71] aco/isel: Bypass reg preservation for noreturn shaders + +--- + src/amd/compiler/aco_instruction_selection.cpp | 1 + + src/amd/compiler/aco_ir.h | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index deb97c1867667..6c98777b12689 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -12261,6 +12261,7 @@ select_program_rt(isel_context& ctx, unsigned shader_count, struct nir_shader* c + + Builder(ctx.program, ctx.block).sop1(aco_opcode::s_setpc_b64, Operand(ctx.next_pc)); + } else { ++ ctx.program->bypass_reg_preservation = true; + Builder(ctx.program, ctx.block).sopp(aco_opcode::s_endpgm); + } + +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 6f510fef17a04..2ab9eaa5d653c 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2390,6 +2390,7 @@ public: + bool pending_lds_access = false; + + bool is_callee = false; ++ bool bypass_reg_preservation = false; + ABI callee_abi = {}; + unsigned short arg_sgpr_count; + unsigned short arg_vgpr_count; +-- +GitLab + + +From 8de0e68756db0eea3b7e332bf47b295863de41a1 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Fri, 31 May 2024 16:46:28 +0200 +Subject: [PATCH 63/71] aco/ra: Add separate counter for blocked registers + +We can't assume blocked registers are free in get_reg_impl, but +we don't want to pessimize register usage estimations either. 
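+
+A rough, self-contained sketch of the bookkeeping this patch adds (simplified
+types; reg_watermarks, note_used and note_blocked are made-up names, not the
+ACO interfaces touched below): registers that are merely blocked by a call ABI
+raise their own high-water mark, so allocation bounds stay conservative while
+wave-count estimates only pay for registers that really hold temporaries.
+
+   #include <algorithm>
+   #include <cstdint>
+
+   // Simplified model: two high-water marks per register file.
+   struct reg_watermarks {
+      uint16_t max_used = 0;    // highest register actually holding a temp
+      uint16_t max_blocked = 0; // highest register only blocked (e.g. by a call ABI)
+
+      void note_used(uint16_t reg) { max_used = std::max<uint16_t>(max_used, reg + 1); }
+      void note_blocked(uint16_t reg) { max_blocked = std::max<uint16_t>(max_blocked, reg + 1); }
+
+      // get_reg_impl-style searches must treat blocked registers as occupied...
+      uint16_t allocation_bound() const { return std::max(max_used, max_blocked); }
+      // ...while occupancy estimates only count registers that hold values.
+      uint16_t demand_for_waves() const { return max_used; }
+   };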
+--- + src/amd/compiler/aco_register_allocation.cpp | 25 ++++++++++++++++---- + 1 file changed, 21 insertions(+), 4 deletions(-) + +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index aec47824719a9..5b4b50652006e 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -97,6 +97,8 @@ struct ra_ctx { + aco_ptr phi_dummy; + uint16_t max_used_sgpr = 0; + uint16_t max_used_vgpr = 0; ++ uint16_t max_blocked_sgpr = 0; ++ uint16_t max_blocked_vgpr = 0; + uint16_t sgpr_limit; + uint16_t vgpr_limit; + std::bitset<512> war_hint; +@@ -765,6 +767,21 @@ adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) + } + } + ++void ++adjust_max_blocked_regs(ra_ctx& ctx, RegType type, unsigned reg) ++{ ++ uint16_t max_addressible_sgpr = ctx.sgpr_limit; ++ if (type == RegType::vgpr) { ++ assert(reg >= 256); ++ uint16_t hi = reg - 256 - 1; ++ assert(hi <= 255); ++ ctx.max_blocked_vgpr = std::max(ctx.max_blocked_vgpr, hi); ++ } else if (reg <= max_addressible_sgpr) { ++ uint16_t hi = reg - 1; ++ ctx.max_blocked_sgpr = std::max(ctx.max_blocked_sgpr, std::min(hi, max_addressible_sgpr)); ++ } ++} ++ + enum UpdateRenames { + rename_not_killed_ops = 0x1, + }; +@@ -3268,10 +3285,10 @@ register_allocation(Program* program, ra_test_policy policy) + tmp_file.block(instr->call().abi.clobberedRegs.sgpr); + tmp_file.block(instr->call().abi.clobberedRegs.vgpr); + +- adjust_max_used_regs(ctx, RegClass::s1, +- instr->call().abi.clobberedRegs.sgpr.hi().reg() - 1); +- adjust_max_used_regs(ctx, RegClass::v1, +- instr->call().abi.clobberedRegs.vgpr.hi().reg() - 1); ++ adjust_max_blocked_regs(ctx, RegType::sgpr, ++ instr->call().abi.clobberedRegs.sgpr.hi().reg()); ++ adjust_max_blocked_regs(ctx, RegType::vgpr, ++ instr->call().abi.clobberedRegs.vgpr.hi().reg()); + + ASSERTED bool success = false; + success = +-- +GitLab + + +From ffb65b8b229cab1e36a6334344088aa9f0928d3a Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 1 Jun 2024 11:50:04 +0200 +Subject: [PATCH 64/71] aco/spill: Don't spill scratch_rsrc-related temps + +These temps are used to create the scratch_rsrc. Spilling them will +never benefit anything, because assign_spill_slots will insert code +that keeps them live. Since the spiller assumes all spilled variables +to be dead, this can cause more variables being live than intended and +spilling to fail. 
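+
+For illustration, a minimal standalone version of the candidate filter (the
+names pick_spill_candidate and scratch_rsrc_inputs are invented for this
+sketch and do not exist in the spiller): temps that feed the scratch resource
+are simply never eligible, however good their next-use score is.
+
+   #include <cstdint>
+   #include <unordered_set>
+   #include <vector>
+
+   using temp_id = uint32_t;
+
+   struct candidate { temp_id id; float score; };
+
+   // Spilling a temp that the later spill/reload code keeps alive anyway
+   // (stack pointer, scratch offset, private segment buffer) frees nothing,
+   // so such temps are skipped when picking what to spill.
+   temp_id pick_spill_candidate(const std::vector<candidate>& live,
+                                const std::unordered_set<temp_id>& scratch_rsrc_inputs)
+   {
+      temp_id best = ~0u;
+      float best_score = -1.0f;
+      for (const candidate& c : live) {
+         if (scratch_rsrc_inputs.count(c.id))
+            continue; // never profitable to spill these
+         if (c.score > best_score) {
+            best = c.id;
+            best_score = c.score;
+         }
+      }
+      return best; // ~0u if nothing eligible
+   }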
+--- + src/amd/compiler/aco_spill.cpp | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp +index e143b51809570..b36a15b68e553 100644 +--- a/src/amd/compiler/aco_spill.cpp ++++ b/src/amd/compiler/aco_spill.cpp +@@ -371,6 +371,9 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) + if (var.type() != type || ctx.spills_entry[block_idx].count(var) || + var.regClass().is_linear_vgpr()) + continue; ++ if (var == ctx.program->stack_ptr || var == ctx.program->scratch_offset || ++ var == ctx.program->private_segment_buffer) ++ continue; + + unsigned can_remat = ctx.remat.count(var); + if (can_remat > remat || (can_remat == remat && ctx.ssa_infos[t].score() > score)) { +@@ -415,7 +418,8 @@ init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_idx) + continue; + Temp var = phi->definitions[0].getTemp(); + if (var.type() == type && !ctx.spills_entry[block_idx].count(var) && +- ctx.ssa_infos[var.id()].score() > score) { ++ ctx.ssa_infos[var.id()].score() > score && var != ctx.program->stack_ptr && ++ var != ctx.program->scratch_offset && var != ctx.program->private_segment_buffer) { + to_spill = var; + score = ctx.ssa_infos[var.id()].score(); + } +@@ -965,6 +969,10 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + + if (can_rematerialize > do_rematerialize || loop_variable > avoid_respill || + ctx.ssa_infos[t].score() > score) { ++ if (var == ctx.program->stack_ptr || var == ctx.program->scratch_offset || ++ var == ctx.program->private_segment_buffer) ++ continue; ++ + unsigned cur_operand_idx = -1u; + bool can_spill = true; + for (auto it = instr->operands.begin(); it != instr->operands.end(); ++it) { +-- +GitLab + + +From 524d5f329cc352e8049ef573a728d47f2f6741e3 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Wed, 5 Jun 2024 11:06:32 +0200 +Subject: [PATCH 65/71] aco/spill: Ignore extra VGPRs/SGPRs for calls + +For VGPRs, we make sure they're spilled in the spill_preserved pass. +For SGPRs, we make sure to reinitialize scratch_rsrc after calls. 
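+
+Sketched standalone (effective_target is a made-up helper, not part of the
+spiller): at a call site the registers reserved for spill bookkeeping are
+handled by the preserved-register path, so the pressure target is relaxed by
+exactly that amount instead of forcing extra spills.
+
+   #include <cstdint>
+
+   struct demand { int16_t vgpr = 0, sgpr = 0; };
+
+   // Pressure the spiller has to reach at one instruction. At calls the
+   // "extra" VGPRs/SGPRs reserved for spill code are not counted, since
+   // preserved-VGPR spilling and the scratch_rsrc rebuild cover them.
+   demand effective_target(demand target, demand extra, bool at_call)
+   {
+      if (!at_call)
+         return target;
+      return {int16_t(target.vgpr + extra.vgpr), int16_t(target.sgpr + extra.sgpr)};
+   }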
+--- + src/amd/compiler/aco_spill.cpp | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp +index b36a15b68e553..943a3788a15c2 100644 +--- a/src/amd/compiler/aco_spill.cpp ++++ b/src/amd/compiler/aco_spill.cpp +@@ -88,17 +88,20 @@ struct spill_ctx { + std::set unused_remats; + unsigned wave_size; + ++ RegisterDemand extra_demand; ++ + unsigned sgpr_spill_slots; + unsigned vgpr_spill_slots; + Temp scratch_rsrc; + +- spill_ctx(const RegisterDemand target_pressure_, Program* program_) ++ spill_ctx(const RegisterDemand target_pressure_, RegisterDemand extra_demand_, Program* program_) + : target_pressure(target_pressure_), program(program_), memory(), + renames(program->blocks.size(), aco::map(memory)), + spills_entry(program->blocks.size(), aco::unordered_map(memory)), + spills_exit(program->blocks.size(), aco::unordered_map(memory)), + processed(program->blocks.size(), false), ssa_infos(program->peekAllocationId()), +- remat(memory), wave_size(program->wave_size), sgpr_spill_slots(0), vgpr_spill_slots(0) ++ remat(memory), wave_size(program->wave_size), extra_demand(extra_demand_), ++ sgpr_spill_slots(0), vgpr_spill_slots(0) + {} + + void add_affinity(uint32_t first, uint32_t second) +@@ -943,8 +946,14 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + RegisterDemand new_demand = instr->register_demand; + std::optional live_changes; + ++ RegisterDemand ignored_regs = {}; ++ ++ /* We spill linear VGPRs for calls in spill_preserved */ ++ if (instr->isCall() || (!instructions.empty() && instructions.back()->isCall())) ++ ignored_regs += ctx.extra_demand; ++ + /* if reg pressure is too high, spill variable with furthest next use */ +- while ((new_demand - spilled_registers).exceeds(ctx.target_pressure)) { ++ while ((new_demand - spilled_registers).exceeds(ctx.target_pressure + ignored_regs)) { + float score = 0.0; + Temp to_spill = Temp(); + unsigned operand_idx = -1u; +@@ -953,7 +962,8 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + unsigned avoid_respill = 0; + + RegType type = RegType::sgpr; +- if (new_demand.vgpr - spilled_registers.vgpr > ctx.target_pressure.vgpr) ++ if (new_demand.vgpr - spilled_registers.vgpr > ++ (ctx.target_pressure.vgpr + ignored_regs.vgpr)) + type = RegType::vgpr; + + for (unsigned t : ctx.program->live.live_in[block_idx]) { +@@ -1770,7 +1780,7 @@ spill(Program* program) + const RegisterDemand target(vgpr_limit - extra_vgprs, sgpr_limit - extra_sgprs); + + /* initialize ctx */ +- spill_ctx ctx(target, program); ++ spill_ctx ctx(target, RegisterDemand(extra_vgprs, extra_sgprs), program); + gather_ssa_use_info(ctx); + get_rematerialize_info(ctx); + +-- +GitLab + + +From 9bedff4e6eef064be53aaa64c14cb40318e311b9 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 1 Jun 2024 16:38:24 +0200 +Subject: [PATCH 66/71] aco: Add and set block->contains_call + +--- + src/amd/compiler/aco_instruction_selection.cpp | 1 + + src/amd/compiler/aco_ir.h | 1 + + 2 files changed, 2 insertions(+) + +diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp +index 6c98777b12689..fae8d57479bb8 100644 +--- a/src/amd/compiler/aco_instruction_selection.cpp ++++ b/src/amd/compiler/aco_instruction_selection.cpp +@@ -10939,6 +10939,7 @@ visit_call(isel_context* ctx, nir_call_instr* instr) + .return_info = std::move(return_infos), + .scratch_param_size = 
info.scratch_param_size, + }); ++ ctx->block->contains_call = true; + } + + void +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 2ab9eaa5d653c..14f2c07eda7a8 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2219,6 +2219,7 @@ struct Block { + /* this information is needed for predecessors to blocks with phis when + * moving out of ssa */ + bool scc_live_out = false; ++ bool contains_call = true; + + Block() : index(0) {} + }; +-- +GitLab + + +From ca4c18e7be750667c68229346bba989d28255ceb Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Sat, 1 Jun 2024 12:00:48 +0200 +Subject: [PATCH 67/71] aco/spill: Reset scratch_rsrc on calls + +--- + src/amd/compiler/aco_spill.cpp | 46 ++++++++++++++++++++++++++++------ + 1 file changed, 39 insertions(+), 7 deletions(-) + +diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp +index 943a3788a15c2..61ffd57b497f9 100644 +--- a/src/amd/compiler/aco_spill.cpp ++++ b/src/amd/compiler/aco_spill.cpp +@@ -93,6 +93,7 @@ struct spill_ctx { + unsigned sgpr_spill_slots; + unsigned vgpr_spill_slots; + Temp scratch_rsrc; ++ unsigned scratch_rsrc_block = -1u; + + spill_ctx(const RegisterDemand target_pressure_, RegisterDemand extra_demand_, Program* program_) + : target_pressure(target_pressure_), program(program_), memory(), +@@ -1192,19 +1193,28 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, + bool overflow = (ctx.vgpr_spill_slots - 1) * 4 > offset_range; + + Builder rsrc_bld(ctx.program); ++ unsigned bld_block = block.index; + if (block.kind & block_kind_top_level) { + rsrc_bld.reset(&instructions); + } else if (ctx.scratch_rsrc == Temp() && (!overflow || ctx.program->gfx_level < GFX9)) { + Block* tl_block = █ +- while (!(tl_block->kind & block_kind_top_level)) ++ while (!(tl_block->kind & block_kind_top_level) && ++ std::find_if(tl_block->instructions.begin(), tl_block->instructions.end(), ++ [](auto& instr) ++ { return !instr || instr->isCall(); }) == tl_block->instructions.end()) + tl_block = &ctx.program->blocks[tl_block->linear_idom]; + + /* find p_logical_end */ +- std::vector>& prev_instructions = tl_block->instructions; +- unsigned idx = prev_instructions.size() - 1; +- while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end) +- idx--; +- rsrc_bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx)); ++ if (tl_block->kind & block_kind_top_level) { ++ std::vector>& prev_instructions = tl_block->instructions; ++ unsigned idx = prev_instructions.size() - 1; ++ while (prev_instructions[idx]->opcode != aco_opcode::p_logical_end) ++ idx--; ++ rsrc_bld.reset(&prev_instructions, std::next(prev_instructions.begin(), idx)); ++ bld_block = tl_block->index; ++ } else { ++ rsrc_bld.reset(&instructions); ++ } + } + + /* If spilling overflows the constant offset range at any point, we need to emit the soffset +@@ -1232,10 +1242,13 @@ setup_vgpr_spill_reload(spill_ctx& ctx, Block& block, + Operand(ctx.program->stack_ptr), Operand::c32(saddr)); + else + ctx.scratch_rsrc = offset_bld.copy(offset_bld.def(s1), Operand::c32(saddr)); ++ ctx.scratch_rsrc_block = bld_block; + } + } else { +- if (ctx.scratch_rsrc == Temp()) ++ if (ctx.scratch_rsrc == Temp()) { + ctx.scratch_rsrc = load_scratch_resource(ctx.program, rsrc_bld, overflow, true); ++ ctx.scratch_rsrc_block = bld_block; ++ } + + if (overflow) { + uint32_t soffset = +@@ -1571,6 +1584,22 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) + unsigned last_top_level_block_idx = 0; + for (Block& 
block : ctx.program->blocks) { + ++ if (ctx.scratch_rsrc_block < ctx.program->blocks.size() && ++ !(ctx.program->blocks[ctx.scratch_rsrc_block].kind & block_kind_top_level)) ++ ctx.scratch_rsrc = Temp(); ++ ++ if (block.kind & block_kind_loop_header) { ++ for (unsigned index = block.index; ++ index < ctx.program->blocks.size() && ++ ctx.program->blocks[index].loop_nest_depth >= block.loop_nest_depth; ++ ++index) { ++ if (ctx.program->blocks[index].contains_call) { ++ ctx.scratch_rsrc = Temp(); ++ break; ++ } ++ } ++ } ++ + if (block.kind & block_kind_top_level) { + last_top_level_block_idx = block.index; + +@@ -1588,6 +1617,9 @@ assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) + Builder bld(ctx.program, &instructions); + for (it = block.instructions.begin(); it != block.instructions.end(); ++it) { + ++ if ((*it)->isCall()) ++ ctx.scratch_rsrc = Temp(); ++ + if ((*it)->opcode == aco_opcode::p_spill) { + uint32_t spill_id = (*it)->operands[1].constantValue(); + +-- +GitLab + + +From b90eeacb2aa89b4d33315cc3e49c13611710d945 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Thu, 6 Jun 2024 08:08:02 +0200 +Subject: [PATCH 68/71] radv: Re-enable RT pipelines on GFX9+ + +--- + src/amd/vulkan/radv_physical_device.c | 6 +----- + 1 file changed, 1 insertion(+), 5 deletions(-) + +diff --git a/src/amd/vulkan/radv_physical_device.c b/src/amd/vulkan/radv_physical_device.c +index 98826470d4d60..382195e70a77e 100644 +--- a/src/amd/vulkan/radv_physical_device.c ++++ b/src/amd/vulkan/radv_physical_device.c +@@ -111,14 +111,10 @@ radv_filter_minmax_enabled(const struct radv_physical_device *pdev) + bool + radv_enable_rt(const struct radv_physical_device *pdev, bool rt_pipelines) + { +- /* Temporarily under construction! */ +- if (rt_pipelines) +- return false; +- + if (pdev->info.gfx_level < GFX10_3 && !radv_emulate_rt(pdev)) + return false; + +- if (rt_pipelines && pdev->use_llvm) ++ if (rt_pipelines && (pdev->use_llvm || pdev->info.gfx_level < GFX9)) + return false; + + return true; +-- +GitLab + + +From c73f158059b287185f612d3ea1e1ef8bcc46f58b Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Thu, 6 Jun 2024 08:03:43 +0200 +Subject: [PATCH 69/71] aco: Add separate register usage tracking for + ABI-preserved regs + +If a shader uses fewer registers than are preserved by an ABI, we'll +want to set the register demand to the actual register usage instead of +the demand set by preserved call registers. 
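+
+The bookkeeping boils down to keeping two maxima. A simplified sketch (demand,
+inst_info and accumulate are illustrative names, not the ACO types changed
+below): the full demand still bounds allocation, while the "real" demand that
+subtracts ABI-blocked registers at call sites is what occupancy is computed
+from. The shader-db totals below show the effect.
+
+   #include <algorithm>
+   #include <cstdint>
+
+   struct demand { int16_t vgpr = 0, sgpr = 0; };
+
+   demand max_of(demand a, demand b) { return {std::max(a.vgpr, b.vgpr), std::max(a.sgpr, b.sgpr)}; }
+   demand minus(demand a, demand b) { return {int16_t(a.vgpr - b.vgpr), int16_t(a.sgpr - b.sgpr)}; }
+
+   struct inst_info { demand reg_demand; demand blocked_abi; bool is_call; };
+
+   // Full demand drives allocation bounds; real demand (ABI-blocked registers
+   // removed at calls) drives the wave-count calculation.
+   void accumulate(const inst_info& ins, demand& full, demand& real)
+   {
+      full = max_of(full, ins.reg_demand);
+      real = max_of(real, ins.is_call ? minus(ins.reg_demand, ins.blocked_abi) : ins.reg_demand);
+   }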
+ +Totals from 11 (0.01% of 81072) affected shaders: +MaxWaves: 120 -> 176 (+46.67%) +Instrs: 9493 -> 9516 (+0.24%) +CodeSize: 54868 -> 55012 (+0.26%); split: -0.03%, +0.29% +VGPRs: 1088 -> 640 (-41.18%) +Latency: 140184 -> 141125 (+0.67%); split: -0.06%, +0.73% +InvThroughput: 38824 -> 35752 (-7.91%); split: -7.93%, +0.02% +VClause: 256 -> 262 (+2.34%) +SClause: 129 -> 136 (+5.43%) +Copies: 1379 -> 1402 (+1.67%); split: -0.15%, +1.81% +VALU: 6386 -> 6405 (+0.30%); split: -0.03%, +0.33% +SALU: 968 -> 972 (+0.41%) +VMEM: 1028 -> 1030 (+0.19%) +--- + src/amd/compiler/aco_ir.h | 7 +++- + src/amd/compiler/aco_live_var_analysis.cpp | 24 +++++++---- + src/amd/compiler/aco_lower_to_cssa.cpp | 10 ++++- + src/amd/compiler/aco_register_allocation.cpp | 22 +++++----- + src/amd/compiler/aco_scheduler.cpp | 43 +++++++++++++++++++- + src/amd/compiler/aco_spill.cpp | 4 +- + 6 files changed, 84 insertions(+), 26 deletions(-) + +diff --git a/src/amd/compiler/aco_ir.h b/src/amd/compiler/aco_ir.h +index 14f2c07eda7a8..92b21a8b4ed6a 100644 +--- a/src/amd/compiler/aco_ir.h ++++ b/src/amd/compiler/aco_ir.h +@@ -2346,7 +2346,7 @@ public: + std::vector blocks; + std::vector temp_rc = {s1}; + RegisterDemand max_reg_demand = RegisterDemand(); +- RegisterDemand cur_reg_demand = RegisterDemand(); ++ RegisterDemand max_real_reg_demand = RegisterDemand(); + ac_shader_config* config; + struct aco_shader_info info; + enum amd_gfx_level gfx_level; +@@ -2485,7 +2485,8 @@ void select_ps_prolog(Program* program, void* pinfo, ac_shader_config* config, + void lower_phis(Program* program); + void lower_subdword(Program* program); + void calc_min_waves(Program* program); +-void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand); ++void update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand, ++ const RegisterDemand new_real_demand); + void live_var_analysis(Program* program); + std::vector dead_code_analysis(Program* program); + void dominator_tree(Program* program); +@@ -2561,6 +2562,8 @@ int get_op_fixed_to_def(Instruction* instr); + RegisterDemand get_live_changes(Instruction* instr); + RegisterDemand get_temp_registers(Instruction* instr); + RegisterDemand get_temp_reg_changes(Instruction* instr); ++void compute_blocked_abi_demand(Program* program, unsigned linear_vgpr_demand, ++ Pseudo_call_instruction& instr); + + /* number of sgprs that need to be allocated but might notbe addressable as s0-s105 */ + uint16_t get_extra_sgprs(Program* program); +diff --git a/src/amd/compiler/aco_live_var_analysis.cpp b/src/amd/compiler/aco_live_var_analysis.cpp +index 52561464b0e1e..9d6284b38e0a3 100644 +--- a/src/amd/compiler/aco_live_var_analysis.cpp ++++ b/src/amd/compiler/aco_live_var_analysis.cpp +@@ -207,6 +207,7 @@ void + process_live_temps_per_block(live_ctx& ctx, Block* block) + { + RegisterDemand new_demand; ++ RegisterDemand real_block_demand; + block->register_demand = RegisterDemand(); + IDSet live = compute_live_out(ctx, block); + +@@ -363,6 +364,10 @@ process_live_temps_per_block(live_ctx& ctx, Block* block) + operand_demand += new_demand; + insn->register_demand.update(operand_demand); + block->register_demand.update(insn->register_demand); ++ if (insn->isCall()) ++ real_block_demand.update(insn->register_demand - insn->call().blocked_abi_demand); ++ else ++ real_block_demand.update(insn->register_demand); + } + + /* handle phi definitions */ +@@ -419,6 +424,7 @@ process_live_temps_per_block(live_ctx& ctx, Block* block) + block->live_in_demand = new_demand; + 
block->live_in_demand.sgpr += 2; /* Add 2 SGPRs for potential long-jumps. */ + block->register_demand.update(block->live_in_demand); ++ ctx.program->max_real_reg_demand.update(real_block_demand); + ctx.program->max_reg_demand.update(block->register_demand); + ctx.handled_once = std::min(ctx.handled_once, block->index); + +@@ -559,29 +565,30 @@ max_suitable_waves(Program* program, uint16_t waves) + } + + void +-update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) ++update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand, ++ const RegisterDemand new_real_demand) + { + assert(program->min_waves >= 1); + uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + +- program->cur_reg_demand = new_demand; ++ program->max_reg_demand = new_demand; + /* this won't compile, register pressure reduction necessary */ + if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) { + program->num_waves = 0; +- program->max_reg_demand = new_demand; + } else { +- program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); ++ program->num_waves = ++ program->dev.physical_sgprs / get_sgpr_alloc(program, new_real_demand.sgpr); + uint16_t vgpr_demand = +- get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2; ++ get_vgpr_alloc(program, new_real_demand.vgpr) + program->config->num_shared_vgprs / 2; + program->num_waves = + std::min(program->num_waves, program->dev.physical_vgprs / vgpr_demand); + program->num_waves = std::min(program->num_waves, program->dev.max_waves_per_simd); + + /* Adjust for LDS and workgroup multiples and calculate max_reg_demand */ + program->num_waves = max_suitable_waves(program, program->num_waves); +- program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves); +- program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves); ++ program->max_real_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves); ++ program->max_real_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves); + } + } + +@@ -592,6 +599,7 @@ live_var_analysis(Program* program) + program->live.memory.release(); + program->live.live_in.resize(program->blocks.size(), IDSet(program->live.memory)); + program->max_reg_demand = RegisterDemand(); ++ program->max_real_reg_demand = RegisterDemand(); + program->needs_vcc = program->gfx_level >= GFX10; + + live_ctx ctx; +@@ -607,7 +615,7 @@ live_var_analysis(Program* program) + + /* calculate the program's register demand and number of waves */ + if (program->progress < CompilationProgress::after_ra) +- update_vgpr_sgpr_demand(program, program->max_reg_demand); ++ update_vgpr_sgpr_demand(program, program->max_reg_demand, program->max_real_reg_demand); + } + + } // namespace aco +diff --git a/src/amd/compiler/aco_lower_to_cssa.cpp b/src/amd/compiler/aco_lower_to_cssa.cpp +index 4268e21d820d2..237aaa01f4bc7 100644 +--- a/src/amd/compiler/aco_lower_to_cssa.cpp ++++ b/src/amd/compiler/aco_lower_to_cssa.cpp +@@ -519,6 +519,7 @@ emit_parallelcopies(cssa_ctx& ctx) + } + + RegisterDemand new_demand; ++ RegisterDemand real_new_demand; + for (Block& block : ctx.program->blocks) { + /* Finally, rename coalesced phi operands */ + for (aco_ptr& phi : block.instructions) { +@@ -538,13 +539,18 @@ emit_parallelcopies(cssa_ctx& ctx) + + /* Resummarize the block's register demand */ + block.register_demand = 
block.live_in_demand; +- for (const aco_ptr& instr : block.instructions) ++ for (const aco_ptr& instr : block.instructions) { + block.register_demand.update(instr->register_demand); ++ if (instr->isCall()) ++ real_new_demand.update(instr->register_demand - instr->call().blocked_abi_demand); ++ else ++ real_new_demand.update(instr->register_demand); ++ } + new_demand.update(block.register_demand); + } + + /* Update max_reg_demand and num_waves */ +- update_vgpr_sgpr_demand(ctx.program, new_demand); ++ update_vgpr_sgpr_demand(ctx.program, new_demand, real_new_demand); + + assert(renames.empty()); + } +diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp +index 5b4b50652006e..e0d6f6bfeaf5a 100644 +--- a/src/amd/compiler/aco_register_allocation.cpp ++++ b/src/amd/compiler/aco_register_allocation.cpp +@@ -121,8 +121,8 @@ struct ra_ctx { + sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); + vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); + +- sgpr_bounds = program->max_reg_demand.sgpr; +- vgpr_bounds = program->max_reg_demand.vgpr; ++ sgpr_bounds = program->max_real_reg_demand.sgpr; ++ vgpr_bounds = program->max_real_reg_demand.vgpr; + num_linear_vgprs = 0; + } + }; +@@ -1426,16 +1426,18 @@ increase_register_file(ra_ctx& ctx, RegClass rc) + { + if (rc.type() == RegType::vgpr && ctx.num_linear_vgprs == 0 && + ctx.vgpr_bounds < ctx.vgpr_limit) { ++ RegisterDemand new_demand = ++ RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_real_reg_demand.sgpr); + /* If vgpr_bounds is less than max_reg_demand.vgpr, this should be a no-op. */ +- update_vgpr_sgpr_demand( +- ctx.program, RegisterDemand(ctx.vgpr_bounds + 1, ctx.program->max_reg_demand.sgpr)); ++ update_vgpr_sgpr_demand(ctx.program, new_demand, new_demand); + +- ctx.vgpr_bounds = ctx.program->max_reg_demand.vgpr; +- } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < ctx.sgpr_limit) { +- update_vgpr_sgpr_demand( +- ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.sgpr_bounds + 1)); ++ ctx.vgpr_bounds = ctx.program->max_real_reg_demand.vgpr; ++ } else if (rc.type() == RegType::sgpr && ctx.program->max_real_reg_demand.sgpr < ctx.sgpr_limit) { ++ RegisterDemand new_demand = ++ RegisterDemand(ctx.program->max_real_reg_demand.vgpr, ctx.sgpr_bounds + 1); ++ update_vgpr_sgpr_demand(ctx.program, new_demand, new_demand); + +- ctx.sgpr_bounds = ctx.program->max_reg_demand.sgpr; ++ ctx.sgpr_bounds = ctx.program->max_real_reg_demand.sgpr; + } else { + return false; + } +@@ -2049,7 +2051,7 @@ handle_pseudo(ra_ctx& ctx, const RegisterFile& reg_file, Instruction* instr) + ; + if (reg < 0) { + reg = ctx.max_used_sgpr + 1; +- for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++) ++ for (; reg < ctx.program->max_real_reg_demand.sgpr && reg_file[PhysReg{(unsigned)reg}]; reg++) + ; + } + +diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp +index e6eb1e49a4021..438e45def661c 100644 +--- a/src/amd/compiler/aco_scheduler.cpp ++++ b/src/amd/compiler/aco_scheduler.cpp +@@ -1271,7 +1271,7 @@ schedule_program(Program* program) + */ + if (program->is_callee) { + ctx.mv.max_registers.sgpr = +- std::max(std::min(ctx.mv.max_registers.sgpr, program->cur_reg_demand.sgpr), ++ std::max(std::min(ctx.mv.max_registers.sgpr, program->max_reg_demand.sgpr), + (int16_t)program->callee_abi.clobberedRegs.sgpr.size); + } + +@@ -1291,10 +1291,49 @@ schedule_program(Program* program) + + /* 
update max_reg_demand and num_waves */ + RegisterDemand new_demand; ++ RegisterDemand real_new_demand; + for (Block& block : program->blocks) { + new_demand.update(block.register_demand); ++ if (block.contains_call) { ++ unsigned linear_vgpr_demand = 0; ++ for (auto t : program->live.live_in[block.index]) ++ if (program->temp_rc[t].is_linear_vgpr()) ++ linear_vgpr_demand += program->temp_rc[t].size(); ++ ++ for (unsigned i = block.instructions.size() - 1; i < block.instructions.size(); --i) { ++ Instruction* instr = block.instructions[i].get(); ++ ++ for (auto& def : instr->definitions) { ++ if (def.regClass().is_linear_vgpr() && !def.isKill()) ++ linear_vgpr_demand -= def.size(); ++ } ++ for (auto& op : instr->operands) { ++ if (op.regClass().is_linear_vgpr() && op.isFirstKill()) ++ linear_vgpr_demand += op.size(); ++ } ++ ++ if (!block.instructions[i]->isCall()) { ++ real_new_demand.update(block.instructions[i]->register_demand); ++ continue; ++ } ++ ++ compute_blocked_abi_demand(program, linear_vgpr_demand, instr->call()); ++ ++ const unsigned max_vgpr = get_addr_vgpr_from_waves(program, program->min_waves); ++ const unsigned max_sgpr = get_addr_sgpr_from_waves(program, program->min_waves); ++ ++ if (instr->call().abi.clobberedRegs.vgpr.hi() == PhysReg{256 + max_vgpr} && ++ instr->call().abi.clobberedRegs.sgpr.hi() == PhysReg{max_sgpr}) ++ real_new_demand.update(block.instructions[i]->register_demand - ++ instr->call().blocked_abi_demand); ++ else ++ real_new_demand.update(block.instructions[i]->register_demand); ++ } ++ } else { ++ real_new_demand.update(block.register_demand); ++ } + } +- update_vgpr_sgpr_demand(program, new_demand); ++ update_vgpr_sgpr_demand(program, new_demand, real_new_demand); + + /* Validate live variable information */ + if (!validate_live_vars(program)) +diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp +index 61ffd57b497f9..2ebe7c28fa8fd 100644 +--- a/src/amd/compiler/aco_spill.cpp ++++ b/src/amd/compiler/aco_spill.cpp +@@ -1780,7 +1780,7 @@ spill(Program* program) + abi_sgpr_limit = sgpr_limit; + + /* no spilling when register pressure is low enough */ +- if (program->num_waves > 0 && program->cur_reg_demand.sgpr <= abi_sgpr_limit) ++ if (program->num_waves > 0 && program->max_reg_demand.sgpr <= abi_sgpr_limit) + return; + + /* lower to CSSA before spilling to ensure correctness w.r.t. phis */ +@@ -1830,7 +1830,7 @@ spill(Program* program) + preserved_reload_instructions, reg); + + unsigned max_reg = +- std::min((unsigned)program->cur_reg_demand.sgpr + extra_sgprs, (unsigned)sgpr_limit); ++ std::min((unsigned)program->max_reg_demand.sgpr + extra_sgprs, (unsigned)sgpr_limit); + for (PhysReg reg = program->callee_abi.clobberedRegs.sgpr.hi(); reg < max_reg; + reg = reg.advance(4)) + spill_reload_preserved_sgpr(ctx, preserved_spill_instructions, +-- +GitLab + + +From 450c3456e89dd5d8604128482be7768eebda4b1e Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Tue, 4 Jun 2024 15:08:48 +0200 +Subject: [PATCH 70/71] aco/spill: Restore registers spilled by call + immediately + +Makes for better latency hiding if we're not short on registers +otherwise. 
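+
+In pseudocode terms (reload_call_spills and its parameters are invented for
+this sketch; the real change lives in process_block below): temps spilled just
+to survive a call are reloaded right after it while the demand still has room,
+so later uses hit a live value instead of each paying its own reload latency.
+
+   #include <cstdint>
+   #include <vector>
+
+   using temp_id = uint32_t;
+
+   // Reload call-clobber spills immediately after the call, as long as that
+   // does not push the demand back over the target. Whatever does not fit
+   // stays spilled and is reloaded lazily at its next use.
+   void reload_call_spills(std::vector<temp_id>& call_spills, int& demand, int target,
+                           void (*emit_reload)(temp_id))
+   {
+      while (!call_spills.empty() && demand < target) {
+         emit_reload(call_spills.back());
+         call_spills.pop_back();
+         ++demand; // simplified: one register per reloaded temp
+      }
+      call_spills.clear();
+   }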
+ +Totals from 7 (0.01% of 81072) affected shaders: +Instrs: 9084 -> 8980 (-1.14%) +CodeSize: 52564 -> 51976 (-1.12%) +SpillSGPRs: 244 -> 248 (+1.64%); split: -3.28%, +4.92% +SpillVGPRs: 360 -> 367 (+1.94%) +Latency: 138989 -> 135669 (-2.39%); split: -2.49%, +0.10% +InvThroughput: 35120 -> 35301 (+0.52%); split: -0.06%, +0.57% +VClause: 258 -> 241 (-6.59%) +SClause: 116 -> 117 (+0.86%) +Copies: 1290 -> 1311 (+1.63%) +Branches: 131 -> 119 (-9.16%) +VALU: 6125 -> 6143 (+0.29%); split: -0.20%, +0.49% +SALU: 920 -> 913 (-0.76%); split: -0.98%, +0.22% +VMEM: 1026 -> 989 (-3.61%) +--- + src/amd/compiler/aco_spill.cpp | 21 +++++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp +index 2ebe7c28fa8fd..dea810ce42cf4 100644 +--- a/src/amd/compiler/aco_spill.cpp ++++ b/src/amd/compiler/aco_spill.cpp +@@ -908,6 +908,8 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + + auto& current_spills = ctx.spills_exit[block_idx]; + ++ std::vector call_spills; ++ + while (idx < block->instructions.size()) { + aco_ptr& instr = block->instructions[idx]; + +@@ -922,6 +924,22 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + + std::map> reloads; + ++ if (!call_spills.empty()) { ++ RegisterDemand demand = instr->register_demand; ++ while (!(demand - spilled_registers).exceeds(ctx.target_pressure) && ++ !call_spills.empty()) { ++ Temp old_tmp = call_spills.back(); ++ call_spills.pop_back(); ++ ++ Temp new_tmp = ctx.program->allocateTmp(ctx.program->temp_rc[old_tmp.id()]); ++ ctx.renames[block_idx][old_tmp] = new_tmp; ++ reloads[old_tmp] = std::make_pair(new_tmp, current_spills[old_tmp]); ++ current_spills.erase(old_tmp); ++ spilled_registers -= new_tmp; ++ } ++ call_spills.clear(); ++ } ++ + /* rename and reload operands */ + for (Operand& op : instr->operands) { + if (!op.isTemp()) +@@ -1051,6 +1069,9 @@ process_block(spill_ctx& ctx, unsigned block_idx, Block* block, RegisterDemand s + } + + uint32_t spill_id = ctx.add_to_spills(to_spill, current_spills); ++ if (instr->isCall()) ++ call_spills.emplace_back(to_spill); ++ + /* add interferences with reloads */ + for (std::pair>& pair : reloads) + ctx.add_interference(spill_id, pair.second.second); +-- +GitLab + + +From ceea1b8cab549bf5b79c51c54c6f995a5fa79a62 Mon Sep 17 00:00:00 2001 +From: Friedrich Vock +Date: Tue, 4 Jun 2024 15:12:21 +0200 +Subject: [PATCH 71/71] aco/vn: Don't combine expressions across calls + +This increases live state across calls, which in turn increases spilling +and makes for slower shaders overall. 
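+
+The mechanism, reduced to a standalone sketch (vn_key and vn_key_hash are
+illustrative; the actual patch folds a per-block call counter into the upper
+bits of pass_flags): every expression's value-numbering key also carries the
+index of the call region it was computed in, so an identical expression on the
+far side of a call hashes differently and gets recomputed rather than kept
+live across the call.
+
+   #include <cstddef>
+   #include <cstdint>
+   #include <functional>
+
+   struct vn_key {
+      uint64_t expr_hash;   // hash of opcode + operands
+      uint32_t call_region; // incremented each time a call is crossed
+
+      bool operator==(const vn_key& o) const
+      { return expr_hash == o.expr_hash && call_region == o.call_region; }
+   };
+
+   struct vn_key_hash {
+      std::size_t operator()(const vn_key& k) const
+      { return std::hash<uint64_t>{}(k.expr_hash ^ (uint64_t(k.call_region) << 32)); }
+   };
+
+   // usage: std::unordered_map<vn_key, unsigned, vn_key_hash> seen_values;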
+ +Totals from 7 (0.01% of 81072) affected shaders: +Instrs: 8980 -> 8955 (-0.28%); split: -0.88%, +0.60% +CodeSize: 51976 -> 51684 (-0.56%); split: -1.02%, +0.46% +SpillSGPRs: 248 -> 244 (-1.61%); split: -3.63%, +2.02% +SpillVGPRs: 367 -> 365 (-0.54%); split: -1.09%, +0.54% +Scratch: 32768 -> 31744 (-3.12%) +Latency: 135669 -> 128720 (-5.12%); split: -5.13%, +0.01% +InvThroughput: 35301 -> 34783 (-1.47%); split: -1.51%, +0.05% +VClause: 241 -> 242 (+0.41%) +SClause: 117 -> 120 (+2.56%) +Copies: 1311 -> 1338 (+2.06%); split: -0.69%, +2.75% +PreSGPRs: 899 -> 895 (-0.44%); split: -1.56%, +1.11% +PreVGPRs: 1103 -> 1099 (-0.36%) +VALU: 6143 -> 6098 (-0.73%); split: -1.22%, +0.49% +SALU: 913 -> 933 (+2.19%); split: -0.11%, +2.30% +VMEM: 989 -> 967 (-2.22%) +SMEM: 201 -> 214 (+6.47%) +--- + src/amd/compiler/aco_opt_value_numbering.cpp | 24 ++++++++++++++++++++ + 1 file changed, 24 insertions(+) + +diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp +index a199216907a5e..c35dbdaa5fcb7 100644 +--- a/src/amd/compiler/aco_opt_value_numbering.cpp ++++ b/src/amd/compiler/aco_opt_value_numbering.cpp +@@ -43,6 +43,8 @@ struct InstrHash { + for (const Operand& op : instr->operands) + hash = murmur_32_scramble(hash, op.constantValue()); + ++ hash = murmur_32_scramble(hash, instr->pass_flags >> 16); ++ + size_t data_size = get_instr_data_size(instr->format); + + /* skip format, opcode and pass_flags and op/def spans */ +@@ -240,6 +242,9 @@ struct vn_ctx { + expr_set expr_values; + aco::unordered_map renames; + ++ /* For each block, a counter of how many calls were encountered in the linear/logical CFG. */ ++ std::vector> call_indices; ++ + /* The exec id should be the same on the same level of control flow depth. + * Together with the check for dominator relations, it is safe to assume + * that the same exec_id also means the same execution mask. +@@ -254,6 +259,7 @@ struct vn_ctx { + for (Block& block : program->blocks) + size += block.instructions.size(); + expr_values.reserve(size); ++ call_indices.resize(program->blocks.size(), {0, 0}); + } + }; + +@@ -341,6 +347,13 @@ process_block(vn_ctx& ctx, Block& block) + std::vector> new_instructions; + new_instructions.reserve(block.instructions.size()); + ++ uint32_t linear_call_idx = 0; ++ uint32_t logical_call_idx = 0; ++ for (auto index : block.linear_preds) ++ linear_call_idx = std::max(linear_call_idx, ctx.call_indices[index].first); ++ for (auto index : block.logical_preds) ++ logical_call_idx = std::max(logical_call_idx, ctx.call_indices[index].second); ++ + for (aco_ptr& instr : block.instructions) { + /* first, rename operands */ + for (Operand& op : instr->operands) { +@@ -354,6 +367,10 @@ process_block(vn_ctx& ctx, Block& block) + if (instr->opcode == aco_opcode::p_discard_if || + instr->opcode == aco_opcode::p_demote_to_helper || instr->opcode == aco_opcode::p_end_wqm) + ctx.exec_id++; ++ if (instr->isCall()) { ++ ++linear_call_idx; ++ ++logical_call_idx; ++ } + + /* simple copy-propagation through renaming */ + bool copy_instr = +@@ -370,7 +387,12 @@ process_block(vn_ctx& ctx, Block& block) + continue; + } + ++ bool use_linear_call_idx = ++ std::any_of(instr->definitions.begin(), instr->definitions.end(), ++ [](const auto& def) { return def.regClass().is_linear(); }); ++ + instr->pass_flags = ctx.exec_id; ++ instr->pass_flags |= (use_linear_call_idx ? 
linear_call_idx : logical_call_idx) << 16; + std::pair res = ctx.expr_values.emplace(instr.get(), block.index); + + /* if there was already an expression with the same value number */ +@@ -409,6 +431,8 @@ process_block(vn_ctx& ctx, Block& block) + } + } + ++ ctx.call_indices[block.index] = {linear_call_idx, logical_call_idx}; ++ + block.instructions = std::move(new_instructions); + } + +-- +GitLab +