From 9d266840d2e61dc8d99ba33b86704c4b9213fcf7 Mon Sep 17 00:00:00 2001 From: "Ward Nakchbandi (Cosmic Fusion)" <83735213+CosmicFusion@users.noreply.github.com> Date: Thu, 5 Oct 2023 22:52:53 +0300 Subject: [PATCH] --- ...llium-fix-build-failure-on-powerpcspe.diff | 35 - debian/patches/21929.patch | 679 ----------------- debian/patches/24720.patch | 687 ------------------ debian/patches/24789.patch | 34 - debian/patches/path_max.diff | 39 - debian/patches/revert-af1ee8e01044.diff | 153 ---- debian/patches/series | 6 - debian/patches/src_glx_dri_common.h.diff | 13 - 8 files changed, 1646 deletions(-) delete mode 100644 debian/patches/07_gallium-fix-build-failure-on-powerpcspe.diff delete mode 100644 debian/patches/21929.patch delete mode 100644 debian/patches/24720.patch delete mode 100644 debian/patches/24789.patch delete mode 100644 debian/patches/path_max.diff delete mode 100644 debian/patches/revert-af1ee8e01044.diff delete mode 100644 debian/patches/series delete mode 100644 debian/patches/src_glx_dri_common.h.diff diff --git a/debian/patches/07_gallium-fix-build-failure-on-powerpcspe.diff b/debian/patches/07_gallium-fix-build-failure-on-powerpcspe.diff deleted file mode 100644 index 029bd22..0000000 --- a/debian/patches/07_gallium-fix-build-failure-on-powerpcspe.diff +++ /dev/null @@ -1,35 +0,0 @@ -From a4f14e7239780b02af8d74669c5458d4b0957d4d Mon Sep 17 00:00:00 2001 -From: Roland Stigge -Date: Sun, 2 Mar 2014 19:52:56 +0100 -Subject: [PATCH] gallium: fix build failure on powerpcspe - -In the case of powerpc, mesa activates some altivec instructions -that are unknown on the powerpcspe architecture (see -https://wiki.debian.org/PowerPCSPEPort), causing a build failure as the -'vand' opcode is not recognized by the assembler. - -This patch fixes this by preventing the PPC-specialcasing in case of -powerpcspe (__NO_FPRS__ is only defined there). - -https://bugs.debian.org/695746 ---- - src/gallium/include/pipe/p_config.h | 2 ++ - 1 file changed, 2 insertions(+) - ---- a/src/util/detect_arch.h -+++ b/src/util/detect_arch.h -@@ -70,12 +70,14 @@ - #endif - #endif - -+#ifndef __NO_FPRS__ - #if defined(__ppc__) || defined(__ppc64__) || defined(__PPC__) || defined(__PPC64__) - #define DETECT_ARCH_PPC 1 - #if defined(__ppc64__) || defined(__PPC64__) - #define DETECT_ARCH_PPC_64 1 - #endif - #endif -+#endif - - #if defined(__s390x__) - #define DETECT_ARCH_S390 1 diff --git a/debian/patches/21929.patch b/debian/patches/21929.patch deleted file mode 100644 index 85317d0..0000000 --- a/debian/patches/21929.patch +++ /dev/null @@ -1,679 +0,0 @@ -From ed9fb6be100cff6c2066beb0cdf8b3a17cab292c Mon Sep 17 00:00:00 2001 -From: Konstantin Seurer -Date: Sat, 24 Jun 2023 15:49:13 +0200 -Subject: [PATCH 1/4] radv: Add rt.monolithic to radv_pipeline_key - ---- - src/amd/vulkan/radv_shader.h | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index 0c53695edae7c..6eb95fdd0a097 100644 ---- a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -135,6 +135,10 @@ struct radv_pipeline_key { - - bool line_smooth_enabled; - } ps; -+ -+ struct { -+ bool monolithic; -+ } rt; - }; - - struct radv_nir_compiler_options { --- - - -From 8f45cc08361f55c1e613a11198b1ae97c519406e Mon Sep 17 00:00:00 2001 -From: Konstantin Seurer -Date: Sat, 24 Jun 2023 15:46:51 +0200 -Subject: [PATCH 2/4] radv/rt: Store NIR shaders separately - -In order to compile monolithic shaders with pipeline libraries, we need -to keep the NIR around for inlining recursive stages. ---- - src/amd/vulkan/radv_pipeline_cache.c | 9 +-- - src/amd/vulkan/radv_pipeline_rt.c | 93 +++++++++++++++++++++------- - src/amd/vulkan/radv_private.h | 1 + - src/amd/vulkan/radv_rt_shader.c | 7 +-- - 4 files changed, 79 insertions(+), 31 deletions(-) - -diff --git a/src/amd/vulkan/radv_pipeline_cache.c b/src/amd/vulkan/radv_pipeline_cache.c -index 5bbbc755ae11f..7e4c6f8898130 100644 ---- a/src/amd/vulkan/radv_pipeline_cache.c -+++ b/src/amd/vulkan/radv_pipeline_cache.c -@@ -481,11 +481,12 @@ radv_ray_tracing_pipeline_cache_search(struct radv_device *device, struct vk_pip - pipeline->base.base.shaders[MESA_SHADER_INTERSECTION] = radv_shader_ref(pipeline_obj->shaders[idx++]); - - for (unsigned i = 0; i < pCreateInfo->stageCount; i++) { -- if (radv_ray_tracing_stage_is_compiled(&pipeline->stages[i])) { -+ if (radv_ray_tracing_stage_is_compiled(&pipeline->stages[i])) - pipeline->stages[i].shader = &radv_shader_ref(pipeline_obj->shaders[idx++])->base; -- } else if (is_library) { -- pipeline->stages[i].shader = radv_pipeline_cache_search_nir(device, cache, pipeline->stages[i].sha1); -- complete &= pipeline->stages[i].shader != NULL; -+ -+ if (is_library) { -+ pipeline->stages[i].nir = radv_pipeline_cache_search_nir(device, cache, pipeline->stages[i].sha1); -+ complete &= pipeline->stages[i].nir != NULL; - } - } - -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index c86ea3a508468..85afc8cb28e1b 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -263,7 +263,10 @@ radv_rt_fill_stage_info(const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, st - RADV_FROM_HANDLE(radv_pipeline, pipeline, pCreateInfo->pLibraryInfo->pLibraries[i]); - struct radv_ray_tracing_pipeline *library_pipeline = radv_pipeline_to_ray_tracing(pipeline); - for (unsigned j = 0; j < library_pipeline->stage_count; ++j) { -- stages[idx].shader = vk_pipeline_cache_object_ref(library_pipeline->stages[j].shader); -+ stages[idx].nir = vk_pipeline_cache_object_ref(library_pipeline->stages[j].nir); -+ if (library_pipeline->stages[j].shader) -+ stages[idx].shader = vk_pipeline_cache_object_ref(library_pipeline->stages[j].shader); -+ - stages[idx].stage = library_pipeline->stages[j].stage; - stages[idx].stack_size = library_pipeline->stages[j].stack_size; - memcpy(stages[idx].sha1, library_pipeline->stages[j].sha1, SHA1_DIGEST_LENGTH); -@@ -462,45 +465,83 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca - return VK_PIPELINE_COMPILE_REQUIRED; - VkResult result = VK_SUCCESS; - -- struct radv_ray_tracing_stage *stages = pipeline->stages; -+ struct radv_ray_tracing_stage *rt_stages = pipeline->stages; -+ -+ struct radv_shader_stage *stages = calloc(pCreateInfo->stageCount, sizeof(struct radv_shader_stage)); -+ if (!stages) -+ return VK_ERROR_OUT_OF_HOST_MEMORY; -+ -+ bool has_callable = false; -+ for (uint32_t i = 0; i < pipeline->stage_count; i++) { -+ if (pipeline->stages[i].stage == MESA_SHADER_CALLABLE) { -+ has_callable = true; -+ break; -+ } -+ } - - for (uint32_t idx = 0; idx < pCreateInfo->stageCount; idx++) { -+ if (rt_stages[idx].shader || rt_stages[idx].nir) -+ continue; -+ - int64_t stage_start = os_time_get_nano(); -- struct radv_shader_stage stage; -- radv_pipeline_stage_init(&pCreateInfo->pStages[idx], pipeline_layout, &stage); - -- if (stages[idx].shader) -- goto feedback; -+ struct radv_shader_stage *stage = &stages[idx]; -+ radv_pipeline_stage_init(&pCreateInfo->pStages[idx], pipeline_layout, stage); - - /* precompile the shader */ -- stage.nir = radv_parse_rt_stage(device, &pCreateInfo->pStages[idx], key, pipeline_layout); -+ stage->nir = radv_parse_rt_stage(device, &pCreateInfo->pStages[idx], key, pipeline_layout); -+ -+ /* Cases in which we need to keep around the NIR: -+ * - pipeline library: The final pipeline might be monolithic in which case it will need every NIR shader. -+ * If there is a callable shader, we can be sure that the final pipeline won't be -+ * monolithic. -+ * - non-recursive: Non-recursive shaders are inlined into the traversal shader. -+ * - monolithic: Callable shaders (chit/miss) are inlined into the raygen shader. -+ */ -+ bool compiled = radv_ray_tracing_stage_is_compiled(&rt_stages[idx]); -+ bool library = pCreateInfo->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR; -+ bool nir_needed = -+ (library && !has_callable) || !compiled || (key->rt.monolithic && rt_stages[idx].stage != MESA_SHADER_RAYGEN); -+ nir_needed &= !rt_stages[idx].nir; -+ if (nir_needed) { -+ rt_stages[idx].stack_size = stage->nir->scratch_size; -+ rt_stages[idx].nir = radv_pipeline_cache_nir_to_handle(device, cache, stage->nir, rt_stages[idx].sha1, -+ !key->optimisations_disabled); -+ } - -- if (radv_ray_tracing_stage_is_compiled(&stages[idx])) { -- uint32_t stack_size = 0; -+ stage->feedback.duration = os_time_get_nano() - stage_start; -+ } - -+ for (uint32_t idx = 0; idx < pCreateInfo->stageCount; idx++) { -+ int64_t stage_start = os_time_get_nano(); -+ struct radv_shader_stage *stage = &stages[idx]; -+ -+ /* Cases in which we need to compile the shader (raygen/callable/chit/miss): -+ * TODO: - monolithic: Extend the loop to cover imported stages and force compilation of imported raygen -+ * shaders since pipeline library shaders use separate compilation. -+ * - separate: Compile any recursive stage if wasn't compiled yet. -+ * TODO: Skip chit and miss shaders in the monolithic case. -+ */ -+ bool shader_needed = radv_ray_tracing_stage_is_compiled(&rt_stages[idx]) && !rt_stages[idx].shader; -+ if (shader_needed) { -+ uint32_t stack_size = 0; - struct radv_serialized_shader_arena_block *replay_block = - capture_replay_handles[idx].arena_va ? &capture_replay_handles[idx] : NULL; - - struct radv_shader *shader; - result = -- radv_rt_nir_to_asm(device, cache, pCreateInfo, key, pipeline, &stage, &stack_size, replay_block, &shader); -- stages[idx].stack_size = stack_size; -- stages[idx].shader = shader ? &shader->base : NULL; -- } else { -- stages[idx].stack_size = stage.nir->scratch_size; -- stages[idx].shader = -- radv_pipeline_cache_nir_to_handle(device, cache, stage.nir, stages[idx].sha1, !key->optimisations_disabled); -- } -- ralloc_free(stage.nir); -+ radv_rt_nir_to_asm(device, cache, pCreateInfo, key, pipeline, stage, &stack_size, replay_block, &shader); -+ if (result != VK_SUCCESS) -+ goto cleanup; - -- if (result != VK_SUCCESS) -- return result; -+ rt_stages[idx].stack_size = stack_size; -+ rt_stages[idx].shader = shader ? &shader->base : NULL; -+ } - -- feedback: - if (creation_feedback && creation_feedback->pipelineStageCreationFeedbackCount) { - assert(idx < creation_feedback->pipelineStageCreationFeedbackCount); -- stage.feedback.duration = os_time_get_nano() - stage_start; -- creation_feedback->pPipelineStageCreationFeedbacks[idx] = stage.feedback; -+ stage->feedback.duration += os_time_get_nano() - stage_start; -+ creation_feedback->pPipelineStageCreationFeedbacks[idx] = stage->feedback; - } - } - -@@ -527,6 +568,10 @@ radv_rt_compile_shaders(struct radv_device *device, struct vk_pipeline_cache *ca - result = radv_rt_nir_to_asm(device, cache, pCreateInfo, key, pipeline, &traversal_stage, NULL, NULL, - &pipeline->base.base.shaders[MESA_SHADER_INTERSECTION]); - -+cleanup: -+ for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) -+ ralloc_free(stages[i].nir); -+ free(stages); - return result; - } - -@@ -732,6 +777,8 @@ void - radv_destroy_ray_tracing_pipeline(struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline) - { - for (unsigned i = 0; i < pipeline->stage_count; i++) { -+ if (pipeline->stages[i].nir) -+ vk_pipeline_cache_object_unref(&device->vk, pipeline->stages[i].nir); - if (pipeline->stages[i].shader) - vk_pipeline_cache_object_unref(&device->vk, pipeline->stages[i].shader); - } -diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h -index 47e315488e9f9..7ab46738b46f0 100644 ---- a/src/amd/vulkan/radv_private.h -+++ b/src/amd/vulkan/radv_private.h -@@ -2364,6 +2364,7 @@ struct radv_ray_tracing_group { - }; - - struct radv_ray_tracing_stage { -+ struct vk_pipeline_cache_object *nir; - struct vk_pipeline_cache_object *shader; - gl_shader_stage stage; - uint32_t stack_size; -diff --git a/src/amd/vulkan/radv_rt_shader.c b/src/amd/vulkan/radv_rt_shader.c -index cc92beebc3503..3def324bcccf3 100644 ---- a/src/amd/vulkan/radv_rt_shader.c -+++ b/src/amd/vulkan/radv_rt_shader.c -@@ -1132,7 +1132,7 @@ visit_any_hit_shaders(struct radv_device *device, nir_builder *b, struct travers - if (is_dup) - continue; - -- nir_shader *nir_stage = radv_pipeline_cache_handle_to_nir(device, data->pipeline->stages[shader_id].shader); -+ nir_shader *nir_stage = radv_pipeline_cache_handle_to_nir(device, data->pipeline->stages[shader_id].nir); - assert(nir_stage); - - insert_rt_case(b, nir_stage, vars, sbt_idx, data->pipeline->groups[i].handle.any_hit_index); -@@ -1262,13 +1262,12 @@ handle_candidate_aabb(nir_builder *b, struct radv_leaf_intersection *intersectio - if (is_dup) - continue; - -- nir_shader *nir_stage = radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[shader_id].shader); -+ nir_shader *nir_stage = radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[shader_id].nir); - assert(nir_stage); - - nir_shader *any_hit_stage = NULL; - if (any_hit_shader_id != VK_SHADER_UNUSED_KHR) { -- any_hit_stage = -- radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[any_hit_shader_id].shader); -+ any_hit_stage = radv_pipeline_cache_handle_to_nir(data->device, data->pipeline->stages[any_hit_shader_id].nir); - assert(any_hit_stage); - - /* reserve stack size for any_hit before it is inlined */ --- -GitLab - - -From bba42cbc235e75a5c7ed05e55e48f71640c68ad4 Mon Sep 17 00:00:00 2001 -From: Konstantin Seurer -Date: Mon, 21 Aug 2023 13:32:53 +0200 -Subject: [PATCH 3/4] radv/rt: Add monolithic raygen lowering - -Ray traversal is inlined to allow for constant folding and avoid -spilling. ---- - src/amd/vulkan/radv_pipeline_rt.c | 11 +- - src/amd/vulkan/radv_rt_shader.c | 276 ++++++++++++++++++++++++++---- - src/amd/vulkan/radv_shader.h | 3 +- - 3 files changed, 248 insertions(+), 42 deletions(-) - -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index 85afc8cb28e1b..12562c6cf89ba 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -356,9 +356,8 @@ move_rt_instructions(nir_shader *shader) - static VkResult - radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, const struct radv_pipeline_key *pipeline_key, -- const struct radv_ray_tracing_pipeline *pipeline, struct radv_shader_stage *stage, -- uint32_t *stack_size, struct radv_serialized_shader_arena_block *replay_block, -- struct radv_shader **out_shader) -+ struct radv_ray_tracing_pipeline *pipeline, struct radv_shader_stage *stage, uint32_t *stack_size, -+ struct radv_serialized_shader_arena_block *replay_block, struct radv_shader **out_shader) - { - struct radv_shader_binary *binary; - bool keep_executable_info = radv_pipeline_capture_shaders(device, pipeline->base.base.create_flags); -@@ -384,7 +383,8 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - uint32_t num_resume_shaders = 0; - nir_shader **resume_shaders = NULL; - -- if (stage->stage != MESA_SHADER_INTERSECTION) { -+ bool monolithic_raygen = pipeline_key->rt.monolithic && stage->stage == MESA_SHADER_RAYGEN; -+ if (stage->stage != MESA_SHADER_INTERSECTION && !monolithic_raygen) { - nir_builder b = nir_builder_at(nir_after_cf_list(&nir_shader_get_entrypoint(stage->nir)->body)); - nir_rt_return_amd(&b); - -@@ -411,7 +411,8 @@ radv_rt_nir_to_asm(struct radv_device *device, struct vk_pipeline_cache *cache, - for (uint32_t i = 0; i < num_shaders; i++) { - struct radv_shader_stage temp_stage = *stage; - temp_stage.nir = shaders[i]; -- radv_nir_lower_rt_abi(temp_stage.nir, pCreateInfo, &temp_stage.args, &stage->info, stack_size, i > 0); -+ radv_nir_lower_rt_abi(temp_stage.nir, pCreateInfo, &temp_stage.args, &stage->info, stack_size, i > 0, device, -+ pipeline, pipeline_key); - radv_optimize_nir(temp_stage.nir, pipeline_key->optimisations_disabled); - radv_postprocess_nir(device, pipeline_key, &temp_stage); - -diff --git a/src/amd/vulkan/radv_rt_shader.c b/src/amd/vulkan/radv_rt_shader.c -index 3def324bcccf3..362d918597008 100644 ---- a/src/amd/vulkan/radv_rt_shader.c -+++ b/src/amd/vulkan/radv_rt_shader.c -@@ -1306,6 +1306,87 @@ handle_candidate_aabb(nir_builder *b, struct radv_leaf_intersection *intersectio - nir_pop_if(b, NULL); - } - -+static void -+visit_closest_hit_shaders(struct radv_device *device, nir_builder *b, struct radv_ray_tracing_pipeline *pipeline, -+ struct rt_variables *vars) -+{ -+ nir_def *sbt_idx = nir_load_var(b, vars->idx); -+ -+ if (!(vars->flags & VK_PIPELINE_CREATE_RAY_TRACING_NO_NULL_CLOSEST_HIT_SHADERS_BIT_KHR)) -+ nir_push_if(b, nir_ine_imm(b, sbt_idx, 0)); -+ -+ for (unsigned i = 0; i < pipeline->group_count; ++i) { -+ struct radv_ray_tracing_group *group = &pipeline->groups[i]; -+ -+ unsigned shader_id = VK_SHADER_UNUSED_KHR; -+ if (group->type != VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR) -+ shader_id = group->recursive_shader; -+ -+ if (shader_id == VK_SHADER_UNUSED_KHR) -+ continue; -+ -+ /* Avoid emitting stages with the same shaders/handles multiple times. */ -+ bool is_dup = false; -+ for (unsigned j = 0; j < i; ++j) -+ if (pipeline->groups[j].handle.closest_hit_index == pipeline->groups[i].handle.closest_hit_index) -+ is_dup = true; -+ -+ if (is_dup) -+ continue; -+ -+ nir_shader *nir_stage = radv_pipeline_cache_handle_to_nir(device, pipeline->stages[shader_id].nir); -+ assert(nir_stage); -+ -+ insert_rt_case(b, nir_stage, vars, sbt_idx, pipeline->groups[i].handle.closest_hit_index); -+ ralloc_free(nir_stage); -+ } -+ -+ if (!(vars->flags & VK_PIPELINE_CREATE_RAY_TRACING_NO_NULL_CLOSEST_HIT_SHADERS_BIT_KHR)) -+ nir_pop_if(b, NULL); -+} -+ -+static void -+visit_miss_shaders(struct radv_device *device, nir_builder *b, struct radv_ray_tracing_pipeline *pipeline, -+ struct rt_variables *vars) -+{ -+ nir_def *sbt_idx = nir_load_var(b, vars->idx); -+ -+ if (!(vars->flags & VK_PIPELINE_CREATE_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) -+ nir_push_if(b, nir_ine_imm(b, sbt_idx, 0)); -+ -+ for (unsigned i = 0; i < pipeline->group_count; ++i) { -+ struct radv_ray_tracing_group *group = &pipeline->groups[i]; -+ -+ unsigned shader_id = VK_SHADER_UNUSED_KHR; -+ if (group->type == VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR) -+ shader_id = group->recursive_shader; -+ -+ if (shader_id == VK_SHADER_UNUSED_KHR) -+ continue; -+ -+ if (pipeline->stages[shader_id].stage != MESA_SHADER_MISS) -+ continue; -+ -+ /* Avoid emitting stages with the same shaders/handles multiple times. */ -+ bool is_dup = false; -+ for (unsigned j = 0; j < i; ++j) -+ if (pipeline->groups[j].handle.general_index == pipeline->groups[i].handle.general_index) -+ is_dup = true; -+ -+ if (is_dup) -+ continue; -+ -+ nir_shader *nir_stage = radv_pipeline_cache_handle_to_nir(device, pipeline->stages[shader_id].nir); -+ assert(nir_stage); -+ -+ insert_rt_case(b, nir_stage, vars, sbt_idx, pipeline->groups[i].handle.general_index); -+ ralloc_free(nir_stage); -+ } -+ -+ if (!(vars->flags & VK_PIPELINE_CREATE_RAY_TRACING_NO_NULL_MISS_SHADERS_BIT_KHR)) -+ nir_pop_if(b, NULL); -+} -+ - static void - store_stack_entry(nir_builder *b, nir_def *index, nir_def *value, const struct radv_ray_traversal_args *args) - { -@@ -1414,25 +1495,47 @@ radv_build_traversal(struct radv_device *device, struct radv_ray_tracing_pipelin - /* Register storage for hit attributes */ - nir_variable *hit_attribs[RADV_MAX_HIT_ATTRIB_SIZE / sizeof(uint32_t)]; - -- for (uint32_t i = 0; i < ARRAY_SIZE(hit_attribs); i++) -- hit_attribs[i] = nir_local_variable_create(nir_shader_get_entrypoint(b->shader), glsl_uint_type(), "ahit_attrib"); -+ if (!key->rt.monolithic || b->shader->info.stage != MESA_SHADER_RAYGEN) { -+ for (uint32_t i = 0; i < ARRAY_SIZE(hit_attribs); i++) -+ hit_attribs[i] = -+ nir_local_variable_create(nir_shader_get_entrypoint(b->shader), glsl_uint_type(), "ahit_attrib"); - -- lower_hit_attribs(b->shader, hit_attribs, device->physical_device->rt_wave_size); -+ lower_hit_attribs(b->shader, hit_attribs, device->physical_device->rt_wave_size); -+ } - - /* Initialize follow-up shader. */ - nir_push_if(b, nir_load_var(b, trav_vars.hit)); - { -- for (int i = 0; i < ARRAY_SIZE(hit_attribs); ++i) -- nir_store_hit_attrib_amd(b, nir_load_var(b, hit_attribs[i]), .base = i); -- nir_execute_closest_hit_amd(b, nir_load_var(b, vars->idx), nir_load_var(b, vars->tmax), -- nir_load_var(b, vars->primitive_id), nir_load_var(b, vars->instance_addr), -- nir_load_var(b, vars->geometry_id_and_flags), nir_load_var(b, vars->hit_kind)); -+ if (key->rt.monolithic && b->shader->info.stage == MESA_SHADER_RAYGEN) { -+ load_sbt_entry(b, vars, nir_load_var(b, vars->idx), SBT_HIT, SBT_CLOSEST_HIT_IDX); -+ -+ nir_def *should_return = -+ nir_test_mask(b, nir_load_var(b, vars->cull_mask_and_flags), SpvRayFlagsSkipClosestHitShaderKHRMask); -+ -+ /* should_return is set if we had a hit but we won't be calling the closest hit -+ * shader and hence need to return immediately to the calling shader. */ -+ nir_push_if(b, nir_inot(b, should_return)); -+ visit_closest_hit_shaders(device, b, pipeline, vars); -+ nir_pop_if(b, NULL); -+ } else { -+ for (int i = 0; i < ARRAY_SIZE(hit_attribs); ++i) -+ nir_store_hit_attrib_amd(b, nir_load_var(b, hit_attribs[i]), .base = i); -+ nir_execute_closest_hit_amd(b, nir_load_var(b, vars->idx), nir_load_var(b, vars->tmax), -+ nir_load_var(b, vars->primitive_id), nir_load_var(b, vars->instance_addr), -+ nir_load_var(b, vars->geometry_id_and_flags), nir_load_var(b, vars->hit_kind)); -+ } - } - nir_push_else(b, NULL); - { -- /* Only load the miss shader if we actually miss. It is valid to not specify an SBT pointer -- * for miss shaders if none of the rays miss. */ -- nir_execute_miss_amd(b, nir_load_var(b, vars->tmax)); -+ if (key->rt.monolithic && b->shader->info.stage == MESA_SHADER_RAYGEN) { -+ load_sbt_entry(b, vars, nir_load_var(b, vars->miss_index), SBT_MISS, SBT_GENERAL_IDX); -+ -+ visit_miss_shaders(device, b, pipeline, vars); -+ } else { -+ /* Only load the miss shader if we actually miss. It is valid to not specify an SBT pointer -+ * for miss shaders if none of the rays miss. */ -+ nir_execute_miss_amd(b, nir_load_var(b, vars->tmax)); -+ } - } - nir_pop_if(b, NULL); - } -@@ -1477,6 +1580,98 @@ radv_build_traversal_shader(struct radv_device *device, struct radv_ray_tracing_ - return b.shader; - } - -+struct lower_rt_instruction_monolithic_state { -+ struct radv_device *device; -+ struct radv_ray_tracing_pipeline *pipeline; -+ const struct radv_pipeline_key *key; -+ const VkRayTracingPipelineCreateInfoKHR *pCreateInfo; -+ -+ struct rt_variables *vars; -+}; -+ -+static bool -+lower_rt_instruction_monolithic(nir_builder *b, nir_instr *instr, void *data) -+{ -+ if (instr->type != nir_instr_type_intrinsic) -+ return false; -+ -+ b->cursor = nir_after_instr(instr); -+ -+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); -+ -+ struct lower_rt_instruction_monolithic_state *state = data; -+ struct rt_variables *vars = state->vars; -+ -+ switch (intr->intrinsic) { -+ case nir_intrinsic_execute_callable: -+ unreachable("nir_intrinsic_execute_callable"); -+ case nir_intrinsic_trace_ray: { -+ nir_store_var(b, vars->arg, nir_iadd_imm(b, intr->src[10].ssa, -b->shader->scratch_size), 1); -+ -+ /* Per the SPIR-V extension spec we have to ignore some bits for some arguments. */ -+ nir_store_var(b, vars->accel_struct, intr->src[0].ssa, 0x1); -+ nir_store_var(b, vars->cull_mask_and_flags, nir_ior(b, nir_ishl_imm(b, intr->src[2].ssa, 24), intr->src[1].ssa), -+ 0x1); -+ nir_store_var(b, vars->sbt_offset, nir_iand_imm(b, intr->src[3].ssa, 0xf), 0x1); -+ nir_store_var(b, vars->sbt_stride, nir_iand_imm(b, intr->src[4].ssa, 0xf), 0x1); -+ nir_store_var(b, vars->miss_index, nir_iand_imm(b, intr->src[5].ssa, 0xffff), 0x1); -+ nir_store_var(b, vars->origin, intr->src[6].ssa, 0x7); -+ nir_store_var(b, vars->tmin, intr->src[7].ssa, 0x1); -+ nir_store_var(b, vars->direction, intr->src[8].ssa, 0x7); -+ nir_store_var(b, vars->tmax, intr->src[9].ssa, 0x1); -+ -+ nir_def *stack_ptr = nir_load_var(b, vars->stack_ptr); -+ nir_store_var(b, vars->stack_ptr, nir_iadd_imm(b, stack_ptr, b->shader->scratch_size), 0x1); -+ -+ radv_build_traversal(state->device, state->pipeline, state->pCreateInfo, state->key, b, vars); -+ b->shader->info.shared_size = MAX2(b->shader->info.shared_size, state->device->physical_device->rt_wave_size * -+ MAX_STACK_ENTRY_COUNT * sizeof(uint32_t)); -+ -+ nir_store_var(b, vars->stack_ptr, stack_ptr, 0x1); -+ -+ nir_instr_remove(instr); -+ return true; -+ } -+ case nir_intrinsic_rt_resume: -+ unreachable("nir_intrinsic_rt_resume"); -+ case nir_intrinsic_rt_return_amd: -+ unreachable("nir_intrinsic_rt_return_amd"); -+ case nir_intrinsic_execute_closest_hit_amd: -+ unreachable("nir_intrinsic_execute_closest_hit_amd"); -+ case nir_intrinsic_execute_miss_amd: -+ unreachable("nir_intrinsic_execute_miss_amd"); -+ default: -+ return false; -+ } -+} -+ -+static void -+lower_rt_instructions_monolithic(nir_shader *shader, struct radv_device *device, -+ struct radv_ray_tracing_pipeline *pipeline, const struct radv_pipeline_key *key, -+ const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, struct rt_variables *vars) -+{ -+ nir_function_impl *impl = nir_shader_get_entrypoint(shader); -+ -+ struct lower_rt_instruction_monolithic_state state = { -+ .device = device, -+ .pipeline = pipeline, -+ .key = key, -+ .pCreateInfo = pCreateInfo, -+ .vars = vars, -+ }; -+ -+ nir_shader_instructions_pass(shader, lower_rt_instruction_monolithic, nir_metadata_none, &state); -+ nir_index_ssa_defs(impl); -+ -+ /* Register storage for hit attributes */ -+ nir_variable *hit_attribs[RADV_MAX_HIT_ATTRIB_SIZE / sizeof(uint32_t)]; -+ -+ for (uint32_t i = 0; i < ARRAY_SIZE(hit_attribs); i++) -+ hit_attribs[i] = nir_local_variable_create(impl, glsl_uint_type(), "ahit_attrib"); -+ -+ lower_hit_attribs(shader, hit_attribs, 0); -+} -+ - /** Select the next shader based on priorities: - * - * Detect the priority of the shader stage by the lowest bits in the address (low to high): -@@ -1517,13 +1712,18 @@ select_next_shader(nir_builder *b, nir_def *shader_addr, unsigned wave_size) - void - radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, - const struct radv_shader_args *args, const struct radv_shader_info *info, uint32_t *stack_size, -- bool resume_shader) -+ bool resume_shader, struct radv_device *device, struct radv_ray_tracing_pipeline *pipeline, -+ const struct radv_pipeline_key *key) - { - nir_function_impl *impl = nir_shader_get_entrypoint(shader); - - const VkPipelineCreateFlagBits2KHR create_flags = radv_get_pipeline_create_flags(pCreateInfo); - - struct rt_variables vars = create_rt_variables(shader, create_flags); -+ -+ if (key->rt.monolithic && shader->info.stage == MESA_SHADER_RAYGEN) -+ lower_rt_instructions_monolithic(shader, device, pipeline, key, pCreateInfo, &vars); -+ - lower_rt_instructions(shader, &vars, true); - - if (stack_size) { -@@ -1585,32 +1785,36 @@ radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKH - if (shader_guard) - nir_pop_if(&b, shader_guard); - -- /* select next shader */ - b.cursor = nir_after_cf_list(&impl->body); - -- shader_addr = nir_load_var(&b, vars.shader_addr); -- nir_def *next = select_next_shader(&b, shader_addr, info->wave_size); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.uniform_shader_addr, next); -- -- /* store back all variables to registers */ -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.dynamic_callable_stack_base, nir_load_var(&b, vars.stack_ptr)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.shader_addr, shader_addr); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.shader_record, nir_load_var(&b, vars.shader_record_ptr)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.payload_offset, nir_load_var(&b, vars.arg)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.accel_struct, nir_load_var(&b, vars.accel_struct)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.cull_mask_and_flags, nir_load_var(&b, vars.cull_mask_and_flags)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.sbt_offset, nir_load_var(&b, vars.sbt_offset)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.sbt_stride, nir_load_var(&b, vars.sbt_stride)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.miss_index, nir_load_var(&b, vars.miss_index)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.ray_origin, nir_load_var(&b, vars.origin)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.ray_tmin, nir_load_var(&b, vars.tmin)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.ray_direction, nir_load_var(&b, vars.direction)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.ray_tmax, nir_load_var(&b, vars.tmax)); -- -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.primitive_id, nir_load_var(&b, vars.primitive_id)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.instance_addr, nir_load_var(&b, vars.instance_addr)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.geometry_id_and_flags, nir_load_var(&b, vars.geometry_id_and_flags)); -- ac_nir_store_arg(&b, &args->ac, args->ac.rt.hit_kind, nir_load_var(&b, vars.hit_kind)); -+ if (key->rt.monolithic && shader->info.stage == MESA_SHADER_RAYGEN) { -+ nir_terminate(&b); -+ } else { -+ /* select next shader */ -+ shader_addr = nir_load_var(&b, vars.shader_addr); -+ nir_def *next = select_next_shader(&b, shader_addr, info->wave_size); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.uniform_shader_addr, next); -+ -+ /* store back all variables to registers */ -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.dynamic_callable_stack_base, nir_load_var(&b, vars.stack_ptr)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.shader_addr, shader_addr); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.shader_record, nir_load_var(&b, vars.shader_record_ptr)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.payload_offset, nir_load_var(&b, vars.arg)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.accel_struct, nir_load_var(&b, vars.accel_struct)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.cull_mask_and_flags, nir_load_var(&b, vars.cull_mask_and_flags)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.sbt_offset, nir_load_var(&b, vars.sbt_offset)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.sbt_stride, nir_load_var(&b, vars.sbt_stride)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.miss_index, nir_load_var(&b, vars.miss_index)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.ray_origin, nir_load_var(&b, vars.origin)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.ray_tmin, nir_load_var(&b, vars.tmin)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.ray_direction, nir_load_var(&b, vars.direction)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.ray_tmax, nir_load_var(&b, vars.tmax)); -+ -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.primitive_id, nir_load_var(&b, vars.primitive_id)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.instance_addr, nir_load_var(&b, vars.instance_addr)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.geometry_id_and_flags, nir_load_var(&b, vars.geometry_id_and_flags)); -+ ac_nir_store_arg(&b, &args->ac, args->ac.rt.hit_kind, nir_load_var(&b, vars.hit_kind)); -+ } - - nir_metadata_preserve(impl, nir_metadata_none); - -diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h -index 6eb95fdd0a097..969f9a56ab7f5 100644 ---- a/src/amd/vulkan/radv_shader.h -+++ b/src/amd/vulkan/radv_shader.h -@@ -635,7 +635,8 @@ nir_shader *radv_parse_rt_stage(struct radv_device *device, const VkPipelineShad - - void radv_nir_lower_rt_abi(nir_shader *shader, const VkRayTracingPipelineCreateInfoKHR *pCreateInfo, - const struct radv_shader_args *args, const struct radv_shader_info *info, -- uint32_t *stack_size, bool resume_shader); -+ uint32_t *stack_size, bool resume_shader, struct radv_device *device, -+ struct radv_ray_tracing_pipeline *pipeline, const struct radv_pipeline_key *key); - - struct radv_shader_stage; - --- - - -From 5c9dd4efece8f352d00d1310b556928cccb239c8 Mon Sep 17 00:00:00 2001 -From: Konstantin Seurer -Date: Sat, 24 Jun 2023 16:11:16 +0200 -Subject: [PATCH 4/4] radv/rt: Use monolithic pipelines - -Only available for non-recursive pipelines that do not have callables. ---- - src/amd/vulkan/radv_pipeline_rt.c | 11 +++++++++++ - 1 file changed, 11 insertions(+) - -diff --git a/src/amd/vulkan/radv_pipeline_rt.c b/src/amd/vulkan/radv_pipeline_rt.c -index 12562c6cf89ba..97449b9cbafac 100644 ---- a/src/amd/vulkan/radv_pipeline_rt.c -+++ b/src/amd/vulkan/radv_pipeline_rt.c -@@ -103,6 +103,17 @@ radv_generate_rt_pipeline_key(const struct radv_device *device, const struct rad - } - } - -+ if (!(pCreateInfo->flags & VK_PIPELINE_CREATE_LIBRARY_BIT_KHR)) { -+ key.rt.monolithic = pCreateInfo->maxPipelineRayRecursionDepth <= 1; -+ -+ for (uint32_t i = 0; i < pipeline->stage_count; i++) { -+ if (pipeline->stages[i].stage == MESA_SHADER_CALLABLE) { -+ key.rt.monolithic = false; -+ break; -+ } -+ } -+ } -+ - return key; - } - --- diff --git a/debian/patches/24720.patch b/debian/patches/24720.patch deleted file mode 100644 index c27c928..0000000 --- a/debian/patches/24720.patch +++ /dev/null @@ -1,687 +0,0 @@ -From 42be7a3c53698a165e9612619f6a34a65bbf91ff Mon Sep 17 00:00:00 2001 -From: Konstantin Seurer -Date: Wed, 16 Aug 2023 10:37:56 +0200 -Subject: [PATCH 1/3] radv: Remove dead radix_sort_vk_get_memory_requirements - call - ---- - src/amd/vulkan/radv_acceleration_structure.c | 4 ---- - 1 file changed, 4 deletions(-) - -diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c -index ece47b1230c88..5c5eb16e61f9d 100644 ---- a/src/amd/vulkan/radv_acceleration_structure.c -+++ b/src/amd/vulkan/radv_acceleration_structure.c -@@ -745,10 +745,6 @@ morton_sort(VkCommandBuffer commandBuffer, uint32_t infoCount, - { - RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); - for (uint32_t i = 0; i < infoCount; ++i) { -- struct radix_sort_vk_memory_requirements requirements; -- radix_sort_vk_get_memory_requirements(cmd_buffer->device->meta_state.accel_struct_build.radix_sort, -- bvh_states[i].node_count, &requirements); -- - struct radix_sort_vk_sort_devaddr_info info = cmd_buffer->device->meta_state.accel_struct_build.radix_sort_info; - info.count = bvh_states[i].node_count; - --- -GitLab - - -From faa17e5322ea66cd74e37aab48316059a05738d6 Mon Sep 17 00:00:00 2001 -From: Konstantin Seurer -Date: Wed, 16 Aug 2023 11:09:25 +0200 -Subject: [PATCH 2/3] radv/radix_sort: Vendor the radix sort dispatch code - -This needs to be done so we can optimize it for occpuancy when building -multiple acceleration structures in parallel. Changes to the original -code: - -- Change // to /* */ -- clang-format -- Replace vkCmd calls with calls to the driver entrypoints -- Add a light weight info struct -- Use radv_fill_buffer directly ---- - src/amd/vulkan/radv_acceleration_structure.c | 218 ++++++++++++++++--- - src/amd/vulkan/radv_private.h | 1 - - 2 files changed, 187 insertions(+), 32 deletions(-) - -diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c -index 5c5eb16e61f9d..9866de2e594a8 100644 ---- a/src/amd/vulkan/radv_acceleration_structure.c -+++ b/src/amd/vulkan/radv_acceleration_structure.c -@@ -27,7 +27,9 @@ - #include "nir_builder.h" - #include "radv_cs.h" - -+#include "radix_sort/common/vk/barrier.h" - #include "radix_sort/radv_radix_sort.h" -+#include "radix_sort/shaders/push.h" - - #include "bvh/build_interface.h" - #include "bvh/bvh.h" -@@ -76,6 +78,7 @@ static const uint32_t header_spv[] = { - }; - - #define KEY_ID_PAIR_SIZE 8 -+#define MORTON_BIT_SIZE 24 - - enum internal_build_type { - INTERNAL_BUILD_TYPE_LBVH, -@@ -382,17 +385,6 @@ cleanup: - return result; - } - --static void --radix_sort_fill_buffer(VkCommandBuffer commandBuffer, radix_sort_vk_buffer_info_t const *buffer_info, -- VkDeviceSize offset, VkDeviceSize size, uint32_t data) --{ -- RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); -- -- assert(size != VK_WHOLE_SIZE); -- -- radv_fill_buffer(cmd_buffer, NULL, NULL, buffer_info->devaddr + buffer_info->offset + offset, size, data); --} -- - VkResult - radv_device_init_null_accel_struct(struct radv_device *device) - { -@@ -576,12 +568,6 @@ radv_device_init_accel_struct_build_state(struct radv_device *device) - - device->meta_state.accel_struct_build.radix_sort = - radv_create_radix_sort_u64(radv_device_to_handle(device), &device->meta_state.alloc, device->meta_state.cache); -- -- struct radix_sort_vk_sort_devaddr_info *radix_sort_info = &device->meta_state.accel_struct_build.radix_sort_info; -- radix_sort_info->ext = NULL; -- radix_sort_info->key_bits = 24; -- radix_sort_info->fill_buffer = radix_sort_fill_buffer; -- - exit: - mtx_unlock(&device->meta_state.mtx); - return result; -@@ -743,28 +729,198 @@ morton_sort(VkCommandBuffer commandBuffer, uint32_t infoCount, - const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states, - enum radv_cmd_flush_bits flush_bits) - { -+ /* Copyright 2019 The Fuchsia Authors. */ - RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer); -+ -+ radix_sort_vk_t *rs = cmd_buffer->device->meta_state.accel_struct_build.radix_sort; -+ - for (uint32_t i = 0; i < infoCount; ++i) { -- struct radix_sort_vk_sort_devaddr_info info = cmd_buffer->device->meta_state.accel_struct_build.radix_sort_info; -- info.count = bvh_states[i].node_count; -+ uint32_t count = bvh_states[i].node_count; -+ uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; -+ uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; -+ uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; -+ -+ /* Anything to do? */ -+ if (!count) { -+ bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[0]; -+ continue; -+ } -+ -+ /* -+ * OVERVIEW -+ * -+ * 1. Pad the keyvals in `scatter_even`. -+ * 2. Zero the `histograms` and `partitions`. -+ * --- BARRIER --- -+ * 3. HISTOGRAM is dispatched before PREFIX. -+ * --- BARRIER --- -+ * 4. PREFIX is dispatched before the first SCATTER. -+ * --- BARRIER --- -+ * 5. One or more SCATTER dispatches. -+ * -+ * Note that the `partitions` buffer can be zeroed anytime before the first -+ * scatter. -+ */ -+ -+ /* How many passes? */ -+ uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t); -+ uint32_t keyval_bits = keyval_bytes * 8; -+ uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits); -+ uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2; -+ -+ bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1]; -+ -+ /* -+ * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS -+ * -+ * Pad fractional blocks with max-valued keyvals. -+ * -+ * Zero the histograms and partitions buffer. -+ * -+ * This assumes the partitions follow the histograms. -+ */ -+ -+ /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */ -+ -+ /* How many scatter blocks? */ -+ uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2; -+ uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows; -+ uint32_t scatter_blocks = (count + scatter_block_kvs - 1) / scatter_block_kvs; -+ uint32_t count_ru_scatter = scatter_blocks * scatter_block_kvs; -+ -+ /* -+ * How many histogram blocks? -+ * -+ * Note that it's OK to have more max-valued digits counted by the histogram -+ * than sorted by the scatters because the sort is stable. -+ */ -+ uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2; -+ uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows; -+ uint32_t histo_blocks = (count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs; -+ uint32_t count_ru_histo = histo_blocks * histo_block_kvs; -+ -+ /* Fill with max values */ -+ if (count_ru_histo > count) { -+ radv_fill_buffer(cmd_buffer, NULL, NULL, keyvals_even_addr + count * keyval_bytes, -+ (count_ru_histo - count) * keyval_bytes, 0xFFFFFFFF); -+ } -+ -+ /* -+ * Zero histograms and invalidate partitions. -+ * -+ * Note that the partition invalidation only needs to be performed once -+ * because the even/odd scatter dispatches rely on the the previous pass to -+ * leave the partitions in an invalid state. -+ * -+ * Note that the last workgroup doesn't read/write a partition so it doesn't -+ * need to be initialized. -+ */ -+ uint32_t histo_partition_count = passes + scatter_blocks - 1; -+ uint32_t pass_idx = (keyval_bytes - passes); -+ -+ uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); -+ -+ radv_fill_buffer(cmd_buffer, NULL, NULL, internal_addr + rs->internal.histograms.offset + fill_base, -+ histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)), 0); -+ -+ /* -+ * Pipeline: HISTOGRAM -+ * -+ * TODO(allanmac): All subgroups should try to process approximately the same -+ * number of blocks in order to minimize tail effects. This was implemented -+ * and reverted but should be reimplemented and benchmarked later. -+ */ -+ vk_barrier_transfer_w_to_compute_r(commandBuffer); -+ -+ uint64_t devaddr_histograms = internal_addr + rs->internal.histograms.offset; -+ -+ /* Dispatch histogram */ -+ struct rs_push_histogram push_histogram = { -+ .devaddr_histograms = devaddr_histograms, -+ .devaddr_keyvals = keyvals_even_addr, -+ .passes = passes, -+ }; -+ -+ radv_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, -+ sizeof(push_histogram), &push_histogram); -+ -+ radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); -+ -+ vk_common_CmdDispatch(commandBuffer, histo_blocks, 1, 1); - -- info.keyvals_even.buffer = VK_NULL_HANDLE; -- info.keyvals_even.offset = 0; -- info.keyvals_even.devaddr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; -+ /* -+ * Pipeline: PREFIX -+ * -+ * Launch one workgroup per pass. -+ */ -+ vk_barrier_compute_w_to_compute_r(commandBuffer); -+ -+ struct rs_push_prefix push_prefix = { -+ .devaddr_histograms = devaddr_histograms, -+ }; -+ -+ radv_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, -+ sizeof(push_prefix), &push_prefix); - -- info.keyvals_odd = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; -+ radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); - -- info.internal.buffer = VK_NULL_HANDLE; -- info.internal.offset = 0; -- info.internal.devaddr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; -+ vk_common_CmdDispatch(commandBuffer, passes, 1, 1); - -- VkDeviceAddress result_addr; -- radix_sort_vk_sort_devaddr(cmd_buffer->device->meta_state.accel_struct_build.radix_sort, &info, -- radv_device_to_handle(cmd_buffer->device), commandBuffer, &result_addr); -+ /* Pipeline: SCATTER */ -+ vk_barrier_compute_w_to_compute_r(commandBuffer); -+ -+ uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); -+ uint64_t devaddr_partitions = internal_addr + rs->internal.partitions.offset; -+ -+ struct rs_push_scatter push_scatter = { -+ .devaddr_keyvals_even = keyvals_even_addr, -+ .devaddr_keyvals_odd = keyvals_odd_addr, -+ .devaddr_partitions = devaddr_partitions, -+ .devaddr_histograms = devaddr_histograms + histogram_offset, -+ .pass_offset = (pass_idx & 3) * RS_RADIX_LOG2, -+ }; - -- assert(result_addr == info.keyvals_even.devaddr || result_addr == info.keyvals_odd); -+ { -+ uint32_t pass_dword = pass_idx / 4; - -- bvh_states[i].scratch_offset = (uint32_t)(result_addr - pInfos[i].scratchData.deviceAddress); -+ radv_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.scatter[pass_dword].even, -+ VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_scatter), &push_scatter); -+ -+ radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, -+ rs->pipelines.named.scatter[pass_dword].even); -+ } -+ -+ bool is_even = true; -+ -+ while (true) { -+ vk_common_CmdDispatch(commandBuffer, scatter_blocks, 1, 1); -+ -+ /* Continue? */ -+ if (++pass_idx >= keyval_bytes) -+ break; -+ -+ vk_barrier_compute_w_to_compute_r(commandBuffer); -+ -+ is_even ^= true; -+ push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); -+ push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; -+ -+ uint32_t pass_dword = pass_idx / 4; -+ -+ /* Update push constants that changed */ -+ VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even -+ : rs->pipeline_layouts.named.scatter[pass_dword].odd; -+ radv_CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, -+ offsetof(struct rs_push_scatter, devaddr_histograms), -+ sizeof(push_scatter.devaddr_histograms) + sizeof(push_scatter.pass_offset), -+ &push_scatter.devaddr_histograms); -+ -+ /* Bind new pipeline */ -+ VkPipeline p = -+ is_even ? rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; -+ -+ radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); -+ } - } - - cmd_buffer->state.flush_bits |= flush_bits; -diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h -index 1ea606c2ca111..2b0b9189f33f6 100644 ---- a/src/amd/vulkan/radv_private.h -+++ b/src/amd/vulkan/radv_private.h -@@ -731,7 +731,6 @@ struct radv_meta_state { - VkPipeline copy_pipeline; - - struct radix_sort_vk *radix_sort; -- struct radix_sort_vk_sort_devaddr_info radix_sort_info; - - struct { - VkBuffer buffer; --- -GitLab - - -From 04c77145628fe9956ae44a25ba7b1dfe401a9de8 Mon Sep 17 00:00:00 2001 -From: Konstantin Seurer -Date: Wed, 16 Aug 2023 11:50:18 +0200 -Subject: [PATCH 3/3] radv: Perform multiple sorts in parallel - -This was the last part that didn't scale with multiple infos. Reducing -the amount of barriers in this case improves DOOM Eternal performance by -50%. (Running with low resolution) ---- - src/amd/vulkan/radv_acceleration_structure.c | 264 ++++++++++--------- - 1 file changed, 143 insertions(+), 121 deletions(-) - -diff --git a/src/amd/vulkan/radv_acceleration_structure.c b/src/amd/vulkan/radv_acceleration_structure.c -index 9866de2e594a8..85852453fcb29 100644 ---- a/src/amd/vulkan/radv_acceleration_structure.c -+++ b/src/amd/vulkan/radv_acceleration_structure.c -@@ -598,6 +598,13 @@ struct bvh_state { - struct acceleration_structure_layout accel_struct; - struct scratch_layout scratch; - struct build_config config; -+ -+ /* Radix sort state */ -+ uint32_t scatter_blocks; -+ uint32_t count_ru_scatter; -+ uint32_t histo_blocks; -+ uint32_t count_ru_histo; -+ struct rs_push_scatter push_scatter; - }; - - static uint32_t -@@ -734,75 +741,79 @@ morton_sort(VkCommandBuffer commandBuffer, uint32_t infoCount, - - radix_sort_vk_t *rs = cmd_buffer->device->meta_state.accel_struct_build.radix_sort; - -- for (uint32_t i = 0; i < infoCount; ++i) { -- uint32_t count = bvh_states[i].node_count; -- uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; -- uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; -- uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; -+ /* -+ * OVERVIEW -+ * -+ * 1. Pad the keyvals in `scatter_even`. -+ * 2. Zero the `histograms` and `partitions`. -+ * --- BARRIER --- -+ * 3. HISTOGRAM is dispatched before PREFIX. -+ * --- BARRIER --- -+ * 4. PREFIX is dispatched before the first SCATTER. -+ * --- BARRIER --- -+ * 5. One or more SCATTER dispatches. -+ * -+ * Note that the `partitions` buffer can be zeroed anytime before the first -+ * scatter. -+ */ -+ -+ /* How many passes? */ -+ uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t); -+ uint32_t keyval_bits = keyval_bytes * 8; -+ uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits); -+ uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2; - -- /* Anything to do? */ -- if (!count) { -+ for (uint32_t i = 0; i < infoCount; ++i) { -+ if (bvh_states[i].node_count) -+ bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1]; -+ else - bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[0]; -- continue; -- } -- -- /* -- * OVERVIEW -- * -- * 1. Pad the keyvals in `scatter_even`. -- * 2. Zero the `histograms` and `partitions`. -- * --- BARRIER --- -- * 3. HISTOGRAM is dispatched before PREFIX. -- * --- BARRIER --- -- * 4. PREFIX is dispatched before the first SCATTER. -- * --- BARRIER --- -- * 5. One or more SCATTER dispatches. -- * -- * Note that the `partitions` buffer can be zeroed anytime before the first -- * scatter. -- */ -- -- /* How many passes? */ -- uint32_t keyval_bytes = rs->config.keyval_dwords * (uint32_t)sizeof(uint32_t); -- uint32_t keyval_bits = keyval_bytes * 8; -- uint32_t key_bits = MIN2(MORTON_BIT_SIZE, keyval_bits); -- uint32_t passes = (key_bits + RS_RADIX_LOG2 - 1) / RS_RADIX_LOG2; -+ } - -- bvh_states[i].scratch_offset = bvh_states[i].scratch.sort_buffer_offset[passes & 1]; -+ /* -+ * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS -+ * -+ * Pad fractional blocks with max-valued keyvals. -+ * -+ * Zero the histograms and partitions buffer. -+ * -+ * This assumes the partitions follow the histograms. -+ */ -+ -+ /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */ -+ -+ /* How many scatter blocks? */ -+ uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2; -+ uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows; -+ -+ /* -+ * How many histogram blocks? -+ * -+ * Note that it's OK to have more max-valued digits counted by the histogram -+ * than sorted by the scatters because the sort is stable. -+ */ -+ uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2; -+ uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows; -+ -+ uint32_t pass_idx = (keyval_bytes - passes); - -- /* -- * PAD KEYVALS AND ZERO HISTOGRAM/PARTITIONS -- * -- * Pad fractional blocks with max-valued keyvals. -- * -- * Zero the histograms and partitions buffer. -- * -- * This assumes the partitions follow the histograms. -- */ -+ for (uint32_t i = 0; i < infoCount; ++i) { -+ if (!bvh_states[i].node_count) -+ continue; - -- /* FIXME(allanmac): Consider precomputing some of these values and hang them off `rs`. */ -+ uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; -+ uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - -- /* How many scatter blocks? */ -- uint32_t scatter_wg_size = 1 << rs->config.scatter.workgroup_size_log2; -- uint32_t scatter_block_kvs = scatter_wg_size * rs->config.scatter.block_rows; -- uint32_t scatter_blocks = (count + scatter_block_kvs - 1) / scatter_block_kvs; -- uint32_t count_ru_scatter = scatter_blocks * scatter_block_kvs; -+ bvh_states[i].scatter_blocks = (bvh_states[i].node_count + scatter_block_kvs - 1) / scatter_block_kvs; -+ bvh_states[i].count_ru_scatter = bvh_states[i].scatter_blocks * scatter_block_kvs; - -- /* -- * How many histogram blocks? -- * -- * Note that it's OK to have more max-valued digits counted by the histogram -- * than sorted by the scatters because the sort is stable. -- */ -- uint32_t histo_wg_size = 1 << rs->config.histogram.workgroup_size_log2; -- uint32_t histo_block_kvs = histo_wg_size * rs->config.histogram.block_rows; -- uint32_t histo_blocks = (count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs; -- uint32_t count_ru_histo = histo_blocks * histo_block_kvs; -+ bvh_states[i].histo_blocks = (bvh_states[i].count_ru_scatter + histo_block_kvs - 1) / histo_block_kvs; -+ bvh_states[i].count_ru_histo = bvh_states[i].histo_blocks * histo_block_kvs; - - /* Fill with max values */ -- if (count_ru_histo > count) { -- radv_fill_buffer(cmd_buffer, NULL, NULL, keyvals_even_addr + count * keyval_bytes, -- (count_ru_histo - count) * keyval_bytes, 0xFFFFFFFF); -+ if (bvh_states[i].count_ru_histo > bvh_states[i].node_count) { -+ radv_fill_buffer(cmd_buffer, NULL, NULL, keyvals_even_addr + bvh_states[i].node_count * keyval_bytes, -+ (bvh_states[i].count_ru_histo - bvh_states[i].node_count) * keyval_bytes, 0xFFFFFFFF); - } - - /* -@@ -815,28 +826,35 @@ morton_sort(VkCommandBuffer commandBuffer, uint32_t infoCount, - * Note that the last workgroup doesn't read/write a partition so it doesn't - * need to be initialized. - */ -- uint32_t histo_partition_count = passes + scatter_blocks - 1; -- uint32_t pass_idx = (keyval_bytes - passes); -+ uint32_t histo_partition_count = passes + bvh_states[i].scatter_blocks - 1; - - uint32_t fill_base = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); - - radv_fill_buffer(cmd_buffer, NULL, NULL, internal_addr + rs->internal.histograms.offset + fill_base, - histo_partition_count * (RS_RADIX_SIZE * sizeof(uint32_t)), 0); -+ } - -- /* -- * Pipeline: HISTOGRAM -- * -- * TODO(allanmac): All subgroups should try to process approximately the same -- * number of blocks in order to minimize tail effects. This was implemented -- * and reverted but should be reimplemented and benchmarked later. -- */ -- vk_barrier_transfer_w_to_compute_r(commandBuffer); -+ /* -+ * Pipeline: HISTOGRAM -+ * -+ * TODO(allanmac): All subgroups should try to process approximately the same -+ * number of blocks in order to minimize tail effects. This was implemented -+ * and reverted but should be reimplemented and benchmarked later. -+ */ -+ vk_barrier_transfer_w_to_compute_r(commandBuffer); -+ -+ radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); -+ -+ for (uint32_t i = 0; i < infoCount; ++i) { -+ if (!bvh_states[i].node_count) -+ continue; - -- uint64_t devaddr_histograms = internal_addr + rs->internal.histograms.offset; -+ uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; -+ uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - /* Dispatch histogram */ - struct rs_push_histogram push_histogram = { -- .devaddr_histograms = devaddr_histograms, -+ .devaddr_histograms = internal_addr + rs->internal.histograms.offset, - .devaddr_keyvals = keyvals_even_addr, - .passes = passes, - }; -@@ -844,83 +862,87 @@ morton_sort(VkCommandBuffer commandBuffer, uint32_t infoCount, - radv_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.histogram, VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(push_histogram), &push_histogram); - -- radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.histogram); -+ vk_common_CmdDispatch(commandBuffer, bvh_states[i].histo_blocks, 1, 1); -+ } - -- vk_common_CmdDispatch(commandBuffer, histo_blocks, 1, 1); -+ /* -+ * Pipeline: PREFIX -+ * -+ * Launch one workgroup per pass. -+ */ -+ vk_barrier_compute_w_to_compute_r(commandBuffer); - -- /* -- * Pipeline: PREFIX -- * -- * Launch one workgroup per pass. -- */ -- vk_barrier_compute_w_to_compute_r(commandBuffer); -+ radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); -+ -+ for (uint32_t i = 0; i < infoCount; ++i) { -+ if (!bvh_states[i].node_count) -+ continue; -+ -+ uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; - - struct rs_push_prefix push_prefix = { -- .devaddr_histograms = devaddr_histograms, -+ .devaddr_histograms = internal_addr + rs->internal.histograms.offset, - }; - - radv_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.prefix, VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(push_prefix), &push_prefix); - -- radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, rs->pipelines.named.prefix); -- - vk_common_CmdDispatch(commandBuffer, passes, 1, 1); -+ } - -- /* Pipeline: SCATTER */ -- vk_barrier_compute_w_to_compute_r(commandBuffer); -+ /* Pipeline: SCATTER */ -+ vk_barrier_compute_w_to_compute_r(commandBuffer); - -- uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); -- uint64_t devaddr_partitions = internal_addr + rs->internal.partitions.offset; -+ uint32_t histogram_offset = pass_idx * (RS_RADIX_SIZE * sizeof(uint32_t)); - -- struct rs_push_scatter push_scatter = { -+ for (uint32_t i = 0; i < infoCount; i++) { -+ uint64_t keyvals_even_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[0]; -+ uint64_t keyvals_odd_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_buffer_offset[1]; -+ uint64_t internal_addr = pInfos[i].scratchData.deviceAddress + bvh_states[i].scratch.sort_internal_offset; -+ -+ bvh_states[i].push_scatter = (struct rs_push_scatter){ - .devaddr_keyvals_even = keyvals_even_addr, - .devaddr_keyvals_odd = keyvals_odd_addr, -- .devaddr_partitions = devaddr_partitions, -- .devaddr_histograms = devaddr_histograms + histogram_offset, -- .pass_offset = (pass_idx & 3) * RS_RADIX_LOG2, -+ .devaddr_partitions = internal_addr + rs->internal.partitions.offset, -+ .devaddr_histograms = internal_addr + rs->internal.histograms.offset + histogram_offset, - }; -+ } - -- { -- uint32_t pass_dword = pass_idx / 4; -+ bool is_even = true; - -- radv_CmdPushConstants(commandBuffer, rs->pipeline_layouts.named.scatter[pass_dword].even, -- VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_scatter), &push_scatter); -+ while (true) { -+ uint32_t pass_dword = pass_idx / 4; - -- radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, -- rs->pipelines.named.scatter[pass_dword].even); -- } -+ /* Bind new pipeline */ -+ VkPipeline p = -+ is_even ? rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; -+ radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); - -- bool is_even = true; -+ /* Update push constants that changed */ -+ VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even // -+ : rs->pipeline_layouts.named.scatter[pass_dword].odd; - -- while (true) { -- vk_common_CmdDispatch(commandBuffer, scatter_blocks, 1, 1); -+ for (uint32_t i = 0; i < infoCount; i++) { -+ if (!bvh_states[i].node_count) -+ continue; - -- /* Continue? */ -- if (++pass_idx >= keyval_bytes) -- break; -+ bvh_states[i].push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; - -- vk_barrier_compute_w_to_compute_r(commandBuffer); -+ radv_CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(struct rs_push_scatter), -+ &bvh_states[i].push_scatter); - -- is_even ^= true; -- push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); -- push_scatter.pass_offset = (pass_idx & 3) * RS_RADIX_LOG2; -+ vk_common_CmdDispatch(commandBuffer, bvh_states[i].scatter_blocks, 1, 1); - -- uint32_t pass_dword = pass_idx / 4; -+ bvh_states[i].push_scatter.devaddr_histograms += (RS_RADIX_SIZE * sizeof(uint32_t)); -+ } - -- /* Update push constants that changed */ -- VkPipelineLayout pl = is_even ? rs->pipeline_layouts.named.scatter[pass_dword].even -- : rs->pipeline_layouts.named.scatter[pass_dword].odd; -- radv_CmdPushConstants(commandBuffer, pl, VK_SHADER_STAGE_COMPUTE_BIT, -- offsetof(struct rs_push_scatter, devaddr_histograms), -- sizeof(push_scatter.devaddr_histograms) + sizeof(push_scatter.pass_offset), -- &push_scatter.devaddr_histograms); -+ /* Continue? */ -+ if (++pass_idx >= keyval_bytes) -+ break; - -- /* Bind new pipeline */ -- VkPipeline p = -- is_even ? rs->pipelines.named.scatter[pass_dword].even : rs->pipelines.named.scatter[pass_dword].odd; -+ vk_barrier_compute_w_to_compute_r(commandBuffer); - -- radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, p); -- } -+ is_even ^= true; - } - - cmd_buffer->state.flush_bits |= flush_bits; --- -GitLab diff --git a/debian/patches/24789.patch b/debian/patches/24789.patch deleted file mode 100644 index 8b01f93..0000000 --- a/debian/patches/24789.patch +++ /dev/null @@ -1,34 +0,0 @@ -From 87f95fa7f24415f51391f128adf7f048358be226 Mon Sep 17 00:00:00 2001 -From: Friedrich Vock -Date: Sat, 19 Aug 2023 11:00:45 +0200 -Subject: [PATCH] nir/load_store_vectorize: Handle intrinsics with constant - base - -This includes nir_load_stack and nir_store_stack, which are vectorized -in nir_lower_shader_calls. If not adjusted, we end up loading from -the wrong base. - -Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9596 -Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/9587 -Cc: mesa-stable -Part-of: ---- - src/compiler/nir/nir_opt_load_store_vectorize.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/src/compiler/nir/nir_opt_load_store_vectorize.c b/src/compiler/nir/nir_opt_load_store_vectorize.c -index 73e6ff6d8878..4bea8fbea6ff 100644 ---- a/src/compiler/nir/nir_opt_load_store_vectorize.c -+++ b/src/compiler/nir/nir_opt_load_store_vectorize.c -@@ -756,6 +756,8 @@ vectorize_loads(nir_builder *b, struct vectorize_ctx *ctx, - - nir_intrinsic_set_range_base(first->intrin, low_base); - nir_intrinsic_set_range(first->intrin, MAX2(low_end, high_end) - low_base); -+ } else if (nir_intrinsic_has_base(first->intrin) && info->base_src == -1 && info->deref_src == -1) { -+ nir_intrinsic_set_base(first->intrin, nir_intrinsic_base(low->intrin)); - } - - first->key = low->key; --- -GitLab - diff --git a/debian/patches/path_max.diff b/debian/patches/path_max.diff deleted file mode 100644 index 9832c73..0000000 --- a/debian/patches/path_max.diff +++ /dev/null @@ -1,39 +0,0 @@ ---- a/src/util/tests/cache_test.cpp -+++ b/src/util/tests/cache_test.cpp -@@ -82,8 +82,8 @@ check_directories_created(void *mem_ctx, - { - bool sub_dirs_created = false; - -- char buf[PATH_MAX]; -- if (getcwd(buf, PATH_MAX)) { -+ char *buf = getcwd(NULL, 0); -+ if (buf) { - char *full_path = ralloc_asprintf(mem_ctx, "%s%s", buf, ++cache_dir); - struct stat sb; - if (stat(full_path, &sb) != -1 && S_ISDIR(sb.st_mode)) ---- a/src/util/tests/process_test.c -+++ b/src/util/tests/process_test.c -@@ -36,6 +36,10 @@ - #define PATH_MAX MAX_PATH - #endif - -+#if !defined(PATH_MAX) && defined(__GNU__) -+#define PATH_MAX (4096) -+#endif -+ - static bool error = false; - - static void ---- a/src/gallium/auxiliary/pipe-loader/pipe_loader.c -+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader.c -@@ -42,6 +42,10 @@ - #define PATH_MAX _MAX_PATH - #endif - -+#if !defined(PATH_MAX) && defined(__GNU__) -+#define PATH_MAX (4096) -+#endif -+ - #define MODULE_PREFIX "pipe_" - - static int (*backends[])(struct pipe_loader_device **, int) = { diff --git a/debian/patches/revert-af1ee8e01044.diff b/debian/patches/revert-af1ee8e01044.diff deleted file mode 100644 index 6fd03d7..0000000 --- a/debian/patches/revert-af1ee8e01044.diff +++ /dev/null @@ -1,153 +0,0 @@ -From 0c3587a2f8e1b6cfadf9a4bbb6ae4b2c3e14a651 Mon Sep 17 00:00:00 2001 -From: Leandro Ribeiro -Date: Sun, 10 Apr 2022 22:54:36 -0300 -Subject: [PATCH] Revert "egl/wayland: deprecate drm_handle_format() and - drm_handle_capabilities()" - -Commit af1ee8e010441f8f2ed8c77065b159652a4ac9fe dropped support to -wl_drm, as we thought that most compositors from active projects were -already supporting zwp_linux_dmabuf_v1. - -But that's not true, so revert this commit in order to give these -projects a longer transition period. - -Note that we didn't add back the support to GEM name API, and that was -on purpose. - -Signed-off-by: Leandro Ribeiro ---- - src/egl/drivers/dri2/egl_dri2.h | 1 + - src/egl/drivers/dri2/platform_wayland.c | 59 +++++++++++++++++++------ - 2 files changed, 47 insertions(+), 13 deletions(-) - -diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h -index 89158993efdd..1c840a966b3c 100644 ---- a/src/egl/drivers/dri2/egl_dri2.h -+++ b/src/egl/drivers/dri2/egl_dri2.h -@@ -284,6 +284,7 @@ struct dri2_egl_display - struct zwp_linux_dmabuf_feedback_v1 *wl_dmabuf_feedback; - struct dmabuf_feedback_format_table format_table; - bool authenticated; -+ uint32_t capabilities; - char *device_name; - #endif - -diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c -index e9ecf6d1e716..19fad8bfa08e 100644 ---- a/src/egl/drivers/dri2/platform_wayland.c -+++ b/src/egl/drivers/dri2/platform_wayland.c -@@ -1344,7 +1344,7 @@ create_wl_buffer(struct dri2_egl_display *dri2_dpy, - struct dri2_egl_surface *dri2_surf, - __DRIimage *image) - { -- struct wl_buffer *ret; -+ struct wl_buffer *ret = NULL; - EGLBoolean query; - int width, height, fourcc, num_planes; - uint64_t modifier = DRM_FORMAT_MOD_INVALID; -@@ -1448,11 +1448,28 @@ create_wl_buffer(struct dri2_egl_display *dri2_dpy, - ret = zwp_linux_buffer_params_v1_create_immed(params, width, height, - fourcc, 0); - zwp_linux_buffer_params_v1_destroy(params); -+ } else { -+ struct wl_drm *wl_drm = -+ dri2_surf ? dri2_surf->wl_drm_wrapper : dri2_dpy->wl_drm; -+ int fd, stride; -+ -+ if (num_planes > 1) -+ return NULL; -+ -+ query = dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &fd); -+ query &= dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &stride); -+ if (!query) { -+ if (fd >= 0) -+ close(fd); -+ return NULL; -+ } - -- return ret; -+ ret = wl_drm_create_prime_buffer(wl_drm, fd, width, height, fourcc, 0, -+ stride, 0, 0, 0, 0); -+ close(fd); - } - -- return NULL; -+ return ret; - } - - static EGLBoolean -@@ -1699,16 +1716,21 @@ drm_handle_device(void *data, struct wl_drm *drm, const char *device) - static void - drm_handle_format(void *data, struct wl_drm *drm, uint32_t format) - { -- /* deprecated, as compositors already support the dma-buf protocol extension -- * and so we can rely on dmabuf_handle_modifier() to receive formats and -- * modifiers */ -+ struct dri2_egl_display *dri2_dpy = data; -+ int visual_idx = dri2_wl_visual_idx_from_fourcc(format); -+ -+ if (visual_idx == -1) -+ return; -+ -+ BITSET_SET(dri2_dpy->formats.formats_bitmap, visual_idx); - } - - static void - drm_handle_capabilities(void *data, struct wl_drm *drm, uint32_t value) - { -- /* deprecated, as compositors already support the dma-buf protocol extension -- * and so we can rely on it to create wl_buffer's */ -+ struct dri2_egl_display *dri2_dpy = data; -+ -+ dri2_dpy->capabilities = value; - } - - static void -@@ -2077,13 +2099,12 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) - wl_registry_add_listener(dri2_dpy->wl_registry, - ®istry_listener_drm, dri2_dpy); - -- /* The compositor must expose the dma-buf interface. */ -- if (roundtrip(dri2_dpy) < 0 || dri2_dpy->wl_dmabuf == NULL) -+ if (roundtrip(dri2_dpy) < 0) - goto cleanup; - - /* Get default dma-buf feedback */ -- if (zwp_linux_dmabuf_v1_get_version(dri2_dpy->wl_dmabuf) >= -- ZWP_LINUX_DMABUF_V1_GET_DEFAULT_FEEDBACK_SINCE_VERSION) { -+ if (dri2_dpy->wl_dmabuf && zwp_linux_dmabuf_v1_get_version(dri2_dpy->wl_dmabuf) >= -+ ZWP_LINUX_DMABUF_V1_GET_DEFAULT_FEEDBACK_SINCE_VERSION) { - dmabuf_feedback_format_table_init(&dri2_dpy->format_table); - dri2_dpy->wl_dmabuf_feedback = - zwp_linux_dmabuf_v1_get_default_feedback(dri2_dpy->wl_dmabuf); -@@ -2091,7 +2112,6 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) - &dmabuf_feedback_listener, dri2_dpy); - } - -- /* Receive events from the interfaces */ - if (roundtrip(dri2_dpy) < 0) - goto cleanup; - -@@ -2178,6 +2198,19 @@ dri2_initialize_wayland_drm(_EGLDisplay *disp) - - dri2_wl_setup_swap_interval(disp); - -+ if (dri2_dpy->wl_drm) { -+ /* To use Prime, we must have _DRI_IMAGE v7 at least. createImageFromFds -+ * support indicates that Prime export/import is supported by the driver. -+ * We deprecated the support to GEM names API, so we bail out if the -+ * driver does not suport Prime. */ -+ if (!(dri2_dpy->capabilities & WL_DRM_CAPABILITY_PRIME) || -+ (dri2_dpy->image->base.version < 7) || -+ (dri2_dpy->image->createImageFromFds == NULL)) { -+ _eglLog(_EGL_WARNING, "wayland-egl: display does not support prime"); -+ goto cleanup; -+ } -+ } -+ - if (dri2_dpy->is_different_gpu && - (dri2_dpy->image->base.version < 9 || - dri2_dpy->image->blitImage == NULL)) { --- -GitLab - diff --git a/debian/patches/series b/debian/patches/series deleted file mode 100644 index d2b1d0c..0000000 --- a/debian/patches/series +++ /dev/null @@ -1,6 +0,0 @@ -#07_gallium-fix-build-failure-on-powerpcspe.diff -#path_max.diff -#src_glx_dri_common.h.diff -#21929.patch -#24720.patch -#24789.patch diff --git a/debian/patches/src_glx_dri_common.h.diff b/debian/patches/src_glx_dri_common.h.diff deleted file mode 100644 index 7524922..0000000 --- a/debian/patches/src_glx_dri_common.h.diff +++ /dev/null @@ -1,13 +0,0 @@ ---- a/src/glx/dri_common.h -+++ b/src/glx/dri_common.h -@@ -57,6 +57,10 @@ extern struct glx_config *driConvertConf - - extern void driDestroyConfigs(const __DRIconfig **configs); - -+#ifndef __GLXDRIdrawable -+typedef struct __GLXDRIdrawableRec __GLXDRIdrawable; -+#endif -+ - extern __GLXDRIdrawable * - driFetchDrawable(struct glx_context *gc, GLXDrawable glxDrawable); -