From 260287b3a14ac17227ef1987d5410f4f2c6ef275 Mon Sep 17 00:00:00 2001 From: Arseny Kapoulkine Date: Sun, 3 Nov 2024 11:57:07 -0800 Subject: [PATCH] Rewrite index optimization code for maximum efficiency While all the previous fixes to optimizeVertexCache invocation fixed the vertex transform efficiency, the import code still was missing two crucial recommendations from meshoptimizer documentation: - All meshes should be optimized for vertex cache (this reorders vertices for maximum fetch efficiency) - When LODs are used with a shared vertex buffer, the vertex order should be generated by doing a vertex fetch optimization on the concatenated index buffer from coarse to fine LODs; this maximizes fetch efficiency for coarse LODs The last point is especially crucial for Mali GPUs; unlike other GPUs where vertex order affects fetch efficiency but not shading, these GPUs have various shading quirks (depending on the GPU generation) that really require consecutive index ranges for each LOD, which requires the second optimization mentioned above. However all of these also help desktop GPUs and other mobile GPUs as well. Because this optimization is "global" in the sense that it affects all LODs and all vertex arrays in concert, I've taken this opportunity to isolate all optimization code in this function and pull it out of generate_lods and create_shadow_mesh; this doesn't change the vertex cache efficiency, but makes the code cleaner. Consequently, optimize_indices should be called after other functions like create_shadow_mesh / generate_lods. This required exposing meshopt_optimizeVertexFetchRemap; as a drive-by, meshopt_simplifySloppy was never used so it's not exposed anymore - this will simplify future meshopt upgrades if they end up changing the function's interface. --- editor/import/3d/resource_importer_obj.cpp | 4 +- editor/import/3d/resource_importer_scene.cpp | 4 +- modules/meshoptimizer/register_types.cpp | 4 +- scene/resources/3d/importer_mesh.cpp | 105 +++++++++++++++---- scene/resources/3d/importer_mesh.h | 2 +- scene/resources/surface_tool.cpp | 2 +- scene/resources/surface_tool.h | 8 +- 7 files changed, 99 insertions(+), 30 deletions(-) diff --git a/editor/import/3d/resource_importer_obj.cpp b/editor/import/3d/resource_importer_obj.cpp index e77f5ec2b1c..3669844207d 100644 --- a/editor/import/3d/resource_importer_obj.cpp +++ b/editor/import/3d/resource_importer_obj.cpp @@ -535,8 +535,6 @@ static Error _parse_obj(const String &p_path, List> &r_meshes, } } - mesh->optimize_indices_for_cache(); - if (p_generate_lods) { // Use normal merge/split angles that match the defaults used for 3D scene importing. mesh->generate_lods(60.0f, {}); @@ -546,6 +544,8 @@ static Error _parse_obj(const String &p_path, List> &r_meshes, mesh->create_shadow_mesh(); } + mesh->optimize_indices(); + if (p_single_mesh && mesh->get_surface_count() > 0) { r_meshes.push_back(mesh); } diff --git a/editor/import/3d/resource_importer_scene.cpp b/editor/import/3d/resource_importer_scene.cpp index 58af558e7bf..d2e3b7927bc 100644 --- a/editor/import/3d/resource_importer_scene.cpp +++ b/editor/import/3d/resource_importer_scene.cpp @@ -2567,8 +2567,6 @@ Node *ResourceImporterScene::_generate_meshes(Node *p_node, const Dictionary &p_ } } - src_mesh_node->get_mesh()->optimize_indices_for_cache(); - if (generate_lods) { Array skin_pose_transform_array = _get_skinned_pose_transforms(src_mesh_node); src_mesh_node->get_mesh()->generate_lods(merge_angle, skin_pose_transform_array); @@ -2578,6 +2576,8 @@ Node *ResourceImporterScene::_generate_meshes(Node *p_node, const Dictionary &p_ src_mesh_node->get_mesh()->create_shadow_mesh(); } + src_mesh_node->get_mesh()->optimize_indices(); + if (!save_to_file.is_empty()) { Ref existing = ResourceCache::get_ref(save_to_file); if (existing.is_valid()) { diff --git a/modules/meshoptimizer/register_types.cpp b/modules/meshoptimizer/register_types.cpp index 781f928f66e..ebfe5d9c5b6 100644 --- a/modules/meshoptimizer/register_types.cpp +++ b/modules/meshoptimizer/register_types.cpp @@ -40,10 +40,10 @@ void initialize_meshoptimizer_module(ModuleInitializationLevel p_level) { } SurfaceTool::optimize_vertex_cache_func = meshopt_optimizeVertexCache; + SurfaceTool::optimize_vertex_fetch_remap_func = meshopt_optimizeVertexFetchRemap; SurfaceTool::simplify_func = meshopt_simplify; SurfaceTool::simplify_with_attrib_func = meshopt_simplifyWithAttributes; SurfaceTool::simplify_scale_func = meshopt_simplifyScale; - SurfaceTool::simplify_sloppy_func = meshopt_simplifySloppy; SurfaceTool::generate_remap_func = meshopt_generateVertexRemap; SurfaceTool::remap_vertex_func = meshopt_remapVertexBuffer; SurfaceTool::remap_index_func = meshopt_remapIndexBuffer; @@ -55,9 +55,9 @@ void uninitialize_meshoptimizer_module(ModuleInitializationLevel p_level) { } SurfaceTool::optimize_vertex_cache_func = nullptr; + SurfaceTool::optimize_vertex_fetch_remap_func = nullptr; SurfaceTool::simplify_func = nullptr; SurfaceTool::simplify_scale_func = nullptr; - SurfaceTool::simplify_sloppy_func = nullptr; SurfaceTool::generate_remap_func = nullptr; SurfaceTool::remap_vertex_func = nullptr; SurfaceTool::remap_index_func = nullptr; diff --git a/scene/resources/3d/importer_mesh.cpp b/scene/resources/3d/importer_mesh.cpp index e255cb077ff..f040f04cd81 100644 --- a/scene/resources/3d/importer_mesh.cpp +++ b/scene/resources/3d/importer_mesh.cpp @@ -168,10 +168,56 @@ void ImporterMesh::set_surface_material(int p_surface, const Ref &p_ma mesh.unref(); } -void ImporterMesh::optimize_indices_for_cache() { +template +static Vector _remap_array(Vector p_array, const Vector &p_remap, uint32_t p_vertex_count) { + ERR_FAIL_COND_V(p_array.size() % p_remap.size() != 0, p_array); + int num_elements = p_array.size() / p_remap.size(); + T *data = p_array.ptrw(); + SurfaceTool::remap_vertex_func(data, data, p_remap.size(), sizeof(T) * num_elements, p_remap.ptr()); + p_array.resize(p_vertex_count * num_elements); + return p_array; +} + +static void _remap_arrays(Array &r_arrays, const Vector &p_remap, uint32_t p_vertex_count) { + for (int i = 0; i < r_arrays.size(); i++) { + if (i == RS::ARRAY_INDEX) { + continue; + } + + switch (r_arrays[i].get_type()) { + case Variant::NIL: + break; + case Variant::PACKED_VECTOR3_ARRAY: + r_arrays[i] = _remap_array(r_arrays[i], p_remap, p_vertex_count); + break; + case Variant::PACKED_VECTOR2_ARRAY: + r_arrays[i] = _remap_array(r_arrays[i], p_remap, p_vertex_count); + break; + case Variant::PACKED_FLOAT32_ARRAY: + r_arrays[i] = _remap_array(r_arrays[i], p_remap, p_vertex_count); + break; + case Variant::PACKED_INT32_ARRAY: + r_arrays[i] = _remap_array(r_arrays[i], p_remap, p_vertex_count); + break; + case Variant::PACKED_BYTE_ARRAY: + r_arrays[i] = _remap_array(r_arrays[i], p_remap, p_vertex_count); + break; + case Variant::PACKED_COLOR_ARRAY: + r_arrays[i] = _remap_array(r_arrays[i], p_remap, p_vertex_count); + break; + default: + ERR_FAIL_MSG("Unhandled array type."); + } + } +} + +void ImporterMesh::optimize_indices() { if (!SurfaceTool::optimize_vertex_cache_func) { return; } + if (!SurfaceTool::optimize_vertex_fetch_remap_func || !SurfaceTool::remap_vertex_func || !SurfaceTool::remap_index_func) { + return; + } for (int i = 0; i < surfaces.size(); i++) { if (surfaces[i].primitive != Mesh::PRIMITIVE_TRIANGLES) { @@ -188,10 +234,48 @@ void ImporterMesh::optimize_indices_for_cache() { continue; } + // Optimize indices for vertex cache to establish final triangle order. int *indices_ptr = indices.ptrw(); SurfaceTool::optimize_vertex_cache_func((unsigned int *)indices_ptr, (const unsigned int *)indices_ptr, index_count, vertex_count); - surfaces.write[i].arrays[RS::ARRAY_INDEX] = indices; + + for (int j = 0; j < surfaces[i].lods.size(); ++j) { + Surface::LOD &lod = surfaces.write[i].lods.write[j]; + int *lod_indices_ptr = lod.indices.ptrw(); + SurfaceTool::optimize_vertex_cache_func((unsigned int *)lod_indices_ptr, (const unsigned int *)lod_indices_ptr, lod.indices.size(), vertex_count); + } + + // Concatenate indices for all LODs in the order of coarse->fine; this establishes the effective order of vertices, + // and is important to optimize for vertex fetch (all GPUs) and shading (Mali GPUs) + PackedInt32Array merged_indices; + for (int j = surfaces[i].lods.size() - 1; j >= 0; --j) { + merged_indices.append_array(surfaces[i].lods[j].indices); + } + merged_indices.append_array(indices); + + // Generate remap array that establishes optimal vertex order according to the order of indices above. + Vector remap; + remap.resize(vertex_count); + unsigned int new_vertex_count = SurfaceTool::optimize_vertex_fetch_remap_func(remap.ptrw(), (const unsigned int *)merged_indices.ptr(), merged_indices.size(), vertex_count); + + // We need to remap all vertex and index arrays in lockstep according to the remap. + SurfaceTool::remap_index_func((unsigned int *)indices_ptr, (const unsigned int *)indices_ptr, index_count, remap.ptr()); + surfaces.write[i].arrays[RS::ARRAY_INDEX] = indices; + + for (int j = 0; j < surfaces[i].lods.size(); ++j) { + Surface::LOD &lod = surfaces.write[i].lods.write[j]; + int *lod_indices_ptr = lod.indices.ptrw(); + SurfaceTool::remap_index_func((unsigned int *)lod_indices_ptr, (const unsigned int *)lod_indices_ptr, lod.indices.size(), remap.ptr()); + } + + _remap_arrays(surfaces.write[i].arrays, remap, new_vertex_count); + for (int j = 0; j < surfaces[i].blend_shape_data.size(); j++) { + _remap_arrays(surfaces.write[i].blend_shape_data.write[j].arrays, remap, new_vertex_count); + } + } + + if (shadow_mesh.is_valid()) { + shadow_mesh->optimize_indices(); } } @@ -215,9 +299,6 @@ void ImporterMesh::generate_lods(float p_normal_merge_angle, Array p_bone_transf if (!SurfaceTool::simplify_with_attrib_func) { return; } - if (!SurfaceTool::optimize_vertex_cache_func) { - return; - } LocalVector bone_transform_vector; for (int i = 0; i < p_bone_transform_array.size(); i++) { @@ -431,12 +512,6 @@ void ImporterMesh::generate_lods(float p_normal_merge_angle, Array p_bone_transf } surfaces.write[i].lods.sort_custom(); - - for (int j = 0; j < surfaces.write[i].lods.size(); j++) { - Surface::LOD &lod = surfaces.write[i].lods.write[j]; - unsigned int *lod_indices_ptr = (unsigned int *)lod.indices.ptrw(); - SurfaceTool::optimize_vertex_cache_func(lod_indices_ptr, lod_indices_ptr, lod.indices.size(), vertex_count); - } } } @@ -574,10 +649,6 @@ void ImporterMesh::create_shadow_mesh() { index_wptr[j] = vertex_remap[index]; } - if (SurfaceTool::optimize_vertex_cache_func && surfaces[i].primitive == Mesh::PRIMITIVE_TRIANGLES) { - SurfaceTool::optimize_vertex_cache_func((unsigned int *)index_wptr, (const unsigned int *)index_wptr, index_count, new_vertices.size()); - } - new_surface[RS::ARRAY_INDEX] = new_indices; // Make sure the same LODs as the full version are used. @@ -596,10 +667,6 @@ void ImporterMesh::create_shadow_mesh() { index_wptr[k] = vertex_remap[index]; } - if (SurfaceTool::optimize_vertex_cache_func && surfaces[i].primitive == Mesh::PRIMITIVE_TRIANGLES) { - SurfaceTool::optimize_vertex_cache_func((unsigned int *)index_wptr, (const unsigned int *)index_wptr, index_count, new_vertices.size()); - } - lods[surfaces[i].lods[j].distance] = new_indices; } } diff --git a/scene/resources/3d/importer_mesh.h b/scene/resources/3d/importer_mesh.h index 7776e78f111..2bdf759da61 100644 --- a/scene/resources/3d/importer_mesh.h +++ b/scene/resources/3d/importer_mesh.h @@ -113,7 +113,7 @@ public: void set_surface_material(int p_surface, const Ref &p_material); - void optimize_indices_for_cache(); + void optimize_indices(); void generate_lods(float p_normal_merge_angle, Array p_skin_pose_transform_array); diff --git a/scene/resources/surface_tool.cpp b/scene/resources/surface_tool.cpp index 6921885ee02..c230cf1b704 100644 --- a/scene/resources/surface_tool.cpp +++ b/scene/resources/surface_tool.cpp @@ -33,10 +33,10 @@ #define EQ_VERTEX_DIST 0.00001 SurfaceTool::OptimizeVertexCacheFunc SurfaceTool::optimize_vertex_cache_func = nullptr; +SurfaceTool::OptimizeVertexFetchRemapFunc SurfaceTool::optimize_vertex_fetch_remap_func = nullptr; SurfaceTool::SimplifyFunc SurfaceTool::simplify_func = nullptr; SurfaceTool::SimplifyWithAttribFunc SurfaceTool::simplify_with_attrib_func = nullptr; SurfaceTool::SimplifyScaleFunc SurfaceTool::simplify_scale_func = nullptr; -SurfaceTool::SimplifySloppyFunc SurfaceTool::simplify_sloppy_func = nullptr; SurfaceTool::GenerateRemapFunc SurfaceTool::generate_remap_func = nullptr; SurfaceTool::RemapVertexFunc SurfaceTool::remap_vertex_func = nullptr; SurfaceTool::RemapIndexFunc SurfaceTool::remap_index_func = nullptr; diff --git a/scene/resources/surface_tool.h b/scene/resources/surface_tool.h index 3b18e082a13..68dc9e71987 100644 --- a/scene/resources/surface_tool.h +++ b/scene/resources/surface_tool.h @@ -90,14 +90,14 @@ public: typedef void (*OptimizeVertexCacheFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, size_t vertex_count); static OptimizeVertexCacheFunc optimize_vertex_cache_func; + typedef size_t (*OptimizeVertexFetchRemapFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, size_t vertex_count); + static OptimizeVertexFetchRemapFunc optimize_vertex_fetch_remap_func; typedef size_t (*SimplifyFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, unsigned int options, float *r_error); static SimplifyFunc simplify_func; typedef size_t (*SimplifyWithAttribFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_data, size_t vertex_count, size_t vertex_stride, const float *attributes, size_t attribute_stride, const float *attribute_weights, size_t attribute_count, const unsigned char *vertex_lock, size_t target_index_count, float target_error, unsigned int options, float *result_error); static SimplifyWithAttribFunc simplify_with_attrib_func; typedef float (*SimplifyScaleFunc)(const float *vertex_positions, size_t vertex_count, size_t vertex_positions_stride); static SimplifyScaleFunc simplify_scale_func; - typedef size_t (*SimplifySloppyFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, const float *vertex_positions_data, size_t vertex_count, size_t vertex_positions_stride, size_t target_index_count, float target_error, float *out_result_error); - static SimplifySloppyFunc simplify_sloppy_func; typedef size_t (*GenerateRemapFunc)(unsigned int *destination, const unsigned int *indices, size_t index_count, const void *vertices, size_t vertex_count, size_t vertex_size); static GenerateRemapFunc generate_remap_func; typedef void (*RemapVertexFunc)(void *destination, const void *vertices, size_t vertex_count, size_t vertex_size, const unsigned int *remap); @@ -222,7 +222,9 @@ public: void clear(); - LocalVector &get_vertex_array() { return vertex_array; } + LocalVector &get_vertex_array() { + return vertex_array; + } void create_from_triangle_arrays(const Array &p_arrays); void create_from_arrays(const Array &p_arrays, Mesh::PrimitiveType p_primitive_type = Mesh::PRIMITIVE_TRIANGLES);