From 81ba75ea655591eb5aef51a1fd10b51f953fe404 Mon Sep 17 00:00:00 2001 From: Anders Jenbo Date: Fri, 13 Jun 2025 01:21:34 +0200 Subject: [PATCH] SIMD optimize z-buffer clear (#283) --- LEGO1/mxdirectx/legodxinfo.cpp | 73 +----------- LEGO1/mxdirectx/legodxinfo.h | 1 - .../src/d3drm/backends/software/renderer.cpp | 109 +++++++++++++----- miniwin/src/internal/d3drmrenderer_software.h | 2 +- 4 files changed, 82 insertions(+), 103 deletions(-) diff --git a/LEGO1/mxdirectx/legodxinfo.cpp b/LEGO1/mxdirectx/legodxinfo.cpp index 38ced82d..bb89275a 100644 --- a/LEGO1/mxdirectx/legodxinfo.cpp +++ b/LEGO1/mxdirectx/legodxinfo.cpp @@ -1,10 +1,8 @@ #include "legodxinfo.h" +#include #include #include // for vsprintf -#if defined(_MSC_VER) && _MSC_VER >= 1310 -#include -#endif // File name validated by BETA10 0x1011cba3; directory unknown @@ -204,74 +202,7 @@ int LegoDeviceEnumerate::GetBestDevice() // FUNCTION: BETA10 0x1011cf54 bool LegoDeviceEnumerate::SupportsSIMD() { -#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__EMSCRIPTEN__) - // All x86_64 and 64-bit ARM CPUs support at least SSE2 or NEON - return true; -#elif defined(__i386__) || defined(_M_IX86) - // 32-bit x86 - need to use CPUID to check for MMX or SSE - if (!SupportsCPUID()) { - return false; - } - - int edx; -#if defined(_MSC_VER) && _MSC_VER >= 1310 - int cpuInfo[4]; - __cpuid(cpuInfo, 1); - edx = cpuInfo[3]; -#else - __asm__ __volatile__("movl $1, %%eax\n\t" - "cpuid\n\t" - : "=d"(edx) - : - : "%eax", "%ebx", "%ecx"); -#endif - return (edx & (1 << 23)) != 0; // Bit 23: MMX -#elif defined(__arm__) && defined(__ANDROID__) - // Runtime check for NEON on 32-bit ARM (using Android NDK) - return android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON; -#else -// Prevent unsupported builds -#error "Unsupported platform: SIMD feature detection not implemented" -#endif -} - -// FUNCTION: CONFIG 0x00402970 -// FUNCTION: LEGO1 0x1009d1e0 -// FUNCTION: BETA10 0x1011cf97 -bool LegoDeviceEnumerate::SupportsCPUID() -{ -#if defined(_M_X64) || defined(__x86_64__) || defined(__amd64__) - return true; -#elif defined(_M_IX86) || defined(__i386__) - int has_cpuid; -#ifdef _MSC_VER - __asm { - xor eax, eax ; Zero EAX register - pushfd ; Push EFLAGS register value on the stack - or dword ptr[esp], 0x200000 ; Set bit 0x200000: Able to use CPUID instruction (Pentium+) - popfd ; Write the updated value into the EFLAGS register - pushfd ; Push EFLAGS register value on the stack (again) - btr dword ptr[esp], 0x15 ; Test bit 0x15 (21) and reset (set CF) - adc eax, eax ; Add with carry: EAX = EAX + EAX + CF = CF - popfd ; Push EFLAGS register value on the stack (again, and makes sure the stack remains the same) - mov has_cpuid, eax ; Save eax into C variable - } -#else - __asm__("xorl %%eax, %%eax\n\t" // Zero EAX register - "pushfl\n\t" // Push EFLAGS register value on the stack - "orl $0x200000, (%%esp)\n\t" // Set bit 0x200000: Able to use CPUID instruction (Pentium+) - "popfl\n\t" // Write the updated value into the EFLAGS register - "pushfl\n\t" // Push EFLAGS register value on the stack (again) - "btrl $0x15, (%%esp)\n\t" // Test bit 0x15 (21) and reset (set CF) - "adc %%eax, %%eax\n\t" // Add with carry: EAX = EAX + EAX + CF = CF - "popfl" // Push EFLAGS register value on the stack (again, and makes sure the stack remains the same) - : "=a"(has_cpuid) // has_cpuid == EAX - ); -#endif - return has_cpuid; -#else - return false; -#endif + return SDL_HasSSE2() || SDL_HasNEON() || SDL_HasMMX(); } // FUNCTION: CONFIG 0x004029a0 diff --git a/LEGO1/mxdirectx/legodxinfo.h b/LEGO1/mxdirectx/legodxinfo.h index d94aaaff..c9289870 100644 --- a/LEGO1/mxdirectx/legodxinfo.h +++ b/LEGO1/mxdirectx/legodxinfo.h @@ -16,7 +16,6 @@ class LegoDeviceEnumerate : public MxDeviceEnumerate { int BETA_1011cc65(int p_idx, char* p_buffer); int GetBestDevice(); static bool SupportsSIMD(); - static bool SupportsCPUID(); int FUN_1009d210(); unsigned char FUN_1009d3d0(Direct3DDeviceInfo& p_device); diff --git a/miniwin/src/d3drm/backends/software/renderer.cpp b/miniwin/src/d3drm/backends/software/renderer.cpp index 305a5fe9..0833bdb5 100644 --- a/miniwin/src/d3drm/backends/software/renderer.cpp +++ b/miniwin/src/d3drm/backends/software/renderer.cpp @@ -10,6 +10,18 @@ #include #include #include +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) +#include +#if defined(__i386__) || defined(_M_IX86) +#include +#endif +#endif +#if defined(__arm__) || defined(__aarch64__) +#include +#endif +#if defined(__wasm_simd128__) +#include +#endif Direct3DRMSoftwareRenderer::Direct3DRMSoftwareRenderer(DWORD width, DWORD height) : m_width(width), m_height(height) { @@ -30,7 +42,44 @@ void Direct3DRMSoftwareRenderer::SetProjection(const D3DRMMATRIX4D& projection, void Direct3DRMSoftwareRenderer::ClearZBuffer() { - std::fill(m_zBuffer.begin(), m_zBuffer.end(), std::numeric_limits::infinity()); + const size_t size = m_zBuffer.size(); + const float inf = std::numeric_limits::infinity(); + size_t i = 0; + +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86) + if (SDL_HasSSE2()) { + __m128 inf4 = _mm_set1_ps(inf); + for (; i + 4 <= size; i += 4) { + _mm_storeu_ps(&m_zBuffer[i], inf4); + } + } +#if defined(__i386__) || defined(_M_IX86) + else if (SDL_HasMMX()) { + const __m64 mm_inf = _mm_set_pi32(0x7F800000, 0x7F800000); + for (; i + 2 <= size; i += 2) { + *reinterpret_cast<__m64*>(&m_zBuffer[i]) = mm_inf; + } + _mm_empty(); + } +#endif +#elif defined(__arm__) || defined(__aarch64__) + if (SDL_HasNEON()) { + float32x4_t inf4 = vdupq_n_f32(inf); + for (; i + 4 <= size; i += 4) { + vst1q_f32(&m_zBuffer[i], inf4); + } + } +#elif defined(__wasm_simd128__) + const size_t simdWidth = 4; + v128_t infVec = wasm_f32x4_splat(inf); + for (; i + simdWidth <= size; i += simdWidth) { + wasm_v128_store(&m_zBuffer[i], infVec); + } +#endif + + for (; i < size; ++i) { + m_zBuffer[i] = inf; + } } void Direct3DRMSoftwareRenderer::ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const @@ -123,10 +172,20 @@ void Direct3DRMSoftwareRenderer::DrawTriangleClipped(const D3DRMVERTEX (&v)[3], } } -void Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a) +Uint32 Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a) { - Uint32 dstPixel = 0; - memcpy(&dstPixel, pixelAddr, m_bytesPerPixel); + Uint32 dstPixel; + switch (m_bytesPerPixel) { + case 1: + dstPixel = *pixelAddr; + break; + case 2: + dstPixel = *(Uint16*) pixelAddr; + break; + case 4: + dstPixel = *(Uint32*) pixelAddr; + break; + } Uint8 dstR, dstG, dstB, dstA; SDL_GetRGBA(dstPixel, m_format, m_palette, &dstR, &dstG, &dstB, &dstA); @@ -139,18 +198,7 @@ void Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 outB = static_cast(b * alpha + dstB * invAlpha); Uint8 outA = static_cast(a + dstA * invAlpha); - Uint32 blended = SDL_MapRGBA(m_format, m_palette, outR, outG, outB, outA); - switch (m_bytesPerPixel) { - case 1: - *pixelAddr = static_cast(blended); - break; - case 2: - *reinterpret_cast(pixelAddr) = static_cast(blended); - break; - case 4: - *reinterpret_cast(pixelAddr) = blended; - break; - } + return SDL_MapRGBA(m_format, m_palette, outR, outG, outB, outA); } SDL_Color Direct3DRMSoftwareRenderer::ApplyLighting( @@ -370,6 +418,7 @@ void Direct3DRMSoftwareRenderer::DrawTriangleProjected( } Uint8* pixelAddr = pixels + y * pitch + x * m_bytesPerPixel; + Uint32 finalColor; if (appearance.color.a == 255) { zref = z; @@ -415,22 +464,22 @@ void Direct3DRMSoftwareRenderer::DrawTriangleProjected( b = (b * tb + 127) / 255; } - Uint32 finalColor = SDL_MapRGBA(m_format, m_palette, r, g, b, 255); - switch (m_bytesPerPixel) { - case 1: - *pixelAddr = static_cast(finalColor); - break; - case 2: - *reinterpret_cast(pixelAddr) = static_cast(finalColor); - break; - case 4: - *reinterpret_cast(pixelAddr) = finalColor; - break; - } + finalColor = SDL_MapRGBA(m_format, m_palette, r, g, b, 255); } else { - // Transparent alpha blending with vertex alpha - BlendPixel(pixelAddr, r, g, b, appearance.color.a); + finalColor = BlendPixel(pixelAddr, r, g, b, appearance.color.a); + } + + switch (m_bytesPerPixel) { + case 1: + *pixelAddr = static_cast(finalColor); + break; + case 2: + *reinterpret_cast(pixelAddr) = static_cast(finalColor); + break; + case 4: + *reinterpret_cast(pixelAddr) = finalColor; + break; } } } diff --git a/miniwin/src/internal/d3drmrenderer_software.h b/miniwin/src/internal/d3drmrenderer_software.h index 574d052d..28347bc7 100644 --- a/miniwin/src/internal/d3drmrenderer_software.h +++ b/miniwin/src/internal/d3drmrenderer_software.h @@ -55,7 +55,7 @@ class Direct3DRMSoftwareRenderer : public Direct3DRMRenderer { ); void DrawTriangleClipped(const D3DRMVERTEX (&v)[3], const Appearance& appearance); void ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const; - void BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a); + Uint32 BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a); SDL_Color ApplyLighting(const D3DVECTOR& position, const D3DVECTOR& normal, const Appearance& appearance); void AddTextureDestroyCallback(Uint32 id, IDirect3DRMTexture* texture); void AddMeshDestroyCallback(Uint32 id, IDirect3DRMMesh* mesh);