SIMD optimize z-buffer clear (#283)

This commit is contained in:
Anders Jenbo 2025-06-13 01:21:34 +02:00 committed by GitHub
parent 5080e372f9
commit 81ba75ea65
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 82 additions and 103 deletions

View File

@ -1,10 +1,8 @@
#include "legodxinfo.h"
#include <SDL3/SDL_cpuinfo.h>
#include <assert.h>
#include <stdio.h> // for vsprintf
#if defined(_MSC_VER) && _MSC_VER >= 1310
#include <intrin.h>
#endif
// File name validated by BETA10 0x1011cba3; directory unknown
@ -204,74 +202,7 @@ int LegoDeviceEnumerate::GetBestDevice()
// Reports whether SIMD instructions are considered available.
// The strategy is selected at compile time by target platform:
//  - 64-bit x86, 64-bit ARM and Emscripten builds: always true, no runtime query.
//  - 32-bit x86 builds: false when SupportsCPUID() fails, otherwise the MMX
//    feature bit (EDX bit 23) returned by CPUID leaf 1.
//  - 32-bit ARM Android builds: the NEON flag from android_getCpuFeatures().
//  - any other target: compilation stops with #error.
// Returns true when SIMD support is assumed or detected, false otherwise.
// FUNCTION: BETA10 0x1011cf54
bool LegoDeviceEnumerate::SupportsSIMD()
{
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__EMSCRIPTEN__)
// x86_64 and 64-bit ARM targets are treated as always having SSE2 / NEON.
// Emscripten builds take this branch too and are reported as SIMD-capable without any check.
return true;
#elif defined(__i386__) || defined(_M_IX86)
// 32-bit x86 - CPUID must be usable before it can be queried.
// Only the MMX bit is tested below; SSE/SSE2 bits are not inspected here.
if (!SupportsCPUID()) {
return false;
}
int edx;
#if defined(_MSC_VER) && _MSC_VER >= 1310
// MSVC: __cpuid fills cpuInfo with EAX, EBX, ECX, EDX (in that order) for leaf 1.
int cpuInfo[4];
__cpuid(cpuInfo, 1);
edx = cpuInfo[3];
#else
// Other compilers: run CPUID with EAX = 1 and capture EDX; EAX, EBX and ECX are declared as clobbered.
// NOTE(review): %ebx can be reserved as the PIC register on 32-bit targets - confirm this clobber is accepted in PIC builds.
__asm__ __volatile__("movl $1, %%eax\n\t"
"cpuid\n\t"
: "=d"(edx)
:
: "%eax", "%ebx", "%ecx");
#endif
return (edx & (1 << 23)) != 0; // Bit 23: MMX
#elif defined(__arm__) && defined(__ANDROID__)
// Runtime check for NEON on 32-bit ARM (using Android NDK)
// NOTE(review): no include visible in this view declares android_getCpuFeatures / ANDROID_CPU_ARM_FEATURE_NEON
// (presumably <cpu-features.h>) - confirm it is pulled in elsewhere.
return android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
#else
// Any other target is rejected at compile time rather than silently reporting "no SIMD".
#error "Unsupported platform: SIMD feature detection not implemented"
#endif
}
// Reports whether the CPUID instruction can be used.
//  - 64-bit x86 builds: always true.
//  - 32-bit x86 builds: tries to set bit 21 (0x200000) of EFLAGS and checks whether
//    the bit is still set when EFLAGS is read back; the result is returned.
//  - every other target: false.
// FUNCTION: CONFIG 0x00402970
// FUNCTION: LEGO1 0x1009d1e0
// FUNCTION: BETA10 0x1011cf97
bool LegoDeviceEnumerate::SupportsCPUID()
{
#if defined(_M_X64) || defined(__x86_64__) || defined(__amd64__)
return true;
#elif defined(_M_IX86) || defined(__i386__)
// Receives 1 when the EFLAGS bit could be set, 0 otherwise (taken from EAX).
int has_cpuid;
#ifdef _MSC_VER
// MSVC inline assembly; text after ';' on each line is an assembler comment.
// NOTE: the final popfd pops the saved EFLAGS back off the stack, although its remark says "Push".
__asm {
xor eax, eax ; Zero EAX register
pushfd ; Push EFLAGS register value on the stack
or dword ptr[esp], 0x200000 ; Set bit 0x200000: Able to use CPUID instruction (Pentium+)
popfd ; Write the updated value into the EFLAGS register
pushfd ; Push EFLAGS register value on the stack (again)
btr dword ptr[esp], 0x15 ; Test bit 0x15 (21) and reset (set CF)
adc eax, eax ; Add with carry: EAX = EAX + EAX + CF = CF
popfd ; Push EFLAGS register value on the stack (again, and makes sure the stack remains the same)
mov has_cpuid, eax ; Save eax into C variable
}
#else
// GCC/Clang extended asm performing the same sequence in AT&T syntax.
__asm__("xorl %%eax, %%eax\n\t" // Zero EAX register
"pushfl\n\t" // Push EFLAGS register value on the stack
"orl $0x200000, (%%esp)\n\t" // Set bit 0x200000 (bit 21) in the saved copy: settable only when CPUID is usable (Pentium+)
"popfl\n\t" // Write the updated value into the EFLAGS register
"pushfl\n\t" // Push EFLAGS register value on the stack (again)
"btrl $0x15, (%%esp)\n\t" // Copy bit 0x15 (21) of the saved copy into CF, then clear that bit
"adc %%eax, %%eax\n\t" // Add with carry: EAX = EAX + EAX + CF = CF
"popfl" // Pop the saved value back into EFLAGS, leaving the stack as it was on entry
: "=a"(has_cpuid) // has_cpuid == EAX
);
#endif
return has_cpuid;
#else
return false;
#endif
// NOTE(review): as laid out here, every preprocessor branch above has already returned,
// so the statement below is never reached - confirm which function it is meant to belong to.
return SDL_HasSSE2() || SDL_HasNEON() || SDL_HasMMX();
}
// FUNCTION: CONFIG 0x004029a0

View File

@ -16,7 +16,6 @@ class LegoDeviceEnumerate : public MxDeviceEnumerate {
int BETA_1011cc65(int p_idx, char* p_buffer);
int GetBestDevice();
static bool SupportsSIMD();
static bool SupportsCPUID();
int FUN_1009d210();
unsigned char FUN_1009d3d0(Direct3DDeviceInfo& p_device);

View File

@ -10,6 +10,18 @@
#include <cmath>
#include <cstring>
#include <limits>
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
#include <xmmintrin.h>
#if defined(__i386__) || defined(_M_IX86)
#include <xmmintrin.h>
#endif
#endif
#if defined(__arm__) || defined(__aarch64__)
#include <arm_neon.h>
#endif
#if defined(__wasm_simd128__)
#include <wasm_simd128.h>
#endif
Direct3DRMSoftwareRenderer::Direct3DRMSoftwareRenderer(DWORD width, DWORD height) : m_width(width), m_height(height)
{
@ -30,7 +42,44 @@ void Direct3DRMSoftwareRenderer::SetProjection(const D3DRMMATRIX4D& projection,
// Resets every entry of m_zBuffer to +infinity.
//
// The buffer is written exactly once. A vectorised store loop handles as many
// whole groups of elements as possible, and a scalar loop finishes whatever is
// left (or the entire buffer when no vector path is taken):
//  - x86 / x86_64: 4 floats per store when SDL_HasSSE2() reports support;
//    on 32-bit x86 only, otherwise 2 floats per store when SDL_HasMMX() does.
//  - ARM / AArch64: 4 floats per store when SDL_HasNEON() reports support.
//  - WebAssembly built with simd128: 4 floats per store, no runtime check.
void Direct3DRMSoftwareRenderer::ClearZBuffer()
{
	const size_t size = m_zBuffer.size();
	const float inf = std::numeric_limits<float>::infinity();
	size_t i = 0; // index of the first element not yet written

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
	if (SDL_HasSSE2()) {
		__m128 inf4 = _mm_set1_ps(inf);
		// Unaligned stores: no alignment of the buffer storage is assumed.
		for (; i + 4 <= size; i += 4) {
			_mm_storeu_ps(&m_zBuffer[i], inf4);
		}
	}
#if defined(__i386__) || defined(_M_IX86)
	else if (SDL_HasMMX()) {
		// 0x7F800000 is the IEEE-754 single-precision bit pattern of +infinity.
		const __m64 mm_inf = _mm_set_pi32(0x7F800000, 0x7F800000);
		for (; i + 2 <= size; i += 2) {
			*reinterpret_cast<__m64*>(&m_zBuffer[i]) = mm_inf;
		}
		// Leave MMX state before any later floating-point code runs.
		_mm_empty();
	}
#endif
#elif defined(__arm__) || defined(__aarch64__)
	if (SDL_HasNEON()) {
		float32x4_t inf4 = vdupq_n_f32(inf);
		for (; i + 4 <= size; i += 4) {
			vst1q_f32(&m_zBuffer[i], inf4);
		}
	}
#elif defined(__wasm_simd128__)
	const size_t simdWidth = 4;
	v128_t infVec = wasm_f32x4_splat(inf);
	for (; i + simdWidth <= size; i += simdWidth) {
		wasm_v128_store(&m_zBuffer[i], infVec);
	}
#endif

	// Scalar tail: covers the remainder after a vector loop, or everything
	// when no vector path was compiled in or selected at runtime.
	for (; i < size; ++i) {
		m_zBuffer[i] = inf;
	}
}
void Direct3DRMSoftwareRenderer::ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const
@ -123,10 +172,20 @@ void Direct3DRMSoftwareRenderer::DrawTriangleClipped(const D3DRMVERTEX (&v)[3],
}
}
void Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
Uint32 Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
{
Uint32 dstPixel = 0;
memcpy(&dstPixel, pixelAddr, m_bytesPerPixel);
Uint32 dstPixel;
switch (m_bytesPerPixel) {
case 1:
dstPixel = *pixelAddr;
break;
case 2:
dstPixel = *(Uint16*) pixelAddr;
break;
case 4:
dstPixel = *(Uint32*) pixelAddr;
break;
}
Uint8 dstR, dstG, dstB, dstA;
SDL_GetRGBA(dstPixel, m_format, m_palette, &dstR, &dstG, &dstB, &dstA);
@ -139,18 +198,7 @@ void Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g,
Uint8 outB = static_cast<Uint8>(b * alpha + dstB * invAlpha);
Uint8 outA = static_cast<Uint8>(a + dstA * invAlpha);
Uint32 blended = SDL_MapRGBA(m_format, m_palette, outR, outG, outB, outA);
switch (m_bytesPerPixel) {
case 1:
*pixelAddr = static_cast<Uint8>(blended);
break;
case 2:
*reinterpret_cast<Uint16*>(pixelAddr) = static_cast<Uint16>(blended);
break;
case 4:
*reinterpret_cast<Uint32*>(pixelAddr) = blended;
break;
}
return SDL_MapRGBA(m_format, m_palette, outR, outG, outB, outA);
}
SDL_Color Direct3DRMSoftwareRenderer::ApplyLighting(
@ -370,6 +418,7 @@ void Direct3DRMSoftwareRenderer::DrawTriangleProjected(
}
Uint8* pixelAddr = pixels + y * pitch + x * m_bytesPerPixel;
Uint32 finalColor;
if (appearance.color.a == 255) {
zref = z;
@ -415,22 +464,22 @@ void Direct3DRMSoftwareRenderer::DrawTriangleProjected(
b = (b * tb + 127) / 255;
}
Uint32 finalColor = SDL_MapRGBA(m_format, m_palette, r, g, b, 255);
switch (m_bytesPerPixel) {
case 1:
*pixelAddr = static_cast<Uint8>(finalColor);
break;
case 2:
*reinterpret_cast<Uint16*>(pixelAddr) = static_cast<Uint16>(finalColor);
break;
case 4:
*reinterpret_cast<Uint32*>(pixelAddr) = finalColor;
break;
}
finalColor = SDL_MapRGBA(m_format, m_palette, r, g, b, 255);
}
else {
// Transparent alpha blending with vertex alpha
BlendPixel(pixelAddr, r, g, b, appearance.color.a);
finalColor = BlendPixel(pixelAddr, r, g, b, appearance.color.a);
}
switch (m_bytesPerPixel) {
case 1:
*pixelAddr = static_cast<Uint8>(finalColor);
break;
case 2:
*reinterpret_cast<Uint16*>(pixelAddr) = static_cast<Uint16>(finalColor);
break;
case 4:
*reinterpret_cast<Uint32*>(pixelAddr) = finalColor;
break;
}
}
}

View File

@ -55,7 +55,7 @@ class Direct3DRMSoftwareRenderer : public Direct3DRMRenderer {
);
void DrawTriangleClipped(const D3DRMVERTEX (&v)[3], const Appearance& appearance);
void ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const;
void BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
Uint32 BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
SDL_Color ApplyLighting(const D3DVECTOR& position, const D3DVECTOR& normal, const Appearance& appearance);
void AddTextureDestroyCallback(Uint32 id, IDirect3DRMTexture* texture);
void AddMeshDestroyCallback(Uint32 id, IDirect3DRMMesh* mesh);