mirror of
https://github.com/isledecomp/isle-portable.git
synced 2026-01-12 10:41:15 +00:00
SIMD optimize z-buffer clear (#283)
This commit is contained in:
parent
5080e372f9
commit
81ba75ea65
@ -1,10 +1,8 @@
|
||||
#include "legodxinfo.h"
|
||||
|
||||
#include <SDL3/SDL_cpuinfo.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h> // for vsprintf
|
||||
#if defined(_MSC_VER) && _MSC_VER >= 1310
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
// File name validated by BETA10 0x1011cba3; directory unknown
|
||||
|
||||
@ -204,74 +202,7 @@ int LegoDeviceEnumerate::GetBestDevice()
|
||||
// Reports whether the CPU offers a SIMD instruction set, decided per target:
//  - x86_64, 64-bit ARM and Emscripten builds: always true (compile-time).
//  - 32-bit x86: false when SupportsCPUID() fails, otherwise the MMX feature
//    bit (EDX bit 23) returned by CPUID leaf 1.
//  - 32-bit ARM on Android: the NEON flag from android_getCpuFeatures().
//  - any other target: the build is stopped with #error.
// FUNCTION: BETA10 0x1011cf54
|
||||
bool LegoDeviceEnumerate::SupportsSIMD()
|
||||
{
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__EMSCRIPTEN__)
|
||||
// All x86_64 and 64-bit ARM CPUs support at least SSE2 or NEON.
// Emscripten builds take this branch too and are reported as SIMD-capable without a runtime check.
|
||||
return true;
|
||||
#elif defined(__i386__) || defined(_M_IX86)
|
||||
// 32-bit x86 - CPUID must be available before it can be queried;
// only the MMX bit is tested below (SSE is not checked here)
|
||||
if (!SupportsCPUID()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int edx;
|
||||
#if defined(_MSC_VER) && _MSC_VER >= 1310
|
||||
// MSVC: __cpuid fills cpuInfo with EAX, EBX, ECX, EDX for leaf 1; index 3 is EDX
int cpuInfo[4];
|
||||
__cpuid(cpuInfo, 1);
|
||||
edx = cpuInfo[3];
|
||||
#else
|
||||
// Other compilers: run CPUID leaf 1 via inline assembly and read EDX;
// EAX, EBX and ECX are declared as clobbered
__asm__ __volatile__("movl $1, %%eax\n\t"
|
||||
"cpuid\n\t"
|
||||
: "=d"(edx)
|
||||
:
|
||||
: "%eax", "%ebx", "%ecx");
|
||||
#endif
|
||||
return (edx & (1 << 23)) != 0; // Bit 23: MMX
|
||||
#elif defined(__arm__) && defined(__ANDROID__)
|
||||
// Runtime check for NEON on 32-bit ARM (using Android NDK)
|
||||
return android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
|
||||
#else
|
||||
// Prevent unsupported builds
|
||||
#error "Unsupported platform: SIMD feature detection not implemented"
|
||||
#endif
|
||||
}
|
||||
|
||||
// Reports whether the CPUID instruction can be used.
//  - x86_64 builds: always true.
//  - 32-bit x86: tries to set bit 21 (0x200000) of EFLAGS and reads the flags
//    back; the result is 1 when the bit stuck, 0 otherwise. The last
//    popfd/popfl of each sequence pops the word pushed by the second pushfd/pushfl,
//    so the stack is left balanced (the trailing remark on the MSVC `popfd`
//    line calls this a push, but the instruction is a pop).
//  - every other target: false.
// FUNCTION: CONFIG 0x00402970
|
||||
// FUNCTION: LEGO1 0x1009d1e0
|
||||
// FUNCTION: BETA10 0x1011cf97
|
||||
bool LegoDeviceEnumerate::SupportsCPUID()
|
||||
{
|
||||
#if defined(_M_X64) || defined(__x86_64__) || defined(__amd64__)
|
||||
return true;
|
||||
#elif defined(_M_IX86) || defined(__i386__)
|
||||
int has_cpuid;
|
||||
#ifdef _MSC_VER
|
||||
__asm {
|
||||
xor eax, eax ; Zero EAX register
|
||||
pushfd ; Push EFLAGS register value on the stack
|
||||
or dword ptr[esp], 0x200000 ; Set bit 0x200000: Able to use CPUID instruction (Pentium+)
|
||||
popfd ; Write the updated value into the EFLAGS register
|
||||
pushfd ; Push EFLAGS register value on the stack (again)
|
||||
btr dword ptr[esp], 0x15 ; Test bit 0x15 (21) and reset (set CF)
|
||||
adc eax, eax ; Add with carry: EAX = EAX + EAX + CF = CF
|
||||
popfd ; Push EFLAGS register value on the stack (again, and makes sure the stack remains the same)
|
||||
mov has_cpuid, eax ; Save eax into C variable
|
||||
}
|
||||
#else
|
||||
__asm__("xorl %%eax, %%eax\n\t" // Zero EAX register
|
||||
"pushfl\n\t" // Push EFLAGS register value on the stack
|
||||
"orl $0x200000, (%%esp)\n\t" // Set bit 0x200000: Able to use CPUID instruction (Pentium+)
|
||||
"popfl\n\t" // Write the updated value into the EFLAGS register
|
||||
"pushfl\n\t" // Push EFLAGS register value on the stack (again)
|
||||
"btrl $0x15, (%%esp)\n\t" // Test bit 0x15 (21) and reset (set CF)
|
||||
"adc %%eax, %%eax\n\t" // Add with carry: EAX = EAX + EAX + CF = CF
|
||||
"popfl" // Pop the saved word back into EFLAGS (bit 21 cleared again; keeps the stack balanced)
|
||||
: "=a"(has_cpuid) // has_cpuid == EAX
|
||||
);
|
||||
#endif
|
||||
return has_cpuid;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
// NOTE(review): every preprocessor branch above already returns, so as the text stands this
// statement is unreachable. In this rendered diff it looks like the post-change body of
// SupportsSIMD() (delegating to SDL's CPU feature queries) rather than part of this function;
// confirm against the repository.
return SDL_HasSSE2() || SDL_HasNEON() || SDL_HasMMX();
|
||||
}
|
||||
|
||||
// FUNCTION: CONFIG 0x004029a0
|
||||
|
||||
@ -16,7 +16,6 @@ class LegoDeviceEnumerate : public MxDeviceEnumerate {
|
||||
int BETA_1011cc65(int p_idx, char* p_buffer);
|
||||
int GetBestDevice();
|
||||
static bool SupportsSIMD();
|
||||
static bool SupportsCPUID();
|
||||
int FUN_1009d210();
|
||||
unsigned char FUN_1009d3d0(Direct3DDeviceInfo& p_device);
|
||||
|
||||
|
||||
@ -10,6 +10,18 @@
|
||||
#include <cmath>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
|
||||
#include <xmmintrin.h>
|
||||
#if defined(__i386__) || defined(_M_IX86)
|
||||
#include <xmmintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
#if defined(__arm__) || defined(__aarch64__)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
#if defined(__wasm_simd128__)
|
||||
#include <wasm_simd128.h>
|
||||
#endif
|
||||
|
||||
Direct3DRMSoftwareRenderer::Direct3DRMSoftwareRenderer(DWORD width, DWORD height) : m_width(width), m_height(height)
|
||||
{
|
||||
@ -30,7 +42,44 @@ void Direct3DRMSoftwareRenderer::SetProjection(const D3DRMMATRIX4D& projection,
|
||||
|
||||
// Resets every entry of m_zBuffer to +infinity.
// A wide-store loop is selected per target and advances the shared index `i`:
//  - x86 / x86_64: 4 floats per store when SDL_HasSSE2() reports support;
//    on 32-bit x86 only, 2 floats per store via MMX when SSE2 is absent but SDL_HasMMX() is true.
//  - ARM: 4 floats per store when SDL_HasNEON() reports support.
//  - WASM built with SIMD128: 4 floats per store, no runtime check.
// Whatever the wide loop did not cover (a tail shorter than one store, or the whole
// buffer when no SIMD path ran) is written by the scalar loop at the end.
void Direct3DRMSoftwareRenderer::ClearZBuffer()
|
||||
{
|
||||
// NOTE(review): this call alone already sets every element to +infinity. In this rendered
// diff it appears to be the pre-change line that the code below replaces; confirm against
// the repository before relying on it being present.
std::fill(m_zBuffer.begin(), m_zBuffer.end(), std::numeric_limits<float>::infinity());
|
||||
const size_t size = m_zBuffer.size();
|
||||
const float inf = std::numeric_limits<float>::infinity();
|
||||
// Index of the next element still to be written; shared by the SIMD loops and the scalar tail
size_t i = 0;
|
||||
|
||||
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
|
||||
if (SDL_HasSSE2()) {
|
||||
// Broadcast +inf into all four lanes, then store 4 floats at a time (unaligned store)
__m128 inf4 = _mm_set1_ps(inf);
|
||||
for (; i + 4 <= size; i += 4) {
|
||||
_mm_storeu_ps(&m_zBuffer[i], inf4);
|
||||
}
|
||||
}
|
||||
#if defined(__i386__) || defined(_M_IX86)
|
||||
else if (SDL_HasMMX()) {
|
||||
// 0x7F800000 is the IEEE-754 single-precision bit pattern of +infinity; two copies fill one 64-bit MMX value
const __m64 mm_inf = _mm_set_pi32(0x7F800000, 0x7F800000);
|
||||
for (; i + 2 <= size; i += 2) {
|
||||
// NOTE(review): stores through a __m64* view of the float buffer - confirm alignment/aliasing assumptions
*reinterpret_cast<__m64*>(&m_zBuffer[i]) = mm_inf;
|
||||
}
|
||||
// Leave MMX state (EMMS) so later x87 floating-point code works correctly
_mm_empty();
|
||||
}
|
||||
#endif
|
||||
#elif defined(__arm__) || defined(__aarch64__)
|
||||
if (SDL_HasNEON()) {
|
||||
// Broadcast +inf into a 4-lane NEON vector and store 4 floats at a time
float32x4_t inf4 = vdupq_n_f32(inf);
|
||||
for (; i + 4 <= size; i += 4) {
|
||||
vst1q_f32(&m_zBuffer[i], inf4);
|
||||
}
|
||||
}
|
||||
#elif defined(__wasm_simd128__)
|
||||
// Compile-time selected path: no runtime feature test is performed here
const size_t simdWidth = 4;
|
||||
v128_t infVec = wasm_f32x4_splat(inf);
|
||||
for (; i + simdWidth <= size; i += simdWidth) {
|
||||
wasm_v128_store(&m_zBuffer[i], infVec);
|
||||
}
|
||||
#endif
|
||||
|
||||
// Scalar tail: handles the remaining elements (or everything if no SIMD loop ran)
for (; i < size; ++i) {
|
||||
m_zBuffer[i] = inf;
|
||||
}
|
||||
}
|
||||
|
||||
void Direct3DRMSoftwareRenderer::ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const
|
||||
@ -123,10 +172,20 @@ void Direct3DRMSoftwareRenderer::DrawTriangleClipped(const D3DRMVERTEX (&v)[3],
|
||||
}
|
||||
}
|
||||
|
||||
void Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
|
||||
Uint32 Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
|
||||
{
|
||||
Uint32 dstPixel = 0;
|
||||
memcpy(&dstPixel, pixelAddr, m_bytesPerPixel);
|
||||
Uint32 dstPixel;
|
||||
switch (m_bytesPerPixel) {
|
||||
case 1:
|
||||
dstPixel = *pixelAddr;
|
||||
break;
|
||||
case 2:
|
||||
dstPixel = *(Uint16*) pixelAddr;
|
||||
break;
|
||||
case 4:
|
||||
dstPixel = *(Uint32*) pixelAddr;
|
||||
break;
|
||||
}
|
||||
|
||||
Uint8 dstR, dstG, dstB, dstA;
|
||||
SDL_GetRGBA(dstPixel, m_format, m_palette, &dstR, &dstG, &dstB, &dstA);
|
||||
@ -139,18 +198,7 @@ void Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g,
|
||||
Uint8 outB = static_cast<Uint8>(b * alpha + dstB * invAlpha);
|
||||
Uint8 outA = static_cast<Uint8>(a + dstA * invAlpha);
|
||||
|
||||
Uint32 blended = SDL_MapRGBA(m_format, m_palette, outR, outG, outB, outA);
|
||||
switch (m_bytesPerPixel) {
|
||||
case 1:
|
||||
*pixelAddr = static_cast<Uint8>(blended);
|
||||
break;
|
||||
case 2:
|
||||
*reinterpret_cast<Uint16*>(pixelAddr) = static_cast<Uint16>(blended);
|
||||
break;
|
||||
case 4:
|
||||
*reinterpret_cast<Uint32*>(pixelAddr) = blended;
|
||||
break;
|
||||
}
|
||||
return SDL_MapRGBA(m_format, m_palette, outR, outG, outB, outA);
|
||||
}
|
||||
|
||||
SDL_Color Direct3DRMSoftwareRenderer::ApplyLighting(
|
||||
@ -370,6 +418,7 @@ void Direct3DRMSoftwareRenderer::DrawTriangleProjected(
|
||||
}
|
||||
|
||||
Uint8* pixelAddr = pixels + y * pitch + x * m_bytesPerPixel;
|
||||
Uint32 finalColor;
|
||||
|
||||
if (appearance.color.a == 255) {
|
||||
zref = z;
|
||||
@ -415,22 +464,22 @@ void Direct3DRMSoftwareRenderer::DrawTriangleProjected(
|
||||
b = (b * tb + 127) / 255;
|
||||
}
|
||||
|
||||
Uint32 finalColor = SDL_MapRGBA(m_format, m_palette, r, g, b, 255);
|
||||
switch (m_bytesPerPixel) {
|
||||
case 1:
|
||||
*pixelAddr = static_cast<Uint8>(finalColor);
|
||||
break;
|
||||
case 2:
|
||||
*reinterpret_cast<Uint16*>(pixelAddr) = static_cast<Uint16>(finalColor);
|
||||
break;
|
||||
case 4:
|
||||
*reinterpret_cast<Uint32*>(pixelAddr) = finalColor;
|
||||
break;
|
||||
}
|
||||
finalColor = SDL_MapRGBA(m_format, m_palette, r, g, b, 255);
|
||||
}
|
||||
else {
|
||||
// Transparent alpha blending with vertex alpha
|
||||
BlendPixel(pixelAddr, r, g, b, appearance.color.a);
|
||||
finalColor = BlendPixel(pixelAddr, r, g, b, appearance.color.a);
|
||||
}
|
||||
|
||||
switch (m_bytesPerPixel) {
|
||||
case 1:
|
||||
*pixelAddr = static_cast<Uint8>(finalColor);
|
||||
break;
|
||||
case 2:
|
||||
*reinterpret_cast<Uint16*>(pixelAddr) = static_cast<Uint16>(finalColor);
|
||||
break;
|
||||
case 4:
|
||||
*reinterpret_cast<Uint32*>(pixelAddr) = finalColor;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -55,7 +55,7 @@ class Direct3DRMSoftwareRenderer : public Direct3DRMRenderer {
|
||||
);
|
||||
void DrawTriangleClipped(const D3DRMVERTEX (&v)[3], const Appearance& appearance);
|
||||
void ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const;
|
||||
void BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
|
||||
Uint32 BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
|
||||
SDL_Color ApplyLighting(const D3DVECTOR& position, const D3DVECTOR& normal, const Appearance& appearance);
|
||||
void AddTextureDestroyCallback(Uint32 id, IDirect3DRMTexture* texture);
|
||||
void AddMeshDestroyCallback(Uint32 id, IDirect3DRMMesh* mesh);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user