mirror of
https://github.com/isledecomp/isle-portable.git
synced 2026-01-19 22:01:14 +00:00
SIMD optimize z-buffer clear (#283)
This commit is contained in:
parent
5080e372f9
commit
81ba75ea65
@ -1,10 +1,8 @@
|
|||||||
#include "legodxinfo.h"
|
#include "legodxinfo.h"
|
||||||
|
|
||||||
|
#include <SDL3/SDL_cpuinfo.h>
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include <stdio.h> // for vsprintf
|
#include <stdio.h> // for vsprintf
|
||||||
#if defined(_MSC_VER) && _MSC_VER >= 1310
|
|
||||||
#include <intrin.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// File name validated by BETA10 0x1011cba3; directory unknown
|
// File name validated by BETA10 0x1011cba3; directory unknown
|
||||||
|
|
||||||
@ -204,74 +202,7 @@ int LegoDeviceEnumerate::GetBestDevice()
|
|||||||
// FUNCTION: BETA10 0x1011cf54
|
// FUNCTION: BETA10 0x1011cf54
|
||||||
bool LegoDeviceEnumerate::SupportsSIMD()
|
bool LegoDeviceEnumerate::SupportsSIMD()
|
||||||
{
|
{
|
||||||
#if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__EMSCRIPTEN__)
|
return SDL_HasSSE2() || SDL_HasNEON() || SDL_HasMMX();
|
||||||
// All x86_64 and 64-bit ARM CPUs support at least SSE2 or NEON
|
|
||||||
return true;
|
|
||||||
#elif defined(__i386__) || defined(_M_IX86)
|
|
||||||
// 32-bit x86 - need to use CPUID to check for MMX or SSE
|
|
||||||
if (!SupportsCPUID()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
int edx;
|
|
||||||
#if defined(_MSC_VER) && _MSC_VER >= 1310
|
|
||||||
int cpuInfo[4];
|
|
||||||
__cpuid(cpuInfo, 1);
|
|
||||||
edx = cpuInfo[3];
|
|
||||||
#else
|
|
||||||
__asm__ __volatile__("movl $1, %%eax\n\t"
|
|
||||||
"cpuid\n\t"
|
|
||||||
: "=d"(edx)
|
|
||||||
:
|
|
||||||
: "%eax", "%ebx", "%ecx");
|
|
||||||
#endif
|
|
||||||
return (edx & (1 << 23)) != 0; // Bit 23: MMX
|
|
||||||
#elif defined(__arm__) && defined(__ANDROID__)
|
|
||||||
// Runtime check for NEON on 32-bit ARM (using Android NDK)
|
|
||||||
return android_getCpuFeatures() & ANDROID_CPU_ARM_FEATURE_NEON;
|
|
||||||
#else
|
|
||||||
// Prevent unsupported builds
|
|
||||||
#error "Unsupported platform: SIMD feature detection not implemented"
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// FUNCTION: CONFIG 0x00402970
|
|
||||||
// FUNCTION: LEGO1 0x1009d1e0
|
|
||||||
// FUNCTION: BETA10 0x1011cf97
|
|
||||||
bool LegoDeviceEnumerate::SupportsCPUID()
|
|
||||||
{
|
|
||||||
#if defined(_M_X64) || defined(__x86_64__) || defined(__amd64__)
|
|
||||||
return true;
|
|
||||||
#elif defined(_M_IX86) || defined(__i386__)
|
|
||||||
int has_cpuid;
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
__asm {
|
|
||||||
xor eax, eax ; Zero EAX register
|
|
||||||
pushfd ; Push EFLAGS register value on the stack
|
|
||||||
or dword ptr[esp], 0x200000 ; Set bit 0x200000: Able to use CPUID instruction (Pentium+)
|
|
||||||
popfd ; Write the updated value into the EFLAGS register
|
|
||||||
pushfd ; Push EFLAGS register value on the stack (again)
|
|
||||||
btr dword ptr[esp], 0x15 ; Test bit 0x15 (21) and reset (set CF)
|
|
||||||
adc eax, eax ; Add with carry: EAX = EAX + EAX + CF = CF
|
|
||||||
popfd ; Push EFLAGS register value on the stack (again, and makes sure the stack remains the same)
|
|
||||||
mov has_cpuid, eax ; Save eax into C variable
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
__asm__("xorl %%eax, %%eax\n\t" // Zero EAX register
|
|
||||||
"pushfl\n\t" // Push EFLAGS register value on the stack
|
|
||||||
"orl $0x200000, (%%esp)\n\t" // Set bit 0x200000: Able to use CPUID instruction (Pentium+)
|
|
||||||
"popfl\n\t" // Write the updated value into the EFLAGS register
|
|
||||||
"pushfl\n\t" // Push EFLAGS register value on the stack (again)
|
|
||||||
"btrl $0x15, (%%esp)\n\t" // Test bit 0x15 (21) and reset (set CF)
|
|
||||||
"adc %%eax, %%eax\n\t" // Add with carry: EAX = EAX + EAX + CF = CF
|
|
||||||
"popfl" // Push EFLAGS register value on the stack (again, and makes sure the stack remains the same)
|
|
||||||
: "=a"(has_cpuid) // has_cpuid == EAX
|
|
||||||
);
|
|
||||||
#endif
|
|
||||||
return has_cpuid;
|
|
||||||
#else
|
|
||||||
return false;
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// FUNCTION: CONFIG 0x004029a0
|
// FUNCTION: CONFIG 0x004029a0
|
||||||
|
|||||||
@ -16,7 +16,6 @@ class LegoDeviceEnumerate : public MxDeviceEnumerate {
|
|||||||
int BETA_1011cc65(int p_idx, char* p_buffer);
|
int BETA_1011cc65(int p_idx, char* p_buffer);
|
||||||
int GetBestDevice();
|
int GetBestDevice();
|
||||||
static bool SupportsSIMD();
|
static bool SupportsSIMD();
|
||||||
static bool SupportsCPUID();
|
|
||||||
int FUN_1009d210();
|
int FUN_1009d210();
|
||||||
unsigned char FUN_1009d3d0(Direct3DDeviceInfo& p_device);
|
unsigned char FUN_1009d3d0(Direct3DDeviceInfo& p_device);
|
||||||
|
|
||||||
|
|||||||
@ -10,6 +10,18 @@
|
|||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <limits>
|
#include <limits>
|
||||||
|
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
#if defined(__i386__) || defined(_M_IX86)
|
||||||
|
#include <xmmintrin.h>
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
#if defined(__arm__) || defined(__aarch64__)
|
||||||
|
#include <arm_neon.h>
|
||||||
|
#endif
|
||||||
|
#if defined(__wasm_simd128__)
|
||||||
|
#include <wasm_simd128.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
Direct3DRMSoftwareRenderer::Direct3DRMSoftwareRenderer(DWORD width, DWORD height) : m_width(width), m_height(height)
|
Direct3DRMSoftwareRenderer::Direct3DRMSoftwareRenderer(DWORD width, DWORD height) : m_width(width), m_height(height)
|
||||||
{
|
{
|
||||||
@ -30,7 +42,44 @@ void Direct3DRMSoftwareRenderer::SetProjection(const D3DRMMATRIX4D& projection,
|
|||||||
|
|
||||||
void Direct3DRMSoftwareRenderer::ClearZBuffer()
|
void Direct3DRMSoftwareRenderer::ClearZBuffer()
|
||||||
{
|
{
|
||||||
std::fill(m_zBuffer.begin(), m_zBuffer.end(), std::numeric_limits<float>::infinity());
|
const size_t size = m_zBuffer.size();
|
||||||
|
const float inf = std::numeric_limits<float>::infinity();
|
||||||
|
size_t i = 0;
|
||||||
|
|
||||||
|
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
|
||||||
|
if (SDL_HasSSE2()) {
|
||||||
|
__m128 inf4 = _mm_set1_ps(inf);
|
||||||
|
for (; i + 4 <= size; i += 4) {
|
||||||
|
_mm_storeu_ps(&m_zBuffer[i], inf4);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#if defined(__i386__) || defined(_M_IX86)
|
||||||
|
else if (SDL_HasMMX()) {
|
||||||
|
const __m64 mm_inf = _mm_set_pi32(0x7F800000, 0x7F800000);
|
||||||
|
for (; i + 2 <= size; i += 2) {
|
||||||
|
*reinterpret_cast<__m64*>(&m_zBuffer[i]) = mm_inf;
|
||||||
|
}
|
||||||
|
_mm_empty();
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#elif defined(__arm__) || defined(__aarch64__)
|
||||||
|
if (SDL_HasNEON()) {
|
||||||
|
float32x4_t inf4 = vdupq_n_f32(inf);
|
||||||
|
for (; i + 4 <= size; i += 4) {
|
||||||
|
vst1q_f32(&m_zBuffer[i], inf4);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#elif defined(__wasm_simd128__)
|
||||||
|
const size_t simdWidth = 4;
|
||||||
|
v128_t infVec = wasm_f32x4_splat(inf);
|
||||||
|
for (; i + simdWidth <= size; i += simdWidth) {
|
||||||
|
wasm_v128_store(&m_zBuffer[i], infVec);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
for (; i < size; ++i) {
|
||||||
|
m_zBuffer[i] = inf;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Direct3DRMSoftwareRenderer::ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const
|
void Direct3DRMSoftwareRenderer::ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const
|
||||||
@ -123,10 +172,20 @@ void Direct3DRMSoftwareRenderer::DrawTriangleClipped(const D3DRMVERTEX (&v)[3],
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
|
Uint32 Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a)
|
||||||
{
|
{
|
||||||
Uint32 dstPixel = 0;
|
Uint32 dstPixel;
|
||||||
memcpy(&dstPixel, pixelAddr, m_bytesPerPixel);
|
switch (m_bytesPerPixel) {
|
||||||
|
case 1:
|
||||||
|
dstPixel = *pixelAddr;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
dstPixel = *(Uint16*) pixelAddr;
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
dstPixel = *(Uint32*) pixelAddr;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
Uint8 dstR, dstG, dstB, dstA;
|
Uint8 dstR, dstG, dstB, dstA;
|
||||||
SDL_GetRGBA(dstPixel, m_format, m_palette, &dstR, &dstG, &dstB, &dstA);
|
SDL_GetRGBA(dstPixel, m_format, m_palette, &dstR, &dstG, &dstB, &dstA);
|
||||||
@ -139,18 +198,7 @@ void Direct3DRMSoftwareRenderer::BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g,
|
|||||||
Uint8 outB = static_cast<Uint8>(b * alpha + dstB * invAlpha);
|
Uint8 outB = static_cast<Uint8>(b * alpha + dstB * invAlpha);
|
||||||
Uint8 outA = static_cast<Uint8>(a + dstA * invAlpha);
|
Uint8 outA = static_cast<Uint8>(a + dstA * invAlpha);
|
||||||
|
|
||||||
Uint32 blended = SDL_MapRGBA(m_format, m_palette, outR, outG, outB, outA);
|
return SDL_MapRGBA(m_format, m_palette, outR, outG, outB, outA);
|
||||||
switch (m_bytesPerPixel) {
|
|
||||||
case 1:
|
|
||||||
*pixelAddr = static_cast<Uint8>(blended);
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
*reinterpret_cast<Uint16*>(pixelAddr) = static_cast<Uint16>(blended);
|
|
||||||
break;
|
|
||||||
case 4:
|
|
||||||
*reinterpret_cast<Uint32*>(pixelAddr) = blended;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
SDL_Color Direct3DRMSoftwareRenderer::ApplyLighting(
|
SDL_Color Direct3DRMSoftwareRenderer::ApplyLighting(
|
||||||
@ -370,6 +418,7 @@ void Direct3DRMSoftwareRenderer::DrawTriangleProjected(
|
|||||||
}
|
}
|
||||||
|
|
||||||
Uint8* pixelAddr = pixels + y * pitch + x * m_bytesPerPixel;
|
Uint8* pixelAddr = pixels + y * pitch + x * m_bytesPerPixel;
|
||||||
|
Uint32 finalColor;
|
||||||
|
|
||||||
if (appearance.color.a == 255) {
|
if (appearance.color.a == 255) {
|
||||||
zref = z;
|
zref = z;
|
||||||
@ -415,22 +464,22 @@ void Direct3DRMSoftwareRenderer::DrawTriangleProjected(
|
|||||||
b = (b * tb + 127) / 255;
|
b = (b * tb + 127) / 255;
|
||||||
}
|
}
|
||||||
|
|
||||||
Uint32 finalColor = SDL_MapRGBA(m_format, m_palette, r, g, b, 255);
|
finalColor = SDL_MapRGBA(m_format, m_palette, r, g, b, 255);
|
||||||
switch (m_bytesPerPixel) {
|
|
||||||
case 1:
|
|
||||||
*pixelAddr = static_cast<Uint8>(finalColor);
|
|
||||||
break;
|
|
||||||
case 2:
|
|
||||||
*reinterpret_cast<Uint16*>(pixelAddr) = static_cast<Uint16>(finalColor);
|
|
||||||
break;
|
|
||||||
case 4:
|
|
||||||
*reinterpret_cast<Uint32*>(pixelAddr) = finalColor;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// Transparent alpha blending with vertex alpha
|
finalColor = BlendPixel(pixelAddr, r, g, b, appearance.color.a);
|
||||||
BlendPixel(pixelAddr, r, g, b, appearance.color.a);
|
}
|
||||||
|
|
||||||
|
switch (m_bytesPerPixel) {
|
||||||
|
case 1:
|
||||||
|
*pixelAddr = static_cast<Uint8>(finalColor);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
*reinterpret_cast<Uint16*>(pixelAddr) = static_cast<Uint16>(finalColor);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
*reinterpret_cast<Uint32*>(pixelAddr) = finalColor;
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -55,7 +55,7 @@ class Direct3DRMSoftwareRenderer : public Direct3DRMRenderer {
|
|||||||
);
|
);
|
||||||
void DrawTriangleClipped(const D3DRMVERTEX (&v)[3], const Appearance& appearance);
|
void DrawTriangleClipped(const D3DRMVERTEX (&v)[3], const Appearance& appearance);
|
||||||
void ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const;
|
void ProjectVertex(const D3DRMVERTEX& v, D3DRMVECTOR4D& p) const;
|
||||||
void BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
|
Uint32 BlendPixel(Uint8* pixelAddr, Uint8 r, Uint8 g, Uint8 b, Uint8 a);
|
||||||
SDL_Color ApplyLighting(const D3DVECTOR& position, const D3DVECTOR& normal, const Appearance& appearance);
|
SDL_Color ApplyLighting(const D3DVECTOR& position, const D3DVECTOR& normal, const Appearance& appearance);
|
||||||
void AddTextureDestroyCallback(Uint32 id, IDirect3DRMTexture* texture);
|
void AddTextureDestroyCallback(Uint32 id, IDirect3DRMTexture* texture);
|
||||||
void AddMeshDestroyCallback(Uint32 id, IDirect3DRMMesh* mesh);
|
void AddMeshDestroyCallback(Uint32 id, IDirect3DRMMesh* mesh);
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user