renderer_software: Multi-thread processing (#6698)

* renderer_software: Multi-thread processing

* Doubles the performance in most cases

* renderer_software: Move memory access out of the raster loop

* Profiling shows this has a significant impact
This commit is contained in:
GPUCode 2023-08-28 11:09:23 +03:00 committed by GitHub
parent 8b218e1b7d
commit d1f600601d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 201 additions and 181 deletions

View file

@ -41,10 +41,22 @@ Framebuffer::Framebuffer(Memory::MemorySystem& memory_, const Pica::FramebufferR
Framebuffer::~Framebuffer() = default; Framebuffer::~Framebuffer() = default;
void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const { void Framebuffer::Bind() {
const auto& framebuffer = regs.framebuffer; PAddr addr = regs.framebuffer.GetColorBufferPhysicalAddress();
const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); if (color_addr != addr) [[unlikely]] {
color_addr = addr;
color_buffer = memory.GetPhysicalPointer(color_addr);
}
addr = regs.framebuffer.GetDepthBufferPhysicalAddress();
if (depth_addr != addr) [[unlikely]] {
depth_addr = addr;
depth_buffer = memory.GetPhysicalPointer(depth_addr);
}
}
void Framebuffer::DrawPixel(u32 x, u32 y, const Common::Vec4<u8>& color) const {
const auto& framebuffer = regs.framebuffer;
// Similarly to textures, the render framebuffer is laid out from bottom to top, too. // Similarly to textures, the render framebuffer is laid out from bottom to top, too.
// NOTE: The framebuffer height register contains the actual FB height minus one. // NOTE: The framebuffer height register contains the actual FB height minus one.
y = framebuffer.height - y; y = framebuffer.height - y;
@ -54,8 +66,7 @@ void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const {
GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value()));
const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) +
coarse_y * framebuffer.width * bytes_per_pixel; coarse_y * framebuffer.width * bytes_per_pixel;
u8* depth_buffer = memory.GetPhysicalPointer(addr); u8* dst_pixel = color_buffer + dst_offset;
u8* dst_pixel = depth_buffer + dst_offset;
switch (framebuffer.color_format) { switch (framebuffer.color_format) {
case FramebufferRegs::ColorFormat::RGBA8: case FramebufferRegs::ColorFormat::RGBA8:
@ -80,10 +91,8 @@ void Framebuffer::DrawPixel(int x, int y, const Common::Vec4<u8>& color) const {
} }
} }
const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const { const Common::Vec4<u8> Framebuffer::GetPixel(u32 x, u32 y) const {
const auto& framebuffer = regs.framebuffer; const auto& framebuffer = regs.framebuffer;
const PAddr addr = framebuffer.GetColorBufferPhysicalAddress();
y = framebuffer.height - y; y = framebuffer.height - y;
const u32 coarse_y = y & ~7; const u32 coarse_y = y & ~7;
@ -91,7 +100,6 @@ const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const {
GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value())); GPU::Regs::BytesPerPixel(GPU::Regs::PixelFormat(framebuffer.color_format.Value()));
const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) +
coarse_y * framebuffer.width * bytes_per_pixel; coarse_y * framebuffer.width * bytes_per_pixel;
const u8* color_buffer = memory.GetPhysicalPointer(addr);
const u8* src_pixel = color_buffer + src_offset; const u8* src_pixel = color_buffer + src_offset;
switch (framebuffer.color_format) { switch (framebuffer.color_format) {
@ -114,10 +122,8 @@ const Common::Vec4<u8> Framebuffer::GetPixel(int x, int y) const {
return {0, 0, 0, 0}; return {0, 0, 0, 0};
} }
u32 Framebuffer::GetDepth(int x, int y) const { u32 Framebuffer::GetDepth(u32 x, u32 y) const {
const auto& framebuffer = regs.framebuffer; const auto& framebuffer = regs.framebuffer;
const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
y = framebuffer.height - y; y = framebuffer.height - y;
const u32 coarse_y = y & ~7; const u32 coarse_y = y & ~7;
@ -125,7 +131,6 @@ u32 Framebuffer::GetDepth(int x, int y) const {
const u32 stride = framebuffer.width * bytes_per_pixel; const u32 stride = framebuffer.width * bytes_per_pixel;
const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
const u8* depth_buffer = memory.GetPhysicalPointer(addr);
const u8* src_pixel = depth_buffer + src_offset; const u8* src_pixel = depth_buffer + src_offset;
switch (framebuffer.depth_format) { switch (framebuffer.depth_format) {
@ -143,10 +148,8 @@ u32 Framebuffer::GetDepth(int x, int y) const {
} }
} }
u8 Framebuffer::GetStencil(int x, int y) const { u8 Framebuffer::GetStencil(u32 x, u32 y) const {
const auto& framebuffer = regs.framebuffer; const auto& framebuffer = regs.framebuffer;
const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
y = framebuffer.height - y; y = framebuffer.height - y;
const u32 coarse_y = y & ~7; const u32 coarse_y = y & ~7;
@ -154,7 +157,6 @@ u8 Framebuffer::GetStencil(int x, int y) const {
const u32 stride = framebuffer.width * bytes_per_pixel; const u32 stride = framebuffer.width * bytes_per_pixel;
const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; const u32 src_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
const u8* depth_buffer = memory.GetPhysicalPointer(addr);
const u8* src_pixel = depth_buffer + src_offset; const u8* src_pixel = depth_buffer + src_offset;
switch (framebuffer.depth_format) { switch (framebuffer.depth_format) {
@ -169,10 +171,8 @@ u8 Framebuffer::GetStencil(int x, int y) const {
} }
} }
void Framebuffer::SetDepth(int x, int y, u32 value) const { void Framebuffer::SetDepth(u32 x, u32 y, u32 value) const {
const auto& framebuffer = regs.framebuffer; const auto& framebuffer = regs.framebuffer;
const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
y = framebuffer.height - y; y = framebuffer.height - y;
const u32 coarse_y = y & ~7; const u32 coarse_y = y & ~7;
@ -180,7 +180,6 @@ void Framebuffer::SetDepth(int x, int y, u32 value) const {
const u32 stride = framebuffer.width * bytes_per_pixel; const u32 stride = framebuffer.width * bytes_per_pixel;
const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
u8* depth_buffer = memory.GetPhysicalPointer(addr);
u8* dst_pixel = depth_buffer + dst_offset; u8* dst_pixel = depth_buffer + dst_offset;
switch (framebuffer.depth_format) { switch (framebuffer.depth_format) {
@ -201,10 +200,8 @@ void Framebuffer::SetDepth(int x, int y, u32 value) const {
} }
} }
void Framebuffer::SetStencil(int x, int y, u8 value) const { void Framebuffer::SetStencil(u32 x, u32 y, u8 value) const {
const auto& framebuffer = regs.framebuffer; const auto& framebuffer = regs.framebuffer;
const PAddr addr = framebuffer.GetDepthBufferPhysicalAddress();
y = framebuffer.height - y; y = framebuffer.height - y;
const u32 coarse_y = y & ~7; const u32 coarse_y = y & ~7;
@ -212,7 +209,6 @@ void Framebuffer::SetStencil(int x, int y, u8 value) const {
const u32 stride = framebuffer.width * bytes_per_pixel; const u32 stride = framebuffer.width * bytes_per_pixel;
const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride; const u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * stride;
u8* depth_buffer = memory.GetPhysicalPointer(addr);
u8* dst_pixel = depth_buffer + dst_offset; u8* dst_pixel = depth_buffer + dst_offset;
switch (framebuffer.depth_format) { switch (framebuffer.depth_format) {
@ -231,7 +227,7 @@ void Framebuffer::SetStencil(int x, int y, u8 value) const {
} }
} }
void Framebuffer::DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const { void Framebuffer::DrawShadowMapPixel(u32 x, u32 y, u32 depth, u8 stencil) const {
const auto& framebuffer = regs.framebuffer; const auto& framebuffer = regs.framebuffer;
const auto& shadow = regs.shadow; const auto& shadow = regs.shadow;
const PAddr addr = framebuffer.GetColorBufferPhysicalAddress(); const PAddr addr = framebuffer.GetColorBufferPhysicalAddress();

View file

@ -23,30 +23,37 @@ public:
explicit Framebuffer(Memory::MemorySystem& memory, const Pica::FramebufferRegs& framebuffer); explicit Framebuffer(Memory::MemorySystem& memory, const Pica::FramebufferRegs& framebuffer);
~Framebuffer(); ~Framebuffer();
/// Updates the framebuffer addresses from the PICA registers.
void Bind();
/// Draws a pixel at the specified coordinates. /// Draws a pixel at the specified coordinates.
void DrawPixel(int x, int y, const Common::Vec4<u8>& color) const; void DrawPixel(u32 x, u32 y, const Common::Vec4<u8>& color) const;
/// Returns the current color at the specified coordinates. /// Returns the current color at the specified coordinates.
[[nodiscard]] const Common::Vec4<u8> GetPixel(int x, int y) const; [[nodiscard]] const Common::Vec4<u8> GetPixel(u32 x, u32 y) const;
/// Returns the depth value at the specified coordinates. /// Returns the depth value at the specified coordinates.
[[nodiscard]] u32 GetDepth(int x, int y) const; [[nodiscard]] u32 GetDepth(u32 x, u32 y) const;
/// Returns the stencil value at the specified coordinates. /// Returns the stencil value at the specified coordinates.
[[nodiscard]] u8 GetStencil(int x, int y) const; [[nodiscard]] u8 GetStencil(u32 x, u32 y) const;
/// Stores the provided depth value at the specified coordinates. /// Stores the provided depth value at the specified coordinates.
void SetDepth(int x, int y, u32 value) const; void SetDepth(u32 x, u32 y, u32 value) const;
/// Stores the provided stencil value at the specified coordinates. /// Stores the provided stencil value at the specified coordinates.
void SetStencil(int x, int y, u8 value) const; void SetStencil(u32 x, u32 y, u8 value) const;
/// Draws a pixel to the shadow buffer. /// Draws a pixel to the shadow buffer.
void DrawShadowMapPixel(int x, int y, u32 depth, u8 stencil) const; void DrawShadowMapPixel(u32 x, u32 y, u32 depth, u8 stencil) const;
private: private:
Memory::MemorySystem& memory; Memory::MemorySystem& memory;
const Pica::FramebufferRegs& regs; const Pica::FramebufferRegs& regs;
/// Physical address that color_buffer was last resolved from. Value-initialized so that the
/// first comparison in Bind() reads a defined value rather than an indeterminate one.
PAddr color_addr{};
/// Cached pointer to the color buffer; refreshed by Bind() when the color address changes.
u8* color_buffer{};
/// Physical address that depth_buffer was last resolved from. Value-initialized for the same
/// reason as color_addr.
PAddr depth_addr{};
/// Cached pointer to the depth buffer; refreshed by Bind() when the depth address changes.
u8* depth_buffer{};
}; };
u8 PerformStencilAction(Pica::FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref); u8 PerformStencilAction(Pica::FramebufferRegs::StencilAction action, u8 old_stencil, u8 ref);

View file

@ -96,7 +96,9 @@ private:
} // Anonymous namespace } // Anonymous namespace
RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_) RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_)
: memory{memory_}, state{Pica::g_state}, regs{state.regs}, fb{memory, regs.framebuffer} {} : memory{memory_}, state{Pica::g_state}, regs{state.regs},
num_sw_threads{std::max(std::thread::hardware_concurrency(), 2U)},
sw_workers{num_sw_threads, "SwRenderer workers"}, fb{memory, regs.framebuffer} {}
void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0, void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0,
const Pica::Shader::OutputVertex& v1, const Pica::Shader::OutputVertex& v1,
@ -289,167 +291,180 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);
auto textures = regs.texturing.GetTextures(); const auto textures = regs.texturing.GetTextures();
const auto tev_stages = regs.texturing.GetTevStages(); const auto tev_stages = regs.texturing.GetTevStages();
fb.Bind();
// Enter rasterization loop, starting at the center of the topleft bounding box corner. // Enter rasterization loop, starting at the center of the topleft bounding box corner.
// TODO: Not sure if looping through x first might be faster // TODO: Not sure if looping through x first might be faster
for (u16 y = min_y + 8; y < max_y; y += 0x10) { for (u16 y = min_y + 8; y < max_y; y += 0x10) {
for (u16 x = min_x + 8; x < max_x; x += 0x10) { const auto process_scanline = [&, y] {
// Do not process the pixel if it's inside the scissor box and the scissor mode is set for (u16 x = min_x + 8; x < max_x; x += 0x10) {
// to Exclude. // Do not process the pixel if it's inside the scissor box and the scissor mode is
if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { // set to Exclude.
if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) { if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) {
if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) {
continue;
}
}
// Calculate the barycentric coordinates w0, w1 and w2
const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
const s32 wsum = w0 + w1 + w2;
// If current pixel is not covered by the current primitive
if (w0 < 0 || w1 < 0 || w2 < 0) {
continue; continue;
} }
}
// Calculate the barycentric coordinates w0, w1 and w2 const auto baricentric_coordinates = Common::MakeVec(
const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)),
const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); f24::FromFloat32(static_cast<f32>(w2)));
const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); const f24 interpolated_w_inverse =
const s32 wsum = w0 + w1 + w2; f24::One() / Common::Dot(w_inverse, baricentric_coordinates);
// If current pixel is not covered by the current primitive // interpolated_z = z / w
if (w0 < 0 || w1 < 0 || w2 < 0) { const float interpolated_z_over_w =
continue; (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 +
} v2.screenpos[2].ToFloat32() * w2) /
wsum;
const auto baricentric_coordinates = Common::MakeVec( // Not fully accurate. About 3 bits in precision are missing.
f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)), // Z-Buffer (z / w * scale + offset)
f24::FromFloat32(static_cast<f32>(w2))); const float depth_scale =
const f24 interpolated_w_inverse = f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32();
f24::One() / Common::Dot(w_inverse, baricentric_coordinates); const float depth_offset =
f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32();
float depth = interpolated_z_over_w * depth_scale + depth_offset;
// interpolated_z = z / w // Potentially switch to W-Buffer
const float interpolated_z_over_w = if (regs.rasterizer.depthmap_enable ==
(v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + Pica::RasterizerRegs::DepthBuffering::WBuffering) {
v2.screenpos[2].ToFloat32() * w2) / // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
wsum; depth *= interpolated_w_inverse.ToFloat32() * wsum;
}
// Not fully accurate. About 3 bits in precision are missing. // Clamp the result
// Z-Buffer (z / w * scale + offset) depth = std::clamp(depth, 0.0f, 1.0f);
const float depth_scale =
f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32();
const float depth_offset =
f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32();
float depth = interpolated_z_over_w * depth_scale + depth_offset;
// Potentially switch to W-Buffer /**
if (regs.rasterizer.depthmap_enable == * Perspective correct attribute interpolation:
Pica::RasterizerRegs::DepthBuffering::WBuffering) { * Attribute values cannot be calculated by simple linear interpolation since
// W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w) * they are not linear in screen space. For example, when interpolating a
depth *= interpolated_w_inverse.ToFloat32() * wsum; * texture coordinate across two vertices, something simple like
} * u = (u0*w0 + u1*w1)/(w0+w1)
* will not work. However, the attribute value divided by the
// Clamp the result * clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear
depth = std::clamp(depth, 0.0f, 1.0f); * in screenspace. Hence, we can linearly interpolate these two independently and
* calculate the interpolated attribute by dividing the results.
/** * I.e.
* Perspective correct attribute interpolation: * u_over_w = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
* Attribute values cannot be calculated by simple linear interpolation since * one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
* they are not linear in screen space. For example, when interpolating a * u = u_over_w / one_over_w
* texture coordinate across two vertices, something simple like *
* u = (u0*w0 + u1*w1)/(w0+w1) * The generalization to three vertices is straightforward in barycentric
* will not work. However, the attribute value divided by the * coordinates.
* clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear **/
* in screenspace. Hence, we can linearly interpolate these two independently and const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) {
* calculate the interpolated attribute by dividing the results. auto attr_over_w = Common::MakeVec(attr0, attr1, attr2);
* I.e. f24 interpolated_attr_over_w =
* u_over_w = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1) Common::Dot(attr_over_w, baricentric_coordinates);
* one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) return interpolated_attr_over_w * interpolated_w_inverse;
* u = u_over_w / one_over_w
*
* The generalization to three vertices is straightforward in barycentric coordinates.
**/
const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) {
auto attr_over_w = Common::MakeVec(attr0, attr1, attr2);
f24 interpolated_attr_over_w = Common::Dot(attr_over_w, baricentric_coordinates);
return interpolated_attr_over_w * interpolated_w_inverse;
};
const Common::Vec4<u8> primary_color{
static_cast<u8>(
round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r())
.ToFloat32() *
255)),
static_cast<u8>(
round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g())
.ToFloat32() *
255)),
static_cast<u8>(
round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b())
.ToFloat32() *
255)),
static_cast<u8>(
round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a())
.ToFloat32() *
255)),
};
std::array<Common::Vec2<f24>, 3> uv;
uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u());
uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v());
uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u());
uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v());
// Sample bound texture units.
const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
const auto texture_color = TextureColor(uv, textures, tc0_w);
Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
if (!regs.lighting.disable) {
const auto normquat =
Common::Quaternion<f32>{
{get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
}
.Normalized();
const Common::Vec3f view{
get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
}; };
std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(
regs.lighting, state.lighting, normquat, view, texture_color);
}
// Write the TEV stages. const Common::Vec4<u8> primary_color{
auto combiner_output = WriteTevConfig(texture_color, tev_stages, primary_color, static_cast<u8>(
primary_fragment_color, secondary_fragment_color); round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r())
.ToFloat32() *
255)),
static_cast<u8>(
round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g())
.ToFloat32() *
255)),
static_cast<u8>(
round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b())
.ToFloat32() *
255)),
static_cast<u8>(
round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a())
.ToFloat32() *
255)),
};
const auto& output_merger = regs.framebuffer.output_merger; std::array<Common::Vec2<f24>, 3> uv;
if (output_merger.fragment_operation_mode == uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
FramebufferRegs::FragmentOperationMode::Shadow) { uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
u32 depth_int = static_cast<u32>(depth * 0xFFFFFF); uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u());
// Use green color as the shadow intensity uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v());
u8 stencil = combiner_output.y; uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u());
fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v());
// Skip the normal output merger pipeline if it is in shadow mode
continue;
}
// Does alpha testing happen before or after stencil? // Sample bound texture units.
if (!DoAlphaTest(combiner_output.a())) { const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
continue; const auto texture_color = TextureColor(uv, textures, tc0_w);
Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
if (!regs.lighting.disable) {
const auto normquat =
Common::Quaternion<f32>{
{get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x)
.ToFloat32(),
get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y)
.ToFloat32(),
get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z)
.ToFloat32()},
get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
}
.Normalized();
const Common::Vec3f view{
get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
};
std::tie(primary_fragment_color, secondary_fragment_color) =
ComputeFragmentsColors(regs.lighting, state.lighting, normquat, view,
texture_color);
}
// Write the TEV stages.
auto combiner_output =
WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color,
secondary_fragment_color);
const auto& output_merger = regs.framebuffer.output_merger;
if (output_merger.fragment_operation_mode ==
FramebufferRegs::FragmentOperationMode::Shadow) {
const u32 depth_int = static_cast<u32>(depth * 0xFFFFFF);
// Use green color as the shadow intensity
const u8 stencil = combiner_output.y;
fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil);
// Skip the normal output merger pipeline if it is in shadow mode
continue;
}
// Does alpha testing happen before or after stencil?
if (!DoAlphaTest(combiner_output.a())) {
continue;
}
WriteFog(depth, combiner_output);
if (!DoDepthStencilTest(x, y, depth)) {
continue;
}
const auto result = PixelColor(x, y, combiner_output);
if (regs.framebuffer.framebuffer.allow_color_write != 0) {
fb.DrawPixel(x >> 4, y >> 4, result);
}
} }
WriteFog(combiner_output, depth); };
if (!DoDepthStencilTest(x, y, depth)) { sw_workers.QueueWork(std::move(process_scanline));
continue;
}
const auto result = PixelColor(x, y, combiner_output);
if (regs.framebuffer.framebuffer.allow_color_write != 0) {
fb.DrawPixel(x >> 4, y >> 4, result);
}
}
} }
sw_workers.WaitForRequests();
} }
std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor( std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
@ -573,7 +588,7 @@ std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
} }
Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y, Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y,
Common::Vec4<u8>& combiner_output) const { Common::Vec4<u8> combiner_output) const {
const auto dest = fb.GetPixel(x >> 4, y >> 4); const auto dest = fb.GetPixel(x >> 4, y >> 4);
Common::Vec4<u8> blend_output = combiner_output; Common::Vec4<u8> blend_output = combiner_output;
@ -771,7 +786,7 @@ Common::Vec4<u8> RasterizerSoftware::WriteTevConfig(
return combiner_output; return combiner_output;
} }
void RasterizerSoftware::WriteFog(Common::Vec4<u8>& combiner_output, float depth) const { void RasterizerSoftware::WriteFog(float depth, Common::Vec4<u8>& combiner_output) const {
/** /**
* Apply fog combiner. Not fully accurate. We'd have to know what data type is used to * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to
* store the depth etc. Using float for now until we know more about Pica datatypes. * store the depth etc. Using float for now until we know more about Pica datatypes.

View file

@ -5,7 +5,7 @@
#pragma once #pragma once
#include <span> #include <span>
#include "common/thread_worker.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/regs_texturing.h" #include "video_core/regs_texturing.h"
#include "video_core/renderer_software/sw_clipper.h" #include "video_core/renderer_software/sw_clipper.h"
@ -52,7 +52,7 @@ private:
std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const; std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const;
/// Returns the final pixel color with blending or logic ops applied. /// Returns the final pixel color with blending or logic ops applied.
Common::Vec4<u8> PixelColor(u16 x, u16 y, Common::Vec4<u8>& combiner_output) const; Common::Vec4<u8> PixelColor(u16 x, u16 y, Common::Vec4<u8> combiner_output) const;
/// Emulates the TEV configuration and returns the combiner output. /// Emulates the TEV configuration and returns the combiner output.
Common::Vec4<u8> WriteTevConfig( Common::Vec4<u8> WriteTevConfig(
@ -62,7 +62,7 @@ private:
Common::Vec4<u8> secondary_fragment_color); Common::Vec4<u8> secondary_fragment_color);
/// Blends fog to the combiner output if enabled. /// Blends fog to the combiner output if enabled.
void WriteFog(Common::Vec4<u8>& combiner_output, float depth) const; void WriteFog(float depth, Common::Vec4<u8>& combiner_output) const;
/// Performs the alpha test. Returns false if the test failed. /// Performs the alpha test. Returns false if the test failed.
bool DoAlphaTest(u8 alpha) const; bool DoAlphaTest(u8 alpha) const;
@ -74,6 +74,8 @@ private:
Memory::MemorySystem& memory; Memory::MemorySystem& memory;
Pica::State& state; Pica::State& state;
const Pica::Regs& regs; const Pica::Regs& regs;
size_t num_sw_threads;
Common::ThreadWorker sw_workers;
Framebuffer fb; Framebuffer fb;
}; };