Merge pull request #5741 from ReinUsesLisp/new-bufcache

video_core: Reimplement the buffer cache
This commit is contained in:
bunnei 2021-02-12 22:22:18 -08:00 committed by GitHub
commit d3c7a7e7cf
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
96 changed files with 3425 additions and 3118 deletions

View file

@ -471,3 +471,79 @@ TEST_CASE("BufferBase: Unaligned page region query") {
REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1000)); REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1000));
REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1)); REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1));
} }
TEST_CASE("BufferBase: Cached write") {
RasterizerInterface rasterizer;
BufferBase buffer(rasterizer, c, WORD);
buffer.UnmarkRegionAsCpuModified(c, WORD);
buffer.CachedCpuWrite(c + PAGE, PAGE);
REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
buffer.FlushCachedWrites();
REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
buffer.MarkRegionAsCpuModified(c, WORD);
REQUIRE(rasterizer.Count() == 0);
}
TEST_CASE("BufferBase: Multiple cached write") {
RasterizerInterface rasterizer;
BufferBase buffer(rasterizer, c, WORD);
buffer.UnmarkRegionAsCpuModified(c, WORD);
buffer.CachedCpuWrite(c + PAGE, PAGE);
buffer.CachedCpuWrite(c + PAGE * 3, PAGE);
REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
REQUIRE(!buffer.IsRegionCpuModified(c + PAGE * 3, PAGE));
buffer.FlushCachedWrites();
REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 3, PAGE));
buffer.MarkRegionAsCpuModified(c, WORD);
REQUIRE(rasterizer.Count() == 0);
}
TEST_CASE("BufferBase: Cached write unmarked") {
RasterizerInterface rasterizer;
BufferBase buffer(rasterizer, c, WORD);
buffer.UnmarkRegionAsCpuModified(c, WORD);
buffer.CachedCpuWrite(c + PAGE, PAGE);
buffer.UnmarkRegionAsCpuModified(c + PAGE, PAGE);
REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
buffer.FlushCachedWrites();
REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
buffer.MarkRegionAsCpuModified(c, WORD);
REQUIRE(rasterizer.Count() == 0);
}
TEST_CASE("BufferBase: Cached write iterated") {
RasterizerInterface rasterizer;
BufferBase buffer(rasterizer, c, WORD);
buffer.UnmarkRegionAsCpuModified(c, WORD);
buffer.CachedCpuWrite(c + PAGE, PAGE);
int num = 0;
buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
REQUIRE(num == 0);
REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
buffer.FlushCachedWrites();
REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
buffer.MarkRegionAsCpuModified(c, WORD);
REQUIRE(rasterizer.Count() == 0);
}
TEST_CASE("BufferBase: Cached write downloads") {
RasterizerInterface rasterizer;
BufferBase buffer(rasterizer, c, WORD);
buffer.UnmarkRegionAsCpuModified(c, WORD);
REQUIRE(rasterizer.Count() == 64);
buffer.CachedCpuWrite(c + PAGE, PAGE);
REQUIRE(rasterizer.Count() == 63);
buffer.MarkRegionAsGpuModified(c + PAGE, PAGE);
int num = 0;
buffer.ForEachDownloadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
REQUIRE(num == 0);
REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));
buffer.FlushCachedWrites();
REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE));
REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));
buffer.MarkRegionAsCpuModified(c, WORD);
REQUIRE(rasterizer.Count() == 0);
}

View file

@ -2,10 +2,8 @@ add_subdirectory(host_shaders)
add_library(video_core STATIC add_library(video_core STATIC
buffer_cache/buffer_base.h buffer_cache/buffer_base.h
buffer_cache/buffer_block.h buffer_cache/buffer_cache.cpp
buffer_cache/buffer_cache.h buffer_cache/buffer_cache.h
buffer_cache/map_interval.cpp
buffer_cache/map_interval.h
cdma_pusher.cpp cdma_pusher.cpp
cdma_pusher.h cdma_pusher.h
command_classes/codecs/codec.cpp command_classes/codecs/codec.cpp
@ -152,8 +150,6 @@ add_library(video_core STATIC
renderer_vulkan/vk_staging_buffer_pool.h renderer_vulkan/vk_staging_buffer_pool.h
renderer_vulkan/vk_state_tracker.cpp renderer_vulkan/vk_state_tracker.cpp
renderer_vulkan/vk_state_tracker.h renderer_vulkan/vk_state_tracker.h
renderer_vulkan/vk_stream_buffer.cpp
renderer_vulkan/vk_stream_buffer.h
renderer_vulkan/vk_swapchain.cpp renderer_vulkan/vk_swapchain.cpp
renderer_vulkan/vk_swapchain.h renderer_vulkan/vk_swapchain.h
renderer_vulkan/vk_texture_cache.cpp renderer_vulkan/vk_texture_cache.cpp

View file

@ -19,6 +19,7 @@ namespace VideoCommon {
enum class BufferFlagBits { enum class BufferFlagBits {
Picked = 1 << 0, Picked = 1 << 0,
CachedWrites = 1 << 1,
}; };
DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits) DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits)
@ -40,7 +41,7 @@ class BufferBase {
static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE;
/// Vector tracking modified pages tightly packed with small vector optimization /// Vector tracking modified pages tightly packed with small vector optimization
union WrittenWords { union WordsArray {
/// Returns the pointer to the words state /// Returns the pointer to the words state
[[nodiscard]] const u64* Pointer(bool is_short) const noexcept { [[nodiscard]] const u64* Pointer(bool is_short) const noexcept {
return is_short ? &stack : heap; return is_short ? &stack : heap;
@ -55,49 +56,59 @@ class BufferBase {
u64* heap; ///< Not-small buffers pointer to the storage u64* heap; ///< Not-small buffers pointer to the storage
}; };
struct GpuCpuWords { struct Words {
explicit GpuCpuWords() = default; explicit Words() = default;
explicit GpuCpuWords(u64 size_bytes_) : size_bytes{size_bytes_} { explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} {
if (IsShort()) { if (IsShort()) {
cpu.stack = ~u64{0}; cpu.stack = ~u64{0};
gpu.stack = 0; gpu.stack = 0;
cached_cpu.stack = 0;
untracked.stack = ~u64{0};
} else { } else {
// Share allocation between CPU and GPU pages and set their default values // Share allocation between CPU and GPU pages and set their default values
const size_t num_words = NumWords(); const size_t num_words = NumWords();
u64* const alloc = new u64[num_words * 2]; u64* const alloc = new u64[num_words * 4];
cpu.heap = alloc; cpu.heap = alloc;
gpu.heap = alloc + num_words; gpu.heap = alloc + num_words;
cached_cpu.heap = alloc + num_words * 2;
untracked.heap = alloc + num_words * 3;
std::fill_n(cpu.heap, num_words, ~u64{0}); std::fill_n(cpu.heap, num_words, ~u64{0});
std::fill_n(gpu.heap, num_words, 0); std::fill_n(gpu.heap, num_words, 0);
std::fill_n(cached_cpu.heap, num_words, 0);
std::fill_n(untracked.heap, num_words, ~u64{0});
} }
// Clean up tailing bits // Clean up tailing bits
const u64 last_local_page = const u64 last_word_size = size_bytes % BYTES_PER_WORD;
Common::DivCeil(size_bytes % BYTES_PER_WORD, BYTES_PER_PAGE); const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE);
const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD;
u64& last_word = cpu.Pointer(IsShort())[NumWords() - 1]; const u64 last_word = (~u64{0} << shift) >> shift;
last_word = (last_word << shift) >> shift; cpu.Pointer(IsShort())[NumWords() - 1] = last_word;
untracked.Pointer(IsShort())[NumWords() - 1] = last_word;
} }
~GpuCpuWords() { ~Words() {
Release(); Release();
} }
GpuCpuWords& operator=(GpuCpuWords&& rhs) noexcept { Words& operator=(Words&& rhs) noexcept {
Release(); Release();
size_bytes = rhs.size_bytes; size_bytes = rhs.size_bytes;
cpu = rhs.cpu; cpu = rhs.cpu;
gpu = rhs.gpu; gpu = rhs.gpu;
cached_cpu = rhs.cached_cpu;
untracked = rhs.untracked;
rhs.cpu.heap = nullptr; rhs.cpu.heap = nullptr;
return *this; return *this;
} }
GpuCpuWords(GpuCpuWords&& rhs) noexcept Words(Words&& rhs) noexcept
: size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu} { : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu},
cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} {
rhs.cpu.heap = nullptr; rhs.cpu.heap = nullptr;
} }
GpuCpuWords& operator=(const GpuCpuWords&) = delete; Words& operator=(const Words&) = delete;
GpuCpuWords(const GpuCpuWords&) = delete; Words(const Words&) = delete;
/// Returns true when the buffer fits in the small vector optimization /// Returns true when the buffer fits in the small vector optimization
[[nodiscard]] bool IsShort() const noexcept { [[nodiscard]] bool IsShort() const noexcept {
@ -118,8 +129,17 @@ class BufferBase {
} }
u64 size_bytes = 0; u64 size_bytes = 0;
WrittenWords cpu; WordsArray cpu;
WrittenWords gpu; WordsArray gpu;
WordsArray cached_cpu;
WordsArray untracked;
};
enum class Type {
CPU,
GPU,
CachedCPU,
Untracked,
}; };
public: public:
@ -132,68 +152,93 @@ public:
BufferBase& operator=(const BufferBase&) = delete; BufferBase& operator=(const BufferBase&) = delete;
BufferBase(const BufferBase&) = delete; BufferBase(const BufferBase&) = delete;
BufferBase& operator=(BufferBase&&) = default;
BufferBase(BufferBase&&) = default;
/// Returns the inclusive CPU modified range in a begin end pair /// Returns the inclusive CPU modified range in a begin end pair
[[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr,
u64 query_size) const noexcept { u64 query_size) const noexcept {
const u64 offset = query_cpu_addr - cpu_addr; const u64 offset = query_cpu_addr - cpu_addr;
return ModifiedRegion<false>(offset, query_size); return ModifiedRegion<Type::CPU>(offset, query_size);
} }
/// Returns the inclusive GPU modified range in a begin end pair /// Returns the inclusive GPU modified range in a begin end pair
[[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr,
u64 query_size) const noexcept { u64 query_size) const noexcept {
const u64 offset = query_cpu_addr - cpu_addr; const u64 offset = query_cpu_addr - cpu_addr;
return ModifiedRegion<true>(offset, query_size); return ModifiedRegion<Type::GPU>(offset, query_size);
} }
/// Returns true if a region has been modified from the CPU /// Returns true if a region has been modified from the CPU
[[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
const u64 offset = query_cpu_addr - cpu_addr; const u64 offset = query_cpu_addr - cpu_addr;
return IsRegionModified<false>(offset, query_size); return IsRegionModified<Type::CPU>(offset, query_size);
} }
/// Returns true if a region has been modified from the GPU /// Returns true if a region has been modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept {
const u64 offset = query_cpu_addr - cpu_addr; const u64 offset = query_cpu_addr - cpu_addr;
return IsRegionModified<true>(offset, query_size); return IsRegionModified<Type::GPU>(offset, query_size);
} }
/// Mark region as CPU modified, notifying the rasterizer about this change /// Mark region as CPU modified, notifying the rasterizer about this change
void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
ChangeRegionState<true, true>(words.cpu, dirty_cpu_addr, size); ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size);
} }
/// Unmark region as CPU modified, notifying the rasterizer about this change /// Unmark region as CPU modified, notifying the rasterizer about this change
void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) {
ChangeRegionState<false, true>(words.cpu, dirty_cpu_addr, size); ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size);
} }
/// Mark region as modified from the host GPU /// Mark region as modified from the host GPU
void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
ChangeRegionState<true, false>(words.gpu, dirty_cpu_addr, size); ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size);
} }
/// Unmark region as modified from the host GPU /// Unmark region as modified from the host GPU
void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept {
ChangeRegionState<false, false>(words.gpu, dirty_cpu_addr, size); ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size);
}
/// Mark region as modified from the CPU
/// but don't mark it as modified until FlusHCachedWrites is called.
void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) {
flags |= BufferFlagBits::CachedWrites;
ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size);
}
/// Flushes cached CPU writes, and notify the rasterizer about the deltas
void FlushCachedWrites() noexcept {
flags &= ~BufferFlagBits::CachedWrites;
const u64 num_words = NumWords();
const u64* const cached_words = Array<Type::CachedCPU>();
u64* const untracked_words = Array<Type::Untracked>();
u64* const cpu_words = Array<Type::CPU>();
for (u64 word_index = 0; word_index < num_words; ++word_index) {
const u64 cached_bits = cached_words[word_index];
NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits);
untracked_words[word_index] |= cached_bits;
cpu_words[word_index] |= cached_bits;
}
} }
/// Call 'func' for each CPU modified range and unmark those pages as CPU modified /// Call 'func' for each CPU modified range and unmark those pages as CPU modified
template <typename Func> template <typename Func>
void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) {
ForEachModifiedRange<false, true>(query_cpu_range, size, func); ForEachModifiedRange<Type::CPU>(query_cpu_range, size, func);
} }
/// Call 'func' for each GPU modified range and unmark those pages as GPU modified /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
template <typename Func> template <typename Func>
void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) {
ForEachModifiedRange<true, false>(query_cpu_range, size, func); ForEachModifiedRange<Type::GPU>(query_cpu_range, size, func);
} }
/// Call 'func' for each GPU modified range and unmark those pages as GPU modified /// Call 'func' for each GPU modified range and unmark those pages as GPU modified
template <typename Func> template <typename Func>
void ForEachDownloadRange(Func&& func) { void ForEachDownloadRange(Func&& func) {
ForEachModifiedRange<true, false>(cpu_addr, SizeBytes(), func); ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), func);
} }
/// Mark buffer as picked /// Mark buffer as picked
@ -206,6 +251,16 @@ public:
flags &= ~BufferFlagBits::Picked; flags &= ~BufferFlagBits::Picked;
} }
/// Increases the likeliness of this being a stream buffer
void IncreaseStreamScore(int score) noexcept {
stream_score += score;
}
/// Returns the likeliness of this being a stream buffer
[[nodiscard]] int StreamScore() const noexcept {
return stream_score;
}
/// Returns true when vaddr -> vaddr+size is fully contained in the buffer /// Returns true when vaddr -> vaddr+size is fully contained in the buffer
[[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept { [[nodiscard]] bool IsInBounds(VAddr addr, u64 size) const noexcept {
return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes(); return addr >= cpu_addr && addr + size <= cpu_addr + SizeBytes();
@ -216,6 +271,11 @@ public:
return True(flags & BufferFlagBits::Picked); return True(flags & BufferFlagBits::Picked);
} }
/// Returns true when the buffer has pending cached writes
[[nodiscard]] bool HasCachedWrites() const noexcept {
return True(flags & BufferFlagBits::CachedWrites);
}
/// Returns the base CPU address of the buffer /// Returns the base CPU address of the buffer
[[nodiscard]] VAddr CpuAddr() const noexcept { [[nodiscard]] VAddr CpuAddr() const noexcept {
return cpu_addr; return cpu_addr;
@ -233,26 +293,48 @@ public:
} }
private: private:
template <Type type>
u64* Array() noexcept {
if constexpr (type == Type::CPU) {
return words.cpu.Pointer(IsShort());
} else if constexpr (type == Type::GPU) {
return words.gpu.Pointer(IsShort());
} else if constexpr (type == Type::CachedCPU) {
return words.cached_cpu.Pointer(IsShort());
} else if constexpr (type == Type::Untracked) {
return words.untracked.Pointer(IsShort());
}
}
template <Type type>
const u64* Array() const noexcept {
if constexpr (type == Type::CPU) {
return words.cpu.Pointer(IsShort());
} else if constexpr (type == Type::GPU) {
return words.gpu.Pointer(IsShort());
} else if constexpr (type == Type::CachedCPU) {
return words.cached_cpu.Pointer(IsShort());
} else if constexpr (type == Type::Untracked) {
return words.untracked.Pointer(IsShort());
}
}
/** /**
* Change the state of a range of pages * Change the state of a range of pages
* *
* @param written_words Pages to be marked or unmarked as modified
* @param dirty_addr Base address to mark or unmark as modified * @param dirty_addr Base address to mark or unmark as modified
* @param size Size in bytes to mark or unmark as modified * @param size Size in bytes to mark or unmark as modified
*
* @tparam enable True when the bits will be set to one, false for zero
* @tparam notify_rasterizer True when the rasterizer has to be notified about the changes
*/ */
template <bool enable, bool notify_rasterizer> template <Type type, bool enable>
void ChangeRegionState(WrittenWords& written_words, u64 dirty_addr, void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) {
s64 size) noexcept(!notify_rasterizer) {
const s64 difference = dirty_addr - cpu_addr; const s64 difference = dirty_addr - cpu_addr;
const u64 offset = std::max<s64>(difference, 0); const u64 offset = std::max<s64>(difference, 0);
size += std::min<s64>(difference, 0); size += std::min<s64>(difference, 0);
if (offset >= SizeBytes() || size < 0) { if (offset >= SizeBytes() || size < 0) {
return; return;
} }
u64* const state_words = written_words.Pointer(IsShort()); u64* const untracked_words = Array<Type::Untracked>();
u64* const state_words = Array<type>();
const u64 offset_end = std::min(offset + size, SizeBytes()); const u64 offset_end = std::min(offset + size, SizeBytes());
const u64 begin_page_index = offset / BYTES_PER_PAGE; const u64 begin_page_index = offset / BYTES_PER_PAGE;
const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; const u64 begin_word_index = begin_page_index / PAGES_PER_WORD;
@ -268,13 +350,19 @@ private:
u64 bits = ~u64{0}; u64 bits = ~u64{0};
bits = (bits >> right_offset) << right_offset; bits = (bits >> right_offset) << right_offset;
bits = (bits << left_offset) >> left_offset; bits = (bits << left_offset) >> left_offset;
if constexpr (notify_rasterizer) { if constexpr (type == Type::CPU || type == Type::CachedCPU) {
NotifyRasterizer<!enable>(word_index, state_words[word_index], bits); NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits);
} }
if constexpr (enable) { if constexpr (enable) {
state_words[word_index] |= bits; state_words[word_index] |= bits;
if constexpr (type == Type::CPU || type == Type::CachedCPU) {
untracked_words[word_index] |= bits;
}
} else { } else {
state_words[word_index] &= ~bits; state_words[word_index] &= ~bits;
if constexpr (type == Type::CPU || type == Type::CachedCPU) {
untracked_words[word_index] &= ~bits;
}
} }
page_index = 0; page_index = 0;
++word_index; ++word_index;
@ -291,7 +379,7 @@ private:
* @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages
*/ */
template <bool add_to_rasterizer> template <bool add_to_rasterizer>
void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) { void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const {
u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits;
VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; VAddr addr = cpu_addr + word_index * BYTES_PER_WORD;
while (changed_bits != 0) { while (changed_bits != 0) {
@ -315,21 +403,20 @@ private:
* @param query_cpu_range Base CPU address to loop over * @param query_cpu_range Base CPU address to loop over
* @param size Size in bytes of the CPU range to loop over * @param size Size in bytes of the CPU range to loop over
* @param func Function to call for each turned off region * @param func Function to call for each turned off region
*
* @tparam gpu True for host GPU pages, false for CPU pages
* @tparam notify_rasterizer True when the rasterizer should be notified about state changes
*/ */
template <bool gpu, bool notify_rasterizer, typename Func> template <Type type, typename Func>
void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) {
static_assert(type != Type::Untracked);
const s64 difference = query_cpu_range - cpu_addr; const s64 difference = query_cpu_range - cpu_addr;
const u64 query_begin = std::max<s64>(difference, 0); const u64 query_begin = std::max<s64>(difference, 0);
size += std::min<s64>(difference, 0); size += std::min<s64>(difference, 0);
if (query_begin >= SizeBytes() || size < 0) { if (query_begin >= SizeBytes() || size < 0) {
return; return;
} }
const u64* const cpu_words = words.cpu.Pointer(IsShort()); u64* const untracked_words = Array<Type::Untracked>();
u64* const state_words = Array<type>();
const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD);
@ -345,7 +432,8 @@ private:
const u64 word_index_end = std::distance(state_words, last_modified_word); const u64 word_index_end = std::distance(state_words, last_modified_word);
const unsigned local_page_begin = std::countr_zero(*first_modified_word); const unsigned local_page_begin = std::countr_zero(*first_modified_word);
const unsigned local_page_end = PAGES_PER_WORD - std::countl_zero(last_modified_word[-1]); const unsigned local_page_end =
static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]);
const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; const u64 word_page_begin = word_index_begin * PAGES_PER_WORD;
const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD;
const u64 query_page_begin = query_begin / BYTES_PER_PAGE; const u64 query_page_begin = query_begin / BYTES_PER_PAGE;
@ -371,11 +459,13 @@ private:
const u64 current_word = state_words[word_index] & bits; const u64 current_word = state_words[word_index] & bits;
state_words[word_index] &= ~bits; state_words[word_index] &= ~bits;
// Exclude CPU modified pages when visiting GPU pages if constexpr (type == Type::CPU) {
const u64 word = current_word & ~(gpu ? cpu_words[word_index] : 0); const u64 current_bits = untracked_words[word_index] & bits;
if constexpr (notify_rasterizer) { untracked_words[word_index] &= ~bits;
NotifyRasterizer<true>(word_index, word, ~u64{0}); NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
} }
// Exclude CPU modified pages when visiting GPU pages
const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
u64 page = page_begin; u64 page = page_begin;
page_begin = 0; page_begin = 0;
@ -416,17 +506,20 @@ private:
* @param offset Offset in bytes from the start of the buffer * @param offset Offset in bytes from the start of the buffer
* @param size Size in bytes of the region to query for modifications * @param size Size in bytes of the region to query for modifications
*/ */
template <bool gpu> template <Type type>
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
const u64* const cpu_words = words.cpu.Pointer(IsShort()); static_assert(type != Type::Untracked);
const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
const u64* const untracked_words = Array<Type::Untracked>();
const u64* const state_words = Array<type>();
const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 num_query_words = size / BYTES_PER_WORD + 1;
const u64 word_begin = offset / BYTES_PER_WORD; const u64 word_begin = offset / BYTES_PER_WORD;
const u64 word_end = std::min(word_begin + num_query_words, NumWords()); const u64 word_end = std::min(word_begin + num_query_words, NumWords());
const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
const u64 word = state_words[word_index] & ~off_word;
if (word == 0) { if (word == 0) {
continue; continue;
} }
@ -445,13 +538,13 @@ private:
* *
* @param offset Offset in bytes from the start of the buffer * @param offset Offset in bytes from the start of the buffer
* @param size Size in bytes of the region to query for modifications * @param size Size in bytes of the region to query for modifications
*
* @tparam gpu True to query GPU modified pages, false for CPU pages
*/ */
template <bool gpu> template <Type type>
[[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
const u64* const cpu_words = words.cpu.Pointer(IsShort()); static_assert(type != Type::Untracked);
const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort());
const u64* const untracked_words = Array<Type::Untracked>();
const u64* const state_words = Array<type>();
const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 num_query_words = size / BYTES_PER_WORD + 1;
const u64 word_begin = offset / BYTES_PER_WORD; const u64 word_begin = offset / BYTES_PER_WORD;
const u64 word_end = std::min(word_begin + num_query_words, NumWords()); const u64 word_end = std::min(word_begin + num_query_words, NumWords());
@ -460,7 +553,8 @@ private:
u64 begin = std::numeric_limits<u64>::max(); u64 begin = std::numeric_limits<u64>::max();
u64 end = 0; u64 end = 0;
for (u64 word_index = word_begin; word_index < word_end; ++word_index) { for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
const u64 word = state_words[word_index] & ~off_word;
if (word == 0) { if (word == 0) {
continue; continue;
} }
@ -488,8 +582,9 @@ private:
RasterizerInterface* rasterizer = nullptr; RasterizerInterface* rasterizer = nullptr;
VAddr cpu_addr = 0; VAddr cpu_addr = 0;
GpuCpuWords words; Words words;
BufferFlagBits flags{}; BufferFlagBits flags{};
int stream_score = 0;
}; };
} // namespace VideoCommon } // namespace VideoCommon

View file

@ -1,62 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include "common/common_types.h"
namespace VideoCommon {
class BufferBlock {
public:
[[nodiscard]] bool Overlaps(VAddr start, VAddr end) const {
return (cpu_addr < end) && (cpu_addr_end > start);
}
[[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const {
return cpu_addr <= other_start && other_end <= cpu_addr_end;
}
[[nodiscard]] std::size_t Offset(VAddr in_addr) const {
return static_cast<std::size_t>(in_addr - cpu_addr);
}
[[nodiscard]] VAddr CpuAddr() const {
return cpu_addr;
}
[[nodiscard]] VAddr CpuAddrEnd() const {
return cpu_addr_end;
}
void SetCpuAddr(VAddr new_addr) {
cpu_addr = new_addr;
cpu_addr_end = new_addr + size;
}
[[nodiscard]] std::size_t Size() const {
return size;
}
[[nodiscard]] u64 Epoch() const {
return epoch;
}
void SetEpoch(u64 new_epoch) {
epoch = new_epoch;
}
protected:
explicit BufferBlock(VAddr cpu_addr_, std::size_t size_) : size{size_} {
SetCpuAddr(cpu_addr_);
}
private:
VAddr cpu_addr{};
VAddr cpu_addr_end{};
std::size_t size{};
u64 epoch{};
};
} // namespace VideoCommon

View file

@ -0,0 +1,13 @@
// Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include "common/microprofile.h"
namespace VideoCommon {
MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128));
MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128));
MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128));
} // namespace VideoCommon

File diff suppressed because it is too large Load diff

View file

@ -1,33 +0,0 @@
// Copyright 2020 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#include <algorithm>
#include <array>
#include <cstddef>
#include <memory>
#include "video_core/buffer_cache/map_interval.h"
namespace VideoCommon {
MapIntervalAllocator::MapIntervalAllocator() {
FillFreeList(first_chunk);
}
MapIntervalAllocator::~MapIntervalAllocator() = default;
void MapIntervalAllocator::AllocateNewChunk() {
*new_chunk = std::make_unique<Chunk>();
FillFreeList(**new_chunk);
new_chunk = &(*new_chunk)->next;
}
void MapIntervalAllocator::FillFreeList(Chunk& chunk) {
const std::size_t old_size = free_list.size();
free_list.resize(old_size + chunk.data.size());
std::transform(chunk.data.rbegin(), chunk.data.rend(), free_list.begin() + old_size,
[](MapInterval& interval) { return &interval; });
}
} // namespace VideoCommon

View file

@ -1,93 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#pragma once
#include <array>
#include <cstddef>
#include <memory>
#include <vector>
#include <boost/intrusive/set_hook.hpp>
#include "common/common_types.h"
#include "video_core/gpu.h"
namespace VideoCommon {
struct MapInterval : public boost::intrusive::set_base_hook<boost::intrusive::optimize_size<true>> {
MapInterval() = default;
/*implicit*/ MapInterval(VAddr start_) noexcept : start{start_} {}
explicit MapInterval(VAddr start_, VAddr end_, GPUVAddr gpu_addr_) noexcept
: start{start_}, end{end_}, gpu_addr{gpu_addr_} {}
bool IsInside(VAddr other_start, VAddr other_end) const noexcept {
return start <= other_start && other_end <= end;
}
bool Overlaps(VAddr other_start, VAddr other_end) const noexcept {
return start < other_end && other_start < end;
}
void MarkAsModified(bool is_modified_, u64 ticks_) noexcept {
is_modified = is_modified_;
ticks = ticks_;
}
boost::intrusive::set_member_hook<> member_hook_;
VAddr start = 0;
VAddr end = 0;
GPUVAddr gpu_addr = 0;
u64 ticks = 0;
bool is_written = false;
bool is_modified = false;
bool is_registered = false;
bool is_memory_marked = false;
bool is_sync_pending = false;
};
struct MapIntervalCompare {
constexpr bool operator()(const MapInterval& lhs, const MapInterval& rhs) const noexcept {
return lhs.start < rhs.start;
}
};
class MapIntervalAllocator {
public:
MapIntervalAllocator();
~MapIntervalAllocator();
MapInterval* Allocate() {
if (free_list.empty()) {
AllocateNewChunk();
}
MapInterval* const interval = free_list.back();
free_list.pop_back();
return interval;
}
void Release(MapInterval* interval) {
free_list.push_back(interval);
}
private:
struct Chunk {
std::unique_ptr<Chunk> next;
std::array<MapInterval, 0x8000> data;
};
void AllocateNewChunk();
void FillFreeList(Chunk& chunk);
std::vector<MapInterval*> free_list;
Chunk first_chunk;
std::unique_ptr<Chunk>* new_chunk = &first_chunk.next;
};
} // namespace VideoCommon

View file

@ -110,12 +110,10 @@ void Vic::Execute() {
converted_frame_buffer.get(), block_height, 0, 0); converted_frame_buffer.get(), block_height, 0, 0);
gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size);
gpu.Maxwell3D().OnMemoryWrite();
} else { } else {
// send pitch linear frame // send pitch linear frame
gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr,
linear_size); linear_size);
gpu.Maxwell3D().OnMemoryWrite();
} }
break; break;
} }
@ -163,7 +161,6 @@ void Vic::Execute() {
} }
gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(), gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(),
chroma_buffer.size()); chroma_buffer.size());
gpu.Maxwell3D().OnMemoryWrite();
break; break;
} }
default: default:

View file

@ -12,13 +12,30 @@
#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32))) #define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32)))
namespace VideoCommon::Dirty { namespace VideoCommon::Dirty {
namespace {
using Tegra::Engines::Maxwell3D; using Tegra::Engines::Maxwell3D;
void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) { void SetupDirtyVertexBuffers(Maxwell3D::DirtyState::Tables& tables) {
static constexpr std::size_t num_array = 3;
for (std::size_t i = 0; i < Maxwell3D::Regs::NumVertexArrays; ++i) {
const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]);
const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]);
FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers);
FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers);
}
}
void SetupIndexBuffer(Maxwell3D::DirtyState::Tables& tables) {
FillBlock(tables[0], OFF(index_array), NUM(index_array), IndexBuffer);
}
void SetupDirtyDescriptors(Maxwell3D::DirtyState::Tables& tables) {
FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors); FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors);
FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors); FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors);
}
void SetupDirtyRenderTargets(Maxwell3D::DirtyState::Tables& tables) {
static constexpr std::size_t num_per_rt = NUM(rt[0]); static constexpr std::size_t num_per_rt = NUM(rt[0]);
static constexpr std::size_t begin = OFF(rt); static constexpr std::size_t begin = OFF(rt);
static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets; static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets;
@ -41,5 +58,13 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl
FillBlock(table, OFF(zeta), NUM(zeta), flag); FillBlock(table, OFF(zeta), NUM(zeta), flag);
} }
} }
} // Anonymous namespace
void SetupDirtyFlags(Maxwell3D::DirtyState::Tables& tables) {
SetupDirtyVertexBuffers(tables);
SetupIndexBuffer(tables);
SetupDirtyDescriptors(tables);
SetupDirtyRenderTargets(tables);
}
} // namespace VideoCommon::Dirty } // namespace VideoCommon::Dirty

View file

@ -30,6 +30,12 @@ enum : u8 {
ColorBuffer7, ColorBuffer7,
ZetaBuffer, ZetaBuffer,
VertexBuffers,
VertexBuffer0,
VertexBuffer31 = VertexBuffer0 + 31,
IndexBuffer,
LastCommonEntry, LastCommonEntry,
}; };
@ -47,6 +53,6 @@ void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_
FillBlock(tables[1], begin, num, index_b); FillBlock(tables[1], begin, num, index_b);
} }
void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables); void SetupDirtyFlags(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables);
} // namespace VideoCommon::Dirty } // namespace VideoCommon::Dirty

View file

@ -23,8 +23,6 @@ void DmaPusher::DispatchCalls() {
MICROPROFILE_SCOPE(DispatchCalls); MICROPROFILE_SCOPE(DispatchCalls);
gpu.SyncGuestHost(); gpu.SyncGuestHost();
// On entering GPU code, assume all memory may be touched by the ARM core.
gpu.Maxwell3D().OnMemoryWrite();
dma_pushbuffer_subindex = 0; dma_pushbuffer_subindex = 0;

View file

@ -18,8 +18,8 @@ Fermi2D::Fermi2D() {
Fermi2D::~Fermi2D() = default; Fermi2D::~Fermi2D() = default;
void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
rasterizer = &rasterizer_; rasterizer = rasterizer_;
} }
void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {

View file

@ -38,7 +38,7 @@ public:
~Fermi2D(); ~Fermi2D();
/// Binds a rasterizer to this engine. /// Binds a rasterizer to this engine.
void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
/// Write the value to the register identified by method. /// Write the value to the register identified by method.
void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; void CallMethod(u32 method, u32 method_argument, bool is_last_call) override;

View file

@ -21,8 +21,8 @@ KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manage
KeplerCompute::~KeplerCompute() = default; KeplerCompute::~KeplerCompute() = default;
void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
rasterizer = &rasterizer_; rasterizer = rasterizer_;
} }
void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) { void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
@ -39,7 +39,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
case KEPLER_COMPUTE_REG_INDEX(data_upload): { case KEPLER_COMPUTE_REG_INDEX(data_upload): {
upload_state.ProcessData(method_argument, is_last_call); upload_state.ProcessData(method_argument, is_last_call);
if (is_last_call) { if (is_last_call) {
system.GPU().Maxwell3D().OnMemoryWrite();
} }
break; break;
} }

View file

@ -46,7 +46,7 @@ public:
~KeplerCompute(); ~KeplerCompute();
/// Binds a rasterizer to this engine. /// Binds a rasterizer to this engine.
void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
static constexpr std::size_t NumConstBuffers = 8; static constexpr std::size_t NumConstBuffers = 8;

View file

@ -33,7 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
case KEPLERMEMORY_REG_INDEX(data): { case KEPLERMEMORY_REG_INDEX(data): {
upload_state.ProcessData(method_argument, is_last_call); upload_state.ProcessData(method_argument, is_last_call);
if (is_last_call) { if (is_last_call) {
system.GPU().Maxwell3D().OnMemoryWrite();
} }
break; break;
} }

View file

@ -30,8 +30,8 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_)
Maxwell3D::~Maxwell3D() = default; Maxwell3D::~Maxwell3D() = default;
void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
rasterizer = &rasterizer_; rasterizer = rasterizer_;
} }
void Maxwell3D::InitializeRegisterDefaults() { void Maxwell3D::InitializeRegisterDefaults() {
@ -223,7 +223,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
case MAXWELL3D_REG_INDEX(data_upload): case MAXWELL3D_REG_INDEX(data_upload):
upload_state.ProcessData(argument, is_last_call); upload_state.ProcessData(argument, is_last_call);
if (is_last_call) { if (is_last_call) {
OnMemoryWrite();
} }
return; return;
case MAXWELL3D_REG_INDEX(fragment_barrier): case MAXWELL3D_REG_INDEX(fragment_barrier):
@ -570,17 +569,18 @@ std::optional<u64> Maxwell3D::GetQueryResult() {
} }
} }
void Maxwell3D::ProcessCBBind(std::size_t stage_index) { void Maxwell3D::ProcessCBBind(size_t stage_index) {
// Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage. // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage.
auto& shader = state.shader_stages[stage_index]; const auto& bind_data = regs.cb_bind[stage_index];
auto& bind_data = regs.cb_bind[stage_index]; auto& buffer = state.shader_stages[stage_index].const_buffers[bind_data.index];
ASSERT(bind_data.index < Regs::MaxConstBuffers);
auto& buffer = shader.const_buffers[bind_data.index];
buffer.enabled = bind_data.valid.Value() != 0; buffer.enabled = bind_data.valid.Value() != 0;
buffer.address = regs.const_buffer.BufferAddress(); buffer.address = regs.const_buffer.BufferAddress();
buffer.size = regs.const_buffer.cb_size; buffer.size = regs.const_buffer.cb_size;
const bool is_enabled = bind_data.valid.Value() != 0;
const GPUVAddr gpu_addr = is_enabled ? regs.const_buffer.BufferAddress() : 0;
const u32 size = is_enabled ? regs.const_buffer.cb_size : 0;
rasterizer->BindGraphicsUniformBuffer(stage_index, bind_data.index, gpu_addr, size);
} }
void Maxwell3D::ProcessCBData(u32 value) { void Maxwell3D::ProcessCBData(u32 value) {
@ -635,7 +635,6 @@ void Maxwell3D::FinishCBData() {
const u32 id = cb_data_state.id; const u32 id = cb_data_state.id;
memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size);
OnMemoryWrite();
cb_data_state.id = null_cb_data; cb_data_state.id = null_cb_data;
cb_data_state.current = null_cb_data; cb_data_state.current = null_cb_data;

View file

@ -55,7 +55,7 @@ public:
~Maxwell3D(); ~Maxwell3D();
/// Binds a rasterizer to this engine. /// Binds a rasterizer to this engine.
void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
/// Register structure of the Maxwell3D engine. /// Register structure of the Maxwell3D engine.
/// TODO(Subv): This structure will need to be made bigger as more registers are discovered. /// TODO(Subv): This structure will need to be made bigger as more registers are discovered.
@ -1314,8 +1314,7 @@ public:
GPUVAddr LimitAddress() const { GPUVAddr LimitAddress() const {
return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) | return static_cast<GPUVAddr>((static_cast<GPUVAddr>(limit_high) << 32) |
limit_low) + limit_low);
1;
} }
} vertex_array_limit[NumVertexArrays]; } vertex_array_limit[NumVertexArrays];
@ -1403,6 +1402,7 @@ public:
}; };
std::array<ShaderStageInfo, Regs::MaxShaderStage> shader_stages; std::array<ShaderStageInfo, Regs::MaxShaderStage> shader_stages;
u32 current_instance = 0; ///< Current instance to be used to simulate instanced rendering. u32 current_instance = 0; ///< Current instance to be used to simulate instanced rendering.
}; };
@ -1452,11 +1452,6 @@ public:
return *rasterizer; return *rasterizer;
} }
/// Notify a memory write has happened.
void OnMemoryWrite() {
dirty.flags |= dirty.on_write_stores;
}
enum class MMEDrawMode : u32 { enum class MMEDrawMode : u32 {
Undefined, Undefined,
Array, Array,
@ -1478,7 +1473,6 @@ public:
using Tables = std::array<Table, 2>; using Tables = std::array<Table, 2>;
Flags flags; Flags flags;
Flags on_write_stores;
Tables tables{}; Tables tables{};
} dirty; } dirty;
@ -1541,7 +1535,7 @@ private:
void FinishCBData(); void FinishCBData();
/// Handles a write to the CB_BIND register. /// Handles a write to the CB_BIND register.
void ProcessCBBind(std::size_t stage_index); void ProcessCBBind(size_t stage_index);
/// Handles a write to the VERTEX_END_GL register, triggering a draw. /// Handles a write to the VERTEX_END_GL register, triggering a draw.
void DrawArrays(); void DrawArrays();

View file

@ -60,9 +60,6 @@ void MaxwellDMA::Launch() {
return; return;
} }
// All copies here update the main memory, so mark all rasterizer states as invalid.
system.GPU().Maxwell3D().OnMemoryWrite();
if (is_src_pitch && is_dst_pitch) { if (is_src_pitch && is_dst_pitch) {
CopyPitchToPitch(); CopyPitchToPitch();
} else { } else {

View file

@ -143,22 +143,26 @@ private:
} }
bool ShouldWait() const { bool ShouldWait() const {
std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() || return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() ||
query_cache.ShouldWaitAsyncFlushes(); query_cache.ShouldWaitAsyncFlushes();
} }
bool ShouldFlush() const { bool ShouldFlush() const {
std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() || return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() ||
query_cache.HasUncommittedFlushes(); query_cache.HasUncommittedFlushes();
} }
void PopAsyncFlushes() { void PopAsyncFlushes() {
std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
texture_cache.PopAsyncFlushes(); texture_cache.PopAsyncFlushes();
buffer_cache.PopAsyncFlushes(); buffer_cache.PopAsyncFlushes();
query_cache.PopAsyncFlushes(); query_cache.PopAsyncFlushes();
} }
void CommitAsyncFlushes() { void CommitAsyncFlushes() {
std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
texture_cache.CommitAsyncFlushes(); texture_cache.CommitAsyncFlushes();
buffer_cache.CommitAsyncFlushes(); buffer_cache.CommitAsyncFlushes();
query_cache.CommitAsyncFlushes(); query_cache.CommitAsyncFlushes();

View file

@ -44,8 +44,8 @@ GPU::~GPU() = default;
void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) { void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) {
renderer = std::move(renderer_); renderer = std::move(renderer_);
rasterizer = renderer->ReadRasterizer();
VideoCore::RasterizerInterface& rasterizer = renderer->Rasterizer();
memory_manager->BindRasterizer(rasterizer); memory_manager->BindRasterizer(rasterizer);
maxwell_3d->BindRasterizer(rasterizer); maxwell_3d->BindRasterizer(rasterizer);
fermi_2d->BindRasterizer(rasterizer); fermi_2d->BindRasterizer(rasterizer);
@ -171,7 +171,7 @@ void GPU::TickWork() {
const std::size_t size = request.size; const std::size_t size = request.size;
flush_requests.pop_front(); flush_requests.pop_front();
flush_request_mutex.unlock(); flush_request_mutex.unlock();
renderer->Rasterizer().FlushRegion(addr, size); rasterizer->FlushRegion(addr, size);
current_flush_fence.store(fence); current_flush_fence.store(fence);
flush_request_mutex.lock(); flush_request_mutex.lock();
} }
@ -193,11 +193,11 @@ u64 GPU::GetTicks() const {
} }
void GPU::FlushCommands() { void GPU::FlushCommands() {
renderer->Rasterizer().FlushCommands(); rasterizer->FlushCommands();
} }
void GPU::SyncGuestHost() { void GPU::SyncGuestHost() {
renderer->Rasterizer().SyncGuestHost(); rasterizer->SyncGuestHost();
} }
enum class GpuSemaphoreOperation { enum class GpuSemaphoreOperation {

View file

@ -366,6 +366,7 @@ protected:
std::unique_ptr<Tegra::DmaPusher> dma_pusher; std::unique_ptr<Tegra::DmaPusher> dma_pusher;
std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; std::unique_ptr<Tegra::CDmaPusher> cdma_pusher;
std::unique_ptr<VideoCore::RendererBase> renderer; std::unique_ptr<VideoCore::RendererBase> renderer;
VideoCore::RasterizerInterface* rasterizer = nullptr;
const bool use_nvdec; const bool use_nvdec;
private: private:

View file

@ -38,6 +38,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
} }
auto current_context = context.Acquire(); auto current_context = context.Acquire();
VideoCore::RasterizerInterface* const rasterizer = renderer.ReadRasterizer();
CommandDataContainer next; CommandDataContainer next;
while (state.is_running) { while (state.is_running) {
@ -52,13 +53,13 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer,
} else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) { } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) {
renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr); renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
} else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) {
renderer.Rasterizer().ReleaseFences(); rasterizer->ReleaseFences();
} else if (std::holds_alternative<GPUTickCommand>(next.data)) { } else if (std::holds_alternative<GPUTickCommand>(next.data)) {
system.GPU().TickWork(); system.GPU().TickWork();
} else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) { } else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) {
renderer.Rasterizer().FlushRegion(flush->addr, flush->size); rasterizer->FlushRegion(flush->addr, flush->size);
} else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) { } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) {
renderer.Rasterizer().OnCPUWrite(invalidate->addr, invalidate->size); rasterizer->OnCPUWrite(invalidate->addr, invalidate->size);
} else if (std::holds_alternative<EndProcessingCommand>(next.data)) { } else if (std::holds_alternative<EndProcessingCommand>(next.data)) {
return; return;
} else { } else {
@ -84,6 +85,7 @@ ThreadManager::~ThreadManager() {
void ThreadManager::StartThread(VideoCore::RendererBase& renderer, void ThreadManager::StartThread(VideoCore::RendererBase& renderer,
Core::Frontend::GraphicsContext& context, Core::Frontend::GraphicsContext& context,
Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) { Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) {
rasterizer = renderer.ReadRasterizer();
thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context),
std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher)); std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher));
} }
@ -129,12 +131,12 @@ void ThreadManager::FlushRegion(VAddr addr, u64 size) {
} }
void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
system.Renderer().Rasterizer().OnCPUWrite(addr, size); rasterizer->OnCPUWrite(addr, size);
} }
void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
// Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important
system.Renderer().Rasterizer().OnCPUWrite(addr, size); rasterizer->OnCPUWrite(addr, size);
} }
void ThreadManager::WaitIdle() const { void ThreadManager::WaitIdle() const {

View file

@ -27,6 +27,7 @@ class System;
} // namespace Core } // namespace Core
namespace VideoCore { namespace VideoCore {
class RasterizerInterface;
class RendererBase; class RendererBase;
} // namespace VideoCore } // namespace VideoCore
@ -151,11 +152,12 @@ private:
/// Pushes a command to be executed by the GPU thread /// Pushes a command to be executed by the GPU thread
u64 PushCommand(CommandData&& command_data); u64 PushCommand(CommandData&& command_data);
SynchState state;
Core::System& system; Core::System& system;
std::thread thread;
std::thread::id thread_id;
const bool is_async; const bool is_async;
VideoCore::RasterizerInterface* rasterizer = nullptr;
SynchState state;
std::thread thread;
}; };
} // namespace VideoCommon::GPUThread } // namespace VideoCommon::GPUThread

View file

@ -12,7 +12,6 @@ set(SHADER_FILES
vulkan_blit_depth_stencil.frag vulkan_blit_depth_stencil.frag
vulkan_present.frag vulkan_present.frag
vulkan_present.vert vulkan_present.vert
vulkan_quad_array.comp
vulkan_quad_indexed.comp vulkan_quad_indexed.comp
vulkan_uint8.comp vulkan_uint8.comp
) )

View file

@ -1,28 +0,0 @@
// Copyright 2019 yuzu Emulator Project
// Licensed under GPLv2 or any later version
// Refer to the license.txt file included.
#version 460 core
layout (local_size_x = 1024) in;
layout (std430, set = 0, binding = 0) buffer OutputBuffer {
uint output_indexes[];
};
layout (push_constant) uniform PushConstants {
uint first;
};
void main() {
uint primitive = gl_GlobalInvocationID.x;
if (primitive * 6 >= output_indexes.length()) {
return;
}
const uint quad_map[6] = uint[](0, 1, 2, 0, 2, 3);
for (uint vertex = 0; vertex < 6; ++vertex) {
uint index = first + primitive * 4 + quad_map[vertex];
output_indexes[primitive * 6 + vertex] = index;
}
}

View file

@ -16,9 +16,16 @@ layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer {
uint16_t output_indexes[]; uint16_t output_indexes[];
}; };
uint AssembleIndex(uint id) {
// Most primitive restart indices are 0xFF
// Hardcode this to 0xFF for now
uint index = uint(input_indexes[id]);
return index == 0xFF ? 0xFFFF : index;
}
void main() { void main() {
uint id = gl_GlobalInvocationID.x; uint id = gl_GlobalInvocationID.x;
if (id < input_indexes.length()) { if (id < input_indexes.length()) {
output_indexes[id] = uint16_t(input_indexes[id]); output_indexes[id] = uint16_t(AssembleIndex(id));
} }
} }

View file

@ -21,8 +21,8 @@ MemoryManager::MemoryManager(Core::System& system_)
MemoryManager::~MemoryManager() = default; MemoryManager::~MemoryManager() = default;
void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) {
rasterizer = &rasterizer_; rasterizer = rasterizer_;
} }
GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) { GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) {

View file

@ -72,7 +72,7 @@ public:
~MemoryManager(); ~MemoryManager();
/// Binds a renderer to the memory manager. /// Binds a renderer to the memory manager.
void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
[[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const; [[nodiscard]] std::optional<VAddr> GpuToCpuAddress(GPUVAddr addr) const;
@ -157,6 +157,8 @@ private:
using MapRange = std::pair<GPUVAddr, size_t>; using MapRange = std::pair<GPUVAddr, size_t>;
std::vector<MapRange> map_ranges; std::vector<MapRange> map_ranges;
std::vector<std::pair<VAddr, std::size_t>> cache_invalidate_queue;
}; };
} // namespace Tegra } // namespace Tegra

View file

@ -7,6 +7,7 @@
#include <atomic> #include <atomic>
#include <functional> #include <functional>
#include <optional> #include <optional>
#include <span>
#include "common/common_types.h" #include "common/common_types.h"
#include "video_core/engines/fermi_2d.h" #include "video_core/engines/fermi_2d.h"
#include "video_core/gpu.h" #include "video_core/gpu.h"
@ -49,6 +50,10 @@ public:
/// Records a GPU query and caches it /// Records a GPU query and caches it
virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0;
/// Signal an uniform buffer binding
virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
u32 size) = 0;
/// Signal a GPU based semaphore as a fence /// Signal a GPU based semaphore as a fence
virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0; virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0;

View file

@ -37,15 +37,11 @@ public:
std::unique_ptr<Core::Frontend::GraphicsContext> context); std::unique_ptr<Core::Frontend::GraphicsContext> context);
virtual ~RendererBase(); virtual ~RendererBase();
/// Initialize the renderer
[[nodiscard]] virtual bool Init() = 0;
/// Shutdown the renderer
virtual void ShutDown() = 0;
/// Finalize rendering the guest frame and draw into the presentation texture /// Finalize rendering the guest frame and draw into the presentation texture
virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
[[nodiscard]] virtual RasterizerInterface* ReadRasterizer() = 0;
// Getter/setter functions: // Getter/setter functions:
// ------------------------ // ------------------------
@ -57,14 +53,6 @@ public:
return m_current_frame; return m_current_frame;
} }
[[nodiscard]] RasterizerInterface& Rasterizer() {
return *rasterizer;
}
[[nodiscard]] const RasterizerInterface& Rasterizer() const {
return *rasterizer;
}
[[nodiscard]] Core::Frontend::GraphicsContext& Context() { [[nodiscard]] Core::Frontend::GraphicsContext& Context() {
return *context; return *context;
} }
@ -98,7 +86,6 @@ public:
protected: protected:
Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle. Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle.
std::unique_ptr<RasterizerInterface> rasterizer;
std::unique_ptr<Core::Frontend::GraphicsContext> context; std::unique_ptr<Core::Frontend::GraphicsContext> context;
f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer
int m_current_frame = 0; ///< Current frame, should be set by the renderer int m_current_frame = 0; ///< Current frame, should be set by the renderer

View file

@ -2,98 +2,208 @@
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include <memory> #include <span>
#include <glad/glad.h>
#include "common/assert.h"
#include "common/microprofile.h"
#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_buffer_cache.h"
#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
namespace OpenGL { namespace OpenGL {
namespace {
struct BindlessSSBO {
GLuint64EXT address;
GLsizei length;
GLsizei padding;
};
static_assert(sizeof(BindlessSSBO) == sizeof(GLuint) * 4);
using Maxwell = Tegra::Engines::Maxwell3D::Regs; constexpr std::array PROGRAM_LUT{
GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
};
} // Anonymous namespace
MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params)
: VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {}
Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_) Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
: BufferBlock{cpu_addr_, size_} { VAddr cpu_addr_, u64 size_bytes_)
gl_buffer.Create(); : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) {
glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW); buffer.Create();
if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) { const std::string name = fmt::format("Buffer 0x{:x}", CpuAddr());
glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); glObjectLabel(GL_BUFFER, buffer.handle, static_cast<GLsizei>(name.size()), name.data());
glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); glNamedBufferData(buffer.handle, SizeBytes(), nullptr, GL_DYNAMIC_DRAW);
if (runtime.has_unified_vertex_buffers) {
glGetNamedBufferParameterui64vNV(buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &address);
} }
} }
Buffer::~Buffer() = default; void Buffer::ImmediateUpload(size_t offset, std::span<const u8> data) noexcept {
glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) { static_cast<GLsizeiptr>(data.size_bytes()), data.data());
glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset),
static_cast<GLsizeiptr>(data_size), data);
} }
void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) { void Buffer::ImmediateDownload(size_t offset, std::span<u8> data) noexcept {
MICROPROFILE_SCOPE(OpenGL_Buffer_Download); glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size); static_cast<GLsizeiptr>(data.size_bytes()), data.data());
const GLintptr gl_offset = static_cast<GLintptr>(offset);
if (read_buffer.handle == 0) {
read_buffer.Create();
glNamedBufferData(read_buffer.handle, static_cast<GLsizeiptr>(Size()), nullptr,
GL_STREAM_READ);
}
glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size);
glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data);
} }
void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, void Buffer::MakeResident(GLenum access) noexcept {
std::size_t copy_size) { // Abuse GLenum's order to exit early
glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), // GL_NONE (default) < GL_READ_ONLY < GL_READ_WRITE
static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(copy_size)); if (access <= current_residency_access || buffer.handle == 0) {
}
OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_,
Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
const Device& device_, OGLStreamBuffer& stream_buffer_,
StateTracker& state_tracker)
: GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} {
if (!device.HasFastBufferSubData()) {
return; return;
} }
if (std::exchange(current_residency_access, access) != GL_NONE) {
// If the buffer is already resident, remove its residency before promoting it
glMakeNamedBufferNonResidentNV(buffer.handle);
}
glMakeNamedBufferResidentNV(buffer.handle, access);
}
static constexpr GLsizeiptr size = static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize); BufferCacheRuntime::BufferCacheRuntime(const Device& device_)
glCreateBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); : device{device_}, has_fast_buffer_sub_data{device.HasFastBufferSubData()},
for (const GLuint cbuf : cbufs) { use_assembly_shaders{device.UseAssemblyShaders()},
glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); has_unified_vertex_buffers{device.HasVertexBufferUnifiedMemory()},
stream_buffer{has_fast_buffer_sub_data ? std::nullopt : std::make_optional<StreamBuffer>()} {
GLint gl_max_attributes;
glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &gl_max_attributes);
max_attributes = static_cast<u32>(gl_max_attributes);
for (auto& stage_uniforms : fast_uniforms) {
for (OGLBuffer& buffer : stage_uniforms) {
buffer.Create();
glNamedBufferData(buffer.handle, BufferCache::SKIP_CACHE_SIZE, nullptr, GL_STREAM_DRAW);
}
}
for (auto& stage_uniforms : copy_uniforms) {
for (OGLBuffer& buffer : stage_uniforms) {
buffer.Create();
glNamedBufferData(buffer.handle, 0x10'000, nullptr, GL_STREAM_COPY);
}
}
for (OGLBuffer& buffer : copy_compute_uniforms) {
buffer.Create();
glNamedBufferData(buffer.handle, 0x10'000, nullptr, GL_STREAM_COPY);
} }
} }
OGLBufferCache::~OGLBufferCache() { void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
glDeleteBuffers(static_cast<GLsizei>(std::size(cbufs)), std::data(cbufs)); std::span<const VideoCommon::BufferCopy> copies) {
for (const VideoCommon::BufferCopy& copy : copies) {
glCopyNamedBufferSubData(
src_buffer.Handle(), dst_buffer.Handle(), static_cast<GLintptr>(copy.src_offset),
static_cast<GLintptr>(copy.dst_offset), static_cast<GLsizeiptr>(copy.size));
}
} }
std::shared_ptr<Buffer> OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) {
return std::make_shared<Buffer>(device, cpu_addr, size); if (has_unified_vertex_buffers) {
buffer.MakeResident(GL_READ_ONLY);
glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, buffer.HostGpuAddr() + offset,
static_cast<GLsizeiptr>(size));
} else {
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer.Handle());
index_buffer_offset = offset;
}
} }
OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size,
return {0, 0, 0}; u32 stride) {
if (index >= max_attributes) {
return;
}
if (has_unified_vertex_buffers) {
buffer.MakeResident(GL_READ_ONLY);
glBindVertexBuffer(index, 0, 0, static_cast<GLsizei>(stride));
glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, index,
buffer.HostGpuAddr() + offset, static_cast<GLsizeiptr>(size));
} else {
glBindVertexBuffer(index, buffer.Handle(), static_cast<GLintptr>(offset),
static_cast<GLsizei>(stride));
}
} }
OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer,
std::size_t size) { u32 offset, u32 size) {
DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); if (use_assembly_shaders) {
const GLuint cbuf = cbufs[cbuf_cursor++]; GLuint handle;
if (offset != 0) {
handle = copy_uniforms[stage][binding_index].handle;
glCopyNamedBufferSubData(buffer.Handle(), handle, offset, 0, size);
} else {
handle = buffer.Handle();
}
glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0,
static_cast<GLsizeiptr>(size));
} else {
const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
const GLuint binding = base_binding + binding_index;
glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer.Handle(),
static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
}
}
glNamedBufferSubData(cbuf, 0, static_cast<GLsizeiptr>(size), raw_pointer); void BufferCacheRuntime::BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset,
return {cbuf, 0, 0}; u32 size) {
if (use_assembly_shaders) {
GLuint handle;
if (offset != 0) {
handle = copy_compute_uniforms[binding_index].handle;
glCopyNamedBufferSubData(buffer.Handle(), handle, offset, 0, size);
} else {
handle = buffer.Handle();
}
glBindBufferRangeNV(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding_index, handle, 0,
static_cast<GLsizeiptr>(size));
} else {
glBindBufferRange(GL_UNIFORM_BUFFER, binding_index, buffer.Handle(),
static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
}
}
void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer,
u32 offset, u32 size, bool is_written) {
if (use_assembly_shaders) {
const BindlessSSBO ssbo{
.address = buffer.HostGpuAddr() + offset,
.length = static_cast<GLsizei>(size),
.padding = 0,
};
buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY);
glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1,
reinterpret_cast<const GLuint*>(&ssbo));
} else {
const GLuint base_binding = device.GetBaseBindings(stage).shader_storage_buffer;
const GLuint binding = base_binding + binding_index;
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, buffer.Handle(),
static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
}
}
void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset,
u32 size, bool is_written) {
if (use_assembly_shaders) {
const BindlessSSBO ssbo{
.address = buffer.HostGpuAddr() + offset,
.length = static_cast<GLsizei>(size),
.padding = 0,
};
buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY);
glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1,
reinterpret_cast<const GLuint*>(&ssbo));
} else if (size == 0) {
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, 0, 0, 0);
} else {
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, buffer.Handle(),
static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
}
}
void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset,
u32 size) {
glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, index, buffer.Handle(),
static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
} }
} // namespace OpenGL } // namespace OpenGL

View file

@ -5,79 +5,157 @@
#pragma once #pragma once
#include <array> #include <array>
#include <memory> #include <span>
#include "common/alignment.h"
#include "common/common_types.h" #include "common/common_types.h"
#include "common/dynamic_library.h"
#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/engines/maxwell_3d.h" #include "video_core/rasterizer_interface.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h" #include "video_core/renderer_opengl/gl_stream_buffer.h"
namespace Core {
class System;
}
namespace OpenGL { namespace OpenGL {
class Device; class BufferCacheRuntime;
class OGLStreamBuffer;
class RasterizerOpenGL;
class StateTracker;
class Buffer : public VideoCommon::BufferBlock { class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> {
public: public:
explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_); explicit Buffer(BufferCacheRuntime&, VideoCore::RasterizerInterface& rasterizer, VAddr cpu_addr,
~Buffer(); u64 size_bytes);
explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams);
void Upload(std::size_t offset, std::size_t data_size, const u8* data); void ImmediateUpload(size_t offset, std::span<const u8> data) noexcept;
void Download(std::size_t offset, std::size_t data_size, u8* data); void ImmediateDownload(size_t offset, std::span<u8> data) noexcept;
void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, void MakeResident(GLenum access) noexcept;
std::size_t copy_size);
GLuint Handle() const noexcept { [[nodiscard]] GLuint64EXT HostGpuAddr() const noexcept {
return gl_buffer.handle; return address;
} }
u64 Address() const noexcept { [[nodiscard]] GLuint Handle() const noexcept {
return gpu_address; return buffer.handle;
} }
private: private:
OGLBuffer gl_buffer; GLuint64EXT address = 0;
OGLBuffer read_buffer; OGLBuffer buffer;
u64 gpu_address = 0; GLenum current_residency_access = GL_NONE;
}; };
using GenericBufferCache = VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>; class BufferCacheRuntime {
class OGLBufferCache final : public GenericBufferCache { friend Buffer;
public: public:
explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, static constexpr u8 INVALID_BINDING = std::numeric_limits<u8>::max();
Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
const Device& device, OGLStreamBuffer& stream_buffer,
StateTracker& state_tracker);
~OGLBufferCache();
BufferInfo GetEmptyBuffer(std::size_t) override; explicit BufferCacheRuntime(const Device& device_);
void Acquire() noexcept { void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer,
cbuf_cursor = 0; std::span<const VideoCommon::BufferCopy> copies);
void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size);
void BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride);
void BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size);
void BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size);
void BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size,
bool is_written);
void BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size,
bool is_written);
void BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, u32 size);
void BindFastUniformBuffer(size_t stage, u32 binding_index, u32 size) {
if (use_assembly_shaders) {
const GLuint handle = fast_uniforms[stage][binding_index].handle;
const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size);
glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0, gl_size);
} else {
const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
const GLuint binding = base_binding + binding_index;
glBindBufferRange(GL_UNIFORM_BUFFER, binding,
fast_uniforms[stage][binding_index].handle, 0,
static_cast<GLsizeiptr>(size));
}
} }
protected: void PushFastUniformBuffer(size_t stage, u32 binding_index, std::span<const u8> data) {
std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override; if (use_assembly_shaders) {
glProgramBufferParametersIuivNV(
PABO_LUT[stage], binding_index, 0,
static_cast<GLsizei>(data.size_bytes() / sizeof(GLuint)),
reinterpret_cast<const GLuint*>(data.data()));
} else {
glNamedBufferSubData(fast_uniforms[stage][binding_index].handle, 0,
static_cast<GLsizeiptr>(data.size_bytes()), data.data());
}
}
BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; std::span<u8> BindMappedUniformBuffer(size_t stage, u32 binding_index, u32 size) noexcept {
const auto [mapped_span, offset] = stream_buffer->Request(static_cast<size_t>(size));
const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer;
const GLuint binding = base_binding + binding_index;
glBindBufferRange(GL_UNIFORM_BUFFER, binding, stream_buffer->Handle(),
static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size));
return mapped_span;
}
[[nodiscard]] const GLvoid* IndexOffset() const noexcept {
return reinterpret_cast<const GLvoid*>(static_cast<uintptr_t>(index_buffer_offset));
}
[[nodiscard]] bool HasFastBufferSubData() const noexcept {
return has_fast_buffer_sub_data;
}
private: private:
static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * static constexpr std::array PABO_LUT{
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
};
const Device& device; const Device& device;
std::size_t cbuf_cursor = 0; bool has_fast_buffer_sub_data = false;
std::array<GLuint, NUM_CBUFS> cbufs{}; bool use_assembly_shaders = false;
bool has_unified_vertex_buffers = false;
u32 max_attributes = 0;
std::optional<StreamBuffer> stream_buffer;
std::array<std::array<OGLBuffer, VideoCommon::NUM_GRAPHICS_UNIFORM_BUFFERS>,
VideoCommon::NUM_STAGES>
fast_uniforms;
std::array<std::array<OGLBuffer, VideoCommon::NUM_GRAPHICS_UNIFORM_BUFFERS>,
VideoCommon::NUM_STAGES>
copy_uniforms;
std::array<OGLBuffer, VideoCommon::NUM_COMPUTE_UNIFORM_BUFFERS> copy_compute_uniforms;
u32 index_buffer_offset = 0;
}; };
struct BufferCacheParams {
using Runtime = OpenGL::BufferCacheRuntime;
using Buffer = OpenGL::Buffer;
static constexpr bool IS_OPENGL = true;
static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true;
static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = true;
static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true;
static constexpr bool NEEDS_BIND_STORAGE_INDEX = true;
static constexpr bool USE_MEMORY_MAPS = false;
};
using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
} // namespace OpenGL } // namespace OpenGL

View file

@ -21,9 +21,7 @@
#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_resource_manager.h"
namespace OpenGL { namespace OpenGL {
namespace { namespace {
// One uniform block is reserved for emulation purposes // One uniform block is reserved for emulation purposes
constexpr u32 ReservedUniformBlocks = 1; constexpr u32 ReservedUniformBlocks = 1;
@ -197,11 +195,13 @@ bool IsASTCSupported() {
const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
return nsight || HasExtension(extensions, "GL_EXT_debug_tool"); return nsight || HasExtension(extensions, "GL_EXT_debug_tool");
} }
} // Anonymous namespace } // Anonymous namespace
Device::Device() Device::Device() {
: max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { if (!GLAD_GL_VERSION_4_6) {
LOG_ERROR(Render_OpenGL, "OpenGL 4.6 is not available");
throw std::runtime_error{"Insufficient version"};
}
const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR));
const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION));
const std::vector extensions = GetExtensions(); const std::vector extensions = GetExtensions();
@ -217,6 +217,9 @@ Device::Device()
"Beta driver 443.24 is known to have issues. There might be performance issues."); "Beta driver 443.24 is known to have issues. There might be performance issues.");
disable_fast_buffer_sub_data = true; disable_fast_buffer_sub_data = true;
} }
max_uniform_buffers = BuildMaxUniformBuffers();
base_bindings = BuildBaseBindings();
uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);

View file

@ -10,11 +10,9 @@
namespace OpenGL { namespace OpenGL {
static constexpr u32 EmulationUniformBlockBinding = 0; class Device {
class Device final {
public: public:
struct BaseBindings final { struct BaseBindings {
u32 uniform_buffer{}; u32 uniform_buffer{};
u32 shader_storage_buffer{}; u32 shader_storage_buffer{};
u32 sampler{}; u32 sampler{};

View file

@ -47,7 +47,7 @@ void GLInnerFence::Wait() {
FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_,
Tegra::GPU& gpu_, TextureCache& texture_cache_, Tegra::GPU& gpu_, TextureCache& texture_cache_,
OGLBufferCache& buffer_cache_, QueryCache& query_cache_) BufferCache& buffer_cache_, QueryCache& query_cache_)
: GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {} : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {}
Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) {

View file

@ -32,14 +32,13 @@ private:
}; };
using Fence = std::shared_ptr<GLInnerFence>; using Fence = std::shared_ptr<GLInnerFence>;
using GenericFenceManager = using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>;
VideoCommon::FenceManager<Fence, TextureCache, OGLBufferCache, QueryCache>;
class FenceManagerOpenGL final : public GenericFenceManager { class FenceManagerOpenGL final : public GenericFenceManager {
public: public:
explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
TextureCache& texture_cache_, OGLBufferCache& buffer_cache_, TextureCache& texture_cache, BufferCache& buffer_cache,
QueryCache& query_cache_); QueryCache& query_cache);
protected: protected:
Fence CreateFence(u32 value, bool is_stubbed) override; Fence CreateFence(u32 value, bool is_stubbed) override;

View file

@ -44,28 +44,14 @@ using VideoCore::Surface::PixelFormat;
using VideoCore::Surface::SurfaceTarget; using VideoCore::Surface::SurfaceTarget;
using VideoCore::Surface::SurfaceType; using VideoCore::Surface::SurfaceType;
MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Clears, "OpenGL", "Clears", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192));
MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100)); MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Management", MP_RGB(100, 255, 100));
MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255, 100, 100));
namespace { namespace {
constexpr size_t NUM_CONST_BUFFERS_PER_STAGE = 18;
constexpr size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE =
NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize;
constexpr size_t TOTAL_CONST_BUFFER_BYTES =
NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage;
constexpr size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; constexpr size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16;
constexpr size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16;
struct TextureHandle { struct TextureHandle {
constexpr TextureHandle(u32 data, bool via_header_index) { constexpr TextureHandle(u32 data, bool via_header_index) {
@ -101,20 +87,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const
return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
} }
std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer,
const ConstBufferEntry& entry) {
if (!entry.IsIndirect()) {
return entry.GetSize();
}
if (buffer.size > Maxwell::MaxConstBufferSize) {
LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size,
Maxwell::MaxConstBufferSize);
return Maxwell::MaxConstBufferSize;
}
return buffer.size;
}
/// Translates hardware transform feedback indices /// Translates hardware transform feedback indices
/// @param location Hardware location /// @param location Hardware location
/// @return Pair of ARB_transform_feedback3 token stream first and third arguments /// @return Pair of ARB_transform_feedback3 token stream first and third arguments
@ -147,14 +119,6 @@ void oglEnable(GLenum cap, bool state) {
(state ? glEnable : glDisable)(cap); (state ? glEnable : glDisable)(cap);
} }
void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) {
if (num_ssbos == 0) {
return;
}
glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos),
reinterpret_cast<const GLuint*>(ssbos));
}
ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
if (entry.is_buffer) { if (entry.is_buffer) {
return ImageViewType::Buffer; return ImageViewType::Buffer;
@ -201,44 +165,28 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra
: RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()), : RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()),
kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_),
screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_),
stream_buffer(device, state_tracker),
texture_cache_runtime(device, program_manager, state_tracker), texture_cache_runtime(device, program_manager, state_tracker),
texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
buffer_cache_runtime(device),
buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime),
shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device), shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device),
query_cache(*this, maxwell3d, gpu_memory), query_cache(*this, maxwell3d, gpu_memory),
buffer_cache(*this, gpu_memory, cpu_memory_, device, stream_buffer, state_tracker),
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache),
async_shaders(emu_window_) { async_shaders(emu_window_) {
unified_uniform_buffer.Create();
glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0);
if (device.UseAssemblyShaders()) {
glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
for (const GLuint cbuf : staging_cbufs) {
glNamedBufferStorage(cbuf, static_cast<GLsizeiptr>(Maxwell::MaxConstBufferSize),
nullptr, 0);
}
}
if (device.UseAsynchronousShaders()) { if (device.UseAsynchronousShaders()) {
async_shaders.AllocateWorkers(); async_shaders.AllocateWorkers();
} }
} }
RasterizerOpenGL::~RasterizerOpenGL() { RasterizerOpenGL::~RasterizerOpenGL() = default;
if (device.UseAssemblyShaders()) {
glDeleteBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data());
}
}
void RasterizerOpenGL::SetupVertexFormat() { void RasterizerOpenGL::SyncVertexFormats() {
auto& flags = maxwell3d.dirty.flags; auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexFormats]) { if (!flags[Dirty::VertexFormats]) {
return; return;
} }
flags[Dirty::VertexFormats] = false; flags[Dirty::VertexFormats] = false;
MICROPROFILE_SCOPE(OpenGL_VAO);
// Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables
// the first 16 vertex attributes always, as we don't know which ones are actually used until // the first 16 vertex attributes always, as we don't know which ones are actually used until
// shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to // shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to
@ -274,55 +222,7 @@ void RasterizerOpenGL::SetupVertexFormat() {
} }
} }
void RasterizerOpenGL::SetupVertexBuffer() { void RasterizerOpenGL::SyncVertexInstances() {
auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexBuffers]) {
return;
}
flags[Dirty::VertexBuffers] = false;
MICROPROFILE_SCOPE(OpenGL_VB);
const bool use_unified_memory = device.HasVertexBufferUnifiedMemory();
// Upload all guest vertex arrays sequentially to our buffer
const auto& regs = maxwell3d.regs;
for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) {
if (!flags[Dirty::VertexBuffer0 + index]) {
continue;
}
flags[Dirty::VertexBuffer0 + index] = false;
const auto& vertex_array = regs.vertex_array[index];
if (!vertex_array.IsEnabled()) {
continue;
}
const GPUVAddr start = vertex_array.StartAddress();
const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
ASSERT(end >= start);
const GLuint gl_index = static_cast<GLuint>(index);
const u64 size = end - start;
if (size == 0) {
glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
if (use_unified_memory) {
glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0);
}
continue;
}
const auto info = buffer_cache.UploadMemory(start, size);
if (use_unified_memory) {
glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride);
glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index,
info.address + info.offset, size);
} else {
glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride);
}
}
}
void RasterizerOpenGL::SetupVertexInstances() {
auto& flags = maxwell3d.dirty.flags; auto& flags = maxwell3d.dirty.flags;
if (!flags[Dirty::VertexInstances]) { if (!flags[Dirty::VertexInstances]) {
return; return;
@ -343,17 +243,7 @@ void RasterizerOpenGL::SetupVertexInstances() {
} }
} }
GLintptr RasterizerOpenGL::SetupIndexBuffer() { void RasterizerOpenGL::SetupShaders(bool is_indexed) {
MICROPROFILE_SCOPE(OpenGL_Index);
const auto& regs = maxwell3d.regs;
const std::size_t size = CalculateIndexBufferSize();
const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size);
glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle);
return info.offset;
}
void RasterizerOpenGL::SetupShaders() {
MICROPROFILE_SCOPE(OpenGL_Shader);
u32 clip_distances = 0; u32 clip_distances = 0;
std::array<Shader*, Maxwell::MaxShaderStage> shaders{}; std::array<Shader*, Maxwell::MaxShaderStage> shaders{};
@ -410,11 +300,19 @@ void RasterizerOpenGL::SetupShaders() {
const size_t stage = index == 0 ? 0 : index - 1; const size_t stage = index == 0 ? 0 : index - 1;
shaders[stage] = shader; shaders[stage] = shader;
SetupDrawConstBuffers(stage, shader);
SetupDrawGlobalMemory(stage, shader);
SetupDrawTextures(shader, stage); SetupDrawTextures(shader, stage);
SetupDrawImages(shader, stage); SetupDrawImages(shader, stage);
buffer_cache.SetEnabledUniformBuffers(stage, shader->GetEntries().enabled_uniform_buffers);
buffer_cache.UnbindGraphicsStorageBuffers(stage);
u32 ssbo_index = 0;
for (const auto& buffer : shader->GetEntries().global_memory_entries) {
buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index,
buffer.cbuf_offset, buffer.is_written);
++ssbo_index;
}
// Workaround for Intel drivers. // Workaround for Intel drivers.
// When a clip distance is enabled but not set in the shader it crops parts of the screen // When a clip distance is enabled but not set in the shader it crops parts of the screen
// (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the
@ -430,44 +328,27 @@ void RasterizerOpenGL::SetupShaders() {
SyncClipEnabled(clip_distances); SyncClipEnabled(clip_distances);
maxwell3d.dirty.flags[Dirty::Shaders] = false; maxwell3d.dirty.flags[Dirty::Shaders] = false;
buffer_cache.UpdateGraphicsBuffers(is_indexed);
const std::span indices_span(image_view_indices.data(), image_view_indices.size()); const std::span indices_span(image_view_indices.data(), image_view_indices.size());
texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
buffer_cache.BindHostGeometryBuffers(is_indexed);
size_t image_view_index = 0; size_t image_view_index = 0;
size_t texture_index = 0; size_t texture_index = 0;
size_t image_index = 0; size_t image_index = 0;
for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
const Shader* const shader = shaders[stage]; const Shader* const shader = shaders[stage];
if (shader) { if (!shader) {
const auto base = device.GetBaseBindings(stage); continue;
}
buffer_cache.BindHostStageBuffers(stage);
const auto& base = device.GetBaseBindings(stage);
BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index, BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index,
texture_index, image_index); texture_index, image_index);
} }
} }
}
std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const {
const auto& regs = maxwell3d.regs;
std::size_t size = 0;
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
if (!regs.vertex_array[index].IsEnabled())
continue;
const GPUVAddr start = regs.vertex_array[index].StartAddress();
const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress();
size += end - start;
ASSERT(end >= start);
}
return size;
}
std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const {
return static_cast<std::size_t>(maxwell3d.regs.index_array.count) *
static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
}
void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading,
const VideoCore::DiskResourceLoadCallback& callback) { const VideoCore::DiskResourceLoadCallback& callback) {
@ -475,6 +356,7 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& s
} }
void RasterizerOpenGL::Clear() { void RasterizerOpenGL::Clear() {
MICROPROFILE_SCOPE(OpenGL_Clears);
if (!maxwell3d.ShouldExecute()) { if (!maxwell3d.ShouldExecute()) {
return; return;
} }
@ -525,11 +407,9 @@ void RasterizerOpenGL::Clear() {
} }
UNIMPLEMENTED_IF(regs.clear_flags.viewport); UNIMPLEMENTED_IF(regs.clear_flags.viewport);
{ std::scoped_lock lock{texture_cache.mutex};
auto lock = texture_cache.AcquireLock();
texture_cache.UpdateRenderTargets(true); texture_cache.UpdateRenderTargets(true);
state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
}
if (use_color) { if (use_color) {
glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color);
@ -541,7 +421,6 @@ void RasterizerOpenGL::Clear() {
} else if (use_stencil) { } else if (use_stencil) {
glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil); glClearBufferiv(GL_STENCIL, 0, &regs.clear_stencil);
} }
++num_queued_commands; ++num_queued_commands;
} }
@ -550,75 +429,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
query_cache.UpdateCounters(); query_cache.UpdateCounters();
SyncViewport(); SyncState();
SyncRasterizeEnable();
SyncPolygonModes();
SyncColorMask();
SyncFragmentColorClampState();
SyncMultiSampleState();
SyncDepthTestState();
SyncDepthClamp();
SyncStencilTestState();
SyncBlendState();
SyncLogicOpState();
SyncCullMode();
SyncPrimitiveRestart();
SyncScissorTest();
SyncPointState();
SyncLineState();
SyncPolygonOffset();
SyncAlphaTest();
SyncFramebufferSRGB();
buffer_cache.Acquire();
current_cbuf = 0;
std::size_t buffer_size = CalculateVertexArraysSize();
// Add space for index buffer
if (is_indexed) {
buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
}
// Uniform space for the 5 shader stages
buffer_size =
Common::AlignUp<std::size_t>(buffer_size, 4) +
(sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage;
// Add space for at least 18 constant buffers
buffer_size += Maxwell::MaxConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
// Prepare the vertex array.
buffer_cache.Map(buffer_size);
// Prepare vertex array format.
SetupVertexFormat();
// Upload vertex and index data.
SetupVertexBuffer();
SetupVertexInstances();
GLintptr index_buffer_offset = 0;
if (is_indexed) {
index_buffer_offset = SetupIndexBuffer();
}
// Setup emulation uniform buffer.
if (!device.UseAssemblyShaders()) {
MaxwellUniformData ubo;
ubo.SetFromRegs(maxwell3d);
const auto info =
buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset,
static_cast<GLsizeiptr>(sizeof(ubo)));
}
// Setup shaders and their used resources. // Setup shaders and their used resources.
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
SetupShaders(); SetupShaders(is_indexed);
// Signal the buffer cache that we are not going to upload more things.
buffer_cache.Unmap();
texture_cache.UpdateRenderTargets(false); texture_cache.UpdateRenderTargets(false);
state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle());
program_manager.BindGraphicsPipeline(); program_manager.BindGraphicsPipeline();
@ -632,7 +448,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
if (is_indexed) { if (is_indexed) {
const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base); const GLint base_vertex = static_cast<GLint>(maxwell3d.regs.vb_element_base);
const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count); const GLsizei num_vertices = static_cast<GLsizei>(maxwell3d.regs.index_array.count);
const GLvoid* offset = reinterpret_cast<const GLvoid*>(index_buffer_offset); const GLvoid* const offset = buffer_cache_runtime.IndexOffset();
const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format); const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format);
if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { if (num_instances == 1 && base_instance == 0 && base_vertex == 0) {
glDrawElements(primitive_mode, num_vertices, format, offset); glDrawElements(primitive_mode, num_vertices, format, offset);
@ -672,22 +488,22 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) {
} }
void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) {
buffer_cache.Acquire();
current_cbuf = 0;
Shader* const kernel = shader_cache.GetComputeKernel(code_addr); Shader* const kernel = shader_cache.GetComputeKernel(code_addr);
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
BindComputeTextures(kernel); BindComputeTextures(kernel);
const size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * const auto& entries = kernel->GetEntries();
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers);
buffer_cache.Map(buffer_size); buffer_cache.UnbindComputeStorageBuffers();
u32 ssbo_index = 0;
SetupComputeConstBuffers(kernel); for (const auto& buffer : entries.global_memory_entries) {
SetupComputeGlobalMemory(kernel); buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset,
buffer.is_written);
buffer_cache.Unmap(); ++ssbo_index;
}
buffer_cache.UpdateComputeBuffers();
buffer_cache.BindHostComputeBuffers();
const auto& launch_desc = kepler_compute.launch_description; const auto& launch_desc = kepler_compute.launch_description;
glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z);
@ -703,6 +519,12 @@ void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
query_cache.Query(gpu_addr, type, timestamp); query_cache.Query(gpu_addr, type, timestamp);
} }
void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
u32 size) {
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size);
}
void RasterizerOpenGL::FlushAll() {} void RasterizerOpenGL::FlushAll() {}
void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
@ -711,19 +533,23 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
return; return;
} }
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.DownloadMemory(addr, size); texture_cache.DownloadMemory(addr, size);
} }
buffer_cache.FlushRegion(addr, size); {
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.DownloadMemory(addr, size);
}
query_cache.FlushRegion(addr, size); query_cache.FlushRegion(addr, size);
} }
bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) { bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) {
std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
if (!Settings::IsGPULevelHigh()) { if (!Settings::IsGPULevelHigh()) {
return buffer_cache.MustFlushRegion(addr, size); return buffer_cache.IsRegionGpuModified(addr, size);
} }
return texture_cache.IsRegionGpuModified(addr, size) || return texture_cache.IsRegionGpuModified(addr, size) ||
buffer_cache.MustFlushRegion(addr, size); buffer_cache.IsRegionGpuModified(addr, size);
} }
void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
@ -732,11 +558,14 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
return; return;
} }
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.WriteMemory(addr, size); texture_cache.WriteMemory(addr, size);
} }
{
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.WriteMemory(addr, size);
}
shader_cache.InvalidateRegion(addr, size); shader_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
query_cache.InvalidateRegion(addr, size); query_cache.InvalidateRegion(addr, size);
} }
@ -745,26 +574,35 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) {
if (addr == 0 || size == 0) { if (addr == 0 || size == 0) {
return; return;
} }
shader_cache.OnCPUWrite(addr, size);
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.WriteMemory(addr, size); texture_cache.WriteMemory(addr, size);
} }
shader_cache.OnCPUWrite(addr, size); {
buffer_cache.OnCPUWrite(addr, size); std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.CachedWriteMemory(addr, size);
}
} }
void RasterizerOpenGL::SyncGuestHost() { void RasterizerOpenGL::SyncGuestHost() {
MICROPROFILE_SCOPE(OpenGL_CacheManagement); MICROPROFILE_SCOPE(OpenGL_CacheManagement);
buffer_cache.SyncGuestHost();
shader_cache.SyncGuestHost(); shader_cache.SyncGuestHost();
{
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.FlushCachedWrites();
}
} }
void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) {
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.UnmapMemory(addr, size); texture_cache.UnmapMemory(addr, size);
} }
buffer_cache.OnCPUWrite(addr, size); {
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.WriteMemory(addr, size);
}
shader_cache.OnCPUWrite(addr, size); shader_cache.OnCPUWrite(addr, size);
} }
@ -799,14 +637,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) {
} }
void RasterizerOpenGL::WaitForIdle() { void RasterizerOpenGL::WaitForIdle() {
// Place a barrier on everything that is not framebuffer related. glMemoryBarrier(GL_ALL_BARRIER_BITS);
// This is related to another flag that is not currently implemented.
glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT |
GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT |
GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT |
GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT |
GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT |
GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT);
} }
void RasterizerOpenGL::FragmentBarrier() { void RasterizerOpenGL::FragmentBarrier() {
@ -831,18 +662,21 @@ void RasterizerOpenGL::TickFrame() {
num_queued_commands = 0; num_queued_commands = 0;
fence_manager.TickFrame(); fence_manager.TickFrame();
buffer_cache.TickFrame();
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.TickFrame(); texture_cache.TickFrame();
} }
{
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.TickFrame();
}
} }
bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) { const Tegra::Engines::Fermi2D::Config& copy_config) {
MICROPROFILE_SCOPE(OpenGL_Blits); MICROPROFILE_SCOPE(OpenGL_Blits);
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.BlitImage(dst, src, copy_config); texture_cache.BlitImage(dst, src, copy_config);
return true; return true;
} }
@ -854,7 +688,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
} }
MICROPROFILE_SCOPE(OpenGL_CacheManagement); MICROPROFILE_SCOPE(OpenGL_CacheManagement);
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)}; ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)};
if (!image_view) { if (!image_view) {
return false; return false;
@ -921,166 +755,6 @@ void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_te
} }
} }
void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) {
static constexpr std::array PARAMETER_LUT{
GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV,
GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV,
GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV,
};
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& stages = maxwell3d.state.shader_stages;
const auto& shader_stage = stages[stage_index];
const auto& entries = shader->GetEntries();
const bool use_unified = entries.use_unified_uniforms;
const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE;
const auto base_bindings = device.GetBaseBindings(stage_index);
u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer;
for (const auto& entry : entries.const_buffers) {
const u32 index = entry.GetIndex();
const auto& buffer = shader_stage.const_buffers[index];
SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified,
base_unified_offset + index * Maxwell::MaxConstBufferSize);
++binding;
}
if (use_unified) {
const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer +
entries.global_memory_entries.size());
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle,
base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE);
}
}
void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) {
MICROPROFILE_SCOPE(OpenGL_UBO);
const auto& launch_desc = kepler_compute.launch_description;
const auto& entries = kernel->GetEntries();
const bool use_unified = entries.use_unified_uniforms;
u32 binding = 0;
for (const auto& entry : entries.const_buffers) {
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
Tegra::Engines::ConstBufferInfo buffer;
buffer.address = config.Address();
buffer.size = config.size;
buffer.enabled = mask[entry.GetIndex()];
SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry,
use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize);
++binding;
}
if (use_unified) {
const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size());
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0,
NUM_CONST_BUFFERS_BYTES_PER_STAGE);
}
}
void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding,
const Tegra::Engines::ConstBufferInfo& buffer,
const ConstBufferEntry& entry, bool use_unified,
std::size_t unified_offset) {
if (!buffer.enabled) {
// Set values to zero to unbind buffers
if (device.UseAssemblyShaders()) {
glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0);
} else {
glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float));
}
return;
}
// Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140
// UBO alignment requirements.
const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4));
const bool fast_upload = !use_unified && device.HasFastBufferSubData();
const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment();
const GPUVAddr gpu_addr = buffer.address;
auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload);
if (device.UseAssemblyShaders()) {
UNIMPLEMENTED_IF(use_unified);
if (info.offset != 0) {
const GLuint staging_cbuf = staging_cbufs[current_cbuf++];
glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size);
info.handle = staging_cbuf;
info.offset = 0;
}
glBindBufferRangeNV(stage, binding, info.handle, info.offset, size);
return;
}
if (use_unified) {
glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset,
unified_offset, size);
} else {
glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size);
}
}
void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) {
static constexpr std::array TARGET_LUT = {
GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV,
GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV,
};
const auto& cbufs{maxwell3d.state.shader_stages[stage_index]};
const auto& entries{shader->GetEntries().global_memory_entries};
std::array<BindlessSSBO, 32> ssbos;
ASSERT(entries.size() < ssbos.size());
const bool assembly_shaders = device.UseAssemblyShaders();
u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer;
for (const auto& entry : entries) {
const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset};
const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
const u32 size{gpu_memory.Read<u32>(addr + 8)};
SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
++binding;
}
if (assembly_shaders) {
UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size());
}
}
void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) {
const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
const auto& entries{kernel->GetEntries().global_memory_entries};
std::array<BindlessSSBO, 32> ssbos;
ASSERT(entries.size() < ssbos.size());
u32 binding = 0;
for (const auto& entry : entries) {
const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset};
const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)};
const u32 size{gpu_memory.Read<u32>(addr + 8)};
SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]);
++binding;
}
if (device.UseAssemblyShaders()) {
UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size());
}
}
void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry,
GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) {
const size_t alignment{device.GetShaderStorageBufferAlignment()};
const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written);
if (device.UseAssemblyShaders()) {
*ssbo = BindlessSSBO{
.address = static_cast<GLuint64EXT>(info.address + info.offset),
.length = static_cast<GLsizei>(size),
.padding = 0,
};
} else {
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset,
static_cast<GLsizeiptr>(size));
}
}
void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) { void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) {
const bool via_header_index = const bool via_header_index =
maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
@ -1128,6 +802,30 @@ void RasterizerOpenGL::SetupComputeImages(const Shader* shader) {
} }
} }
void RasterizerOpenGL::SyncState() {
SyncViewport();
SyncRasterizeEnable();
SyncPolygonModes();
SyncColorMask();
SyncFragmentColorClampState();
SyncMultiSampleState();
SyncDepthTestState();
SyncDepthClamp();
SyncStencilTestState();
SyncBlendState();
SyncLogicOpState();
SyncCullMode();
SyncPrimitiveRestart();
SyncScissorTest();
SyncPointState();
SyncLineState();
SyncPolygonOffset();
SyncAlphaTest();
SyncFramebufferSRGB();
SyncVertexFormats();
SyncVertexInstances();
}
void RasterizerOpenGL::SyncViewport() { void RasterizerOpenGL::SyncViewport() {
auto& flags = maxwell3d.dirty.flags; auto& flags = maxwell3d.dirty.flags;
const auto& regs = maxwell3d.regs; const auto& regs = maxwell3d.regs;
@ -1163,9 +861,11 @@ void RasterizerOpenGL::SyncViewport() {
if (regs.screen_y_control.y_negate != 0) { if (regs.screen_y_control.y_negate != 0) {
flip_y = !flip_y; flip_y = !flip_y;
} }
glClipControl(flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT, const bool is_zero_to_one = regs.depth_mode == Maxwell::DepthMode::ZeroToOne;
regs.depth_mode == Maxwell::DepthMode::ZeroToOne ? GL_ZERO_TO_ONE const GLenum origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT;
: GL_NEGATIVE_ONE_TO_ONE); const GLenum depth = is_zero_to_one ? GL_ZERO_TO_ONE : GL_NEGATIVE_ONE_TO_ONE;
state_tracker.ClipControl(origin, depth);
state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0);
} }
if (dirty_viewport) { if (dirty_viewport) {
@ -1649,36 +1349,13 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) {
if (regs.tfb_enabled == 0) { if (regs.tfb_enabled == 0) {
return; return;
} }
if (device.UseAssemblyShaders()) { if (device.UseAssemblyShaders()) {
SyncTransformFeedback(); SyncTransformFeedback();
} }
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
UNIMPLEMENTED_IF(primitive_mode != GL_POINTS);
for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
const auto& binding = regs.tfb_bindings[index];
if (!binding.buffer_enable) {
if (enabled_transform_feedback_buffers[index]) {
glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), 0, 0,
0);
}
enabled_transform_feedback_buffers[index] = false;
continue;
}
enabled_transform_feedback_buffers[index] = true;
auto& tfb_buffer = transform_feedback_buffers[index];
tfb_buffer.Create();
const GLuint handle = tfb_buffer.handle;
const std::size_t size = binding.buffer_size;
glNamedBufferData(handle, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_COPY);
glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast<GLuint>(index), handle, 0,
static_cast<GLsizeiptr>(size));
}
// We may have to call BeginTransformFeedbackNV here since they seem to call different // We may have to call BeginTransformFeedbackNV here since they seem to call different
// implementations on Nvidia's driver (the pointer is different) but we are using // implementations on Nvidia's driver (the pointer is different) but we are using
@ -1692,23 +1369,7 @@ void RasterizerOpenGL::EndTransformFeedback() {
if (regs.tfb_enabled == 0) { if (regs.tfb_enabled == 0) {
return; return;
} }
glEndTransformFeedback(); glEndTransformFeedback();
for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) {
const auto& binding = regs.tfb_bindings[index];
if (!binding.buffer_enable) {
continue;
}
UNIMPLEMENTED_IF(binding.buffer_offset != 0);
const GLuint handle = transform_feedback_buffers[index].handle;
const GPUVAddr gpu_addr = binding.Address();
const std::size_t size = binding.buffer_size;
const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
glCopyNamedBufferSubData(handle, info.handle, 0, info.offset,
static_cast<GLsizeiptr>(size));
}
} }
} // namespace OpenGL } // namespace OpenGL

View file

@ -30,7 +30,6 @@
#include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h"
#include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_state_tracker.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h"
#include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/gl_texture_cache.h"
#include "video_core/shader/async_shaders.h" #include "video_core/shader/async_shaders.h"
#include "video_core/textures/texture.h" #include "video_core/textures/texture.h"
@ -72,6 +71,7 @@ public:
void DispatchCompute(GPUVAddr code_addr) override; void DispatchCompute(GPUVAddr code_addr) override;
void ResetCounter(VideoCore::QueryType type) override; void ResetCounter(VideoCore::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void FlushAll() override; void FlushAll() override;
void FlushRegion(VAddr addr, u64 size) override; void FlushRegion(VAddr addr, u64 size) override;
bool MustFlushRegion(VAddr addr, u64 size) override; bool MustFlushRegion(VAddr addr, u64 size) override;
@ -119,27 +119,6 @@ private:
void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image, void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image,
size_t& image_view_index, size_t& texture_index, size_t& image_index); size_t& image_view_index, size_t& texture_index, size_t& image_index);
/// Configures the current constbuffers to use for the draw command.
void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader);
/// Configures the current constbuffers to use for the kernel invocation.
void SetupComputeConstBuffers(Shader* kernel);
/// Configures a constant buffer.
void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer,
const ConstBufferEntry& entry, bool use_unified,
std::size_t unified_offset);
/// Configures the current global memory entries to use for the draw command.
void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader);
/// Configures the current global memory entries to use for the kernel invocation.
void SetupComputeGlobalMemory(Shader* kernel);
/// Configures a global memory buffer.
void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr,
size_t size, BindlessSSBO* ssbo);
/// Configures the current textures to use for the draw command. /// Configures the current textures to use for the draw command.
void SetupDrawTextures(const Shader* shader, size_t stage_index); void SetupDrawTextures(const Shader* shader, size_t stage_index);
@ -152,6 +131,9 @@ private:
/// Configures images in a compute shader. /// Configures images in a compute shader.
void SetupComputeImages(const Shader* shader); void SetupComputeImages(const Shader* shader);
/// Syncs state to match guest's
void SyncState();
/// Syncs the viewport and depth range to match the guest state /// Syncs the viewport and depth range to match the guest state
void SyncViewport(); void SyncViewport();
@ -215,6 +197,12 @@ private:
/// Syncs the framebuffer sRGB state to match the guest state /// Syncs the framebuffer sRGB state to match the guest state
void SyncFramebufferSRGB(); void SyncFramebufferSRGB();
/// Syncs vertex formats to match the guest state
void SyncVertexFormats();
/// Syncs vertex instances to match the guest state
void SyncVertexInstances();
/// Syncs transform feedback state to match guest state /// Syncs transform feedback state to match guest state
/// @note Only valid on assembly shaders /// @note Only valid on assembly shaders
void SyncTransformFeedback(); void SyncTransformFeedback();
@ -225,19 +213,7 @@ private:
/// End a transform feedback /// End a transform feedback
void EndTransformFeedback(); void EndTransformFeedback();
std::size_t CalculateVertexArraysSize() const; void SetupShaders(bool is_indexed);
std::size_t CalculateIndexBufferSize() const;
/// Updates the current vertex format
void SetupVertexFormat();
void SetupVertexBuffer();
void SetupVertexInstances();
GLintptr SetupIndexBuffer();
void SetupShaders();
Tegra::GPU& gpu; Tegra::GPU& gpu;
Tegra::Engines::Maxwell3D& maxwell3d; Tegra::Engines::Maxwell3D& maxwell3d;
@ -249,12 +225,12 @@ private:
ProgramManager& program_manager; ProgramManager& program_manager;
StateTracker& state_tracker; StateTracker& state_tracker;
OGLStreamBuffer stream_buffer;
TextureCacheRuntime texture_cache_runtime; TextureCacheRuntime texture_cache_runtime;
TextureCache texture_cache; TextureCache texture_cache;
BufferCacheRuntime buffer_cache_runtime;
BufferCache buffer_cache;
ShaderCacheOpenGL shader_cache; ShaderCacheOpenGL shader_cache;
QueryCache query_cache; QueryCache query_cache;
OGLBufferCache buffer_cache;
FenceManagerOpenGL fence_manager; FenceManagerOpenGL fence_manager;
VideoCommon::Shader::AsyncShaders async_shaders; VideoCommon::Shader::AsyncShaders async_shaders;
@ -262,20 +238,8 @@ private:
boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices; boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices;
std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids; std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids;
boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles; boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles;
std::array<GLuint, MAX_TEXTURES> texture_handles; std::array<GLuint, MAX_TEXTURES> texture_handles{};
std::array<GLuint, MAX_IMAGES> image_handles; std::array<GLuint, MAX_IMAGES> image_handles{};
std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
transform_feedback_buffers;
std::bitset<Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers>
enabled_transform_feedback_buffers;
static constexpr std::size_t NUM_CONSTANT_BUFFERS =
Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers *
Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram;
std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{};
std::size_t current_cbuf = 0;
OGLBuffer unified_uniform_buffer;
/// Number of commands queued to the OpenGL driver. Resetted on flush. /// Number of commands queued to the OpenGL driver. Resetted on flush.
std::size_t num_queued_commands = 0; std::size_t num_queued_commands = 0;

View file

@ -171,12 +171,6 @@ void OGLBuffer::Release() {
handle = 0; handle = 0;
} }
void OGLBuffer::MakeStreamCopy(std::size_t buffer_size) {
ASSERT_OR_EXECUTE((handle != 0 && buffer_size != 0), { return; });
glNamedBufferData(handle, buffer_size, nullptr, GL_STREAM_COPY);
}
void OGLSync::Create() { void OGLSync::Create() {
if (handle != 0) if (handle != 0)
return; return;

View file

@ -234,9 +234,6 @@ public:
/// Deletes the internal OpenGL resource /// Deletes the internal OpenGL resource
void Release(); void Release();
// Converts the buffer into a stream copy buffer with a fixed size
void MakeStreamCopy(std::size_t buffer_size);
GLuint handle = 0; GLuint handle = 0;
}; };

View file

@ -64,7 +64,7 @@ using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>
constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32); constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32);
constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32);
constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt constexpr std::string_view COMMON_DECLARATIONS = R"(#define ftoi floatBitsToInt
#define ftou floatBitsToUint #define ftou floatBitsToUint
#define itof intBitsToFloat #define itof intBitsToFloat
#define utof uintBitsToFloat #define utof uintBitsToFloat
@ -77,10 +77,6 @@ bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{
const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f ); const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f );
const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f ); const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f );
layout (std140, binding = {}) uniform vs_config {{
float y_direction;
}};
)"; )";
class ShaderWriter final { class ShaderWriter final {
@ -402,13 +398,6 @@ std::string FlowStackTopName(MetaStackClass stack) {
return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack));
} }
bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) {
const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size());
// We waste one UBO for emulation
const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1;
return num_ubos > num_available_ubos;
}
struct GenericVaryingDescription { struct GenericVaryingDescription {
std::string name; std::string name;
u8 first_element = 0; u8 first_element = 0;
@ -420,9 +409,8 @@ public:
explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_,
ShaderType stage_, std::string_view identifier_, ShaderType stage_, std::string_view identifier_,
std::string_view suffix_) std::string_view suffix_)
: device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, identifier{identifier_}, : device{device_}, ir{ir_}, registry{registry_}, stage{stage_},
suffix{suffix_}, header{ir.GetHeader()}, use_unified_uniforms{ identifier{identifier_}, suffix{suffix_}, header{ir.GetHeader()} {
UseUnifiedUniforms(device_, ir_, stage_)} {
if (stage != ShaderType::Compute) { if (stage != ShaderType::Compute) {
transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo());
} }
@ -516,7 +504,8 @@ private:
if (!identifier.empty()) { if (!identifier.empty()) {
code.AddLine("// {}", identifier); code.AddLine("// {}", identifier);
} }
code.AddLine("#version 440 {}", ir.UsesLegacyVaryings() ? "compatibility" : "core"); const bool use_compatibility = ir.UsesLegacyVaryings() || ir.UsesYNegate();
code.AddLine("#version 440 {}", use_compatibility ? "compatibility" : "core");
code.AddLine("#extension GL_ARB_separate_shader_objects : enable"); code.AddLine("#extension GL_ARB_separate_shader_objects : enable");
if (device.HasShaderBallot()) { if (device.HasShaderBallot()) {
code.AddLine("#extension GL_ARB_shader_ballot : require"); code.AddLine("#extension GL_ARB_shader_ballot : require");
@ -542,7 +531,7 @@ private:
code.AddNewLine(); code.AddNewLine();
code.AddLine(CommonDeclarations, EmulationUniformBlockBinding); code.AddLine(COMMON_DECLARATIONS);
} }
void DeclareVertex() { void DeclareVertex() {
@ -865,17 +854,6 @@ private:
} }
void DeclareConstantBuffers() { void DeclareConstantBuffers() {
if (use_unified_uniforms) {
const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer +
static_cast<u32>(ir.GetGlobalMemory().size());
code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{",
binding);
code.AddLine(" uint cbufs[];");
code.AddLine("}};");
code.AddNewLine();
return;
}
u32 binding = device.GetBaseBindings(stage).uniform_buffer; u32 binding = device.GetBaseBindings(stage).uniform_buffer;
for (const auto& [index, info] : ir.GetConstantBuffers()) { for (const auto& [index, info] : ir.GetConstantBuffers()) {
const u32 num_elements = Common::DivCeil(info.GetSize(), 4 * sizeof(u32)); const u32 num_elements = Common::DivCeil(info.GetSize(), 4 * sizeof(u32));
@ -1081,29 +1059,17 @@ private:
if (const auto cbuf = std::get_if<CbufNode>(&*node)) { if (const auto cbuf = std::get_if<CbufNode>(&*node)) {
const Node offset = cbuf->GetOffset(); const Node offset = cbuf->GetOffset();
const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS;
if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) {
// Direct access // Direct access
const u32 offset_imm = immediate->GetValue(); const u32 offset_imm = immediate->GetValue();
ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
if (use_unified_uniforms) {
return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4),
Type::Uint};
} else {
return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
offset_imm / (4 * 4), (offset_imm / 4) % 4), offset_imm / (4 * 4), (offset_imm / 4) % 4),
Type::Uint}; Type::Uint};
} }
}
// Indirect access // Indirect access
if (use_unified_uniforms) {
return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset,
Visit(offset).AsUint()),
Type::Uint};
}
const std::string final_offset = code.GenerateTemporary(); const std::string final_offset = code.GenerateTemporary();
code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
@ -2293,7 +2259,6 @@ private:
} }
} }
} }
if (header.ps.omap.depth) { if (header.ps.omap.depth) {
// The depth output is always 2 registers after the last color output, and current_reg // The depth output is always 2 registers after the last color output, and current_reg
// already contains one past the last color register. // already contains one past the last color register.
@ -2337,7 +2302,8 @@ private:
} }
Expression YNegate(Operation operation) { Expression YNegate(Operation operation) {
return {"y_direction", Type::Float}; // Y_NEGATE is mapped to this uniform value
return {"gl_FrontMaterial.ambient.a", Type::Float};
} }
template <u32 element> template <u32 element>
@ -2787,7 +2753,6 @@ private:
const std::string_view identifier; const std::string_view identifier;
const std::string_view suffix; const std::string_view suffix;
const Header header; const Header header;
const bool use_unified_uniforms;
std::unordered_map<u8, VaryingTFB> transform_feedback; std::unordered_map<u8, VaryingTFB> transform_feedback;
ShaderWriter code; ShaderWriter code;
@ -3003,8 +2968,10 @@ ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType s
for (std::size_t i = 0; i < std::size(clip_distances); ++i) { for (std::size_t i = 0; i < std::size(clip_distances); ++i) {
entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i;
} }
for (const auto& buffer : entries.const_buffers) {
entries.enabled_uniform_buffers |= 1U << buffer.GetIndex();
}
entries.shader_length = ir.GetLength(); entries.shader_length = ir.GetLength();
entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage);
return entries; return entries;
} }

View file

@ -55,7 +55,7 @@ struct ShaderEntries {
std::vector<ImageEntry> images; std::vector<ImageEntry> images;
std::size_t shader_length{}; std::size_t shader_length{};
u32 clip_distances{}; u32 clip_distances{};
bool use_unified_uniforms{}; u32 enabled_uniform_buffers{};
}; };
ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir,

View file

@ -36,16 +36,10 @@ void SetupDirtyColorMasks(Tables& tables) {
FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks); FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks);
} }
void SetupDirtyVertexArrays(Tables& tables) { void SetupDirtyVertexInstances(Tables& tables) {
static constexpr std::size_t num_array = 3;
static constexpr std::size_t instance_base_offset = 3; static constexpr std::size_t instance_base_offset = 3;
for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]); const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]);
const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]);
FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers);
FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers);
const std::size_t instance_array_offset = array_offset + instance_base_offset; const std::size_t instance_array_offset = array_offset + instance_base_offset;
tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i); tables[0][instance_array_offset] = static_cast<u8>(VertexInstance0 + i);
tables[1][instance_array_offset] = VertexInstances; tables[1][instance_array_offset] = VertexInstances;
@ -217,11 +211,11 @@ void SetupDirtyMisc(Tables& tables) {
StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} { StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} {
auto& dirty = gpu.Maxwell3D().dirty; auto& dirty = gpu.Maxwell3D().dirty;
auto& tables = dirty.tables; auto& tables = dirty.tables;
SetupDirtyRenderTargets(tables); SetupDirtyFlags(tables);
SetupDirtyColorMasks(tables); SetupDirtyColorMasks(tables);
SetupDirtyViewports(tables); SetupDirtyViewports(tables);
SetupDirtyScissors(tables); SetupDirtyScissors(tables);
SetupDirtyVertexArrays(tables); SetupDirtyVertexInstances(tables);
SetupDirtyVertexFormat(tables); SetupDirtyVertexFormat(tables);
SetupDirtyShaders(tables); SetupDirtyShaders(tables);
SetupDirtyPolygonModes(tables); SetupDirtyPolygonModes(tables);
@ -241,19 +235,6 @@ StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags}
SetupDirtyClipControl(tables); SetupDirtyClipControl(tables);
SetupDirtyDepthClampEnabled(tables); SetupDirtyDepthClampEnabled(tables);
SetupDirtyMisc(tables); SetupDirtyMisc(tables);
auto& store = dirty.on_write_stores;
store[VertexBuffers] = true;
for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) {
store[VertexBuffer0 + i] = true;
}
}
void StateTracker::InvalidateStreamBuffer() {
flags[Dirty::VertexBuffers] = true;
for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) {
flags[index] = true;
}
} }
} // namespace OpenGL } // namespace OpenGL

View file

@ -28,10 +28,6 @@ enum : u8 {
VertexFormat0, VertexFormat0,
VertexFormat31 = VertexFormat0 + 31, VertexFormat31 = VertexFormat0 + 31,
VertexBuffers,
VertexBuffer0,
VertexBuffer31 = VertexBuffer0 + 31,
VertexInstances, VertexInstances,
VertexInstance0, VertexInstance0,
VertexInstance31 = VertexInstance0 + 31, VertexInstance31 = VertexInstance0 + 31,
@ -92,8 +88,6 @@ class StateTracker {
public: public:
explicit StateTracker(Tegra::GPU& gpu); explicit StateTracker(Tegra::GPU& gpu);
void InvalidateStreamBuffer();
void BindIndexBuffer(GLuint new_index_buffer) { void BindIndexBuffer(GLuint new_index_buffer) {
if (index_buffer == new_index_buffer) { if (index_buffer == new_index_buffer) {
return; return;
@ -110,13 +104,32 @@ public:
glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer); glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
} }
void ClipControl(GLenum new_origin, GLenum new_depth) {
if (new_origin == origin && new_depth == depth) {
return;
}
origin = new_origin;
depth = new_depth;
glClipControl(origin, depth);
}
void SetYNegate(bool new_y_negate) {
if (new_y_negate == y_negate) {
return;
}
// Y_NEGATE is mapped to gl_FrontMaterial.ambient.a
y_negate = new_y_negate;
const std::array ambient{0.0f, 0.0f, 0.0f, y_negate ? -1.0f : 1.0f};
glMaterialfv(GL_FRONT, GL_AMBIENT, ambient.data());
}
void NotifyScreenDrawVertexArray() { void NotifyScreenDrawVertexArray() {
flags[OpenGL::Dirty::VertexFormats] = true; flags[OpenGL::Dirty::VertexFormats] = true;
flags[OpenGL::Dirty::VertexFormat0 + 0] = true; flags[OpenGL::Dirty::VertexFormat0 + 0] = true;
flags[OpenGL::Dirty::VertexFormat0 + 1] = true; flags[OpenGL::Dirty::VertexFormat0 + 1] = true;
flags[OpenGL::Dirty::VertexBuffers] = true; flags[VideoCommon::Dirty::VertexBuffers] = true;
flags[OpenGL::Dirty::VertexBuffer0] = true; flags[VideoCommon::Dirty::VertexBuffer0] = true;
flags[OpenGL::Dirty::VertexInstances] = true; flags[OpenGL::Dirty::VertexInstances] = true;
flags[OpenGL::Dirty::VertexInstance0 + 0] = true; flags[OpenGL::Dirty::VertexInstance0 + 0] = true;
@ -202,6 +215,9 @@ private:
GLuint framebuffer = 0; GLuint framebuffer = 0;
GLuint index_buffer = 0; GLuint index_buffer = 0;
GLenum origin = GL_LOWER_LEFT;
GLenum depth = GL_NEGATIVE_ONE_TO_ONE;
bool y_negate = false;
}; };
} // namespace OpenGL } // namespace OpenGL

View file

@ -1,70 +1,64 @@
// Copyright 2018 Citra Emulator Project // Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include <tuple> #include <array>
#include <vector> #include <memory>
#include <span>
#include <glad/glad.h>
#include "common/alignment.h" #include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/microprofile.h"
#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_state_tracker.h"
#include "video_core/renderer_opengl/gl_stream_buffer.h" #include "video_core/renderer_opengl/gl_stream_buffer.h"
MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning",
MP_RGB(128, 128, 192));
namespace OpenGL { namespace OpenGL {
OGLStreamBuffer::OGLStreamBuffer(const Device& device, StateTracker& state_tracker_) StreamBuffer::StreamBuffer() {
: state_tracker{state_tracker_} { static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT;
gl_buffer.Create(); buffer.Create();
glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer");
static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags);
glNamedBufferStorage(gl_buffer.handle, BUFFER_SIZE, nullptr, flags); mapped_pointer =
mapped_ptr = static_cast<u8*>( static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags));
glMapNamedBufferRange(gl_buffer.handle, 0, BUFFER_SIZE, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); for (OGLSync& sync : fences) {
sync.Create();
if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) {
glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY);
glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address);
} }
} }
OGLStreamBuffer::~OGLStreamBuffer() { std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept {
glUnmapNamedBuffer(gl_buffer.handle); ASSERT(size < REGION_SIZE);
gl_buffer.Release(); for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end;
++region) {
fences[region].Create();
} }
used_iterator = iterator;
std::pair<u8*, GLintptr> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { for (size_t region = Region(free_iterator) + 1,
ASSERT(size <= BUFFER_SIZE); region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS);
ASSERT(alignment <= BUFFER_SIZE); region < region_end; ++region) {
mapped_size = size; glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED);
fences[region].Release();
if (alignment > 0) {
buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment);
} }
if (iterator + size > free_iterator) {
if (buffer_pos + size > BUFFER_SIZE) { free_iterator = iterator + size;
MICROPROFILE_SCOPE(OpenGL_StreamBuffer);
glInvalidateBufferData(gl_buffer.handle);
state_tracker.InvalidateStreamBuffer();
buffer_pos = 0;
} }
if (iterator + size > STREAM_BUFFER_SIZE) {
return std::make_pair(mapped_ptr + buffer_pos, buffer_pos); for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) {
fences[region].Create();
} }
used_iterator = 0;
iterator = 0;
free_iterator = size;
void OGLStreamBuffer::Unmap(GLsizeiptr size) { for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) {
ASSERT(size <= mapped_size); glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED);
fences[region].Release();
if (size > 0) {
glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size);
} }
}
buffer_pos += size; const size_t offset = iterator;
iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT);
return {std::span(mapped_pointer + offset, size), offset};
} }
} // namespace OpenGL } // namespace OpenGL

View file

@ -1,9 +1,12 @@
// Copyright 2018 Citra Emulator Project // Copyright 2021 yuzu Emulator Project
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
#pragma once #pragma once
#include <array>
#include <memory>
#include <span>
#include <utility> #include <utility>
#include <glad/glad.h> #include <glad/glad.h>
@ -13,48 +16,35 @@
namespace OpenGL { namespace OpenGL {
class Device; class StreamBuffer {
class StateTracker; static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024;
static constexpr size_t NUM_SYNCS = 16;
static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS;
static constexpr size_t MAX_ALIGNMENT = 256;
static_assert(STREAM_BUFFER_SIZE % MAX_ALIGNMENT == 0);
static_assert(STREAM_BUFFER_SIZE % NUM_SYNCS == 0);
static_assert(REGION_SIZE % MAX_ALIGNMENT == 0);
class OGLStreamBuffer : private NonCopyable {
public: public:
explicit OGLStreamBuffer(const Device& device, StateTracker& state_tracker_); explicit StreamBuffer();
~OGLStreamBuffer();
/* [[nodiscard]] std::pair<std::span<u8>, size_t> Request(size_t size) noexcept;
* Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes
* and the optional alignment requirement.
* If the buffer is full, the whole buffer is reallocated which invalidates old chunks.
* The return values are the pointer to the new chunk, and the offset within the buffer.
* The actual used size must be specified on unmapping the chunk.
*/
std::pair<u8*, GLintptr> Map(GLsizeiptr size, GLintptr alignment = 0);
void Unmap(GLsizeiptr size); [[nodiscard]] GLuint Handle() const noexcept {
return buffer.handle;
GLuint Handle() const {
return gl_buffer.handle;
}
u64 Address() const {
return gpu_address;
}
GLsizeiptr Size() const noexcept {
return BUFFER_SIZE;
} }
private: private:
static constexpr GLsizeiptr BUFFER_SIZE = 256 * 1024 * 1024; [[nodiscard]] static size_t Region(size_t offset) noexcept {
return offset / REGION_SIZE;
}
StateTracker& state_tracker; size_t iterator = 0;
size_t used_iterator = 0;
OGLBuffer gl_buffer; size_t free_iterator = 0;
u8* mapped_pointer = nullptr;
GLuint64EXT gpu_address = 0; OGLBuffer buffer;
GLintptr buffer_pos = 0; std::array<OGLSync, NUM_SYNCS> fences;
GLsizeiptr mapped_size = 0;
u8* mapped_ptr = nullptr;
}; };
} // namespace OpenGL } // namespace OpenGL

View file

@ -398,9 +398,6 @@ void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) {
} // Anonymous namespace } // Anonymous namespace
ImageBufferMap::ImageBufferMap(GLuint handle_, u8* map, size_t size, OGLSync* sync_)
: span(map, size), sync{sync_}, handle{handle_} {}
ImageBufferMap::~ImageBufferMap() { ImageBufferMap::~ImageBufferMap() {
if (sync) { if (sync) {
sync->Create(); sync->Create();
@ -487,11 +484,11 @@ void TextureCacheRuntime::Finish() {
glFinish(); glFinish();
} }
ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) { ImageBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) {
return upload_buffers.RequestMap(size, true); return upload_buffers.RequestMap(size, true);
} }
ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) { ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) {
return download_buffers.RequestMap(size, false); return download_buffers.RequestMap(size, false);
} }
@ -553,15 +550,14 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
} }
void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map,
size_t buffer_offset,
std::span<const SwizzleParameters> swizzles) { std::span<const SwizzleParameters> swizzles) {
switch (image.info.type) { switch (image.info.type) {
case ImageType::e2D: case ImageType::e2D:
return util_shaders.BlockLinearUpload2D(image, map, buffer_offset, swizzles); return util_shaders.BlockLinearUpload2D(image, map, swizzles);
case ImageType::e3D: case ImageType::e3D:
return util_shaders.BlockLinearUpload3D(image, map, buffer_offset, swizzles); return util_shaders.BlockLinearUpload3D(image, map, swizzles);
case ImageType::Linear: case ImageType::Linear:
return util_shaders.PitchUpload(image, map, buffer_offset, swizzles); return util_shaders.PitchUpload(image, map, swizzles);
default: default:
UNREACHABLE(); UNREACHABLE();
break; break;
@ -596,7 +592,11 @@ ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_
bool insert_fence) { bool insert_fence) {
const size_t index = RequestBuffer(requested_size); const size_t index = RequestBuffer(requested_size);
OGLSync* const sync = insert_fence ? &syncs[index] : nullptr; OGLSync* const sync = insert_fence ? &syncs[index] : nullptr;
return ImageBufferMap(buffers[index].handle, maps[index], requested_size, sync); return ImageBufferMap{
.mapped_span = std::span(maps[index], requested_size),
.sync = sync,
.buffer = buffers[index].handle,
};
} }
size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) { size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) {
@ -709,10 +709,10 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_,
} }
} }
void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void Image::UploadMemory(const ImageBufferMap& map,
std::span<const VideoCommon::BufferImageCopy> copies) { std::span<const VideoCommon::BufferImageCopy> copies) {
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.Handle()); glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer);
glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, map.offset, unswizzled_size_bytes);
glPixelStorei(GL_UNPACK_ALIGNMENT, 1); glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
@ -728,23 +728,23 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
current_image_height = copy.buffer_image_height; current_image_height = copy.buffer_image_height;
glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height); glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height);
} }
CopyBufferToImage(copy, buffer_offset); CopyBufferToImage(copy, map.offset);
} }
} }
void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void Image::UploadMemory(const ImageBufferMap& map,
std::span<const VideoCommon::BufferCopy> copies) { std::span<const VideoCommon::BufferCopy> copies) {
for (const VideoCommon::BufferCopy& copy : copies) { for (const VideoCommon::BufferCopy& copy : copies) {
glCopyNamedBufferSubData(map.Handle(), buffer.handle, copy.src_offset + buffer_offset, glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + map.offset,
copy.dst_offset, copy.size); copy.dst_offset, copy.size);
} }
} }
void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset, void Image::DownloadMemory(ImageBufferMap& map,
std::span<const VideoCommon::BufferImageCopy> copies) { std::span<const VideoCommon::BufferImageCopy> copies) {
glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API
glBindBuffer(GL_PIXEL_PACK_BUFFER, map.Handle()); glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer);
glPixelStorei(GL_PACK_ALIGNMENT, 1); glPixelStorei(GL_PACK_ALIGNMENT, 1);
u32 current_row_length = std::numeric_limits<u32>::max(); u32 current_row_length = std::numeric_limits<u32>::max();
@ -759,7 +759,7 @@ void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset,
current_image_height = copy.buffer_image_height; current_image_height = copy.buffer_image_height;
glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height);
} }
CopyImageToBuffer(copy, buffer_offset); CopyImageToBuffer(copy, map.offset);
} }
} }

View file

@ -31,23 +31,13 @@ using VideoCommon::NUM_RT;
using VideoCommon::Offset2D; using VideoCommon::Offset2D;
using VideoCommon::RenderTargets; using VideoCommon::RenderTargets;
class ImageBufferMap { struct ImageBufferMap {
public:
explicit ImageBufferMap(GLuint handle, u8* map, size_t size, OGLSync* sync);
~ImageBufferMap(); ~ImageBufferMap();
GLuint Handle() const noexcept { std::span<u8> mapped_span;
return handle; size_t offset = 0;
}
std::span<u8> Span() const noexcept {
return span;
}
private:
std::span<u8> span;
OGLSync* sync; OGLSync* sync;
GLuint handle; GLuint buffer;
}; };
struct FormatProperties { struct FormatProperties {
@ -69,9 +59,9 @@ public:
void Finish(); void Finish();
ImageBufferMap MapUploadBuffer(size_t size); ImageBufferMap UploadStagingBuffer(size_t size);
ImageBufferMap MapDownloadBuffer(size_t size); ImageBufferMap DownloadStagingBuffer(size_t size);
void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
@ -89,7 +79,7 @@ public:
Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Filter filter,
Tegra::Engines::Fermi2D::Operation operation); Tegra::Engines::Fermi2D::Operation operation);
void AccelerateImageUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, void AccelerateImageUpload(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles); std::span<const VideoCommon::SwizzleParameters> swizzles);
void InsertUploadMemoryBarrier(); void InsertUploadMemoryBarrier();
@ -148,14 +138,12 @@ public:
explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
VAddr cpu_addr); VAddr cpu_addr);
void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void UploadMemory(const ImageBufferMap& map,
std::span<const VideoCommon::BufferImageCopy> copies); std::span<const VideoCommon::BufferImageCopy> copies);
void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void UploadMemory(const ImageBufferMap& map, std::span<const VideoCommon::BufferCopy> copies);
std::span<const VideoCommon::BufferCopy> copies);
void DownloadMemory(ImageBufferMap& map, size_t buffer_offset, void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies);
std::span<const VideoCommon::BufferImageCopy> copies);
GLuint Handle() const noexcept { GLuint Handle() const noexcept {
return texture.handle; return texture.handle;

View file

@ -29,9 +29,7 @@
#include "video_core/textures/decoders.h" #include "video_core/textures/decoders.h"
namespace OpenGL { namespace OpenGL {
namespace { namespace {
constexpr GLint PositionLocation = 0; constexpr GLint PositionLocation = 0;
constexpr GLint TexCoordLocation = 1; constexpr GLint TexCoordLocation = 1;
constexpr GLint ModelViewMatrixLocation = 0; constexpr GLint ModelViewMatrixLocation = 0;
@ -124,7 +122,6 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit
break; break;
} }
} }
} // Anonymous namespace } // Anonymous namespace
RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_,
@ -132,7 +129,17 @@ RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_,
Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
std::unique_ptr<Core::Frontend::GraphicsContext> context_) std::unique_ptr<Core::Frontend::GraphicsContext> context_)
: RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_}, : RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_},
emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {} emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, state_tracker{gpu},
program_manager{device},
rasterizer(emu_window, gpu, cpu_memory, device, screen_info, program_manager, state_tracker) {
if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
glEnable(GL_DEBUG_OUTPUT);
glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
glDebugMessageCallback(DebugHandler, nullptr);
}
AddTelemetryFields();
InitOpenGLObjects();
}
RendererOpenGL::~RendererOpenGL() = default; RendererOpenGL::~RendererOpenGL() = default;
@ -148,7 +155,7 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
++m_current_frame; ++m_current_frame;
rasterizer->TickFrame(); rasterizer.TickFrame();
context->SwapBuffers(); context->SwapBuffers();
render_window.OnFrameDisplayed(); render_window.OnFrameDisplayed();
@ -179,7 +186,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
framebuffer_crop_rect = framebuffer.crop_rect; framebuffer_crop_rect = framebuffer.crop_rect;
const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset}; const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset};
if (rasterizer->AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { if (rasterizer.AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) {
return; return;
} }
@ -267,6 +274,7 @@ void RendererOpenGL::InitOpenGLObjects() {
// Enable unified vertex attributes and query vertex buffer address when the driver supports it // Enable unified vertex attributes and query vertex buffer address when the driver supports it
if (device.HasVertexBufferUnifiedMemory()) { if (device.HasVertexBufferUnifiedMemory()) {
glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV);
glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY);
glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV,
@ -289,14 +297,6 @@ void RendererOpenGL::AddTelemetryFields() {
telemetry_session.AddField(user_system, "GPU_OpenGL_Version", std::string(gl_version)); telemetry_session.AddField(user_system, "GPU_OpenGL_Version", std::string(gl_version));
} }
void RendererOpenGL::CreateRasterizer() {
if (rasterizer) {
return;
}
rasterizer = std::make_unique<RasterizerOpenGL>(emu_window, gpu, cpu_memory, device,
screen_info, program_manager, state_tracker);
}
void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
const Tegra::FramebufferConfig& framebuffer) { const Tegra::FramebufferConfig& framebuffer) {
texture.width = framebuffer.width; texture.width = framebuffer.width;
@ -407,6 +407,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
program_manager.BindHostPipeline(pipeline.handle); program_manager.BindHostPipeline(pipeline.handle);
state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
glEnable(GL_CULL_FACE); glEnable(GL_CULL_FACE);
if (screen_info.display_srgb) { if (screen_info.display_srgb) {
glEnable(GL_FRAMEBUFFER_SRGB); glEnable(GL_FRAMEBUFFER_SRGB);
@ -425,7 +426,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
glCullFace(GL_BACK); glCullFace(GL_BACK);
glFrontFace(GL_CW); glFrontFace(GL_CW);
glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE);
glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width), glViewportIndexedf(0, 0.0f, 0.0f, static_cast<GLfloat>(layout.width),
static_cast<GLfloat>(layout.height)); static_cast<GLfloat>(layout.height));
glDepthRangeIndexed(0, 0.0, 0.0); glDepthRangeIndexed(0, 0.0, 0.0);
@ -497,25 +497,4 @@ void RendererOpenGL::RenderScreenshot() {
renderer_settings.screenshot_requested = false; renderer_settings.screenshot_requested = false;
} }
bool RendererOpenGL::Init() {
if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) {
glEnable(GL_DEBUG_OUTPUT);
glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS);
glDebugMessageCallback(DebugHandler, nullptr);
}
AddTelemetryFields();
if (!GLAD_GL_VERSION_4_6) {
return false;
}
InitOpenGLObjects();
CreateRasterizer();
return true;
}
void RendererOpenGL::ShutDown() {}
} // namespace OpenGL } // namespace OpenGL

View file

@ -10,6 +10,7 @@
#include "common/math_util.h" #include "common/math_util.h"
#include "video_core/renderer_base.h" #include "video_core/renderer_base.h"
#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_resource_manager.h"
#include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_state_tracker.h"
@ -63,18 +64,18 @@ public:
std::unique_ptr<Core::Frontend::GraphicsContext> context_); std::unique_ptr<Core::Frontend::GraphicsContext> context_);
~RendererOpenGL() override; ~RendererOpenGL() override;
bool Init() override;
void ShutDown() override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
VideoCore::RasterizerInterface* ReadRasterizer() override {
return &rasterizer;
}
private: private:
/// Initializes the OpenGL state and creates persistent objects. /// Initializes the OpenGL state and creates persistent objects.
void InitOpenGLObjects(); void InitOpenGLObjects();
void AddTelemetryFields(); void AddTelemetryFields();
void CreateRasterizer();
void ConfigureFramebufferTexture(TextureInfo& texture, void ConfigureFramebufferTexture(TextureInfo& texture,
const Tegra::FramebufferConfig& framebuffer); const Tegra::FramebufferConfig& framebuffer);
@ -98,8 +99,10 @@ private:
Core::Memory::Memory& cpu_memory; Core::Memory::Memory& cpu_memory;
Tegra::GPU& gpu; Tegra::GPU& gpu;
const Device device; Device device;
StateTracker state_tracker{gpu}; StateTracker state_tracker;
ProgramManager program_manager;
RasterizerOpenGL rasterizer;
// OpenGL object IDs // OpenGL object IDs
OGLSampler present_sampler; OGLSampler present_sampler;
@ -115,9 +118,6 @@ private:
/// Display information for Switch screen /// Display information for Switch screen
ScreenInfo screen_info; ScreenInfo screen_info;
/// Global dummy shader pipeline
ProgramManager program_manager;
/// OpenGL framebuffer data /// OpenGL framebuffer data
std::vector<u8> gl_framebuffer_data; std::vector<u8> gl_framebuffer_data;

View file

@ -63,7 +63,7 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
UtilShaders::~UtilShaders() = default; UtilShaders::~UtilShaders() = default;
void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset, void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
std::span<const SwizzleParameters> swizzles) { std::span<const SwizzleParameters> swizzles) {
static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1};
static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
@ -71,13 +71,13 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle); program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle);
glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
for (const SwizzleParameters& swizzle : swizzles) { for (const SwizzleParameters& swizzle : swizzles) {
const Extent3D num_tiles = swizzle.num_tiles; const Extent3D num_tiles = swizzle.num_tiles;
const size_t input_offset = swizzle.buffer_offset + buffer_offset; const size_t input_offset = swizzle.buffer_offset + map.offset;
const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
@ -91,8 +91,8 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
glUniform1ui(5, params.x_shift); glUniform1ui(5, params.x_shift);
glUniform1ui(6, params.block_height); glUniform1ui(6, params.block_height);
glUniform1ui(7, params.block_height_mask); glUniform1ui(7, params.block_height_mask);
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
input_offset, image.guest_size_bytes - swizzle.buffer_offset); image.guest_size_bytes - swizzle.buffer_offset);
glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0, glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
GL_WRITE_ONLY, store_format); GL_WRITE_ONLY, store_format);
glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers);
@ -100,7 +100,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s
program_manager.RestoreGuestCompute(); program_manager.RestoreGuestCompute();
} }
void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset, void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map,
std::span<const SwizzleParameters> swizzles) { std::span<const SwizzleParameters> swizzles) {
static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8}; static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8};
@ -108,14 +108,14 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
static constexpr GLuint BINDING_INPUT_BUFFER = 1; static constexpr GLuint BINDING_INPUT_BUFFER = 1;
static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle); program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
for (const SwizzleParameters& swizzle : swizzles) { for (const SwizzleParameters& swizzle : swizzles) {
const Extent3D num_tiles = swizzle.num_tiles; const Extent3D num_tiles = swizzle.num_tiles;
const size_t input_offset = swizzle.buffer_offset + buffer_offset; const size_t input_offset = swizzle.buffer_offset + map.offset;
const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
@ -132,8 +132,8 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
glUniform1ui(7, params.block_height_mask); glUniform1ui(7, params.block_height_mask);
glUniform1ui(8, params.block_depth); glUniform1ui(8, params.block_depth);
glUniform1ui(9, params.block_depth_mask); glUniform1ui(9, params.block_depth_mask);
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
input_offset, image.guest_size_bytes - swizzle.buffer_offset); image.guest_size_bytes - swizzle.buffer_offset);
glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0, glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
GL_WRITE_ONLY, store_format); GL_WRITE_ONLY, store_format);
glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z); glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z);
@ -141,7 +141,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s
program_manager.RestoreGuestCompute(); program_manager.RestoreGuestCompute();
} }
void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map,
std::span<const SwizzleParameters> swizzles) { std::span<const SwizzleParameters> swizzles) {
static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1};
static constexpr GLuint BINDING_INPUT_BUFFER = 0; static constexpr GLuint BINDING_INPUT_BUFFER = 0;
@ -159,7 +159,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
"Non-power of two images are not implemented"); "Non-power of two images are not implemented");
program_manager.BindHostCompute(pitch_unswizzle_program.handle); program_manager.BindHostCompute(pitch_unswizzle_program.handle);
glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes);
glUniform2ui(LOC_ORIGIN, 0, 0); glUniform2ui(LOC_ORIGIN, 0, 0);
glUniform2i(LOC_DESTINATION, 0, 0); glUniform2i(LOC_DESTINATION, 0, 0);
glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block); glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block);
@ -167,13 +167,13 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu
glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), 0, GL_FALSE, 0, GL_WRITE_ONLY, format); glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), 0, GL_FALSE, 0, GL_WRITE_ONLY, format);
for (const SwizzleParameters& swizzle : swizzles) { for (const SwizzleParameters& swizzle : swizzles) {
const Extent3D num_tiles = swizzle.num_tiles; const Extent3D num_tiles = swizzle.num_tiles;
const size_t input_offset = swizzle.buffer_offset + buffer_offset; const size_t input_offset = swizzle.buffer_offset + map.offset;
const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset,
input_offset, image.guest_size_bytes - swizzle.buffer_offset); image.guest_size_bytes - swizzle.buffer_offset);
glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); glDispatchCompute(num_dispatches_x, num_dispatches_y, 1);
} }
program_manager.RestoreGuestCompute(); program_manager.RestoreGuestCompute();

View file

@ -15,21 +15,22 @@
namespace OpenGL { namespace OpenGL {
class Image; class Image;
class ImageBufferMap;
class ProgramManager; class ProgramManager;
struct ImageBufferMap;
class UtilShaders { class UtilShaders {
public: public:
explicit UtilShaders(ProgramManager& program_manager); explicit UtilShaders(ProgramManager& program_manager);
~UtilShaders(); ~UtilShaders();
void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset, void BlockLinearUpload2D(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles); std::span<const VideoCommon::SwizzleParameters> swizzles);
void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset, void BlockLinearUpload3D(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles); std::span<const VideoCommon::SwizzleParameters> swizzles);
void PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset, void PitchUpload(Image& image, const ImageBufferMap& map,
std::span<const VideoCommon::SwizzleParameters> swizzles); std::span<const VideoCommon::SwizzleParameters> swizzles);
void CopyBC4(Image& dst_image, Image& src_image, void CopyBC4(Image& dst_image, Image& src_image,

View file

@ -531,13 +531,9 @@ VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison) {
return {}; return {};
} }
VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format) { VkIndexType IndexFormat(Maxwell::IndexFormat index_format) {
switch (index_format) { switch (index_format) {
case Maxwell::IndexFormat::UnsignedByte: case Maxwell::IndexFormat::UnsignedByte:
if (!device.IsExtIndexTypeUint8Supported()) {
UNIMPLEMENTED_MSG("Native uint8 indices are not supported on this device");
return VK_INDEX_TYPE_UINT16;
}
return VK_INDEX_TYPE_UINT8_EXT; return VK_INDEX_TYPE_UINT8_EXT;
case Maxwell::IndexFormat::UnsignedShort: case Maxwell::IndexFormat::UnsignedShort:
return VK_INDEX_TYPE_UINT16; return VK_INDEX_TYPE_UINT16;

View file

@ -53,7 +53,7 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib
VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison); VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison);
VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format); VkIndexType IndexFormat(Maxwell::IndexFormat index_format);
VkStencilOp StencilOp(Maxwell::StencilOp stencil_op); VkStencilOp StencilOp(Maxwell::StencilOp stencil_op);

View file

@ -80,17 +80,50 @@ std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_ext
return separated_extensions; return separated_extensions;
} }
Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld,
VkSurfaceKHR surface) {
const std::vector<VkPhysicalDevice> devices = instance.EnumeratePhysicalDevices();
const s32 device_index = Settings::values.vulkan_device.GetValue();
if (device_index < 0 || device_index >= static_cast<s32>(devices.size())) {
LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index);
throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED);
}
const vk::PhysicalDevice physical_device(devices[device_index], dld);
return Device(*instance, physical_device, surface, dld);
}
} // Anonymous namespace } // Anonymous namespace
RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_,
Core::Frontend::EmuWindow& emu_window, Core::Frontend::EmuWindow& emu_window,
Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
std::unique_ptr<Core::Frontend::GraphicsContext> context_) std::unique_ptr<Core::Frontend::GraphicsContext> context_) try
: RendererBase{emu_window, std::move(context_)}, telemetry_session{telemetry_session_}, : RendererBase(emu_window, std::move(context_)),
cpu_memory{cpu_memory_}, gpu{gpu_} {} telemetry_session(telemetry_session_),
cpu_memory(cpu_memory_),
gpu(gpu_),
library(OpenLibrary()),
instance(CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type,
true, Settings::values.renderer_debug)),
debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr),
surface(CreateSurface(instance, render_window)),
device(CreateDevice(instance, dld, *surface)),
memory_allocator(device, false),
state_tracker(gpu),
scheduler(device, state_tracker),
swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width,
render_window.GetFramebufferLayout().height, false),
blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler,
screen_info),
rasterizer(render_window, gpu, gpu.MemoryManager(), cpu_memory, screen_info, device,
memory_allocator, state_tracker, scheduler) {
Report();
} catch (const vk::Exception& exception) {
LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what());
throw std::runtime_error{fmt::format("Vulkan initialization error {}", exception.what())};
}
RendererVulkan::~RendererVulkan() { RendererVulkan::~RendererVulkan() {
ShutDown(); void(device.GetLogical().WaitIdle());
} }
void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
@ -101,101 +134,38 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
if (layout.width > 0 && layout.height > 0 && render_window.IsShown()) { if (layout.width > 0 && layout.height > 0 && render_window.IsShown()) {
const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset; const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset;
const bool use_accelerated = const bool use_accelerated =
rasterizer->AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); rasterizer.AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride);
const bool is_srgb = use_accelerated && screen_info.is_srgb; const bool is_srgb = use_accelerated && screen_info.is_srgb;
if (swapchain->HasFramebufferChanged(layout) || swapchain->GetSrgbState() != is_srgb) { if (swapchain.HasFramebufferChanged(layout) || swapchain.GetSrgbState() != is_srgb) {
swapchain->Create(layout.width, layout.height, is_srgb); swapchain.Create(layout.width, layout.height, is_srgb);
blit_screen->Recreate(); blit_screen.Recreate();
} }
scheduler->WaitWorker(); scheduler.WaitWorker();
swapchain->AcquireNextImage(); swapchain.AcquireNextImage();
const VkSemaphore render_semaphore = blit_screen->Draw(*framebuffer, use_accelerated); const VkSemaphore render_semaphore = blit_screen.Draw(*framebuffer, use_accelerated);
scheduler->Flush(render_semaphore); scheduler.Flush(render_semaphore);
if (swapchain->Present(render_semaphore)) { if (swapchain.Present(render_semaphore)) {
blit_screen->Recreate(); blit_screen.Recreate();
} }
rasterizer.TickFrame();
rasterizer->TickFrame();
} }
render_window.OnFrameDisplayed(); render_window.OnFrameDisplayed();
} }
bool RendererVulkan::Init() try {
library = OpenLibrary();
instance = CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type,
true, Settings::values.renderer_debug);
if (Settings::values.renderer_debug) {
debug_callback = CreateDebugCallback(instance);
}
surface = CreateSurface(instance, render_window);
InitializeDevice();
Report();
memory_allocator = std::make_unique<MemoryAllocator>(*device);
state_tracker = std::make_unique<StateTracker>(gpu);
scheduler = std::make_unique<VKScheduler>(*device, *state_tracker);
const auto& framebuffer = render_window.GetFramebufferLayout();
swapchain = std::make_unique<VKSwapchain>(*surface, *device, *scheduler);
swapchain->Create(framebuffer.width, framebuffer.height, false);
rasterizer = std::make_unique<RasterizerVulkan>(render_window, gpu, gpu.MemoryManager(),
cpu_memory, screen_info, *device,
*memory_allocator, *state_tracker, *scheduler);
blit_screen =
std::make_unique<VKBlitScreen>(cpu_memory, render_window, *rasterizer, *device,
*memory_allocator, *swapchain, *scheduler, screen_info);
return true;
} catch (const vk::Exception& exception) {
LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what());
return false;
}
void RendererVulkan::ShutDown() {
if (!device) {
return;
}
if (const auto& dev = device->GetLogical()) {
dev.WaitIdle();
}
rasterizer.reset();
blit_screen.reset();
scheduler.reset();
swapchain.reset();
memory_allocator.reset();
device.reset();
}
void RendererVulkan::InitializeDevice() {
const std::vector<VkPhysicalDevice> devices = instance.EnumeratePhysicalDevices();
const s32 device_index = Settings::values.vulkan_device.GetValue();
if (device_index < 0 || device_index >= static_cast<s32>(devices.size())) {
LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index);
throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED);
}
const vk::PhysicalDevice physical_device(devices[static_cast<size_t>(device_index)], dld);
device = std::make_unique<Device>(*instance, physical_device, *surface, dld);
}
void RendererVulkan::Report() const { void RendererVulkan::Report() const {
const std::string vendor_name{device->GetVendorName()}; const std::string vendor_name{device.GetVendorName()};
const std::string model_name{device->GetModelName()}; const std::string model_name{device.GetModelName()};
const std::string driver_version = GetDriverVersion(*device); const std::string driver_version = GetDriverVersion(device);
const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version); const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version);
const std::string api_version = GetReadableVersion(device->ApiVersion()); const std::string api_version = GetReadableVersion(device.ApiVersion());
const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions()); const std::string extensions = BuildCommaSeparatedExtensions(device.GetAvailableExtensions());
LOG_INFO(Render_Vulkan, "Driver: {}", driver_name); LOG_INFO(Render_Vulkan, "Driver: {}", driver_name);
LOG_INFO(Render_Vulkan, "Device: {}", model_name); LOG_INFO(Render_Vulkan, "Device: {}", model_name);
@ -209,21 +179,4 @@ void RendererVulkan::Report() const {
telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions); telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions);
} }
std::vector<std::string> RendererVulkan::EnumerateDevices() try {
vk::InstanceDispatch dld;
const Common::DynamicLibrary library = OpenLibrary();
const vk::Instance instance = CreateInstance(library, dld, VK_API_VERSION_1_0);
const std::vector<VkPhysicalDevice> physical_devices = instance.EnumeratePhysicalDevices();
std::vector<std::string> names;
names.reserve(physical_devices.size());
for (const VkPhysicalDevice device : physical_devices) {
names.push_back(vk::PhysicalDevice(device, dld).GetProperties().deviceName);
}
return names;
} catch (const vk::Exception& exception) {
LOG_ERROR(Render_Vulkan, "Failed to enumerate devices with error: {}", exception.what());
return {};
}
} // namespace Vulkan } // namespace Vulkan

View file

@ -9,8 +9,14 @@
#include <vector> #include <vector>
#include "common/dynamic_library.h" #include "common/dynamic_library.h"
#include "video_core/renderer_base.h" #include "video_core/renderer_base.h"
#include "video_core/renderer_vulkan/vk_blit_screen.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_state_tracker.h"
#include "video_core/renderer_vulkan/vk_swapchain.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Core { namespace Core {
@ -27,20 +33,6 @@ class GPU;
namespace Vulkan { namespace Vulkan {
class Device;
class StateTracker;
class MemoryAllocator;
class VKBlitScreen;
class VKSwapchain;
class VKScheduler;
struct VKScreenInfo {
VkImageView image_view{};
u32 width{};
u32 height{};
bool is_srgb{};
};
class RendererVulkan final : public VideoCore::RendererBase { class RendererVulkan final : public VideoCore::RendererBase {
public: public:
explicit RendererVulkan(Core::TelemetrySession& telemtry_session, explicit RendererVulkan(Core::TelemetrySession& telemtry_session,
@ -49,15 +41,13 @@ public:
std::unique_ptr<Core::Frontend::GraphicsContext> context_); std::unique_ptr<Core::Frontend::GraphicsContext> context_);
~RendererVulkan() override; ~RendererVulkan() override;
bool Init() override;
void ShutDown() override;
void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
static std::vector<std::string> EnumerateDevices(); VideoCore::RasterizerInterface* ReadRasterizer() override {
return &rasterizer;
}
private: private:
void InitializeDevice();
void Report() const; void Report() const;
Core::TelemetrySession& telemetry_session; Core::TelemetrySession& telemetry_session;
@ -68,18 +58,18 @@ private:
vk::InstanceDispatch dld; vk::InstanceDispatch dld;
vk::Instance instance; vk::Instance instance;
vk::DebugUtilsMessenger debug_callback;
vk::SurfaceKHR surface; vk::SurfaceKHR surface;
VKScreenInfo screen_info; VKScreenInfo screen_info;
vk::DebugUtilsMessenger debug_callback; Device device;
std::unique_ptr<Device> device; MemoryAllocator memory_allocator;
std::unique_ptr<MemoryAllocator> memory_allocator; StateTracker state_tracker;
std::unique_ptr<StateTracker> state_tracker; VKScheduler scheduler;
std::unique_ptr<VKScheduler> scheduler; VKSwapchain swapchain;
std::unique_ptr<VKSwapchain> swapchain; VKBlitScreen blit_screen;
std::unique_ptr<VKBlitScreen> blit_screen; RasterizerVulkan rasterizer;
}; };
} // namespace Vulkan } // namespace Vulkan

View file

@ -18,7 +18,6 @@
#include "video_core/gpu.h" #include "video_core/gpu.h"
#include "video_core/host_shaders/vulkan_present_frag_spv.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h"
#include "video_core/host_shaders/vulkan_present_vert_spv.h" #include "video_core/host_shaders/vulkan_present_vert_spv.h"
#include "video_core/rasterizer_interface.h"
#include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/renderer_vulkan.h"
#include "video_core/renderer_vulkan/vk_blit_screen.h" #include "video_core/renderer_vulkan/vk_blit_screen.h"
#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h"
@ -113,13 +112,12 @@ struct VKBlitScreen::BufferData {
}; };
VKBlitScreen::VKBlitScreen(Core::Memory::Memory& cpu_memory_, VKBlitScreen::VKBlitScreen(Core::Memory::Memory& cpu_memory_,
Core::Frontend::EmuWindow& render_window_, Core::Frontend::EmuWindow& render_window_, const Device& device_,
VideoCore::RasterizerInterface& rasterizer_, const Device& device_,
MemoryAllocator& memory_allocator_, VKSwapchain& swapchain_, MemoryAllocator& memory_allocator_, VKSwapchain& swapchain_,
VKScheduler& scheduler_, const VKScreenInfo& screen_info_) VKScheduler& scheduler_, const VKScreenInfo& screen_info_)
: cpu_memory{cpu_memory_}, render_window{render_window_}, rasterizer{rasterizer_}, : cpu_memory{cpu_memory_}, render_window{render_window_}, device{device_},
device{device_}, memory_allocator{memory_allocator_}, swapchain{swapchain_}, memory_allocator{memory_allocator_}, swapchain{swapchain_}, scheduler{scheduler_},
scheduler{scheduler_}, image_count{swapchain.GetImageCount()}, screen_info{screen_info_} { image_count{swapchain.GetImageCount()}, screen_info{screen_info_} {
resource_ticks.resize(image_count); resource_ticks.resize(image_count);
CreateStaticResources(); CreateStaticResources();
@ -150,8 +148,8 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
SetUniformData(data, framebuffer); SetUniformData(data, framebuffer);
SetVertexData(data, framebuffer); SetVertexData(data, framebuffer);
const std::span<u8> map = buffer_commit.Map(); const std::span<u8> mapped_span = buffer_commit.Map();
std::memcpy(map.data(), &data, sizeof(data)); std::memcpy(mapped_span.data(), &data, sizeof(data));
if (!use_accelerated) { if (!use_accelerated) {
const u64 image_offset = GetRawImageOffset(framebuffer, image_index); const u64 image_offset = GetRawImageOffset(framebuffer, image_index);
@ -159,14 +157,13 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset;
const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr); const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr);
const size_t size_bytes = GetSizeInBytes(framebuffer); const size_t size_bytes = GetSizeInBytes(framebuffer);
rasterizer.FlushRegion(ToCacheAddr(host_ptr), size_bytes);
// TODO(Rodrigo): Read this from HLE // TODO(Rodrigo): Read this from HLE
constexpr u32 block_height_log2 = 4; constexpr u32 block_height_log2 = 4;
const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer); const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer);
Tegra::Texture::UnswizzleTexture( Tegra::Texture::UnswizzleTexture(
map.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes), bytes_per_pixel, mapped_span.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes),
framebuffer.width, framebuffer.height, 1, block_height_log2, 0); bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0);
const VkBufferImageCopy copy{ const VkBufferImageCopy copy{
.bufferOffset = image_offset, .bufferOffset = image_offset,
@ -266,7 +263,6 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
cmdbuf.Draw(4, 1, 0, 0); cmdbuf.Draw(4, 1, 0, 0);
cmdbuf.EndRenderPass(); cmdbuf.EndRenderPass();
}); });
return *semaphores[image_index]; return *semaphores[image_index];
} }

View file

@ -38,12 +38,18 @@ class RasterizerVulkan;
class VKScheduler; class VKScheduler;
class VKSwapchain; class VKSwapchain;
class VKBlitScreen final { struct VKScreenInfo {
VkImageView image_view{};
u32 width{};
u32 height{};
bool is_srgb{};
};
class VKBlitScreen {
public: public:
explicit VKBlitScreen(Core::Memory::Memory& cpu_memory, explicit VKBlitScreen(Core::Memory::Memory& cpu_memory,
Core::Frontend::EmuWindow& render_window, Core::Frontend::EmuWindow& render_window, const Device& device,
VideoCore::RasterizerInterface& rasterizer, const Device& device, MemoryAllocator& memory_manager, VKSwapchain& swapchain,
MemoryAllocator& memory_allocator, VKSwapchain& swapchain,
VKScheduler& scheduler, const VKScreenInfo& screen_info); VKScheduler& scheduler, const VKScreenInfo& screen_info);
~VKBlitScreen(); ~VKBlitScreen();
@ -84,7 +90,6 @@ private:
Core::Memory::Memory& cpu_memory; Core::Memory::Memory& cpu_memory;
Core::Frontend::EmuWindow& render_window; Core::Frontend::EmuWindow& render_window;
VideoCore::RasterizerInterface& rasterizer;
const Device& device; const Device& device;
MemoryAllocator& memory_allocator; MemoryAllocator& memory_allocator;
VKSwapchain& swapchain; VKSwapchain& swapchain;

View file

@ -3,188 +3,308 @@
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include <algorithm> #include <algorithm>
#include <array>
#include <cstring> #include <cstring>
#include <memory> #include <span>
#include <vector>
#include "core/core.h"
#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/renderer_vulkan/maxwell_to_vk.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/renderer_vulkan/vk_update_descriptor.h"
#include "video_core/vulkan_common/vulkan_device.h" #include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan { namespace Vulkan {
namespace { namespace {
VkBufferCopy MakeBufferCopy(const VideoCommon::BufferCopy& copy) {
return VkBufferCopy{
.srcOffset = copy.src_offset,
.dstOffset = copy.dst_offset,
.size = copy.size,
};
}
constexpr VkBufferUsageFlags BUFFER_USAGE = VkIndexType IndexTypeFromNumElements(const Device& device, u32 num_elements) {
VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | if (num_elements <= 0xff && device.IsExtIndexTypeUint8Supported()) {
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; return VK_INDEX_TYPE_UINT8_EXT;
}
if (num_elements <= 0xffff) {
return VK_INDEX_TYPE_UINT16;
}
return VK_INDEX_TYPE_UINT32;
}
constexpr VkPipelineStageFlags UPLOAD_PIPELINE_STAGE = size_t BytesPerIndex(VkIndexType index_type) {
VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | switch (index_type) {
VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | case VK_INDEX_TYPE_UINT8_EXT:
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; return 1;
case VK_INDEX_TYPE_UINT16:
constexpr VkAccessFlags UPLOAD_ACCESS_BARRIERS = return 2;
VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | case VK_INDEX_TYPE_UINT32:
VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_INDEX_READ_BIT; return 4;
default:
constexpr VkAccessFlags TRANSFORM_FEEDBACK_WRITE_ACCESS = UNREACHABLE_MSG("Invalid index type={}", index_type);
VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT; return 1;
}
}
template <typename T>
std::array<T, 6> MakeQuadIndices(u32 quad, u32 first) {
std::array<T, 6> indices{0, 1, 2, 0, 2, 3};
std::ranges::transform(indices, indices.begin(),
[quad, first](u32 index) { return first + index + quad * 4; });
return indices;
}
} // Anonymous namespace } // Anonymous namespace
Buffer::Buffer(const Device& device_, MemoryAllocator& memory_allocator, VKScheduler& scheduler_, Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params)
StagingBufferPool& staging_pool_, VAddr cpu_addr_, std::size_t size_) : VideoCommon::BufferBase<VideoCore::RasterizerInterface>(null_params) {}
: BufferBlock{cpu_addr_, size_}, device{device_}, scheduler{scheduler_}, staging_pool{
staging_pool_} { Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ VAddr cpu_addr_, u64 size_bytes_)
: VideoCommon::BufferBase<VideoCore::RasterizerInterface>(rasterizer_, cpu_addr_, size_bytes_) {
buffer = runtime.device.GetLogical().CreateBuffer(VkBufferCreateInfo{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr, .pNext = nullptr,
.flags = 0, .flags = 0,
.size = static_cast<VkDeviceSize>(size_), .size = SizeBytes(),
.usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT |
VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE, .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0, .queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr, .pQueueFamilyIndices = nullptr,
}); });
commit = memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); if (runtime.device.HasDebuggingToolAttached()) {
buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str());
}
commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal);
} }
Buffer::~Buffer() = default; BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_allocator_,
VKScheduler& scheduler_, StagingBufferPool& staging_pool_,
VKUpdateDescriptorQueue& update_descriptor_queue_,
VKDescriptorPool& descriptor_pool)
: device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
staging_pool{staging_pool_}, update_descriptor_queue{update_descriptor_queue_},
uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
quad_index_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue) {}
void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) { StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) {
const auto& staging = staging_pool.Request(data_size, MemoryUsage::Upload); return staging_pool.Request(size, MemoryUsage::Upload);
std::memcpy(staging.mapped_span.data(), data, data_size); }
scheduler.RequestOutsideRenderPassOperationContext(); StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) {
return staging_pool.Request(size, MemoryUsage::Download);
}
const VkBuffer handle = Handle(); void BufferCacheRuntime::Finish() {
scheduler.Record([staging = staging.buffer, handle, offset, data_size, scheduler.Finish();
&device = device](vk::CommandBuffer cmdbuf) { }
const VkBufferMemoryBarrier read_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer,
std::span<const VideoCommon::BufferCopy> copies) {
static constexpr VkMemoryBarrier READ_BARRIER{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr, .pNext = nullptr,
.srcAccessMask = .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT | .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT,
VK_ACCESS_HOST_WRITE_BIT | };
(device.IsExtTransformFeedbackSupported() ? TRANSFORM_FEEDBACK_WRITE_ACCESS : 0), static constexpr VkMemoryBarrier WRITE_BARRIER{
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .pNext = nullptr,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.buffer = handle, .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
.offset = offset, };
.size = data_size, // Measuring a popular game, this number never exceeds the specified size once data is warmed up
boost::container::small_vector<VkBufferCopy, 3> vk_copies(copies.size());
std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, READ_BARRIER);
cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, WRITE_BARRIER);
});
}
void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format,
u32 base_vertex, u32 num_indices, VkBuffer buffer,
u32 offset, [[maybe_unused]] u32 size) {
VkIndexType vk_index_type = MaxwellToVK::IndexFormat(index_format);
VkDeviceSize vk_offset = offset;
VkBuffer vk_buffer = buffer;
if (topology == PrimitiveTopology::Quads) {
vk_index_type = VK_INDEX_TYPE_UINT32;
std::tie(vk_buffer, vk_offset) =
quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset);
} else if (vk_index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) {
vk_index_type = VK_INDEX_TYPE_UINT16;
std::tie(vk_buffer, vk_offset) = uint8_pass.Assemble(num_indices, buffer, offset);
}
if (vk_buffer == VK_NULL_HANDLE) {
// Vulkan doesn't support null index buffers. Replace it with our own null buffer.
ReserveNullIndexBuffer();
vk_buffer = *null_index_buffer;
}
scheduler.Record([vk_buffer, vk_offset, vk_index_type](vk::CommandBuffer cmdbuf) {
cmdbuf.BindIndexBuffer(vk_buffer, vk_offset, vk_index_type);
});
}
void BufferCacheRuntime::BindQuadArrayIndexBuffer(u32 first, u32 count) {
ReserveQuadArrayLUT(first + count, true);
// The LUT has the indices 0, 1, 2, and 3 copied as an array
// To apply these 'first' offsets we can apply an offset based on the modulus.
const VkIndexType index_type = quad_array_lut_index_type;
const size_t sub_first_offset = static_cast<size_t>(first % 4) * (current_num_indices / 4);
const size_t offset = (sub_first_offset + first / 4) * 6ULL * BytesPerIndex(index_type);
scheduler.Record([buffer = *quad_array_lut, index_type, offset](vk::CommandBuffer cmdbuf) {
cmdbuf.BindIndexBuffer(buffer, offset, index_type);
});
}
void BufferCacheRuntime::BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size,
u32 stride) {
if (device.IsExtExtendedDynamicStateSupported()) {
scheduler.Record([index, buffer, offset, size, stride](vk::CommandBuffer cmdbuf) {
const VkDeviceSize vk_offset = offset;
const VkDeviceSize vk_size = buffer != VK_NULL_HANDLE ? size : VK_WHOLE_SIZE;
const VkDeviceSize vk_stride = stride;
cmdbuf.BindVertexBuffers2EXT(index, 1, &buffer, &vk_offset, &vk_size, &vk_stride);
});
} else {
scheduler.Record([index, buffer, offset](vk::CommandBuffer cmdbuf) {
cmdbuf.BindVertexBuffer(index, buffer, offset);
});
}
}
void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset,
u32 size) {
if (!device.IsExtTransformFeedbackSupported()) {
// Already logged in the rasterizer
return;
}
scheduler.Record([index, buffer, offset, size](vk::CommandBuffer cmdbuf) {
const VkDeviceSize vk_offset = offset;
const VkDeviceSize vk_size = size;
cmdbuf.BindTransformFeedbackBuffersEXT(index, 1, &buffer, &vk_offset, &vk_size);
});
}
void BufferCacheRuntime::BindBuffer(VkBuffer buffer, u32 offset, u32 size) {
update_descriptor_queue.AddBuffer(buffer, offset, size);
}
void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle) {
if (num_indices <= current_num_indices) {
return;
}
if (wait_for_idle) {
scheduler.Finish();
}
current_num_indices = num_indices;
quad_array_lut_index_type = IndexTypeFromNumElements(device, num_indices);
const u32 num_quads = num_indices / 4;
const u32 num_triangle_indices = num_quads * 6;
const u32 num_first_offset_copies = 4;
const size_t bytes_per_index = BytesPerIndex(quad_array_lut_index_type);
const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies;
quad_array_lut = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = size_bytes,
.usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
});
if (device.HasDebuggingToolAttached()) {
quad_array_lut.SetObjectNameEXT("Quad LUT");
}
quad_array_lut_commit = memory_allocator.Commit(quad_array_lut, MemoryUsage::DeviceLocal);
const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload);
u8* staging_data = staging.mapped_span.data();
const size_t quad_size = bytes_per_index * 6;
for (u32 first = 0; first < num_first_offset_copies; ++first) {
for (u32 quad = 0; quad < num_quads; ++quad) {
switch (quad_array_lut_index_type) {
case VK_INDEX_TYPE_UINT8_EXT:
std::memcpy(staging_data, MakeQuadIndices<u8>(quad, first).data(), quad_size);
break;
case VK_INDEX_TYPE_UINT16:
std::memcpy(staging_data, MakeQuadIndices<u16>(quad, first).data(), quad_size);
break;
case VK_INDEX_TYPE_UINT32:
std::memcpy(staging_data, MakeQuadIndices<u32>(quad, first).data(), quad_size);
break;
default:
UNREACHABLE();
break;
}
staging_data += quad_size;
}
}
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([src_buffer = staging.buffer, src_offset = staging.offset,
dst_buffer = *quad_array_lut, size_bytes](vk::CommandBuffer cmdbuf) {
const VkBufferCopy copy{
.srcOffset = src_offset,
.dstOffset = 0,
.size = size_bytes,
}; };
const VkBufferMemoryBarrier write_barrier{ const VkBufferMemoryBarrier write_barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr, .pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = UPLOAD_ACCESS_BARRIERS, .dstAccessMask = VK_ACCESS_INDEX_READ_BIT,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.buffer = handle, .buffer = dst_buffer,
.offset = offset, .offset = 0,
.size = data_size, .size = size_bytes,
}; };
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy);
0, read_barrier); cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,
cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, data_size}); 0, write_barrier);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0,
write_barrier);
}); });
} }
void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) { void BufferCacheRuntime::ReserveNullIndexBuffer() {
auto staging = staging_pool.Request(data_size, MemoryUsage::Download); if (null_index_buffer) {
scheduler.RequestOutsideRenderPassOperationContext(); return;
}
const VkBuffer handle = Handle(); null_index_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{
scheduler.Record( .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
[staging = staging.buffer, handle, offset, data_size](vk::CommandBuffer cmdbuf) {
const VkBufferMemoryBarrier barrier{
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
.pNext = nullptr, .pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, .flags = 0,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, .size = 4,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.buffer = handle, .queueFamilyIndexCount = 0,
.offset = offset, .pQueueFamilyIndices = nullptr,
.size = data_size,
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, data_size});
}); });
scheduler.Finish(); if (device.HasDebuggingToolAttached()) {
null_index_buffer.SetObjectNameEXT("Null index buffer");
std::memcpy(data, staging.mapped_span.data(), data_size);
} }
null_index_buffer_commit = memory_allocator.Commit(null_index_buffer, MemoryUsage::DeviceLocal);
void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
std::size_t copy_size) {
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([buffer = *null_index_buffer](vk::CommandBuffer cmdbuf) {
const VkBuffer dst_buffer = Handle(); cmdbuf.FillBuffer(buffer, 0, VK_WHOLE_SIZE, 0);
scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,
copy_size](vk::CommandBuffer cmdbuf) {
cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, copy_size});
std::array<VkBufferMemoryBarrier, 2> barriers;
barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barriers[0].pNext = nullptr;
barriers[0].srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
barriers[0].dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barriers[0].buffer = src_buffer;
barriers[0].offset = src_offset;
barriers[0].size = copy_size;
barriers[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barriers[1].pNext = nullptr;
barriers[1].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
barriers[1].dstAccessMask = UPLOAD_ACCESS_BARRIERS;
barriers[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barriers[1].buffer = dst_buffer;
barriers[1].offset = dst_offset;
barriers[1].size = copy_size;
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
barriers, {});
}); });
} }
VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer_,
Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
const Device& device_, MemoryAllocator& memory_allocator_,
VKScheduler& scheduler_, VKStreamBuffer& stream_buffer_,
StagingBufferPool& staging_pool_)
: VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer_, gpu_memory_,
cpu_memory_, stream_buffer_},
device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_},
staging_pool{staging_pool_} {}
VKBufferCache::~VKBufferCache() = default;
std::shared_ptr<Buffer> VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) {
return std::make_shared<Buffer>(device, memory_allocator, scheduler, staging_pool, cpu_addr,
size);
}
VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) {
size = std::max(size, std::size_t(4));
const auto& empty = staging_pool.Request(size, MemoryUsage::DeviceLocal);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([size, buffer = empty.buffer](vk::CommandBuffer cmdbuf) {
cmdbuf.FillBuffer(buffer, 0, size, 0);
});
return {empty.buffer, 0, 0};
}
} // namespace Vulkan } // namespace Vulkan

View file

@ -4,69 +4,124 @@
#pragma once #pragma once
#include <memory>
#include "common/common_types.h"
#include "video_core/buffer_cache/buffer_cache.h" #include "video_core/buffer_cache/buffer_cache.h"
#include "video_core/engines/maxwell_3d.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan { namespace Vulkan {
class Device; class Device;
class VKDescriptorPool;
class VKScheduler; class VKScheduler;
class VKUpdateDescriptorQueue;
class Buffer final : public VideoCommon::BufferBlock { class BufferCacheRuntime;
class Buffer : public VideoCommon::BufferBase<VideoCore::RasterizerInterface> {
public: public:
explicit Buffer(const Device& device, MemoryAllocator& memory_allocator, VKScheduler& scheduler, explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params);
StagingBufferPool& staging_pool, VAddr cpu_addr_, std::size_t size_); explicit Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_,
~Buffer(); VAddr cpu_addr_, u64 size_bytes_);
void Upload(std::size_t offset, std::size_t data_size, const u8* data); [[nodiscard]] VkBuffer Handle() const noexcept {
void Download(std::size_t offset, std::size_t data_size, u8* data);
void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
std::size_t copy_size);
VkBuffer Handle() const {
return *buffer; return *buffer;
} }
u64 Address() const { operator VkBuffer() const noexcept {
return 0; return *buffer;
} }
private: private:
const Device& device;
VKScheduler& scheduler;
StagingBufferPool& staging_pool;
vk::Buffer buffer; vk::Buffer buffer;
MemoryCommit commit; MemoryCommit commit;
}; };
class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer> { class BufferCacheRuntime {
friend Buffer;
using PrimitiveTopology = Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology;
using IndexFormat = Tegra::Engines::Maxwell3D::Regs::IndexFormat;
public: public:
explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, explicit BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_manager_,
Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, VKScheduler& scheduler_, StagingBufferPool& staging_pool_,
const Device& device, MemoryAllocator& memory_allocator, VKUpdateDescriptorQueue& update_descriptor_queue_,
VKScheduler& scheduler, VKStreamBuffer& stream_buffer, VKDescriptorPool& descriptor_pool);
StagingBufferPool& staging_pool);
~VKBufferCache();
BufferInfo GetEmptyBuffer(std::size_t size) override; void Finish();
protected: [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);
std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
[[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);
void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer,
std::span<const VideoCommon::BufferCopy> copies);
void BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, u32 num_indices,
u32 base_vertex, VkBuffer buffer, u32 offset, u32 size);
void BindQuadArrayIndexBuffer(u32 first, u32 count);
void BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, u32 stride);
void BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size);
std::span<u8> BindMappedUniformBuffer([[maybe_unused]] size_t stage,
[[maybe_unused]] u32 binding_index, u32 size) {
const StagingBufferRef ref = staging_pool.Request(size, MemoryUsage::Upload);
BindBuffer(ref.buffer, static_cast<u32>(ref.offset), size);
return ref.mapped_span;
}
void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) {
BindBuffer(buffer, offset, size);
}
void BindStorageBuffer(VkBuffer buffer, u32 offset, u32 size,
[[maybe_unused]] bool is_written) {
BindBuffer(buffer, offset, size);
}
private: private:
void BindBuffer(VkBuffer buffer, u32 offset, u32 size);
void ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle);
void ReserveNullIndexBuffer();
const Device& device; const Device& device;
MemoryAllocator& memory_allocator; MemoryAllocator& memory_allocator;
VKScheduler& scheduler; VKScheduler& scheduler;
StagingBufferPool& staging_pool; StagingBufferPool& staging_pool;
VKUpdateDescriptorQueue& update_descriptor_queue;
vk::Buffer quad_array_lut;
MemoryCommit quad_array_lut_commit;
VkIndexType quad_array_lut_index_type{};
u32 current_num_indices = 0;
vk::Buffer null_index_buffer;
MemoryCommit null_index_buffer_commit;
Uint8Pass uint8_pass;
QuadIndexedPass quad_index_pass;
}; };
struct BufferCacheParams {
using Runtime = Vulkan::BufferCacheRuntime;
using Buffer = Vulkan::Buffer;
static constexpr bool IS_OPENGL = false;
static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false;
static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = false;
static constexpr bool NEEDS_BIND_UNIFORM_INDEX = false;
static constexpr bool NEEDS_BIND_STORAGE_INDEX = false;
static constexpr bool USE_MEMORY_MAPS = true;
};
using BufferCache = VideoCommon::BufferCache<BufferCacheParams>;
} // namespace Vulkan } // namespace Vulkan

View file

@ -10,7 +10,7 @@
#include "common/alignment.h" #include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/common_types.h" #include "common/common_types.h"
#include "video_core/host_shaders/vulkan_quad_array_comp_spv.h" #include "common/div_ceil.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_compute_pass.h"
@ -22,30 +22,7 @@
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan { namespace Vulkan {
namespace { namespace {
VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() {
return {
.binding = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
};
}
VkDescriptorUpdateTemplateEntryKHR BuildQuadArrayPassDescriptorUpdateTemplateEntry() {
return {
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = 0,
.stride = sizeof(DescriptorUpdateEntry),
};
}
VkPushConstantRange BuildComputePushConstantRange(std::size_t size) { VkPushConstantRange BuildComputePushConstantRange(std::size_t size) {
return { return {
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
@ -162,55 +139,6 @@ VkDescriptorSet VKComputePass::CommitDescriptorSet(
return set; return set;
} }
QuadArrayPass::QuadArrayPass(const Device& device_, VKScheduler& scheduler_,
VKDescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
VKUpdateDescriptorQueue& update_descriptor_queue_)
: VKComputePass(device_, descriptor_pool_, BuildQuadArrayPassDescriptorSetLayoutBinding(),
BuildQuadArrayPassDescriptorUpdateTemplateEntry(),
BuildComputePushConstantRange(sizeof(u32)), VULKAN_QUAD_ARRAY_COMP_SPV),
scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_},
update_descriptor_queue{update_descriptor_queue_} {}
QuadArrayPass::~QuadArrayPass() = default;
std::pair<VkBuffer, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 first) {
const u32 num_triangle_vertices = (num_vertices / 4) * 6;
const std::size_t staging_size = num_triangle_vertices * sizeof(u32);
const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
update_descriptor_queue.Acquire();
update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size);
const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
scheduler.RequestOutsideRenderPassOperationContext();
ASSERT(num_vertices % 4 == 0);
const u32 num_quads = num_vertices / 4;
scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer,
num_quads, first, set](vk::CommandBuffer cmdbuf) {
constexpr u32 dispatch_size = 1024;
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(first), &first);
cmdbuf.Dispatch(Common::AlignUp(num_quads, dispatch_size) / dispatch_size, 1, 1);
VkBufferMemoryBarrier barrier;
barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barrier.pNext = nullptr;
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.buffer = buffer;
barrier.offset = 0;
barrier.size = static_cast<VkDeviceSize>(num_quads) * 6 * sizeof(u32);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, {barrier}, {});
});
return {staging_ref.buffer, 0};
}
Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_, Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
VKDescriptorPool& descriptor_pool, StagingBufferPool& staging_buffer_pool_, VKDescriptorPool& descriptor_pool, StagingBufferPool& staging_buffer_pool_,
VKUpdateDescriptorQueue& update_descriptor_queue_) VKUpdateDescriptorQueue& update_descriptor_queue_)
@ -221,38 +149,33 @@ Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_,
Uint8Pass::~Uint8Pass() = default; Uint8Pass::~Uint8Pass() = default;
std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, std::pair<VkBuffer, VkDeviceSize> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer,
u64 src_offset) { u32 src_offset) {
const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16)); const u32 staging_size = static_cast<u32>(num_vertices * sizeof(u16));
const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
update_descriptor_queue.Acquire(); update_descriptor_queue.Acquire();
update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices);
update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size); update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size);
const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set, scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
num_vertices](vk::CommandBuffer cmdbuf) { num_vertices](vk::CommandBuffer cmdbuf) {
constexpr u32 dispatch_size = 1024; static constexpr u32 DISPATCH_SIZE = 1024;
static constexpr VkMemoryBarrier WRITE_BARRIER{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
};
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
cmdbuf.Dispatch(Common::AlignUp(num_vertices, dispatch_size) / dispatch_size, 1, 1); cmdbuf.Dispatch(Common::DivCeil(num_vertices, DISPATCH_SIZE), 1, 1);
VkBufferMemoryBarrier barrier;
barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barrier.pNext = nullptr;
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.buffer = buffer;
barrier.offset = 0;
barrier.size = static_cast<VkDeviceSize>(num_vertices * sizeof(u16));
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER);
}); });
return {staging_ref.buffer, 0}; return {staging.buffer, staging.offset};
} }
QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
@ -267,9 +190,9 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_,
QuadIndexedPass::~QuadIndexedPass() = default; QuadIndexedPass::~QuadIndexedPass() = default;
std::pair<VkBuffer, u64> QuadIndexedPass::Assemble( std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex, Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex,
VkBuffer src_buffer, u64 src_offset) { VkBuffer src_buffer, u32 src_offset) {
const u32 index_shift = [index_format] { const u32 index_shift = [index_format] {
switch (index_format) { switch (index_format) {
case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte: case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte:
@ -286,38 +209,33 @@ std::pair<VkBuffer, u64> QuadIndexedPass::Assemble(
const u32 num_tri_vertices = (num_vertices / 4) * 6; const u32 num_tri_vertices = (num_vertices / 4) * 6;
const std::size_t staging_size = num_tri_vertices * sizeof(u32); const std::size_t staging_size = num_tri_vertices * sizeof(u32);
const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal);
update_descriptor_queue.Acquire(); update_descriptor_queue.Acquire();
update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size);
update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size); update_descriptor_queue.AddBuffer(staging.buffer, staging.offset, staging_size);
const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue);
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set, scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set,
num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) { num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) {
static constexpr u32 dispatch_size = 1024; static constexpr u32 DISPATCH_SIZE = 1024;
static constexpr VkMemoryBarrier WRITE_BARRIER{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT,
};
const std::array push_constants = {base_vertex, index_shift}; const std::array push_constants = {base_vertex, index_shift};
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {});
cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants), cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(push_constants),
&push_constants); &push_constants);
cmdbuf.Dispatch(Common::AlignUp(num_tri_vertices, dispatch_size) / dispatch_size, 1, 1); cmdbuf.Dispatch(Common::DivCeil(num_tri_vertices, DISPATCH_SIZE), 1, 1);
VkBufferMemoryBarrier barrier;
barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
barrier.pNext = nullptr;
barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
barrier.buffer = buffer;
barrier.offset = 0;
barrier.size = static_cast<VkDeviceSize>(num_tri_vertices * sizeof(u32));
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, WRITE_BARRIER);
}); });
return {staging_ref.buffer, 0}; return {staging.buffer, staging.offset};
} }
} // namespace Vulkan } // namespace Vulkan

View file

@ -41,22 +41,6 @@ private:
vk::ShaderModule module; vk::ShaderModule module;
}; };
class QuadArrayPass final : public VKComputePass {
public:
explicit QuadArrayPass(const Device& device_, VKScheduler& scheduler_,
VKDescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_,
VKUpdateDescriptorQueue& update_descriptor_queue_);
~QuadArrayPass();
std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, u32 first);
private:
VKScheduler& scheduler;
StagingBufferPool& staging_buffer_pool;
VKUpdateDescriptorQueue& update_descriptor_queue;
};
class Uint8Pass final : public VKComputePass { class Uint8Pass final : public VKComputePass {
public: public:
explicit Uint8Pass(const Device& device_, VKScheduler& scheduler_, explicit Uint8Pass(const Device& device_, VKScheduler& scheduler_,
@ -64,7 +48,10 @@ public:
VKUpdateDescriptorQueue& update_descriptor_queue_); VKUpdateDescriptorQueue& update_descriptor_queue_);
~Uint8Pass(); ~Uint8Pass();
std::pair<VkBuffer, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset); /// Assemble uint8 indices into an uint16 index buffer
/// Returns a pair with the staging buffer, and the offset where the assembled data is
std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, VkBuffer src_buffer,
u32 src_offset);
private: private:
VKScheduler& scheduler; VKScheduler& scheduler;
@ -80,9 +67,9 @@ public:
VKUpdateDescriptorQueue& update_descriptor_queue_); VKUpdateDescriptorQueue& update_descriptor_queue_);
~QuadIndexedPass(); ~QuadIndexedPass();
std::pair<VkBuffer, u64> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, std::pair<VkBuffer, VkDeviceSize> Assemble(
u32 num_vertices, u32 base_vertex, VkBuffer src_buffer, Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices,
u64 src_offset); u32 base_vertex, VkBuffer src_buffer, u32 src_offset);
private: private:
VKScheduler& scheduler; VKScheduler& scheduler;

View file

@ -45,8 +45,8 @@ void InnerFence::Wait() {
} }
VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_,
Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_, TextureCache& texture_cache_, BufferCache& buffer_cache_,
VKBufferCache& buffer_cache_, VKQueryCache& query_cache_, VKQueryCache& query_cache_, const Device& device_,
VKScheduler& scheduler_) VKScheduler& scheduler_)
: GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_}, : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_},
scheduler{scheduler_} {} scheduler{scheduler_} {}

View file

@ -22,7 +22,6 @@ class RasterizerInterface;
namespace Vulkan { namespace Vulkan {
class Device; class Device;
class VKBufferCache;
class VKQueryCache; class VKQueryCache;
class VKScheduler; class VKScheduler;
@ -45,14 +44,14 @@ private:
using Fence = std::shared_ptr<InnerFence>; using Fence = std::shared_ptr<InnerFence>;
using GenericFenceManager = using GenericFenceManager =
VideoCommon::FenceManager<Fence, TextureCache, VKBufferCache, VKQueryCache>; VideoCommon::FenceManager<Fence, TextureCache, BufferCache, VKQueryCache>;
class VKFenceManager final : public GenericFenceManager { class VKFenceManager final : public GenericFenceManager {
public: public:
explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu,
Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_, TextureCache& texture_cache, BufferCache& buffer_cache,
VKBufferCache& buffer_cache_, VKQueryCache& query_cache_, VKQueryCache& query_cache, const Device& device,
VKScheduler& scheduler_); VKScheduler& scheduler);
protected: protected:
Fence CreateFence(u32 value, bool is_stubbed) override; Fence CreateFence(u32 value, bool is_stubbed) override;

View file

@ -8,8 +8,6 @@
#include <mutex> #include <mutex>
#include <vector> #include <vector>
#include <boost/container/static_vector.hpp>
#include "common/alignment.h" #include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/logging/log.h" #include "common/logging/log.h"
@ -24,7 +22,6 @@
#include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h"
#include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/renderer_vulkan.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
@ -50,15 +47,16 @@ MICROPROFILE_DEFINE(Vulkan_WaitForWorker, "Vulkan", "Wait for worker", MP_RGB(25
MICROPROFILE_DEFINE(Vulkan_Drawing, "Vulkan", "Record drawing", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_Drawing, "Vulkan", "Record drawing", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_Compute, "Vulkan", "Record compute", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_Compute, "Vulkan", "Record compute", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_Clearing, "Vulkan", "Record clearing", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_Clearing, "Vulkan", "Record clearing", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_Geometry, "Vulkan", "Setup geometry", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_ConstBuffers, "Vulkan", "Setup constant buffers", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_GlobalBuffers, "Vulkan", "Setup global buffers", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_RenderTargets, "Vulkan", "Setup render targets", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_Textures, "Vulkan", "Setup textures", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_Images, "Vulkan", "Setup images", MP_RGB(192, 128, 128));
MICROPROFILE_DEFINE(Vulkan_PipelineCache, "Vulkan", "Pipeline cache", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_PipelineCache, "Vulkan", "Pipeline cache", MP_RGB(192, 128, 128));
namespace { namespace {
struct DrawParams {
u32 base_instance;
u32 num_instances;
u32 base_vertex;
u32 num_vertices;
bool is_indexed;
};
constexpr auto COMPUTE_SHADER_INDEX = static_cast<size_t>(Tegra::Engines::ShaderType::Compute); constexpr auto COMPUTE_SHADER_INDEX = static_cast<size_t>(Tegra::Engines::ShaderType::Compute);
@ -67,7 +65,6 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in
const float width = src.scale_x * 2.0f; const float width = src.scale_x * 2.0f;
const float height = src.scale_y * 2.0f; const float height = src.scale_y * 2.0f;
const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f; const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f;
VkViewport viewport{ VkViewport viewport{
.x = src.translate_x - src.scale_x, .x = src.translate_x - src.scale_x,
.y = src.translate_y - src.scale_y, .y = src.translate_y - src.scale_y,
@ -76,12 +73,10 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in
.minDepth = src.translate_z - src.scale_z * reduce_z, .minDepth = src.translate_z - src.scale_z * reduce_z,
.maxDepth = src.translate_z + src.scale_z, .maxDepth = src.translate_z + src.scale_z,
}; };
if (!device.IsExtDepthRangeUnrestrictedSupported()) { if (!device.IsExtDepthRangeUnrestrictedSupported()) {
viewport.minDepth = std::clamp(viewport.minDepth, 0.0f, 1.0f); viewport.minDepth = std::clamp(viewport.minDepth, 0.0f, 1.0f);
viewport.maxDepth = std::clamp(viewport.maxDepth, 0.0f, 1.0f); viewport.maxDepth = std::clamp(viewport.maxDepth, 0.0f, 1.0f);
} }
return viewport; return viewport;
} }
@ -146,13 +141,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const
return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
} }
template <size_t N>
std::array<VkDeviceSize, N> ExpandStrides(const std::array<u16, N>& strides) {
std::array<VkDeviceSize, N> expanded;
std::copy(strides.begin(), strides.end(), expanded.begin());
return expanded;
}
ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
if (entry.is_buffer) { if (entry.is_buffer) {
return ImageViewType::e2D; return ImageViewType::e2D;
@ -221,190 +209,25 @@ void PushImageDescriptors(const ShaderEntries& entries, TextureCache& texture_ca
} }
} }
} // Anonymous namespace DrawParams MakeDrawParams(const Maxwell& regs, u32 num_instances, bool is_instanced,
bool is_indexed) {
class BufferBindings final { DrawParams params{
public: .base_instance = regs.vb_base_instance,
void AddVertexBinding(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, u32 stride) { .num_instances = is_instanced ? num_instances : 1,
vertex.buffers[vertex.num_buffers] = buffer; .base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first,
vertex.offsets[vertex.num_buffers] = offset; .num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count,
vertex.sizes[vertex.num_buffers] = size; .is_indexed = is_indexed,
vertex.strides[vertex.num_buffers] = static_cast<u16>(stride);
++vertex.num_buffers;
}
void SetIndexBinding(VkBuffer buffer, VkDeviceSize offset, VkIndexType type) {
index.buffer = buffer;
index.offset = offset;
index.type = type;
}
void Bind(const Device& device, VKScheduler& scheduler) const {
// Use this large switch case to avoid dispatching more memory in the record lambda than
// what we need. It looks horrible, but it's the best we can do on standard C++.
switch (vertex.num_buffers) {
case 0:
return BindStatic<0>(device, scheduler);
case 1:
return BindStatic<1>(device, scheduler);
case 2:
return BindStatic<2>(device, scheduler);
case 3:
return BindStatic<3>(device, scheduler);
case 4:
return BindStatic<4>(device, scheduler);
case 5:
return BindStatic<5>(device, scheduler);
case 6:
return BindStatic<6>(device, scheduler);
case 7:
return BindStatic<7>(device, scheduler);
case 8:
return BindStatic<8>(device, scheduler);
case 9:
return BindStatic<9>(device, scheduler);
case 10:
return BindStatic<10>(device, scheduler);
case 11:
return BindStatic<11>(device, scheduler);
case 12:
return BindStatic<12>(device, scheduler);
case 13:
return BindStatic<13>(device, scheduler);
case 14:
return BindStatic<14>(device, scheduler);
case 15:
return BindStatic<15>(device, scheduler);
case 16:
return BindStatic<16>(device, scheduler);
case 17:
return BindStatic<17>(device, scheduler);
case 18:
return BindStatic<18>(device, scheduler);
case 19:
return BindStatic<19>(device, scheduler);
case 20:
return BindStatic<20>(device, scheduler);
case 21:
return BindStatic<21>(device, scheduler);
case 22:
return BindStatic<22>(device, scheduler);
case 23:
return BindStatic<23>(device, scheduler);
case 24:
return BindStatic<24>(device, scheduler);
case 25:
return BindStatic<25>(device, scheduler);
case 26:
return BindStatic<26>(device, scheduler);
case 27:
return BindStatic<27>(device, scheduler);
case 28:
return BindStatic<28>(device, scheduler);
case 29:
return BindStatic<29>(device, scheduler);
case 30:
return BindStatic<30>(device, scheduler);
case 31:
return BindStatic<31>(device, scheduler);
case 32:
return BindStatic<32>(device, scheduler);
}
UNREACHABLE();
}
private:
// Some of these fields are intentionally left uninitialized to avoid initializing them twice.
struct {
size_t num_buffers = 0;
std::array<VkBuffer, Maxwell::NumVertexArrays> buffers;
std::array<VkDeviceSize, Maxwell::NumVertexArrays> offsets;
std::array<VkDeviceSize, Maxwell::NumVertexArrays> sizes;
std::array<u16, Maxwell::NumVertexArrays> strides;
} vertex;
struct {
VkBuffer buffer = nullptr;
VkDeviceSize offset;
VkIndexType type;
} index;
template <size_t N>
void BindStatic(const Device& device, VKScheduler& scheduler) const {
if (device.IsExtExtendedDynamicStateSupported()) {
if (index.buffer) {
BindStatic<N, true, true>(scheduler);
} else {
BindStatic<N, false, true>(scheduler);
}
} else {
if (index.buffer) {
BindStatic<N, true, false>(scheduler);
} else {
BindStatic<N, false, false>(scheduler);
}
}
}
template <size_t N, bool is_indexed, bool has_extended_dynamic_state>
void BindStatic(VKScheduler& scheduler) const {
static_assert(N <= Maxwell::NumVertexArrays);
if constexpr (N == 0) {
return;
}
std::array<VkBuffer, N> buffers;
std::array<VkDeviceSize, N> offsets;
std::copy(vertex.buffers.begin(), vertex.buffers.begin() + N, buffers.begin());
std::copy(vertex.offsets.begin(), vertex.offsets.begin() + N, offsets.begin());
if constexpr (has_extended_dynamic_state) {
// With extended dynamic states we can specify the length and stride of a vertex buffer
std::array<VkDeviceSize, N> sizes;
std::array<u16, N> strides;
std::copy(vertex.sizes.begin(), vertex.sizes.begin() + N, sizes.begin());
std::copy(vertex.strides.begin(), vertex.strides.begin() + N, strides.begin());
if constexpr (is_indexed) {
scheduler.Record(
[buffers, offsets, sizes, strides, index = index](vk::CommandBuffer cmdbuf) {
cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type);
cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(),
offsets.data(), sizes.data(),
ExpandStrides(strides).data());
});
} else {
scheduler.Record([buffers, offsets, sizes, strides](vk::CommandBuffer cmdbuf) {
cmdbuf.BindVertexBuffers2EXT(0, static_cast<u32>(N), buffers.data(),
offsets.data(), sizes.data(),
ExpandStrides(strides).data());
});
}
return;
}
if constexpr (is_indexed) {
// Indexed draw
scheduler.Record([buffers, offsets, index = index](vk::CommandBuffer cmdbuf) {
cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type);
cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data());
});
} else {
// Array draw
scheduler.Record([buffers, offsets](vk::CommandBuffer cmdbuf) {
cmdbuf.BindVertexBuffers(0, static_cast<u32>(N), buffers.data(), offsets.data());
});
}
}
}; };
if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) {
void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf) const { // 6 triangle vertices per quad, base vertex is part of the index
if (is_indexed) { // See BindQuadArrayIndexBuffer for more details
cmdbuf.DrawIndexed(num_vertices, num_instances, 0, base_vertex, base_instance); params.num_vertices = (params.num_vertices / 4) * 6;
} else { params.base_vertex = 0;
cmdbuf.Draw(num_vertices, num_instances, base_vertex, base_instance); params.is_indexed = true;
} }
return params;
} }
} // Anonymous namespace
RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
Tegra::MemoryManager& gpu_memory_, Tegra::MemoryManager& gpu_memory_,
@ -414,21 +237,19 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
: RasterizerAccelerated{cpu_memory_}, gpu{gpu_}, : RasterizerAccelerated{cpu_memory_}, gpu{gpu_},
gpu_memory{gpu_memory_}, maxwell3d{gpu.Maxwell3D()}, kepler_compute{gpu.KeplerCompute()}, gpu_memory{gpu_memory_}, maxwell3d{gpu.Maxwell3D()}, kepler_compute{gpu.KeplerCompute()},
screen_info{screen_info_}, device{device_}, memory_allocator{memory_allocator_}, screen_info{screen_info_}, device{device_}, memory_allocator{memory_allocator_},
state_tracker{state_tracker_}, scheduler{scheduler_}, stream_buffer(device, scheduler), state_tracker{state_tracker_}, scheduler{scheduler_},
staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler), staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler),
update_descriptor_queue(device, scheduler), update_descriptor_queue(device, scheduler),
blit_image(device, scheduler, state_tracker, descriptor_pool), blit_image(device, scheduler, state_tracker, descriptor_pool),
quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
quad_indexed_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
texture_cache_runtime{device, scheduler, memory_allocator, staging_pool, blit_image}, texture_cache_runtime{device, scheduler, memory_allocator, staging_pool, blit_image},
texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool,
update_descriptor_queue, descriptor_pool),
buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime),
pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler, pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler,
descriptor_pool, update_descriptor_queue), descriptor_pool, update_descriptor_queue),
buffer_cache(*this, gpu_memory, cpu_memory_, device, memory_allocator, scheduler,
stream_buffer, staging_pool),
query_cache{*this, maxwell3d, gpu_memory, device, scheduler}, query_cache{*this, maxwell3d, gpu_memory, device, scheduler},
fence_manager(*this, gpu, gpu_memory, texture_cache, buffer_cache, query_cache, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler),
wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window_) { wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window_) {
scheduler.SetQueryCache(query_cache); scheduler.SetQueryCache(query_cache);
if (device.UseAsynchronousShaders()) { if (device.UseAsynchronousShaders()) {
@ -449,22 +270,14 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
GraphicsPipelineCacheKey key; GraphicsPipelineCacheKey key;
key.fixed_state.Fill(maxwell3d.regs, device.IsExtExtendedDynamicStateSupported()); key.fixed_state.Fill(maxwell3d.regs, device.IsExtExtendedDynamicStateSupported());
buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed)); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
BufferBindings buffer_bindings;
const DrawParameters draw_params =
SetupGeometry(key.fixed_state, buffer_bindings, is_indexed, is_instanced);
auto lock = texture_cache.AcquireLock();
texture_cache.SynchronizeGraphicsDescriptors(); texture_cache.SynchronizeGraphicsDescriptors();
texture_cache.UpdateRenderTargets(false); texture_cache.UpdateRenderTargets(false);
const auto shaders = pipeline_cache.GetShaders(); const auto shaders = pipeline_cache.GetShaders();
key.shaders = GetShaderAddresses(shaders); key.shaders = GetShaderAddresses(shaders);
SetupShaderDescriptors(shaders); SetupShaderDescriptors(shaders, is_indexed);
buffer_cache.Unmap();
const Framebuffer* const framebuffer = texture_cache.GetFramebuffer(); const Framebuffer* const framebuffer = texture_cache.GetFramebuffer();
key.renderpass = framebuffer->RenderPass(); key.renderpass = framebuffer->RenderPass();
@ -476,22 +289,29 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
return; return;
} }
buffer_bindings.Bind(device, scheduler);
BeginTransformFeedback(); BeginTransformFeedback();
scheduler.RequestRenderpass(framebuffer); scheduler.RequestRenderpass(framebuffer);
scheduler.BindGraphicsPipeline(pipeline->GetHandle()); scheduler.BindGraphicsPipeline(pipeline->GetHandle());
UpdateDynamicStates(); UpdateDynamicStates();
const auto pipeline_layout = pipeline->GetLayout(); const auto& regs = maxwell3d.regs;
const auto descriptor_set = pipeline->CommitDescriptorSet(); const u32 num_instances = maxwell3d.mme_draw.instance_count;
const DrawParams draw_params = MakeDrawParams(regs, num_instances, is_instanced, is_indexed);
const VkPipelineLayout pipeline_layout = pipeline->GetLayout();
const VkDescriptorSet descriptor_set = pipeline->CommitDescriptorSet();
scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) { scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) {
if (descriptor_set) { if (descriptor_set) {
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout,
DESCRIPTOR_SET, descriptor_set, {}); DESCRIPTOR_SET, descriptor_set, nullptr);
}
if (draw_params.is_indexed) {
cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances, 0,
draw_params.base_vertex, draw_params.base_instance);
} else {
cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances,
draw_params.base_vertex, draw_params.base_instance);
} }
draw_params.Draw(cmdbuf);
}); });
EndTransformFeedback(); EndTransformFeedback();
@ -515,7 +335,7 @@ void RasterizerVulkan::Clear() {
return; return;
} }
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.UpdateRenderTargets(true); texture_cache.UpdateRenderTargets(true);
const Framebuffer* const framebuffer = texture_cache.GetFramebuffer(); const Framebuffer* const framebuffer = texture_cache.GetFramebuffer();
const VkExtent2D render_area = framebuffer->RenderArea(); const VkExtent2D render_area = framebuffer->RenderArea();
@ -559,7 +379,6 @@ void RasterizerVulkan::Clear() {
if (use_stencil) { if (use_stencil) {
aspect_flags |= VK_IMAGE_ASPECT_STENCIL_BIT; aspect_flags |= VK_IMAGE_ASPECT_STENCIL_BIT;
} }
scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil,
clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) { clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) {
VkClearAttachment attachment; VkClearAttachment attachment;
@ -580,8 +399,7 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
auto& pipeline = pipeline_cache.GetComputePipeline({ auto& pipeline = pipeline_cache.GetComputePipeline({
.shader = code_addr, .shader = code_addr,
.shared_memory_size = launch_desc.shared_alloc, .shared_memory_size = launch_desc.shared_alloc,
.workgroup_size = .workgroup_size{
{
launch_desc.block_dim_x, launch_desc.block_dim_x,
launch_desc.block_dim_y, launch_desc.block_dim_y,
launch_desc.block_dim_z, launch_desc.block_dim_z,
@ -594,10 +412,21 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
image_view_indices.clear(); image_view_indices.clear();
sampler_handles.clear(); sampler_handles.clear();
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex};
texture_cache.SynchronizeComputeDescriptors();
const auto& entries = pipeline.GetEntries(); const auto& entries = pipeline.GetEntries();
buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers);
buffer_cache.UnbindComputeStorageBuffers();
u32 ssbo_index = 0;
for (const auto& buffer : entries.global_buffers) {
buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset,
buffer.is_written);
++ssbo_index;
}
buffer_cache.UpdateComputeBuffers();
texture_cache.SynchronizeComputeDescriptors();
SetupComputeUniformTexels(entries); SetupComputeUniformTexels(entries);
SetupComputeTextures(entries); SetupComputeTextures(entries);
SetupComputeStorageTexels(entries); SetupComputeStorageTexels(entries);
@ -606,20 +435,15 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
const std::span indices_span(image_view_indices.data(), image_view_indices.size()); const std::span indices_span(image_view_indices.data(), image_view_indices.size());
texture_cache.FillComputeImageViews(indices_span, image_view_ids); texture_cache.FillComputeImageViews(indices_span, image_view_ids);
buffer_cache.Map(CalculateComputeStreamBufferSize());
update_descriptor_queue.Acquire(); update_descriptor_queue.Acquire();
SetupComputeConstBuffers(entries); buffer_cache.BindHostComputeBuffers();
SetupComputeGlobalBuffers(entries);
ImageViewId* image_view_id_ptr = image_view_ids.data(); ImageViewId* image_view_id_ptr = image_view_ids.data();
VkSampler* sampler_ptr = sampler_handles.data(); VkSampler* sampler_ptr = sampler_handles.data();
PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr, PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr,
sampler_ptr); sampler_ptr);
buffer_cache.Unmap();
const VkPipeline pipeline_handle = pipeline.GetHandle(); const VkPipeline pipeline_handle = pipeline.GetHandle();
const VkPipelineLayout pipeline_layout = pipeline.GetLayout(); const VkPipelineLayout pipeline_layout = pipeline.GetLayout();
const VkDescriptorSet descriptor_set = pipeline.CommitDescriptorSet(); const VkDescriptorSet descriptor_set = pipeline.CommitDescriptorSet();
@ -644,6 +468,11 @@ void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type,
query_cache.Query(gpu_addr, type, timestamp); query_cache.Query(gpu_addr, type, timestamp);
} }
void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
u32 size) {
buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size);
}
void RasterizerVulkan::FlushAll() {} void RasterizerVulkan::FlushAll() {}
void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) { void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) {
@ -651,19 +480,23 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) {
return; return;
} }
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.DownloadMemory(addr, size); texture_cache.DownloadMemory(addr, size);
} }
buffer_cache.FlushRegion(addr, size); {
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.DownloadMemory(addr, size);
}
query_cache.FlushRegion(addr, size); query_cache.FlushRegion(addr, size);
} }
bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) { bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) {
std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex};
if (!Settings::IsGPULevelHigh()) { if (!Settings::IsGPULevelHigh()) {
return buffer_cache.MustFlushRegion(addr, size); return buffer_cache.IsRegionGpuModified(addr, size);
} }
return texture_cache.IsRegionGpuModified(addr, size) || return texture_cache.IsRegionGpuModified(addr, size) ||
buffer_cache.MustFlushRegion(addr, size); buffer_cache.IsRegionGpuModified(addr, size);
} }
void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) { void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) {
@ -671,11 +504,14 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) {
return; return;
} }
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.WriteMemory(addr, size); texture_cache.WriteMemory(addr, size);
} }
{
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.WriteMemory(addr, size);
}
pipeline_cache.InvalidateRegion(addr, size); pipeline_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
query_cache.InvalidateRegion(addr, size); query_cache.InvalidateRegion(addr, size);
} }
@ -683,25 +519,34 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
if (addr == 0 || size == 0) { if (addr == 0 || size == 0) {
return; return;
} }
pipeline_cache.OnCPUWrite(addr, size);
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.WriteMemory(addr, size); texture_cache.WriteMemory(addr, size);
} }
pipeline_cache.OnCPUWrite(addr, size); {
buffer_cache.OnCPUWrite(addr, size); std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.CachedWriteMemory(addr, size);
}
} }
void RasterizerVulkan::SyncGuestHost() { void RasterizerVulkan::SyncGuestHost() {
buffer_cache.SyncGuestHost();
pipeline_cache.SyncGuestHost(); pipeline_cache.SyncGuestHost();
{
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.FlushCachedWrites();
}
} }
void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) { void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) {
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.UnmapMemory(addr, size); texture_cache.UnmapMemory(addr, size);
} }
buffer_cache.OnCPUWrite(addr, size); {
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.WriteMemory(addr, size);
}
pipeline_cache.OnCPUWrite(addr, size); pipeline_cache.OnCPUWrite(addr, size);
} }
@ -774,18 +619,21 @@ void RasterizerVulkan::TickFrame() {
draw_counter = 0; draw_counter = 0;
update_descriptor_queue.TickFrame(); update_descriptor_queue.TickFrame();
fence_manager.TickFrame(); fence_manager.TickFrame();
buffer_cache.TickFrame();
staging_pool.TickFrame(); staging_pool.TickFrame();
{ {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.TickFrame(); texture_cache.TickFrame();
} }
{
std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.TickFrame();
}
} }
bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) { const Tegra::Engines::Fermi2D::Config& copy_config) {
auto lock = texture_cache.AcquireLock(); std::scoped_lock lock{texture_cache.mutex};
texture_cache.BlitImage(dst, src, copy_config); texture_cache.BlitImage(dst, src, copy_config);
return true; return true;
} }
@ -795,13 +643,11 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
if (!framebuffer_addr) { if (!framebuffer_addr) {
return false; return false;
} }
std::scoped_lock lock{texture_cache.mutex};
auto lock = texture_cache.AcquireLock();
ImageView* const image_view = texture_cache.TryFindFramebufferImageView(framebuffer_addr); ImageView* const image_view = texture_cache.TryFindFramebufferImageView(framebuffer_addr);
if (!image_view) { if (!image_view) {
return false; return false;
} }
screen_info.image_view = image_view->Handle(VideoCommon::ImageViewType::e2D); screen_info.image_view = image_view->Handle(VideoCommon::ImageViewType::e2D);
screen_info.width = image_view->size.width; screen_info.width = image_view->size.width;
screen_info.height = image_view->size.height; screen_info.height = image_view->size.height;
@ -830,29 +676,8 @@ void RasterizerVulkan::FlushWork() {
draw_counter = 0; draw_counter = 0;
} }
RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineState& fixed_state,
BufferBindings& buffer_bindings,
bool is_indexed,
bool is_instanced) {
MICROPROFILE_SCOPE(Vulkan_Geometry);
const auto& regs = maxwell3d.regs;
SetupVertexArrays(buffer_bindings);
const u32 base_instance = regs.vb_base_instance;
const u32 num_instances = is_instanced ? maxwell3d.mme_draw.instance_count : 1;
const u32 base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first;
const u32 num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count;
DrawParameters params{base_instance, num_instances, base_vertex, num_vertices, is_indexed};
SetupIndexBuffer(buffer_bindings, params, is_indexed);
return params;
}
void RasterizerVulkan::SetupShaderDescriptors( void RasterizerVulkan::SetupShaderDescriptors(
const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) { const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders, bool is_indexed) {
image_view_indices.clear(); image_view_indices.clear();
sampler_handles.clear(); sampler_handles.clear();
for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
@ -860,15 +685,27 @@ void RasterizerVulkan::SetupShaderDescriptors(
if (!shader) { if (!shader) {
continue; continue;
} }
const auto& entries = shader->GetEntries(); const ShaderEntries& entries = shader->GetEntries();
SetupGraphicsUniformTexels(entries, stage); SetupGraphicsUniformTexels(entries, stage);
SetupGraphicsTextures(entries, stage); SetupGraphicsTextures(entries, stage);
SetupGraphicsStorageTexels(entries, stage); SetupGraphicsStorageTexels(entries, stage);
SetupGraphicsImages(entries, stage); SetupGraphicsImages(entries, stage);
buffer_cache.SetEnabledUniformBuffers(stage, entries.enabled_uniform_buffers);
buffer_cache.UnbindGraphicsStorageBuffers(stage);
u32 ssbo_index = 0;
for (const auto& buffer : entries.global_buffers) {
buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index,
buffer.cbuf_offset, buffer.is_written);
++ssbo_index;
}
} }
const std::span indices_span(image_view_indices.data(), image_view_indices.size()); const std::span indices_span(image_view_indices.data(), image_view_indices.size());
buffer_cache.UpdateGraphicsBuffers(is_indexed);
texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);
buffer_cache.BindHostGeometryBuffers(is_indexed);
update_descriptor_queue.Acquire(); update_descriptor_queue.Acquire();
ImageViewId* image_view_id_ptr = image_view_ids.data(); ImageViewId* image_view_id_ptr = image_view_ids.data();
@ -879,11 +716,9 @@ void RasterizerVulkan::SetupShaderDescriptors(
if (!shader) { if (!shader) {
continue; continue;
} }
const auto& entries = shader->GetEntries(); buffer_cache.BindHostStageBuffers(stage);
SetupGraphicsConstBuffers(entries, stage); PushImageDescriptors(shader->GetEntries(), texture_cache, update_descriptor_queue,
SetupGraphicsGlobalBuffers(entries, stage); image_view_id_ptr, sampler_ptr);
PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr,
sampler_ptr);
} }
} }
@ -916,27 +751,11 @@ void RasterizerVulkan::BeginTransformFeedback() {
LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
return; return;
} }
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry));
scheduler.Record(
UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable); [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); });
UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable);
UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable);
const auto& binding = regs.tfb_bindings[0];
UNIMPLEMENTED_IF(binding.buffer_enable == 0);
UNIMPLEMENTED_IF(binding.buffer_offset != 0);
const GPUVAddr gpu_addr = binding.Address();
const VkDeviceSize size = static_cast<VkDeviceSize>(binding.buffer_size);
const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true);
scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) {
cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size);
cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr);
});
} }
void RasterizerVulkan::EndTransformFeedback() { void RasterizerVulkan::EndTransformFeedback() {
@ -947,104 +766,11 @@ void RasterizerVulkan::EndTransformFeedback() {
if (!device.IsExtTransformFeedbackSupported()) { if (!device.IsExtTransformFeedbackSupported()) {
return; return;
} }
scheduler.Record( scheduler.Record(
[](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
} }
void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) {
const auto& regs = maxwell3d.regs;
for (size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
const auto& vertex_array = regs.vertex_array[index];
if (!vertex_array.IsEnabled()) {
continue;
}
const GPUVAddr start{vertex_array.StartAddress()};
const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
ASSERT(end >= start);
const size_t size = end - start;
if (size == 0) {
buffer_bindings.AddVertexBinding(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE, 0);
continue;
}
const auto info = buffer_cache.UploadMemory(start, size);
buffer_bindings.AddVertexBinding(info.handle, info.offset, size, vertex_array.stride);
}
}
void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params,
bool is_indexed) {
if (params.num_vertices == 0) {
return;
}
const auto& regs = maxwell3d.regs;
switch (regs.draw.topology) {
case Maxwell::PrimitiveTopology::Quads: {
if (!params.is_indexed) {
const auto [buffer, offset] =
quad_array_pass.Assemble(params.num_vertices, params.base_vertex);
buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32);
params.base_vertex = 0;
params.num_vertices = params.num_vertices * 6 / 4;
params.is_indexed = true;
break;
}
const GPUVAddr gpu_addr = regs.index_array.IndexStart();
const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
VkBuffer buffer = info.handle;
u64 offset = info.offset;
std::tie(buffer, offset) = quad_indexed_pass.Assemble(
regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset);
buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32);
params.num_vertices = (params.num_vertices / 4) * 6;
params.base_vertex = 0;
break;
}
default: {
if (!is_indexed) {
break;
}
const GPUVAddr gpu_addr = regs.index_array.IndexStart();
const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize());
VkBuffer buffer = info.handle;
u64 offset = info.offset;
auto format = regs.index_array.format;
const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte;
if (is_uint8 && !device.IsExtIndexTypeUint8Supported()) {
std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, buffer, offset);
format = Maxwell::IndexFormat::UnsignedShort;
}
buffer_bindings.SetIndexBinding(buffer, offset, MaxwellToVK::IndexFormat(device, format));
break;
}
}
}
void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_ConstBuffers);
const auto& shader_stage = maxwell3d.state.shader_stages[stage];
for (const auto& entry : entries.const_buffers) {
SetupConstBuffer(entry, shader_stage.const_buffers[entry.GetIndex()]);
}
}
void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_GlobalBuffers);
const auto& cbufs{maxwell3d.state.shader_stages[stage]};
for (const auto& entry : entries.global_buffers) {
const auto addr = cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset();
SetupGlobalBuffer(entry, addr);
}
}
void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, size_t stage) { void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Textures);
const auto& regs = maxwell3d.regs; const auto& regs = maxwell3d.regs;
const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.uniform_texels) { for (const auto& entry : entries.uniform_texels) {
@ -1054,7 +780,6 @@ void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries,
} }
void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_t stage) { void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Textures);
const auto& regs = maxwell3d.regs; const auto& regs = maxwell3d.regs;
const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.samplers) { for (const auto& entry : entries.samplers) {
@ -1070,7 +795,6 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_
} }
void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, size_t stage) { void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Textures);
const auto& regs = maxwell3d.regs; const auto& regs = maxwell3d.regs;
const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.storage_texels) { for (const auto& entry : entries.storage_texels) {
@ -1080,7 +804,6 @@ void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries,
} }
void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t stage) { void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Images);
const auto& regs = maxwell3d.regs; const auto& regs = maxwell3d.regs;
const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.images) { for (const auto& entry : entries.images) {
@ -1089,32 +812,7 @@ void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t
} }
} }
void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_ConstBuffers);
const auto& launch_desc = kepler_compute.launch_description;
for (const auto& entry : entries.const_buffers) {
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
const Tegra::Engines::ConstBufferInfo info{
.address = config.Address(),
.size = config.size,
.enabled = mask[entry.GetIndex()],
};
SetupConstBuffer(entry, info);
}
}
void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_GlobalBuffers);
const auto& cbufs{kepler_compute.launch_description.const_buffer_config};
for (const auto& entry : entries.global_buffers) {
const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()};
SetupGlobalBuffer(entry, addr);
}
}
void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Textures);
const bool via_header_index = kepler_compute.launch_description.linked_tsc; const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.uniform_texels) { for (const auto& entry : entries.uniform_texels) {
const TextureHandle handle = const TextureHandle handle =
@ -1124,7 +822,6 @@ void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
} }
void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Textures);
const bool via_header_index = kepler_compute.launch_description.linked_tsc; const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.samplers) { for (const auto& entry : entries.samplers) {
for (size_t index = 0; index < entry.size; ++index) { for (size_t index = 0; index < entry.size; ++index) {
@ -1139,7 +836,6 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
} }
void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Textures);
const bool via_header_index = kepler_compute.launch_description.linked_tsc; const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.storage_texels) { for (const auto& entry : entries.storage_texels) {
const TextureHandle handle = const TextureHandle handle =
@ -1149,7 +845,6 @@ void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
} }
void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Images);
const bool via_header_index = kepler_compute.launch_description.linked_tsc; const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.images) { for (const auto& entry : entries.images) {
const TextureHandle handle = const TextureHandle handle =
@ -1158,42 +853,6 @@ void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
} }
} }
void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
const Tegra::Engines::ConstBufferInfo& buffer) {
if (!buffer.enabled) {
// Set values to zero to unbind buffers
update_descriptor_queue.AddBuffer(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE);
return;
}
// Align the size to avoid bad std140 interactions
const size_t size = Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
ASSERT(size <= MaxConstbufferSize);
const u64 alignment = device.GetUniformBufferAlignment();
const auto info = buffer_cache.UploadMemory(buffer.address, size, alignment);
update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
}
void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) {
const u64 actual_addr = gpu_memory.Read<u64>(address);
const u32 size = gpu_memory.Read<u32>(address + 8);
if (size == 0) {
// Sometimes global memory pointers don't have a proper size. Upload a dummy entry
// because Vulkan doesn't like empty buffers.
// Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the
// default buffer.
static constexpr size_t dummy_size = 4;
const auto info = buffer_cache.GetEmptyBuffer(dummy_size);
update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);
return;
}
const auto info = buffer_cache.UploadMemory(
actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten());
update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
}
void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) {
if (!state_tracker.TouchViewports()) { if (!state_tracker.TouchViewports()) {
return; return;
@ -1206,7 +865,8 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg
GetViewportState(device, regs, 8), GetViewportState(device, regs, 9), GetViewportState(device, regs, 8), GetViewportState(device, regs, 9),
GetViewportState(device, regs, 10), GetViewportState(device, regs, 11), GetViewportState(device, regs, 10), GetViewportState(device, regs, 11),
GetViewportState(device, regs, 12), GetViewportState(device, regs, 13), GetViewportState(device, regs, 12), GetViewportState(device, regs, 13),
GetViewportState(device, regs, 14), GetViewportState(device, regs, 15)}; GetViewportState(device, regs, 14), GetViewportState(device, regs, 15),
};
scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); }); scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); });
} }
@ -1214,13 +874,14 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs
if (!state_tracker.TouchScissors()) { if (!state_tracker.TouchScissors()) {
return; return;
} }
const std::array scissors = { const std::array scissors{
GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2), GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2),
GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5), GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5),
GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8), GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8),
GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11), GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11),
GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14), GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14),
GetScissorState(regs, 15)}; GetScissorState(regs, 15),
};
scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); }); scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); });
} }
@ -1385,73 +1046,4 @@ void RasterizerVulkan::UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs&
}); });
} }
size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const {
size_t size = CalculateVertexArraysSize();
if (is_indexed) {
size = Common::AlignUp(size, 4) + CalculateIndexBufferSize();
}
size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment());
return size;
}
size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const {
return Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
}
size_t RasterizerVulkan::CalculateVertexArraysSize() const {
const auto& regs = maxwell3d.regs;
size_t size = 0;
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
// This implementation assumes that all attributes are used in the shader.
const GPUVAddr start{regs.vertex_array[index].StartAddress()};
const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
DEBUG_ASSERT(end >= start);
size += (end - start) * regs.vertex_array[index].enable;
}
return size;
}
size_t RasterizerVulkan::CalculateIndexBufferSize() const {
return static_cast<size_t>(maxwell3d.regs.index_array.count) *
static_cast<size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
}
size_t RasterizerVulkan::CalculateConstBufferSize(
const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) const {
if (entry.IsIndirect()) {
// Buffer is accessed indirectly, so upload the entire thing
return buffer.size;
} else {
// Buffer is accessed directly, upload just what we use
return entry.GetSize();
}
}
VkBuffer RasterizerVulkan::DefaultBuffer() {
if (default_buffer) {
return *default_buffer;
}
default_buffer = device.GetLogical().CreateBuffer({
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = DEFAULT_BUFFER_SIZE,
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT |
VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
});
default_buffer_commit = memory_allocator.Commit(default_buffer, MemoryUsage::DeviceLocal);
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([buffer = *default_buffer](vk::CommandBuffer cmdbuf) {
cmdbuf.FillBuffer(buffer, 0, DEFAULT_BUFFER_SIZE, 0);
});
return *default_buffer;
}
} // namespace Vulkan } // namespace Vulkan

View file

@ -18,14 +18,12 @@
#include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/blit_image.h"
#include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_fence_manager.h" #include "video_core/renderer_vulkan/vk_fence_manager.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h"
#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h"
#include "video_core/shader/async_shaders.h" #include "video_core/shader/async_shaders.h"
@ -49,7 +47,6 @@ namespace Vulkan {
struct VKScreenInfo; struct VKScreenInfo;
class StateTracker; class StateTracker;
class BufferBindings;
class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {
public: public:
@ -65,6 +62,7 @@ public:
void DispatchCompute(GPUVAddr code_addr) override; void DispatchCompute(GPUVAddr code_addr) override;
void ResetCounter(VideoCore::QueryType type) override; void ResetCounter(VideoCore::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override;
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void FlushAll() override; void FlushAll() override;
void FlushRegion(VAddr addr, u64 size) override; void FlushRegion(VAddr addr, u64 size) override;
bool MustFlushRegion(VAddr addr, u64 size) override; bool MustFlushRegion(VAddr addr, u64 size) override;
@ -107,24 +105,11 @@ private:
static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float); static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float);
struct DrawParameters {
void Draw(vk::CommandBuffer cmdbuf) const;
u32 base_instance = 0;
u32 num_instances = 0;
u32 base_vertex = 0;
u32 num_vertices = 0;
bool is_indexed = 0;
};
void FlushWork(); void FlushWork();
/// Setups geometry buffers and state.
DrawParameters SetupGeometry(FixedPipelineState& fixed_state, BufferBindings& buffer_bindings,
bool is_indexed, bool is_instanced);
/// Setup descriptors in the graphics pipeline. /// Setup descriptors in the graphics pipeline.
void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders); void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders,
bool is_indexed);
void UpdateDynamicStates(); void UpdateDynamicStates();
@ -132,16 +117,6 @@ private:
void EndTransformFeedback(); void EndTransformFeedback();
void SetupVertexArrays(BufferBindings& buffer_bindings);
void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed);
/// Setup constant buffers in the graphics pipeline.
void SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage);
/// Setup global buffers in the graphics pipeline.
void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage);
/// Setup uniform texels in the graphics pipeline. /// Setup uniform texels in the graphics pipeline.
void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage); void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage);
@ -154,12 +129,6 @@ private:
/// Setup images in the graphics pipeline. /// Setup images in the graphics pipeline.
void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage); void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage);
/// Setup constant buffers in the compute pipeline.
void SetupComputeConstBuffers(const ShaderEntries& entries);
/// Setup global buffers in the compute pipeline.
void SetupComputeGlobalBuffers(const ShaderEntries& entries);
/// Setup texel buffers in the compute pipeline. /// Setup texel buffers in the compute pipeline.
void SetupComputeUniformTexels(const ShaderEntries& entries); void SetupComputeUniformTexels(const ShaderEntries& entries);
@ -172,11 +141,6 @@ private:
/// Setup images in the compute pipeline. /// Setup images in the compute pipeline.
void SetupComputeImages(const ShaderEntries& entries); void SetupComputeImages(const ShaderEntries& entries);
void SetupConstBuffer(const ConstBufferEntry& entry,
const Tegra::Engines::ConstBufferInfo& buffer);
void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address);
void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs);
void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs);
@ -193,19 +157,6 @@ private:
void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs);
void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs);
size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const;
size_t CalculateComputeStreamBufferSize() const;
size_t CalculateVertexArraysSize() const;
size_t CalculateIndexBufferSize() const;
size_t CalculateConstBufferSize(const ConstBufferEntry& entry,
const Tegra::Engines::ConstBufferInfo& buffer) const;
VkBuffer DefaultBuffer();
Tegra::GPU& gpu; Tegra::GPU& gpu;
Tegra::MemoryManager& gpu_memory; Tegra::MemoryManager& gpu_memory;
Tegra::Engines::Maxwell3D& maxwell3d; Tegra::Engines::Maxwell3D& maxwell3d;
@ -217,24 +168,19 @@ private:
StateTracker& state_tracker; StateTracker& state_tracker;
VKScheduler& scheduler; VKScheduler& scheduler;
VKStreamBuffer stream_buffer;
StagingBufferPool staging_pool; StagingBufferPool staging_pool;
VKDescriptorPool descriptor_pool; VKDescriptorPool descriptor_pool;
VKUpdateDescriptorQueue update_descriptor_queue; VKUpdateDescriptorQueue update_descriptor_queue;
BlitImageHelper blit_image; BlitImageHelper blit_image;
QuadArrayPass quad_array_pass;
QuadIndexedPass quad_indexed_pass;
Uint8Pass uint8_pass;
TextureCacheRuntime texture_cache_runtime; TextureCacheRuntime texture_cache_runtime;
TextureCache texture_cache; TextureCache texture_cache;
BufferCacheRuntime buffer_cache_runtime;
BufferCache buffer_cache;
VKPipelineCache pipeline_cache; VKPipelineCache pipeline_cache;
VKBufferCache buffer_cache;
VKQueryCache query_cache; VKQueryCache query_cache;
VKFenceManager fence_manager; VKFenceManager fence_manager;
vk::Buffer default_buffer;
MemoryCommit default_buffer_commit;
vk::Event wfi_event; vk::Event wfi_event;
VideoCommon::Shader::AsyncShaders async_shaders; VideoCommon::Shader::AsyncShaders async_shaders;

View file

@ -52,18 +52,6 @@ VKScheduler::~VKScheduler() {
worker_thread.join(); worker_thread.join();
} }
u64 VKScheduler::CurrentTick() const noexcept {
return master_semaphore->CurrentTick();
}
bool VKScheduler::IsFree(u64 tick) const noexcept {
return master_semaphore->IsFree(tick);
}
void VKScheduler::Wait(u64 tick) {
master_semaphore->Wait(tick);
}
void VKScheduler::Flush(VkSemaphore semaphore) { void VKScheduler::Flush(VkSemaphore semaphore) {
SubmitExecution(semaphore); SubmitExecution(semaphore);
AllocateNewContext(); AllocateNewContext();
@ -269,7 +257,7 @@ void VKScheduler::EndRenderPass() {
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, 0, nullptr, nullptr, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, nullptr, nullptr,
vk::Span(barriers.data(), num_images)); vk::Span(barriers.data(), num_images));
}); });
state.renderpass = nullptr; state.renderpass = nullptr;

View file

@ -14,6 +14,7 @@
#include "common/alignment.h" #include "common/alignment.h"
#include "common/common_types.h" #include "common/common_types.h"
#include "common/threadsafe_queue.h" #include "common/threadsafe_queue.h"
#include "video_core/renderer_vulkan/vk_master_semaphore.h"
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan { namespace Vulkan {
@ -21,7 +22,6 @@ namespace Vulkan {
class CommandPool; class CommandPool;
class Device; class Device;
class Framebuffer; class Framebuffer;
class MasterSemaphore;
class StateTracker; class StateTracker;
class VKQueryCache; class VKQueryCache;
@ -32,15 +32,6 @@ public:
explicit VKScheduler(const Device& device, StateTracker& state_tracker); explicit VKScheduler(const Device& device, StateTracker& state_tracker);
~VKScheduler(); ~VKScheduler();
/// Returns the current command buffer tick.
[[nodiscard]] u64 CurrentTick() const noexcept;
/// Returns true when a tick has been triggered by the GPU.
[[nodiscard]] bool IsFree(u64 tick) const noexcept;
/// Waits for the given tick to trigger on the GPU.
void Wait(u64 tick);
/// Sends the current execution context to the GPU. /// Sends the current execution context to the GPU.
void Flush(VkSemaphore semaphore = nullptr); void Flush(VkSemaphore semaphore = nullptr);
@ -82,6 +73,21 @@ public:
(void)chunk->Record(command); (void)chunk->Record(command);
} }
/// Returns the current command buffer tick.
[[nodiscard]] u64 CurrentTick() const noexcept {
return master_semaphore->CurrentTick();
}
/// Returns true when a tick has been triggered by the GPU.
[[nodiscard]] bool IsFree(u64 tick) const noexcept {
return master_semaphore->IsFree(tick);
}
/// Waits for the given tick to trigger on the GPU.
void Wait(u64 tick) {
master_semaphore->Wait(tick);
}
/// Returns the master timeline semaphore. /// Returns the master timeline semaphore.
[[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept { [[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept {
return *master_semaphore; return *master_semaphore;

View file

@ -3106,7 +3106,11 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
entries.const_buffers.emplace_back(cbuf.second, cbuf.first); entries.const_buffers.emplace_back(cbuf.second, cbuf.first);
} }
for (const auto& [base, usage] : ir.GetGlobalMemory()) { for (const auto& [base, usage] : ir.GetGlobalMemory()) {
entries.global_buffers.emplace_back(base.cbuf_index, base.cbuf_offset, usage.is_written); entries.global_buffers.emplace_back(GlobalBufferEntry{
.cbuf_index = base.cbuf_index,
.cbuf_offset = base.cbuf_offset,
.is_written = usage.is_written,
});
} }
for (const auto& sampler : ir.GetSamplers()) { for (const auto& sampler : ir.GetSamplers()) {
if (sampler.is_buffer) { if (sampler.is_buffer) {
@ -3127,6 +3131,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) {
entries.attributes.insert(GetGenericAttributeLocation(attribute)); entries.attributes.insert(GetGenericAttributeLocation(attribute));
} }
} }
for (const auto& buffer : entries.const_buffers) {
entries.enabled_uniform_buffers |= 1U << buffer.GetIndex();
}
entries.clip_distances = ir.GetClipDistances(); entries.clip_distances = ir.GetClipDistances();
entries.shader_length = ir.GetLength(); entries.shader_length = ir.GetLength();
entries.uses_warps = ir.UsesWarps(); entries.uses_warps = ir.UsesWarps();

View file

@ -39,24 +39,7 @@ private:
u32 index{}; u32 index{};
}; };
class GlobalBufferEntry { struct GlobalBufferEntry {
public:
constexpr explicit GlobalBufferEntry(u32 cbuf_index_, u32 cbuf_offset_, bool is_written_)
: cbuf_index{cbuf_index_}, cbuf_offset{cbuf_offset_}, is_written{is_written_} {}
constexpr u32 GetCbufIndex() const {
return cbuf_index;
}
constexpr u32 GetCbufOffset() const {
return cbuf_offset;
}
constexpr bool IsWritten() const {
return is_written;
}
private:
u32 cbuf_index{}; u32 cbuf_index{};
u32 cbuf_offset{}; u32 cbuf_offset{};
bool is_written{}; bool is_written{};
@ -78,6 +61,7 @@ struct ShaderEntries {
std::set<u32> attributes; std::set<u32> attributes;
std::array<bool, Maxwell::NumClipDistances> clip_distances{}; std::array<bool, Maxwell::NumClipDistances> clip_distances{};
std::size_t shader_length{}; std::size_t shader_length{};
u32 enabled_uniform_buffers{};
bool uses_warps{}; bool uses_warps{};
}; };

View file

@ -8,6 +8,7 @@
#include <fmt/format.h> #include <fmt/format.h>
#include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/bit_util.h" #include "common/bit_util.h"
#include "common/common_types.h" #include "common/common_types.h"
@ -17,18 +18,119 @@
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan { namespace Vulkan {
namespace {
// Maximum potential alignment of a Vulkan buffer
constexpr VkDeviceSize MAX_ALIGNMENT = 256;
// Maximum size to put elements in the stream buffer
constexpr VkDeviceSize MAX_STREAM_BUFFER_REQUEST_SIZE = 8 * 1024 * 1024;
// Stream buffer size in bytes
constexpr VkDeviceSize STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
constexpr VkDeviceSize REGION_SIZE = STREAM_BUFFER_SIZE / StagingBufferPool::NUM_SYNCS;
constexpr VkMemoryPropertyFlags HOST_FLAGS =
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
constexpr VkMemoryPropertyFlags STREAM_FLAGS = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT | HOST_FLAGS;
bool IsStreamHeap(VkMemoryHeap heap) noexcept {
return STREAM_BUFFER_SIZE < (heap.size * 2) / 3;
}
std::optional<u32> FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask,
VkMemoryPropertyFlags flags) noexcept {
for (u32 type_index = 0; type_index < props.memoryTypeCount; ++type_index) {
if (((type_mask >> type_index) & 1) == 0) {
// Memory type is incompatible
continue;
}
const VkMemoryType& memory_type = props.memoryTypes[type_index];
if ((memory_type.propertyFlags & flags) != flags) {
// Memory type doesn't have the flags we want
continue;
}
if (!IsStreamHeap(props.memoryHeaps[memory_type.heapIndex])) {
// Memory heap is not suitable for streaming
continue;
}
// Success!
return type_index;
}
return std::nullopt;
}
u32 FindMemoryTypeIndex(const VkPhysicalDeviceMemoryProperties& props, u32 type_mask) {
// Try to find a DEVICE_LOCAL_BIT type, Nvidia and AMD have a dedicated heap for this
std::optional<u32> type = FindMemoryTypeIndex(props, type_mask, STREAM_FLAGS);
if (type) {
return *type;
}
// Otherwise try without the DEVICE_LOCAL_BIT
type = FindMemoryTypeIndex(props, type_mask, HOST_FLAGS);
if (type) {
return *type;
}
// This should never happen, and in case it does, signal it as an out of memory situation
throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY);
}
size_t Region(size_t iterator) noexcept {
return iterator / REGION_SIZE;
}
} // Anonymous namespace
StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_,
VKScheduler& scheduler_) VKScheduler& scheduler_)
: device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {} : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} {
const vk::Device& dev = device.GetLogical();
stream_buffer = dev.CreateBuffer(VkBufferCreateInfo{
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = STREAM_BUFFER_SIZE,
.usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
VK_BUFFER_USAGE_INDEX_BUFFER_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
});
if (device.HasDebuggingToolAttached()) {
stream_buffer.SetObjectNameEXT("Stream Buffer");
}
VkMemoryDedicatedRequirements dedicated_reqs{
.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS,
.pNext = nullptr,
.prefersDedicatedAllocation = VK_FALSE,
.requiresDedicatedAllocation = VK_FALSE,
};
const auto requirements = dev.GetBufferMemoryRequirements(*stream_buffer, &dedicated_reqs);
const bool make_dedicated = dedicated_reqs.prefersDedicatedAllocation == VK_TRUE ||
dedicated_reqs.requiresDedicatedAllocation == VK_TRUE;
const VkMemoryDedicatedAllocateInfo dedicated_info{
.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
.pNext = nullptr,
.image = nullptr,
.buffer = *stream_buffer,
};
const auto memory_properties = device.GetPhysical().GetMemoryProperties();
stream_memory = dev.AllocateMemory(VkMemoryAllocateInfo{
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = make_dedicated ? &dedicated_info : nullptr,
.allocationSize = requirements.size,
.memoryTypeIndex = FindMemoryTypeIndex(memory_properties, requirements.memoryTypeBits),
});
if (device.HasDebuggingToolAttached()) {
stream_memory.SetObjectNameEXT("Stream Buffer Memory");
}
stream_buffer.BindMemory(*stream_memory, 0);
stream_pointer = stream_memory.Map(0, STREAM_BUFFER_SIZE);
}
StagingBufferPool::~StagingBufferPool() = default; StagingBufferPool::~StagingBufferPool() = default;
StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage) { StagingBufferRef StagingBufferPool::Request(size_t size, MemoryUsage usage) {
if (const std::optional<StagingBufferRef> ref = TryGetReservedBuffer(size, usage)) { if (usage == MemoryUsage::Upload && size <= MAX_STREAM_BUFFER_REQUEST_SIZE) {
return *ref; return GetStreamBuffer(size);
} }
return CreateStagingBuffer(size, usage); return GetStagingBuffer(size, usage);
} }
void StagingBufferPool::TickFrame() { void StagingBufferPool::TickFrame() {
@ -39,6 +141,51 @@ void StagingBufferPool::TickFrame() {
ReleaseCache(MemoryUsage::Download); ReleaseCache(MemoryUsage::Download);
} }
StagingBufferRef StagingBufferPool::GetStreamBuffer(size_t size) {
if (AreRegionsActive(Region(free_iterator) + 1,
std::min(Region(iterator + size) + 1, NUM_SYNCS))) {
// Avoid waiting for the previous usages to be free
return GetStagingBuffer(size, MemoryUsage::Upload);
}
const u64 current_tick = scheduler.CurrentTick();
std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + Region(iterator),
current_tick);
used_iterator = iterator;
free_iterator = std::max(free_iterator, iterator + size);
if (iterator + size > STREAM_BUFFER_SIZE) {
std::fill(sync_ticks.begin() + Region(used_iterator), sync_ticks.begin() + NUM_SYNCS,
current_tick);
used_iterator = 0;
iterator = 0;
free_iterator = size;
if (AreRegionsActive(0, Region(size) + 1)) {
// Avoid waiting for the previous usages to be free
return GetStagingBuffer(size, MemoryUsage::Upload);
}
}
const size_t offset = iterator;
iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT);
return StagingBufferRef{
.buffer = *stream_buffer,
.offset = static_cast<VkDeviceSize>(offset),
.mapped_span = std::span<u8>(stream_pointer + offset, size),
};
}
bool StagingBufferPool::AreRegionsActive(size_t region_begin, size_t region_end) const {
return std::any_of(sync_ticks.begin() + region_begin, sync_ticks.begin() + region_end,
[this](u64 sync_tick) { return !scheduler.IsFree(sync_tick); });
};
StagingBufferRef StagingBufferPool::GetStagingBuffer(size_t size, MemoryUsage usage) {
if (const std::optional<StagingBufferRef> ref = TryGetReservedBuffer(size, usage)) {
return *ref;
}
return CreateStagingBuffer(size, usage);
}
std::optional<StagingBufferRef> StagingBufferPool::TryGetReservedBuffer(size_t size, std::optional<StagingBufferRef> StagingBufferPool::TryGetReservedBuffer(size_t size,
MemoryUsage usage) { MemoryUsage usage) {
StagingBuffers& cache_level = GetCache(usage)[Common::Log2Ceil64(size)]; StagingBuffers& cache_level = GetCache(usage)[Common::Log2Ceil64(size)];

View file

@ -19,11 +19,14 @@ class VKScheduler;
struct StagingBufferRef { struct StagingBufferRef {
VkBuffer buffer; VkBuffer buffer;
VkDeviceSize offset;
std::span<u8> mapped_span; std::span<u8> mapped_span;
}; };
class StagingBufferPool { class StagingBufferPool {
public: public:
static constexpr size_t NUM_SYNCS = 16;
explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator, explicit StagingBufferPool(const Device& device, MemoryAllocator& memory_allocator,
VKScheduler& scheduler); VKScheduler& scheduler);
~StagingBufferPool(); ~StagingBufferPool();
@ -33,6 +36,11 @@ public:
void TickFrame(); void TickFrame();
private: private:
struct StreamBufferCommit {
size_t upper_bound;
u64 tick;
};
struct StagingBuffer { struct StagingBuffer {
vk::Buffer buffer; vk::Buffer buffer;
MemoryCommit commit; MemoryCommit commit;
@ -42,6 +50,7 @@ private:
StagingBufferRef Ref() const noexcept { StagingBufferRef Ref() const noexcept {
return { return {
.buffer = *buffer, .buffer = *buffer,
.offset = 0,
.mapped_span = mapped_span, .mapped_span = mapped_span,
}; };
} }
@ -56,6 +65,12 @@ private:
static constexpr size_t NUM_LEVELS = sizeof(size_t) * CHAR_BIT; static constexpr size_t NUM_LEVELS = sizeof(size_t) * CHAR_BIT;
using StagingBuffersCache = std::array<StagingBuffers, NUM_LEVELS>; using StagingBuffersCache = std::array<StagingBuffers, NUM_LEVELS>;
StagingBufferRef GetStreamBuffer(size_t size);
bool AreRegionsActive(size_t region_begin, size_t region_end) const;
StagingBufferRef GetStagingBuffer(size_t size, MemoryUsage usage);
std::optional<StagingBufferRef> TryGetReservedBuffer(size_t size, MemoryUsage usage); std::optional<StagingBufferRef> TryGetReservedBuffer(size_t size, MemoryUsage usage);
StagingBufferRef CreateStagingBuffer(size_t size, MemoryUsage usage); StagingBufferRef CreateStagingBuffer(size_t size, MemoryUsage usage);
@ -70,6 +85,15 @@ private:
MemoryAllocator& memory_allocator; MemoryAllocator& memory_allocator;
VKScheduler& scheduler; VKScheduler& scheduler;
vk::Buffer stream_buffer;
vk::DeviceMemory stream_memory;
u8* stream_pointer = nullptr;
size_t iterator = 0;
size_t used_iterator = 0;
size_t free_iterator = 0;
std::array<u64, NUM_SYNCS> sync_ticks{};
StagingBuffersCache device_local_cache; StagingBuffersCache device_local_cache;
StagingBuffersCache upload_cache; StagingBuffersCache upload_cache;
StagingBuffersCache download_cache; StagingBuffersCache download_cache;

View file

@ -30,15 +30,18 @@ using Table = Maxwell3D::DirtyState::Table;
using Flags = Maxwell3D::DirtyState::Flags; using Flags = Maxwell3D::DirtyState::Flags;
Flags MakeInvalidationFlags() { Flags MakeInvalidationFlags() {
static constexpr std::array INVALIDATION_FLAGS{ static constexpr int INVALIDATION_FLAGS[]{
Viewports, Scissors, DepthBias, BlendConstants, DepthBounds, Viewports, Scissors, DepthBias, BlendConstants, DepthBounds,
StencilProperties, CullMode, DepthBoundsEnable, DepthTestEnable, DepthWriteEnable, StencilProperties, CullMode, DepthBoundsEnable, DepthTestEnable, DepthWriteEnable,
DepthCompareOp, FrontFace, StencilOp, StencilTestEnable, DepthCompareOp, FrontFace, StencilOp, StencilTestEnable, VertexBuffers,
}; };
Flags flags{}; Flags flags{};
for (const int flag : INVALIDATION_FLAGS) { for (const int flag : INVALIDATION_FLAGS) {
flags[flag] = true; flags[flag] = true;
} }
for (int index = VertexBuffer0; index <= VertexBuffer31; ++index) {
flags[index] = true;
}
return flags; return flags;
} }
@ -130,7 +133,7 @@ void SetupDirtyStencilTestEnable(Tables& tables) {
StateTracker::StateTracker(Tegra::GPU& gpu) StateTracker::StateTracker(Tegra::GPU& gpu)
: flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} { : flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} {
auto& tables = gpu.Maxwell3D().dirty.tables; auto& tables = gpu.Maxwell3D().dirty.tables;
SetupDirtyRenderTargets(tables); SetupDirtyFlags(tables);
SetupDirtyViewports(tables); SetupDirtyViewports(tables);
SetupDirtyScissors(tables); SetupDirtyScissors(tables);
SetupDirtyDepthBias(tables); SetupDirtyDepthBias(tables);

View file

@ -56,8 +56,11 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi
} // Anonymous namespace } // Anonymous namespace
VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const Device& device_, VKScheduler& scheduler_) VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const Device& device_, VKScheduler& scheduler_,
: surface{surface_}, device{device_}, scheduler{scheduler_} {} u32 width, u32 height, bool srgb)
: surface{surface_}, device{device_}, scheduler{scheduler_} {
Create(width, height, srgb);
}
VKSwapchain::~VKSwapchain() = default; VKSwapchain::~VKSwapchain() = default;

View file

@ -20,7 +20,8 @@ class VKScheduler;
class VKSwapchain { class VKSwapchain {
public: public:
explicit VKSwapchain(VkSurfaceKHR surface, const Device& device, VKScheduler& scheduler); explicit VKSwapchain(VkSurfaceKHR surface, const Device& device, VKScheduler& scheduler,
u32 width, u32 height, bool srgb);
~VKSwapchain(); ~VKSwapchain();
/// Creates (or recreates) the swapchain with a given size. /// Creates (or recreates) the swapchain with a given size.

View file

@ -426,21 +426,23 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) {
void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image, void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image,
VkImageAspectFlags aspect_mask, bool is_initialized, VkImageAspectFlags aspect_mask, bool is_initialized,
std::span<const VkBufferImageCopy> copies) { std::span<const VkBufferImageCopy> copies) {
static constexpr VkAccessFlags ACCESS_FLAGS = VK_ACCESS_SHADER_WRITE_BIT | static constexpr VkAccessFlags WRITE_ACCESS_FLAGS =
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
static constexpr VkAccessFlags READ_ACCESS_FLAGS = VK_ACCESS_SHADER_READ_BIT |
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT;
const VkImageMemoryBarrier read_barrier{ const VkImageMemoryBarrier read_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr, .pNext = nullptr,
.srcAccessMask = ACCESS_FLAGS, .srcAccessMask = WRITE_ACCESS_FLAGS,
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image, .image = image,
.subresourceRange = .subresourceRange{
{
.aspectMask = aspect_mask, .aspectMask = aspect_mask,
.baseMipLevel = 0, .baseMipLevel = 0,
.levelCount = VK_REMAINING_MIP_LEVELS, .levelCount = VK_REMAINING_MIP_LEVELS,
@ -452,14 +454,13 @@ void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage im
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr, .pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = ACCESS_FLAGS, .dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image, .image = image,
.subresourceRange = .subresourceRange{
{
.aspectMask = aspect_mask, .aspectMask = aspect_mask,
.baseMipLevel = 0, .baseMipLevel = 0,
.levelCount = VK_REMAINING_MIP_LEVELS, .levelCount = VK_REMAINING_MIP_LEVELS,
@ -569,20 +570,12 @@ void TextureCacheRuntime::Finish() {
scheduler.Finish(); scheduler.Finish();
} }
ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) { StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) {
const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Upload); return staging_buffer_pool.Request(size, MemoryUsage::Upload);
return {
.handle = staging_ref.buffer,
.span = staging_ref.mapped_span,
};
} }
ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) { StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size) {
const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Download); return staging_buffer_pool.Request(size, MemoryUsage::Download);
return {
.handle = staging_ref.buffer,
.span = staging_ref.mapped_span,
};
} }
void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src,
@ -754,7 +747,7 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL, .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
@ -765,12 +758,9 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src,
VkImageMemoryBarrier{ VkImageMemoryBarrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr, .pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_TRANSFER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL, .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL,
@ -828,12 +818,11 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_
} }
} }
void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void Image::UploadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
std::span<const BufferImageCopy> copies) {
// TODO: Move this to another API // TODO: Move this to another API
scheduler->RequestOutsideRenderPassOperationContext(); scheduler->RequestOutsideRenderPassOperationContext();
std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
const VkBuffer src_buffer = map.handle; const VkBuffer src_buffer = map.buffer;
const VkImage vk_image = *image; const VkImage vk_image = *image;
const VkImageAspectFlags vk_aspect_mask = aspect_mask; const VkImageAspectFlags vk_aspect_mask = aspect_mask;
const bool is_initialized = std::exchange(initialized, true); const bool is_initialized = std::exchange(initialized, true);
@ -843,12 +832,12 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
}); });
} }
void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void Image::UploadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferCopy> copies) { std::span<const VideoCommon::BufferCopy> copies) {
// TODO: Move this to another API // TODO: Move this to another API
scheduler->RequestOutsideRenderPassOperationContext(); scheduler->RequestOutsideRenderPassOperationContext();
std::vector vk_copies = TransformBufferCopies(copies, buffer_offset); std::vector vk_copies = TransformBufferCopies(copies, map.offset);
const VkBuffer src_buffer = map.handle; const VkBuffer src_buffer = map.buffer;
const VkBuffer dst_buffer = *buffer; const VkBuffer dst_buffer = *buffer;
scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) {
// TODO: Barriers // TODO: Barriers
@ -856,13 +845,57 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
}); });
} }
void Image::DownloadMemory(const ImageBufferMap& map, size_t buffer_offset, void Image::DownloadMemory(const StagingBufferRef& map, std::span<const BufferImageCopy> copies) {
std::span<const BufferImageCopy> copies) { std::vector vk_copies = TransformBufferImageCopies(copies, map.offset, aspect_mask);
std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask,
scheduler->Record([buffer = map.handle, image = *image, aspect_mask = aspect_mask,
vk_copies](vk::CommandBuffer cmdbuf) { vk_copies](vk::CommandBuffer cmdbuf) {
// TODO: Barriers const VkImageMemoryBarrier read_barrier{
cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_GENERAL, buffer, vk_copies); .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
.oldLayout = VK_IMAGE_LAYOUT_GENERAL,
.newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
.subresourceRange{
.aspectMask = aspect_mask,
.baseMipLevel = 0,
.levelCount = VK_REMAINING_MIP_LEVELS,
.baseArrayLayer = 0,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
const VkImageMemoryBarrier image_write_barrier{
.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = 0,
.dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
.oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
.newLayout = VK_IMAGE_LAYOUT_GENERAL,
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
.image = image,
.subresourceRange{
.aspectMask = aspect_mask,
.baseMipLevel = 0,
.levelCount = VK_REMAINING_MIP_LEVELS,
.baseArrayLayer = 0,
.layerCount = VK_REMAINING_ARRAY_LAYERS,
},
};
const VkMemoryBarrier memory_write_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
.dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
0, read_barrier);
cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, vk_copies);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
0, memory_write_barrier, nullptr, image_write_barrier);
}); });
} }
@ -1127,7 +1160,7 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
.pAttachments = attachments.data(), .pAttachments = attachments.data(),
.width = key.size.width, .width = key.size.width,
.height = key.size.height, .height = key.size.height,
.layers = static_cast<u32>(num_layers), .layers = static_cast<u32>(std::max(num_layers, 1)),
}); });
if (runtime.device.HasDebuggingToolAttached()) { if (runtime.device.HasDebuggingToolAttached()) {
framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str()); framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str());

View file

@ -7,6 +7,7 @@
#include <compare> #include <compare>
#include <span> #include <span>
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/texture_cache/texture_cache.h" #include "video_core/texture_cache/texture_cache.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h"
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
@ -53,19 +54,6 @@ struct hash<Vulkan::RenderPassKey> {
namespace Vulkan { namespace Vulkan {
struct ImageBufferMap {
[[nodiscard]] VkBuffer Handle() const noexcept {
return handle;
}
[[nodiscard]] std::span<u8> Span() const noexcept {
return span;
}
VkBuffer handle;
std::span<u8> span;
};
struct TextureCacheRuntime { struct TextureCacheRuntime {
const Device& device; const Device& device;
VKScheduler& scheduler; VKScheduler& scheduler;
@ -76,9 +64,9 @@ struct TextureCacheRuntime {
void Finish(); void Finish();
[[nodiscard]] ImageBufferMap MapUploadBuffer(size_t size); [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size);
[[nodiscard]] ImageBufferMap MapDownloadBuffer(size_t size); [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size);
void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src,
const std::array<Offset2D, 2>& dst_region, const std::array<Offset2D, 2>& dst_region,
@ -94,7 +82,7 @@ struct TextureCacheRuntime {
return false; return false;
} }
void AccelerateImageUpload(Image&, const ImageBufferMap&, size_t, void AccelerateImageUpload(Image&, const StagingBufferRef&,
std::span<const VideoCommon::SwizzleParameters>) { std::span<const VideoCommon::SwizzleParameters>) {
UNREACHABLE(); UNREACHABLE();
} }
@ -112,13 +100,12 @@ public:
explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
VAddr cpu_addr); VAddr cpu_addr);
void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void UploadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies); std::span<const VideoCommon::BufferImageCopy> copies);
void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void UploadMemory(const StagingBufferRef& map, std::span<const VideoCommon::BufferCopy> copies);
std::span<const VideoCommon::BufferCopy> copies);
void DownloadMemory(const ImageBufferMap& map, size_t buffer_offset, void DownloadMemory(const StagingBufferRef& map,
std::span<const VideoCommon::BufferImageCopy> copies); std::span<const VideoCommon::BufferImageCopy> copies);
[[nodiscard]] VkImage Handle() const noexcept { [[nodiscard]] VkImage Handle() const noexcept {

View file

@ -9,16 +9,7 @@
#include <shared_mutex> #include <shared_mutex>
#include <thread> #include <thread>
// This header includes both Vulkan and OpenGL headers, this has to be fixed
// Unfortunately, including OpenGL will include Windows.h that defines macros that can cause issues.
// Forcefully include glad early and undefine macros
#include <glad/glad.h> #include <glad/glad.h>
#ifdef CreateEvent
#undef CreateEvent
#endif
#ifdef CreateSemaphore
#undef CreateSemaphore
#endif
#include "common/common_types.h" #include "common/common_types.h"
#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_device.h"

View file

@ -76,6 +76,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
case SystemVariable::InvocationId: case SystemVariable::InvocationId:
return Operation(OperationCode::InvocationId); return Operation(OperationCode::InvocationId);
case SystemVariable::Ydirection: case SystemVariable::Ydirection:
uses_y_negate = true;
return Operation(OperationCode::YNegate); return Operation(OperationCode::YNegate);
case SystemVariable::InvocationInfo: case SystemVariable::InvocationInfo:
LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete"); LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete");

View file

@ -139,6 +139,10 @@ public:
return uses_legacy_varyings; return uses_legacy_varyings;
} }
bool UsesYNegate() const {
return uses_y_negate;
}
bool UsesWarps() const { bool UsesWarps() const {
return uses_warps; return uses_warps;
} }
@ -465,6 +469,7 @@ private:
bool uses_instance_id{}; bool uses_instance_id{};
bool uses_vertex_id{}; bool uses_vertex_id{};
bool uses_legacy_varyings{}; bool uses_legacy_varyings{};
bool uses_y_negate{};
bool uses_warps{}; bool uses_warps{};
bool uses_indexed_samplers{}; bool uses_indexed_samplers{};

View file

@ -103,9 +103,6 @@ public:
/// Notify the cache that a new frame has been queued /// Notify the cache that a new frame has been queued
void TickFrame(); void TickFrame();
/// Return an unique mutually exclusive lock for the cache
[[nodiscard]] std::unique_lock<std::mutex> AcquireLock();
/// Return a constant reference to the given image view id /// Return a constant reference to the given image view id
[[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept; [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept;
@ -179,6 +176,8 @@ public:
/// Return true when a CPU region is modified from the GPU /// Return true when a CPU region is modified from the GPU
[[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size);
std::mutex mutex;
private: private:
/// Iterate over all page indices in a range /// Iterate over all page indices in a range
template <typename Func> template <typename Func>
@ -212,8 +211,8 @@ private:
void RefreshContents(Image& image); void RefreshContents(Image& image);
/// Upload data from guest to an image /// Upload data from guest to an image
template <typename MapBuffer> template <typename StagingBuffer>
void UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset); void UploadImageContents(Image& image, StagingBuffer& staging_buffer);
/// Find or create an image view from a guest descriptor /// Find or create an image view from a guest descriptor
[[nodiscard]] ImageViewId FindImageView(const TICEntry& config); [[nodiscard]] ImageViewId FindImageView(const TICEntry& config);
@ -325,8 +324,6 @@ private:
RenderTargets render_targets; RenderTargets render_targets;
std::mutex mutex;
std::unordered_map<TICEntry, ImageViewId> image_views; std::unordered_map<TICEntry, ImageViewId> image_views;
std::unordered_map<TSCEntry, SamplerId> samplers; std::unordered_map<TSCEntry, SamplerId> samplers;
std::unordered_map<RenderTargets, FramebufferId> framebuffers; std::unordered_map<RenderTargets, FramebufferId> framebuffers;
@ -385,11 +382,6 @@ void TextureCache<P>::TickFrame() {
++frame_tick; ++frame_tick;
} }
template <class P>
std::unique_lock<std::mutex> TextureCache<P>::AcquireLock() {
return std::unique_lock{mutex};
}
template <class P> template <class P>
const typename P::ImageView& TextureCache<P>::GetImageView(ImageViewId id) const noexcept { const typename P::ImageView& TextureCache<P>::GetImageView(ImageViewId id) const noexcept {
return slot_image_views[id]; return slot_image_views[id];
@ -598,11 +590,11 @@ void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) {
}); });
for (const ImageId image_id : images) { for (const ImageId image_id : images) {
Image& image = slot_images[image_id]; Image& image = slot_images[image_id];
auto map = runtime.MapDownloadBuffer(image.unswizzled_size_bytes); auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes);
const auto copies = FullDownloadCopies(image.info); const auto copies = FullDownloadCopies(image.info);
image.DownloadMemory(map, 0, copies); image.DownloadMemory(map, copies);
runtime.Finish(); runtime.Finish();
SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.Span()); SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span);
} }
} }
@ -757,25 +749,25 @@ void TextureCache<P>::PopAsyncFlushes() {
for (const ImageId image_id : download_ids) { for (const ImageId image_id : download_ids) {
total_size_bytes += slot_images[image_id].unswizzled_size_bytes; total_size_bytes += slot_images[image_id].unswizzled_size_bytes;
} }
auto download_map = runtime.MapDownloadBuffer(total_size_bytes); auto download_map = runtime.DownloadStagingBuffer(total_size_bytes);
size_t buffer_offset = 0; const size_t original_offset = download_map.offset;
for (const ImageId image_id : download_ids) { for (const ImageId image_id : download_ids) {
Image& image = slot_images[image_id]; Image& image = slot_images[image_id];
const auto copies = FullDownloadCopies(image.info); const auto copies = FullDownloadCopies(image.info);
image.DownloadMemory(download_map, buffer_offset, copies); image.DownloadMemory(download_map, copies);
buffer_offset += image.unswizzled_size_bytes; download_map.offset += image.unswizzled_size_bytes;
} }
// Wait for downloads to finish // Wait for downloads to finish
runtime.Finish(); runtime.Finish();
buffer_offset = 0; download_map.offset = original_offset;
const std::span<u8> download_span = download_map.Span(); std::span<u8> download_span = download_map.mapped_span;
for (const ImageId image_id : download_ids) { for (const ImageId image_id : download_ids) {
const ImageBase& image = slot_images[image_id]; const ImageBase& image = slot_images[image_id];
const auto copies = FullDownloadCopies(image.info); const auto copies = FullDownloadCopies(image.info);
const std::span<u8> image_download_span = download_span.subspan(buffer_offset); SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, download_span);
SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, image_download_span); download_map.offset += image.unswizzled_size_bytes;
buffer_offset += image.unswizzled_size_bytes; download_span = download_span.subspan(image.unswizzled_size_bytes);
} }
committed_downloads.pop(); committed_downloads.pop();
} }
@ -806,32 +798,32 @@ void TextureCache<P>::RefreshContents(Image& image) {
LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented");
return; return;
} }
auto map = runtime.MapUploadBuffer(MapSizeBytes(image)); auto staging = runtime.UploadStagingBuffer(MapSizeBytes(image));
UploadImageContents(image, map, 0); UploadImageContents(image, staging);
runtime.InsertUploadMemoryBarrier(); runtime.InsertUploadMemoryBarrier();
} }
template <class P> template <class P>
template <typename MapBuffer> template <typename StagingBuffer>
void TextureCache<P>::UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset) { void TextureCache<P>::UploadImageContents(Image& image, StagingBuffer& staging) {
const std::span<u8> mapped_span = map.Span().subspan(buffer_offset); const std::span<u8> mapped_span = staging.mapped_span;
const GPUVAddr gpu_addr = image.gpu_addr; const GPUVAddr gpu_addr = image.gpu_addr;
if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { if (True(image.flags & ImageFlagBits::AcceleratedUpload)) {
gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes()); gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes());
const auto uploads = FullUploadSwizzles(image.info); const auto uploads = FullUploadSwizzles(image.info);
runtime.AccelerateImageUpload(image, map, buffer_offset, uploads); runtime.AccelerateImageUpload(image, staging, uploads);
} else if (True(image.flags & ImageFlagBits::Converted)) { } else if (True(image.flags & ImageFlagBits::Converted)) {
std::vector<u8> unswizzled_data(image.unswizzled_size_bytes); std::vector<u8> unswizzled_data(image.unswizzled_size_bytes);
auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data); auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data);
ConvertImage(unswizzled_data, image.info, mapped_span, copies); ConvertImage(unswizzled_data, image.info, mapped_span, copies);
image.UploadMemory(map, buffer_offset, copies); image.UploadMemory(staging, copies);
} else if (image.info.type == ImageType::Buffer) { } else if (image.info.type == ImageType::Buffer) {
const std::array copies{UploadBufferCopy(gpu_memory, gpu_addr, image, mapped_span)}; const std::array copies{UploadBufferCopy(gpu_memory, gpu_addr, image, mapped_span)};
image.UploadMemory(map, buffer_offset, copies); image.UploadMemory(staging, copies);
} else { } else {
const auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, mapped_span); const auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, mapped_span);
image.UploadMemory(map, buffer_offset, copies); image.UploadMemory(staging, copies);
} }
} }

View file

@ -38,19 +38,18 @@ namespace VideoCore {
std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) { std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) {
const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue(); const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue();
std::unique_ptr<Tegra::GPU> gpu = std::make_unique<Tegra::GPU>( const bool use_async = Settings::values.use_asynchronous_gpu_emulation.GetValue();
system, Settings::values.use_asynchronous_gpu_emulation.GetValue(), use_nvdec); auto gpu = std::make_unique<Tegra::GPU>(system, use_async, use_nvdec);
auto context = emu_window.CreateSharedContext(); auto context = emu_window.CreateSharedContext();
const auto scope = context->Acquire(); auto scope = context->Acquire();
try {
auto renderer = CreateRenderer(system, emu_window, *gpu, std::move(context)); auto renderer = CreateRenderer(system, emu_window, *gpu, std::move(context));
if (!renderer->Init()) {
return nullptr;
}
gpu->BindRenderer(std::move(renderer)); gpu->BindRenderer(std::move(renderer));
return gpu; return gpu;
} catch (const std::runtime_error& exception) {
LOG_ERROR(HW_GPU, "Failed to initialize GPU: {}", exception.what());
return nullptr;
}
} }
u16 GetResolutionScaleFactor(const RendererBase& renderer) { u16 GetResolutionScaleFactor(const RendererBase& renderer) {

View file

@ -18,27 +18,22 @@
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
namespace Vulkan { namespace Vulkan {
namespace { namespace {
namespace Alternatives { namespace Alternatives {
constexpr std::array DEPTH24_UNORM_STENCIL8_UINT{
constexpr std::array Depth24UnormS8_UINT{
VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D32_SFLOAT_S8_UINT,
VK_FORMAT_D16_UNORM_S8_UINT, VK_FORMAT_D16_UNORM_S8_UINT,
VkFormat{}, VK_FORMAT_UNDEFINED,
}; };
constexpr std::array Depth16UnormS8_UINT{ constexpr std::array DEPTH16_UNORM_STENCIL8_UINT{
VK_FORMAT_D24_UNORM_S8_UINT, VK_FORMAT_D24_UNORM_S8_UINT,
VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D32_SFLOAT_S8_UINT,
VkFormat{}, VK_FORMAT_UNDEFINED,
}; };
} // namespace Alternatives } // namespace Alternatives
constexpr std::array REQUIRED_EXTENSIONS{ constexpr std::array REQUIRED_EXTENSIONS{
VK_KHR_SWAPCHAIN_EXTENSION_NAME,
VK_KHR_MAINTENANCE1_EXTENSION_NAME, VK_KHR_MAINTENANCE1_EXTENSION_NAME,
VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME, VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME,
VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME, VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME,
@ -52,6 +47,12 @@ constexpr std::array REQUIRED_EXTENSIONS{
VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME,
VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME,
VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME, VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME,
#ifdef _WIN32
VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
#endif
#ifdef __linux__
VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
#endif
}; };
template <typename T> template <typename T>
@ -63,9 +64,9 @@ void SetNext(void**& next, T& data) {
constexpr const VkFormat* GetFormatAlternatives(VkFormat format) { constexpr const VkFormat* GetFormatAlternatives(VkFormat format) {
switch (format) { switch (format) {
case VK_FORMAT_D24_UNORM_S8_UINT: case VK_FORMAT_D24_UNORM_S8_UINT:
return Alternatives::Depth24UnormS8_UINT.data(); return Alternatives::DEPTH24_UNORM_STENCIL8_UINT.data();
case VK_FORMAT_D16_UNORM_S8_UINT: case VK_FORMAT_D16_UNORM_S8_UINT:
return Alternatives::Depth16UnormS8_UINT.data(); return Alternatives::DEPTH16_UNORM_STENCIL8_UINT.data();
default: default:
return nullptr; return nullptr;
} }
@ -195,23 +196,18 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
const vk::InstanceDispatch& dld_) const vk::InstanceDispatch& dld_)
: instance{instance_}, dld{dld_}, physical{physical_}, properties{physical.GetProperties()}, : instance{instance_}, dld{dld_}, physical{physical_}, properties{physical.GetProperties()},
format_properties{GetFormatProperties(physical)} { format_properties{GetFormatProperties(physical)} {
CheckSuitability(); CheckSuitability(surface != nullptr);
SetupFamilies(surface); SetupFamilies(surface);
SetupFeatures(); SetupFeatures();
const auto queue_cis = GetDeviceQueueCreateInfos(); const auto queue_cis = GetDeviceQueueCreateInfos();
const std::vector extensions = LoadExtensions(); const std::vector extensions = LoadExtensions(surface != nullptr);
VkPhysicalDeviceFeatures2 features2{ VkPhysicalDeviceFeatures2 features2{
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2,
.pNext = nullptr, .pNext = nullptr,
.features{}, .features{
}; .robustBufferAccess = true,
const void* first_next = &features2;
void** next = &features2.pNext;
features2.features = {
.robustBufferAccess = false,
.fullDrawIndexUint32 = false, .fullDrawIndexUint32 = false,
.imageCubeArray = true, .imageCubeArray = true,
.independentBlend = true, .independentBlend = true,
@ -266,7 +262,11 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
.sparseResidencyAliased = false, .sparseResidencyAliased = false,
.variableMultisampleRate = false, .variableMultisampleRate = false,
.inheritedQueries = false, .inheritedQueries = false,
},
}; };
const void* first_next = &features2;
void** next = &features2.pNext;
VkPhysicalDeviceTimelineSemaphoreFeaturesKHR timeline_semaphore{ VkPhysicalDeviceTimelineSemaphoreFeaturesKHR timeline_semaphore{
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR, .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR,
.pNext = nullptr, .pNext = nullptr,
@ -384,7 +384,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR
robustness2 = { robustness2 = {
.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT, .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT,
.pNext = nullptr, .pNext = nullptr,
.robustBufferAccess2 = false, .robustBufferAccess2 = true,
.robustImageAccess2 = true, .robustImageAccess2 = true,
.nullDescriptor = true, .nullDescriptor = true,
}; };
@ -535,16 +535,18 @@ bool Device::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags want
return (supported_usage & wanted_usage) == wanted_usage; return (supported_usage & wanted_usage) == wanted_usage;
} }
void Device::CheckSuitability() const { void Device::CheckSuitability(bool requires_swapchain) const {
std::bitset<REQUIRED_EXTENSIONS.size()> available_extensions; std::bitset<REQUIRED_EXTENSIONS.size()> available_extensions;
bool has_swapchain = false;
for (const VkExtensionProperties& property : physical.EnumerateDeviceExtensionProperties()) { for (const VkExtensionProperties& property : physical.EnumerateDeviceExtensionProperties()) {
for (std::size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { const std::string_view name{property.extensionName};
for (size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) {
if (available_extensions[i]) { if (available_extensions[i]) {
continue; continue;
} }
const std::string_view name{property.extensionName};
available_extensions[i] = name == REQUIRED_EXTENSIONS[i]; available_extensions[i] = name == REQUIRED_EXTENSIONS[i];
} }
has_swapchain = has_swapchain || name == VK_KHR_SWAPCHAIN_EXTENSION_NAME;
} }
for (size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { for (size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) {
if (available_extensions[i]) { if (available_extensions[i]) {
@ -553,6 +555,11 @@ void Device::CheckSuitability() const {
LOG_ERROR(Render_Vulkan, "Missing required extension: {}", REQUIRED_EXTENSIONS[i]); LOG_ERROR(Render_Vulkan, "Missing required extension: {}", REQUIRED_EXTENSIONS[i]);
throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT);
} }
if (requires_swapchain && !has_swapchain) {
LOG_ERROR(Render_Vulkan, "Missing required extension: VK_KHR_swapchain");
throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT);
}
struct LimitTuple { struct LimitTuple {
u32 minimum; u32 minimum;
u32 value; u32 value;
@ -574,7 +581,9 @@ void Device::CheckSuitability() const {
} }
const VkPhysicalDeviceFeatures features{physical.GetFeatures()}; const VkPhysicalDeviceFeatures features{physical.GetFeatures()};
const std::array feature_report{ const std::array feature_report{
std::make_pair(features.robustBufferAccess, "robustBufferAccess"),
std::make_pair(features.vertexPipelineStoresAndAtomics, "vertexPipelineStoresAndAtomics"), std::make_pair(features.vertexPipelineStoresAndAtomics, "vertexPipelineStoresAndAtomics"),
std::make_pair(features.robustBufferAccess, "robustBufferAccess"),
std::make_pair(features.imageCubeArray, "imageCubeArray"), std::make_pair(features.imageCubeArray, "imageCubeArray"),
std::make_pair(features.independentBlend, "independentBlend"), std::make_pair(features.independentBlend, "independentBlend"),
std::make_pair(features.depthClamp, "depthClamp"), std::make_pair(features.depthClamp, "depthClamp"),
@ -599,10 +608,13 @@ void Device::CheckSuitability() const {
} }
} }
std::vector<const char*> Device::LoadExtensions() { std::vector<const char*> Device::LoadExtensions(bool requires_surface) {
std::vector<const char*> extensions; std::vector<const char*> extensions;
extensions.reserve(7 + REQUIRED_EXTENSIONS.size()); extensions.reserve(8 + REQUIRED_EXTENSIONS.size());
extensions.insert(extensions.begin(), REQUIRED_EXTENSIONS.begin(), REQUIRED_EXTENSIONS.end()); extensions.insert(extensions.begin(), REQUIRED_EXTENSIONS.begin(), REQUIRED_EXTENSIONS.end());
if (requires_surface) {
extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME);
}
bool has_khr_shader_float16_int8{}; bool has_khr_shader_float16_int8{};
bool has_ext_subgroup_size_control{}; bool has_ext_subgroup_size_control{};
@ -743,7 +755,8 @@ std::vector<const char*> Device::LoadExtensions() {
robustness2.pNext = nullptr; robustness2.pNext = nullptr;
features.pNext = &robustness2; features.pNext = &robustness2;
physical.GetFeatures2KHR(features); physical.GetFeatures2KHR(features);
if (robustness2.nullDescriptor && robustness2.robustImageAccess2) { if (robustness2.nullDescriptor && robustness2.robustBufferAccess2 &&
robustness2.robustImageAccess2) {
extensions.push_back(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME); extensions.push_back(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME);
ext_robustness2 = true; ext_robustness2 = true;
} }

View file

@ -23,7 +23,7 @@ enum class FormatType { Linear, Optimal, Buffer };
const u32 GuestWarpSize = 32; const u32 GuestWarpSize = 32;
/// Handles data specific to a physical device. /// Handles data specific to a physical device.
class Device final { class Device {
public: public:
explicit Device(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, explicit Device(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface,
const vk::InstanceDispatch& dld); const vk::InstanceDispatch& dld);
@ -227,10 +227,10 @@ public:
private: private:
/// Checks if the physical device is suitable. /// Checks if the physical device is suitable.
void CheckSuitability() const; void CheckSuitability(bool requires_swapchain) const;
/// Loads extensions into a vector and stores available ones in this object. /// Loads extensions into a vector and stores available ones in this object.
std::vector<const char*> LoadExtensions(); std::vector<const char*> LoadExtensions(bool requires_surface);
/// Sets up queue families. /// Sets up queue families.
void SetupFamilies(VkSurfaceKHR surface); void SetupFamilies(VkSurfaceKHR surface);

View file

@ -3,6 +3,7 @@
// Refer to the license.txt file included. // Refer to the license.txt file included.
#include <algorithm> #include <algorithm>
#include <future>
#include <optional> #include <optional>
#include <span> #include <span>
#include <utility> #include <utility>
@ -140,7 +141,10 @@ vk::Instance CreateInstance(const Common::DynamicLibrary& library, vk::InstanceD
VK_VERSION_MAJOR(required_version), VK_VERSION_MINOR(required_version)); VK_VERSION_MAJOR(required_version), VK_VERSION_MINOR(required_version));
throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER); throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER);
} }
vk::Instance instance = vk::Instance::Create(required_version, layers, extensions, dld); vk::Instance instance =
std::async([&] {
return vk::Instance::Create(required_version, layers, extensions, dld);
}).get();
if (!vk::Load(*instance, dld)) { if (!vk::Load(*instance, dld)) {
LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers"); LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers");
throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED);

View file

@ -7,6 +7,8 @@
#include <optional> #include <optional>
#include <vector> #include <vector>
#include <glad/glad.h>
#include "common/alignment.h" #include "common/alignment.h"
#include "common/assert.h" #include "common/assert.h"
#include "common/common_types.h" #include "common/common_types.h"
@ -55,10 +57,24 @@ struct Range {
class MemoryAllocation { class MemoryAllocation {
public: public:
explicit MemoryAllocation(const Device& device_, vk::DeviceMemory memory_, explicit MemoryAllocation(vk::DeviceMemory memory_, VkMemoryPropertyFlags properties,
VkMemoryPropertyFlags properties, u64 allocation_size_, u32 type) u64 allocation_size_, u32 type)
: device{device_}, memory{std::move(memory_)}, allocation_size{allocation_size_}, : memory{std::move(memory_)}, allocation_size{allocation_size_}, property_flags{properties},
property_flags{properties}, shifted_memory_type{1U << type} {} shifted_memory_type{1U << type} {}
#if defined(_WIN32) || defined(__linux__)
~MemoryAllocation() {
if (owning_opengl_handle != 0) {
glDeleteMemoryObjectsEXT(1, &owning_opengl_handle);
}
}
#endif
MemoryAllocation& operator=(const MemoryAllocation&) = delete;
MemoryAllocation(const MemoryAllocation&) = delete;
MemoryAllocation& operator=(MemoryAllocation&&) = delete;
MemoryAllocation(MemoryAllocation&&) = delete;
[[nodiscard]] std::optional<MemoryCommit> Commit(VkDeviceSize size, VkDeviceSize alignment) { [[nodiscard]] std::optional<MemoryCommit> Commit(VkDeviceSize size, VkDeviceSize alignment) {
const std::optional<u64> alloc = FindFreeRegion(size, alignment); const std::optional<u64> alloc = FindFreeRegion(size, alignment);
@ -88,6 +104,31 @@ public:
return memory_mapped_span; return memory_mapped_span;
} }
#ifdef _WIN32
[[nodiscard]] u32 ExportOpenGLHandle() {
if (!owning_opengl_handle) {
glCreateMemoryObjectsEXT(1, &owning_opengl_handle);
glImportMemoryWin32HandleEXT(owning_opengl_handle, allocation_size,
GL_HANDLE_TYPE_OPAQUE_WIN32_EXT,
memory.GetMemoryWin32HandleKHR());
}
return owning_opengl_handle;
}
#elif __linux__
[[nodiscard]] u32 ExportOpenGLHandle() {
if (!owning_opengl_handle) {
glCreateMemoryObjectsEXT(1, &owning_opengl_handle);
glImportMemoryFdEXT(owning_opengl_handle, allocation_size, GL_HANDLE_TYPE_OPAQUE_FD_EXT,
memory.GetMemoryFdKHR());
}
return owning_opengl_handle;
}
#else
[[nodiscard]] u32 ExportOpenGLHandle() {
return 0;
}
#endif
/// Returns whether this allocation is compatible with the arguments. /// Returns whether this allocation is compatible with the arguments.
[[nodiscard]] bool IsCompatible(VkMemoryPropertyFlags flags, u32 type_mask) const { [[nodiscard]] bool IsCompatible(VkMemoryPropertyFlags flags, u32 type_mask) const {
return (flags & property_flags) && (type_mask & shifted_memory_type) != 0; return (flags & property_flags) && (type_mask & shifted_memory_type) != 0;
@ -118,13 +159,15 @@ private:
return candidate; return candidate;
} }
const Device& device; ///< Vulkan device.
const vk::DeviceMemory memory; ///< Vulkan memory allocation handler. const vk::DeviceMemory memory; ///< Vulkan memory allocation handler.
const u64 allocation_size; ///< Size of this allocation. const u64 allocation_size; ///< Size of this allocation.
const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags. const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags.
const u32 shifted_memory_type; ///< Shifted Vulkan memory type. const u32 shifted_memory_type; ///< Shifted Vulkan memory type.
std::vector<Range> commits; ///< All commit ranges done from this allocation. std::vector<Range> commits; ///< All commit ranges done from this allocation.
std::span<u8> memory_mapped_span; ///< Memory mapped span. Empty if not queried before. std::span<u8> memory_mapped_span; ///< Memory mapped span. Empty if not queried before.
#if defined(_WIN32) || defined(__linux__)
u32 owning_opengl_handle{}; ///< Owning OpenGL memory object handle.
#endif
}; };
MemoryCommit::MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_, MemoryCommit::MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_,
@ -156,14 +199,19 @@ std::span<u8> MemoryCommit::Map() {
return span; return span;
} }
u32 MemoryCommit::ExportOpenGLHandle() const {
return allocation->ExportOpenGLHandle();
}
void MemoryCommit::Release() { void MemoryCommit::Release() {
if (allocation) { if (allocation) {
allocation->Free(begin); allocation->Free(begin);
} }
} }
MemoryAllocator::MemoryAllocator(const Device& device_) MemoryAllocator::MemoryAllocator(const Device& device_, bool export_allocations_)
: device{device_}, properties{device_.GetPhysical().GetMemoryProperties()} {} : device{device_}, properties{device_.GetPhysical().GetMemoryProperties()},
export_allocations{export_allocations_} {}
MemoryAllocator::~MemoryAllocator() = default; MemoryAllocator::~MemoryAllocator() = default;
@ -196,14 +244,24 @@ MemoryCommit MemoryAllocator::Commit(const vk::Image& image, MemoryUsage usage)
void MemoryAllocator::AllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) { void MemoryAllocator::AllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) {
const u32 type = FindType(flags, type_mask).value(); const u32 type = FindType(flags, type_mask).value();
const VkExportMemoryAllocateInfo export_allocate_info{
.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO,
.pNext = nullptr,
#ifdef _WIN32
.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT,
#elif __linux__
.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT,
#else
.handleTypes = 0,
#endif
};
vk::DeviceMemory memory = device.GetLogical().AllocateMemory({ vk::DeviceMemory memory = device.GetLogical().AllocateMemory({
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = nullptr, .pNext = export_allocations ? &export_allocate_info : nullptr,
.allocationSize = size, .allocationSize = size,
.memoryTypeIndex = type, .memoryTypeIndex = type,
}); });
allocations.push_back( allocations.push_back(std::make_unique<MemoryAllocation>(std::move(memory), flags, size, type));
std::make_unique<MemoryAllocation>(device, std::move(memory), flags, size, type));
} }
std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements, std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements,

View file

@ -43,6 +43,9 @@ public:
/// It will map the backing allocation if it hasn't been mapped before. /// It will map the backing allocation if it hasn't been mapped before.
std::span<u8> Map(); std::span<u8> Map();
/// Returns an non-owning OpenGL handle, creating one if it doesn't exist.
u32 ExportOpenGLHandle() const;
/// Returns the Vulkan memory handler. /// Returns the Vulkan memory handler.
VkDeviceMemory Memory() const { VkDeviceMemory Memory() const {
return memory; return memory;
@ -67,7 +70,15 @@ private:
/// Allocates and releases memory allocations on demand. /// Allocates and releases memory allocations on demand.
class MemoryAllocator { class MemoryAllocator {
public: public:
explicit MemoryAllocator(const Device& device_); /**
* Construct memory allocator
*
* @param device_ Device to allocate from
* @param export_allocations_ True when allocations have to be exported
*
* @throw vk::Exception on failure
*/
explicit MemoryAllocator(const Device& device_, bool export_allocations_);
~MemoryAllocator(); ~MemoryAllocator();
MemoryAllocator& operator=(const MemoryAllocator&) = delete; MemoryAllocator& operator=(const MemoryAllocator&) = delete;
@ -108,6 +119,7 @@ private:
const Device& device; ///< Device handle. const Device& device; ///< Device handle.
const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties.
const bool export_allocations; ///< True when memory allocations have to be exported.
std::vector<std::unique_ptr<MemoryAllocation>> allocations; ///< Current allocations. std::vector<std::unique_ptr<MemoryAllocation>> allocations; ///< Current allocations.
}; };

View file

@ -168,11 +168,15 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
X(vkFreeCommandBuffers); X(vkFreeCommandBuffers);
X(vkFreeDescriptorSets); X(vkFreeDescriptorSets);
X(vkFreeMemory); X(vkFreeMemory);
X(vkGetBufferMemoryRequirements); X(vkGetBufferMemoryRequirements2);
X(vkGetDeviceQueue); X(vkGetDeviceQueue);
X(vkGetEventStatus); X(vkGetEventStatus);
X(vkGetFenceStatus); X(vkGetFenceStatus);
X(vkGetImageMemoryRequirements); X(vkGetImageMemoryRequirements);
X(vkGetMemoryFdKHR);
#ifdef _WIN32
X(vkGetMemoryWin32HandleKHR);
#endif
X(vkGetQueryPoolResults); X(vkGetQueryPoolResults);
X(vkGetSemaphoreCounterValueKHR); X(vkGetSemaphoreCounterValueKHR);
X(vkMapMemory); X(vkMapMemory);
@ -505,6 +509,32 @@ void ImageView::SetObjectNameEXT(const char* name) const {
SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE_VIEW, name); SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE_VIEW, name);
} }
int DeviceMemory::GetMemoryFdKHR() const {
const VkMemoryGetFdInfoKHR get_fd_info{
.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR,
.pNext = nullptr,
.memory = handle,
.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR,
};
int fd;
Check(dld->vkGetMemoryFdKHR(owner, &get_fd_info, &fd));
return fd;
}
#ifdef _WIN32
HANDLE DeviceMemory::GetMemoryWin32HandleKHR() const {
const VkMemoryGetWin32HandleInfoKHR get_win32_handle_info{
.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR,
.pNext = nullptr,
.memory = handle,
.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR,
};
HANDLE win32_handle;
Check(dld->vkGetMemoryWin32HandleKHR(owner, &get_win32_handle_info, &win32_handle));
return win32_handle;
}
#endif
void DeviceMemory::SetObjectNameEXT(const char* name) const { void DeviceMemory::SetObjectNameEXT(const char* name) const {
SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_DEVICE_MEMORY, name); SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_DEVICE_MEMORY, name);
} }
@ -756,10 +786,20 @@ DeviceMemory Device::AllocateMemory(const VkMemoryAllocateInfo& ai) const {
return DeviceMemory(memory, handle, *dld); return DeviceMemory(memory, handle, *dld);
} }
VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer) const noexcept { VkMemoryRequirements Device::GetBufferMemoryRequirements(VkBuffer buffer,
VkMemoryRequirements requirements; void* pnext) const noexcept {
dld->vkGetBufferMemoryRequirements(handle, buffer, &requirements); const VkBufferMemoryRequirementsInfo2 info{
return requirements; .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
.pNext = nullptr,
.buffer = buffer,
};
VkMemoryRequirements2 requirements{
.sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
.pNext = pnext,
.memoryRequirements{},
};
dld->vkGetBufferMemoryRequirements2(handle, &info, &requirements);
return requirements.memoryRequirements;
} }
VkMemoryRequirements Device::GetImageMemoryRequirements(VkImage image) const noexcept { VkMemoryRequirements Device::GetImageMemoryRequirements(VkImage image) const noexcept {

View file

@ -15,8 +15,19 @@
#include <vector> #include <vector>
#define VK_NO_PROTOTYPES #define VK_NO_PROTOTYPES
#ifdef _WIN32
#define VK_USE_PLATFORM_WIN32_KHR
#endif
#include <vulkan/vulkan.h> #include <vulkan/vulkan.h>
// Sanitize macros
#ifdef CreateEvent
#undef CreateEvent
#endif
#ifdef CreateSemaphore
#undef CreateSemaphore
#endif
#include "common/common_types.h" #include "common/common_types.h"
#ifdef _MSC_VER #ifdef _MSC_VER
@ -174,7 +185,7 @@ struct InstanceDispatch {
}; };
/// Table holding Vulkan device function pointers. /// Table holding Vulkan device function pointers.
struct DeviceDispatch : public InstanceDispatch { struct DeviceDispatch : InstanceDispatch {
PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR{}; PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR{};
PFN_vkAllocateCommandBuffers vkAllocateCommandBuffers{}; PFN_vkAllocateCommandBuffers vkAllocateCommandBuffers{};
PFN_vkAllocateDescriptorSets vkAllocateDescriptorSets{}; PFN_vkAllocateDescriptorSets vkAllocateDescriptorSets{};
@ -272,11 +283,15 @@ struct DeviceDispatch : public InstanceDispatch {
PFN_vkFreeCommandBuffers vkFreeCommandBuffers{}; PFN_vkFreeCommandBuffers vkFreeCommandBuffers{};
PFN_vkFreeDescriptorSets vkFreeDescriptorSets{}; PFN_vkFreeDescriptorSets vkFreeDescriptorSets{};
PFN_vkFreeMemory vkFreeMemory{}; PFN_vkFreeMemory vkFreeMemory{};
PFN_vkGetBufferMemoryRequirements vkGetBufferMemoryRequirements{}; PFN_vkGetBufferMemoryRequirements2 vkGetBufferMemoryRequirements2{};
PFN_vkGetDeviceQueue vkGetDeviceQueue{}; PFN_vkGetDeviceQueue vkGetDeviceQueue{};
PFN_vkGetEventStatus vkGetEventStatus{}; PFN_vkGetEventStatus vkGetEventStatus{};
PFN_vkGetFenceStatus vkGetFenceStatus{}; PFN_vkGetFenceStatus vkGetFenceStatus{};
PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements{}; PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements{};
PFN_vkGetMemoryFdKHR vkGetMemoryFdKHR{};
#ifdef _WIN32
PFN_vkGetMemoryWin32HandleKHR vkGetMemoryWin32HandleKHR{};
#endif
PFN_vkGetQueryPoolResults vkGetQueryPoolResults{}; PFN_vkGetQueryPoolResults vkGetQueryPoolResults{};
PFN_vkGetSemaphoreCounterValueKHR vkGetSemaphoreCounterValueKHR{}; PFN_vkGetSemaphoreCounterValueKHR vkGetSemaphoreCounterValueKHR{};
PFN_vkMapMemory vkMapMemory{}; PFN_vkMapMemory vkMapMemory{};
@ -344,6 +359,9 @@ public:
/// Construct an empty handle. /// Construct an empty handle.
Handle() = default; Handle() = default;
/// Construct an empty handle.
Handle(std::nullptr_t) {}
/// Copying Vulkan objects is not supported and will never be. /// Copying Vulkan objects is not supported and will never be.
Handle(const Handle&) = delete; Handle(const Handle&) = delete;
Handle& operator=(const Handle&) = delete; Handle& operator=(const Handle&) = delete;
@ -659,6 +677,12 @@ class DeviceMemory : public Handle<VkDeviceMemory, VkDevice, DeviceDispatch> {
using Handle<VkDeviceMemory, VkDevice, DeviceDispatch>::Handle; using Handle<VkDeviceMemory, VkDevice, DeviceDispatch>::Handle;
public: public:
int GetMemoryFdKHR() const;
#ifdef _WIN32
HANDLE GetMemoryWin32HandleKHR() const;
#endif
/// Set object name. /// Set object name.
void SetObjectNameEXT(const char* name) const; void SetObjectNameEXT(const char* name) const;
@ -847,7 +871,8 @@ public:
DeviceMemory AllocateMemory(const VkMemoryAllocateInfo& ai) const; DeviceMemory AllocateMemory(const VkMemoryAllocateInfo& ai) const;
VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer) const noexcept; VkMemoryRequirements GetBufferMemoryRequirements(VkBuffer buffer,
void* pnext = nullptr) const noexcept;
VkMemoryRequirements GetImageMemoryRequirements(VkImage image) const noexcept; VkMemoryRequirements GetImageMemoryRequirements(VkImage image) const noexcept;
@ -1031,6 +1056,12 @@ public:
PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, {}, {}, {}); PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, {}, {}, {});
} }
void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask,
VkDependencyFlags dependency_flags,
const VkMemoryBarrier& memory_barrier) const noexcept {
PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, memory_barrier, {}, {});
}
void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask,
VkDependencyFlags dependency_flags, VkDependencyFlags dependency_flags,
const VkBufferMemoryBarrier& buffer_barrier) const noexcept { const VkBufferMemoryBarrier& buffer_barrier) const noexcept {

View file

@ -64,7 +64,7 @@ void EmuThread::run() {
emit LoadProgress(VideoCore::LoadCallbackStage::Prepare, 0, 0); emit LoadProgress(VideoCore::LoadCallbackStage::Prepare, 0, 0);
system.Renderer().Rasterizer().LoadDiskResources( system.Renderer().ReadRasterizer()->LoadDiskResources(
system.CurrentProcess()->GetTitleID(), stop_run, system.CurrentProcess()->GetTitleID(), stop_run,
[this](VideoCore::LoadCallbackStage stage, std::size_t value, std::size_t total) { [this](VideoCore::LoadCallbackStage stage, std::size_t value, std::size_t total) {
emit LoadProgress(stage, value, total); emit LoadProgress(stage, value, total);

View file

@ -782,14 +782,14 @@ void Config::ReadRendererValues() {
ReadSettingGlobal(Settings::values.frame_limit, QStringLiteral("frame_limit"), 100); ReadSettingGlobal(Settings::values.frame_limit, QStringLiteral("frame_limit"), 100);
ReadSettingGlobal(Settings::values.use_disk_shader_cache, ReadSettingGlobal(Settings::values.use_disk_shader_cache,
QStringLiteral("use_disk_shader_cache"), true); QStringLiteral("use_disk_shader_cache"), true);
ReadSettingGlobal(Settings::values.gpu_accuracy, QStringLiteral("gpu_accuracy"), 0); ReadSettingGlobal(Settings::values.gpu_accuracy, QStringLiteral("gpu_accuracy"), 1);
ReadSettingGlobal(Settings::values.use_asynchronous_gpu_emulation, ReadSettingGlobal(Settings::values.use_asynchronous_gpu_emulation,
QStringLiteral("use_asynchronous_gpu_emulation"), true); QStringLiteral("use_asynchronous_gpu_emulation"), true);
ReadSettingGlobal(Settings::values.use_nvdec_emulation, QStringLiteral("use_nvdec_emulation"), ReadSettingGlobal(Settings::values.use_nvdec_emulation, QStringLiteral("use_nvdec_emulation"),
true); true);
ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true); ReadSettingGlobal(Settings::values.use_vsync, QStringLiteral("use_vsync"), true);
ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"), ReadSettingGlobal(Settings::values.use_assembly_shaders, QStringLiteral("use_assembly_shaders"),
true); false);
ReadSettingGlobal(Settings::values.use_asynchronous_shaders, ReadSettingGlobal(Settings::values.use_asynchronous_shaders,
QStringLiteral("use_asynchronous_shaders"), false); QStringLiteral("use_asynchronous_shaders"), false);
ReadSettingGlobal(Settings::values.use_fast_gpu_time, QStringLiteral("use_fast_gpu_time"), ReadSettingGlobal(Settings::values.use_fast_gpu_time, QStringLiteral("use_fast_gpu_time"),
@ -1351,14 +1351,14 @@ void Config::SaveRendererValues() {
Settings::values.use_disk_shader_cache, true); Settings::values.use_disk_shader_cache, true);
WriteSettingGlobal(QStringLiteral("gpu_accuracy"), WriteSettingGlobal(QStringLiteral("gpu_accuracy"),
static_cast<int>(Settings::values.gpu_accuracy.GetValue(global)), static_cast<int>(Settings::values.gpu_accuracy.GetValue(global)),
Settings::values.gpu_accuracy.UsingGlobal(), 0); Settings::values.gpu_accuracy.UsingGlobal(), 1);
WriteSettingGlobal(QStringLiteral("use_asynchronous_gpu_emulation"), WriteSettingGlobal(QStringLiteral("use_asynchronous_gpu_emulation"),
Settings::values.use_asynchronous_gpu_emulation, true); Settings::values.use_asynchronous_gpu_emulation, true);
WriteSettingGlobal(QStringLiteral("use_nvdec_emulation"), Settings::values.use_nvdec_emulation, WriteSettingGlobal(QStringLiteral("use_nvdec_emulation"), Settings::values.use_nvdec_emulation,
true); true);
WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true); WriteSettingGlobal(QStringLiteral("use_vsync"), Settings::values.use_vsync, true);
WriteSettingGlobal(QStringLiteral("use_assembly_shaders"), WriteSettingGlobal(QStringLiteral("use_assembly_shaders"),
Settings::values.use_assembly_shaders, true); Settings::values.use_assembly_shaders, false);
WriteSettingGlobal(QStringLiteral("use_asynchronous_shaders"), WriteSettingGlobal(QStringLiteral("use_asynchronous_shaders"),
Settings::values.use_asynchronous_shaders, false); Settings::values.use_asynchronous_shaders, false);
WriteSettingGlobal(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time, WriteSettingGlobal(QStringLiteral("use_fast_gpu_time"), Settings::values.use_fast_gpu_time,

View file

@ -2,6 +2,9 @@
// Licensed under GPLv2 or any later version // Licensed under GPLv2 or any later version
// Refer to the license.txt file included. // Refer to the license.txt file included.
// Include this early to include Vulkan headers how we want to
#include "video_core/vulkan_common/vulkan_wrapper.h"
#include <QColorDialog> #include <QColorDialog>
#include <QComboBox> #include <QComboBox>
#include <QVulkanInstance> #include <QVulkanInstance>
@ -11,7 +14,8 @@
#include "core/core.h" #include "core/core.h"
#include "core/settings.h" #include "core/settings.h"
#include "ui_configure_graphics.h" #include "ui_configure_graphics.h"
#include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/vulkan_common/vulkan_instance.h"
#include "video_core/vulkan_common/vulkan_library.h"
#include "yuzu/configuration/configuration_shared.h" #include "yuzu/configuration/configuration_shared.h"
#include "yuzu/configuration/configure_graphics.h" #include "yuzu/configuration/configure_graphics.h"
@ -212,11 +216,23 @@ void ConfigureGraphics::UpdateDeviceComboBox() {
ui->device->setEnabled(enabled && !Core::System::GetInstance().IsPoweredOn()); ui->device->setEnabled(enabled && !Core::System::GetInstance().IsPoweredOn());
} }
void ConfigureGraphics::RetrieveVulkanDevices() { void ConfigureGraphics::RetrieveVulkanDevices() try {
using namespace Vulkan;
vk::InstanceDispatch dld;
const Common::DynamicLibrary library = OpenLibrary();
const vk::Instance instance = CreateInstance(library, dld, VK_API_VERSION_1_0);
const std::vector<VkPhysicalDevice> physical_devices = instance.EnumeratePhysicalDevices();
vulkan_devices.clear(); vulkan_devices.clear();
for (const auto& name : Vulkan::RendererVulkan::EnumerateDevices()) { vulkan_devices.reserve(physical_devices.size());
for (const VkPhysicalDevice device : physical_devices) {
const char* const name = vk::PhysicalDevice(device, dld).GetProperties().deviceName;
vulkan_devices.push_back(QString::fromStdString(name)); vulkan_devices.push_back(QString::fromStdString(name));
} }
} catch (const Vulkan::vk::Exception& exception) {
LOG_ERROR(Frontend, "Failed to enumerate devices with error: {}", exception.what());
} }
Settings::RendererBackend ConfigureGraphics::GetCurrentGraphicsBackend() const { Settings::RendererBackend ConfigureGraphics::GetCurrentGraphicsBackend() const {

View file

@ -388,7 +388,7 @@ void Config::ReadValues() {
static_cast<u16>(sdl2_config->GetInteger("Renderer", "frame_limit", 100))); static_cast<u16>(sdl2_config->GetInteger("Renderer", "frame_limit", 100)));
Settings::values.use_disk_shader_cache.SetValue( Settings::values.use_disk_shader_cache.SetValue(
sdl2_config->GetBoolean("Renderer", "use_disk_shader_cache", false)); sdl2_config->GetBoolean("Renderer", "use_disk_shader_cache", false));
const int gpu_accuracy_level = sdl2_config->GetInteger("Renderer", "gpu_accuracy", 0); const int gpu_accuracy_level = sdl2_config->GetInteger("Renderer", "gpu_accuracy", 1);
Settings::values.gpu_accuracy.SetValue(static_cast<Settings::GPUAccuracy>(gpu_accuracy_level)); Settings::values.gpu_accuracy.SetValue(static_cast<Settings::GPUAccuracy>(gpu_accuracy_level));
Settings::values.use_asynchronous_gpu_emulation.SetValue( Settings::values.use_asynchronous_gpu_emulation.SetValue(
sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", true)); sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", true));

View file

@ -215,7 +215,7 @@ int main(int argc, char** argv) {
// Core is loaded, start the GPU (makes the GPU contexts current to this thread) // Core is loaded, start the GPU (makes the GPU contexts current to this thread)
system.GPU().Start(); system.GPU().Start();
system.Renderer().Rasterizer().LoadDiskResources( system.Renderer().ReadRasterizer()->LoadDiskResources(
system.CurrentProcess()->GetTitleID(), false, system.CurrentProcess()->GetTitleID(), false,
[](VideoCore::LoadCallbackStage, size_t value, size_t total) {}); [](VideoCore::LoadCallbackStage, size_t value, size_t total) {});