diff --git a/Ryujinx.Graphics.GAL/Capabilities.cs b/Ryujinx.Graphics.GAL/Capabilities.cs
index c7cedb34b..4e5dff596 100644
--- a/Ryujinx.Graphics.GAL/Capabilities.cs
+++ b/Ryujinx.Graphics.GAL/Capabilities.cs
@@ -2,30 +2,32 @@ namespace Ryujinx.Graphics.GAL
{
public struct Capabilities
{
- public bool HasFrontFacingBug { get; }
- public bool HasVectorIndexingBug { get; }
+ public readonly bool HasFrontFacingBug;
+ public readonly bool HasVectorIndexingBug;
- public bool SupportsAstcCompression { get; }
- public bool SupportsBgraFormat { get; }
- public bool SupportsR4G4Format { get; }
- public bool SupportsFragmentShaderInterlock { get; }
- public bool SupportsFragmentShaderOrderingIntel { get; }
- public bool SupportsImageLoadFormatted { get; }
- public bool SupportsMismatchingViewFormat { get; }
- public bool SupportsNonConstantTextureOffset { get; }
- public bool SupportsShaderBallot { get; }
- public bool SupportsTextureShadowLod { get; }
- public bool SupportsViewportSwizzle { get; }
- public bool SupportsIndirectParameters { get; }
+ public readonly bool SupportsAstcCompression;
+ public readonly bool Supports3DTextureCompression;
+ public readonly bool SupportsBgraFormat;
+ public readonly bool SupportsR4G4Format;
+ public readonly bool SupportsFragmentShaderInterlock;
+ public readonly bool SupportsFragmentShaderOrderingIntel;
+ public readonly bool SupportsImageLoadFormatted;
+ public readonly bool SupportsMismatchingViewFormat;
+ public readonly bool SupportsNonConstantTextureOffset;
+ public readonly bool SupportsShaderBallot;
+ public readonly bool SupportsTextureShadowLod;
+ public readonly bool SupportsViewportSwizzle;
+ public readonly bool SupportsIndirectParameters;
- public int MaximumComputeSharedMemorySize { get; }
- public float MaximumSupportedAnisotropy { get; }
- public int StorageBufferOffsetAlignment { get; }
+ public readonly int MaximumComputeSharedMemorySize;
+ public readonly float MaximumSupportedAnisotropy;
+ public readonly int StorageBufferOffsetAlignment;
public Capabilities(
bool hasFrontFacingBug,
bool hasVectorIndexingBug,
bool supportsAstcCompression,
+ bool supports3DTextureCompression,
bool supportsBgraFormat,
bool supportsR4G4Format,
bool supportsFragmentShaderInterlock,
@@ -44,6 +46,7 @@ namespace Ryujinx.Graphics.GAL
HasFrontFacingBug = hasFrontFacingBug;
HasVectorIndexingBug = hasVectorIndexingBug;
SupportsAstcCompression = supportsAstcCompression;
+ Supports3DTextureCompression = supports3DTextureCompression;
SupportsBgraFormat = supportsBgraFormat;
SupportsR4G4Format = supportsR4G4Format;
SupportsFragmentShaderInterlock = supportsFragmentShaderInterlock;
diff --git a/Ryujinx.Graphics.GAL/Format.cs b/Ryujinx.Graphics.GAL/Format.cs
index d5e183baf..a454413bf 100644
--- a/Ryujinx.Graphics.GAL/Format.cs
+++ b/Ryujinx.Graphics.GAL/Format.cs
@@ -67,11 +67,9 @@ namespace Ryujinx.Graphics.GAL
R10G10B10A2Uint,
R11G11B10Float,
R9G9B9E5Float,
- Bc1RgbUnorm,
Bc1RgbaUnorm,
Bc2Unorm,
Bc3Unorm,
- Bc1RgbSrgb,
Bc1RgbaSrgb,
Bc2Srgb,
Bc3Srgb,
@@ -349,25 +347,5 @@ namespace Ryujinx.Graphics.GAL
{
return format.IsUint() || format.IsSint();
}
-
- ///
- /// Checks if the texture format is a BC4 compressed format.
- ///
- /// Texture format
- /// True if the texture format is a BC4 compressed format, false otherwise
- public static bool IsBc4(this Format format)
- {
- return format == Format.Bc4Unorm || format == Format.Bc4Snorm;
- }
-
- ///
- /// Checks if the texture format is a BC5 compressed format.
- ///
- /// Texture format
- /// True if the texture format is a BC5 compressed format, false otherwise
- public static bool IsBc5(this Format format)
- {
- return format == Format.Bc5Unorm || format == Format.Bc5Snorm;
- }
}
}
\ No newline at end of file
diff --git a/Ryujinx.Graphics.Gpu/GpuContext.cs b/Ryujinx.Graphics.Gpu/GpuContext.cs
index 5c9af3839..ddc95b2c0 100644
--- a/Ryujinx.Graphics.Gpu/GpuContext.cs
+++ b/Ryujinx.Graphics.Gpu/GpuContext.cs
@@ -78,14 +78,27 @@ namespace Ryujinx.Graphics.Gpu
/// <summary>
/// Host hardware capabilities.
/// </summary>
- internal Capabilities Capabilities => _caps.Value;
+ internal ref Capabilities Capabilities // Hands out a reference to the cached struct instead of a copy.
+ {
+ get
+ {
+ if (!_capsLoaded) // First access: query the renderer once and cache the result.
+ {
+ _caps = Renderer.GetCapabilities();
+ _capsLoaded = true;
+ }
+
+ return ref _caps; // NOTE(review): the flag check above is not synchronized, unlike the Lazy wrapper this replaces — confirm access is single-threaded.
+ }
+ }
/// <summary>
/// Event for signalling shader cache loading progress.
/// </summary>
public event Action ShaderCacheStateChanged;
- private readonly Lazy _caps;
+ private bool _capsLoaded;
+ private Capabilities _caps;
private Thread _gpuThread;
///
@@ -110,8 +123,6 @@ namespace Ryujinx.Graphics.Gpu
DeferredActions = new Queue();
PhysicalMemoryRegistry = new ConcurrentDictionary();
-
- _caps = new Lazy(Renderer.GetCapabilities);
}
///
diff --git a/Ryujinx.Graphics.Gpu/Image/Texture.cs b/Ryujinx.Graphics.Gpu/Image/Texture.cs
index b2fa15a25..e1f00606f 100644
--- a/Ryujinx.Graphics.Gpu/Image/Texture.cs
+++ b/Ryujinx.Graphics.Gpu/Image/Texture.cs
@@ -834,13 +834,31 @@ namespace Ryujinx.Graphics.Gpu.Image
{
data = PixelConverter.ConvertR4G4ToR4G4B4A4(data);
}
- else if (Target == Target.Texture3D && Format.IsBc4())
+ else if (!_context.Capabilities.Supports3DTextureCompression && Target == Target.Texture3D)
{
- data = BCnDecoder.DecodeBC4(data, width, height, depth, levels, layers, Info.FormatInfo.Format == Format.Bc4Snorm);
- }
- else if (Target == Target.Texture3D && Format.IsBc5())
- {
- data = BCnDecoder.DecodeBC5(data, width, height, depth, levels, layers, Info.FormatInfo.Format == Format.Bc5Snorm);
+ switch (Format)
+ {
+ case Format.Bc1RgbaSrgb:
+ case Format.Bc1RgbaUnorm:
+ data = BCnDecoder.DecodeBC1(data, width, height, depth, levels, layers);
+ break;
+ case Format.Bc2Srgb:
+ case Format.Bc2Unorm:
+ data = BCnDecoder.DecodeBC2(data, width, height, depth, levels, layers);
+ break;
+ case Format.Bc3Srgb:
+ case Format.Bc3Unorm:
+ data = BCnDecoder.DecodeBC3(data, width, height, depth, levels, layers);
+ break;
+ case Format.Bc4Snorm:
+ case Format.Bc4Unorm:
+ data = BCnDecoder.DecodeBC4(data, width, height, depth, levels, layers, Format == Format.Bc4Snorm);
+ break;
+ case Format.Bc5Snorm:
+ case Format.Bc5Unorm:
+ data = BCnDecoder.DecodeBC5(data, width, height, depth, levels, layers, Format == Format.Bc5Snorm);
+ break;
+ }
}
return data;
diff --git a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs
index 0461a81f7..188e1e090 100644
--- a/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs
+++ b/Ryujinx.Graphics.Gpu/Image/TextureCompatibility.cs
@@ -14,9 +14,6 @@ namespace Ryujinx.Graphics.Gpu.Image
private enum FormatClass
{
Unclassified,
- BCn64,
- BCn128,
- Bc1Rgb,
Bc1Rgba,
Bc2,
Bc3,
@@ -88,13 +85,21 @@ namespace Ryujinx.Graphics.Gpu.Image
return new FormatInfo(Format.R4G4B4A4Unorm, 1, 1, 2, 4);
}
- if (info.Target == Target.Texture3D)
+ if (!caps.Supports3DTextureCompression && info.Target == Target.Texture3D)
{
- // The host API does not support 3D BC4/BC5 compressed formats.
+ // The host API does not support 3D compressed formats.
// We assume software decompression will be done for those textures,
// and so we adjust the format here to match the decompressor output.
switch (info.FormatInfo.Format)
{
+ case Format.Bc1RgbaSrgb:
+ case Format.Bc2Srgb:
+ case Format.Bc3Srgb:
+ return new FormatInfo(Format.R8G8B8A8Srgb, 1, 1, 4, 4);
+ case Format.Bc1RgbaUnorm:
+ case Format.Bc2Unorm:
+ case Format.Bc3Unorm:
+ return new FormatInfo(Format.R8G8B8A8Unorm, 1, 1, 4, 4);
case Format.Bc4Unorm:
return new FormatInfo(Format.R8Unorm, 1, 1, 1, 1);
case Format.Bc4Snorm:
@@ -749,9 +754,6 @@ namespace Ryujinx.Graphics.Gpu.Image
{
switch (format)
{
- case Format.Bc1RgbSrgb:
- case Format.Bc1RgbUnorm:
- return FormatClass.Bc1Rgb;
case Format.Bc1RgbaSrgb:
case Format.Bc1RgbaUnorm:
return FormatClass.Bc1Rgba;
diff --git a/Ryujinx.Graphics.OpenGL/FormatTable.cs b/Ryujinx.Graphics.OpenGL/FormatTable.cs
index e3249cd6f..41fd9f370 100644
--- a/Ryujinx.Graphics.OpenGL/FormatTable.cs
+++ b/Ryujinx.Graphics.OpenGL/FormatTable.cs
@@ -80,11 +80,9 @@ namespace Ryujinx.Graphics.OpenGL
Add(Format.R10G10B10A2Uint, new FormatInfo(4, false, false, All.Rgb10A2ui, PixelFormat.RgbaInteger, PixelType.UnsignedInt2101010Reversed));
Add(Format.R11G11B10Float, new FormatInfo(3, false, false, All.R11fG11fB10f, PixelFormat.Rgb, PixelType.UnsignedInt10F11F11FRev));
Add(Format.R9G9B9E5Float, new FormatInfo(3, false, false, All.Rgb9E5, PixelFormat.Rgb, PixelType.UnsignedInt5999Rev));
- Add(Format.Bc1RgbUnorm, new FormatInfo(3, true, false, All.CompressedRgbS3tcDxt1Ext));
Add(Format.Bc1RgbaUnorm, new FormatInfo(4, true, false, All.CompressedRgbaS3tcDxt1Ext));
Add(Format.Bc2Unorm, new FormatInfo(4, true, false, All.CompressedRgbaS3tcDxt3Ext));
Add(Format.Bc3Unorm, new FormatInfo(4, true, false, All.CompressedRgbaS3tcDxt5Ext));
- Add(Format.Bc1RgbSrgb, new FormatInfo(3, false, false, All.CompressedSrgbS3tcDxt1Ext));
Add(Format.Bc1RgbaSrgb, new FormatInfo(4, true, false, All.CompressedSrgbAlphaS3tcDxt1Ext));
Add(Format.Bc2Srgb, new FormatInfo(4, false, false, All.CompressedSrgbAlphaS3tcDxt3Ext));
Add(Format.Bc3Srgb, new FormatInfo(4, false, false, All.CompressedSrgbAlphaS3tcDxt5Ext));
diff --git a/Ryujinx.Graphics.OpenGL/Renderer.cs b/Ryujinx.Graphics.OpenGL/Renderer.cs
index ceacbf294..8d44f2e44 100644
--- a/Ryujinx.Graphics.OpenGL/Renderer.cs
+++ b/Ryujinx.Graphics.OpenGL/Renderer.cs
@@ -104,6 +104,7 @@ namespace Ryujinx.Graphics.OpenGL
hasFrontFacingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.IntelWindows,
hasVectorIndexingBug: HwCapabilities.Vendor == HwCapabilities.GpuVendor.AmdWindows,
supportsAstcCompression: HwCapabilities.SupportsAstcCompression,
+ supports3DTextureCompression: false,
supportsBgraFormat: false,
supportsR4G4Format: false,
supportsFragmentShaderInterlock: HwCapabilities.SupportsFragmentShaderInterlock,
diff --git a/Ryujinx.Graphics.Texture/BCnDecoder.cs b/Ryujinx.Graphics.Texture/BCnDecoder.cs
index b8b04bac2..b840cac89 100644
--- a/Ryujinx.Graphics.Texture/BCnDecoder.cs
+++ b/Ryujinx.Graphics.Texture/BCnDecoder.cs
@@ -1,7 +1,9 @@
using Ryujinx.Common;
using System;
-using System.Runtime.CompilerServices;
+using System.Buffers.Binary;
using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
namespace Ryujinx.Graphics.Texture
{
@@ -10,22 +12,30 @@ namespace Ryujinx.Graphics.Texture
private const int BlockWidth = 4;
private const int BlockHeight = 4;
- public static byte[] DecodeBC4(ReadOnlySpan data, int width, int height, int depth, int levels, int layers, bool signed)
+ public static byte[] DecodeBC1(ReadOnlySpan data, int width, int height, int depth, int levels, int layers)
{
int size = 0;
for (int l = 0; l < levels; l++)
{
- size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
+ size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
}
byte[] output = new byte[size];
- ReadOnlySpan data64 = MemoryMarshal.Cast(data);
+ Span tile = stackalloc byte[BlockWidth * BlockHeight * 4];
- Span rPal = stackalloc byte[8];
+ Span tileAsUint = MemoryMarshal.Cast(tile);
+ Span outputAsUint = MemoryMarshal.Cast(output);
- int baseOOffs = 0;
+ Span> tileAsVector128 = MemoryMarshal.Cast>(tile);
+
+ Span> outputLine0 = default;
+ Span> outputLine1 = default;
+ Span> outputLine2 = default;
+ Span> outputLine3 = default;
+
+ int imageBaseOOffs = 0;
for (int l = 0; l < levels; l++)
{
@@ -39,11 +49,302 @@ namespace Ryujinx.Graphics.Texture
for (int y = 0; y < h; y++)
{
int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY);
+ int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+ if (copyHeight == 4)
+ {
+ outputLine0 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width));
+ outputLine2 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width * 2));
+ outputLine3 = MemoryMarshal.Cast>(outputAsUint.Slice(lineBaseOOffs + width * 3));
+ }
for (int x = 0; x < w; x++)
{
int baseX = x * BlockWidth;
- int lineBaseOOffs = baseOOffs + baseX;
+ int copyWidth = Math.Min(BlockWidth, width - baseX);
+
+ BC1DecodeTileRgb(tile, data);
+
+ if ((copyWidth | copyHeight) == 4)
+ {
+ outputLine0[x] = tileAsVector128[0];
+ outputLine1[x] = tileAsVector128[1];
+ outputLine2[x] = tileAsVector128[2];
+ outputLine3[x] = tileAsVector128[3];
+ }
+ else
+ {
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+ for (int tY = 0; tY < copyHeight; tY++)
+ {
+ tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
+ }
+ }
+
+ data = data.Slice(8);
+ }
+ }
+
+ imageBaseOOffs += width * height;
+ }
+ }
+
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+ }
+
+ return output;
+ }
+
+ public static byte[] DecodeBC2(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) // Decodes BC2 blocks into 4 bytes per texel; all mip levels are written back to back.
+ {
+ int size = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; // 4 output bytes per texel.
+ }
+
+ byte[] output = new byte[size];
+
+ Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4]; // Scratch buffer for one decoded 4x4 block.
+
+ Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
+ Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
+
+ Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile); // One 16-byte vector per tile row.
+
+ Span<Vector128<byte>> outputLine0 = default;
+ Span<Vector128<byte>> outputLine1 = default;
+ Span<Vector128<byte>> outputLine2 = default;
+ Span<Vector128<byte>> outputLine3 = default;
+
+ int imageBaseOOffs = 0; // Offset, in texels, of the current 2D slice within the output.
+
+ for (int l = 0; l < levels; l++)
+ {
+ int w = BitUtils.DivRoundUp(width, BlockWidth);
+ int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ for (int y = 0; y < h; y++)
+ {
+ int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY);
+ int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+ if (copyHeight == 4) // Row views are only needed (and only valid) when all four tile rows fit.
+ {
+ outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
+ outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
+ outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
+ }
+
+ for (int x = 0; x < w; x++)
+ {
+ int baseX = x * BlockWidth;
+ int copyWidth = Math.Min(BlockWidth, width - baseX);
+
+ BC23DecodeTileRgb(tile, data.Slice(8)); // The color part is the second half of the 16-byte block.
+
+ ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data); // First half: sixteen 4-bit alpha values.
+
+ for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, block >>= 4)
+ {
+ tile[i] = (byte)((block & 0xf) | (block << 4)); // Replicate the nibble into both halves of the alpha byte.
+ }
+
+ if ((copyWidth | copyHeight) == 4) // Both values are in 1..4, so the OR is 4 only for a full 4x4 tile.
+ {
+ outputLine0[x] = tileAsVector128[0];
+ outputLine1[x] = tileAsVector128[1];
+ outputLine2[x] = tileAsVector128[2];
+ outputLine3[x] = tileAsVector128[3];
+ }
+ else
+ {
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+ for (int tY = 0; tY < copyHeight; tY++)
+ {
+ tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); // Partial tile: copy only the texels inside the image.
+ }
+ }
+
+ data = data.Slice(16);
+ }
+ }
+
+ imageBaseOOffs += width * height;
+ }
+ }
+
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+ }
+
+ return output;
+ }
+
+ public static byte[] DecodeBC3(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers) // Decodes BC3 blocks into 4 bytes per texel; all mip levels are written back to back.
+ {
+ int size = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4; // 4 output bytes per texel.
+ }
+
+ byte[] output = new byte[size];
+
+ Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4]; // Scratch buffer for one decoded 4x4 block.
+ Span<byte> rPal = stackalloc byte[8]; // Eight-entry alpha palette rebuilt for every block.
+
+ Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
+ Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
+
+ Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile); // One 16-byte vector per tile row.
+
+ Span<Vector128<byte>> outputLine0 = default;
+ Span<Vector128<byte>> outputLine1 = default;
+ Span<Vector128<byte>> outputLine2 = default;
+ Span<Vector128<byte>> outputLine3 = default;
+
+ int imageBaseOOffs = 0; // Offset, in texels, of the current 2D slice within the output.
+
+ for (int l = 0; l < levels; l++)
+ {
+ int w = BitUtils.DivRoundUp(width, BlockWidth);
+ int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ for (int y = 0; y < h; y++)
+ {
+ int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY);
+ int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+ if (copyHeight == 4) // Row views are only needed (and only valid) when all four tile rows fit.
+ {
+ outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width));
+ outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 2));
+ outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint.Slice(lineBaseOOffs + width * 3));
+ }
+
+ for (int x = 0; x < w; x++)
+ {
+ int baseX = x * BlockWidth;
+ int copyWidth = Math.Min(BlockWidth, width - baseX);
+
+ BC23DecodeTileRgb(tile, data.Slice(8)); // The color part is the second half of the 16-byte block.
+
+ ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data); // First half: two alpha endpoints followed by the 3-bit indices.
+
+ rPal[0] = (byte)block;
+ rPal[1] = (byte)(block >> 8);
+
+ BCnLerpAlphaUnorm(rPal);
+ BCnDecodeTileAlphaRgba(tile, rPal, block >> 16); // Remaining 48 bits hold the sixteen palette indices.
+
+ if ((copyWidth | copyHeight) == 4) // Both values are in 1..4, so the OR is 4 only for a full 4x4 tile.
+ {
+ outputLine0[x] = tileAsVector128[0];
+ outputLine1[x] = tileAsVector128[1];
+ outputLine2[x] = tileAsVector128[2];
+ outputLine3[x] = tileAsVector128[3];
+ }
+ else
+ {
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
+
+ for (int tY = 0; tY < copyHeight; tY++)
+ {
+ tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth)); // Partial tile: copy only the texels inside the image.
+ }
+ }
+
+ data = data.Slice(16);
+ }
+ }
+
+ imageBaseOOffs += width * height;
+ }
+ }
+
+ width = Math.Max(1, width >> 1);
+ height = Math.Max(1, height >> 1);
+ depth = Math.Max(1, depth >> 1);
+ }
+
+ return output;
+ }
+
+ public static byte[] DecodeBC4(ReadOnlySpan data, int width, int height, int depth, int levels, int layers, bool signed)
+ {
+ int size = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
+ }
+
+ byte[] output = new byte[size];
+ Span outputSpan = new Span(output);
+
+ ReadOnlySpan data64 = MemoryMarshal.Cast(data);
+
+ Span tile = stackalloc byte[BlockWidth * BlockHeight];
+ Span rPal = stackalloc byte[8];
+
+ Span tileAsUint = MemoryMarshal.Cast(tile);
+
+ Span outputLine0 = default;
+ Span outputLine1 = default;
+ Span outputLine2 = default;
+ Span outputLine3 = default;
+
+ int imageBaseOOffs = 0;
+
+ for (int l = 0; l < levels; l++)
+ {
+ int w = BitUtils.DivRoundUp(width, BlockWidth);
+ int h = BitUtils.DivRoundUp(height, BlockHeight);
+
+ for (int l2 = 0; l2 < layers; l2++)
+ {
+ for (int z = 0; z < depth; z++)
+ {
+ for (int y = 0; y < h; y++)
+ {
+ int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY);
+ int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+ if (copyHeight == 4)
+ {
+ outputLine0 = MemoryMarshal.Cast(outputSpan.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast(outputSpan.Slice(lineBaseOOffs + width));
+ outputLine2 = MemoryMarshal.Cast(outputSpan.Slice(lineBaseOOffs + width * 2));
+ outputLine3 = MemoryMarshal.Cast(outputSpan.Slice(lineBaseOOffs + width * 3));
+ }
+
+ for (int x = 0; x < w; x++)
+ {
+ int baseX = x * BlockWidth;
+ int copyWidth = Math.Min(BlockWidth, width - baseX);
ulong block = data64[0];
@@ -52,45 +353,43 @@ namespace Ryujinx.Graphics.Texture
if (signed)
{
- CalculateBC3AlphaS(rPal);
+ BCnLerpAlphaSnorm(rPal);
}
else
{
- CalculateBC3Alpha(rPal);
+ BCnLerpAlphaUnorm(rPal);
}
- ulong rI = block >> 16;
+ BCnDecodeTileAlpha(tile, rPal, block >> 16);
- for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
+ if ((copyWidth | copyHeight) == 4)
{
- int tX = texel & 3;
- int tY = texel >> 2;
+ outputLine0[x] = tileAsUint[0];
+ outputLine1[x] = tileAsUint[1];
+ outputLine2[x] = tileAsUint[2];
+ outputLine3[x] = tileAsUint[3];
+ }
+ else
+ {
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
- if (baseX + tX >= width || baseY + tY >= height)
+ for (int tY = 0; tY < copyHeight; tY++)
{
- continue;
+ tile.Slice(tY * 4, copyWidth).CopyTo(outputSpan.Slice(pixelBaseOOffs + width * tY, copyWidth));
}
-
- int shift = texel * 3;
-
- byte r = rPal[(int)((rI >> shift) & 7)];
-
- int oOffs = lineBaseOOffs + tY * width + tX;
-
- output[oOffs] = r;
}
data64 = data64.Slice(1);
}
-
- baseOOffs += width * (baseY + BlockHeight > height ? (height & (BlockHeight - 1)) : BlockHeight);
}
+
+ imageBaseOOffs += width * height;
}
}
- width = Math.Max(1, width >> 1);
+ width = Math.Max(1, width >> 1);
height = Math.Max(1, height >> 1);
- depth = Math.Max(1, depth >> 1);
+ depth = Math.Max(1, depth >> 1);
}
return output;
@@ -109,10 +408,22 @@ namespace Ryujinx.Graphics.Texture
ReadOnlySpan data64 = MemoryMarshal.Cast(data);
+ Span rTile = stackalloc byte[BlockWidth * BlockHeight * 2];
+ Span gTile = stackalloc byte[BlockWidth * BlockHeight * 2];
Span rPal = stackalloc byte[8];
Span gPal = stackalloc byte[8];
- int baseOOffs = 0;
+ Span outputAsUshort = MemoryMarshal.Cast(output);
+
+ Span rTileAsUint = MemoryMarshal.Cast(rTile);
+ Span gTileAsUint = MemoryMarshal.Cast(gTile);
+
+ Span outputLine0 = default;
+ Span outputLine1 = default;
+ Span outputLine2 = default;
+ Span outputLine3 = default;
+
+ int imageBaseOOffs = 0;
for (int l = 0; l < levels; l++)
{
@@ -126,11 +437,21 @@ namespace Ryujinx.Graphics.Texture
for (int y = 0; y < h; y++)
{
int baseY = y * BlockHeight;
+ int copyHeight = Math.Min(BlockHeight, height - baseY);
+ int lineBaseOOffs = imageBaseOOffs + baseY * width;
+
+ if (copyHeight == 4)
+ {
+ outputLine0 = MemoryMarshal.Cast(outputAsUshort.Slice(lineBaseOOffs));
+ outputLine1 = MemoryMarshal.Cast(outputAsUshort.Slice(lineBaseOOffs + width));
+ outputLine2 = MemoryMarshal.Cast(outputAsUshort.Slice(lineBaseOOffs + width * 2));
+ outputLine3 = MemoryMarshal.Cast(outputAsUshort.Slice(lineBaseOOffs + width * 3));
+ }
for (int x = 0; x < w; x++)
{
int baseX = x * BlockWidth;
- int lineBaseOOffs = baseOOffs + baseX;
+ int copyWidth = Math.Min(BlockWidth, width - baseX);
ulong blockL = data64[0];
ulong blockH = data64[1];
@@ -142,101 +463,346 @@ namespace Ryujinx.Graphics.Texture
if (signed)
{
- CalculateBC3AlphaS(rPal);
- CalculateBC3AlphaS(gPal);
+ BCnLerpAlphaSnorm(rPal);
+ BCnLerpAlphaSnorm(gPal);
}
else
{
- CalculateBC3Alpha(rPal);
- CalculateBC3Alpha(gPal);
+ BCnLerpAlphaUnorm(rPal);
+ BCnLerpAlphaUnorm(gPal);
}
- ulong rI = blockL >> 16;
- ulong gI = blockH >> 16;
+ BCnDecodeTileAlpha(rTile, rPal, blockL >> 16);
+ BCnDecodeTileAlpha(gTile, gPal, blockH >> 16);
- for (int texel = 0; texel < BlockWidth * BlockHeight; texel++)
+ if ((copyWidth | copyHeight) == 4)
{
- int tX = texel & 3;
- int tY = texel >> 2;
+ outputLine0[x] = InterleaveBytes(rTileAsUint[0], gTileAsUint[0]);
+ outputLine1[x] = InterleaveBytes(rTileAsUint[1], gTileAsUint[1]);
+ outputLine2[x] = InterleaveBytes(rTileAsUint[2], gTileAsUint[2]);
+ outputLine3[x] = InterleaveBytes(rTileAsUint[3], gTileAsUint[3]);
+ }
+ else
+ {
+ int pixelBaseOOffs = lineBaseOOffs + baseX;
- if (baseX + tX >= width || baseY + tY >= height)
+ for (int tY = 0; tY < copyHeight; tY++)
{
- continue;
+ int line = pixelBaseOOffs + width * tY;
+
+ for (int tX = 0; tX < copyWidth; tX++)
+ {
+ int texel = tY * BlockWidth + tX;
+
+ outputAsUshort[line + tX] = (ushort)(rTile[texel] | (gTile[texel] << 8));
+ }
}
-
- int shift = texel * 3;
-
- byte r = rPal[(int)((rI >> shift) & 7)];
- byte g = gPal[(int)((gI >> shift) & 7)];
-
- int oOffs = (lineBaseOOffs + tY * width + tX) * 2;
-
- output[oOffs + 0] = r;
- output[oOffs + 1] = g;
}
data64 = data64.Slice(2);
}
-
- baseOOffs += width * (baseY + BlockHeight > height ? (height & (BlockHeight - 1)) : BlockHeight);
}
+
+ imageBaseOOffs += width * height;
}
}
- width = Math.Max(1, width >> 1);
+ width = Math.Max(1, width >> 1);
height = Math.Max(1, height >> 1);
- depth = Math.Max(1, depth >> 1);
+ depth = Math.Max(1, depth >> 1);
}
return output;
}
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static void CalculateBC3Alpha(Span alpha)
+ private static ulong InterleaveBytes(uint left, uint right) // Zips two 4-byte values: bytes of left land in even output bytes, bytes of right in odd ones.
{
- for (int i = 2; i < 8; i++)
+ return InterleaveBytesWithZeros(left) | (InterleaveBytesWithZeros(right) << 8); // The << 8 moves the spread bytes of right into the odd positions.
+ }
+
+ private static ulong InterleaveBytesWithZeros(uint value) // Spreads the 4 bytes of value over the even bytes of a ulong, leaving zeros in between.
+ {
+ ulong output = value;
+ output = (output ^ (output << 16)) & 0xffff0000ffffUL; // Move the upper 16 bits up to bits 32..47.
+ output = (output ^ (output << 8)) & 0xff00ff00ff00ffUL; // Then split each 16-bit group into two bytes 16 bits apart.
+ return output;
+ }
+
+ private static void BCnLerpAlphaUnorm(Span<byte> alpha) // Fills alpha[2..7] from the two unsigned endpoints already stored in alpha[0] and alpha[1].
+ {
+ byte a0 = alpha[0];
+ byte a1 = alpha[1];
+
+ if (a0 > a1) // Six interpolated entries, weighted in sevenths.
{
- if (alpha[0] > alpha[1])
+ alpha[2] = (byte)((6 * a0 + 1 * a1) / 7);
+ alpha[3] = (byte)((5 * a0 + 2 * a1) / 7);
+ alpha[4] = (byte)((4 * a0 + 3 * a1) / 7);
+ alpha[5] = (byte)((3 * a0 + 4 * a1) / 7);
+ alpha[6] = (byte)((2 * a0 + 5 * a1) / 7);
+ alpha[7] = (byte)((1 * a0 + 6 * a1) / 7);
+ }
+ else // Four interpolated entries (fifths) plus the fixed extremes 0 and 0xff.
+ {
+ alpha[2] = (byte)((4 * a0 + 1 * a1) / 5);
+ alpha[3] = (byte)((3 * a0 + 2 * a1) / 5);
+ alpha[4] = (byte)((2 * a0 + 3 * a1) / 5);
+ alpha[5] = (byte)((1 * a0 + 4 * a1) / 5);
+ alpha[6] = 0;
+ alpha[7] = 0xff;
+ }
+ }
+
+ private static void BCnLerpAlphaSnorm(Span<byte> alpha) // Signed variant: alpha[0] and alpha[1] are reinterpreted as sbyte endpoints, alpha[2..7] are filled in.
+ {
+ sbyte a0 = (sbyte)alpha[0];
+ sbyte a1 = (sbyte)alpha[1];
+
+ if (a0 > a1) // Six interpolated entries, weighted in sevenths; results are stored as their two's-complement byte.
+ {
+ alpha[2] = (byte)((6 * a0 + 1 * a1) / 7);
+ alpha[3] = (byte)((5 * a0 + 2 * a1) / 7);
+ alpha[4] = (byte)((4 * a0 + 3 * a1) / 7);
+ alpha[5] = (byte)((3 * a0 + 4 * a1) / 7);
+ alpha[6] = (byte)((2 * a0 + 5 * a1) / 7);
+ alpha[7] = (byte)((1 * a0 + 6 * a1) / 7);
+ }
+ else // Four interpolated entries (fifths) plus the fixed extremes 0x80 and 0x7f.
+ {
+ alpha[2] = (byte)((4 * a0 + 1 * a1) / 5);
+ alpha[3] = (byte)((3 * a0 + 2 * a1) / 5);
+ alpha[4] = (byte)((2 * a0 + 3 * a1) / 5);
+ alpha[5] = (byte)((1 * a0 + 4 * a1) / 5);
+ alpha[6] = 0x80;
+ alpha[7] = 0x7f;
+ }
+ }
+
+ private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI) // Writes 16 bytes to output: one rPal entry per texel, picked by the 3-bit indices packed in rI.
+ {
+ if (Avx2.IsSupported)
+ {
+ Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output);
+
+ Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
+ Vector128<uint> masks = Vector128.Create(7u);
+
+ Vector128<byte> vClut;
+
+ fixed (byte* pRPal = rPal)
{
- alpha[i] = (byte)(((8 - i) * alpha[0] + (i - 1) * alpha[1]) / 7);
+ vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte(); // The 8 palette bytes go into the low half of the vector.
}
- else if (i < 6)
+
+ Vector128<uint> indices0 = Vector128.Create((uint)rI); // Bits 0..31 cover texels 0..7.
+ Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24)); // Bits 24.. cover texels 8..15.
+ Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
+ Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
+ Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12); // 12 more bits down yields the next four indices.
+ Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12);
+ indices00 = Sse2.And(indices00, masks);
+ indices10 = Sse2.And(indices10, masks);
+ indices01 = Sse2.And(indices01, masks);
+ indices11 = Sse2.And(indices11, masks);
+
+ Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32());
+ Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32());
+
+ Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16()); // Sixteen byte-sized indices in texel order.
+
+ outputAsVector128[0] = Ssse3.Shuffle(vClut, indices); // Byte shuffle performs the palette lookup.
+ }
+ else
+ {
+ for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
{
- alpha[i] = (byte)(((6 - i) * alpha[0] + (i - 1) * alpha[1]) / 7);
- }
- else if (i == 6)
- {
- alpha[i] = 0;
- }
- else /* i == 7 */
- {
- alpha[i] = 0xff;
+ output[i] = rPal[(int)(rI & 7)];
}
}
}
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- private static void CalculateBC3AlphaS(Span alpha)
+ private unsafe static void BCnDecodeTileAlphaRgba(Span<byte> output, Span<byte> rPal, ulong rI) // Sets byte 3 of each 4-byte texel in output from rPal, using the 3-bit indices packed in rI.
{
- for (int i = 2; i < 8; i++)
+ if (Avx2.IsSupported)
{
- if ((sbyte)alpha[0] > (sbyte)alpha[1])
+ Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);
+
+ Vector256<uint> shifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u);
+
+ Vector128<uint> vClut128;
+
+ fixed (byte* pRPal = rPal)
{
- alpha[i] = (byte)(((8 - i) * (sbyte)alpha[0] + (i - 1) * (sbyte)alpha[1]) / 7);
+ vClut128 = Sse2.LoadScalarVector128((ulong*)pRPal).AsUInt32();
}
- else if (i < 6)
+
+ Vector256<uint> vClut = Avx2.ConvertToVector256Int32(vClut128.AsByte()).AsUInt32(); // Widen each palette byte to its own 32-bit lane.
+ vClut = Avx2.ShiftLeftLogical(vClut, 24); // Move the value into the top byte of the lane.
+
+ Vector256<uint> indices0 = Vector256.Create((uint)rI); // Bits 0..31 cover texels 0..7.
+ Vector256<uint> indices1 = Vector256.Create((uint)(rI >> 24)); // Bits 24.. cover texels 8..15.
+
+ indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
+ indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
+
+ outputAsVector256[0] = Avx2.Or(outputAsVector256[0], Avx2.PermuteVar8x32(vClut, indices0)); // ORs into the existing texels, so their top byte must already be zero.
+ outputAsVector256[1] = Avx2.Or(outputAsVector256[1], Avx2.PermuteVar8x32(vClut, indices1));
+ }
+ else
+ {
+ for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, rI >>= 3)
{
- alpha[i] = (byte)(((6 - i) * (sbyte)alpha[0] + (i - 1) * (sbyte)alpha[1]) / 7);
- }
- else if (i == 6)
- {
- alpha[i] = 0x80;
- }
- else /* i == 7 */
- {
- alpha[i] = 0x7f;
+ output[i] = rPal[(int)(rI & 7)];
}
}
}
+
+ private unsafe static void BC1DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input) // Decodes one 8-byte BC1 block from input into a 4x4 tile of 4-byte texels in output.
+ {
+ Span<uint> clut = stackalloc uint[4]; // Four-entry color palette for this block.
+
+ uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input); // Both 16-bit endpoints in one read.
+ uint c0 = (ushort)c0c1;
+ uint c1 = (ushort)(c0c1 >> 16);
+
+ clut[0] = ConvertRgb565ToRgb888(c0) | 0xff000000; // Endpoints are always opaque.
+ clut[1] = ConvertRgb565ToRgb888(c1) | 0xff000000;
+ clut[2] = BC1LerpRgb2(clut[0], clut[1], c0, c1);
+ clut[3] = BC1LerpRgb3(clut[0], clut[1], c0, c1);
+
+ BCnDecodeTileRgb(clut, output, input);
+ }
+
+ private unsafe static void BC23DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input) // Decodes the 8-byte color part of a BC2/BC3 block; the top byte of every texel is left at zero.
+ {
+ Span<uint> clut = stackalloc uint[4]; // Four-entry color palette for this block.
+
+ uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input); // Both 16-bit endpoints in one read.
+ uint c0 = (ushort)c0c1;
+ uint c1 = (ushort)(c0c1 >> 16);
+
+ clut[0] = ConvertRgb565ToRgb888(c0);
+ clut[1] = ConvertRgb565ToRgb888(c1);
+ clut[2] = BC23LerpRgb2(clut[0], clut[1]); // Unlike BC1, both blends are used regardless of endpoint order.
+ clut[3] = BC23LerpRgb3(clut[0], clut[1]);
+
+ BCnDecodeTileRgb(clut, output, input);
+ }
+
+ private unsafe static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input) // Expands the sixteen 2-bit indices at input[4..7] into sixteen clut entries written to output.
+ {
+ if (Avx2.IsSupported)
+ {
+ Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);
+
+ Vector256<uint> shifts0 = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u); // Texels 0..7.
+ Vector256<uint> shifts1 = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u); // Texels 8..15.
+ Vector256<uint> masks = Vector256.Create(3u);
+
+ Vector256<uint> vClut;
+
+ fixed (uint* pClut = &clut[0])
+ {
+ vClut = Sse2.LoadVector128(pClut).ToVector256Unsafe(); // Only lanes 0..3 are defined; the masked indices never select higher lanes.
+ }
+
+ Vector256<uint> indices0;
+
+ fixed (byte* pInput = input)
+ {
+ indices0 = Avx2.BroadcastScalarToVector256((uint*)(pInput + 4)); // Skip the two 16-bit endpoints.
+ }
+
+ Vector256<uint> indices1 = indices0;
+
+ indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts0);
+ indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts1);
+ indices0 = Avx2.And(indices0, masks);
+ indices1 = Avx2.And(indices1, masks);
+
+ outputAsVector256[0] = Avx2.PermuteVar8x32(vClut, indices0); // Lane permute performs the palette lookup.
+ outputAsVector256[1] = Avx2.PermuteVar8x32(vClut, indices1);
+ }
+ else
+ {
+ Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
+
+ uint indices = BinaryPrimitives.ReadUInt32LittleEndian(input.Slice(4));
+
+ for (int i = 0; i < BlockWidth * BlockHeight; i++, indices >>= 2)
+ {
+ outputAsUint[i] = clut[(int)(indices & 3)];
+ }
+ }
+ }
+
+ private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1) // Palette entry 2 for BC1; c0/c1 are the raw 16-bit endpoints, color0/color1 their expanded forms.
+ {
+ if (c0 > c1) // Endpoint order selects the mode: here entry 2 is the 2:1 blend, forced opaque.
+ {
+ return BC23LerpRgb2(color0, color1) | 0xff000000;
+ }
+
+ uint carry = color0 & color1; // Otherwise entry 2 is the per-channel average: (a & b) + ((a ^ b) >> 1).
+ uint addHalve = ((color0 ^ color1) >> 1) & 0x7f7f7f; // The mask stops shifted bits from leaking into the channel below.
+ return (addHalve + carry) | 0xff000000;
+ }
+
+ private static uint BC23LerpRgb2(uint color0, uint color1) // Per-channel (2 * color0 + color1) / 3 on the low three bytes; the top byte of the result is zero.
+ {
+ uint r0 = (byte)color0;
+ uint g0 = color0 & 0xff00; // Channels are blended in place, without shifting them down first.
+ uint b0 = color0 & 0xff0000;
+
+ uint r1 = (byte)color1;
+ uint g1 = color1 & 0xff00;
+ uint b1 = color1 & 0xff0000;
+
+ uint mixR = (2 * r0 + r1) / 3;
+ uint mixG = (2 * g0 + g1) / 3;
+ uint mixB = (2 * b0 + b1) / 3;
+
+ return mixR | (mixG & 0xff00) | (mixB & 0xff0000); // Masks drop the fractional bits the in-place division leaves below each channel.
+ }
+
+ private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1) // Palette entry 3 for BC1; c0/c1 are the raw 16-bit endpoints.
+ {
+ if (c0 > c1) // Same mode test as entry 2: the 1:2 blend, forced opaque.
+ {
+ return BC23LerpRgb3(color0, color1) | 0xff000000;
+ }
+
+ return 0; // Otherwise entry 3 has all four bytes zero.
+ }
+
+ private static uint BC23LerpRgb3(uint color0, uint color1) // Per-channel (color0 + 2 * color1) / 3 on the low three bytes; the top byte of the result is zero.
+ {
+ uint r0 = (byte)color0;
+ uint g0 = color0 & 0xff00; // Channels are blended in place, without shifting them down first.
+ uint b0 = color0 & 0xff0000;
+
+ uint r1 = (byte)color1;
+ uint g1 = color1 & 0xff00;
+ uint b1 = color1 & 0xff0000;
+
+ uint mixR = (2 * r1 + r0) / 3;
+ uint mixG = (2 * g1 + g0) / 3;
+ uint mixB = (2 * b1 + b0) / 3;
+
+ return mixR | (mixG & 0xff00) | (mixB & 0xff0000); // Masks drop the fractional bits the in-place division leaves below each channel.
+ }
+
+ private static uint ConvertRgb565ToRgb888(uint value) // Expands a 16-bit 5:6:5 color to three 8-bit channels: red in byte 0, green in byte 1, blue in byte 2.
+ {
+ uint b = (value & 0x1f) << 19; // Low 5 bits go to the top of byte 2.
+ uint g = (value << 5) & 0xfc00; // Middle 6 bits go to the top of byte 1.
+ uint r = (value >> 8) & 0xf8; // High 5 bits go to the top of byte 0.
+
+ b |= b >> 5; // Copy each channel's high bits into its empty low bits so full scale maps to 0xff.
+ g |= g >> 6;
+ r |= r >> 5;
+
+ return r | (g & 0xff00) | (b & 0xff0000); // Masks discard replicated bits that fell below the green and blue bytes.
+ }
}
}
\ No newline at end of file