Enable multithreaded VP9 decoding (#2009)
* Enable multithreaded VP9 decoding * Limit the number of threads used for video decoding
This commit is contained in:
parent
172ec326e5
commit
c465d771dd
6 changed files with 252 additions and 9 deletions
|
@ -1,13 +1,14 @@
|
||||||
using Ryujinx.Common.Memory;
|
using Ryujinx.Common.Memory;
|
||||||
|
using Ryujinx.Graphics.Nvdec.Vp9.Common;
|
||||||
|
using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
|
||||||
|
using Ryujinx.Graphics.Nvdec.Vp9.Types;
|
||||||
|
using Ryujinx.Graphics.Video;
|
||||||
using System;
|
using System;
|
||||||
using System.Buffers.Binary;
|
using System.Buffers.Binary;
|
||||||
using System.Diagnostics;
|
using System.Diagnostics;
|
||||||
using System.Runtime.CompilerServices;
|
using System.Runtime.CompilerServices;
|
||||||
using System.Runtime.InteropServices;
|
using System.Runtime.InteropServices;
|
||||||
using Ryujinx.Graphics.Nvdec.Vp9.Common;
|
using System.Threading.Tasks;
|
||||||
using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
|
|
||||||
using Ryujinx.Graphics.Nvdec.Vp9.Types;
|
|
||||||
using Ryujinx.Graphics.Video;
|
|
||||||
using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv;
|
using Mv = Ryujinx.Graphics.Nvdec.Vp9.Types.Mv;
|
||||||
|
|
||||||
namespace Ryujinx.Graphics.Nvdec.Vp9
|
namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||||
|
@ -1095,6 +1096,19 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||||
data = data.Slice(size);
|
data = data.Slice(size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Fills the first <paramref name="tileCols"/> entries of <paramref name="tileBuffers"/>,
/// one entry per tile column, and tags each entry with its column index.
/// </summary>
/// <param name="cm">Decoder state; its <c>Error</c> field is handed to <c>GetTileBuffer</c>.</param>
/// <param name="data">Bitstream region the tile buffers are read from.</param>
/// <param name="tileCols">Number of tile columns to collect.</param>
/// <param name="tileBuffers">Destination array; entries at index <paramref name="tileCols"/> and above are left untouched.</param>
private static void GetTileBuffers(ref Vp9Common cm, ArrayPtr<byte> data, int tileCols, ref Array64<TileBuffer> tileBuffers)
{
    for (int col = 0; col < tileCols; col++)
    {
        ref TileBuffer tileBuffer = ref tileBuffers[col];

        tileBuffer.Col = col;

        // "data" is passed by reference, so GetTileBuffer can move it forward for the
        // next column (presumably past the tile it just described - confirm in GetTileBuffer).
        GetTileBuffer(col == tileCols - 1, ref cm.Error, ref data, ref tileBuffer);
    }
}
|
||||||
|
|
||||||
private static void GetTileBuffers(
|
private static void GetTileBuffers(
|
||||||
ref Vp9Common cm,
|
ref Vp9Common cm,
|
||||||
ArrayPtr<byte> data,
|
ArrayPtr<byte> data,
|
||||||
|
@ -1181,5 +1195,163 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||||
// Get last tile data.
|
// Get last tile data.
|
||||||
return cm.TileWorkerData[tileCols * tileRows - 1].BitReader.FindEnd();
|
return cm.TileWorkerData[tileCols * tileRows - 1].BitReader.FindEnd();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Decodes, in order, every tile buffer assigned to one worker
/// (indices <c>tileData.BufStart</c> through <c>tileData.BufEnd</c>, inclusive).
/// Decoding stops early as soon as the worker's macroblock state is flagged as corrupted.
/// </summary>
/// <param name="tileData">Per-worker state; its <c>DataEnd</c> is overwritten on exit.</param>
/// <param name="cm">Shared decoder state.</param>
/// <param name="tileBuffers">Tile buffers, indexed by the worker's buffer range.</param>
/// <returns><c>true</c> when no corruption was flagged, <c>false</c> otherwise.</returns>
private static bool DecodeTileCol(ref TileWorkerData tileData, ref Vp9Common cm, ref Array64<TileBuffer> tileBuffers)
{
    ref TileInfo tile = ref tileData.Xd.Tile;
    int lastTileCol = (1 << cm.Log2TileCols) - 1;
    ArrayPtr<byte> lastTileEnd = ArrayPtr<byte>.Null;
    int bufIndex = tileData.BufStart;

    tileData.Xd.Corrupted = false;

    while (true)
    {
        ref TileBuffer tileBuffer = ref tileBuffers[bufIndex];

        Debug.Assert(cm.Log2TileRows == 0);

        // Reset the per-tile state before decoding this buffer.
        tileData.Dqcoeff = new Array32<Array32<int>>();
        tile.Init(ref cm, 0, tileBuffer.Col);
        SetupTokenDecoder(tileBuffer.Data, tileBuffer.Size, ref tileData.ErrorInfo, ref tileData.BitReader);
        cm.InitMacroBlockD(ref tileData.Xd, new ArrayPtr<int>(ref tileData.Dqcoeff[0][0], 32 * 32));
        tileData.Xd.ErrorInfo = new Ptr<InternalErrorInfo>(ref tileData.ErrorInfo);

        for (int row = tile.MiRowStart; row < tile.MiRowEnd; row += Constants.MiBlockSize)
        {
            // The left contexts are cleared at the start of every row of blocks.
            tileData.Xd.LeftContext = new Array3<Array16<sbyte>>();
            tileData.Xd.LeftSegContext = new Array8<sbyte>();

            for (int col = tile.MiColStart; col < tile.MiColEnd; col += Constants.MiBlockSize)
            {
                DecodePartition(ref tileData, ref cm, row, col, BlockSize.Block64x64, 4);
            }
        }

        // Only the buffer belonging to the final tile column yields the end position.
        if (tileBuffer.Col == lastTileCol)
        {
            lastTileEnd = tileData.BitReader.FindEnd();
        }

        if (tileData.Xd.Corrupted)
        {
            break;
        }

        bufIndex++;

        if (bufIndex > tileData.BufEnd)
        {
            break;
        }
    }

    // Stays null unless this worker decoded the final tile column.
    tileData.DataEnd = lastTileEnd;

    return !tileData.Xd.Corrupted;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Decodes all tile columns of a frame using up to <paramref name="maxThreads"/> parallel workers.
/// Only frames with a single tile row are supported (asserted in debug builds).
/// </summary>
/// <param name="cm">Shared decoder state. Worker slots are taken from <c>cm.TileWorkerData</c>
/// starting at index <c>tileCols * tileRows</c>.</param>
/// <param name="data">Bitstream region holding the tile data.</param>
/// <param name="maxThreads">Upper bound on the number of workers; must be at least 1
/// (a value of 0 would divide by zero when the tiles are distributed).</param>
/// <returns>
/// The end position reported by the worker that decoded the final tile column,
/// or a null pointer if no worker reported one.
/// </returns>
/// <remarks>
/// Exceptions thrown by a worker are rethrown unwrapped (first one only), so callers see the
/// same exception types as they would from a sequential decode.
/// </remarks>
public static unsafe ArrayPtr<byte> DecodeTilesMt(ref Vp9Common cm, ArrayPtr<byte> data, int maxThreads)
{
    ArrayPtr<byte> bitReaderEnd = ArrayPtr<byte>.Null;

    int tileCols = 1 << cm.Log2TileCols;
    int tileRows = 1 << cm.Log2TileRows;
    int totalTiles = tileCols * tileRows;
    int numWorkers = Math.Min(maxThreads, tileCols);
    int n;

    Debug.Assert(tileCols <= (1 << 6));
    Debug.Assert(tileRows == 1);

    cm.AboveContext.ToSpan().Fill(0);
    cm.AboveSegContext.ToSpan().Fill(0);

    // Give every worker its own copy of the macroblock state and its own zeroed counters.
    for (n = 0; n < numWorkers; ++n)
    {
        ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles];

        tileData.Xd = cm.Mb;
        tileData.Xd.Counts = new Ptr<Vp9BackwardUpdates>(ref tileData.Counts);
        tileData.Counts = new Vp9BackwardUpdates();
    }

    Array64<TileBuffer> tileBuffers = new Array64<TileBuffer>();

    GetTileBuffers(ref cm, data, tileCols, ref tileBuffers);

    // Sort the used entries by size, largest first.
    tileBuffers.ToSpan().Slice(0, tileCols).Sort(CompareTileBuffers);

    if (numWorkers == tileCols)
    {
        // One tile per worker: rotate the largest tile to the end of the used range.
        // Only the tileCols entries in use are shifted; Span.CopyTo handles the overlap.
        TileBuffer largest = tileBuffers[0];
        Span<TileBuffer> buffers = tileBuffers.ToSpan();

        buffers.Slice(1, tileCols - 1).CopyTo(buffers);
        tileBuffers[tileCols - 1] = largest;
    }
    else
    {
        int start = 0, end = tileCols - 2;
        TileBuffer tmp;

        // Interleave the tiles to distribute the load between threads, assuming a
        // larger tile implies it is more difficult to decode.
        while (start < end)
        {
            tmp = tileBuffers[start];
            tileBuffers[start] = tileBuffers[end];
            tileBuffers[end] = tmp;
            start += 2;
            end -= 2;
        }
    }

    // Split the tile buffers into contiguous ranges, one per worker. When the division is
    // not exact, the last "remain" workers each receive one extra buffer.
    int baseVal = tileCols / numWorkers;
    int remain = tileCols % numWorkers;
    int bufStart = 0;

    for (n = 0; n < numWorkers; ++n)
    {
        int count = baseVal + (remain + n) / numWorkers;
        ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles];

        tileData.BufStart = bufStart;
        tileData.BufEnd = bufStart + count - 1;
        tileData.DataEnd = data.Slice(data.Length);
        bufStart += count;
    }

    Ptr<Vp9Common> cmPtr = new Ptr<Vp9Common>(ref cm);

    try
    {
        Parallel.For(0, numWorkers, (workerIndex) =>
        {
            ref TileWorkerData tileData = ref cmPtr.Value.TileWorkerData[workerIndex + totalTiles];

            if (!DecodeTileCol(ref tileData, ref cmPtr.Value, ref tileBuffers))
            {
                cmPtr.Value.Mb.Corrupted = true;
            }
        });
    }
    catch (AggregateException ex)
    {
        // Parallel.For wraps anything thrown by a worker in an AggregateException, which a
        // caller catching the decoder's own exception types would not match. Rethrow the
        // first worker exception with its original stack trace instead.
        System.Runtime.ExceptionServices.ExceptionDispatchInfo.Capture(ex.Flatten().InnerExceptions[0]).Throw();

        // Not reached: Throw() never returns.
        throw;
    }

    // n still equals numWorkers here. Scan the workers from last to first and keep the first
    // non-null end position found.
    for (; n > 0; --n)
    {
        if (bitReaderEnd.IsNull)
        {
            ref TileWorkerData tileData = ref cm.TileWorkerData[n - 1 + totalTiles];

            bitReaderEnd = tileData.DataEnd;
        }
    }

    // Merge the per-worker counters into the frame-wide counters.
    for (n = 0; n < numWorkers; ++n)
    {
        ref TileWorkerData tileData = ref cm.TileWorkerData[n + totalTiles];

        AccumulateFrameCounts(ref cm.Counts.Value, ref tileData.Counts);
    }

    Debug.Assert(!bitReaderEnd.IsNull || cm.Mb.Corrupted);

    return bitReaderEnd;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Comparison used to order tile buffers by <c>Size</c>, largest first.
/// </summary>
/// <returns>
/// 1 when <paramref name="bufA"/> is smaller than <paramref name="bufB"/>,
/// -1 when it is larger, and 0 when both sizes are equal.
/// </returns>
private static int CompareTileBuffers(TileBuffer bufA, TileBuffer bufB)
{
    if (bufA.Size < bufB.Size)
    {
        return 1;
    }

    if (bufA.Size > bufB.Size)
    {
        return -1;
    }

    return 0;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Adds every counter of <paramref name="counts"/> to the matching counter of
/// <paramref name="accum"/>. Both structures are reinterpreted as flat sequences of
/// <see cref="uint"/> values and summed element by element.
/// </summary>
/// <param name="accum">Running totals; modified in place.</param>
/// <param name="counts">Counters to add; only read.</param>
private static void AccumulateFrameCounts(ref Vp9BackwardUpdates accum, ref Vp9BackwardUpdates counts)
{
    Span<uint> totals = MemoryMarshal.Cast<Vp9BackwardUpdates, uint>(MemoryMarshal.CreateSpan(ref accum, 1));
    Span<uint> partial = MemoryMarshal.Cast<Vp9BackwardUpdates, uint>(MemoryMarshal.CreateSpan(ref counts, 1));

    int index = 0;

    foreach (ref uint total in totals)
    {
        total += partial[index];
        index++;
    }
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -92,7 +92,14 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||||
|
|
||||||
cm.Mb.SetupBlockPlanes(1, 1);
|
cm.Mb.SetupBlockPlanes(1, 1);
|
||||||
|
|
||||||
cm.AllocTileWorkerData(_allocator, 1 << pictureInfo.Log2TileCols, 1 << pictureInfo.Log2TileRows);
|
int tileCols = 1 << pictureInfo.Log2TileCols;
|
||||||
|
int tileRows = 1 << pictureInfo.Log2TileRows;
|
||||||
|
|
||||||
|
// Videos usually have only 4 columns, so more threads won't make a difference for those.
|
||||||
|
// Try to not take all CPU cores for video decoding.
|
||||||
|
int maxThreads = Math.Min(4, Environment.ProcessorCount / 2);
|
||||||
|
|
||||||
|
cm.AllocTileWorkerData(_allocator, tileCols, tileRows, maxThreads);
|
||||||
cm.AllocContextBuffers(_allocator, output.Width, output.Height);
|
cm.AllocContextBuffers(_allocator, output.Width, output.Height);
|
||||||
cm.InitContextBuffers();
|
cm.InitContextBuffers();
|
||||||
cm.SetupSegmentationDequant();
|
cm.SetupSegmentationDequant();
|
||||||
|
@ -103,9 +110,16 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||||
fixed (byte* dataPtr = bitstream)
|
fixed (byte* dataPtr = bitstream)
|
||||||
{
|
{
|
||||||
try
|
try
|
||||||
|
{
|
||||||
|
if (maxThreads > 1 && tileRows == 1 && tileCols > 1)
|
||||||
|
{
|
||||||
|
DecodeFrame.DecodeTilesMt(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length), maxThreads);
|
||||||
|
}
|
||||||
|
else
|
||||||
{
|
{
|
||||||
DecodeFrame.DecodeTiles(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length));
|
DecodeFrame.DecodeTiles(ref cm, new ArrayPtr<byte>(dataPtr, bitstream.Length));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
catch (InternalErrorException)
|
catch (InternalErrorException)
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -87,6 +87,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
return rv;
|
return rv;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
|
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
|
||||||
|
@ -142,6 +143,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
@ -209,6 +211,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[3] = WrapLow(DctConstRoundShift(s0 + s1 - s3));
|
output[3] = WrapLow(DctConstRoundShift(s0 + s1 - s3));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
|
public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
|
||||||
{
|
{
|
||||||
Span<short> step = stackalloc short[4];
|
Span<short> step = stackalloc short[4];
|
||||||
|
@ -231,6 +234,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[3] = WrapLow(step[0] - step[3]);
|
output[3] = WrapLow(step[0] - step[3]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -359,6 +363,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[7] = WrapLow(-x1);
|
output[7] = WrapLow(-x1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct8(ReadOnlySpan<int> input, Span<int> output)
|
public static void Idct8(ReadOnlySpan<int> input, Span<int> output)
|
||||||
{
|
{
|
||||||
Span<short> step1 = stackalloc short[8];
|
Span<short> step1 = stackalloc short[8];
|
||||||
|
@ -416,6 +421,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[7] = WrapLow(step1[0] - step1[7]);
|
output[7] = WrapLow(step1[0] - step1[7]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -449,6 +455,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -457,6 +464,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[8];
|
Span<int> tempIn = stackalloc int[8];
|
||||||
Span<int> tempOut = stackalloc int[8];
|
Span<int> tempOut = stackalloc int[8];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// First transform rows
|
// First transform rows
|
||||||
// Only first 4 row has non-zero coefs
|
// Only first 4 row has non-zero coefs
|
||||||
for (i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
|
@ -671,6 +680,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[15] = WrapLow(-x1);
|
output[15] = WrapLow(-x1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct16(ReadOnlySpan<int> input, Span<int> output)
|
public static void Idct16(ReadOnlySpan<int> input, Span<int> output)
|
||||||
{
|
{
|
||||||
Span<short> step1 = stackalloc short[16];
|
Span<short> step1 = stackalloc short[16];
|
||||||
|
@ -838,6 +848,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[15] = WrapLow(step2[0] - step2[15]);
|
output[15] = WrapLow(step2[0] - step2[15]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -870,6 +881,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -878,6 +890,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[16];
|
Span<int> tempIn = stackalloc int[16];
|
||||||
Span<int> tempOut = stackalloc int[16];
|
Span<int> tempOut = stackalloc int[16];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// First transform rows. Since all non-zero dct coefficients are in
|
// First transform rows. Since all non-zero dct coefficients are in
|
||||||
// upper-left 8x8 area, we only need to calculate first 8 rows here.
|
// upper-left 8x8 area, we only need to calculate first 8 rows here.
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
|
@ -903,6 +917,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -911,6 +926,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[16];
|
Span<int> tempIn = stackalloc int[16];
|
||||||
Span<int> tempOut = stackalloc int[16];
|
Span<int> tempOut = stackalloc int[16];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// First transform rows. Since all non-zero dct coefficients are in
|
// First transform rows. Since all non-zero dct coefficients are in
|
||||||
// upper-left 4x4 area, we only need to calculate first 4 rows here.
|
// upper-left 4x4 area, we only need to calculate first 4 rows here.
|
||||||
for (i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
|
@ -955,6 +972,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
|
public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
|
||||||
{
|
{
|
||||||
Span<short> step1 = stackalloc short[32];
|
Span<short> step1 = stackalloc short[32];
|
||||||
|
@ -1324,6 +1342,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[31] = WrapLow(step1[0] - step1[31]);
|
output[31] = WrapLow(step1[0] - step1[31]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -1370,6 +1389,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -1378,6 +1398,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[32];
|
Span<int> tempIn = stackalloc int[32];
|
||||||
Span<int> tempOut = stackalloc int[32];
|
Span<int> tempOut = stackalloc int[32];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// Rows
|
// Rows
|
||||||
// Only upper-left 16x16 has non-zero coeff
|
// Only upper-left 16x16 has non-zero coeff
|
||||||
for (i = 0; i < 16; ++i)
|
for (i = 0; i < 16; ++i)
|
||||||
|
@ -1403,6 +1425,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -1411,6 +1434,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[32];
|
Span<int> tempIn = stackalloc int[32];
|
||||||
Span<int> tempOut = stackalloc int[32];
|
Span<int> tempOut = stackalloc int[32];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// Rows
|
// Rows
|
||||||
// Only upper-left 8x8 has non-zero coeff
|
// Only upper-left 8x8 has non-zero coeff
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
|
@ -1456,6 +1481,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
|
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
|
||||||
|
@ -1511,6 +1537,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
@ -1584,6 +1611,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[3] = HighbdWrapLow(DctConstRoundShift(s0 + s1 - s3), bd);
|
output[3] = HighbdWrapLow(DctConstRoundShift(s0 + s1 - s3), bd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
|
public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
|
||||||
{
|
{
|
||||||
Span<int> step = stackalloc int[4];
|
Span<int> step = stackalloc int[4];
|
||||||
|
@ -1613,6 +1641,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[3] = HighbdWrapLow(step[0] - step[3], bd);
|
output[3] = HighbdWrapLow(step[0] - step[3], bd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -1748,6 +1777,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[7] = HighbdWrapLow(-x1, bd);
|
output[7] = HighbdWrapLow(-x1, bd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
|
public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
|
||||||
{
|
{
|
||||||
Span<int> step1 = stackalloc int[8];
|
Span<int> step1 = stackalloc int[8];
|
||||||
|
@ -1803,6 +1833,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[7] = HighbdWrapLow(step1[0] - step1[7], bd);
|
output[7] = HighbdWrapLow(step1[0] - step1[7], bd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -1835,6 +1866,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -1843,6 +1875,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[8];
|
Span<int> tempIn = stackalloc int[8];
|
||||||
Span<int> tempOut = stackalloc int[8];
|
Span<int> tempOut = stackalloc int[8];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// First transform rows
|
// First transform rows
|
||||||
// Only first 4 row has non-zero coefs
|
// Only first 4 row has non-zero coefs
|
||||||
for (i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
|
@ -2062,6 +2096,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[15] = HighbdWrapLow(-x1, bd);
|
output[15] = HighbdWrapLow(-x1, bd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
|
public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
|
||||||
{
|
{
|
||||||
Span<int> step1 = stackalloc int[16];
|
Span<int> step1 = stackalloc int[16];
|
||||||
|
@ -2236,6 +2271,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
|
output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -2268,6 +2304,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -2276,6 +2313,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[16];
|
Span<int> tempIn = stackalloc int[16];
|
||||||
Span<int> tempOut = stackalloc int[16];
|
Span<int> tempOut = stackalloc int[16];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// First transform rows. Since all non-zero dct coefficients are in
|
// First transform rows. Since all non-zero dct coefficients are in
|
||||||
// upper-left 8x8 area, we only need to calculate first 8 rows here.
|
// upper-left 8x8 area, we only need to calculate first 8 rows here.
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
|
@ -2303,6 +2342,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -2311,6 +2351,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[16];
|
Span<int> tempIn = stackalloc int[16];
|
||||||
Span<int> tempOut = stackalloc int[16];
|
Span<int> tempOut = stackalloc int[16];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// First transform rows. Since all non-zero dct coefficients are in
|
// First transform rows. Since all non-zero dct coefficients are in
|
||||||
// upper-left 4x4 area, we only need to calculate first 4 rows here.
|
// upper-left 4x4 area, we only need to calculate first 4 rows here.
|
||||||
for (i = 0; i < 4; ++i)
|
for (i = 0; i < 4; ++i)
|
||||||
|
@ -2355,6 +2397,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
|
public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
|
||||||
{
|
{
|
||||||
Span<int> step1 = stackalloc int[32];
|
Span<int> step1 = stackalloc int[32];
|
||||||
|
@ -2731,6 +2774,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
output[31] = HighbdWrapLow(step1[0] - step1[31], bd);
|
output[31] = HighbdWrapLow(step1[0] - step1[31], bd);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -2777,6 +2821,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -2785,6 +2830,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[32];
|
Span<int> tempIn = stackalloc int[32];
|
||||||
Span<int> tempOut = stackalloc int[32];
|
Span<int> tempOut = stackalloc int[32];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// Rows
|
// Rows
|
||||||
// Only upper-left 16x16 has non-zero coeff
|
// Only upper-left 16x16 has non-zero coeff
|
||||||
for (i = 0; i < 16; ++i)
|
for (i = 0; i < 16; ++i)
|
||||||
|
@ -2812,6 +2859,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[SkipLocalsInit]
|
||||||
public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
|
@ -2820,6 +2868,8 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
|
||||||
Span<int> tempIn = stackalloc int[32];
|
Span<int> tempIn = stackalloc int[32];
|
||||||
Span<int> tempOut = stackalloc int[32];
|
Span<int> tempOut = stackalloc int[32];
|
||||||
|
|
||||||
|
output.Fill(0);
|
||||||
|
|
||||||
// Rows
|
// Rows
|
||||||
// Only upper-left 8x8 has non-zero coeff
|
// Only upper-left 8x8 has non-zero coeff
|
||||||
for (i = 0; i < 8; ++i)
|
for (i = 0; i < 8; ++i)
|
||||||
|
|
|
@ -4,6 +4,7 @@ namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||||
{
|
{
|
||||||
internal struct TileBuffer
|
internal struct TileBuffer
|
||||||
{
|
{
|
||||||
|
public int Col;
|
||||||
public ArrayPtr<byte> Data;
|
public ArrayPtr<byte> Data;
|
||||||
public int Size;
|
public int Size;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,14 +1,20 @@
|
||||||
using Ryujinx.Common.Memory;
|
using Ryujinx.Common.Memory;
|
||||||
using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
|
using Ryujinx.Graphics.Nvdec.Vp9.Dsp;
|
||||||
using Ryujinx.Graphics.Nvdec.Vp9.Types;
|
using Ryujinx.Graphics.Nvdec.Vp9.Types;
|
||||||
|
using Ryujinx.Graphics.Video;
|
||||||
|
|
||||||
namespace Ryujinx.Graphics.Nvdec.Vp9
|
namespace Ryujinx.Graphics.Nvdec.Vp9
|
||||||
{
|
{
|
||||||
internal struct TileWorkerData
|
internal struct TileWorkerData
|
||||||
{
|
{
|
||||||
|
public ArrayPtr<byte> DataEnd;
|
||||||
|
public int BufStart;
|
||||||
|
public int BufEnd;
|
||||||
public Reader BitReader;
|
public Reader BitReader;
|
||||||
|
public Vp9BackwardUpdates Counts;
|
||||||
public MacroBlockD Xd;
|
public MacroBlockD Xd;
|
||||||
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
|
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
|
||||||
public Array32<Array32<int>> Dqcoeff;
|
public Array32<Array32<int>> Dqcoeff;
|
||||||
|
public InternalErrorInfo ErrorInfo;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -127,9 +127,9 @@ namespace Ryujinx.Graphics.Nvdec.Vp9.Types
|
||||||
MBs = MbRows * MbCols;
|
MBs = MbRows * MbCols;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void AllocTileWorkerData(MemoryAllocator allocator, int tileCols, int tileRows)
|
public void AllocTileWorkerData(MemoryAllocator allocator, int tileCols, int tileRows, int maxThreads)
|
||||||
{
|
{
|
||||||
TileWorkerData = allocator.Allocate<TileWorkerData>(tileCols * tileRows);
|
TileWorkerData = allocator.Allocate<TileWorkerData>(tileCols * tileRows + (maxThreads > 1 ? maxThreads : 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
public void FreeTileWorkerData(MemoryAllocator allocator)
|
public void FreeTileWorkerData(MemoryAllocator allocator)
|
||||||
|
|
Loading…
Reference in a new issue