Skip to content

Commit

Permalink
Improved char->bytes performance and simplified related code. Added a…
Browse files Browse the repository at this point in the history
…uto-benchmark for Memcpy256.
  • Loading branch information
CptMoore committed Jan 9, 2025
1 parent 1dc7d4a commit ed465d7
Show file tree
Hide file tree
Showing 4 changed files with 175 additions and 86 deletions.
2 changes: 1 addition & 1 deletion ModTek/Features/Logging/AppenderFile.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ internal AppenderFile(string path, AppenderSettings settings)
$"""
ModTek v{GitVersionInformation.InformationalVersion} ({GitVersionInformation.CommitDate}) ; HarmonyX {typeof(Harmony).Assembly.GetName().Version}
{Environment.OSVersion} ; BattleTech {Application.version} ; Unity {Application.unityVersion} ; CLR {Environment.Version} ; {System.Runtime.InteropServices.RuntimeInformation.FrameworkDescription}"
{dateTime.ToLocalTime().ToString("o", CultureInfo.InvariantCulture)} ; Startup {unityStartupTime.ToString(null, CultureInfo.InvariantCulture)} ; Ticks {stopwatchTimestamp} ; Timestamp Overhead {MTStopwatch.OverheadPerTimestampInNanoseconds}ns
{dateTime.ToLocalTime().ToString("o", CultureInfo.InvariantCulture)} ; Startup {unityStartupTime.ToString(null, CultureInfo.InvariantCulture)} ; Ticks {stopwatchTimestamp} ; Timestamp Overhead {MTStopwatch.OverheadPerTimestampInNanoseconds}ns ; MemCpy->BlockCpy threshold {FastBuffer.MemcpyThreshold}
{new string('-', 80)}
"""
Expand Down
241 changes: 157 additions & 84 deletions ModTek/Features/Logging/FastBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,72 @@ internal void Append(byte value)

internal void Append(byte[] value)
{
var position = GetPointerAndIncrementLength(value.Length);
fixed (byte* bytes = value)
var length = value.Length;
var position = GetPointerAndIncrementLength(length);
if (length > MemcpyThreshold)
{
Memcpy1(position, bytes, value.Length);
var offset = (int)(position - _bufferPtr);
Buffer.BlockCopy(value, 0, _buffer, offset, length);
}
else
{
fixed (byte* bytes = value)
{
Memcpy256(position, bytes, value.Length);
}
}
}
internal static readonly int MemcpyThreshold = FindMemCpyThreshold();
// TODO: once we know that it's always above some value, we can hard-code that value and remove the benchmark
/// <summary>
/// Runs a small startup benchmark comparing <see cref="Buffer.BlockCopy"/> with
/// <c>Memcpy256</c> for copy sizes from 384 to 2048 bytes in 128-byte steps.
/// </summary>
/// <returns>
/// The last tested size before the first one at which <c>Memcpy256</c> measured slower than
/// <see cref="Buffer.BlockCopy"/> (at minimum 256), or 2048 if it never measured slower.
/// </returns>
private static int FindMemCpyThreshold()
{
    const int LargestSize = 2 * 1024;
    const int SmallestThreshold = 256;
    const int SizeIncrement = 128;
    const int RunsPerSize = 100;
    const int WarmupPasses = 100;

    var source = new byte[LargestSize];
    var managedTarget = new byte[LargestSize];
    var nativeTarget = stackalloc byte[LargestSize];

    var blockCopyTicks = new long[RunsPerSize];
    var memcpyTicks = new long[RunsPerSize];

    // Passes 0..WarmupPasses-1 only exercise the code paths; the final pass is the one that is timed.
    for (var pass = 0; pass <= WarmupPasses; pass++)
    {
        var isMeasuredPass = pass == WarmupPasses;

        for (var size = SmallestThreshold + SizeIncrement; size <= LargestSize; size += SizeIncrement)
        {
            // Candidate 1: managed array-to-array block copy.
            for (var run = 0; run < RunsPerSize; run++)
            {
                var startedAt = isMeasuredPass ? MTStopwatch.GetTimestamp() : 0;
                Buffer.BlockCopy(source, 0, managedTarget, 0, size);
                if (isMeasuredPass)
                {
                    blockCopyTicks[run] = MTStopwatch.GetTimestamp() - startedAt;
                }
            }

            // Candidate 2: pointer based copy; pinning the source is part of the timed region.
            for (var run = 0; run < RunsPerSize; run++)
            {
                var startedAt = isMeasuredPass ? MTStopwatch.GetTimestamp() : 0;
                fixed (byte* pinnedSource = source)
                {
                    Memcpy256(nativeTarget, pinnedSource, size);
                }
                if (isMeasuredPass)
                {
                    memcpyTicks[run] = MTStopwatch.GetTimestamp() - startedAt;
                }
            }

            // Stop at the first size where the pointer copy loses; the previous size is the threshold.
            if (isMeasuredPass
                && MTStopwatch.FastestTicksSum(memcpyTicks) > MTStopwatch.FastestTicksSum(blockCopyTicks))
            {
                return size - SizeIncrement;
            }
        }
    }

    return LargestSize;
}

internal void Append(int value)
Expand All @@ -88,71 +149,75 @@ internal void Append(string value)
return;
}

// assume one byte per char, enlarge through AppendUsingEncoding if necessary
// assume one byte per char, fallback will enlarge more defensively
EnsureCapacity(_length + processingCount);

fixed (char* chars = value)
{
var positionIterPtr = _bufferPtr + _length;
var charsIterPtr = chars;
var dstPtr = _bufferPtr + _length;
var srcPtr = (byte*)chars;

// loop unrolling similar to Buffer.memcpy1
// parallelism isn't what makes it particularly fast, it's the batching that is helpful (fewer ops overall)

// 8 is a sweet spot, for large amounts of data: 4 is slower, 16 is slower
// 8 is a sweet spot, since we can do the ASCII bit mask check with a ulong
{
const int IterSize = 8;
for (; processingCount >= IterSize; processingCount -= IterSize)
{
SetAscii(positionIterPtr, charsIterPtr, 0, out var a0);
SetAscii(positionIterPtr, charsIterPtr, 1, out var a1);
SetAscii(positionIterPtr, charsIterPtr, 2, out var a2);
SetAscii(positionIterPtr, charsIterPtr, 3, out var a3);
SetAscii(positionIterPtr, charsIterPtr, 4, out var a4);
SetAscii(positionIterPtr, charsIterPtr, 5, out var a5);
SetAscii(positionIterPtr, charsIterPtr, 6, out var a6);
SetAscii(positionIterPtr, charsIterPtr, 7, out var a7);
if (!(
a0 &&
a1 &&
a2 &&
a3 &&
a4 &&
a5 &&
a6 &&
a7
)) {
*(dstPtr + 0) = *(srcPtr + 0 * 2);
*(dstPtr + 1) = *(srcPtr + 1 * 2);
*(dstPtr + 2) = *(srcPtr + 2 * 2);
*(dstPtr + 3) = *(srcPtr + 3 * 2);
*(dstPtr + 4) = *(srcPtr + 4 * 2);
*(dstPtr + 5) = *(srcPtr + 5 * 2);
*(dstPtr + 6) = *(srcPtr + 6 * 2);
*(dstPtr + 7) = *(srcPtr + 7 * 2);

const ulong NonAsciiBitmask =
(1ul << (7 + 8 * 7)) +
(1ul << (7 + 8 * 6)) +
(1ul << (7 + 8 * 5)) +
(1ul << (7 + 8 * 4)) +
(1ul << (7 + 8 * 3)) +
(1ul << (7 + 8 * 2)) +
(1ul << (7 + 8 * 1)) +
(1ul << (7 + 8 * 0));
if ((*(ulong*)dstPtr & NonAsciiBitmask) != 0)
{
goto Utf8Fallback;
}
dstPtr += IterSize;
srcPtr += 2*IterSize;
_length += IterSize;
positionIterPtr = _bufferPtr + _length;
charsIterPtr += IterSize;
}
}

{
const int IterSize = 2;
for (; processingCount >= IterSize; processingCount -= IterSize)
{
SetAscii(positionIterPtr, charsIterPtr, 0, out var a0);
SetAscii(positionIterPtr, charsIterPtr, 1, out var a1);
if (!(
a0 &&
a1
)) {
*(dstPtr + 0) = *(srcPtr + 0 * 2);
*(dstPtr + 1) = *(srcPtr + 1 * 2);

const ushort NonAsciiBitmask =
(1 << (7 + 8 * 1)) +
(1 << (7 + 8 * 0));
if ((*(ushort*)dstPtr & NonAsciiBitmask) != 0)
{
goto Utf8Fallback;
}
dstPtr += IterSize;
srcPtr += 2*IterSize;
_length += IterSize;
positionIterPtr = _bufferPtr + _length;
charsIterPtr += IterSize;
}
}

if (processingCount > 0)
{
const int IterSize = 1;
SetAscii(positionIterPtr, charsIterPtr, 0, out var a0);
if (!a0)
*(dstPtr + 0) = *(srcPtr + 0 * 2);

const byte NonAsciiBitmask = 1 << 7;
if ((*dstPtr & NonAsciiBitmask) != 0)
{
goto Utf8Fallback;
}
Expand All @@ -172,14 +237,6 @@ internal void Append(string value)
}
internal static readonly MTStopwatch UTF8FallbackStopwatch = new();

/// <summary>
/// Writes the low byte of the UTF-16 code unit at <paramref name="charsIterPtr"/>[<paramref name="offset"/>]
/// to <paramref name="positionIterPtr"/>[<paramref name="offset"/>] and reports whether that code unit
/// is plain ASCII, i.e. whether the written byte is already its complete UTF-8 encoding.
/// </summary>
/// <param name="positionIterPtr">Destination byte pointer; must be writable at <paramref name="offset"/>.</param>
/// <param name="charsIterPtr">Source char pointer; must be readable at <paramref name="offset"/>.</param>
/// <param name="offset">Element index applied to both pointers.</param>
/// <param name="isUnicodeCompatibleAscii">
/// <c>true</c> only when the full 16-bit code unit is in the range 0..127.
/// </param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private void SetAscii(byte* positionIterPtr, char* charsIterPtr, int offset, out bool isUnicodeCompatibleAscii)
{
    var codeUnit = charsIterPtr[offset];
    positionIterPtr[offset] = (byte)codeUnit;
    // Test the untruncated code unit: checking the truncated byte would accept chars such as
    // U+0141, whose low byte (0x41) looks like ASCII 'A' once the high byte is dropped.
    isUnicodeCompatibleAscii = codeUnit <= 127;
}

internal void Append(DateTime value)
{
AppendTime(value.Hour, value.Minute, value.Second, value.Ticks);
Expand Down Expand Up @@ -224,46 +281,51 @@ private void EnsureCapacity(int targetLength)
private void EnlargeCapacity(int targetLength)
{
var newBuffer = new byte[targetLength];
var newHandle = GCHandle.Alloc(newBuffer, GCHandleType.Pinned);
try
if (_buffer != null)
{
var newBufferPtr = (byte*)newHandle.AddrOfPinnedObject();

if (_buffer != null)
// block copy is faster for larger byte arrays
Buffer.BlockCopy(_buffer, 0, newBuffer, 0, _length);
try
{
Memcpy1(newBufferPtr, _bufferPtr, _length);
try
{
_handle.Free();
}
catch
{
_buffer = null;
_bufferPtr = null;
_isG2 = false;
}
_handle.Free();
}
catch
{
_buffer = null;
_bufferPtr = null;
_isG2 = false;
}

_buffer = newBuffer;
_handle = newHandle;
_bufferPtr = newBufferPtr;
_isG2 = false;
}
catch
{
newHandle.Free();
throw;
}
_buffer = newBuffer;
_handle = GCHandle.Alloc(newBuffer, GCHandleType.Pinned);
_bufferPtr = (byte*)_handle.AddrOfPinnedObject();;
_isG2 = false;
}

internal static readonly MTStopwatch CopyStopwatch = new();
// from Buffer.memcpy1 and optimized to use 64bit/16bit types instead of just 8bit
internal static void Memcpy1(byte* dest, byte* src, int size)
// from Buffer.memcpy* and optimized to use wider types like 128 and 256 bit
// JIT can do xmm (128) and cpu can optimize 2x xmm (2x128) further it seems
internal static void Memcpy256(byte* dest, byte* src, int size)
{
// make sure to only measure when there is enough, otherwise measurement is slower than the actual copy
var measurement = size >= 64 ? MTStopwatch.GetTimestamp() : 0;
{ // 25% faster than if using 2x128 on AMD Zen4 hardware
const int BatchSize = My256Bit.Size;
for (; size >= BatchSize; size -= BatchSize)
{
*(My256Bit*)dest = *(My256Bit*)src;
dest += BatchSize;
src += BatchSize;
}
}
{ // 100% faster than if using 2x64 on xmm hardware
const int BatchSize = My128Bit.Size;
for (; size >= BatchSize; size -= BatchSize)
{
*(My128Bit*)dest = *(My128Bit*)src;
dest += BatchSize;
src += BatchSize;
}
}
{
const int BatchSize = sizeof(ulong); // 8
const int BatchSize = sizeof(ulong);
for (; size >= BatchSize; size -= BatchSize)
{
*(ulong*)dest = *(ulong*)src;
Expand All @@ -272,7 +334,7 @@ internal static void Memcpy1(byte* dest, byte* src, int size)
}
}
{
const int BatchSize = sizeof(ushort); // 2
const int BatchSize = sizeof(ushort);
for (; size >= BatchSize; size -= BatchSize)
{
*(ushort*)dest = *(ushort*)src;
Expand All @@ -284,10 +346,21 @@ internal static void Memcpy1(byte* dest, byte* src, int size)
{
*dest = *src;
}
if (measurement > 0)
{
CopyStopwatch.EndMeasurement(measurement, size);
}
}

// the jit can optimize this to 2x xmm 128 ops
// and 2x 128bit ops together are 25% faster than looping over 128bit ops
// 16-byte copy unit made of two 8-byte fields. Memcpy256 reinterprets source and destination
// pointers as this struct so that one assignment moves 128 bits at a time.
private struct My128Bit
{
// Number of bytes advanced per copied My128Bit (128 bits / 8 = 16).
// NOTE(review): expected to equal the struct's in-memory size; confirm if fields are ever changed.
internal const int Size = 128/8;
// The fields only provide storage; they are not read individually in the visible code.
internal long _00;
internal long _01;
}
// 32-byte copy unit made of two My128Bit halves. Memcpy256 reinterprets source and destination
// pointers as this struct so that one assignment moves 256 bits at a time.
private struct My256Bit
{
// Number of bytes advanced per copied My256Bit (256 bits / 8 = 32).
// NOTE(review): expected to equal the struct's in-memory size; confirm if fields are ever changed.
internal const int Size = 256/8;
// The fields only provide storage; they are not read individually in the visible code.
internal My128Bit _00;
internal My128Bit _01;
}

~FastBuffer()
Expand Down
1 change: 0 additions & 1 deletion ModTek/Features/Logging/MTLoggerAsyncQueue.cs
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ Async internal processing had an average latency of {latencyStats.AverageNanosec
Filters {AppenderFile.FiltersStopWatch.GetStats()}.
Formatter {AppenderFile.FormatterStopWatch.GetStats()}.
UTF8-Fallback {FastBuffer.UTF8FallbackStopwatch.GetStats()}.
Copy buffer per bytes {FastBuffer.CopyStopwatch.GetStats()}.
Write (to OS buffers) {AppenderFile.WriteStopwatch.GetStats()}.
"""
);
Expand Down
17 changes: 17 additions & 0 deletions ModTek/Util/Stopwatch/MTStopwatch.cs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,12 @@ static MTStopwatch()
s_timestampOverheadInAndAfterMeasurement = ( seSum - smSum ) / ActualCount;
}

/// <summary>
/// Clears the accumulated statistics by writing zero to <c>_count</c> and then to <c>_ticks</c>.
/// </summary>
/// <remarks>
/// The two fields are zeroed by separate volatile writes, not as one atomic operation, so a
/// concurrent reader may observe one field already reset and the other not yet.
/// </remarks>
internal void Reset()
{
Volatile.Write(ref _count, 0);
Volatile.Write(ref _ticks, 0);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static long GetTimestamp()
{
Expand All @@ -62,6 +68,17 @@ protected virtual void AddMeasurement(long elapsedTicks, long delta)

internal MTStopwatchStats GetStats() => new(this, Volatile.Read(ref _count), Volatile.Read(ref _ticks));

/// <summary>
/// Sums the smallest measurements in <paramref name="ticks"/>, discarding the slowest ones so
/// that outliers do not dominate a benchmark comparison.
/// </summary>
/// <param name="ticks">
/// Measured tick counts. The array is sorted in place (ascending) as a side effect.
/// </param>
/// <param name="onlyIncludeFastest">
/// Fraction of the measurements to include, counted from the fastest. The element count is
/// <c>ceil(ticks.Length * onlyIncludeFastest)</c>; values of 1.0 or more include every element,
/// and zero, negative or NaN values include none.
/// </param>
/// <returns>The sum of the included measurements; 0 when nothing is included.</returns>
internal static long FastestTicksSum(long[] ticks, double onlyIncludeFastest = 0.5)
{
    Array.Sort(ticks);

    // Work out the element count once and clamp it to the array bounds. A fraction above 1.0
    // previously pushed the loop index past the end of the array.
    var limit = ticks.Length * onlyIncludeFastest;
    int count;
    if (limit >= ticks.Length)
    {
        count = ticks.Length;
    }
    else if (limit > 0)
    {
        count = (int)Math.Ceiling(limit);
    }
    else
    {
        // Covers zero, negative and NaN fractions.
        count = 0;
    }

    var sum = 0L;
    for (var i = 0; i < count; i++)
    {
        sum += ticks[i];
    }
    return sum;
}

internal static TimeSpan TimeSpanFromTicks(long elapsedTicks)
{
return System.Diagnostics.Stopwatch.IsHighResolution ? TimeSpan.FromTicks((long)(elapsedTicks * s_stopWatchTicksToTimeSpanTicksMultiplier)) : TimeSpan.FromTicks(elapsedTicks);
Expand Down

0 comments on commit ed465d7

Please sign in to comment.