in .NET world
What is micro-optimization
What is micro-optimization
Total time Own time Method
15 234 ms
15 221 ms
14 163 ms
14 028 ms
1 034 ms
975 ms
13 ms
24 ms
135 ms
3 ms
59 ms
975 ms
What is micro-optimization
Total time Own time Method
15 234 ms
15 221 ms
832 ms
697 ms
14 365 ms
14 227 ms
13 ms
24 ms
135 ms
5 ms
138 ms
14 227 ms
What is micro-optimization
micro-optimization (plural micro-optimizations)
(programming, computer architecture)

Optimization at the level of individual instructions and operations.
Неделя оптимизации кода
А разговоров-то
intrinsic (plural intrinsics)
(computing, programming)
A built-in function that is implemented directly by the compiler,
without any intermediate call to a library.
PADDW (Add Packed Integers)
Source 1
Source 2
PDEP (Parallel Bits Deposit)
x0x1x2x3x4x5x6x7Source 1
Source 2
00x00x1x20000x30000 …
• SSE, SSE2, SSE3, SSE4, SSE4.1, SSE4.2, SSSE3

• Lzcnt, Popcnt



• BMI1, BMI2



using System.Runtime.Intrinsics.X86;
uint CalculateCrc32(byte[] data)
    if (Sse42.IsSupported)
        uint result = 0;
        foreach (var b in data)
            result = Sse42.Crc32(result, b);
        return result;
        // Fallback implementation of the method
        // without using intrinsics
namespace System.Numerics
    public static class BitOperations
        public static int Log2(uint value)
            if (Lzcnt.IsSupported)
                if (value == 0)
                    return 0;
                // LZCNT contract is 0->32
                return 31 - (int)Lzcnt.LeadingZeroCount(value);
            // Fallback contract is 0->0
            return Log2SoftwareFallback(value);
Method | Time |
---------- |--------:|
Lzcnt | 0.58 ns |
Fallback | 1.52 ns |
byte[] data = ...
for (var i = 0; i < data.Length; i++)
    var item = data[i];
byte[] data = ...
for (var i = 0; i < data.Length; i += 8)
    var vector = data[i..i+8]
(computing, transitive)

To convert a program that operates on scalar values into
the equivalent program operating on vectors.
public static int IndexOf(this ReadOnlySpan<char> source, char value, int startIndex)
    for (int i = startIndex; i < source.Length; i++)
        if (source[i] == value)
            return i;
    return -1;
public static unsafe int IndexOf(ref char searchSpace, char value, int length)
if (Sse2.IsSupported)
        Determine how many to iterate to get data 16 bytes aligned
Iterate byte by byte
if (Avx2.IsSupported)
if (not 32 bytes aligned)
Check 16 bytes using SSE2
Iterate by 32 bytes using AVX2
if (more than 16 bytes left)
Check 16 bytes using SSE2
if (not all data iterated)
goto SequentialScan;
    else if (Sse2.IsSupported)
Iterate by 16 bytes using SSE2
if (not all data iterated)
goto SequentialScan;
return offset;
Method | Length | Time |
----- |--------:|-----------:|
Old | 15 | 8.817 ns |
New | 15 | 4.577 ns |
Old | 1024 | 68.530 ns |
New | 1024 | 49.741 ns |
Heap allocations
(almost) New features
• ArrayPool class

• Span and ReadOnlySpan structures

• stackalloc
namespace System
    internal static class DateTimeFormat
        internal static string Format(DateTime dateTime, string? format, IFormatProvider? provider, TimeSpan offset)
            if (format != null && format.Length == 1)
                // Optimize for these standard formats that are not affected by culture.
                switch (format[0])
                    // Round trip format
                    case 'o':
                    case 'O':
                        const int MaxFormatOLength = 33;
                        Span<char> span = stackalloc char[MaxFormatOLength];
                        TryFormatO(dateTime, offset, span, out int ochars);
                        return span.Slice(0, ochars).ToString();
// ... More code goes here...
namespace System.Data.SqlClient
    internal sealed partial class TdsParser
        private bool TryReadSqlDateTime(SqlBuffer value, byte tdsType, int length, byte scale, TdsParserStateObject stateObj)
            Span<byte> datetimeBuffer = ((uint)length <= 16) ? stackalloc byte[16] : new byte[length];
            if (!stateObj.TryReadByteArray(datetimeBuffer, length))
                return false;
            ReadOnlySpan<byte> dateTimeData = datetimeBuffer.Slice(0, length);
// ... More code goes here...
namespace System.IO
    public abstract partial class TextReader : MarshalByRefObject, IDisposable
        public virtual async Task<string> ReadToEndAsync()
            var sb = new StringBuilder(4096);
            char[] chars = ArrayPool<char>.Shared.Rent(4096);
                int len;
                while ((len = await ReadAsyncInternal(chars, default).ConfigureAwait(false)) != 0)
                    sb.Append(chars, 0, len);
            return sb.ToString();
namespace System.Text.Json
    public static partial class JsonSerializer
        private static ReadOnlySpan<byte> GetUnescapedString(ReadOnlySpan<byte> utf8Source, int idx)
            // The escaped name is always longer than the unescaped, so it is safe to use escaped name for the buffer length.
            int length = utf8Source.Length;
            byte[] pooledName = null;
            Span<byte> unescapedName = length <= JsonConstants.StackallocThreshold ?
                stackalloc byte[length] :
                (pooledName = ArrayPool<byte>.Shared.Rent(length));
            JsonReaderHelper.Unescape(utf8Source, unescapedName, idx, out int written);
            ReadOnlySpan<byte> propertyName = unescapedName.Slice(0, written).ToArray();
            if (pooledName != null)
                // We clear the array because it is "user data" (although a property name).
                new Span<byte>(pooledName, 0, written).Clear();
            return propertyName;
namespace System
    public readonly struct Range : IEquatable<Range>
        public override string ToString()
// 2 for "..", then for each index 1 for '^' and 10 for longest possible uint
            Span<char> span = stackalloc char[2 + (2 * 11)];
            int charsWritten;
            int pos = 0;
            if (Start.IsFromEnd)
                span[0] = '^';
                pos = 1;
            bool formatted = ((uint)Start.Value).TryFormat(span.Slice(pos), out charsWritten);
            pos += charsWritten;
            span[pos++] = '.';
            span[pos++] = '.';
            if (End.IsFromEnd)
                span[pos++] = '^';
            formatted = ((uint)End.Value).TryFormat(span.Slice(pos), out charsWritten);
            pos += charsWritten;
            return new string(span.Slice(0, pos));
Parsing with Span
Object stack allocation
class AddOperation
    public int First { get; set; }
    public int Second { get; set; }
    public int Calculate()
        return First + Second;
public int Test()
    var operation = new AddOperation();
    operation.First = _first;
    operation.Second = _second;
    return operation.Calculate();
Object stack allocation disabled:
push rbx
mov rbx, rdi
movabs rdi, 0x113dc1bc0
call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST)
mov edi, dword ptr [rbx + 0x8]
mov dword ptr [rax + 0x8], edi
mov edi, dword ptr [rbx + 0xc]
mov dword ptr [rax + 0xc], edi
mov edi, dword ptr [rax + 0x8]
add edi, dword ptr [rax + 0xc]
mov eax, edi
pop rbx
Object stack allocation
class AddOperation
    public int First { get; set; }
    public int Second { get; set; }
    public int Calculate()
        return First + Second;
public int Test()
    var operation = new AddOperation();
    operation.First = _first;
    operation.Second = _second;
    return operation.Calculate();
Object stack allocation disabled:
push rbx
mov rbx, rdi
movabs rdi, 0x113dc1bc0
call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST)
mov edi, dword ptr [rbx + 0x8]
mov dword ptr [rax + 0x8], edi
mov edi, dword ptr [rbx + 0xc]
mov dword ptr [rax + 0xc], edi
mov edi, dword ptr [rax + 0x8]
add edi, dword ptr [rax + 0xc]
mov eax, edi
pop rbx
Object stack allocation
class AddOperation
    public int First { get; set; }
    public int Second { get; set; }
    public int Calculate()
        return First + Second;
public int Test()
    var operation = new AddOperation();
    operation.First = _first;
    operation.Second = _second;
    return operation.Calculate();
Object stack allocation disabled:
push rbx
mov rbx, rdi
movabs rdi, 0x113dc1bc0
call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST)
mov edi, dword ptr [rbx + 0x8]
mov dword ptr [rax + 0x8], edi
mov edi, dword ptr [rbx + 0xc]
mov dword ptr [rax + 0xc], edi
mov edi, dword ptr [rax + 0x8]
add edi, dword ptr [rax + 0xc]
mov eax, edi
pop rbx
Object stack allocation
class AddOperation
    public int First { get; set; }
    public int Second { get; set; }
    public int Calculate()
        return First + Second;
public int Test()
    var operation = new AddOperation();
    operation.First = _first;
    operation.Second = _second;
    return operation.Calculate();
Object stack allocation disabled:
push rbx
mov rbx, rdi
movabs rdi, 0x113dc1bc0
call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST)
mov edi, dword ptr [rbx + 0x8]
mov dword ptr [rax + 0x8], edi
mov edi, dword ptr [rbx + 0xc]
mov dword ptr [rax + 0xc], edi
mov edi, dword ptr [rax + 0x8]
add edi, dword ptr [rax + 0xc]
mov eax, edi
pop rbx
Object stack allocation
class AddOperation
    public int First { get; set; }
    public int Second { get; set; }
    public int Calculate()
        return First + Second;
public int Test()
    var operation = new AddOperation();
    operation.First = _first;
    operation.Second = _second;
    return operation.Calculate();
Object stack allocation disabled:
push rbx
mov rbx, rdi
movabs rdi, 0x113dc1bc0
call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST)
mov edi, dword ptr [rbx + 0x8]
mov dword ptr [rax + 0x8], edi
mov edi, dword ptr [rbx + 0xc]
mov dword ptr [rax + 0xc], edi
mov edi, dword ptr [rax + 0x8]
add edi, dword ptr [rax + 0xc]
mov eax, edi
pop rbx
Object stack allocation
class AddOperation
    public int First { get; set; }
    public int Second { get; set; }
    public int Calculate()
        return First + Second;
public int Test()
    var operation = new AddOperation();
    operation.First = _first;
    operation.Second = _second;
    return operation.Calculate();
Object stack allocation enabled:
mov eax, dword ptr [rdi + 0x8]
mov edi, dword ptr [rdi + 0xc]
add eax, edi
Object stack allocation disabled:
push rbx
mov rbx, rdi
movabs rdi, 0x113dc1bc0
call 0x1049f2920 (JitHelp: CORINFO_HELP_NEWSFAST)
mov edi, dword ptr [rbx + 0x8]
mov dword ptr [rax + 0x8], edi
mov edi, dword ptr [rbx + 0xc]
mov dword ptr [rax + 0xc], edi
mov edi, dword ptr [rax + 0x8]
add edi, dword ptr [rax + 0xc]
mov eax, edi
pop rbx
Stack allocations
Stack structure
Local data
Previous frame address
Return address
Current method
stack frame
stack frame
stack frame
stack frame
Stack growth

Stack structure
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Stack growth

Stack structure
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer Stack growth

Stack structure
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer Stack growth

Stack structure
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer Stack growth

Stack structure
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer Stack growth

Stack structure
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer Stack growth

Stack structure
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer Stack growth

first variable
Stack structure
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

first variable copy
Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

second variable copy
first variable copy
Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

second variable copy
first variable copy
Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

TheMethod argument 2
TheMethod argument 1
Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

TheMethod argument 2
TheMethod argument 1
Return address
Stack structure
second variable
int Test()
    var first = new YearMonth(2019, 1);
    var second = new YearMonth(2018, 10);
    return TheMethod(first, second);
push    rbp
sub     rsp, 0x80
lea     rbp, [rsp + 0x80]
... fill stack frame with zeroes
lea     rdi, [rbp - 0x40]
mov     esi, 0x7e3
mov     edx, 0x1
call    0x114973fe0 (YearMonth..ctor)
lea     rdi, [rbp - 0x20]
mov     esi, 0x7e2
mov     edx, 0xa
call    0x114973fe0 (YearMonth..ctor)
vmovdqu xmm0, xmmword ptr [rbp - 0x40]
vmovdqu xmmword ptr [rsp], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x30]
vmovdqu xmmword ptr [rsp + 0x10], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x20]
vmovdqu xmmword ptr [rsp + 0x20], xmm0
vmovdqu xmm0, xmmword ptr [rbp - 0x10]
vmovdqu xmmword ptr [rsp + 0x30], xmm0
call    0x114972400 (TheMethod)
lea     rsp, [rbp]
pop     rbp
Previous stack frame pointer
first variable
Stack growth

TheMethod argument 2
TheMethod argument 1
Return address
TheMethod local data
Minimizing structure size
in parameter modifier
namespace System
    public readonly struct Decimal
        public static double ToDouble(decimal d)
            return DecCalc.VarR8FromDec(in d);
private struct DecCalc
            internal static double VarR8FromDec(in decimal value)
                const double ds2to64 = 1.8446744073709552e+019;
                double dbl = ((double)value.Low64 +
                    (double)value.High * ds2to64) / s_doublePowers10[value.Scale];
                if (value.IsNegative)
                    dbl = -dbl;
                return dbl;
ref return
namespace System
    public static class Math
        public static decimal Max(decimal val1, decimal val2)
            return decimal.Max(val1, val2);
    public readonly partial struct Decimal
        internal static ref readonly decimal Max(in decimal d1, in decimal d2)
            return ref DecCalc.VarDecCmp(in d1, in d2) >= 0 ? ref d1 : ref d2;
Virtual calls
Virtual calls
Method table
Method 1
Method 2
Compiled method 1 Compiled method 2
Object instance
Some data
More data
And even more data
Non-virtual call
lea rdi, [rbx + 0x8]
call 0x114c9f620 (MyStructure.DoAction)
public void MyMethod()
Virtual table dispatch
mov rdi, qword ptr [rdi + 0x8]
mov rax, qword ptr [rdi]
mov rax, qword ptr [rax + 0x40]
call qword ptr [rax + 0x20]
public void MyMethod()
} Stuff
VTable indirections
Chunk 1
MethodTable pointer
Instance data
Object instance
Method table
Chunk 2
Method 1
Method 8
Method 7
Method 6
Method 5
Method 4
Method 3
Method 2
VTable indirection
Virtual stub dispatch
public void MyMethod()
mov rdi, qword ptr [rdi + 0x8]
movabs rax, 0x107320848
call qword ptr [rax]
movabs rax, 0x10846fd08
cmp qword ptr [rdi], rax
movabs rax, 0x1083e6c60
jne 0x1073668a5
jmp rax
Lookup stub
Indirect cell
Dispatch stub Resolve stub
Target Generic resolver
(computing, transitive) To make no longer virtual.
| Method | Mean | Error | StdDev |
|----------- |----------:|---------:|---------:|
| NonVirtual | 163.38 us | 1.286 us | 1.203 us |
| Virtual | 163.77 us | 1.848 us | 1.729 us |
| Interface | 191.27 us | 1.754 us | 1.641 us |
public class MyList<T> : IReadOnlyList<T>
    private readonly T[] _data;
    public virtual int Count => _data.Length;
    public virtual T this[int index] => _data[index];
public int Test()
var result = 0;
var data = ...
var length = data.Count;
for (var i = 0; i < length; i++)
    result += data[i];
return result;
in-line expansion (plural in-line expansions)
(software compilation)
The replacement by a compiler of a function call with a copy of
the entire function body.
void DoStuff()
var user = GetUser();
class User
private string _name;
public string Name
get => _name;
set => _name = value;
DoStuff method:
push    rbp
mov     rbp, rsp
call    0x11849bea0 (GetUser)
mov     rdi, rax
call    0x11849f650 (User.get_Name)
mov     rdi, rax
call    0x1184a0b70 (WriteLine)
pop     rbp
User.get_Name method:
mov     rax, qword ptr [rdi + 0x8]
void DoStuff()
var user = GetUser();
class User
private string _name;
public string Name
get => _name;
set => _name = value;
DoStuff method:
push    rbp
mov     rbp, rsp
call    0x11849bea0 (GetUser)
mov     rdi, rax
call    0x11849f650 (User.get_Name)
mov     rdi, rax
call    0x1184a0b70 (WriteLine)
pop     rbp
DoStuff method (with inlining):
push    rbp
mov     rbp, rsp
call    0x12541bea0 (GetUser)
mov     rdi, qword ptr [rax + 0x8]
call    0x125420b70 (WriteLine)
pop     rbp
User.get_Name method:
mov     rax, qword ptr [rdi + 0x8]
void DoStuff()
| Method | Mean | Error | StdDev |
|----------- |----------:|---------:|---------:|
| Inlining | 58.27 us | 1.140 us | 1.120 us |
| NonVirtual | 163.38 us | 1.286 us | 1.203 us |
| Virtual | 163.77 us | 1.848 us | 1.729 us |
| Interface | 191.27 us | 1.754 us | 1.641 us |
public class MyList<T> : IReadOnlyList<T>
    private readonly T[] _data;
    public virtual int Count => _data.Length;
    public virtual T this[int index] => _data[index];
public int Test()
var result = 0;
var data = ...
var length = data.Count;
for (var i = 0; i < length; i++)
    result += data[i];
return result;
Inlining requirements
• Devirtualization / non-virtual call

• No recursion

• Heuristic:

• Inlining is profitable

• Stack size is less than 16 bytes

• IL code is smaller than 16 bytes

• …
Inlining requirements
• Devirtualization / non-virtual call

• No recursion

• Heuristic:

• Inlining is profitable

• Stack size is less than 16 bytes

• IL code is smaller than 16 bytes

• …
namespace System
    public static class Math
        public static decimal Min(decimal val1, decimal val2)
            return decimal.Min(val1, val2);
foreach optimization
| Method | Mean | Error | StdDev |
|-------- |---------:|----------:|----------:|
| List | 2.311 us | 0.0233 us | 0.0218 us |
| IList | 4.392 us | 0.0601 us | 0.0562 us |
public int Test()
    var result = 0;
    IList<int> data = GetData();
    foreach (var item in data)
        result += item;
    return result;
.locals init (
[0] int32,
[1] valuetype List`1/Enumerator<int32>,
call GetData()
callvirt instance valuetype List`1<int32>::GetEnumerator()
br.s loop
start: ldloca.s 1
call instance !0 valuetype List`1/Enumerator<int32>::get_Current()
loop: ldloca.s 1
call instance bool valuetype List`1/Enumerator<int32>::MoveNext()
brtrue.s start
.locals init (
[0] int32,
[1] class IEnumerator`1<int32>,
call GetData()
callvirt instance class IEnumerable`1<int32>::GetEnumerator()
br.s loop
start: ldloc.1
callvirt instance !0 class IEnumerator::get_Current()
loop: ldloc.1
callvirt instance bool IEnumerator::MoveNext()
brtrue.s start
namespace System.Collections.Generic
    public class List<T> : IList<T>, IList, IReadOnlyList<T>
        public Enumerator GetEnumerator() => new Enumerator(this);
        IEnumerator<T> IEnumerable<T>.GetEnumerator() => new Enumerator(this);
        public struct Enumerator : IEnumerator<T>, IEnumerator
            private readonly List<T> _list;
            private int _index;
            private T _current;
            internal Enumerator(List<T> list)
                _list = list;
            public bool MoveNext()
                if (_index < _list._size)
                    _current = _list._items[_index];
                    return true;
                return false;
            public T Current => _current;
That’s all
Useful links
• Егор Богатов — Оптимизации внутри .NET Core

• SIMD + aligning example (corefx repo)


• Just complex SIMD usage (corefx repo)


• Book of the Runtime (a.k.a. BOTR)

.NET Fest 2019. Николай Балакин. Микрооптимизации в мире .NET