c# – Sobel Operator – SIMD x86 Intrinsics Implementation

I’m learning C# .NET 5 Intrinsics and interested in best practices. It’s really hard now to find enough information about how SIMD instructions (logically/internally) work in .NET. In addition i’m not familiar with C++ and Assebmly languages.

The purpose of the solution – apply Sobel Operator filter to the loaded image. For image operations i used System.Drawing.Common NuGet package. Thus the solution is Windows-only.

The SobelOperator class contains two Sobel Operator implementations:

  1. SobelOperatorScalar – Scalar solution that can be used as a fallback if current CPU is not compartible with AVX2.
  2. SobelOperatorSimd – SIMD x86 solution for hardware acceleration. – review target

Code for review

public interface ISobelOperator
{
    Bitmap Apply(Bitmap bmp);
}

public class SobelOperator : ISobelOperator
{
    private static Color() _grayPallette;
    private readonly ISobelOperator _operator;

    public bool IsHardwareAccelerated { get; } 
            
    public SobelOperator(bool hardwareAccelerated = true)
    {
        if (_grayPallette == null)
            _grayPallette = Enumerable.Range(0, 256).Select(i => Color.FromArgb(i, i, i)).ToArray();

        IsHardwareAccelerated = hardwareAccelerated && Avx2.IsSupported;

        _operator = IsHardwareAccelerated ? new SobelOperatorSimd() : new SobelOperatorScalar();
    }

    public Bitmap Apply(Bitmap bmp) 
        => _operator.Apply(bmp);

    private class SobelOperatorSimd : ISobelOperator
    {
        private const byte m0 = 0b01001001;
        private const byte m1 = 0b10010010;
        private const byte m2 = 0b00100100;

        //0.299R + 0.587G + 0.114B
        private readonly Vector256<float> bWeight = Vector256.Create(0.114f);
        private readonly Vector256<float> gWeight = Vector256.Create(0.587f);
        private readonly Vector256<float> rWeight = Vector256.Create(0.299f);
        private readonly Vector256<int> bMut = Vector256.Create(0, 3, 6, 1, 4, 7, 2, 5);
        private readonly Vector256<int> gMut = Vector256.Create(1, 4, 7, 2, 5, 0, 3, 6);
        private readonly Vector256<int> rMut = Vector256.Create(2, 5, 0, 3, 6, 1, 4, 7);

        (MethodImpl(MethodImplOptions.AggressiveInlining))
        private unsafe Vector256<int> GetBrightness(byte* ptr)
        {
            Vector256<int> v0 = Avx2.ConvertToVector256Int32(ptr);
            Vector256<int> v1 = Avx2.ConvertToVector256Int32(ptr + 8);
            Vector256<int> v2 = Avx2.ConvertToVector256Int32(ptr + 16);

            Vector256<int> vb = Avx2.Blend(Avx2.Blend(v0, v1, m1), v2, m2);
            vb = Avx2.PermuteVar8x32(vb, bMut);
            Vector256<int> vg = Avx2.Blend(Avx2.Blend(v0, v1, m2), v2, m0);
            vg = Avx2.PermuteVar8x32(vg, gMut);
            Vector256<int> vr = Avx2.Blend(Avx2.Blend(v0, v1, m0), v2, m1);
            vr = Avx2.PermuteVar8x32(vr, rMut);
            Vector256<float> vfb = Avx.Multiply(Avx.ConvertToVector256Single(vb), bWeight);
            Vector256<float> vfg = Avx.Multiply(Avx.ConvertToVector256Single(vg), gWeight);
            Vector256<float> vfr = Avx.Multiply(Avx.ConvertToVector256Single(vr), rWeight);
            return Avx.ConvertToVector256Int32WithTruncation(Avx.Add(Avx.Add(vfb, vfg), vfr));
        }

        (MethodImpl(MethodImplOptions.AggressiveInlining))
        private unsafe void ToGrayscale(byte* srcPtr, byte* dstPtr, int pixelsCount)
        {
            byte* tail = srcPtr + (pixelsCount & -16) * 3;
            byte* srcEnd = srcPtr + pixelsCount * 3;
            byte* dstEnd = dstPtr + pixelsCount;

            while (true)
            {
                while (srcPtr < tail)
                {
                    Vector256<int> vi0 = GetBrightness(srcPtr);
                    Vector256<int> vi1 = GetBrightness(srcPtr + 24);
                    Vector128<short> v0 = Sse2.PackSignedSaturate(Avx2.ExtractVector128(vi0, 0), Avx2.ExtractVector128(vi0, 1));
                    Vector128<short> v1 = Sse2.PackSignedSaturate(Avx2.ExtractVector128(vi1, 0), Avx2.ExtractVector128(vi1, 1));
                    Sse2.Store(dstPtr, Sse2.PackUnsignedSaturate(v0, v1));
                    srcPtr += 48;
                    dstPtr += 16;
                }
                if (srcPtr == srcEnd)
                    break;
                tail = srcEnd;
                srcPtr = srcEnd - 48;
                dstPtr = dstEnd - 16;
            }
        }

        (MethodImpl(MethodImplOptions.AggressiveInlining))
        private static unsafe Vector128<byte> ApplySobelKernel(byte* srcPtr, int width)
        {
            Vector256<short> v00 = Avx2.ConvertToVector256Int16(srcPtr);
            Vector256<short> v01 = Avx2.ConvertToVector256Int16(srcPtr + 1);
            Vector256<short> v02 = Avx2.ConvertToVector256Int16(srcPtr + 2);
            Vector256<short> v10 = Avx2.ConvertToVector256Int16(srcPtr + width);
            Vector256<short> v12 = Avx2.ConvertToVector256Int16(srcPtr + width + 2);
            Vector256<short> v20 = Avx2.ConvertToVector256Int16(srcPtr + width * 2);
            Vector256<short> v21 = Avx2.ConvertToVector256Int16(srcPtr + width * 2 + 1);
            Vector256<short> v22 = Avx2.ConvertToVector256Int16(srcPtr + width * 2 + 2);

            Vector256<short> vgx = Avx2.Subtract(v02, v00);
            vgx = Avx2.Subtract(vgx, Avx2.ShiftLeftLogical(v10, 1));
            vgx = Avx2.Add(vgx, Avx2.ShiftLeftLogical(v12, 1));
            vgx = Avx2.Subtract(vgx, v20);
            vgx = Avx2.Add(vgx, v22);

            Vector256<short> vgy = Avx2.Add(v00, Avx2.ShiftLeftLogical(v01, 1));
            vgy = Avx2.Add(vgy, v02);
            vgy = Avx2.Subtract(vgy, v20);
            vgy = Avx2.Subtract(vgy, Avx2.ShiftLeftLogical(v21, 1));
            vgy = Avx2.Subtract(vgy, v22);

            // sqrt(vgx * vgx + vgy * vgy)
            Vector256<short> vgp0 = Avx2.UnpackLow(vgx, vgy);
            Vector256<short> vgp1 = Avx2.UnpackHigh(vgx, vgy);
            Vector256<int> v0 = Avx2.MultiplyAddAdjacent(vgp0, vgp0);
            Vector256<int> v1 = Avx2.MultiplyAddAdjacent(vgp1, vgp1);
            Vector256<int> gt0 = Avx.ConvertToVector256Int32WithTruncation(Avx.Sqrt(Avx.ConvertToVector256Single(v0)));
            Vector256<int> gt1 = Avx.ConvertToVector256Int32WithTruncation(Avx.Sqrt(Avx.ConvertToVector256Single(v1)));

            Vector128<short> gts0 = Sse2.PackSignedSaturate(Avx2.ExtractVector128(gt0, 0), Avx2.ExtractVector128(gt1, 0));
            Vector128<short> gts1 = Sse2.PackSignedSaturate(Avx2.ExtractVector128(gt0, 1), Avx2.ExtractVector128(gt1, 1));
            return Sse2.PackUnsignedSaturate(gts0, gts1);
        }


        public Bitmap Apply(Bitmap bmp)
        {
            int width = bmp.Width;
            int height = bmp.Height;
            int pixelsCount = width * height;
            byte() buffer = new byte(pixelsCount);

            Rectangle rect = new Rectangle(Point.Empty, bmp.Size);
            Bitmap outBmp = new Bitmap(width, height, PixelFormat.Format8bppIndexed);
            ColorPalette pal = outBmp.Palette;
            for (int i = 0; i < 256; i++)
                pal.Entries(i) = _grayPallette(i);
            outBmp.Palette = pal;

            unsafe
            {
                fixed (byte* bufPtr = buffer)
                {
                    BitmapData bmpData = bmp.LockBits(rect, ImageLockMode.ReadOnly, PixelFormat.Format24bppRgb);
                    ToGrayscale((byte*)bmpData.Scan0.ToPointer(), bufPtr, pixelsCount);
                    bmp.UnlockBits(bmpData);

                    BitmapData outBmpData = outBmp.LockBits(rect, ImageLockMode.WriteOnly, PixelFormat.Format8bppIndexed);
                    byte* dstPtr = (byte*)outBmpData.Scan0.ToPointer();

                    int length = pixelsCount - width * 2 - 1;
                    byte* tail = bufPtr + (length & -16);
                    byte* srcPos = bufPtr;
                    byte* srcEnd = bufPtr + length;
                    byte* dstPos = dstPtr + width + 1;
                    byte* dstEnd = dstPos + length;

                    while (true)
                    {
                        while (srcPos < tail)
                        {
                            Sse2.Store(dstPos, ApplySobelKernel(srcPos, width));
                            srcPos += 16;
                            dstPos += 16;
                        }

                        if (srcPos == srcEnd)
                            break;
                        tail = srcEnd;
                        srcPos = srcEnd - 16;
                        dstPos = dstEnd - 16;
                    }

                    for (dstPos = dstPtr + width; dstPos <= dstPtr + pixelsCount - width; dstPos += width)
                    {
                        *dstPos-- = 0;
                        *dstPos++ = 0;
                    }

                    outBmp.UnlockBits(outBmpData);
                }
            }

            return outBmp;
        }
    }

    private class SobelOperatorScalar : ISobelOperator
    {
        public Bitmap Apply(Bitmap bmp)
        {
            BitmapData bmpData = bmp.LockBits(new Rectangle(Point.Empty, bmp.Size), ImageLockMode.ReadOnly, PixelFormat.Format24bppRgb);
            int strideLength = bmpData.Stride * bmpData.Height;
            byte() buffer = new byte(Math.Abs(strideLength));
            Marshal.Copy(bmpData.Scan0, buffer, 0, strideLength);
            bmp.UnlockBits(bmpData);

            int width = bmp.Width;
            int height = bmp.Height;
            int pixelsCount = width * height;
            byte() pixelBuffer = new byte(pixelsCount);
            byte() resultBuffer = new byte(pixelsCount);

            //0.299R + 0.587G + 0.114B
            for (int i = 0; i < pixelsCount; i++)
            {
                int offset = i * 3;
                byte brightness = (byte)(buffer(offset) * 0.114f + buffer(offset + 1) * 0.587f + buffer(offset + 2) * 0.299f);
                pixelBuffer(i) = brightness;
            }

            for (int i = width + 1; i < pixelsCount - width - 1; i++)
            {
                if (i % width == width - 1)
                    i += 2;

                int gx = -pixelBuffer(i - 1 - width) + pixelBuffer(i + 1 - width) - 2 * pixelBuffer(i - 1) +
                    2 * pixelBuffer(i + 1) - pixelBuffer(i - 1 + width) + pixelBuffer(i + 1 + width);

                int gy = pixelBuffer(i - 1 - width) + 2 * pixelBuffer(i - width) + pixelBuffer(i + 1 - width) -
                    pixelBuffer(i - 1 + width) - 2 * pixelBuffer(i + width) - pixelBuffer(i + 1 + width);

                int gt = (int)MathF.Sqrt(gx * gx + gy * gy);
                if (gt > byte.MaxValue) gt = byte.MaxValue;

                resultBuffer(i) = (byte)gt;
            }

            Bitmap outBmp = new Bitmap(width, height, PixelFormat.Format8bppIndexed);
            BitmapData outBmpData = outBmp.LockBits(new Rectangle(Point.Empty, outBmp.Size), ImageLockMode.WriteOnly, PixelFormat.Format8bppIndexed);
            Marshal.Copy(resultBuffer, 0, outBmpData.Scan0, outBmpData.Stride * outBmpData.Height);
            outBmp.UnlockBits(outBmpData);
            ColorPalette pal = outBmp.Palette;
            for (int i = 0; i < 256; i++)
                pal.Entries(i) = _grayPallette(i);
            outBmp.Palette = pal;
            return outBmp;
        }
    }
}

Output test

Test image
source image

Program.cs

static void Main(string() args)
{
    const string fileName = "image.jpg";
    Bitmap bmp = new Bitmap(fileName);

    SobelOperator sobelOperator = new SobelOperator();
    Console.WriteLine($"SIMD accelerated: {(sobelOperator.IsHardwareAccelerated ? "Yes" : "No")}");
    Bitmap result = sobelOperator.Apply(bmp);
    result.Save("out.jpg", ImageFormat.Jpeg);

    Console.WriteLine("Done.");
    Console.ReadKey();
}

Console output

SIMD accelerated: Yes
Done.

Output Image
enter image description here

Output images of Scalar and SIMD implementations are binary idential.

Benchmark.NET

(MemoryDiagnoser)
public class MyBenchmark
{
    private readonly ISobelOperator _sobelOperator = new SobelOperator();
    private readonly ISobelOperator _sobelOperatorSw = new SobelOperator(false);
    private readonly Bitmap bmp = new Bitmap(@"C:Sourceimage.jpg");

    (Benchmark(Description = "SIMD Enabled"))
    public Bitmap TestSimd()
    {
        return _sobelOperator.Apply(bmp);
    }

    (Benchmark(Description = "SIMD Disabled"))
    public Bitmap TestScalar()
    {
        return _sobelOperatorSw.Apply(bmp);
    }
}

static void Main(string() args)
{
    var summary = BenchmarkRunner.Run<MyBenchmark>();
    Console.ReadKey();
}
BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19042
Intel Core i7-4700HQ CPU 2.40GHz (Haswell), 1 CPU, 8 logical and 4 physical cores
.NET Core SDK=5.0.101
  (Host)     : .NET Core 5.0.1 (CoreCLR 5.0.120.57516, CoreFX 5.0.120.57516), X64 RyuJIT
  DefaultJob : .NET Core 5.0.1 (CoreCLR 5.0.120.57516, CoreFX 5.0.120.57516), X64 RyuJIT

|          Method |      Mean |     Error |    StdDev |    Gen 0 |    Gen 1 |    Gen 2 | Allocated |
|---------------- |----------:|----------:|----------:|---------:|---------:|---------:|----------:|
|  'SIMD Enabled' |  7.285 ms | 0.1165 ms | 0.1089 ms | 992.1875 | 992.1875 | 992.1875 |   3.35 MB |
| 'SIMD Disabled' | 48.412 ms | 0.2312 ms | 0.2162 ms | 454.5455 | 454.5455 | 454.5455 |  16.61 MB |

Intrinsics solution is ~6.6x Times faster. And eating less memory in general because it’s unsafe and doesn’t use Marshal.Copy to load/save the byte() buffers.