经过前面的讨论,我对Image类进行了优化,代码如下:
//C#灰度图像处理类,作者:wmesci
//
/// <summary>
/// 8-bit single-channel image whose pixel buffer lives in unmanaged memory
/// obtained from LocalAlloc. Every row is padded to a multiple of 4 bytes
/// (<see cref="Stride"/>), which lets the per-pixel loops below process four
/// bytes per iteration. The in-place operations therefore also touch the
/// padding bytes at the end of each row.
/// </summary>
unsafe class Image : CriticalHandle, IDisposable
{
    // LocalAlloc flags: LMEM_FIXED | LMEM_ZEROINIT (a.k.a. LPTR).
    const uint ZeroInitFixed = 0x40;

    // Largest smoothing radius for which (2n+1)^2 * 255 still fits in an int.
    const int MaxSmoothRadius = 1450;

    // Sizes are SIZE_T on the native side, so they are marshalled as UIntPtr
    // to stay correct on both 32-bit and 64-bit processes.
    [DllImport("kernel32.dll", SetLastError = true)]
    static extern IntPtr LocalAlloc(uint flags, UIntPtr size);

    [DllImport("kernel32.dll", SetLastError = true)]
    static extern IntPtr LocalFree(IntPtr memBlock);

    [DllImport("kernel32.dll", EntryPoint = "RtlMoveMemory")]
    static extern void CopyMemory(void* dst, void* src, UIntPtr count);

    // memset is a C runtime export and uses the cdecl calling convention.
    [DllImport("ntdll.dll", CallingConvention = CallingConvention.Cdecl)]
    static extern void* memset(void* dst, int value, UIntPtr size);

    const byte Max = 255;
    const byte Min = 0;

    /// <summary>Allocates a zero-initialised image of the given size.</summary>
    /// <param name="width">Number of pixels per row; must be positive.</param>
    /// <param name="height">Number of rows; must be positive.</param>
    /// <exception cref="ArgumentOutOfRangeException">
    /// A dimension is not positive, or the padded buffer would exceed int.MaxValue bytes.
    /// </exception>
    /// <exception cref="OutOfMemoryException">The unmanaged allocation failed.</exception>
    public Image(int width, int height)
        : base(IntPtr.Zero)
    {
        if (width <= 0 || width > int.MaxValue - 3)
        {
            throw new ArgumentOutOfRangeException("width");
        }
        if (height <= 0)
        {
            throw new ArgumentOutOfRangeException("height");
        }

        int stride = (width + 3) & ~3;
        long length = (long)stride * height;
        if (length > int.MaxValue)
        {
            throw new ArgumentOutOfRangeException("height", "Image buffer would exceed int.MaxValue bytes.");
        }

        Width = width;
        Height = height;
        Stride = stride;
        Length = (int)length;

        IntPtr block = LocalAlloc(ZeroInitFixed, new UIntPtr((uint)Length));
        if (block == IntPtr.Zero)
        {
            // Without this check Pointer would be null and the first pixel access would crash.
            throw new OutOfMemoryException();
        }
        SetHandle(block);
        Pointer = (byte*)block.ToPointer();
    }

    /// <summary>
    /// Allocates an image and fills it by copying <see cref="Length"/> bytes from <paramref name="data"/>.
    /// </summary>
    /// <param name="width">Number of pixels per row; must be positive.</param>
    /// <param name="height">Number of rows; must be positive.</param>
    /// <param name="data">Source buffer laid out with this image's stride; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    public Image(int width, int height, byte* data)
        : this(width, height)
    {
        if (data == null)
        {
            // Release the buffer right away instead of waiting for the finalizer.
            Dispose();
            throw new ArgumentNullException("data");
        }
        SetData(data);
    }

    /// <summary>Copies the whole buffer (<see cref="Length"/> bytes, padding included) to <paramref name="dst"/>.</summary>
    /// <param name="dst">Destination buffer of at least <see cref="Length"/> bytes.</param>
    public void GetData(void* dst)
    {
        EnsureOpen();
        if (dst == null)
        {
            throw new ArgumentNullException("dst");
        }
        CopyMemory(dst, Pointer, new UIntPtr((uint)Length));
    }

    /// <summary>Overwrites the whole buffer with <see cref="Length"/> bytes read from <paramref name="src"/>.</summary>
    /// <param name="src">Source buffer of at least <see cref="Length"/> bytes.</param>
    public void SetData(void* src)
    {
        EnsureOpen();
        if (src == null)
        {
            throw new ArgumentNullException("src");
        }
        CopyMemory(Pointer, src, new UIntPtr((uint)Length));
    }

    /// <summary>Number of pixels per row.</summary>
    public readonly int Width;
    /// <summary>Number of rows.</summary>
    public readonly int Height;
    /// <summary>Size of the buffer in bytes (<c>Stride * Height</c>).</summary>
    public readonly int Length;
    /// <summary>Bytes per row: <see cref="Width"/> rounded up to a multiple of 4.</summary>
    public readonly int Stride;
    /// <summary>Address of the first byte of the unmanaged buffer.</summary>
    public readonly byte* Pointer;

    /// <summary>
    /// Reads or writes the byte at column <paramref name="x"/>, row <paramref name="y"/>.
    /// No bounds checking is performed; the caller must stay inside the image.
    /// </summary>
    public byte this[int x, int y]
    {
        get
        {
            return *(Pointer + y * Stride + x);
        }
        set
        {
            *(Pointer + y * Stride + x) = value;
        }
    }

    /// <summary>Creates a new image of the same size holding a copy of this buffer.</summary>
    public Image Clone()
    {
        EnsureOpen();
        return new Image(Width, Height, Pointer);
    }

    /// <summary>In place: <c>this = saturate(this + img)</c>, rows processed in parallel.</summary>
    /// <param name="img">Operand; must be at least as large as this image.</param>
    public void Add(Image img)
    {
        CheckOperand(img, "img");
        Parallel.For(0, Height, y =>
        {
            int stride = Stride;
            byte* p1 = Pointer + y * stride;
            byte* p2 = img.Pointer + y * img.Stride;
            for (int x = 0; x < stride; x += 4, p1 += 4, p2 += 4)
            {
                // The sum of two bytes is never negative, so only the upper bound needs clamping.
                int d = p1[0] + p2[0];
                p1[0] = d > 255 ? Max : (byte)d;
                d = p1[1] + p2[1];
                p1[1] = d > 255 ? Max : (byte)d;
                d = p1[2] + p2[2];
                p1[2] = d > 255 ? Max : (byte)d;
                d = p1[3] + p2[3];
                p1[3] = d > 255 ? Max : (byte)d;
            }
        });
    }

    /// <summary>In place: <c>this = saturate(this - img)</c>, rows processed in parallel.</summary>
    /// <param name="img">Operand; must be at least as large as this image.</param>
    public void Sub(Image img)
    {
        CheckOperand(img, "img");
        Parallel.For(0, Height, y =>
        {
            int stride = Stride;
            byte* p1 = Pointer + y * stride;
            byte* p2 = img.Pointer + y * img.Stride;
            for (int x = 0; x < stride; x += 4, p1 += 4, p2 += 4)
            {
                // The difference of two bytes never exceeds 255, so only the lower bound needs clamping.
                int d = p1[0] - p2[0];
                p1[0] = d < 0 ? Min : (byte)d;
                d = p1[1] - p2[1];
                p1[1] = d < 0 ? Min : (byte)d;
                d = p1[2] - p2[2];
                p1[2] = d < 0 ? Min : (byte)d;
                d = p1[3] - p2[3];
                p1[3] = d < 0 ? Min : (byte)d;
            }
        });
    }

    /// <summary>In place: <c>this = saturate(this * img * scale)</c>, rows processed in parallel.</summary>
    /// <param name="img">Operand; must be at least as large as this image.</param>
    /// <param name="scale">Factor applied to the integer product of the two pixels.</param>
    public void Mul(Image img, double scale)
    {
        CheckOperand(img, "img");
        Parallel.For(0, Height, y =>
        {
            int stride = Stride;
            byte* p1 = Pointer + y * stride;
            byte* p2 = img.Pointer + y * img.Stride;
            for (int x = 0; x < stride; x += 4, p1 += 4, p2 += 4)
            {
                // Integer multiply first, then a single floating-point multiply.
                double d = p1[0] * p2[0] * scale;
                p1[0] = d < 0 ? Min : d > 255 ? Max : (byte)d;
                d = p1[1] * p2[1] * scale;
                p1[1] = d < 0 ? Min : d > 255 ? Max : (byte)d;
                d = p1[2] * p2[2] * scale;
                p1[2] = d < 0 ? Min : d > 255 ? Max : (byte)d;
                d = p1[3] * p2[3] * scale;
                p1[3] = d < 0 ? Min : d > 255 ? Max : (byte)d;
            }
        });
    }

    /// <summary>
    /// In place binarisation: bytes below <paramref name="threshold"/> become 0, all others 255.
    /// </summary>
    public void Threshold(byte threshold)
    {
        EnsureOpen();
        Parallel.For(0, Height, y =>
        {
            int stride = Stride;
            byte* p = Pointer + y * stride;
            for (int x = 0; x < stride; x += 4, p += 4)
            {
                p[0] = p[0] < threshold ? Min : Max;
                p[1] = p[1] < threshold ? Min : Max;
                p[2] = p[2] < threshold ? Min : Max;
                p[3] = p[3] < threshold ? Min : Max;
            }
        });
    }

    /// <summary>
    /// In place: <c>this = saturate((int)(this * a) + (int)(img * b))</c>, using two
    /// 256-entry lookup tables so the inner loop needs no floating-point work.
    /// </summary>
    /// <param name="img">Operand; must be at least as large as this image.</param>
    /// <param name="a">Weight applied to this image's pixels.</param>
    /// <param name="b">Weight applied to <paramref name="img"/>'s pixels.</param>
    public void AddWeighted(Image img, double a, double b)
    {
        CheckOperand(img, "img");

        // The tables live on this method's stack frame; Parallel.For does not
        // return until every worker has finished, so they outlive all readers.
        int* taba = stackalloc int[256];
        int* tabb = stackalloc int[256];
        FillWeightTable(taba, a);
        FillWeightTable(tabb, b);

        Parallel.For(0, Height, y =>
        {
            int stride = Stride;
            byte* p1 = Pointer + y * stride;
            byte* p2 = img.Pointer + y * img.Stride;
            for (int x = 0; x < stride; x += 4, p1 += 4, p2 += 4)
            {
                // The second operand must be looked up in tabb (weight b), not taba.
                int d = taba[p1[0]] + tabb[p2[0]];
                p1[0] = d < 0 ? Min : d > 255 ? Max : (byte)d;
                d = taba[p1[1]] + tabb[p2[1]];
                p1[1] = d < 0 ? Min : d > 255 ? Max : (byte)d;
                d = taba[p1[2]] + tabb[p2[2]];
                p1[2] = d < 0 ? Min : d > 255 ? Max : (byte)d;
                d = taba[p1[3]] + tabb[p2[3]];
                p1[3] = d < 0 ? Min : d > 255 ? Max : (byte)d;
            }
        });
    }

    /// <summary>
    /// Box filter with a (2n+1) x (2n+1) window and replicated borders, computed as a
    /// horizontal running sum followed by a vertical running sum. The result is
    /// truncated to a byte. <paramref name="src"/> and <paramref name="dst"/> may be
    /// the same image because all source reads finish before any destination write.
    /// </summary>
    /// <param name="src">Image to smooth.</param>
    /// <param name="dst">Receives the result; must be at least as large as <paramref name="src"/>.</param>
    /// <param name="n">Window radius, from 0 to 1450 inclusive.</param>
    public static void Smooth(Image src, Image dst, int n)
    {
        if (src == null)
        {
            throw new ArgumentNullException("src");
        }
        if (dst == null)
        {
            throw new ArgumentNullException("dst");
        }
        src.EnsureOpen();
        dst.EnsureOpen();
        if (n < 0 || n > MaxSmoothRadius)
        {
            // Larger radii would overflow the int accumulators below.
            throw new ArgumentOutOfRangeException("n");
        }
        if (dst.Width < src.Width || dst.Height < src.Height)
        {
            throw new ArgumentException("Destination image is smaller than the source image.", "dst");
        }

        int width = src.Width;
        int height = src.Height;

        // Temporary buffer of horizontal window sums, one int per pixel, tightly packed.
        IntPtr block = Marshal.AllocHGlobal(new IntPtr((long)width * height * sizeof(int)));
        try
        {
            int* tmp = (int*)block.ToPointer();

            // Pass 1: per row, sliding-window sum along x. Clamping to Width (not Stride)
            // replicates the last real pixel instead of mixing in the row padding.
            Parallel.For(0, height, y =>
            {
                byte* row = src.Pointer + y * src.Stride;
                int* sums = tmp + y * width;
                int d = 0;
                for (int i = -n; i <= n; i++)
                {
                    d += row[Clamp(i, width)];
                }
                sums[0] = d;
                for (int x = 1; x < width; x++)
                {
                    // Slide the window: add the entering pixel, drop the leaving one.
                    d += row[Clamp(x + n, width)] - row[Clamp(x - n - 1, width)];
                    sums[x] = d;
                }
            });

            double f = 1.0 / (2 * n + 1);
            f *= f;

            // Pass 2: per column, sliding-window sum along y over the horizontal sums.
            Parallel.For(0, width, x =>
            {
                int* col = tmp + x;
                int dstStride = dst.Stride;
                byte* p = dst.Pointer + x;
                int d = 0;
                for (int j = -n; j <= n; j++)
                {
                    d += col[Clamp(j, height) * width];
                }
                *p = (byte)(d * f);
                p += dstStride;
                for (int y = 1; y < height; y++, p += dstStride)
                {
                    d += col[Clamp(y + n, height) * width] - col[Clamp(y - n - 1, height) * width];
                    *p = (byte)(d * f);
                }
            });
        }
        finally
        {
            // Free the scratch buffer even if a worker throws.
            Marshal.FreeHGlobal(block);
        }
    }

    /// <summary>Clamps <paramref name="i"/> to the range [0, max - 1].</summary>
    private static int Clamp(int i, int max)
    {
        if (i < 0) return 0;
        if (i >= max) return max - 1;
        return i;
    }

    /// <summary>
    /// Fills a 256-entry table with <c>(int)(i * weight)</c>. Entries are limited to
    /// +/- int.MaxValue / 2 so that adding two entries can never overflow an int.
    /// </summary>
    private static void FillWeightTable(int* table, double weight)
    {
        const int Limit = int.MaxValue / 2;
        for (int i = 0; i < 256; i++)
        {
            double v = i * weight;
            if (v > Limit)
            {
                table[i] = Limit;
            }
            else if (v < -Limit)
            {
                table[i] = -Limit;
            }
            else
            {
                table[i] = (int)v;
            }
        }
    }

    /// <summary>Throws if the unmanaged buffer has already been released.</summary>
    private void EnsureOpen()
    {
        if (IsClosed || IsInvalid)
        {
            throw new ObjectDisposedException("Image");
        }
    }

    /// <summary>
    /// Validates the second operand of an in-place operation. The loops read
    /// <see cref="Stride"/> bytes from each of <see cref="Height"/> rows of the operand,
    /// so it must provide at least that much memory.
    /// </summary>
    private void CheckOperand(Image img, string paramName)
    {
        EnsureOpen();
        if (img == null)
        {
            throw new ArgumentNullException(paramName);
        }
        img.EnsureOpen();
        if (img.Stride < Stride || img.Height < Height)
        {
            throw new ArgumentException("Operand image is smaller than this image.", paramName);
        }
    }

    /// <summary>True when no buffer is attached to this handle.</summary>
    public override bool IsInvalid
    {
        get { return handle == IntPtr.Zero; }
    }

    /// <summary>Frees the unmanaged buffer; reports failure if LocalFree does.</summary>
    protected override bool ReleaseHandle()
    {
        // LocalFree returns NULL on success and the original handle on failure.
        return LocalFree(handle) == IntPtr.Zero;
    }
}
主要修改的地方如下:
1、将图像的每一行4字节对齐,增加Stride属性,其值等于Width向上取最近的4的倍数,然后在所有的for循环里,每次操作4个字节。这样一来,减少了循环次数。
2、减少浮点运算
A:Add/Sub方法中的临时变量d改为int型
B:Mul方法中,将scale * *p1 * *p2改为p1[0] * p2[0] * scale,区别在于,前一种先算scale * *p1,是一个浮点乘法,其结果也是浮点数,然后再算和*p2的乘积,共两次浮点乘法;而后一种先算p1[0] * p2[0],这是一次整数乘法,然后再算和scale的积,共一次整数乘法一次浮点乘法。由于浮点乘法比整数乘法慢,因此效率会有所提高。
3、AddWeighted改为使用查表法进行运算,首先算出0~255这256个数和a、b的积,放在数组taba、tabb中,其后的循环中只需查表再相加即可,效率大幅提高!
下面是优化后的测试结果(数值表示Image类方法和对应的OpenCV方法执行时间之比):
CPU:AMD Athlon(tm) II X4 640 3.00GHz (四核)
样本:600 X 896
-------------------------------------
Add 1.446 1.315
Sub 1.171 1.109
Mul 0.651 0.580
Threshold 1.511 1.432
Smooth 0.938 0.908
AddWeighted 0.528 0.474
CPU:AMD Athlon(tm) II X4 640 3.00GHz (四核)
样本:1600 X 1200
-------------------------------------
Add 1.041 1.052
Sub 0.910 0.906
Mul 0.562 0.558
Threshold 1.277 1.236
Smooth 1.020 1.024
AddWeighted 0.462 0.461
CPU:AMD Athlon(tm) II X2 245 2.91GHz (双核)
样本:1600 X 1200
-------------------------------------
Add 1.514 1.533
Sub 1.225 1.163
Mul 1.085 1.095
Threshold 1.643 1.630
Smooth 1.847 1.867
AddWeighted 0.957 0.924
CPU:AMD Athlon(tm) II X2 245 2.91GHz (双核)
样本:600 X 896
-------------------------------------
Add 2.559 2.073 2.676
Sub 2.240 1.784 1.856
Mul 1.261 1.352 1.284
Threshold 2.453 2.511 3.101
Smooth 1.660 1.647 1.663
AddWeighted 0.978 1.017 0.961
CPU:Intel Core i3 M330 2.13GHz (双核四线程)
样本:1600 X 1200
-------------------------------------
Add 2.611
Sub 2.545
Mul 1.011
Threshold 2.882
Smooth 1.891
AddWeighted 0.525
CPU:Intel Core i3 M330 2.13GHz (双核四线程)
样本:600 X 896
-------------------------------------
Add 4.483
Sub 3.576
Mul 1.101
Threshold 5.953
Smooth 2.029
AddWeighted 0.581
CPU:Intel Core i7 2630QM 2.00GHz (四核八线程)
样本:600 X 896
-------------------------------------
Add 1.080 1.020
Sub 0.977 1.010
Mul 0.575 0.558
Threshold 0.842 0.898
Smooth 1.447 1.386
AddWeighted 0.325 0.366
CPU:Intel Core i7 2630QM 2.00GHz (四核八线程)
样本:1600 X 1200
-------------------------------------
Add 1.420
Sub 1.134
Mul 0.535
Threshold 0.878
Smooth 1.379
AddWeighted 0.325
分析以上数据,我们不难发现以下几点:
1、样本大小相同时,CPU核心数越多,Image/OpenCV就越小,这说明了多线程算法在多核CPU下的优势。
2、CPU相同时,样本大小越大,比值越小。
3、OpenCV针对Intel CPU使用IPP进行了优化,因此在Intel CPU上跑,比值会比在AMD CPU上大很多。
4、OpenCV里使用SSE优化过的方法,用C#实现时差距比较明显,怎么才能达到差不多的效率,这个暂时还没想到。但是OpenCV里没使用SSE优化的方法,如Mul、AddWeighted,使用C#完全可以达到相同的性能,甚至超过OpenCV,如AddWeighted,在双核CPU上也比OpenCV要快,在4核以上CPU上远超OpenCV!!
5、使用自写方法替代OpenCV是完全可行的!!
以上测试有几点需要说明:
1、CLR是在第一次运行某个方法时才进行编译,因此第一次执行某个方法时会慢很多,在计算时间时要排除第一次执行的时间。
2、C#调用OpenCV需要经过封送处理,但封送处理所消耗的时间在这里无法避免。
各位看官如有其它优化修改意见,还望不吝赐教!!!同时我会不断对这个类进行修改以及添加其它图像处理方法,对图像处理、OpenCV以及C#代码优化感兴趣的同学,请关注本贴!!