本文主要是介绍xmemcpy改进版,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
xmemcpy改进版:利用movdqu速度快的特点,并通过内联和常量化(长度在编译期已知)来提高小块内存memcpy的性能
xmemcpy来自github /progs/C/c_progs/memcpy.c ,不知道是不是原作者,这里进行了部分改进
------2016-2-28注意1:以下测试中的缓冲区由于被反复读写,始终位于L1 cache中(类似于栈内存的情况)。如果数据总是位于超出cache的内存中,则受内存速度拖累,改进版与memcpy很难拉开差距,但仍然有一定的效果
------2016-2-28注意2:DEBUG下速度会很慢,除非关闭/GS或用 #pragma runtime_checks( "s", restore ) (此编译杂注对模板无效)
------2016-3-5 注意3:参看zmemcpy改进版,对debug模式有相当大的提高 http://blog.csdn.net/superzmy/article/details/50810343
预期结果:
All time to memcpy 80 * 100M is 0.248s in 3GHz (xmemcopy)
All time to memcpy 80 * 100M is 0.476s in 3GHz (xmemcpy)
All time to memcpy 80 * 100M is 0.778s in 3GHz (xmemcpy unknownSize)
All time to memcpy 80 * 100M is 0.232s in 3GHz (movdq)
All time to memcpy 80 * 100M is 0.257s in 3GHz (movdq unalign)
All time to memcpy 81 * 100M is 0.298s in 3GHz (movdq)
All time to memcpy 81 * 100M is 0.264s in 3GHz (movdq unalign)
All time to memcpy 400 * 100M is 1.334s in 3GHz (xmemcopy)
All time to memcpy 400 * 100M is 1.236s in 3GHz (xmemcopy unalign)
All time to memcpy 400 * 100M is 1.819s in 3GHz (xmemcpy)
All time to memcpy 400 * 100M is 3.051s in 3GHz (rep movs)
All time to memcpy 400 * 100M is 2.984s in 3GHz (rep movs unalign)
All time to memcpy 400 * 100M is 3.015s in 3GHz (rep movs handwrite asm)
All time to memcpy 401 * 100M is 3.093s in 3GHz (rep movs)
All time to memcpy 401 * 100M is 3.193s in 3GHz (rep movs handwrite asm)
All time to memcpy 80 * 100M is 1.216s in 3GHz (rep movs handwrite asm)
All time to memcpy 4000 * 100M is 15.254s in 3GHz (rep movs handwrite asm)
All time to memcpy 80 * 100M is 1.824s in 3GHz (call _memcpy)
All time to memcpy 81 * 100M is 1.828s in 3GHz (call _memcpy)
All time to memcpy 81 * 100M is 1.779s in 3GHz (call _memcpy unalign)
All time to memcpy 400 * 100M is 2.554s in 3GHz (call _memcpy)
All time to memcpy 401 * 100M is 2.777s in 3GHz (call _memcpy)
All time to memcpy 401 * 100M is 2.725s in 3GHz (call _memcpy unalign)
All time to memcpy 4000 * 100M is 14.379s in 3GHz (call _memcpy)
以上结果:代码使用VS2013编译,在E3 1230V2上运行
// ConsoleApplication3.cpp : 定义控制台应用程序的入口点。
//#include "stdafx.h"
#include <windows.h>
#include <intrin.h>
#include <tchar.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
// Source buffers for the copy benchmarks. Every buffer starts with one or more
// repetitions of the 36-character pattern "a..z0..9"; any bytes not covered by
// the initializer are zero.

// 80 bytes: one pattern followed by zero padding.
char data80[80] =
    "abcdefghijklmnopqrstuvwxyz0123456789";

// 400 bytes: ten patterns plus 39 digits (399 characters and the terminating NUL).
char data400[400] =
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "012345678901234567890123456789012345678";

// 4000 bytes: ten patterns followed by zero padding.
char data4000[4000] =
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789";

// 401 bytes: ten patterns followed by zero padding (size is not a multiple of 4).
char data401[401] =
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789"
    "abcdefghijklmnopqrstuvwxyz0123456789";

// 81 bytes: one pattern followed by zero padding (size is not a multiple of 4).
char data81[81] =
    "abcdefghijklmnopqrstuvwxyz0123456789";

// optimize memcpy less than 120bytes
// char a[32], b[32]; a = b; is faster than memcpy(a, b, sizeof(b));namespace com
{const static size_t _MAXSIZE_ = 80;extern void* (*g_base[_MAXSIZE_+1])(void *dest, const void *src);
};inline void *xmemcpy(void *dest, const void *src, size_t len);namespace com
{template <size_t size>struct xmemcpy_t{int data[size];};template <>struct xmemcpy_t<0>{};template <size_t size>class xmemcopy{public:inline static void * copy(void *dest, const void *src){if (size > _MAXSIZE_){size_t i = 0;for (; i + _MAXSIZE_ <= size; i += _MAXSIZE_)xmemcopy<_MAXSIZE_>::copy((char*)dest + i, (const char*)src + i);if (size % _MAXSIZE_) xmemcopy<size % _MAXSIZE_>::copy((char*)dest + i, (const char*)src + i);return dest;}typedef xmemcpy_t<((size - 1) % _MAXSIZE_ + 1) / sizeof(int)> type_t;*((type_t *)dest) = *((type_t *)src);if ((size%sizeof(int)) > 0) {((char *)dest)[size - 1] = ((char *)src)[size - 1];}if ((size%sizeof(int)) > 1) {((char *)dest)[size - 2] = ((char *)src)[size - 2];}if ((size%sizeof(int)) > 2) {((char *)dest)[size - 3] = ((char *)src)[size - 3];}return dest;}};template <>class xmemcopy<0>{public:static void * copy(void *dest, const void *src) { return dest; }};void* (*g_base[_MAXSIZE_+1])(void *dest, const void *src);template <size_t len>void init() {g_base[len] = xmemcopy<len>::copy;init<len - 1>();}template <>void init<0>() {g_base[0] = xmemcopy<0>::copy;}struct xmem_monitor{xmem_monitor() {init<_MAXSIZE_>();}};static xmem_monitor g_monitor;
}inline void *xmemcpy(void *dest, const void *src, size_t len)
{if (len <= com::_MAXSIZE_) {return com::g_base[len](dest, src);}else if (len <= com::_MAXSIZE_ * 10){size_t i = 0;for (; i + com::_MAXSIZE_ < len; i += com::_MAXSIZE_)com::xmemcopy<com::_MAXSIZE_>::copy((char*)dest + i, (const char*)src + i);com::g_base[len - i]((char*)dest + i, (const char*)src + i);return dest;}return ::memcpy(dest, src, len);
}int _tmain(int argc, _TCHAR* argv[])
{SetProcessAffinityMask(GetCurrentProcess(), 2);char buffer[10000] = {};com::xmemcopy<com::_MAXSIZE_ * 2>::copy(buffer, data400);if (memcmp(buffer, data400, com::_MAXSIZE_ * 2))__asm int 3;com::xmemcopy<com::_MAXSIZE_ * 2 + 1>::copy(buffer, data400);if (memcmp(buffer, data400, com::_MAXSIZE_ * 2 + 1))__asm int 3;com::xmemcopy<400>::copy(buffer, data400);if(memcmp(buffer, data400, 400))__asm int 3;char* volatile pb = buffer;char* volatile pb1 = buffer + 1;size_t volatile size40 = sizeof(data80);size_t volatile size41 = sizeof(data81);assert((int)pb % 4 == 0);assert((int)pb1 % 4 == 1);assert((int)data80 % 8 == 0);assert((int)data400 % 8 == 0);assert((int)data4000 % 8 == 0);for (int i = 0; i < 10; ++i){memcpy(pb, data80, size40);memcpy(pb, data81, size41);memcpy(pb, data400, sizeof(data400));memcpy(pb, data401, sizeof(data401));memcpy(pb, data4000, sizeof(data4000));}printf("\n");enum { Count = 100000000 };
#if(1){auto& dest = data80;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)com::xmemcopy<sizeof(dest)>::copy(pb, dest);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data80;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)xmemcpy(pb, dest, sizeof(dest));t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data80;size_t volatile size = sizeof(dest);__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)xmemcpy(pb, dest, size);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy unknownSize)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data80;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb, dest, sizeof(dest));t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data80;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb1, dest, sizeof(dest));t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data81;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb, dest, sizeof(dest));t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data81;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb1, dest, sizeof(dest));t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (movdq unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}// {auto& dest = data400;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)com::xmemcopy<sizeof(dest)>::copy(pb, dest);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy)\n", sizeof(dest), Count / 1000000, 
t / 3000000000.0);}{auto& dest = data400;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)com::xmemcopy<sizeof(dest)>::copy(pb1, dest);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcopy unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}
#endifmemset(pb, 0, 400);{auto& dest = data400;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)xmemcpy(pb, dest, sizeof(dest));t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (xmemcpy)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data400;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb, dest, sizeof(dest));t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data400;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb1, dest, sizeof(dest));t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs unalign)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data400;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i){__asm{mov edi, dword ptr[pb];mov ecx, size data400 / 4;mov esi, dest;rep movs dword ptr es : [edi], dword ptr[esi];}}t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data401;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb, dest, sizeof(dest));t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data401;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i){__asm{mov edi, dword ptr[pb];mov ecx, size data401 / 4;mov esi, dest;rep movs dword ptr es : [edi], dword ptr[esi];movs byte ptr es : [edi], byte ptr[esi]}}t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data80;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i){__asm{mov edi, dword ptr[pb];mov ecx, size data80 / 4;mov esi, dest;rep movs dword ptr es : [edi], dword ptr[esi];}}t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz 
(rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data4000;__int64 t = __rdtsc();for (int i = 0; i < Count; ++i){__asm{mov edi, dword ptr[pb];mov ecx, size data4000 / 4;mov esi, dest;rep movs dword ptr es : [edi], dword ptr[esi];}}t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (rep movs handwrite asm)\n", sizeof(dest), Count / 1000000, t / 3000000000.0);}{auto& dest = data80;size_t volatile size = sizeof(dest);__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb, dest, size);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);}{auto& dest = data81;size_t volatile size = sizeof(dest);__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb, dest, size);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);}{auto& dest = data81;size_t volatile size = sizeof(dest);__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb1, dest, size);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy unalign)\n", size, Count / 1000000, t / 3000000000.0);}{auto& dest = data400;size_t volatile size = sizeof(dest);__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb, dest, size);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);}{auto& dest = data401;size_t volatile size = sizeof(dest);__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb, dest, size);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);}{auto& dest = data401;size_t volatile size = sizeof(dest);__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb1, dest, size);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy unalign)\n", 
size, Count / 1000000, t / 3000000000.0);}{auto& dest = data4000;size_t volatile size = sizeof(dest);__int64 t = __rdtsc();for (int i = 0; i < Count; ++i)memcpy(pb, dest, size);t = __rdtsc() - t;printf("All time to memcpy %d * %dM is %0.3fs in 3GHz (call _memcpy)\n", size, Count / 1000000, t / 3000000000.0);}return 0;
}
这篇关于xmemcpy改进版的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!