連載1 高速なメモリーコピー その4
void *p1 = malloc( size + 15 ); void *p2 = ((unsigned long)p + 15) & 0xFFFFFFF0; |
/* Prototype of _aligned_malloc as quoted by the article, followed by an example
 * call that passes 16 as the alignment argument.
 * NOTE(review): presumably the MSVC CRT function, whose result must be released
 * with _aligned_free rather than free — confirm against the CRT documentation. */
void * _aligned_malloc( size_t size, size_t alignment ); ex) void *p3 = _aligned_malloc( size, 16 ); |
/*
 * Copy _size bytes from _src to _dst.
 *
 * The bulk of the data is moved 64 bytes per iteration with aligned SSE2 loads
 * (movdqa) and non-temporal stores (movntdq); the remaining (_size % 64) bytes
 * are copied with memcpy.
 *
 * movdqa and movntdq fault on addresses that are not 16-byte aligned, so the
 * function falls back to a plain memcpy when either pointer is misaligned or
 * when fewer than 64 bytes are requested.
 *
 * The regions must not overlap (same contract as memcpy).
 * Uses MSVC inline assembly with 32-bit registers, so it builds for x86 only.
 */
void ssememcpy1( void *_dst, void *_src, DWORD _size )
{
    DWORD bulk;

    /* Too small for the 64-byte loop, or not safe for aligned SSE access. */
    if (_size < 64 || ((((size_t)_dst) | ((size_t)_src)) & 15) != 0) {
        memcpy( _dst, _src, _size );
        return;
    }

    __asm {
        mov     esi, _src
        mov     eax, _dst
        sub     eax, esi                // eax = dst - src, so [esi+eax] addresses dst
        mov     ecx, _size
        shr     ecx, 6                  // ecx = number of 64-byte chunks (>= 1 here)
    LOOP_MAIN:
        movdqa  xmm0, [esi+ 0]
        movdqa  xmm1, [esi+16]
        movdqa  xmm2, [esi+32]
        movdqa  xmm3, [esi+48]
        movntdq [esi+eax+ 0], xmm0
        movntdq [esi+eax+16], xmm1
        movntdq [esi+eax+32], xmm2
        movntdq [esi+eax+48], xmm3
        add     esi, 64
        dec     ecx                     // dec/jnz instead of the slow LOOP instruction
        jnz     LOOP_MAIN
        sfence                          // order the non-temporal stores before later stores
    }

    /* Copy the tail that did not fill a whole 64-byte chunk. */
    bulk = _size & ~(DWORD)0x3F;
    memcpy( (char *)_dst + bulk, (char *)_src + bulk, _size & 0x0000003F );
}
- 関連記事
trackback
コメントの投稿