<tb@panthema.net>
<http://www.gnu.org/licenses/>
// Plain C/C++ version of the 64-bit pointer-based write scan.
//
// Stores the pattern 0xC0FFEEEEBABE0000 into every 8-byte slot of
// [memarea, memarea + size) and repeats the whole pass `repeats` times.
//
// Both loops test their condition after the body: one store is issued even
// when size is smaller than 8, and repeats == 0 wraps around to a very long
// run instead of returning.
void cScanWrite64PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    const uint64_t pattern = 0xC0FFEEEEBABE0000;
    uint64_t* const first = (uint64_t*)memarea;
    uint64_t* const last = first + size / sizeof(uint64_t);

    for (;;) {
        uint64_t* cursor = first;
        for (;;) {
            *cursor = pattern;
            ++cursor;
            if (!(cursor < last)) break;
        }
        if (--repeats == 0) break;
    }
}
// 64-bit pointer-based write scan, one store per loop iteration.
//
// Stores the pattern 0xC0FFEEEEBABE0000 into every 8-byte slot of
// [memarea, memarea + size) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer to write
// size     buffer length in bytes; should be a non-zero multiple of 8
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one store happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanWrite64PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "mov $0xC0FFEEEEBABE0000, %%rax \n"  // rax = fill pattern
        "1: \n"                              // start of one pass
        "mov %[memarea], %%rcx \n"           // rcx = write cursor
        "2: \n"
        "mov %%rax, (%%rcx) \n"
        "add $8, %%rcx \n"
        "cmp %[end], %%rcx \n"
        "jb 2b \n"                           // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the stores are otherwise invisible to the compiler.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 8, 8 presumably give access width and step in bytes — confirm.
REGISTER(ScanWrite64PtrSimpleLoop, 8, 8);
// 64-bit pointer-based write scan, unrolled to 16 stores per loop iteration.
//
// Stores the pattern 0xC0FFEEEEBABE0000 across [memarea, memarea + size) and
// repeats the whole pass `repeats` times.
//
// memarea  start of the buffer to write
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 stores (128 bytes), so size must be a non-zero
//          multiple of 128 for the writes to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanWrite64PtrUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "mov $0xC0FFEEEEBABE0000, %%rax \n"  // rax = fill pattern
        "1: \n"                              // start of one pass
        "mov %[memarea], %%rcx \n"           // rcx = write cursor
        "2: \n"
        "mov %%rax, 0*8(%%rcx) \n"
        "mov %%rax, 1*8(%%rcx) \n"
        "mov %%rax, 2*8(%%rcx) \n"
        "mov %%rax, 3*8(%%rcx) \n"
        "mov %%rax, 4*8(%%rcx) \n"
        "mov %%rax, 5*8(%%rcx) \n"
        "mov %%rax, 6*8(%%rcx) \n"
        "mov %%rax, 7*8(%%rcx) \n"
        "mov %%rax, 8*8(%%rcx) \n"
        "mov %%rax, 9*8(%%rcx) \n"
        "mov %%rax, 10*8(%%rcx) \n"
        "mov %%rax, 11*8(%%rcx) \n"
        "mov %%rax, 12*8(%%rcx) \n"
        "mov %%rax, 13*8(%%rcx) \n"
        "mov %%rax, 14*8(%%rcx) \n"
        "mov %%rax, 15*8(%%rcx) \n"
        "add $16*8, %%rcx \n"
        "cmp %[end], %%rcx \n"
        "jb 2b \n"                           // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the stores are otherwise invisible to the compiler.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 8, 8 presumably give access width and step in bytes — confirm.
REGISTER(ScanWrite64PtrUnrollLoop, 8, 8);
// 64-bit pointer-based read scan, one load per loop iteration.
//
// Loads every 8-byte slot of [memarea, memarea + size) into rax (the value is
// discarded) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer to read
// size     buffer length in bytes; should be a non-zero multiple of 8
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one load happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanRead64PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rcx \n"  // rcx = read cursor
        "2: \n"
        "mov (%%rcx), %%rax \n"
        "add $8, %%rcx \n"
        "cmp %[end], %%rcx \n"
        "jb 2b \n"                  // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the buffer is read behind the compiler's back.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 8, 8 presumably give access width and step in bytes — confirm.
REGISTER(ScanRead64PtrSimpleLoop, 8, 8);
// 64-bit pointer-based read scan, unrolled to 16 loads per loop iteration.
//
// Loads [memarea, memarea + size) into rax 8 bytes at a time (values are
// discarded) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer to read
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 loads (128 bytes), so size must be a non-zero
//          multiple of 128 for the reads to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanRead64PtrUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rcx \n"  // rcx = read cursor
        "2: \n"
        "mov 0*8(%%rcx), %%rax \n"
        "mov 1*8(%%rcx), %%rax \n"
        "mov 2*8(%%rcx), %%rax \n"
        "mov 3*8(%%rcx), %%rax \n"
        "mov 4*8(%%rcx), %%rax \n"
        "mov 5*8(%%rcx), %%rax \n"
        "mov 6*8(%%rcx), %%rax \n"
        "mov 7*8(%%rcx), %%rax \n"
        "mov 8*8(%%rcx), %%rax \n"
        "mov 9*8(%%rcx), %%rax \n"
        "mov 10*8(%%rcx), %%rax \n"
        "mov 11*8(%%rcx), %%rax \n"
        "mov 12*8(%%rcx), %%rax \n"
        "mov 13*8(%%rcx), %%rax \n"
        "mov 14*8(%%rcx), %%rax \n"
        "mov 15*8(%%rcx), %%rax \n"
        "add $16*8, %%rcx \n"
        "cmp %[end], %%rcx \n"
        "jb 2b \n"                  // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the buffer is read behind the compiler's back.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 8, 8 presumably give access width and step in bytes — confirm.
REGISTER(ScanRead64PtrUnrollLoop, 8, 8);
// Plain C/C++ version of the 64-bit index-based write scan.
//
// Treats _memarea as an array of _size / 8 64-bit words, stores the pattern
// 0xC0FFEEEEBABE0000 into each of them by index, and repeats the whole pass
// `repeats` times.
//
// The outer loop tests after the body, so repeats == 0 wraps around to a very
// long run instead of returning. The inner loop tests before the body, so a
// _size below 8 writes nothing.
void cScanWrite64IndexSimpleLoop(char* _memarea, size_t _size, size_t repeats)
{
    uint64_t* const words = (uint64_t*)_memarea;
    const uint64_t count = _size / sizeof(uint64_t);
    const uint64_t pattern = 0xC0FFEEEEBABE0000;

    for (;;) {
        size_t idx = 0;
        while (idx < count) {
            words[idx] = pattern;
            ++idx;
        }
        if (--repeats == 0) break;
    }
}
// 64-bit index-based write scan, one store per loop iteration.
//
// Stores the pattern 0xC0FFEEEEBABE0000 at memarea + offset for every byte
// offset 0, 8, 16, ... below size, and repeats the whole pass `repeats`
// times.
//
// memarea  start of the buffer to write
// size     buffer length in bytes; should be a non-zero multiple of 8
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one store happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanWrite64IndexSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "mov $0xC0FFEEEEBABE0000, %%rax \n"  // rax = fill pattern
        "1: \n"                              // start of one pass
        "xor %%rcx, %%rcx \n"                // rcx = byte offset, reset to 0
        "2: \n"
        "mov %%rax, (%[memarea],%%rcx) \n"
        "add $8, %%rcx \n"
        "cmp %[size], %%rcx \n"
        "jb 2b \n"                           // loop while offset < size
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [size] "r" (size)
        // "memory": the stores are otherwise invisible to the compiler.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 8, 8 presumably give access width and step in bytes — confirm.
REGISTER(ScanWrite64IndexSimpleLoop, 8, 8);
// 64-bit index-based write scan, unrolled to 16 stores per loop iteration.
//
// Stores the pattern 0xC0FFEEEEBABE0000 at memarea + offset for increasing
// byte offsets below size, and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer to write
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 stores (128 bytes), so size must be a non-zero
//          multiple of 128 for the writes to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanWrite64IndexUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "mov $0xC0FFEEEEBABE0000, %%rax \n"  // rax = fill pattern
        "1: \n"                              // start of one pass
        "xor %%rcx, %%rcx \n"                // rcx = byte offset, reset to 0
        "2: \n"
        "mov %%rax, 0*8(%[memarea],%%rcx) \n"
        "mov %%rax, 1*8(%[memarea],%%rcx) \n"
        "mov %%rax, 2*8(%[memarea],%%rcx) \n"
        "mov %%rax, 3*8(%[memarea],%%rcx) \n"
        "mov %%rax, 4*8(%[memarea],%%rcx) \n"
        "mov %%rax, 5*8(%[memarea],%%rcx) \n"
        "mov %%rax, 6*8(%[memarea],%%rcx) \n"
        "mov %%rax, 7*8(%[memarea],%%rcx) \n"
        "mov %%rax, 8*8(%[memarea],%%rcx) \n"
        "mov %%rax, 9*8(%[memarea],%%rcx) \n"
        "mov %%rax, 10*8(%[memarea],%%rcx) \n"
        "mov %%rax, 11*8(%[memarea],%%rcx) \n"
        "mov %%rax, 12*8(%[memarea],%%rcx) \n"
        "mov %%rax, 13*8(%[memarea],%%rcx) \n"
        "mov %%rax, 14*8(%[memarea],%%rcx) \n"
        "mov %%rax, 15*8(%[memarea],%%rcx) \n"
        "add $16*8, %%rcx \n"
        "cmp %[size], %%rcx \n"
        "jb 2b \n"                           // loop while offset < size
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [size] "r" (size)
        // "memory": the stores are otherwise invisible to the compiler.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 8, 8 presumably give access width and step in bytes — confirm.
REGISTER(ScanWrite64IndexUnrollLoop, 8, 8);
// 64-bit index-based read scan, one load per loop iteration.
//
// Loads the 8 bytes at memarea + offset into rax (the value is discarded) for
// every byte offset 0, 8, 16, ... below size, and repeats the whole pass
// `repeats` times.
//
// memarea  start of the buffer to read
// size     buffer length in bytes; should be a non-zero multiple of 8
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one load happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanRead64IndexSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                 // start of one pass
        "xor %%rcx, %%rcx \n"   // rcx = byte offset, reset to 0
        "2: \n"
        "mov (%[memarea],%%rcx), %%rax \n"
        "add $8, %%rcx \n"
        "cmp %[size], %%rcx \n"
        "jb 2b \n"              // loop while offset < size
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [size] "r" (size)
        // "memory": the buffer is read behind the compiler's back.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 8, 8 presumably give access width and step in bytes — confirm.
REGISTER(ScanRead64IndexSimpleLoop, 8, 8);
// 64-bit index-based read scan, unrolled to 16 loads per loop iteration.
//
// Loads memarea + offset into rax 8 bytes at a time (values are discarded)
// for increasing byte offsets below size, and repeats the whole pass
// `repeats` times.
//
// memarea  start of the buffer to read
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 loads (128 bytes), so size must be a non-zero
//          multiple of 128 for the reads to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanRead64IndexUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                 // start of one pass
        "xor %%rcx, %%rcx \n"   // rcx = byte offset, reset to 0
        "2: \n"
        "mov 0*8(%[memarea],%%rcx), %%rax \n"
        "mov 1*8(%[memarea],%%rcx), %%rax \n"
        "mov 2*8(%[memarea],%%rcx), %%rax \n"
        "mov 3*8(%[memarea],%%rcx), %%rax \n"
        "mov 4*8(%[memarea],%%rcx), %%rax \n"
        "mov 5*8(%[memarea],%%rcx), %%rax \n"
        "mov 6*8(%[memarea],%%rcx), %%rax \n"
        "mov 7*8(%[memarea],%%rcx), %%rax \n"
        "mov 8*8(%[memarea],%%rcx), %%rax \n"
        "mov 9*8(%[memarea],%%rcx), %%rax \n"
        "mov 10*8(%[memarea],%%rcx), %%rax \n"
        "mov 11*8(%[memarea],%%rcx), %%rax \n"
        "mov 12*8(%[memarea],%%rcx), %%rax \n"
        "mov 13*8(%[memarea],%%rcx), %%rax \n"
        "mov 14*8(%[memarea],%%rcx), %%rax \n"
        "mov 15*8(%[memarea],%%rcx), %%rax \n"
        "add $16*8, %%rcx \n"
        "cmp %[size], %%rcx \n"
        "jb 2b \n"              // loop while offset < size
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [size] "r" (size)
        // "memory": the buffer is read behind the compiler's back.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 8, 8 presumably give access width and step in bytes — confirm.
REGISTER(ScanRead64IndexUnrollLoop, 8, 8);
// Everything up to the matching #endif is excluded from compilation.
#if 0
// Number of bytes skipped after each store in the C variant below.
static const int skiplen64 = 64;
// C variant of a "skip" write: stores one 64-bit pattern word, then advances
// past skiplen64 further bytes, so consecutive stores are 8 + 64 = 72 bytes
// apart. The whole pass over [memarea, memarea + size) is repeated `repeats`
// times; both loops test after the body.
void cSkipWrite64PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
uint64_t* begin = (uint64_t*)memarea;
uint64_t* end = begin + size / sizeof(uint64_t);
uint64_t value = 0xC0FFEEEEBABE0000;
do {
uint64_t* p = begin;
do {
*p++ = value;
// skiplen64 is in bytes; dividing by the word size gives 8 words to skip
p += skiplen64 / sizeof(uint64_t);
}
while (p < end);
}
while (--repeats != 0);
}
REGISTER(cSkipWrite64PtrSimpleLoop, 8, 8);
// Assembly variant. NOTE(review): despite the name, the cursor advances by
// only 8 bytes per store, so this touches every word and does not match the
// 72-byte stride of the C variant above. It also decrements the input-only
// operand `repeats` and has no "memory" clobber; both would need fixing
// before this code is re-enabled.
void SkipWrite64PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
asm("mov $0xC0FFEEEEBABE0000, %%rax \n"
"mov %[memarea], %%rsi \n"
"mov %[size], %%rdi \n"
// rdi = memarea + size, the end address
"add %%rsi, %%rdi \n"
"1: \n"
"mov %%rsi, %%rcx \n"
"2: \n"
"mov %%rax, (%%rcx) \n"
"add $8, %%rcx \n"
"cmp %%rdi, %%rcx \n"
"jb 2b \n"
"dec %[repeats] \n"
"jnz 1b \n"
:
: [memarea] "g" (memarea), [size] "g" (size), [repeats] "r" (repeats)
: "rax", "rcx", "rsi", "rdi");
}
REGISTER(SkipWrite64PtrSimpleLoop, 8, 8);
#endif
// Plain C++ version of the 128-bit pointer-based write scan.
//
// Views memarea as an array of 16-byte elements (a pair of two 64-bit words),
// stores the pattern 0xC0FFEEEEBABE0000 into both halves of each element in
// [memarea, memarea + size), and repeats the whole pass `repeats` times.
//
// Both loops test their condition after the body: one element is written
// even when size is smaller than 16, and repeats == 0 wraps around to a very
// long run instead of returning.
void cScanWrite128PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    typedef std::pair<uint64_t,uint64_t> uint128;

    const uint64_t half = 0xC0FFEEEEBABE0000;
    const uint128 pattern(half, half);
    uint128* const first = (uint128*)memarea;
    uint128* const last = first + size / sizeof(uint128);

    for (;;) {
        uint128* cursor = first;
        for (;;) {
            *cursor = pattern;
            ++cursor;
            if (!(cursor < last)) break;
        }
        if (--repeats == 0) break;
    }
}
// 128-bit pointer-based write scan, one aligned 16-byte store per iteration.
//
// Stores the pattern 0xC0FFEEEEBABE0000 into both 64-bit halves of every
// 16-byte slot of [memarea, memarea + size) and repeats the whole pass
// `repeats` times.
//
// memarea  start of the buffer; movdqa requires 16-byte alignment
// size     buffer length in bytes; should be a non-zero multiple of 16
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one store happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanWrite128PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "mov $0xC0FFEEEEBABE0000, %%rax \n"
        "movq %%rax, %%xmm0 \n"        // low half = pattern, high half = 0
        // Copy the low half into the high half of the SAME register that is
        // stored below, so that both 64-bit halves carry the pattern.
        "movlhps %%xmm0, %%xmm0 \n"
        "1: \n"                        // start of one pass
        "mov %[memarea], %%rax \n"     // rax = write cursor
        "2: \n"
        "movdqa %%xmm0, (%%rax) \n"
        "add $16, %%rax \n"
        "cmp %[end], %%rax \n"
        "jb 2b \n"                     // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the stores are otherwise invisible to the compiler.
        : "rax", "xmm0", "cc", "memory");
}
// NOTE(review): REGISTER_CPUFEAT is defined outside this chunk; 16, 16 presumably give access width and step in bytes. The routine uses movq/movdqa on xmm registers (SSE2) while the feature string is "sse" — confirm.
REGISTER_CPUFEAT(ScanWrite128PtrSimpleLoop, "sse", 16, 16);
// 128-bit pointer-based write scan, unrolled to 16 aligned stores per
// loop iteration.
//
// Stores the pattern 0xC0FFEEEEBABE0000 into both 64-bit halves of every
// 16-byte slot of [memarea, memarea + size) and repeats the whole pass
// `repeats` times.
//
// memarea  start of the buffer; movdqa requires 16-byte alignment
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 stores (256 bytes), so size must be a non-zero
//          multiple of 256 for the writes to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanWrite128PtrUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "mov $0xC0FFEEEEBABE0000, %%rax \n"
        "movq %%rax, %%xmm0 \n"        // low half = pattern, high half = 0
        // Copy the low half into the high half of the SAME register that is
        // stored below, so that both 64-bit halves carry the pattern.
        "movlhps %%xmm0, %%xmm0 \n"
        "1: \n"                        // start of one pass
        "mov %[memarea], %%rax \n"     // rax = write cursor
        "2: \n"
        "movdqa %%xmm0, 0*16(%%rax) \n"
        "movdqa %%xmm0, 1*16(%%rax) \n"
        "movdqa %%xmm0, 2*16(%%rax) \n"
        "movdqa %%xmm0, 3*16(%%rax) \n"
        "movdqa %%xmm0, 4*16(%%rax) \n"
        "movdqa %%xmm0, 5*16(%%rax) \n"
        "movdqa %%xmm0, 6*16(%%rax) \n"
        "movdqa %%xmm0, 7*16(%%rax) \n"
        "movdqa %%xmm0, 8*16(%%rax) \n"
        "movdqa %%xmm0, 9*16(%%rax) \n"
        "movdqa %%xmm0, 10*16(%%rax) \n"
        "movdqa %%xmm0, 11*16(%%rax) \n"
        "movdqa %%xmm0, 12*16(%%rax) \n"
        "movdqa %%xmm0, 13*16(%%rax) \n"
        "movdqa %%xmm0, 14*16(%%rax) \n"
        "movdqa %%xmm0, 15*16(%%rax) \n"
        "add $16*16, %%rax \n"
        "cmp %[end], %%rax \n"
        "jb 2b \n"                     // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the stores are otherwise invisible to the compiler.
        : "rax", "xmm0", "cc", "memory");
}
// NOTE(review): REGISTER_CPUFEAT is defined outside this chunk; 16, 16 presumably give access width and step in bytes. The routine uses movq/movdqa on xmm registers (SSE2) while the feature string is "sse" — confirm.
REGISTER_CPUFEAT(ScanWrite128PtrUnrollLoop, "sse", 16, 16);
// 128-bit pointer-based read scan, one aligned 16-byte load per iteration.
//
// Loads every 16-byte slot of [memarea, memarea + size) into xmm0 (the value
// is discarded) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer; movdqa requires 16-byte alignment
// size     buffer length in bytes; should be a non-zero multiple of 16
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one load happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanRead128PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rax \n"  // rax = read cursor
        "2: \n"
        "movdqa (%%rax), %%xmm0 \n"
        "add $16, %%rax \n"
        "cmp %[end], %%rax \n"
        "jb 2b \n"                  // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the buffer is read behind the compiler's back.
        : "rax", "xmm0", "cc", "memory");
}
// NOTE(review): REGISTER_CPUFEAT is defined outside this chunk; 16, 16 presumably give access width and step in bytes. The routine uses movdqa (SSE2) while the feature string is "sse" — confirm.
REGISTER_CPUFEAT(ScanRead128PtrSimpleLoop, "sse", 16, 16);
// 128-bit pointer-based read scan, unrolled to 16 aligned loads per
// loop iteration.
//
// Loads [memarea, memarea + size) into xmm0 16 bytes at a time (values are
// discarded) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer; movdqa requires 16-byte alignment
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 loads (256 bytes), so size must be a non-zero
//          multiple of 256 for the reads to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanRead128PtrUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rax \n"  // rax = read cursor
        "2: \n"
        "movdqa 0*16(%%rax), %%xmm0 \n"
        "movdqa 1*16(%%rax), %%xmm0 \n"
        "movdqa 2*16(%%rax), %%xmm0 \n"
        "movdqa 3*16(%%rax), %%xmm0 \n"
        "movdqa 4*16(%%rax), %%xmm0 \n"
        "movdqa 5*16(%%rax), %%xmm0 \n"
        "movdqa 6*16(%%rax), %%xmm0 \n"
        "movdqa 7*16(%%rax), %%xmm0 \n"
        "movdqa 8*16(%%rax), %%xmm0 \n"
        "movdqa 9*16(%%rax), %%xmm0 \n"
        "movdqa 10*16(%%rax), %%xmm0 \n"
        "movdqa 11*16(%%rax), %%xmm0 \n"
        "movdqa 12*16(%%rax), %%xmm0 \n"
        "movdqa 13*16(%%rax), %%xmm0 \n"
        "movdqa 14*16(%%rax), %%xmm0 \n"
        "movdqa 15*16(%%rax), %%xmm0 \n"
        "add $16*16, %%rax \n"
        "cmp %[end], %%rax \n"
        "jb 2b \n"                  // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the buffer is read behind the compiler's back.
        : "rax", "xmm0", "cc", "memory");
}
// NOTE(review): REGISTER_CPUFEAT is defined outside this chunk; 16, 16 presumably give access width and step in bytes. The routine uses movdqa (SSE2) while the feature string is "sse" — confirm.
REGISTER_CPUFEAT(ScanRead128PtrUnrollLoop, "sse", 16, 16);
// 256-bit pointer-based write scan, one aligned 32-byte AVX store per
// iteration.
//
// Broadcasts the pattern 0xC0FFEEEEBABE0000 to all four 64-bit lanes of ymm0,
// stores it to every 32-byte slot of [memarea, memarea + size) and repeats
// the whole pass `repeats` times.
//
// memarea  start of the buffer; vmovdqa requires 32-byte alignment
// size     buffer length in bytes; should be a non-zero multiple of 32
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one store happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanWrite256PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    uint64_t value = 0xC0FFEEEEBABE0000;
    __asm__ __volatile__(
        "vbroadcastsd %[value], %%ymm0 \n"  // ymm0 = pattern in every lane
        "1: \n"                             // start of one pass
        "mov %[memarea], %%rax \n"          // rax = write cursor
        "2: \n"
        "vmovdqa %%ymm0, (%%rax) \n"
        "add $32, %%rax \n"
        "cmp %[end], %%rax \n"
        "jb 2b \n"                          // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size),
          [value] "m" (value)
        // "memory": the stores are otherwise invisible to the compiler.
        // NOTE(review): ymm0 is listed under its xmm0 name, as in the
        // original; presumably the compiler treats both as one register.
        : "rax", "xmm0", "cc", "memory");
}
// NOTE(review): REGISTER_CPUFEAT is defined outside this chunk; "avx" is presumably a required CPU feature and 32, 32 the access width and step in bytes — confirm.
REGISTER_CPUFEAT(ScanWrite256PtrSimpleLoop, "avx", 32, 32);
// 256-bit pointer-based write scan, unrolled to 16 aligned AVX stores per
// loop iteration.
//
// Broadcasts the pattern 0xC0FFEEEEBABE0000 to all four 64-bit lanes of ymm0,
// stores it across [memarea, memarea + size) and repeats the whole pass
// `repeats` times.
//
// memarea  start of the buffer; vmovdqa requires 32-byte alignment
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 stores (512 bytes), so size must be a non-zero
//          multiple of 512 for the writes to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanWrite256PtrUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    uint64_t value = 0xC0FFEEEEBABE0000;
    __asm__ __volatile__(
        "vbroadcastsd %[value], %%ymm0 \n"  // ymm0 = pattern in every lane
        "1: \n"                             // start of one pass
        "mov %[memarea], %%rax \n"          // rax = write cursor
        "2: \n"
        "vmovdqa %%ymm0, 0*32(%%rax) \n"
        "vmovdqa %%ymm0, 1*32(%%rax) \n"
        "vmovdqa %%ymm0, 2*32(%%rax) \n"
        "vmovdqa %%ymm0, 3*32(%%rax) \n"
        "vmovdqa %%ymm0, 4*32(%%rax) \n"
        "vmovdqa %%ymm0, 5*32(%%rax) \n"
        "vmovdqa %%ymm0, 6*32(%%rax) \n"
        "vmovdqa %%ymm0, 7*32(%%rax) \n"
        "vmovdqa %%ymm0, 8*32(%%rax) \n"
        "vmovdqa %%ymm0, 9*32(%%rax) \n"
        "vmovdqa %%ymm0, 10*32(%%rax) \n"
        "vmovdqa %%ymm0, 11*32(%%rax) \n"
        "vmovdqa %%ymm0, 12*32(%%rax) \n"
        "vmovdqa %%ymm0, 13*32(%%rax) \n"
        "vmovdqa %%ymm0, 14*32(%%rax) \n"
        "vmovdqa %%ymm0, 15*32(%%rax) \n"
        "add $16*32, %%rax \n"
        "cmp %[end], %%rax \n"
        "jb 2b \n"                          // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size),
          [value] "m" (value)
        // "memory": the stores are otherwise invisible to the compiler.
        // NOTE(review): ymm0 is listed under its xmm0 name, as in the
        // original; presumably the compiler treats both as one register.
        : "rax", "xmm0", "cc", "memory");
}
// NOTE(review): REGISTER_CPUFEAT is defined outside this chunk; "avx" is presumably a required CPU feature and 32, 32 the access width and step in bytes — confirm.
REGISTER_CPUFEAT(ScanWrite256PtrUnrollLoop, "avx", 32, 32);
// 256-bit pointer-based read scan, one aligned 32-byte AVX load per
// iteration.
//
// Loads every 32-byte slot of [memarea, memarea + size) into ymm0 (the value
// is discarded) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer; vmovdqa requires 32-byte alignment
// size     buffer length in bytes; should be a non-zero multiple of 32
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one load happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanRead256PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rax \n"  // rax = read cursor
        "2: \n"
        "vmovdqa (%%rax), %%ymm0 \n"
        "add $32, %%rax \n"
        "cmp %[end], %%rax \n"
        "jb 2b \n"                  // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the buffer is read behind the compiler's back.
        // NOTE(review): ymm0 is listed under its xmm0 name, as in the
        // original; presumably the compiler treats both as one register.
        : "rax", "xmm0", "cc", "memory");
}
// NOTE(review): REGISTER_CPUFEAT is defined outside this chunk; "avx" is presumably a required CPU feature and 32, 32 the access width and step in bytes — confirm.
REGISTER_CPUFEAT(ScanRead256PtrSimpleLoop, "avx", 32, 32);
// 256-bit pointer-based read scan, unrolled to 16 aligned AVX loads per
// loop iteration.
//
// Loads [memarea, memarea + size) into ymm0 32 bytes at a time (values are
// discarded) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer; vmovdqa requires 32-byte alignment
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 loads (512 bytes), so size must be a non-zero
//          multiple of 512 for the reads to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanRead256PtrUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rax \n"  // rax = read cursor
        "2: \n"
        "vmovdqa 0*32(%%rax), %%ymm0 \n"
        "vmovdqa 1*32(%%rax), %%ymm0 \n"
        "vmovdqa 2*32(%%rax), %%ymm0 \n"
        "vmovdqa 3*32(%%rax), %%ymm0 \n"
        "vmovdqa 4*32(%%rax), %%ymm0 \n"
        "vmovdqa 5*32(%%rax), %%ymm0 \n"
        "vmovdqa 6*32(%%rax), %%ymm0 \n"
        "vmovdqa 7*32(%%rax), %%ymm0 \n"
        "vmovdqa 8*32(%%rax), %%ymm0 \n"
        "vmovdqa 9*32(%%rax), %%ymm0 \n"
        "vmovdqa 10*32(%%rax), %%ymm0 \n"
        "vmovdqa 11*32(%%rax), %%ymm0 \n"
        "vmovdqa 12*32(%%rax), %%ymm0 \n"
        "vmovdqa 13*32(%%rax), %%ymm0 \n"
        "vmovdqa 14*32(%%rax), %%ymm0 \n"
        "vmovdqa 15*32(%%rax), %%ymm0 \n"
        "add $16*32, %%rax \n"
        "cmp %[end], %%rax \n"
        "jb 2b \n"                  // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the buffer is read behind the compiler's back.
        // NOTE(review): ymm0 is listed under its xmm0 name, as in the
        // original; presumably the compiler treats both as one register.
        : "rax", "xmm0", "cc", "memory");
}
// NOTE(review): REGISTER_CPUFEAT is defined outside this chunk; "avx" is presumably a required CPU feature and 32, 32 the access width and step in bytes — confirm.
REGISTER_CPUFEAT(ScanRead256PtrUnrollLoop, "avx", 32, 32);
// 32-bit pointer-based write scan, one store per loop iteration.
//
// Stores the pattern 0xC0FFEEEE into every 4-byte slot of
// [memarea, memarea + size) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer to write
// size     buffer length in bytes; should be a non-zero multiple of 4
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one store happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanWrite32PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "mov $0xC0FFEEEE, %%eax \n"   // eax = fill pattern
        "1: \n"                       // start of one pass
        "mov %[memarea], %%rcx \n"    // rcx = write cursor
        "2: \n"
        "movl %%eax, (%%rcx) \n"
        "add $4, %%rcx \n"
        "cmp %[end], %%rcx \n"
        "jb 2b \n"                    // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the stores are otherwise invisible to the compiler.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 4, 4 presumably give access width and step in bytes — confirm.
REGISTER(ScanWrite32PtrSimpleLoop, 4, 4);
// 32-bit pointer-based write scan, unrolled to 16 stores per loop iteration.
//
// Stores the pattern 0xC0FFEEEE across [memarea, memarea + size) and repeats
// the whole pass `repeats` times.
//
// memarea  start of the buffer to write
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 stores (64 bytes), so size must be a non-zero
//          multiple of 64 for the writes to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanWrite32PtrUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "mov $0xC0FFEEEE, %%eax \n"   // eax = fill pattern
        "1: \n"                       // start of one pass
        "mov %[memarea], %%rcx \n"    // rcx = write cursor
        "2: \n"
        "movl %%eax, 0*4(%%rcx) \n"
        "movl %%eax, 1*4(%%rcx) \n"
        "movl %%eax, 2*4(%%rcx) \n"
        "movl %%eax, 3*4(%%rcx) \n"
        "movl %%eax, 4*4(%%rcx) \n"
        "movl %%eax, 5*4(%%rcx) \n"
        "movl %%eax, 6*4(%%rcx) \n"
        "movl %%eax, 7*4(%%rcx) \n"
        "movl %%eax, 8*4(%%rcx) \n"
        "movl %%eax, 9*4(%%rcx) \n"
        "movl %%eax, 10*4(%%rcx) \n"
        "movl %%eax, 11*4(%%rcx) \n"
        "movl %%eax, 12*4(%%rcx) \n"
        "movl %%eax, 13*4(%%rcx) \n"
        "movl %%eax, 14*4(%%rcx) \n"
        "movl %%eax, 15*4(%%rcx) \n"
        "add $16*4, %%rcx \n"
        "cmp %[end], %%rcx \n"
        "jb 2b \n"                    // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the stores are otherwise invisible to the compiler.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 4, 4 presumably give access width and step in bytes — confirm.
REGISTER(ScanWrite32PtrUnrollLoop, 4, 4);
// 32-bit pointer-based read scan, one load per loop iteration.
//
// Loads every 4-byte slot of [memarea, memarea + size) into eax (the value is
// discarded) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer to read
// size     buffer length in bytes; should be a non-zero multiple of 4
// repeats  number of passes; must be non-zero
//
// Both loops test after the body, so one load happens even for size == 0
// and repeats == 0 wraps around instead of returning.
void ScanRead32PtrSimpleLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rcx \n"  // rcx = read cursor
        "2: \n"
        "movl (%%rcx), %%eax \n"
        "add $4, %%rcx \n"
        "cmp %[end], %%rcx \n"
        "jb 2b \n"                  // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the buffer is read behind the compiler's back.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 4, 4 presumably give access width and step in bytes — confirm.
REGISTER(ScanRead32PtrSimpleLoop, 4, 4);
// 32-bit pointer-based read scan, unrolled to 16 loads per loop iteration.
//
// Loads [memarea, memarea + size) into eax 4 bytes at a time (values are
// discarded) and repeats the whole pass `repeats` times.
//
// memarea  start of the buffer to read
// size     buffer length in bytes; the bounds check only runs after each
//          group of 16 loads (64 bytes), so size must be a non-zero
//          multiple of 64 for the reads to stay inside the buffer
// repeats  number of passes; must be non-zero (0 wraps around)
void ScanRead32PtrUnrollLoop(char* memarea, size_t size, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rcx \n"  // rcx = read cursor
        "2: \n"
        "movl 0*4(%%rcx), %%eax \n"
        "movl 1*4(%%rcx), %%eax \n"
        "movl 2*4(%%rcx), %%eax \n"
        "movl 3*4(%%rcx), %%eax \n"
        "movl 4*4(%%rcx), %%eax \n"
        "movl 5*4(%%rcx), %%eax \n"
        "movl 6*4(%%rcx), %%eax \n"
        "movl 7*4(%%rcx), %%eax \n"
        "movl 8*4(%%rcx), %%eax \n"
        "movl 9*4(%%rcx), %%eax \n"
        "movl 10*4(%%rcx), %%eax \n"
        "movl 11*4(%%rcx), %%eax \n"
        "movl 12*4(%%rcx), %%eax \n"
        "movl 13*4(%%rcx), %%eax \n"
        "movl 14*4(%%rcx), %%eax \n"
        "movl 15*4(%%rcx), %%eax \n"
        "add $16*4, %%rcx \n"
        "cmp %[end], %%rcx \n"
        "jb 2b \n"                  // loop while cursor < end
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the registers used by the inputs.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea), [end] "r" (memarea + size)
        // "memory": the buffer is read behind the compiler's back.
        : "rax", "rcx", "cc", "memory");
}
// NOTE(review): REGISTER is defined outside this chunk; the arguments 4, 4 presumably give access width and step in bytes — confirm.
REGISTER(ScanRead32PtrUnrollLoop, 4, 4);
// Plain C/C++ version of the 64-bit pointer-chasing read.
//
// Each 64-bit word reached from memarea is interpreted as the address of the
// next word to read. One pass follows that chain until it arrives back at
// memarea; the pass is repeated `repeats` times. The second parameter is
// unused.
//
// The chain must lead back to memarea, otherwise the inner loop never ends.
// The outer loop tests after the body, so repeats == 0 wraps around to a very
// long run instead of returning.
void cPermRead64SimpleLoop(char* memarea, size_t, size_t repeats)
{
    uint64_t* const head = (uint64_t*)memarea;

    for (;;) {
        uint64_t* node = head;
        for (;;) {
            node = (uint64_t*)*node;
            if (node == head) break;
        }
        if (--repeats == 0) break;
    }
}
// 64-bit pointer-chasing read, one dependent load per loop iteration.
//
// Each 64-bit word reached from memarea is interpreted as the address of the
// next word to load. One pass follows that chain until the loaded address
// equals memarea again; the pass is repeated `repeats` times. The second
// parameter is unused.
//
// memarea  first word of the chain; the chain must lead back to it,
//          otherwise the inner loop never ends
// repeats  number of passes; must be non-zero (0 wraps around)
void PermRead64SimpleLoop(char* memarea, size_t, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rax \n"  // rax = current position in the chain
        "2: \n"
        "mov (%%rax), %%rax \n"     // follow the stored address
        "cmp %%rax, %[memarea] \n"
        "jne 2b \n"                 // loop until back at the start
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the register used by the input.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea)
        // "memory": the chain is read behind the compiler's back.
        : "rax", "cc", "memory");
}
// NOTE(review): REGISTER_PERM is defined outside this chunk; the argument 8 is presumably the access width in bytes — confirm.
REGISTER_PERM(PermRead64SimpleLoop, 8);
// 64-bit pointer-chasing read, unrolled to 16 dependent loads per loop
// iteration.
//
// Each 64-bit word reached from memarea is interpreted as the address of the
// next word to load. The chain is followed 16 steps at a time, and one pass
// ends when the address after such a group equals memarea; the pass is
// repeated `repeats` times. The second parameter is unused.
//
// memarea  first word of the chain; because the end test only runs after
//          every 16th load, the chain must be back at memarea after a
//          multiple of 16 steps, otherwise the inner loop keeps going
// repeats  number of passes; must be non-zero (0 wraps around)
void PermRead64UnrollLoop(char* memarea, size_t, size_t repeats)
{
    __asm__ __volatile__(
        "1: \n"                     // start of one pass
        "mov %[memarea], %%rax \n"  // rax = current position in the chain
        "2: \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "mov (%%rax), %%rax \n"
        "cmp %%rax, %[memarea] \n"
        "jne 2b \n"                 // loop until back at the start
        "dec %[repeats] \n"
        "jnz 1b \n"
        // repeats is decremented in place, so it has to be a read-write
        // operand; '&' keeps it out of the register used by the input.
        : [repeats] "+&r" (repeats)
        : [memarea] "r" (memarea)
        // "memory": the chain is read behind the compiler's back.
        : "rax", "cc", "memory");
}
// NOTE(review): REGISTER_PERM is defined outside this chunk; the argument 8 is presumably the access width in bytes — confirm.
REGISTER_PERM(PermRead64UnrollLoop, 8);