<tt.rantala@gmail.com>
TODO:
#include <xmmintrin.h>
#include <emmintrin.h>
namespace rantala {
#define left_bucket(pivot) (2*(pivot))
#define middle_bucket(pivot) (2*(pivot)+1)
#define right_bucket(pivot) (2*(pivot)+2)
#define total_buckets(pivots) (2*(pivots)+1)
static inline unsigned lcp(unsigned char a, unsigned char b)
{
assert(a != b);
(void)a;
(void)b;
return 0;
}
static inline unsigned lcp(uint16_t a, uint16_t b)
{
assert(a != b);
unsigned A, B;
A = 0xFF00 & a;
B = 0xFF00 & b;
if (A==0 or A!=B) return 0;
return 1;
}
static inline unsigned lcp(uint32_t a, uint32_t b)
{
assert(a != b);
unsigned A, B;
A = 0xFF000000 & a;
B = 0xFF000000 & b;
if (A==0 or A!=B) return 0;
A = 0x00FF0000 & a;
B = 0x00FF0000 & b;
if (A==0 or A!=B) return 1;
A = 0x0000FF00 & a;
B = 0x0000FF00 & b;
if (A==0 or A!=B) return 2;
return 3;
}
template <unsigned Pivots>
static void
fill_oracle_brute_force_sse(
unsigned char** strings,
size_t n,
uint8_t* restrict oracle,
const boost::array<unsigned char, Pivots>& pivots,
size_t depth)
{
assert(n % 16 == 0);
BOOST_STATIC_ASSERT(Pivots > 0);
BOOST_STATIC_ASSERT(total_buckets(Pivots) < 0x100);
typedef unsigned char CharT;
static const uint8_t _Constants[] __attribute__((aligned(16))) = {
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
};
const __m128i FlipBit = _mm_load_si128(
reinterpret_cast<const __m128i*>(_Constants+0));
const __m128i Mask1 = _mm_load_si128(
reinterpret_cast<const __m128i*>(_Constants+16));
uint8_t _Pivots[16*Pivots] __attribute__((aligned(16)));
for (unsigned i=0; i < Pivots; ++i) {
const uint8_t p = pivots[i]+0x80;
for (unsigned j=0; j < 16; ++j) {
_Pivots[16*i+j] = p;
}
}
for (size_t i=0; i < n; i += 16) {
CharT cache[16] __attribute__ ((aligned(16)));
for (unsigned j=0; j < 16; ++j) {
cache[j] = get_char<CharT>(strings[i+j], depth);
}
__m128i result = _mm_setzero_si128();
__m128i data = _mm_load_si128(
reinterpret_cast<__m128i*>(cache+0));
data = _mm_add_epi8(data, FlipBit);
__m128i p1;
__m128i p2 = _mm_load_si128(
reinterpret_cast<__m128i*>(_Pivots+0));
__m128i mask = Mask1;
for (unsigned k=0; k < Pivots-1; ++k) {
p1 = p2;
p2 = _mm_load_si128(
reinterpret_cast<__m128i*>(_Pivots+16*(k+1)));
__m128i tmp1 = _mm_cmpeq_epi8(data, p1);
tmp1 = _mm_and_si128(tmp1, mask);
mask = _mm_add_epi8(mask, Mask1);
result = _mm_or_si128(result, tmp1);
__m128i tmp2 = _mm_and_si128(
_mm_cmpgt_epi8(data, p1),
_mm_cmplt_epi8(data, p2));
tmp2 = _mm_and_si128(tmp2, mask);
mask = _mm_add_epi8(mask, Mask1);
result = _mm_or_si128(result, tmp2);
}
__m128i tmp1 = _mm_cmpeq_epi8(data, p2);
tmp1 = _mm_and_si128(tmp1, mask);
mask = _mm_add_epi8(mask, Mask1);
result = _mm_or_si128(result, tmp1);
__m128i tmp2 = _mm_cmpgt_epi8(data, p2);
tmp2 = _mm_and_si128(tmp2, mask);
result = _mm_or_si128(result, tmp2);
_mm_store_si128(reinterpret_cast<__m128i*>(oracle+i), result);
}
}
template <unsigned Pivots>
static void
fill_oracle_brute_force_sse(
unsigned char** strings,
size_t n,
uint8_t* restrict oracle,
const boost::array<uint16_t, Pivots>& pivots,
size_t depth)
{
assert(n % 16 == 0);
BOOST_STATIC_ASSERT(Pivots > 0);
BOOST_STATIC_ASSERT(total_buckets(Pivots) < 0x100);
typedef uint16_t CharT;
static const uint16_t _Constants[] __attribute__((aligned(16))) = {
0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001
};
const __m128i FlipBit = _mm_load_si128(
reinterpret_cast<const __m128i*>(_Constants+0));
const __m128i Mask1 = _mm_load_si128(
reinterpret_cast<const __m128i*>(_Constants+8));
uint16_t _Pivots[8*Pivots] __attribute__((aligned(16)));
for (unsigned i=0; i < Pivots; ++i) {
const uint16_t p = pivots[i]+0x8000;
for (unsigned j=0; j < 8; ++j) {
_Pivots[8*i+j] = p;
}
}
for (size_t i=0; i < n; i += 8) {
uint16_t cache[8] __attribute__((aligned(16)));
for (unsigned j=0; j<8; ++j) {
cache[j] = get_char<CharT>(strings[i+j], depth);
}
__m128i result = _mm_setzero_si128();
__m128i data = _mm_load_si128(
reinterpret_cast<__m128i*>(cache+0));
data = _mm_add_epi16(data, FlipBit);
__m128i p1;
__m128i p2 = _mm_load_si128(
reinterpret_cast<__m128i*>(_Pivots+0));
__m128i mask = Mask1;
for (unsigned k=0; k < Pivots-1; ++k) {
p1 = p2;
p2 = _mm_load_si128(
reinterpret_cast<__m128i*>(_Pivots+8*(k+1)));
__m128i tmp1 = _mm_cmpeq_epi16(data, p1);
tmp1 = _mm_and_si128(tmp1, mask);
mask = _mm_add_epi16(mask, Mask1);
result = _mm_or_si128(result, tmp1);
__m128i tmp2 = _mm_and_si128(
_mm_cmpgt_epi16(data, p1),
_mm_cmplt_epi16(data, p2));
tmp2 = _mm_and_si128(tmp2, mask);
mask = _mm_add_epi16(mask, Mask1);
result = _mm_or_si128(result, tmp2);
}
__m128i tmp1 = _mm_cmpeq_epi16(data, p2);
tmp1 = _mm_and_si128(tmp1, mask);
mask = _mm_add_epi16(mask, Mask1);
result = _mm_or_si128(result, tmp1);
__m128i tmp2 = _mm_cmpgt_epi16(data, p2);
tmp2 = _mm_and_si128(tmp2, mask);
result = _mm_or_si128(result, tmp2);
result = _mm_packus_epi16(result, _mm_setzero_si128());
_mm_storel_epi64(reinterpret_cast<__m128i*>(oracle+i), result);
}
}
template <unsigned pos>
static inline __attribute__((__always_inline__))
uint8_t extract8(__m128i a)
{
BOOST_STATIC_ASSERT(pos%2==0);
return uint8_t(_mm_extract_epi16(a, pos/2));
}
template <unsigned Pivots>
static void
fill_oracle_brute_force_sse(
unsigned char** strings,
size_t n,
uint8_t* restrict oracle,
const boost::array<uint32_t, Pivots>& pivots,
size_t depth)
{
assert(n % 16 == 0);
BOOST_STATIC_ASSERT(Pivots > 0);
BOOST_STATIC_ASSERT(total_buckets(Pivots) < 0x100);
typedef uint32_t CharT;
static const CharT _Constants[] __attribute__((aligned(16))) = {
0x80000000, 0x80000000, 0x80000000, 0x80000000,
0x00000001, 0x00000001, 0x00000001, 0x00000001
};
const __m128i FlipBit = _mm_load_si128(
reinterpret_cast<const __m128i*>(_Constants+0));
const __m128i Mask1 = _mm_load_si128(
reinterpret_cast<const __m128i*>(_Constants+4));
CharT _Pivots[4*Pivots] __attribute__((aligned(16)));
for (unsigned i=0; i < Pivots; ++i) {
const CharT p = pivots[i]+0x80000000;
for (unsigned j=0; j < 4; ++j) {
_Pivots[4*i+j] = p;
}
}
for (size_t i=0; i < n; i += 4) {
CharT cache[4] __attribute__((aligned(16)));
for (unsigned j=0; j<4; ++j) {
cache[j] = get_char<CharT>(strings[i+j], depth);
}
__m128i result = _mm_setzero_si128();
__m128i data = _mm_load_si128(
reinterpret_cast<__m128i*>(cache+0));
data = _mm_add_epi32(data, FlipBit);
__m128i p1;
__m128i p2 = _mm_load_si128(
reinterpret_cast<__m128i*>(_Pivots+0));
__m128i mask = Mask1;
for (unsigned k=0; k < Pivots-1; ++k) {
p1 = p2;
p2 = _mm_load_si128(
reinterpret_cast<__m128i*>(_Pivots+4*(k+1)));
__m128i tmp1 = _mm_cmpeq_epi32(data, p1);
tmp1 = _mm_and_si128(tmp1, mask);
mask = _mm_add_epi32(mask, Mask1);
result = _mm_or_si128(result, tmp1);
__m128i tmp2 = _mm_and_si128(
_mm_cmpgt_epi32(data, p1),
_mm_cmplt_epi32(data, p2));
tmp2 = _mm_and_si128(tmp2, mask);
mask = _mm_add_epi32(mask, Mask1);
result = _mm_or_si128(result, tmp2);
}
__m128i tmp1 = _mm_cmpeq_epi32(data, p2);
tmp1 = _mm_and_si128(tmp1, mask);
mask = _mm_add_epi32(mask, Mask1);
result = _mm_or_si128(result, tmp1);
__m128i tmp2 = _mm_cmpgt_epi32(data, p2);
tmp2 = _mm_and_si128(tmp2, mask);
result = _mm_or_si128(result, tmp2);
oracle[i+0] = extract8<0>(result);
oracle[i+1] = extract8<4>(result);
oracle[i+2] = extract8<8>(result);
oracle[i+3] = extract8<12>(result);
}
}
template <unsigned Pivots, typename CharT>
static void
fill_oracle(unsigned char** strings,
size_t N,
uint8_t* oracle,
const boost::array<CharT, Pivots>& pivots,
size_t depth)
{
BOOST_STATIC_ASSERT(Pivots > 0);
BOOST_STATIC_ASSERT(total_buckets(Pivots) < 0x100);
size_t n = N-N%16;
fill_oracle_brute_force_sse<Pivots>(strings, n, oracle, pivots, depth);
for (size_t i=n; i < N; ++i) {
const CharT c = get_char<CharT>(strings[i], depth);
if (c < pivots[0]) {
oracle[i] = left_bucket(0);
goto pivoting_done;
}
for (unsigned j=0; j < Pivots-1; ++j) {
if (c == pivots[j]) {
oracle[i] = middle_bucket(j);
goto pivoting_done;
} else if (c > pivots[j] and c < pivots[j+1]) {
oracle[i] = right_bucket(j);
goto pivoting_done;
}
}
if (c == pivots[Pivots-1]) {
oracle[i] = middle_bucket(Pivots-1);
} else {
oracle[i] = right_bucket(Pivots-1);
}
pivoting_done:
;
}
}
template <typename CharT, unsigned Pivots>
static void
multikey_multipivot(unsigned char** strings, size_t n, size_t depth)
{
BOOST_STATIC_ASSERT(Pivots > 0);
BOOST_STATIC_ASSERT(total_buckets(Pivots) < 0x100);
if (n < 15000) {
mkqsort(strings, n, depth);
return;
}
std::set<CharT> sample;
for (unsigned i=0; i < Pivots; ++i) {
double r=drand48();
size_t pos = ((size_t)((n-7)*r));
assert(pos+6 < n);
boost::array<CharT, 7> tmp;
for (unsigned j=0;j<tmp.size();++j) {
tmp[j] = get_char<CharT>(strings[pos+j], depth);
}
sample.insert(tmp.begin(), tmp.end());
}
for (CharT i=1; sample.size() < Pivots; ++i) {
if (is_end(i)) ++i;
sample.insert(i);
}
std::vector<CharT> sample_array(sample.begin(), sample.end());
sample.clear();
boost::array<CharT, Pivots> pivots;
unsigned step = sample_array.size() / Pivots;
assert(step > 0);
for (unsigned i=0; i < Pivots; ++i) {
pivots[i] = sample_array[step*i];
}
uint8_t* restrict oracle = static_cast<uint8_t*>(_mm_malloc(n, 16));
fill_oracle<Pivots>(strings, n, oracle, pivots, depth);
boost::array<size_t, total_buckets(Pivots)> bucketsize;
bucketsize.assign(0);
uint8_t prev = oracle[0];
bool sorted = true;
size_t i=1;
++bucketsize[prev];
for (; i < n; ++i) {
uint8_t bucket = oracle[i];
if (prev > bucket) {
sorted = false; break;
}
++bucketsize[bucket];
prev = bucket;
}
for (; i < n; ++i)
++bucketsize[oracle[i]];
if (not sorted) {
unsigned char** sorted = (unsigned char**)
malloc(n*sizeof(unsigned char*));
static boost::array<size_t, total_buckets(Pivots)> bucketindex;
bucketindex[0] = 0;
for (unsigned i=1; i < total_buckets(Pivots); ++i)
bucketindex[i] = bucketindex[i-1] + bucketsize[i-1];
for (size_t i=0; i < n; ++i)
sorted[bucketindex[oracle[i]]++] = strings[i];
memcpy(strings, sorted, n*sizeof(unsigned char*));
free(sorted);
}
_mm_free(oracle);
size_t b=0;
size_t bsum = bucketsize[0];
if (bsum) multikey_multipivot<CharT, Pivots>(strings, bsum, depth);
for (unsigned i=0; i < Pivots-1; ++i) {
b = bucketsize[middle_bucket(i)];
if (b and not is_end(pivots[i])) {
multikey_multipivot<CharT, Pivots>(strings+bsum, b,
depth+sizeof(CharT));
}
bsum += b;
if ((b = bucketsize[right_bucket(i)])) {
multikey_multipivot<CharT, Pivots>(strings+bsum, b,
depth+lcp(pivots[i], pivots[i+1]));
}
bsum += b;
}
if ((b = bucketsize[middle_bucket(Pivots-1)])) {
if (not is_end(pivots[Pivots-1]))
multikey_multipivot<CharT, Pivots>(strings+bsum,
b, depth+sizeof(CharT));
bsum += b;
}
if ((b = bucketsize[right_bucket(Pivots-1)])) {
multikey_multipivot<CharT, Pivots>(strings+bsum, b, depth);
}
}
void multikey_multipivot_brute_simd1(unsigned char** strings, size_t n)
{ multikey_multipivot<unsigned char, 16>(strings, n, 0); }
void multikey_multipivot_brute_simd2(unsigned char** strings, size_t n)
{ multikey_multipivot<uint16_t, 32>(strings, n, 0); }
void multikey_multipivot_brute_simd4(unsigned char** strings, size_t n)
{ multikey_multipivot<uint32_t, 32>(strings, n, 0); }
CONTESTANT_REGISTER(multikey_multipivot_brute_simd1,
"rantala/multikey_multipivot_brute_simd1",
"multikey_multipivot brute_simd with 1byte alphabet")
CONTESTANT_REGISTER(multikey_multipivot_brute_simd2,
"rantala/multikey_multipivot_brute_simd2",
"multikey_multipivot brute_simd with 2byte alphabet")
CONTESTANT_REGISTER(multikey_multipivot_brute_simd4,
"rantala/multikey_multipivot_brute_simd4",
"multikey_multipivot brute_simd with 4byte alphabet")
}