<tt.rantala@gmail.com>
http://doi.acm.org/10.1145/1217856.1217858
namespace rantala_msd_db {
using namespace rantala;
typedef unsigned char** Block;
typedef std::list<Block> FreeBlocks;
typedef std::list<Block> Bucket;
typedef boost::array<Bucket, 256> Buckets;
typedef std::vector<Block*> BackLinks;
static inline Block
take_free_block(FreeBlocks& fb)
{
assert(not fb.empty());
Block b(fb.front());
fb.pop_front();
return b;
}
template <unsigned B>
static void
msd_D(unsigned char** strings, size_t n, size_t depth)
{
if (n < 0x10000) {
msd_CE2_16bit(strings, n, depth);
return;
}
assert(n > B);
static Buckets buckets;
static boost::array<unsigned char*, (256+6)*B> temp_space;
static FreeBlocks freeblocks;
BackLinks backlinks(n/B+1);
boost::array<size_t, 256> bucketsize;
for (unsigned i=0; i < 256; ++i) {
bucketsize[i] = 0;
buckets[i].clear();
}
for (size_t i=0; i < 256+6; ++i)
freeblocks.push_back(&temp_space[i*B]);
for (size_t i=0; i < n-B; i+=B)
freeblocks.push_back(&strings[i]);
size_t i=0;
for (; i < n-n%32; i+=32) {
unsigned char cache[32];
for (unsigned j=0; j < 32; ++j) {
cache[j] = strings[i+j][depth];
}
for (unsigned j=0; j < 32; ++j) {
const unsigned char c = cache[j];
if (bucketsize[c] % B == 0) {
Block b = take_free_block(freeblocks);
buckets[c].push_back(b);
if (b >= strings && b < strings+n) {
backlinks[(b-strings)/B] = &(buckets[c].back());
}
}
assert(not buckets[c].empty());
buckets[c].back()[bucketsize[c] % B] = strings[i+j];
++bucketsize[c];
}
}
for (; i < n; ++i) {
const unsigned char c = strings[i][depth];
if (bucketsize[c] % B == 0) {
Block b = take_free_block(freeblocks);
buckets[c].push_back(b);
if (b >= strings && b < strings+n) {
backlinks[(b-strings)/B] = &(buckets[c].back());
}
}
assert(not buckets[c].empty());
buckets[c].back()[bucketsize[c] % B] = strings[i];
++bucketsize[c];
}
size_t pos = 0;
for (unsigned i=0; i < 256; ++i) {
if (bucketsize[i] == 0) continue;
Bucket::const_iterator it = buckets[i].begin();
for (size_t bucket_pos=0; bucket_pos < bucketsize[i]; ++it, bucket_pos+=B) {
const size_t block_items = std::min(size_t(B), bucketsize[i]-bucket_pos);
const size_t block_overlap = (pos+block_items-1)/B;
if (*it == (strings+pos)) {
assert(pos%B==0);
backlinks[pos/B] = 0;
pos += block_items;
continue;
}
if (backlinks[block_overlap]) {
Block tmp = take_free_block(freeblocks);
while (tmp >= strings && tmp < strings+pos)
tmp = take_free_block(freeblocks);
if (tmp >= strings && tmp < strings+n) {
assert(backlinks[(tmp-strings)/B]==0);
backlinks[(tmp-strings)/B] = backlinks[block_overlap];
}
memcpy(tmp, *(backlinks[block_overlap]), B*sizeof(unsigned char*));
*(backlinks[block_overlap]) = tmp;
backlinks[block_overlap] = 0;
}
if (*it >= strings && *it < strings+n) {
assert(*it > strings+pos);
backlinks[(*it-strings)/B] = 0;
}
memcpy(strings+pos, *it, block_items*sizeof(unsigned char*));
if (*it >= strings && *it < strings+n) {
freeblocks.push_back(*it);
} else {
freeblocks.push_front(*it);
}
pos += block_items;
}
}
freeblocks.clear();
backlinks.clear();
pos = bucketsize[0];
for (unsigned i=1; i < 256; ++i) {
if (bucketsize[i] == 0) continue;
msd_D<B>(strings+pos, bucketsize[i], depth+1);
pos += bucketsize[i];
}
}
void msd_DB(unsigned char** strings, size_t n)
{ msd_D<1024>(strings, n, 0); }
CONTESTANT_REGISTER(msd_DB,
"rantala/msd_DB",
"msd_DB")
}