<tt.rantala@gmail.com>
http://doi.acm.org/10.1145/1005813.1041517
#include <stdlib.h>
#include <assert.h>
#include <stdint.h>
#include <string.h>
#include <iostream>
#include <bitset>
#include <vector>
#include <boost/array.hpp>
#include "tools/debug.h"
#include "tools/get_char.h"
#include "tools/vector_malloc.h"
#include "tools/vector_realloc.h"
#include "tools/vector_block.h"
#include "tools/vector_bagwell.h"
#include "tools/vector_brodnik.h"
#include "../tools/contest.h"
#include "../sequential/bs-mkqs.h"
namespace rantala_burstsort {
using namespace rantala;
using boost::array;
template <typename T> struct max {};
template <> struct max<unsigned char> { enum { value = 0x100 }; };
template <> struct max<uint16_t> { enum { value = 0x10000 }; };
template <typename CharT>
struct TrieNode
{
array<void*, max<CharT>::value> buckets;
std::bitset<max<CharT>::value> is_trie;
TrieNode() { buckets.assign(0); }
};
template <typename CharT>
struct BurstSimple
{
template <typename BucketT>
TrieNode<CharT>*
operator()(const BucketT& bucket, size_t depth) const
{
TrieNode<CharT>* new_node = new TrieNode<CharT>;
const unsigned bucket_size = bucket.size();
unsigned i=0;
for (; i < bucket_size-bucket_size%64; i+=64) {
array<CharT, 64> cache;
array<unsigned char*, 64> strings;
for (unsigned j=0; j < 64; ++j) {
strings[j] = bucket[i+j];
cache[j] = get_char<CharT>(strings[j], depth);
}
for (unsigned j=0; j < 64; ++j) {
const CharT ch = cache[j];
BucketT* sub_bucket = static_cast<BucketT*>(
new_node->buckets[ch]);
if (not sub_bucket) {
new_node->buckets[ch] = sub_bucket
= new BucketT;
}
sub_bucket->push_back(strings[j]);
}
}
for (; i < bucket_size; ++i) {
unsigned char* ptr = bucket[i];
const CharT ch = get_char<CharT>(ptr, depth);
BucketT* sub_bucket = static_cast<BucketT*>(
new_node->buckets[ch]);
if (not sub_bucket) {
new_node->buckets[ch] = sub_bucket
= new BucketT;
}
sub_bucket->push_back(ptr);
}
return new_node;
}
};
template <typename CharT>
struct BurstRecursive
{
template <typename BucketT>
TrieNode<CharT>*
operator()(const BucketT& bucket, size_t depth) const
{
TrieNode<CharT>* new_node
= BurstSimple<CharT>()(bucket, depth);
const size_t threshold = std::max(
size_t(100), bucket.size()/2);
for (unsigned i=0; i < max<CharT>::value; ++i) {
assert(new_node->is_trie[i] == false);
BucketT* sub_bucket = static_cast<BucketT*>(
new_node->buckets[i]);
if (not sub_bucket) continue;
if (not is_end(i) and sub_bucket->size() > threshold) {
new_node->buckets[i] =
BurstRecursive<CharT>()(*sub_bucket,
depth+sizeof(CharT));
delete sub_bucket;
new_node->is_trie[i] = true;
}
}
return new_node;
}
};
template <typename CharT>
static TrieNode<CharT>*
random_sample(unsigned char** strings, size_t n)
{
const size_t sample_size = n/8192;
size_t max_nodes = 30000000/sizeof(TrieNode<CharT>);
debug()<<__PRETTY_FUNCTION__<<" sampling "<<sample_size<<" strings\n";
TrieNode<CharT>* root = new TrieNode<CharT>;
for (size_t i=0; i < sample_size; ++i) {
unsigned char* str = strings[size_t(drand48()*n)];
size_t depth = 0;
TrieNode<CharT>* node = root;
while (true) {
CharT c = get_char<CharT>(str, depth);
if (is_end(c)) break;
depth += sizeof(CharT);
if (not node->is_trie[c]) {
node->is_trie[c] = true;
node->buckets[c] = new TrieNode<CharT>;
if (--max_nodes==0) goto finish;
}
node = static_cast<TrieNode<CharT>*>(node->buckets[c]);
assert(node);
}
}
finish:
return root;
}
template <typename CharT>
static TrieNode<CharT>*
pseudo_sample(unsigned char** strings, size_t n)
{
debug()<<__func__<<"(): sampling "<<n/8192<<" strings ...\n";
size_t max_nodes = 30000000/sizeof(TrieNode<CharT>);
TrieNode<CharT>* root = new TrieNode<CharT>;
for (size_t i=0; i < n; i += 8192) {
unsigned char* str = strings[i];
size_t depth = 0;
TrieNode<CharT>* node = root;
while (true) {
CharT c = get_char<CharT>(str, depth);
if (is_end(c)) break;
depth += sizeof(CharT);
if (not node->is_trie[c]) {
node->is_trie[c] = true;
node->buckets[c] = new TrieNode<CharT>;
if (--max_nodes==0) goto finish;
}
node = static_cast<TrieNode<CharT>*>(node->buckets[c]);
assert(node);
}
}
finish:
debug()<<" Sampling done, created "
<<(30000000/sizeof(TrieNode<CharT>))-max_nodes<<" nodes.\n";
return root;
}
template <unsigned Threshold, typename BucketT,
typename BurstImpl, typename CharT>
static inline void
insert(TrieNode<CharT>* root, unsigned char** strings, size_t n)
{
for (size_t i=0; i < n; ++i) {
unsigned char* str = strings[i];
size_t depth = 0;
CharT c = get_char<CharT>(str, 0);
TrieNode<CharT>* node = root;
while (node->is_trie[c]) {
assert(not is_end(c));
node = static_cast<TrieNode<CharT>*>(node->buckets[c]);
depth += sizeof(CharT);
c = get_char<CharT>(str, depth);
}
BucketT* bucket = static_cast<BucketT*>(node->buckets[c]);
if (not bucket) {
node->buckets[c] = bucket = new BucketT;
}
bucket->push_back(str);
if (is_end(c)) continue;
if (bucket->size() > Threshold) {
node->buckets[c] = BurstImpl()(*bucket,
depth+sizeof(CharT));
node->is_trie[c] = true;
delete bucket;
}
}
}
static inline void
copy(const std::vector<unsigned char*>& bucket, unsigned char** dst)
{
std::copy(bucket.begin(), bucket.end(), dst);
}
template <typename BucketT, typename SmallSortT, typename CharT>
static unsigned char**
traverse(TrieNode<CharT>* node,
unsigned char** dst,
size_t depth,
SmallSortT small_sort)
{
for (unsigned i=0; i < max<CharT>::value; ++i) {
if (node->is_trie[i]) {
dst = traverse<BucketT>(
static_cast<TrieNode<CharT>*>(node->buckets[i]),
dst, depth+sizeof(CharT), small_sort);
} else {
BucketT* bucket =
static_cast<BucketT*>(node->buckets[i]);
if (not bucket) continue;
size_t bsize = bucket->size();
copy(*bucket, dst);
if (not is_end(i)) small_sort(dst, bsize, depth);
dst += bsize;
delete bucket;
}
}
delete node;
return dst;
}
#define SmallSort bs_mkqs::ssort2
void burstsort_vector(unsigned char** strings, size_t n)
{
typedef unsigned char CharT;
typedef std::vector<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = new TrieNode<CharT>;
insert<8000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_brodnik(unsigned char** strings, size_t n)
{
typedef unsigned char CharT;
typedef vector_brodnik<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = new TrieNode<CharT>;
insert<16000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_bagwell(unsigned char** strings, size_t n)
{
typedef unsigned char CharT;
typedef vector_bagwell<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = new TrieNode<CharT>;
insert<16000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_vector_block(unsigned char** strings, size_t n)
{
typedef unsigned char CharT;
typedef vector_block<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = new TrieNode<CharT>;
insert<16000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_superalphabet_vector(unsigned char** strings, size_t n)
{
typedef uint16_t CharT;
typedef std::vector<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = new TrieNode<CharT>;
insert<32000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_superalphabet_brodnik(unsigned char** strings, size_t n)
{
typedef uint16_t CharT;
typedef vector_brodnik<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = new TrieNode<CharT>;
insert<32000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_superalphabet_bagwell(unsigned char** strings, size_t n)
{
typedef uint16_t CharT;
typedef vector_bagwell<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = new TrieNode<CharT>;
insert<32000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_superalphabet_vector_block(unsigned char** strings, size_t n)
{
typedef uint16_t CharT;
typedef vector_block<unsigned char*, 128> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = new TrieNode<CharT>;
insert<32000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_sampling_vector(unsigned char** strings, size_t n)
{
typedef unsigned char CharT;
typedef std::vector<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = pseudo_sample<CharT>(strings, n);
insert<8000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_sampling_brodnik(unsigned char** strings, size_t n)
{
typedef unsigned char CharT;
typedef vector_brodnik<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = pseudo_sample<CharT>(strings, n);
insert<16000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_sampling_bagwell(unsigned char** strings, size_t n)
{
typedef unsigned char CharT;
typedef vector_bagwell<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = pseudo_sample<CharT>(strings, n);
insert<16000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_sampling_vector_block(unsigned char** strings, size_t n)
{
typedef unsigned char CharT;
typedef vector_block<unsigned char*, 128> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = pseudo_sample<CharT>(strings, n);
insert<16000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_sampling_superalphabet_vector(unsigned char** strings, size_t n)
{
typedef uint16_t CharT;
typedef std::vector<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = pseudo_sample<CharT>(strings, n);
insert<16000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_sampling_superalphabet_brodnik(unsigned char** strings, size_t n)
{
typedef uint16_t CharT;
typedef vector_brodnik<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = pseudo_sample<CharT>(strings, n);
insert<32000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_sampling_superalphabet_bagwell(unsigned char** strings, size_t n)
{
typedef uint16_t CharT;
typedef vector_bagwell<unsigned char*> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = pseudo_sample<CharT>(strings, n);
insert<32000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
void burstsort_sampling_superalphabet_vector_block(unsigned char** strings, size_t n)
{
typedef uint16_t CharT;
typedef vector_block<unsigned char*, 128> BucketT;
typedef BurstSimple<CharT> BurstImpl;
TrieNode<CharT>* root = pseudo_sample<CharT>(strings, n);
insert<32000, BucketT, BurstImpl>(root, strings, n);
traverse<BucketT>(root, strings, 0, SmallSort);
}
CONTESTANT_REGISTER(burstsort_vector,
"rantala/burstsort_vector",
"burstsort with std::vector bucket type")
CONTESTANT_REGISTER(burstsort_brodnik,
"rantala/burstsort_brodnik",
"burstsort with vector_brodnik bucket type")
CONTESTANT_REGISTER(burstsort_bagwell,
"rantala/burstsort_bagwell",
"burstsort with vector_bagwell bucket type")
CONTESTANT_REGISTER(burstsort_vector_block,
"rantala/burstsort_vector_block",
"burstsort with vector_block bucket type")
CONTESTANT_REGISTER(burstsort_superalphabet_vector,
"rantala/burstsort_superalphabet_vector",
"burstsort superalphabet with std::vector bucket type")
CONTESTANT_REGISTER(burstsort_superalphabet_brodnik,
"rantala/burstsort_superalphabet_brodnik",
"burstsort superalphabet with vector_brodnik bucket type")
CONTESTANT_REGISTER(burstsort_superalphabet_bagwell,
"rantala/burstsort_superalphabet_bagwell",
"burstsort superalphabet with vector_bagwell bucket type")
CONTESTANT_REGISTER(burstsort_superalphabet_vector_block,
"rantala/burstsort_superalphabet_vector_block",
"burstsort superalphabet with vector_block bucket type")
CONTESTANT_REGISTER(burstsort_sampling_vector,
"rantala/burstsort_sampling_vector",
"burstsort sampling with std::vector bucket type")
CONTESTANT_REGISTER(burstsort_sampling_brodnik,
"rantala/burstsort_sampling_brodnik",
"burstsort sampling with vector_brodnik bucket type")
CONTESTANT_REGISTER(burstsort_sampling_bagwell,
"rantala/burstsort_sampling_bagwell",
"burstsort sampling with vector_bagwell bucket type")
CONTESTANT_REGISTER(burstsort_sampling_vector_block,
"rantala/burstsort_sampling_vector_block",
"burstsort sampling with vector_block bucket type")
CONTESTANT_REGISTER(burstsort_sampling_superalphabet_vector,
"rantala/burstsort_sampling_superalphabet_vector",
"burstsort sampling superalphabet with std::vector bucket type")
CONTESTANT_REGISTER(burstsort_sampling_superalphabet_brodnik,
"rantala/burstsort_sampling_superalphabet_brodnik",
"burstsort sampling superalphabet with vector_brodnik bucket type")
CONTESTANT_REGISTER(burstsort_sampling_superalphabet_bagwell,
"rantala/burstsort_sampling_superalphabet_bagwell",
"burstsort sampling superalphabet with vector_bagwell bucket type")
CONTESTANT_REGISTER(burstsort_sampling_superalphabet_vector_block,
"rantala/burstsort_sampling_superalphabet_vector_block",
"burstsort sampling superalphabet with vector_block bucket type")
#undef SmallSort
}