/*******************************************************************************
 * Parallel Super Scalar String Sample-Sort (pS5)
 *
 * Copyright (C) Timo Bingmann <tb@panthema.net>
 *
 * This program is free software, distributed under the terms of the GNU
 * General Public License; see <http://www.gnu.org/licenses/>.
 ******************************************************************************/
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <atomic>
#include <limits>
#include <iostream>
#include <vector>
#include <algorithm>
#include <omp.h>
#include "../tools/debug.h"
#undef DBGX
#define DBGX DBGX_OMP
#include "../tools/lcgrandom.h"
#include "../tools/contest.h"
#include "../tools/stringtools.h"
#include "../tools/jobqueue.h"
#include "../tools/logfloor.h"
#include "../tools/lockfree.h"
#include "../sequential/inssort.h"
#include "../sequential/bingmann-lcp_inssort.h"
#ifndef CALC_LCP
#define CALC_LCP 0
#endif
#if CALC_LCP
#define CALC_LCP_MKQS 1
#endif
#if !CALC_LCP
namespace bingmann_parallel_sample_sort {
#define CONTESTANT_REGISTER_PARALLEL_LCP(func,name,desc) \
CONTESTANT_REGISTER_PARALLEL(func, name, desc)
#else
namespace bingmann_parallel_sample_sort_lcp {
#define CONTESTANT_REGISTER_PARALLEL_LCP(func,name,desc) \
CONTESTANT_REGISTER_PARALLEL(func, name "_lcp", desc "_lcp")
#endif
using namespace stringtools;
using namespace jobqueue;
static const bool debug_steps = false;
static const bool debug_jobs = false;
static const bool debug_splitter = false;
static const bool debug_bucketsize = false;
static const bool debug_recursion = false;
static const bool debug_splitter_tree = false;
static const bool debug_lcp = false;
static const bool use_work_sharing = true;      // donate work to idle threads via the job queue
#define PS5_ENABLE_RESTSIZE false               // maintain a global counter of unsorted strings
static const bool use_lcp_inssort = false;      // use LCP-aware insertion sort in the base case
#ifndef PS5_SINGLE_STEP
#define PS5_SINGLE_STEP false
#endif
static const bool use_only_first_sortstep = PS5_SINGLE_STEP; // stop after the first step (for measurements)
static const size_t MAXPROCS = 2*64+1;          // upper bound on the thread count, sizes static arrays
#ifndef PS5_L2CACHE
#define PS5_L2CACHE 256*1024
#endif
static const size_t l2cache = PS5_L2CACHE;      // assumed L2 cache size, bounds the splitter tree
static const size_t g_smallsort_threshold = 1024*1024; // areas smaller than this are sorted sequentially
static const size_t g_inssort_threshold = 64;   // areas smaller than this are insertion sorted
typedef uint64_t key_type;                      // a key caches the next 8 characters of a string
enum { TM_WAITING, TM_PARA_SS, TM_SEQ_SS, TM_MKQS, TM_INSSORT }; // timer categories
#ifndef TIMERARRAY_REAL
typedef ::TimerArrayDummy TimerArrayMT;
typedef ::ScopedTimerKeeperDummy ScopedTimerKeeperMT;
#endif
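// Context of one parallel sorting run: shared size and progress counters,
// per-thread timers, and the job queue that all worker threads pull from.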
template <template <typename> class JobQueueGroupType = DefaultJobQueueGroup>
class Context
{
public:
size_t totalsize;
#if PS5_ENABLE_RESTSIZE
lockfree::lazy_counter<MAXPROCS> restsize;
#endif
size_t threadnum;
size_t para_ss_steps, seq_ss_steps, bs_steps;
TimerArrayMT timers;
typedef JobQueueGroupType<Context> jobqueuegroup_type;
typedef typename jobqueuegroup_type::jobqueue_type jobqueue_type;
typedef typename jobqueue_type::job_type job_type;
jobqueue_type jobqueue;
Context(jobqueuegroup_type* jqg = NULL)
: para_ss_steps(0), seq_ss_steps(0), bs_steps(0),
timers(16),
jobqueue(*this, jqg)
{ }
inline size_t sequential_threshold()
{
#if PS5_ENABLE_RESTSIZE
size_t wholesize = restsize.update().get();
#else
size_t wholesize = totalsize;
#endif
return std::max(g_smallsort_threshold, wholesize / threadnum);
}
inline void donesize(size_t n, size_t tid)
{
#if PS5_ENABLE_RESTSIZE
restsize.add(-n, tid);
#else
(void)n; (void)tid;
#endif
}
};
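// SortStep links substeps to their parent step. In LCP mode each step counts
// its unfinished substeps atomically; when the count drops to zero,
// substep_all_done() triggers the deferred LCP calculation. Without LCPs the
// bookkeeping is a no-op.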
#if CALC_LCP
class SortStep
{
private:
std::atomic<unsigned int> m_substep_working;
virtual void substep_all_done() = 0;
protected:
SortStep()
: m_substep_working(0)
{ }
virtual ~SortStep()
{
assert(m_substep_working == 0);
}
void substep_add()
{
++m_substep_working;
}
public:
void substep_notify_done()
{
assert(m_substep_working > 0);
if (--m_substep_working == 0)
substep_all_done();
}
};
#else
class SortStep
{
protected:
SortStep()
{ }
void substep_add()
{ }
public:
void substep_notify_done()
{ }
};
#endif
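// return the number of characters in which keys a and b agree (their LCP)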
static inline unsigned char
lcpKeyType(const key_type& a, const key_type& b)
{
return count_high_zero_bits(a ^ b) / 8;
}
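// return the number of characters in key a before its zero padding
// (8 if the string continues beyond this key)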
static inline unsigned char
lcpKeyDepth(const key_type& a)
{
return sizeof(key_type) - (count_low_zero_bits(a) / 8);
}
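// return the character at depth d within key a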
static inline unsigned char
getCharAtDepth(const key_type& a, unsigned char d)
{
return static_cast<unsigned char>(a >> (8 * (sizeof(key_type)-1 - d)));
}
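// TreeBuilderFullDescent recursively builds a perfect binary search tree over
// numsplitters splitters taken from the sorted sample, and records each
// splitter's LCP with its predecessor; the high bit 0x80 flags splitters
// whose last character is zero, i.e. whose equal bucket is completely sorted.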
template <size_t numsplitters>
struct TreeBuilderFullDescent
{
key_type* m_splitter;
key_type* m_tree;
unsigned char* m_lcp_iter;
key_type* m_samples;
TreeBuilderFullDescent(key_type* splitter, key_type* splitter_tree, unsigned char* splitter_lcp,
key_type* samples, size_t samplesize)
: m_splitter( splitter ),
m_tree( splitter_tree ),
m_lcp_iter( splitter_lcp ),
m_samples( samples )
{
key_type sentinel = 0;
recurse(samples, samples + samplesize, 1, sentinel);
assert(m_splitter == splitter + numsplitters);
assert(m_lcp_iter == splitter_lcp + numsplitters);
splitter_lcp[0] &= 0x80;
splitter_lcp[numsplitters] = 0;
}
ptrdiff_t snum(key_type* s) const
{
return (ptrdiff_t)(s - m_samples);
}
key_type recurse(key_type* lo, key_type* hi, unsigned int treeidx,
key_type& rec_prevkey)
{
DBG(debug_splitter, "rec_buildtree(" << snum(lo) << "," << snum(hi)
<< ", treeidx=" << treeidx << ")");
key_type* mid = lo + (ptrdiff_t)(hi - lo) / 2;
DBG(debug_splitter, "tree[" << treeidx << "] = samples[" << snum(mid) << "] = "
<< toHex(*mid));
key_type mykey = m_tree[treeidx] = *mid;
#if 1
key_type* midlo = mid;
while (lo < midlo && *(midlo-1) == mykey) midlo--;
key_type* midhi = mid;
while (midhi+1 < hi && *midhi == mykey) midhi++;
if (midhi - midlo > 1)
DBG(0, "key range = [" << snum(midlo) << "," << snum(midhi) << ")");
#else
key_type *midlo = mid, *midhi = mid+1;
#endif
if (2 * treeidx < numsplitters)
{
key_type prevkey = recurse(lo, midlo, 2 * treeidx + 0, rec_prevkey);
key_type xorSplit = prevkey ^ mykey;
DBG(debug_splitter, " lcp: " << toHex(prevkey) << " XOR " << toHex(mykey) << " = "
<< toHex(xorSplit) << " - " << count_high_zero_bits(xorSplit) << " bits = "
<< count_high_zero_bits(xorSplit) / 8 << " chars lcp");
*m_splitter++ = mykey;
*m_lcp_iter++ = (count_high_zero_bits(xorSplit) / 8)
| ((mykey & 0xFF) ? 0 : 0x80);
return recurse(midhi, hi, 2 * treeidx + 1, mykey);
}
else
{
key_type xorSplit = rec_prevkey ^ mykey;
DBG(debug_splitter, " lcp: " << toHex(rec_prevkey) << " XOR " << toHex(mykey) << " = "
<< toHex(xorSplit) << " - " << count_high_zero_bits(xorSplit) << " bits = "
<< count_high_zero_bits(xorSplit) / 8 << " chars lcp");
*m_splitter++ = mykey;
*m_lcp_iter++ = (count_high_zero_bits(xorSplit) / 8)
| ((mykey & 0xFF) ? 0 : 0x80);
return mykey;
}
}
};
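// ClassifySimple descends the splitter tree once per string key: even bucket
// 2i collects keys between splitters, odd bucket 2i+1 keys equal to
// splitter i.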
template <size_t treebits>
struct ClassifySimple
{
static const size_t numsplitters = (1 << treebits) - 1;
key_type splitter[numsplitters];
key_type splitter_tree[numsplitters+1];
inline unsigned int
find_bkt_tree(const key_type& key) const
{
unsigned int i = 1;
while ( i <= numsplitters )
{
#if 0
i = 2 * i + (key <= splitter_tree[i] ? 0 : 1);
#else
if (key <= splitter_tree[i])
i = 2*i + 0;
else
i = 2*i + 1;
#endif
}
i -= numsplitters+1;
size_t b = i * 2;
if (i < numsplitters && splitter[i] == key) b += 1;
return b;
}
inline void
classify(string* strB, string* strE, uint16_t* bktout, size_t depth) const
{
for (string* str = strB; str != strE; )
{
key_type key = get_char<key_type>(*str++, depth);
unsigned int b = find_bkt_tree(key);
*bktout++ = b;
}
}
inline key_type get_splitter(unsigned int i) const
{ return splitter[i]; }
inline void
build(key_type* samples, size_t samplesize, unsigned char* splitter_lcp)
{
TreeBuilderFullDescent<numsplitters>(splitter, splitter_tree, splitter_lcp,
samples, samplesize);
}
};
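// ClassifyUnrollTree is ClassifySimple with the fixed-depth descent loop
// fully unrolled by the compiler.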
template <size_t treebits>
struct ClassifyUnrollTree
{
static const size_t numsplitters = (1 << treebits) - 1;
key_type splitter[numsplitters];
key_type splitter_tree[numsplitters+1];
__attribute__((optimize("unroll-all-loops")))
inline unsigned int
find_bkt_tree(const key_type& key) const
{
unsigned int i = 1;
for (size_t l = 0; l < treebits; ++l)
{
#if 0
i = 2 * i + (key <= splitter_tree[i] ? 0 : 1);
#else
if (key <= splitter_tree[i])
i = 2*i + 0;
else
i = 2*i + 1;
#endif
}
i -= numsplitters+1;
size_t b = i * 2;
if (i < numsplitters && splitter[i] == key) b += 1;
return b;
}
inline void
classify(string* strB, string* strE, uint16_t* bktout, size_t depth) const
{
for (string* str = strB; str != strE; )
{
key_type key = get_char<key_type>(*str++, depth);
unsigned int b = find_bkt_tree(key);
*bktout++ = b;
}
}
inline key_type get_splitter(unsigned int i) const
{ return splitter[i]; }
inline void
build(key_type* samples, size_t samplesize, unsigned char* splitter_lcp)
{
TreeBuilderFullDescent<numsplitters>(splitter, splitter_tree, splitter_lcp,
samples, samplesize);
}
};
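// ClassifyUnrollBoth additionally interleaves the tree descent of four
// strings at a time, keeping four independent root-to-leaf searches in
// flight to hide memory latency.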
template <size_t treebits>
struct ClassifyUnrollBoth : public ClassifyUnrollTree<treebits>
{
template <int U>
__attribute__((optimize("unroll-all-loops")))
inline void
find_bkt_tree_unroll(const key_type key[U], uint16_t obkt[U]) const
{
static const size_t numsplitters = this->numsplitters;
const key_type* splitter = this->splitter;
const key_type* splitter_tree = this->splitter_tree;
unsigned int i[U];
std::fill(i+0, i+U, 1);
for (size_t l = 0; l < treebits; ++l)
{
for (int u = 0; u < U; ++u)
{
#if 0
i[u] = 2 * i[u] + (key[u] <= splitter_tree[i[u]] ? 0 : 1);
#else
if (key[u] <= splitter_tree[i[u]])
i[u] = 2*i[u] + 0;
else
i[u] = 2*i[u] + 1;
#endif
}
}
for (int u = 0; u < U; ++u)
i[u] -= numsplitters+1;
for (int u = 0; u < U; ++u)
obkt[u] = i[u] * 2;
for (int u = 0; u < U; ++u)
{
if (i[u] < numsplitters && splitter[i[u]] == key[u]) obkt[u] += 1;
}
}
inline void
classify(string* strB, string* strE, uint16_t* bktout, size_t depth) const
{
for (string* str = strB; str != strE; )
{
static const int rollout = 4;
if (str + rollout < strE)
{
key_type key[rollout];
key[0] = get_char<key_type>(str[0], depth);
key[1] = get_char<key_type>(str[1], depth);
key[2] = get_char<key_type>(str[2], depth);
key[3] = get_char<key_type>(str[3], depth);
find_bkt_tree_unroll<rollout>(key, bktout);
str += rollout;
bktout += rollout;
}
else
{
key_type key = get_char<key_type>(*str++, depth);
unsigned int b = this->find_bkt_tree(key);
*bktout++ = b;
}
}
}
};
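// insertion sort base cases: one overload per string pointer flavor,
// selecting plain or LCP-aware insertion sort as appropriate.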
static inline void
insertion_sort(const stringtools::StringShadowPtr& strptr, size_t depth)
{
assert(!strptr.flipped());
if (!use_lcp_inssort)
inssort::inssort(strptr.output(), strptr.size(), depth);
else
bingmann_lcp_inssort::lcp_insertion_sort(strptr.output(), strptr.size(), depth);
}
static inline void
insertion_sort(const stringtools::StringShadowLcpPtr& strptr, size_t depth)
{
assert(!strptr.flipped());
bingmann_lcp_inssort::lcp_insertion_sort(
strptr.output(), strptr.lcparray(), strptr.size(), depth);
}
static inline void
insertion_sort(const stringtools::StringShadowOutPtr& strptr, size_t depth)
{
assert(!strptr.flipped());
if (!use_lcp_inssort)
inssort::inssort(strptr.output(), strptr.size(), depth);
else
bingmann_lcp_inssort::lcp_insertion_sort(strptr.output(), strptr.size(), depth);
}
static inline void
insertion_sort(const stringtools::StringShadowLcpOutPtr& strptr, size_t depth)
{
assert(!strptr.flipped());
bingmann_lcp_inssort::lcp_insertion_sort(
strptr.output(), strptr.lcparray(), strptr.size(), depth);
}
static inline void
insertion_sort(const stringtools::StringShadowLcpCacheOutPtr& strptr, size_t depth)
{
assert(!strptr.flipped());
bingmann_lcp_inssort::lcp_insertion_sort_cache(
strptr.output(), strptr.lcparray(), strptr.cache(),
strptr.size(), depth);
}
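// sample_sort_lcp() reconstructs the LCPs at all bucket boundaries after a
// sample sort step: the first string of each non-empty bucket gets the LCP
// of its key with the last key of the previous non-empty bucket. Odd (equal)
// buckets take the splitter itself as key; even buckets read the key from
// the sorted output. The two goto labels interleave the odd/even cases while
// scanning the bucket array only once.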
template <size_t bktnum, typename Classify, typename StringPtr, typename BktSizeType>
void sample_sort_lcp(const Classify& classifier, const StringPtr& strptr, size_t depth, const BktSizeType* bkt)
{
assert(!strptr.flipped());
assert(strptr.check());
size_t b = 0;
key_type prevkey = 0;
goto even_first;
while (b < bktnum)
{
if (bkt[b] != bkt[b+1])
{
prevkey = classifier.get_splitter(b/2);
assert( prevkey == get_char<key_type>(strptr.out(bkt[b+1]-1), depth) );
break;
}
++b;
even_first:
if (bkt[b] != bkt[b+1])
{
prevkey = get_char<key_type>( strptr.out(bkt[b+1]-1), depth );
break;
}
++b;
}
++b;
if (b < bktnum && b % 2 == 0) goto even_bucket;
while (b < bktnum)
{
if (bkt[b] != bkt[b+1])
{
key_type thiskey = classifier.get_splitter(b/2);
assert( thiskey == get_char<key_type>(strptr.out(bkt[b]), depth) );
int rlcp = lcpKeyType(prevkey, thiskey);
strptr.set_lcp(bkt[b], depth + rlcp);
strptr.set_cache(bkt[b], getCharAtDepth(thiskey, rlcp));
DBG(debug_lcp, "LCP at odd-bucket " << b << " [" << bkt[b] << "," << bkt[b+1]
<< ") is " << strptr.lcp(bkt[b]));
prevkey = thiskey;
assert( prevkey == get_char<key_type>(strptr.out(bkt[b+1]-1), depth) );
}
++b;
even_bucket:
if (bkt[b] != bkt[b+1])
{
key_type thiskey = get_char<key_type>( strptr.out(bkt[b]), depth );
int rlcp = lcpKeyType(prevkey, thiskey);
strptr.set_lcp(bkt[b], depth + rlcp);
strptr.set_cache(bkt[b], getCharAtDepth(thiskey, rlcp));
DBG(debug_lcp, "LCP at even-bucket " << b << " [" << bkt[b] << "," << bkt[b+1]
<< ") is " << strptr.lcp(bkt[b]));
prevkey = get_char<key_type>( strptr.out(bkt[b+1]-1), depth );
}
++b;
}
}
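// forward declaration: dispatch a string subset either to a new parallel
// sample sort step or to a sequential small-sort job (defined below)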
template <template <size_t> class Classify, typename Context, typename StringPtr>
void Enqueue(Context& ctx, SortStep* sstep, const StringPtr& strptr, size_t depth);
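// SmallsortJob sorts one string subset sequentially on a single thread: by
// explicit-stack sequential sample sort while the subset exceeds
// g_smallsort_threshold, by cached multikey quicksort below it, and by
// insertion sort at the bottom. When other threads go idle, the lowest
// levels of both explicit stacks are split off into new jobs.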
template <typename Context, template <size_t> class Classify, typename StringPtr,
typename BktSizeType>
struct SmallsortJob : public Context::job_type, public SortStep
{
SortStep* pstep;
size_t thrid;
StringPtr in_strptr;
size_t in_depth;
typedef BktSizeType bktsize_type;
SmallsortJob(Context& ctx, SortStep* _pstep,
const StringPtr& strptr, size_t depth)
: pstep(_pstep), in_strptr(strptr), in_depth(depth)
{
DBG(debug_steps, "enqueue depth=" << in_depth << " size=" << in_strptr.size() << " flip=" << in_strptr.flipped());
ctx.jobqueue.enqueue(this);
}
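// SeqSampleSortStep: one sequential sample sort step. The constructor draws
// an oversampled random key sample, builds the classifier, classifies all
// strings into bktcache, computes bucket prefix sums, and permutes the
// strings into the shadow array; recursion into the buckets is driven
// externally through the idx cursor.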
struct SeqSampleSortStep
{
#if 0
static const size_t numsplitters2 = 2*16;
#else
static const size_t numsplitters2 = (l2cache - sizeof(size_t)) / (2 * sizeof(size_t));
#endif
static const size_t treebits = logfloor_<numsplitters2>::value;
static const size_t numsplitters = (1 << treebits) - 1;
static const size_t bktnum = 2 * numsplitters + 1;
StringPtr strptr;
size_t idx;
size_t depth;
bktsize_type bkt[bktnum+1];
#if CALC_LCP
Classify<treebits> classifier;
#endif
unsigned char splitter_lcp[numsplitters+1];
SeqSampleSortStep(Context& ctx, const StringPtr& _strptr, size_t _depth, uint16_t* bktcache)
: strptr(_strptr), idx(0), depth(_depth)
{
size_t n = strptr.size();
const size_t oversample_factor = 2;
const size_t samplesize = oversample_factor * numsplitters;
key_type samples[ samplesize ];
string* strings = strptr.active();
LCGRandom rng(strings);
for (unsigned int i = 0; i < samplesize; ++i)
{
samples[i] = get_char<key_type>(strings[ rng() % n ], depth);
}
std::sort(samples, samples + samplesize);
#if !CALC_LCP
Classify<treebits> classifier;
#endif
classifier.build(samples, samplesize, splitter_lcp);
classifier.classify(strings, strings + n, bktcache, depth);
bktsize_type bktsize[bktnum];
memset(bktsize, 0, bktnum * sizeof(bktsize_type));
for (size_t si = 0; si < n; ++si)
++bktsize[ bktcache[si] ];
bkt[0] = bktsize[0];
for (unsigned int i=1; i < bktnum; ++i) {
bkt[i] = bkt[i-1] + bktsize[i];
}
assert(bkt[bktnum-1] == n);
bkt[bktnum] = n;
string* strB = strptr.active();
string* strE = strptr.active() + strptr.size();
string* sorted = strptr.shadow();
for (string* str = strB; str != strE; ++str, ++bktcache)
sorted[ --bkt[ *bktcache ] ] = *str;
++ctx.seq_ss_steps;
}
void calculate_lcp()
{
#if CALC_LCP
DBG(debug_lcp, "Calculate LCP after sample sort step " << strptr);
sample_sort_lcp<bktnum>(classifier, strptr.original(), depth, bkt);
#endif
}
};
uint16_t* bktcache;
size_t bktcache_size;
size_t ss_pop_front;
std::vector<SeqSampleSortStep> ss_stack;
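// dispatch: large subsets go through sequential sample sort, small ones
// directly to cached multikey quicksort; the bucket-index cache is allocated
// once here and reused by all steps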
virtual bool run(Context& ctx)
{
ScopedTimerKeeperMT tm_seq_ss(ctx.timers, TM_SEQ_SS);
size_t n = in_strptr.size();
DBG(debug_jobs, "Process SmallsortJob " << this << " of size " << n);
thrid = PS5_ENABLE_RESTSIZE ? omp_get_thread_num() : 0;
substep_add();
bktcache = NULL;
bktcache_size = 0;
ss_pop_front = 0;
ms_pop_front = 0;
if (n >= g_smallsort_threshold)
{
bktcache = new uint16_t[n];
bktcache_size = n * sizeof(uint16_t);
sort_sample_sort(ctx, in_strptr, in_depth);
}
else
{
sort_mkqs_cache(ctx, in_strptr, in_depth);
}
delete [] bktcache;
substep_notify_done();
return false;
}
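// iterative sequential sample sort: process the buckets of the top stack
// entry one by one; even (less-than) buckets recurse at depth + splitter
// LCP, odd (equal) buckets are either finished (splitter ended the string)
// or recurse at depth + 8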
void sort_sample_sort(Context& ctx, const StringPtr& strptr, size_t depth)
{
typedef SeqSampleSortStep Step;
assert(ss_pop_front == 0);
assert(ss_stack.size() == 0);
ss_stack.push_back( Step(ctx,strptr,depth,bktcache) );
while ( ss_stack.size() > ss_pop_front )
{
Step& s = ss_stack.back();
size_t i = s.idx++;
if ( i < Step::bktnum )
{
size_t bktsize = s.bkt[i+1] - s.bkt[i];
StringPtr sp = s.strptr.flip(s.bkt[i], bktsize);
if (i % 2 == 0)
{
if (bktsize == 0)
;
else if (bktsize < g_smallsort_threshold)
{
assert(i/2 <= Step::numsplitters);
if (i == Step::bktnum-1)
DBG(debug_recursion, "Recurse[" << s.depth << "]: > bkt " << i << " size " << bktsize << " no lcp");
else
DBG(debug_recursion, "Recurse[" << s.depth << "]: < bkt " << i << " size " << bktsize << " lcp " << int(s.splitter_lcp[i/2] & 0x7F));
sort_mkqs_cache(ctx, sp, s.depth + (s.splitter_lcp[i/2] & 0x7F));
}
else
{
if (i == Step::bktnum-1)
DBG(debug_recursion, "Recurse[" << s.depth << "]: > bkt " << i << " size " << bktsize << " no lcp");
else
DBG(debug_recursion, "Recurse[" << s.depth << "]: < bkt " << i << " size " << bktsize << " lcp " << int(s.splitter_lcp[i/2] & 0x7F));
ss_stack.push_back( Step(ctx, sp, s.depth + (s.splitter_lcp[i/2] & 0x7F), bktcache) );
}
}
else
{
if (bktsize == 0)
;
else if ( s.splitter_lcp[i/2] & 0x80 ) {
DBG(debug_recursion, "Recurse[" << s.depth << "]: = bkt " << i << " size " << bktsize << " is done!");
sp = sp.copy_back();
#if CALC_LCP
sp.fill_lcp(s.depth + lcpKeyDepth(s.classifier.get_splitter(i/2)));
#endif
ctx.donesize(bktsize, thrid);
}
else if (bktsize < g_smallsort_threshold)
{
DBG(debug_recursion, "Recurse[" << s.depth << "]: = bkt " << i << " size " << bktsize << " lcp keydepth!");
sort_mkqs_cache(ctx, sp, s.depth + sizeof(key_type));
}
else
{
DBG(debug_recursion, "Recurse[" << s.depth << "]: = bkt " << i << " size " << bktsize << " lcp keydepth!");
ss_stack.push_back( Step(ctx, sp, s.depth + sizeof(key_type), bktcache) );
}
}
}
else
{
assert( ss_stack.size() > ss_pop_front );
ss_stack.back().calculate_lcp();
ss_stack.pop_back();
}
if (use_work_sharing && ctx.jobqueue.has_idle()) {
sample_sort_free_work(ctx);
}
}
}
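// donate the lowest level of the sample sort stack to idle threads as new
// jobs; if that stack is exhausted, fall through to the MKQS stack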
void sample_sort_free_work(Context& ctx)
{
assert(ss_stack.size() >= ss_pop_front);
if (ss_stack.size() == ss_pop_front) {
return mkqs_free_work(ctx);
}
DBG(debug_jobs, "Freeing top level of SmallsortJob's sample_sort stack");
typedef SeqSampleSortStep Step;
Step& s = ss_stack[ss_pop_front];
while ( s.idx < Step::bktnum )
{
size_t i = s.idx++;
size_t bktsize = s.bkt[i+1] - s.bkt[i];
StringPtr sp = s.strptr.flip(s.bkt[i], bktsize);
if (i % 2 == 0)
{
if (bktsize == 0)
;
else
{
if (i == Step::bktnum-1)
DBG(debug_recursion, "Recurse[" << s.depth << "]: > bkt " << i << " size " << bktsize << " no lcp");
else
DBG(debug_recursion, "Recurse[" << s.depth << "]: < bkt " << i << " size " << bktsize << " lcp " << int(s.splitter_lcp[i/2] & 0x7F));
substep_add();
Enqueue<Classify>( ctx, this, sp, s.depth + (s.splitter_lcp[i/2] & 0x7F) );
}
}
else
{
if (bktsize == 0)
;
else if ( s.splitter_lcp[i/2] & 0x80 ) {
DBG(debug_recursion, "Recurse[" << s.depth << "]: = bkt " << i << " size " << bktsize << " is done!");
sp = sp.copy_back();
#if CALC_LCP
sp.fill_lcp(s.depth + lcpKeyDepth(s.classifier.get_splitter(i/2)));
#endif
ctx.donesize(bktsize, thrid);
}
else
{
DBG(debug_recursion, "Recurse[" << s.depth << "]: = bkt " << i << " size " << bktsize << " lcp keydepth!");
substep_add();
Enqueue<Classify>( ctx, this, sp, s.depth + sizeof(key_type) );
}
}
}
++ss_pop_front;
}
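// three-way key comparison and median-of-three pivot selection for MKQS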
static inline int cmp(const key_type& a, const key_type& b)
{
return (a > b) ? 1 : (a < b) ? -1 : 0;
}
template <typename Type>
static inline size_t
med3(Type* A, size_t i, size_t j, size_t k)
{
if (A[i] == A[j]) return i;
if (A[k] == A[i] || A[k] == A[j]) return k;
if (A[i] < A[j]) {
if (A[j] < A[k]) return j;
if (A[i] < A[k]) return k;
return i;
}
else {
if (A[j] > A[k]) return j;
if (A[i] < A[k]) return i;
return k;
}
}
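// insertion sort of strings by their cached 8-byte keys only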
static inline void
insertion_sort_cache_block(const StringPtr& strptr, key_type* cache)
{
string* strings = strptr.output();
unsigned int n = strptr.size();
unsigned int pi, pj;
for (pi = 1; --n > 0; ++pi) {
string tmps = strings[pi];
key_type tmpc = cache[pi];
for (pj = pi; pj > 0; --pj) {
if (cache[pj-1] <= tmpc)
break;
strings[pj] = strings[pj-1];
cache[pj] = cache[pj-1];
}
strings[pj] = tmps;
cache[pj] = tmpc;
}
}
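// sort a small area by cached keys, then finish runs of equal keys with
// string insertion sort at depth + 8; if CacheDirty, the cache does not
// match this depth and plain insertion sort is used instead. LCP and
// distinguishing character are recorded at every run boundary.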
template <bool CacheDirty>
static inline void
insertion_sort_cache(const StringPtr& _strptr, key_type* cache, size_t depth)
{
StringPtr strptr = _strptr.copy_back();
if (strptr.size() <= 1) return;
if (CacheDirty) return insertion_sort(strptr, depth);
insertion_sort_cache_block(strptr, cache);
size_t start = 0, bktsize = 1;
for (size_t i = 0; i < strptr.size() - 1; ++i) {
if (cache[i] == cache[i+1]) {
++bktsize;
continue;
}
if (start != 0) {
int rlcp = lcpKeyType(cache[start-1], cache[start]);
strptr.set_lcp(start, depth + rlcp);
strptr.set_cache(start, getCharAtDepth(cache[start], rlcp));
}
if (bktsize > 1) {
if (cache[start] & 0xFF)
insertion_sort(strptr.sub(start, bktsize), depth + sizeof(key_type));
else {
strptr.sub(start, bktsize).fill_lcp(depth + lcpKeyDepth(cache[start]));
}
}
bktsize = 1;
start = i+1;
}
if (start != 0) {
int rlcp = lcpKeyType(cache[start-1], cache[start]);
strptr.set_lcp(start, depth + rlcp);
strptr.set_cache(start, getCharAtDepth(cache[start], rlcp));
}
if (bktsize > 1) {
if (cache[start] & 0xFF)
insertion_sort(strptr.sub(start, bktsize), depth + sizeof(key_type));
else
strptr.sub(start, bktsize).fill_lcp(depth + lcpKeyDepth(cache[start]));
}
}
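// MKQSStep: one ternary partitioning step of multikey quicksort on cached
// keys. The constructor selects a pseudo-median-of-nine pivot, partitions
// with equal keys parked at both ends (Bentley-McIlroy style), swaps them
// into the middle, and in LCP mode tracks the maximum less-than and minimum
// greater-than keys to derive the boundary LCPs.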
struct MKQSStep
{
StringPtr strptr;
key_type* cache;
size_t num_lt, num_eq, num_gt, depth;
size_t idx;
unsigned char eq_recurse;
#if CALC_LCP_MKQS == 1
char_type dchar_eq, dchar_gt;
unsigned char lcp_lt, lcp_eq, lcp_gt;
#elif CALC_LCP_MKQS == 2
key_type pivot;
#endif
MKQSStep(Context& ctx, const StringPtr& _strptr, key_type* _cache, size_t _depth, bool CacheDirty)
: strptr(_strptr), cache(_cache), depth(_depth), idx(0)
{
size_t n = strptr.size();
string* strings = strptr.active();
if (CacheDirty) {
for (size_t i = 0; i < n; ++i) {
cache[i] = get_char<key_type>(strings[i], depth);
}
}
size_t p = med3(cache,
med3(cache, 0, n/8, n/4),
med3(cache, n/2-n/8, n/2, n/2+n/8),
med3(cache, n-1-n/4, n-1-n/8, n-3)
);
std::swap(strings[0], strings[p]);
std::swap(cache[0], cache[p]);
key_type pivot = cache[0];
#if CALC_LCP_MKQS == 1
key_type max_lt = 0, min_gt = std::numeric_limits<key_type>::max();
#elif CALC_LCP_MKQS == 2
this->pivot = pivot;
#endif
size_t leq = 1, llt = 1, rgt = n-1, req = n-1;
while (true)
{
while (llt <= rgt)
{
int r = cmp(cache[llt], pivot);
if (r > 0) {
#if CALC_LCP_MKQS == 1
min_gt = std::min(min_gt, cache[llt]);
#endif
break;
}
else if (r == 0) {
std::swap(strings[leq], strings[llt]);
std::swap(cache[leq], cache[llt]);
leq++;
}
else {
#if CALC_LCP_MKQS == 1
max_lt = std::max(max_lt, cache[llt]);
#endif
}
++llt;
}
while (llt <= rgt)
{
int r = cmp(cache[rgt], pivot);
if (r < 0) {
#if CALC_LCP_MKQS == 1
max_lt = std::max(max_lt, cache[rgt]);
#endif
break;
}
else if (r == 0) {
std::swap(strings[req], strings[rgt]);
std::swap(cache[req], cache[rgt]);
req--;
}
else {
#if CALC_LCP_MKQS == 1
min_gt = std::min(min_gt, cache[rgt]);
#endif
}
--rgt;
}
if (llt > rgt)
break;
std::swap(strings[llt], strings[rgt]);
std::swap(cache[llt], cache[rgt]);
++llt;
--rgt;
}
size_t num_leq = leq, num_req = n-1-req;
num_eq = num_leq + num_req;
num_lt = llt - leq;
num_gt = req - rgt;
assert(num_eq > 0);
assert(num_lt + num_eq + num_gt == n);
const size_t size1 = std::min(num_leq, num_lt);
std::swap_ranges(strings, strings+size1, strings+llt-size1);
std::swap_ranges(cache, cache+size1, cache+llt-size1);
const size_t size2 = std::min(num_req, num_gt);
std::swap_ranges(strings+llt, strings+llt+size2, strings+n-size2);
std::swap_ranges(cache+llt, cache+llt+size2, cache+n-size2);
this->eq_recurse = (pivot & 0xFF);
#if CALC_LCP_MKQS == 1
if (num_lt > 0)
{
assert(max_lt == *std::max_element(cache + 0, cache + num_lt));
lcp_lt = lcpKeyType(max_lt, pivot);
dchar_eq = getCharAtDepth(pivot, lcp_lt);
DBG(debug_lcp, "LCP lt with pivot: " << depth + lcp_lt);
}
lcp_eq = lcpKeyDepth(pivot);
if (num_gt > 0)
{
assert(min_gt == *std::min_element(cache + num_lt + num_eq, cache + n));
lcp_gt = lcpKeyType(pivot, min_gt);
dchar_gt = getCharAtDepth(min_gt, lcp_gt);
DBG(debug_lcp, "LCP pivot with gt: " << depth + lcp_gt);
}
#endif
++ctx.bs_steps;
}
void calculate_lcp()
{
#if CALC_LCP_MKQS == 1
if (num_lt > 0)
{
strptr.original().set_lcp(num_lt, depth + lcp_lt);
strptr.original().set_cache(num_lt, dchar_eq);
}
if (num_gt > 0)
{
strptr.original().set_lcp(num_lt + num_eq, depth + lcp_gt);
strptr.original().set_cache(num_lt + num_eq, dchar_gt);
}
#elif CALC_LCP_MKQS == 2
if (num_lt > 0)
{
key_type max_lt = get_char<key_type>(strptr.original().out(num_lt-1), depth);
unsigned int rlcp = lcpKeyType(max_lt, pivot);
DBG(debug_lcp, "LCP lt with pivot: " << depth + rlcp);
strptr.original().set_lcp(num_lt, depth + rlcp);
strptr.original().set_cache(num_lt, getCharAtDepth(pivot, rlcp));
}
if (num_gt > 0)
{
key_type min_gt = get_char<key_type>(strptr.original().out(num_lt + num_eq), depth);
unsigned int rlcp = lcpKeyType(pivot, min_gt);
DBG(debug_lcp, "LCP pivot with gt: " << depth + rlcp);
strptr.original().set_lcp(num_lt + num_eq, depth + rlcp);
strptr.original().set_cache(num_lt + num_eq, getCharAtDepth(min_gt, rlcp));
}
#endif
}
};
size_t ms_pop_front;
std::vector<MKQSStep> ms_stack;
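// iterative multikey quicksort with key caching, driven by an explicit
// stack; partitions below g_inssort_threshold fall back to insertion sort,
// and equal partitions recurse at depth + 8 unless the pivot ended the
// strings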
void sort_mkqs_cache(Context& ctx, const StringPtr& strptr, size_t depth)
{
ScopedTimerKeeperMT tm_mkqs(ctx.timers, TM_MKQS);
if (strptr.size() < g_inssort_threshold) {
ScopedTimerKeeperMT tm_inssort(ctx.timers, TM_INSSORT);
insertion_sort(strptr.copy_back(), depth);
ctx.donesize(strptr.size(), thrid);
return;
}
if (bktcache_size < strptr.size() * sizeof(key_type)) {
delete [] bktcache;
bktcache = (uint16_t*)new key_type[strptr.size()];
bktcache_size = strptr.size() * sizeof(key_type);
}
key_type* cache = (key_type*)bktcache;
assert(ms_pop_front == 0);
assert(ms_stack.size() == 0);
ms_stack.push_back( MKQSStep(ctx, strptr, cache, depth, true) );
while ( ms_stack.size() > ms_pop_front )
{
MKQSStep& ms = ms_stack.back();
++ms.idx;
if (ms.idx == 1)
{
if (ms.num_lt == 0)
;
else if (ms.num_lt < g_inssort_threshold) {
ScopedTimerKeeperMT tm_inssort(ctx.timers, TM_INSSORT);
insertion_sort_cache<false>(ms.strptr.sub(0, ms.num_lt),
ms.cache, ms.depth);
ctx.donesize(ms.num_lt, thrid);
}
else {
ms_stack.push_back(
MKQSStep(ctx,
ms.strptr.sub(0, ms.num_lt),
ms.cache, ms.depth, false) );
}
}
else if (ms.idx == 2)
{
StringPtr sp = ms.strptr.sub(ms.num_lt, ms.num_eq);
assert(ms.num_eq > 0);
if (!ms.eq_recurse) {
sp = sp.copy_back();
#if CALC_LCP_MKQS == 1
sp.fill_lcp(ms.depth + ms.lcp_eq);
#elif CALC_LCP_MKQS == 2
sp.fill_lcp(ms.depth + lcpKeyDepth(ms.pivot));
#endif
ctx.donesize(sp.size(), thrid);
}
else if (ms.num_eq < g_inssort_threshold) {
ScopedTimerKeeperMT tm_inssort(ctx.timers, TM_INSSORT);
insertion_sort_cache<true>(sp, ms.cache + ms.num_lt,
ms.depth + sizeof(key_type));
ctx.donesize(ms.num_eq, thrid);
}
else {
ms_stack.push_back(
MKQSStep(ctx, sp,
ms.cache + ms.num_lt,
ms.depth + sizeof(key_type), true) );
}
}
else if (ms.idx == 3)
{
StringPtr sp = ms.strptr.sub(ms.num_lt + ms.num_eq, ms.num_gt);
if (ms.num_gt == 0)
;
else if (ms.num_gt < g_inssort_threshold) {
ScopedTimerKeeperMT tm_inssort(ctx.timers, TM_INSSORT);
insertion_sort_cache<false>(sp, ms.cache + ms.num_lt + ms.num_eq,
ms.depth);
ctx.donesize(ms.num_gt, thrid);
}
else {
ms_stack.push_back(
MKQSStep(ctx, sp,
ms.cache + ms.num_lt + ms.num_eq,
ms.depth, false) );
}
}
else
{
assert( ms_stack.size() > ms_pop_front );
ms_stack.back().calculate_lcp();
ms_stack.pop_back();
}
if (use_work_sharing && ctx.jobqueue.has_idle()) {
sample_sort_free_work(ctx);
}
}
}
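// donate up to eight of the lowest MKQS stack levels to idle threads as jobs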
void mkqs_free_work(Context& ctx)
{
assert(ms_stack.size() >= ms_pop_front);
for (unsigned int fl = 0; fl < 8; ++fl)
{
if (ms_stack.size() == ms_pop_front) {
return;
}
DBG(debug_jobs, "Freeing top level of SmallsortJob's mkqs stack - size " << ms_stack.size());
MKQSStep& ms = ms_stack[ms_pop_front];
if (ms.idx == 0 && ms.num_lt != 0)
{
substep_add();
Enqueue<Classify>(ctx, this, ms.strptr.sub(0, ms.num_lt), ms.depth);
}
if (ms.idx <= 1)
{
assert(ms.num_eq > 0);
StringPtr sp = ms.strptr.sub(ms.num_lt, ms.num_eq);
if (ms.eq_recurse) {
substep_add();
Enqueue<Classify>(ctx, this, sp,
ms.depth + sizeof(key_type));
}
else {
sp = sp.copy_back();
#if CALC_LCP_MKQS == 1
sp.fill_lcp(ms.depth + ms.lcp_eq);
#elif CALC_LCP_MKQS == 2
sp.fill_lcp(ms.depth + lcpKeyDepth(ms.pivot));
#endif
ctx.donesize(ms.num_eq, thrid);
}
}
if (ms.idx <= 2 && ms.num_gt != 0)
{
substep_add();
Enqueue<Classify>(ctx, this,
ms.strptr.sub(ms.num_lt + ms.num_eq, ms.num_gt), ms.depth);
}
++ms_pop_front;
}
}
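// all substeps finished: compute the deferred LCPs of the remaining stack
// levels bottom-up, then notify the parent step (LCP mode only)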
virtual void substep_all_done()
{
#if CALC_LCP
DBG(debug_recursion, "SmallSort[" << in_depth << "] all substeps done -> LCP calculation");
while (ms_pop_front > 0) {
DBG(debug_lcp, "SmallSort[" << in_depth << "] ms_pop_front: " << ms_pop_front);
ms_stack[--ms_pop_front].calculate_lcp();
}
while (ss_pop_front > 0) {
DBG(debug_lcp, "SmallSort[" << in_depth << "] ss_pop_front: " << ss_pop_front);
ss_stack[--ss_pop_front].calculate_lcp();
}
if (pstep) pstep->substep_notify_done();
delete this;
#endif
}
};
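// SampleSortStep: one fully parallel sample sort step in three job phases: a
// SampleJob builds the classifier from a random sample; one CountJob per
// part classifies its strings and counts bucket sizes; after a global prefix
// sum, one DistributeJob per part permutes the strings into the shadow
// array, and the last one to finish enqueues the recursive subjobs.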
template <typename Context, template <size_t> class Classify, typename StringPtr>
struct SampleSortStep : public SortStep
{
#if 0
static const size_t numsplitters2 = 16;
#else
static const size_t numsplitters2 = (l2cache - sizeof(size_t)) / (2 * sizeof(size_t));
#endif
static const size_t treebits = logfloor_<numsplitters2>::value;
static const size_t numsplitters = (1 << treebits) - 1;
static const size_t bktnum = 2 * numsplitters + 1;
typedef typename Context::job_type job_type;
SortStep* pstep;
StringPtr strptr;
size_t depth;
unsigned int parts;
size_t psize;
std::atomic<unsigned int> pwork;
Classify<treebits> classifier;
unsigned char splitter_lcp[numsplitters+1];
size_t* bkt[MAXPROCS];
uint16_t* bktcache[MAXPROCS];
struct SampleJob : public job_type
{
SampleSortStep* step;
SampleJob(SampleSortStep* _step)
: step(_step) { }
virtual bool run(Context& ctx)
{
ScopedTimerKeeperMT tm_seq_ss(ctx.timers, TM_PARA_SS);
step->sample(ctx);
return true;
}
};
struct CountJob : public job_type
{
SampleSortStep* step;
unsigned int p;
CountJob(SampleSortStep* _step, unsigned int _p)
: step(_step), p(_p) { }
virtual bool run(Context& ctx)
{
ScopedTimerKeeperMT tm_seq_ss(ctx.timers, TM_PARA_SS);
step->count(p, ctx);
return true;
}
};
struct DistributeJob : public job_type
{
SampleSortStep* step;
unsigned int p;
DistributeJob(SampleSortStep* _step, unsigned int _p)
: step(_step), p(_p) { }
virtual bool run(Context& ctx)
{
ScopedTimerKeeperMT tm_seq_ss(ctx.timers, TM_PARA_SS);
step->distribute(p, ctx);
return true;
}
};
SampleSortStep(Context& ctx, SortStep* _pstep,
const StringPtr& _strptr, size_t _depth)
: pstep(_pstep), strptr(_strptr), depth(_depth)
{
parts = strptr.size() / ctx.sequential_threshold() * 2;
if (parts == 0) parts = 1;
if (parts > MAXPROCS) parts = MAXPROCS;
psize = (strptr.size() + parts-1) / parts;
DBG(debug_steps, "enqueue depth=" << depth << " size=" << strptr.size() << " parts=" << parts << " psize=" << psize << " flip=" << strptr.flipped());
ctx.jobqueue.enqueue( new SampleJob(this) );
++ctx.para_ss_steps;
}
void sample(Context& ctx)
{
DBG(debug_jobs, "Process SampleJob @ " << this);
const size_t oversample_factor = 2;
size_t samplesize = oversample_factor * numsplitters;
string* strings = strptr.active();
key_type samples[ samplesize ];
LCGRandom rng(&samples);
for (unsigned int i = 0; i < samplesize; ++i)
{
samples[i] = get_char<key_type>(strings[ rng() % strptr.size() ], depth);
}
std::sort(samples, samples + samplesize);
classifier.build(samples, samplesize, splitter_lcp);
pwork = parts;
for (unsigned int p = 0; p < parts; ++p)
ctx.jobqueue.enqueue( new CountJob(this, p) );
}
void count(unsigned int p, Context& ctx)
{
DBG(debug_jobs, "Process CountJob " << p << " @ " << this);
string* strB = strptr.active() + p * psize;
string* strE = strptr.active() + std::min((p+1) * psize, strptr.size());
if (strE < strB) strE = strB;
uint16_t* mybktcache = bktcache[p] = new uint16_t[strE-strB];
classifier.classify(strB, strE, mybktcache, depth);
size_t* mybkt = bkt[p] = new size_t[bktnum + (p == 0 ? 1 : 0)];
memset(mybkt, 0, bktnum * sizeof(size_t));
for (uint16_t* bc = mybktcache; bc != mybktcache + (strE-strB); ++bc)
++mybkt[ *bc ];
if (--pwork == 0)
count_finished(ctx);
}
void count_finished(Context& ctx)
{
DBG(debug_jobs, "Finishing CountJob " << this << " with prefixsum");
if (use_only_first_sortstep)
return;
size_t sum = 0;
for (unsigned int i = 0; i < bktnum; ++i)
{
for (unsigned int p = 0; p < parts; ++p)
{
bkt[p][i] = (sum += bkt[p][i]);
}
}
assert(sum == strptr.size());
pwork = parts;
for (unsigned int p = 0; p < parts; ++p)
ctx.jobqueue.enqueue( new DistributeJob(this, p) );
}
void distribute(unsigned int p, Context& ctx)
{
DBG(debug_jobs, "Process DistributeJob " << p << " @ " << this);
string* strB = strptr.active() + p * psize;
string* strE = strptr.active() + std::min((p+1) * psize, strptr.size());
if (strE < strB) strE = strB;
string* sorted = strptr.shadow();
uint16_t* mybktcache = bktcache[p];
size_t* mybkt = bkt[p];
for (string* str = strB; str != strE; ++str, ++mybktcache)
sorted[ --mybkt[ *mybktcache ] ] = *str;
if (p != 0)
delete [] bkt[p];
delete [] bktcache[p];
if (--pwork == 0)
distribute_finished(ctx);
}
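// the final DistributeJob performs the recursion: buckets are processed as
// (less-than, equal) pairs per splitter plus the last greater-than bucket,
// with singleton buckets copied back directly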
void distribute_finished(Context& ctx)
{
DBG(debug_jobs, "Finishing DistributeJob " << this << " with enqueuing subjobs");
size_t thrid = PS5_ENABLE_RESTSIZE ? omp_get_thread_num() : 0;
size_t* bkt = this->bkt[0];
assert(bkt);
assert(bkt[0] == 0);
bkt[bktnum] = strptr.size();
substep_add();
size_t i = 0;
while (i < bktnum-1)
{
size_t bktsize = bkt[i+1] - bkt[i];
if (bktsize == 0)
;
else if (bktsize == 1) {
strptr.flip(bkt[i], 1).copy_back();
ctx.donesize(1, thrid);
}
else
{
DBG(debug_recursion, "Recurse[" << depth << "]: < bkt " << bkt[i] << " size " << bktsize << " lcp " << int(splitter_lcp[i/2] & 0x7F));
substep_add();
Enqueue<Classify>(ctx, this, strptr.flip(bkt[i], bktsize), depth + (splitter_lcp[i/2] & 0x7F));
}
++i;
bktsize = bkt[i+1] - bkt[i];
if (bktsize == 0)
;
else if (bktsize == 1) {
strptr.flip(bkt[i], 1).copy_back();
ctx.donesize(1, thrid);
}
else
{
if ( splitter_lcp[i/2] & 0x80 ) {
DBG(debug_recursion, "Recurse[" << depth << "]: = bkt " << bkt[i] << " size " << bktsize << " is done!");
StringPtr sp = strptr.flip(bkt[i], bktsize).copy_back();
sp.fill_lcp(depth + lcpKeyDepth(classifier.get_splitter(i/2)));
ctx.donesize(bktsize, thrid);
}
else {
DBG(debug_recursion, "Recurse[" << depth << "]: = bkt " << bkt[i] << " size " << bktsize << " lcp keydepth!");
substep_add();
Enqueue<Classify>(ctx, this, strptr.flip(bkt[i], bktsize), depth + sizeof(key_type));
}
}
++i;
}
size_t bktsize = bkt[i+1] - bkt[i];
if (bktsize == 0)
;
else if (bktsize == 1) {
strptr.flip(bkt[i], 1).copy_back();
ctx.donesize(1, thrid);
}
else
{
DBG(debug_recursion, "Recurse[" << depth << "]: > bkt " << bkt[i] << " size " << bktsize << " no lcp");
substep_add();
Enqueue<Classify>(ctx, this, strptr.flip(bkt[i], bktsize), depth);
}
substep_notify_done();
#if !CALC_LCP
delete [] bkt;
delete this;
#endif
}
#if CALC_LCP
virtual void substep_all_done()
{
DBG(debug_steps, "pSampleSortStep[" << depth << "]: all substeps done.");
sample_sort_lcp<bktnum>(classifier, strptr.original(), depth, bkt[0]);
delete [] bkt[0];
if (pstep) pstep->substep_notify_done();
delete this;
}
#endif
static inline void put_stats()
{
g_stats >> "l2cache" << size_t(l2cache)
>> "splitter_treebits" << size_t(treebits)
>> "numsplitters" << size_t(numsplitters)
>> "use_work_sharing" << use_work_sharing
>> "use_restsize" << PS5_ENABLE_RESTSIZE
>> "use_lcp_inssort" << use_lcp_inssort;
}
};
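// dispatch a string subset: subsets above the sequential threshold start a
// parallel SampleSortStep, smaller ones a SmallsortJob, using 32-bit bucket
// offsets whenever the subset size permits.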
template <template <size_t> class Classify, typename Context, typename StringPtr>
void Enqueue(Context& ctx, SortStep* pstep,
const StringPtr& strptr, size_t depth)
{
if (strptr.size() > ctx.sequential_threshold() || use_only_first_sortstep) {
new SampleSortStep<Context, Classify, StringPtr>(ctx, pstep, strptr, depth);
}
else {
if (strptr.size() < ((uint64_t)1 << 32))
new SmallsortJob<Context, Classify, StringPtr, uint32_t>
(ctx, pstep, strptr, depth);
else
new SmallsortJob<Context, Classify, StringPtr, uint64_t>
(ctx, pstep, strptr, depth);
}
}
#if CALC_LCP
typedef stringtools::StringShadowLcpPtr StringPtr;
typedef stringtools::StringShadowLcpOutPtr StringOutPtr;
#else
typedef stringtools::StringShadowPtr StringPtr;
typedef stringtools::StringShadowOutPtr StringOutPtr;
#endif
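// frontend: allocate the shadow array, enqueue the root step, and run the
// job queue until completion; afterwards report step counts and timers (an
// LCP verification loop is present but disabled)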
template <template <size_t> class Classify>
void parallel_sample_sort_base(string* strings, size_t n, size_t depth)
{
Context<> ctx;
ctx.totalsize = n;
#if PS5_ENABLE_RESTSIZE
ctx.restsize = n;
#endif
ctx.threadnum = omp_get_max_threads();
SampleSortStep<Context<>, Classify, StringPtr>::put_stats();
string* shadow = new string[n];
StringPtr strptr(strings, shadow, n);
ctx.timers.start(ctx.threadnum);
Enqueue<Classify>(ctx, NULL, strptr, depth);
ctx.jobqueue.loop();
ctx.timers.stop();
#if PS5_ENABLE_RESTSIZE
assert(ctx.restsize.update().get() == 0);
#endif
#if CALC_LCP
if (0)
{
unsigned int err = 0;
for (size_t i = 1; i < n; ++i)
{
string s1 = strptr.out(i-1), s2 = strptr.out(i);
size_t h = calc_lcp(s1, s2);
if (h != strptr.lcp(i)) {
std::cout << "lcp[" << i << "] mismatch " << h << " != " << strptr.lcp(i) << std::endl;
++err;
}
}
std::cout << err << " lcp errors" << std::endl;
}
#endif
delete [] shadow;
g_stats >> "steps_para_sample_sort" << ctx.para_ss_steps
>> "steps_seq_sample_sort" << ctx.seq_ss_steps
>> "steps_base_sort" << ctx.bs_steps;
if (ctx.timers.is_real)
{
g_stats >> "tm_waiting" << ctx.timers.get(TM_WAITING)
>> "tm_jq_work" << ctx.jobqueue.m_timers.get(ctx.jobqueue.TM_WORK)
>> "tm_jq_idle" << ctx.jobqueue.m_timers.get(ctx.jobqueue.TM_IDLE)
>> "tm_para_ss" << ctx.timers.get(TM_PARA_SS)
>> "tm_seq_ss" << ctx.timers.get(TM_SEQ_SS)
>> "tm_mkqs" << ctx.timers.get(TM_MKQS)
>> "tm_inssort" << ctx.timers.get(TM_INSSORT)
>> "tm_sum" << ctx.timers.get_sum();
}
}
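// same frontend, but permuting the sorted strings into a separate output
// array instead of back into the input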
template <template <size_t> class Classify>
void parallel_sample_sort_out_base(string* strings, string* output, size_t n, size_t depth)
{
Context<> ctx;
ctx.totalsize = n;
#if PS5_ENABLE_RESTSIZE
ctx.restsize = n;
#endif
ctx.threadnum = omp_get_max_threads();
SampleSortStep<Context<>, Classify, StringOutPtr>::put_stats();
string* shadow = new string[n];
StringOutPtr strptr(strings, shadow, output, n);
ctx.timers.start(ctx.threadnum);
Enqueue<Classify>(ctx, NULL, strptr, depth);
ctx.jobqueue.loop();
ctx.timers.stop();
#if PS5_ENABLE_RESTSIZE
assert(ctx.restsize.update().get() == 0);
#endif
#if CALC_LCP
if (0)
{
unsigned int err = 0;
for (size_t i = 1; i < n; ++i)
{
string s1 = strptr.out(i-1), s2 = strptr.out(i);
size_t h = calc_lcp(s1, s2);
if (h != strptr.lcp(i)) {
std::cout << "lcp[" << i << "] mismatch " << h << " != " << strptr.lcp(i) << std::endl;
++err;
}
}
std::cout << err << " lcp errors" << std::endl;
}
#endif
delete [] shadow;
g_stats >> "steps_para_sample_sort" << ctx.para_ss_steps
>> "steps_seq_sample_sort" << ctx.seq_ss_steps
>> "steps_base_sort" << ctx.bs_steps;
if (ctx.timers.is_real)
{
g_stats >> "tm_waiting" << ctx.timers.get(TM_WAITING)
>> "tm_jq_work" << ctx.jobqueue.m_timers.get(ctx.jobqueue.TM_WORK)
>> "tm_jq_idle" << ctx.jobqueue.m_timers.get(ctx.jobqueue.TM_IDLE)
>> "tm_para_ss" << ctx.timers.get(TM_PARA_SS)
>> "tm_seq_ss" << ctx.timers.get(TM_SEQ_SS)
>> "tm_mkqs" << ctx.timers.get(TM_MKQS)
>> "tm_inssort" << ctx.timers.get(TM_INSSORT)
>> "tm_sum" << ctx.timers.get_sum();
}
}
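// NUMA variants (LCP mode only): pin work to one NUMA node, or distribute
// several independent inputs over nodes, writing strings, LCPs and cached
// distinguishing characters directly into LCP-cache output arrays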
#if CALC_LCP
void parallel_sample_sort_numa(string *strings, size_t n,
int numaNode, int numberOfThreads,
const LcpCacheStringPtr& output)
{
numa_run_on_node(numaNode);
numa_set_preferred(numaNode);
Context<> ctx;
ctx.totalsize = n;
#if PS5_ENABLE_RESTSIZE
ctx.restsize = n;
#endif
ctx.threadnum = numberOfThreads;
StringShadowLcpCacheOutPtr
strptr(strings, (string*)output.lcps, output.strings,
output.cachedChars, n);
Enqueue<ClassifyUnrollBoth>(ctx, NULL, strptr, 0);
ctx.jobqueue.numaLoop(numaNode, numberOfThreads);
output.firstLcp() = 0;
output.firstCached() = output.firstString()[0];
#if PS5_ENABLE_RESTSIZE
assert(ctx.restsize.update().get() == 0);
#endif
g_stats >> "steps_para_sample_sort" << ctx.para_ss_steps
>> "steps_seq_sample_sort" << ctx.seq_ss_steps
>> "steps_base_sort" << ctx.bs_steps;
}
void parallel_sample_sort_numa2(const StringShadowLcpCacheOutPtr* strptr,
unsigned numInputs)
{
typedef Context<NumaJobQueueGroup> context_type;
context_type::jobqueuegroup_type group;
context_type* ctx[numInputs];
for (unsigned i = 0; i < numInputs; ++i)
{
ctx[i] = new context_type(&group);
ctx[i]->totalsize = strptr[i].size();
#if PS5_ENABLE_RESTSIZE
ctx[i]->restsize = strptr[i].size();
#endif
ctx[i]->threadnum = group.calcThreadNum(i, numInputs);
if (ctx[i]->threadnum == 0)
ctx[i]->threadnum = 1;
Enqueue<ClassifyUnrollBoth>(*ctx[i], NULL, strptr[i], 0);
group.add_jobqueue(&ctx[i]->jobqueue);
}
group.numaLaunch();
for (unsigned i = 0; i < numInputs; ++i)
{
strptr[i].lcparray()[0] = 0;
strptr[i].set_cache(0, strptr[i].out(0)[0]);
#if PS5_ENABLE_RESTSIZE
assert(ctx[i]->restsize.update().get() == 0);
#endif
g_stats >> "steps_para_sample_sort" << ctx[i]->para_ss_steps
>> "steps_seq_sample_sort" << ctx[i]->seq_ss_steps
>> "steps_base_sort" << ctx[i]->bs_steps;
delete ctx[i];
}
}
#endif
static inline void
parallel_sample_sortBTC(string* strings, size_t n)
{
parallel_sample_sort_base<ClassifySimple>(strings, n, 0);
}
CONTESTANT_REGISTER_PARALLEL_LCP(
parallel_sample_sortBTC,
"bingmann/parallel_sample_sortBTC",
"pS5: binary tree, bktcache")
static inline void
parallel_sample_sortBTCU1(string* strings, size_t n)
{
parallel_sample_sort_base<ClassifyUnrollTree>(strings, n, 0);
}
CONTESTANT_REGISTER_PARALLEL_LCP(
parallel_sample_sortBTCU1,
"bingmann/parallel_sample_sortBTCU1",
"pS5: binary tree, bktcache, unroll tree")
static inline void
parallel_sample_sortBTCU2(string* strings, size_t n)
{
parallel_sample_sort_base<ClassifyUnrollBoth>(strings, n, 0);
}
CONTESTANT_REGISTER_PARALLEL_LCP(
parallel_sample_sortBTCU2,
"bingmann/parallel_sample_sortBTCU2",
"pS5: binary tree, bktcache, unroll tree and strings")
static inline void
parallel_sample_sortBTCU2_out(string* strings, size_t n)
{
string* output = new string[n];
parallel_sample_sort_out_base<ClassifyUnrollBoth>(strings, output, n, 0);
memcpy(strings, output, n * sizeof(string));
delete [] output;
}
CONTESTANT_REGISTER_PARALLEL_LCP(
parallel_sample_sortBTCU2_out,
"bingmann/parallel_sample_sortBTCU2_out",
"pS5: binary tree, bktcache, unroll tree and strings, separate output")
template <size_t treebits>
struct ClassifyBinarySearch
{
static const size_t numsplitters = (1 << treebits) - 1;
key_type splitter[numsplitters];
inline unsigned int
find_bkt_tree(const key_type& key) const
{
unsigned int lo = 0, hi = numsplitters;
while ( lo < hi )
{
unsigned int mid = (lo + hi) >> 1;
assert(mid < numsplitters);
if (key <= splitter[mid])
hi = mid;
else
lo = mid + 1;
}
size_t b = lo * 2;
if (lo < numsplitters && splitter[lo] == key) b += 1;
return b;
}
inline void
classify(string* strB, string* strE, uint16_t* bktout, size_t depth) const
{
for (string* str = strB; str != strE; )
{
key_type key = get_char<key_type>(*str++, depth);
unsigned int b = find_bkt_tree(key);
*bktout++ = b;
}
}
inline key_type get_splitter(unsigned int i) const
{ return splitter[i]; }
inline void
build(key_type* samples, size_t samplesize, unsigned char* splitter_lcp)
{
const size_t oversample_factor = samplesize / numsplitters;
DBG(debug_splitter, "oversample_factor: " << oversample_factor);
DBG(debug_splitter, "splitter:");
splitter_lcp[0] = 0;
for (size_t i = 0, j = oversample_factor/2; i < numsplitters; ++i)
{
splitter[i] = samples[j];
DBG(debug_splitter, "key " << toHex(splitter[i]));
if (i != 0) {
key_type xorSplit = splitter[i-1] ^ splitter[i];
DBG1(debug_splitter, " XOR -> " << toHex(xorSplit) << " - ");
DBG3(debug_splitter, count_high_zero_bits(xorSplit) << " bits = "
<< count_high_zero_bits(xorSplit) / 8 << " chars lcp");
splitter_lcp[i] = count_high_zero_bits(xorSplit) / 8;
}
j += oversample_factor;
}
}
};
static inline void
parallel_sample_sortBSC(string* strings, size_t n)
{
parallel_sample_sort_base<ClassifyBinarySearch>(strings, n, 0);
}
CONTESTANT_REGISTER_PARALLEL_LCP(
parallel_sample_sortBSC,
"bingmann/parallel_sample_sortBSC",
"pS5: binary search, bktcache")
#include "bingmann-parallel_sample_sort_equal.h"
#include "bingmann-parallel_sample_sort_treecalc.h"
}