http://stxxl.sourceforge.net
<jmehnert@mpi-sb.mpg.de>
<tb@panthema.net>
<daniel.feist@student.kit.edu>
http://www.boost.org/LICENSE_1_0.txt
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <string>
#include <vector>
#include <stxxl/algorithm>
#include <stxxl/cmdline>
#include <stxxl/io>
#include <stxxl/random>
#include <stxxl/sorter>
#include <stxxl/stats>
#include <stxxl/stream>
#include <stxxl/vector>
#include <stxxl/bits/common/uint_types.h>
using stxxl::uint64;
using stxxl::external_size_type;
namespace stream = stxxl::stream;
uint64 ram_use = 1024 * 1024 * 1024;
typedef unsigned char alphabet_type;
typedef external_size_type size_type;
@param
@param
template <typename InputT, typename InputSA>
bool sacheck(InputT& inputT, InputSA& inputSA)
{
typedef typename InputSA::value_type offset_type;
typedef stxxl::tuple<offset_type, offset_type> pair_type;
typedef stxxl::tuple<offset_type, offset_type, offset_type> triple_type;
typedef stxxl::stream::counter<offset_type> index_counter_type;
index_counter_type index_counter;
typedef stream::make_tuple<index_counter_type, InputSA> tuple_index_sa_type;
tuple_index_sa_type tuple_index_sa(index_counter, inputSA);
typedef stxxl::tuple_less2nd<pair_type> pair_less_type;
typedef typename stream::sort<tuple_index_sa_type, pair_less_type> build_isa_type;
build_isa_type build_isa(tuple_index_sa, pair_less_type(), ram_use / 3);
typedef stxxl::tuple_less1st<triple_type> triple_less_type;
typedef typename stream::use_push<triple_type> triple_push_type;
typedef typename stream::runs_creator<triple_push_type, triple_less_type> triple_rc_type;
typedef typename stream::runs_merger<typename triple_rc_type::sorted_runs_type, triple_less_type> triple_rm_type;
triple_rc_type triple_rc(triple_less_type(), ram_use / 3);
size_type totalSize;
{
offset_type prev_isa = (*build_isa).first;
offset_type counter = 0;
while (!build_isa.empty())
{
if ((*build_isa).second != counter) {
std::cout << "Error: suffix array is not a permutation of 0..n-1." << std::endl;
return false;
}
++counter;
++build_isa;
if (!build_isa.empty()) {
triple_rc.push(triple_type(prev_isa, *inputT, (*build_isa).first));
prev_isa = (*build_isa).first;
}
++inputT;
}
totalSize = counter;
}
if (totalSize == 1) return true;
triple_rm_type triple_rm(triple_rc.result(), triple_less_type(), ram_use / 3);
{
triple_type prev_triple = *triple_rm;
size_type counter = 0;
++triple_rm;
while (!triple_rm.empty())
{
const triple_type& this_triple = *triple_rm;
if (prev_triple.second > this_triple.second)
{
std::cout << "Error: suffix array position " << counter << " ordered incorrectly." << std::endl;
return false;
}
else if (prev_triple.second == this_triple.second)
{
if (this_triple.third == (offset_type)totalSize) {
std::cout << "Error: suffix array position " << counter << " ordered incorrectly." << std::endl;
return false;
}
if (prev_triple.third != (offset_type)totalSize && prev_triple.third > this_triple.third) {
std::cout << "Error: suffix array position " << counter << " ordered incorrectly." << std::endl;
return false;
}
}
prev_triple = this_triple;
++triple_rm;
++counter;
}
}
return true;
}
template <typename InputT, typename InputSA>
bool sacheck_vectors(InputT& inputT, InputSA& inputSA)
{
typename stream::streamify_traits<typename InputT::iterator>::stream_type streamT
= stream::streamify(inputT.begin(), inputT.end());
typename stream::streamify_traits<typename InputSA::iterator>::stream_type streamSA
= stream::streamify(inputSA.begin(), inputSA.end());
return sacheck(streamT, streamSA);
}
template <typename offset_type>
class skew
{
public:
typedef stxxl::tuple<offset_type, offset_type> skew_pair_type;
typedef stxxl::tuple<offset_type, offset_type, offset_type> skew_triple_type;
typedef stxxl::tuple<offset_type, offset_type, offset_type, offset_type> skew_quad_type;
typedef stxxl::tuple<offset_type, offset_type, offset_type, offset_type, offset_type> skew_quint_type;
typedef typename stxxl::VECTOR_GENERATOR<offset_type, 1, 2>::result offset_array_type;
typedef stream::vector_iterator2stream<typename offset_array_type::iterator> offset_array_it_rg;
struct less_mod0
{
typedef skew_quint_type value_type;
bool operator () (const value_type& a, const value_type& b) const
{
if (a.second == b.second)
return a.fourth < b.fourth;
else
return a.second < b.second;
}
static value_type min_value() { return value_type::min_value(); }
static value_type max_value() { return value_type::max_value(); }
};
typedef stxxl::tuple_less2nd<skew_quad_type> less_mod1;
typedef stxxl::tuple_less2nd<skew_quint_type> less_mod2;
struct less_skew
{
typedef skew_pair_type value_type;
bool operator () (const value_type& a, const value_type& b) const
{
if ((a.first & 1) == (b.first & 1))
return a.first < b.first;
else
return (a.first & 1) < (b.first & 1);
}
static value_type min_value() { return value_type::min_value(); }
static value_type max_value() { return value_type::max_value(); }
};
template <typename alphabet_type>
struct less_quad
{
typedef stxxl::tuple<offset_type, alphabet_type, alphabet_type, alphabet_type> value_type;
bool operator () (const value_type& a, const value_type& b) const
{
if (a.second == b.second) {
if (a.third == b.third)
return a.fourth < b.fourth;
else
return a.third < b.third;
}
else
return a.second < b.second;
}
static value_type min_value() { return value_type::min_value(); }
static value_type max_value() { return value_type::max_value(); }
};
template <class quad_type>
static inline bool quad_eq(const quad_type& a, const quad_type& b)
{
return (a.second == b.second) && (a.third == b.third) && (a.fourth == b.fourth);
}
template <class Input>
class naming
{
public:
typedef typename Input::value_type quad_type;
typedef skew_pair_type value_type;
private:
Input& A;
bool& unique;
offset_type lexname;
quad_type prev;
skew_pair_type result;
public:
naming(Input& A_, bool& unique_) :
A(A_), unique(unique_), lexname(0)
{
assert(!A.empty());
unique = true;
prev = *A;
result.first = prev.first;
result.second = lexname;
}
const value_type& operator * () const
{
return result;
}
naming& operator ++ ()
{
assert(!A.empty());
++A;
if (A.empty())
return *this;
quad_type curr = *A;
if (!quad_eq(prev, curr)) {
++lexname;
} else {
if (!A.empty() && curr.second != offset_type(0)) {
unique = false;
}
}
result.first = curr.first;
result.second = lexname;
prev = curr;
return *this;
}
bool empty() const
{
return A.empty();
}
};
template <class InputA, class InputB, const int add_alphabet = 0>
class make_pairs
{
public:
typedef stxxl::tuple<typename InputA::value_type, offset_type> value_type;
private:
InputA& A;
InputB& B;
value_type result;
public:
make_pairs(InputA& a, InputB& b)
: A(a), B(b)
{
assert(!A.empty());
assert(!B.empty());
if (!empty()) {
result = value_type(*A, *B + add_alphabet);
}
}
const value_type& operator * () const
{ return result; }
make_pairs& operator ++ ()
{
assert(!A.empty());
assert(!B.empty());
++A;
++B;
if (!A.empty() && !B.empty()) {
result = value_type(*A, *B + add_alphabet);
}
return *this;
}
bool empty() const
{ return (A.empty() || B.empty()); }
};
@param
@param
@param
template <class Input, typename alphabet_type, const int add_alphabet = 0>
class make_quads
{
public:
typedef stxxl::tuple<offset_type, alphabet_type, alphabet_type, alphabet_type> value_type;
private:
Input& A;
value_type current;
offset_type counter;
unsigned int z3z;
bool finished;
offset_array_type& text;
public:
make_quads(Input& data_in_, offset_array_type& text_)
: A(data_in_),
current(0, 0, 0, 0),
counter(0),
z3z(0),
finished(false),
text(text_)
{
assert(!A.empty());
current.first = counter;
current.second = (*A).second + add_alphabet;
++A;
if (!A.empty()) {
current.third = (*A).second + add_alphabet;
++A;
}
else {
current.third = 0;
current.fourth = 0;
}
if (!A.empty()) {
current.fourth = (*A).second + add_alphabet;
}
else {
current.fourth = 0;
}
}
const value_type& operator * () const
{ return current; }
make_quads& operator ++ ()
{
assert(!A.empty() || !finished);
if (current.second != offset_type(0)) {
text.push_back(current.second);
}
if (++z3z == 3) z3z = 0;
current.first = ++counter;
current.second = current.third;
current.third = current.fourth;
if (!A.empty())
++A;
if (!A.empty()) {
current.fourth = (*A).second + add_alphabet;
}
else {
current.fourth = 0;
}
if ((current.second == offset_type(0)) && (z3z != 1)) {
finished = true;
}
return *this;
}
bool empty() const
{ return (A.empty() && finished); }
};
template <class Input>
class extract_mod12
{
public:
typedef typename Input::value_type value_type;
private:
Input& A;
offset_type counter;
offset_type output_counter;
value_type result;
public:
extract_mod12(Input& A_)
: A(A_),
counter(0),
output_counter(0)
{
assert(!A.empty());
++A, ++counter;
if (!A.empty()) {
result = *A;
result.first = output_counter;
}
}
const value_type& operator * () const
{ return result; }
extract_mod12& operator ++ ()
{
assert(!A.empty());
++A, ++counter, ++output_counter;
if (!A.empty() && (counter % 3) == 0) {
++A, ++counter;
}
if (!A.empty()) {
result = *A;
result.first = output_counter;
}
return *this;
}
bool empty() const
{ return A.empty(); }
};
@param<i, t_i, t_{i+1}, ISA12[i+1], ISA12[i+2]>
@param<i, ISA12[i], t_i, ISA12[i+1]>
@param<i, ISA[i], t_i, t_{i+1}, ISA12[i+1]>
template <class Mod0, class Mod1, class Mod2>
class merge_sa
{
public:
typedef offset_type value_type;
private:
Mod0& A;
Mod1& B;
Mod2& C;
skew_quint_type s0;
skew_quad_type s1;
skew_quint_type s2;
int selected;
bool done[3];
offset_type index;
offset_type merge_result;
bool cmp_mod1_less_mod2()
{
assert(!done[1] && !done[2]);
return s1.second < s2.second;
}
bool cmp_mod0_less_mod2()
{
assert(!done[0] && !done[2]);
if (s0.second == s2.third) {
if (s0.third == s2.fourth)
return s0.fifth < s2.fifth;
else
return s0.third < s2.fourth;
}
else
return s0.second < s2.third;
}
bool cmp_mod0_less_mod1()
{
assert(!done[0] && !done[1]);
if (s0.second == s1.third)
return s0.fourth < s1.fourth;
else
return s0.second < s1.third;
}
void merge()
{
assert(!done[0] || !done[1] || !done[2]);
if (done[0])
{
if (done[2] || (!done[1] && cmp_mod1_less_mod2()))
{
selected = 1;
merge_result = s1.first;
}
else
{
selected = 2;
merge_result = s2.first;
}
}
else if (done[1] || cmp_mod0_less_mod1())
{
if (done[2] || cmp_mod0_less_mod2())
{
selected = 0;
merge_result = s0.first;
}
else
{
selected = 2;
merge_result = s2.first;
}
}
else
{
if (done[2] || cmp_mod1_less_mod2())
{
selected = 1;
merge_result = s1.first;
}
else
{
selected = 2;
merge_result = s2.first;
}
}
assert(!done[selected]);
}
public:
bool empty() const
{
return (A.empty() && B.empty() && C.empty());
}
merge_sa(Mod0& x1, Mod1& x2, Mod2& x3)
: A(x1), B(x2), C(x3), selected(-1), index(0)
{
assert(!A.empty());
assert(!B.empty());
assert(!C.empty());
done[0] = false;
done[1] = false;
done[2] = false;
s0 = *A;
s1 = *B;
s2 = *C;
merge();
}
const value_type& operator * () const
{
return merge_result;
}
merge_sa& operator ++ ()
{
if (selected == 0) {
assert(!A.empty());
++A;
if (!A.empty())
s0 = *A;
else
done[0] = true;
}
else if (selected == 1) {
assert(!B.empty());
++B;
if (!B.empty())
s1 = *B;
else
done[1] = true;
}
else {
assert(!C.empty());
assert(selected == 2);
++C;
if (!C.empty())
s2 = *C;
else
done[2] = true;
}
++index;
if (!empty())
merge();
return *this;
}
};
static inline size_t subp_size(size_t n)
{
return (n / 3) * 2 + ((n % 3) == 2);
}
@param
@param
@param
template <class S, class Mod1, class Mod2>
class build_sa
{
public:
typedef offset_type value_type;
static const unsigned int add_rank = 1;
private:
typedef typename stream::use_push<skew_quad_type> mod1_push_type;
typedef typename stream::runs_creator<mod1_push_type, less_mod1> mod1_runs_type;
typedef typename mod1_runs_type::sorted_runs_type sorted_mod1_runs_type;
typedef typename stream::runs_merger<sorted_mod1_runs_type, less_mod1> mod1_rm_type;
typedef typename stream::use_push<skew_quint_type> mod2_push_type;
typedef typename stream::runs_creator<mod2_push_type, less_mod2> mod2_runs_type;
typedef typename mod2_runs_type::sorted_runs_type sorted_mod2_runs_type;
typedef typename stream::runs_merger<sorted_mod2_runs_type, less_mod2> mod2_rm_type;
typedef typename stream::use_push<skew_quint_type> mod0_push_type;
typedef typename stream::runs_creator<mod0_push_type, less_mod0> mod0_runs_type;
typedef typename mod0_runs_type::sorted_runs_type sorted_mod0_runs_type;
typedef typename stream::runs_merger<sorted_mod0_runs_type, less_mod0> mod0_rm_type;
typedef merge_sa<mod0_rm_type, mod1_rm_type, mod2_rm_type> merge_sa_type;
less_mod0 c0;
less_mod1 c1;
less_mod2 c2;
mod1_rm_type* mod1_result;
mod2_rm_type* mod2_result;
mod0_rm_type* mod0_result;
merge_sa_type* vmerge_sa;
S& source;
Mod1& mod_1;
Mod2& mod_2;
offset_type t[3];
offset_type old_t2;
offset_type old_mod2;
bool exists[3];
offset_type mod_one;
offset_type mod_two;
offset_type index;
bool ready;
value_type result;
public:
build_sa(S& source_, Mod1& mod_1_, Mod2& mod_2_, size_t a_size, size_t memsize)
: source(source_), mod_1(mod_1_), mod_2(mod_2_), index(0), ready(false)
{
assert(!source_.empty());
mod0_runs_type mod0_runs(c0, memsize / 4);
mod1_runs_type mod1_runs(c1, memsize / 4);
mod2_runs_type mod2_runs(c2, memsize / 4);
while (!source.empty())
{
exists[0] = false;
exists[1] = false;
exists[2] = false;
if (!source.empty()) {
t[0] = *source;
++source;
exists[0] = true;
}
if (!source.empty()) {
assert(!mod_1.empty());
t[1] = *source;
++source;
mod_one = *mod_1 + add_rank;
++mod_1;
exists[1] = true;
}
if (!source.empty()) {
assert(!mod_2.empty());
t[2] = *source;
++source;
mod_two = *mod_2 + add_rank;
++mod_2;
exists[2] = true;
}
assert(t[0] != offset_type(0));
assert(t[1] != offset_type(0));
assert(t[2] != offset_type(0));
if (exists[2]) {
mod0_runs.push(skew_quint_type(index, t[0], t[1], mod_one, mod_two));
mod1_runs.push(skew_quad_type(index + 1, mod_one, t[1], mod_two));
if (index != offset_type(0)) {
mod2_runs.push(skew_quint_type((index - 1), old_mod2, old_t2, t[0], mod_one));
}
}
else if (exists[1]) {
mod0_runs.push(skew_quint_type(index, t[0], t[1], mod_one, 0));
mod1_runs.push(skew_quad_type(index + 1, mod_one, t[1], 0));
if (index != offset_type(0)) {
mod2_runs.push(skew_quint_type((index - 1), old_mod2, old_t2, t[0], mod_one));
}
}
else {
assert(exists[0]);
mod0_runs.push(skew_quint_type(index, t[0], 0, 0, 0));
if (index != offset_type(0)) {
mod2_runs.push(skew_quint_type((index - 1), old_mod2, old_t2, t[0], 0));
}
}
old_mod2 = mod_two;
old_t2 = t[2];
index += 3;
}
if ((a_size % 3) == 0) {
if (index != offset_type(0)) {
mod2_runs.push(skew_quint_type((index - 1), old_mod2, old_t2, 0, 0));
}
}
mod0_runs.deallocate();
mod1_runs.deallocate();
mod2_runs.deallocate();
std::cout << "merging S0 = " << mod0_runs.size() << ", S1 = " << mod1_runs.size()
<< ", S2 = " << mod2_runs.size() << " tuples" << std::endl;
mod0_result = new mod0_rm_type(mod0_runs.result(), less_mod0(), memsize / 5);
mod1_result = new mod1_rm_type(mod1_runs.result(), less_mod1(), memsize / 5);
mod2_result = new mod2_rm_type(mod2_runs.result(), less_mod2(), memsize / 5);
vmerge_sa = new merge_sa_type(*mod0_result, *mod1_result, *mod2_result);
result = *(*vmerge_sa);
}
const value_type& operator * () const
{
return result;
}
build_sa& operator ++ ()
{
assert(vmerge_sa != 0 && !vmerge_sa->empty());
++(*vmerge_sa);
if (!vmerge_sa->empty()) {
result = *(*vmerge_sa);
}
else {
assert(vmerge_sa->empty());
ready = true;
assert(vmerge_sa != NULL);
delete vmerge_sa, vmerge_sa = NULL;
assert(mod0_result != NULL && mod1_result != NULL && mod2_result != NULL);
delete mod0_result, mod0_result = NULL;
delete mod1_result, mod1_result = NULL;
delete mod2_result, mod2_result = NULL;
}
return *this;
}
~build_sa()
{
if (vmerge_sa) delete vmerge_sa;
if (mod0_result) delete mod0_result;
if (mod1_result) delete mod1_result;
if (mod2_result) delete mod2_result;
}
bool empty() const
{
return ready;
}
};
@param
template <class Input>
class algorithm
{
public:
typedef offset_type value_type;
typedef typename Input::value_type alphabet_type;
protected:
bool finished;
unsigned int rec_depth;
protected:
typedef stxxl::stream::counter<offset_type> counter_stream_type;
typedef stxxl::tuple_less1st<skew_pair_type> mod12cmp;
typedef stxxl::sorter<skew_pair_type, mod12cmp> mod12_sorter_type;
typedef stream::choose<mod12_sorter_type, 2> isa_second_type;
typedef build_sa<offset_array_it_rg, isa_second_type, isa_second_type> buildSA_type;
typedef make_pairs<buildSA_type, counter_stream_type> precompute_isa_type;
template <typename RecInputType>
buildSA_type * skew3(RecInputType& p_Input)
{
typedef make_quads<RecInputType, offset_type, 1> make_quads_input_type;
typedef extract_mod12<make_quads_input_type> mod12_quads_input_type;
typedef typename stream::sort<mod12_quads_input_type, less_quad<offset_type> > sort_mod12_input_type;
typedef naming<sort_mod12_input_type> naming_input_type;
mod12_sorter_type m1_sorter(mod12cmp(), ram_use / 5);
mod12_sorter_type m2_sorter(mod12cmp(), ram_use / 5);
typedef stxxl::stream::concatenate<mod12_sorter_type, mod12_sorter_type> concatenation_type;
offset_array_type text;
make_quads_input_type quads_input(p_Input, text);
mod12_quads_input_type mod12_quads_input(quads_input);
sort_mod12_input_type sort_mod12_input(mod12_quads_input, less_quad<offset_type>(), ram_use / 5);
bool unique = false;
naming_input_type names_input(sort_mod12_input, unique);
size_type concat_length = 0;
while (!names_input.empty()) {
const skew_pair_type& tmp = *names_input;
if (tmp.first & 1) {
m2_sorter.push(tmp);
}
else {
m1_sorter.push(tmp);
}
++names_input;
concat_length++;
}
std::cout << "recursion string length = " << concat_length << std::endl;
m1_sorter.sort();
m2_sorter.sort();
if (!unique)
{
std::cout << "not unique -> next recursion level = " << ++rec_depth << std::endl;
concatenation_type concat_mod1mod2(m1_sorter, m2_sorter);
buildSA_type* recType = skew3(concat_mod1mod2);
std::cout << "exit recursion level = " << --rec_depth << std::endl;
counter_stream_type isa_loop_index;
precompute_isa_type isa_pairs(*recType, isa_loop_index);
offset_type special = (concat_length != subp_size(text.size()));
offset_type mod2_pos = offset_type((subp_size(text.size()) >> 1) + (subp_size(text.size()) & 1) + special);
mod12_sorter_type isa1_pair(mod12cmp(), ram_use / 5);
mod12_sorter_type isa2_pair(mod12cmp(), ram_use / 5);
while (!isa_pairs.empty()) {
const skew_pair_type& tmp = *isa_pairs;
if (tmp.first < mod2_pos) {
if (tmp.first + special < mod2_pos)
isa1_pair.push(tmp);
} else {
isa2_pair.push(tmp);
}
++isa_pairs;
}
delete recType;
isa1_pair.finish();
isa2_pair.finish();
offset_array_it_rg input(text.begin(), text.end());
isa1_pair.sort(ram_use / 8);
isa2_pair.sort(ram_use / 8);
isa_second_type isa1(isa1_pair);
isa_second_type isa2(isa2_pair);
return new buildSA_type(input, isa1, isa2, text.size(), ram_use);
}
else
{
std::cout << "unique names!" << std::endl;
isa_second_type isa1(m1_sorter);
isa_second_type isa2(m2_sorter);
offset_array_it_rg source(text.begin(), text.end());
return new buildSA_type(source, isa1, isa2, text.size(), ram_use);
}
}
protected:
typedef make_pairs<counter_stream_type, Input> make_pairs_input_type;
buildSA_type* out_sa;
public:
algorithm(Input& data_in)
: finished(false), rec_depth(0)
{
counter_stream_type dummy;
make_pairs_input_type pairs_input(dummy, data_in);
out_sa = skew3(pairs_input);
}
const value_type& operator * () const
{
return *(*out_sa);
}
algorithm& operator ++ ()
{
assert(out_sa);
assert(!out_sa->empty());
++(*out_sa);
if (out_sa->empty()) {
finished = true;
delete out_sa;
out_sa = NULL;
}
return *this;
}
~algorithm()
{
if (out_sa) delete out_sa;
}
bool empty() const
{
return finished;
}
};
};
template <typename alphabet_type>
static inline std::string dumpC(alphabet_type c)
{
std::ostringstream oss;
if (isalnum(c)) oss << '\'' << (char)c << '\'';
else oss << (int)c;
return oss.str();
}
template <typename InputType>
class cut_stream
{
public:
typedef typename InputType::value_type value_type;
protected:
InputType& m_input;
size_type m_count;
public:
cut_stream(InputType& input, size_type count)
: m_input(input), m_count(count)
{ }
const value_type& operator * () const
{
assert(m_count > 0);
return *m_input;
}
cut_stream& operator ++ ()
{
assert(!empty());
--m_count;
++m_input;
return *this;
}
bool empty() const
{
return (m_count == 0) || m_input.empty();
}
};
alphabet_type unary_generator()
{
return 'a';
}
template <typename offset_type>
int process(const std::string& input_filename, const std::string& output_filename,
size_type sizelimit,
bool text_output_flag, bool check_flag, bool input_verbatim)
{
static const size_t block_size = sizeof(offset_type) * 1024 * 1024 / 2;
typedef typename stxxl::VECTOR_GENERATOR<alphabet_type, 1, 2>::result alphabet_vector_type;
typedef typename stxxl::VECTOR_GENERATOR<offset_type, 1, 2, block_size>::result offset_vector_type;
stxxl::syscall_file* input_file = NULL, * output_file = NULL;
alphabet_vector_type input_vector;
offset_vector_type output_vector;
using stxxl::file;
if (input_verbatim)
{
input_vector.resize(input_filename.size());
std::copy(input_filename.begin(), input_filename.end(), input_vector.begin());
}
else if (input_filename == "random")
{
if (sizelimit == std::numeric_limits<size_type>::max()) {
std::cout << "You must provide -s <size> for generated inputs." << std::endl;
return 1;
}
input_vector.resize(sizelimit);
stxxl::random_number8_r rand8;
stxxl::generate(input_vector.begin(), input_vector.end(), rand8);
}
else if (input_filename == "unary")
{
if (sizelimit == std::numeric_limits<size_type>::max()) {
std::cout << "You must provide -s <size> for generated inputs." << std::endl;
return 1;
}
input_vector.resize(sizelimit);
stxxl::generate(input_vector.begin(), input_vector.end(), unary_generator);
}
else
{
input_file = new stxxl::syscall_file(input_filename, file::RDONLY | file::DIRECT);
alphabet_vector_type file_input_vector(input_file);
input_vector.swap(file_input_vector);
}
if (output_filename.size())
{
output_file = new stxxl::syscall_file(output_filename, file::RDWR | file::CREAT | file::DIRECT);
offset_vector_type file_output_vector(output_file);
output_vector.swap(file_output_vector);
}
stxxl::stats* Stats = stxxl::stats::get_instance();
stxxl::stats_data stats_begin(*Stats);
typedef alphabet_vector_type::bufreader_type input_type;
typedef cut_stream<input_type> cut_input_type;
typedef typename skew<offset_type>::template algorithm<cut_input_type> skew_type;
size_type size = input_vector.size();
if (size > sizelimit) size = sizelimit;
std::cout << "input size = " << size << std::endl;
if (size + 3 >= std::numeric_limits<offset_type>::max()) {
std::cout << "error: input is too long for selected word size!" << std::endl;
return -1;
}
input_type input(input_vector);
cut_input_type cut_input(input, size);
skew_type skew(cut_input);
output_vector.resize(size);
stream::materialize(skew, output_vector.begin(), output_vector.end());
std::cout << "output size = " << output_vector.size() << std::endl;
std::cout << (stxxl::stats_data(*Stats) - stats_begin);
if (text_output_flag)
{
std::cout << std::endl << "resulting suffix array:" << std::endl;
for (unsigned int i = 0; i < output_vector.size(); i++) {
std::cout << i << " : " << output_vector[i] << " : ";
const alphabet_vector_type& cinput = input_vector;
for (unsigned int j = 0; output_vector[i] + j < cinput.size(); j++) {
std::cout << dumpC(cinput[output_vector[i] + j]) << " ";
}
std::cout << std::endl;
}
}
int ret = 0;
if (check_flag)
{
(std::cout << "checking suffix array... ").flush();
if (!sacheck_vectors(input_vector, output_vector)) {
std::cout << "failed!" << std::endl;
ret = -1;
}
else
std::cout << "ok." << std::endl;
}
if (input_file) {
input_vector = alphabet_vector_type();
delete input_file;
}
if (output_file) {
output_vector = offset_vector_type();
delete output_file;
}
return ret;
}
int main(int argc, char* argv[])
{
stxxl::cmdline_parser cp;
cp.set_description("DC3 aka skew3 algorithm for external memory suffix array construction.");
cp.set_author("Jens Mehnert <jmehnert@mpi-sb.mpg.de>, Timo Bingmann <tb@panthema.net>, Daniel Feist <daniel.feist@student.kit.edu>");
std::string input_filename, output_filename;
size_type sizelimit = std::numeric_limits<size_type>::max();
bool text_output_flag = false;
bool check_flag = false;
bool input_verbatim = false;
unsigned wordsize = 32;
cp.add_param_string("input", "Path to input file (or verbatim text).\n The special inputs 'random' and 'unary' generate such text on-the-fly.", input_filename);
cp.add_flag('c', "check", "Check suffix array for correctness.", check_flag);
cp.add_flag('t', "text", "Print out suffix array in readable text.", text_output_flag);
cp.add_string('o', "output", "Output suffix array to given path.", output_filename);
cp.add_flag('v', "verbatim", "Consider \"input\" as verbatim text to construct suffix array on.", input_verbatim);
cp.add_bytes('s', "size", "Cut input text to given size, e.g. 2 GiB.", sizelimit);
cp.add_bytes('M', "memuse", "Amount of RAM to use, default: 1 GiB.", ram_use);
cp.add_uint('w', "wordsize", "Set word size of suffix array to 32, 40 or 64 bit, default: 32-bit.", wordsize);
if (!cp.process(argc, argv))
return -1;
if (wordsize == 32)
return process<stxxl::uint32>(input_filename, output_filename, sizelimit,
text_output_flag, check_flag, input_verbatim);
else if (wordsize == 40)
return process<stxxl::uint40>(input_filename, output_filename, sizelimit,
text_output_flag, check_flag, input_verbatim);
else if (wordsize == 64)
return process<stxxl::uint64>(input_filename, output_filename, sizelimit,
text_output_flag, check_flag, input_verbatim);
else
std::cerr << "Invalid wordsize for suffix array: 32, 40 or 64 are allowed." << std::endl;
return -1;
}