From bf0c5892d85ceab47105bcd2d0b8a9af16ff5725 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 5 Oct 2010 11:51:17 -0300 Subject: [PATCH] Lots of work. --- INSTALL | 238 ++++++++++++++++++++++++++++- Makefile.am | 2 +- configure.ac | 2 +- cxxmph/Makefile.am | 6 +- cxxmph/cmph_hash_map.h | 2 +- cxxmph/mphtable.cc | 124 +++++++++++---- cxxmph/mphtable.h | 85 +++-------- cxxmph/{trigraph.c => trigraph.cc} | 14 +- cxxmph/trigraph.h | 15 +- 9 files changed, 378 insertions(+), 110 deletions(-) rename cxxmph/{trigraph.c => trigraph.cc} (54%) diff --git a/INSTALL b/INSTALL index 1c1a83c..5458714 100644 --- a/INSTALL +++ b/INSTALL @@ -1,6 +1,234 @@ -Run the commands below or refer to the autotools documentation for more -sophisticated options. +Installation Instructions +************************* + +Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005, +2006 Free Software Foundation, Inc. + +This file is free documentation; the Free Software Foundation gives +unlimited permission to copy, distribute and modify it. + +Basic Installation +================== + +Briefly, the shell commands `./configure; make; make install' should +configure, build, and install this package. The following +more-detailed instructions are generic; see the `README' file for +instructions specific to this package. + + The `configure' shell script attempts to guess correct values for +various system-dependent variables used during compilation. It uses +those values to create a `Makefile' in each directory of the package. +It may also create one or more `.h' files containing system-dependent +definitions. Finally, it creates a shell script `config.status' that +you can run in the future to recreate the current configuration, and a +file `config.log' containing compiler output (useful mainly for +debugging `configure'). + + It can also use an optional file (typically called `config.cache' +and enabled with `--cache-file=config.cache' or simply `-C') that saves +the results of its tests to speed up reconfiguring. Caching is +disabled by default to prevent problems with accidental use of stale +cache files. + + If you need to do unusual things to compile the package, please try +to figure out how `configure' could check whether to do them, and mail +diffs or instructions to the address given in the `README' so they can +be considered for the next release. If you are using the cache, and at +some point `config.cache' contains results you don't want to keep, you +may remove or edit it. + + The file `configure.ac' (or `configure.in') is used to create +`configure' by a program called `autoconf'. You need `configure.ac' if +you want to change it or regenerate `configure' using a newer version +of `autoconf'. + +The simplest way to compile this package is: + + 1. `cd' to the directory containing the package's source code and type + `./configure' to configure the package for your system. + + Running `configure' might take a while. While running, it prints + some messages telling which features it is checking for. + + 2. Type `make' to compile the package. + + 3. Optionally, type `make check' to run any self-tests that come with + the package. + + 4. Type `make install' to install the programs and any data files and + documentation. + + 5. You can remove the program binaries and object files from the + source code directory by typing `make clean'. To also remove the + files that `configure' created (so you can compile the package for + a different kind of computer), type `make distclean'. There is + also a `make maintainer-clean' target, but that is intended mainly + for the package's developers. If you use it, you may have to get + all sorts of other programs in order to regenerate files that came + with the distribution. + +Compilers and Options +===================== + +Some systems require unusual options for compilation or linking that the +`configure' script does not know about. Run `./configure --help' for +details on some of the pertinent environment variables. + + You can give `configure' initial values for configuration parameters +by setting variables in the command line or in the environment. Here +is an example: + + ./configure CC=c99 CFLAGS=-g LIBS=-lposix + + *Note Defining Variables::, for more details. + +Compiling For Multiple Architectures +==================================== + +You can compile the package for more than one kind of computer at the +same time, by placing the object files for each architecture in their +own directory. To do this, you can use GNU `make'. `cd' to the +directory where you want the object files and executables to go and run +the `configure' script. `configure' automatically checks for the +source code in the directory that `configure' is in and in `..'. + + With a non-GNU `make', it is safer to compile the package for one +architecture at a time in the source code directory. After you have +installed the package for one architecture, use `make distclean' before +reconfiguring for another architecture. + +Installation Names +================== + +By default, `make install' installs the package's commands under +`/usr/local/bin', include files under `/usr/local/include', etc. You +can specify an installation prefix other than `/usr/local' by giving +`configure' the option `--prefix=PREFIX'. + + You can specify separate installation prefixes for +architecture-specific files and architecture-independent files. If you +pass the option `--exec-prefix=PREFIX' to `configure', the package uses +PREFIX as the prefix for installing programs and libraries. +Documentation and other data files still use the regular prefix. + + In addition, if you use an unusual directory layout you can give +options like `--bindir=DIR' to specify different values for particular +kinds of files. Run `configure --help' for a list of the directories +you can set and what kinds of files go in them. + + If the package supports it, you can cause programs to be installed +with an extra prefix or suffix on their names by giving `configure' the +option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. + +Optional Features +================= + +Some packages pay attention to `--enable-FEATURE' options to +`configure', where FEATURE indicates an optional part of the package. +They may also pay attention to `--with-PACKAGE' options, where PACKAGE +is something like `gnu-as' or `x' (for the X Window System). The +`README' should mention any `--enable-' and `--with-' options that the +package recognizes. + + For packages that use the X Window System, `configure' can usually +find the X include and library files automatically, but if it doesn't, +you can use the `configure' options `--x-includes=DIR' and +`--x-libraries=DIR' to specify their locations. + +Specifying the System Type +========================== + +There may be some features `configure' cannot figure out automatically, +but needs to determine by the type of machine the package will run on. +Usually, assuming the package is built to be run on the _same_ +architectures, `configure' can figure that out, but if it prints a +message saying it cannot guess the machine type, give it the +`--build=TYPE' option. TYPE can either be a short name for the system +type, such as `sun4', or a canonical name which has the form: + + CPU-COMPANY-SYSTEM + +where SYSTEM can have one of these forms: + + OS KERNEL-OS + + See the file `config.sub' for the possible values of each field. If +`config.sub' isn't included in this package, then this package doesn't +need to know the machine type. + + If you are _building_ compiler tools for cross-compiling, you should +use the option `--target=TYPE' to select the type of system they will +produce code for. + + If you want to _use_ a cross compiler, that generates code for a +platform different from the build platform, you should specify the +"host" platform (i.e., that on which the generated programs will +eventually be run) with `--host=TYPE'. + +Sharing Defaults +================ + +If you want to set default values for `configure' scripts to share, you +can create a site shell script called `config.site' that gives default +values for variables like `CC', `cache_file', and `prefix'. +`configure' looks for `PREFIX/share/config.site' if it exists, then +`PREFIX/etc/config.site' if it exists. Or, you can set the +`CONFIG_SITE' environment variable to the location of the site script. +A warning: not all `configure' scripts look for a site script. + +Defining Variables +================== + +Variables not defined in a site shell script can be set in the +environment passed to `configure'. However, some packages may run +configure again during the build, and the customized values of these +variables may be lost. In order to avoid this problem, you should set +them in the `configure' command line, using `VAR=value'. For example: + + ./configure CC=/usr/local2/bin/gcc + +causes the specified `gcc' to be used as the C compiler (unless it is +overridden in the site shell script). + +Unfortunately, this technique does not work for `CONFIG_SHELL' due to +an Autoconf bug. Until the bug is fixed you can use this workaround: + + CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash + +`configure' Invocation +====================== + +`configure' recognizes the following options to control how it operates. + +`--help' +`-h' + Print a summary of the options to `configure', and exit. + +`--version' +`-V' + Print the version of Autoconf used to generate the `configure' + script, and exit. + +`--cache-file=FILE' + Enable the cache: use and save the results of the tests in FILE, + traditionally `config.cache'. FILE defaults to `/dev/null' to + disable caching. + +`--config-cache' +`-C' + Alias for `--cache-file=config.cache'. + +`--quiet' +`--silent' +`-q' + Do not print messages saying which checks are being made. To + suppress all normal output, redirect it to `/dev/null' (any error + messages will still be shown). + +`--srcdir=DIR' + Look for the package's source code in directory DIR. Usually + `configure' can determine that directory automatically. + +`configure' also accepts some other, not widely useful, options. Run +`configure --help' for more details. -./configure --prefix=/usr -make -sudo make install diff --git a/Makefile.am b/Makefile.am index fc9a62a..0569dc0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = src tests examples man +SUBDIRS = src tests examples cxxmph man EXTRA_DIST = cmph.spec configure.ac cmph.pc.in pkgconfigdir = $(libdir)/pkgconfig diff --git a/configure.ac b/configure.ac index 01c3343..7f0e2a2 100644 --- a/configure.ac +++ b/configure.ac @@ -37,4 +37,4 @@ dnl Checks for library functions. AC_CHECK_SPOON dnl AC_OUTPUT(Makefile tests/Makefile samples/Makefile) -AC_OUTPUT(Makefile src/Makefile tests/Makefile examples/Makefile man/Makefile cmph.pc) +AC_OUTPUT(Makefile src/Makefile cxxmph/Makefile tests/Makefile examples/Makefile man/Makefile cmph.pc) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index fda6742..e29b81e 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,6 +1,8 @@ bin_PROGRAMS = cmph_hash_map_test +lib_LTLIBRARIES = libcxxmph.la -INCLUDES = -I../src/ +libcxxmph_la_SOURCES = trigragh.h trigraph.cc +libcxxmph_la_LDFLAGS = -version-info 0:0:0 -cmph_hash_map_test_LDADD = ../src/libcmph.la +cmph_hash_map_test_LDADD = libcxxmph.la cmph_hash_map_test_SOURCES = cmph_hash_map_test.cc diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 55ef648..3923dc8 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -1,4 +1,4 @@ -#include +#include #include #include // for std::pair diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index b4de79d..7b79d0d 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -1,37 +1,105 @@ #include -template struct bitcount { - enum { value = (n & mask ? 1:0) + bitcount> 1>::value }; -}; -template struct bitcount { enum { value = 0 }; }; +#include "mphtable.h" -template struct bitposition { - enum +using std::vector; -template class CompileTimeByteTable { - public: - CompileTimeByteTable : current(op::value) { } - int operator[] (int i) { return *(¤t + i); } -private: - unsigned char current; - CompileTimeByteTable next; -}; +template +template -static CompileTimeByteTable<256, bitcount> BitcountTable; +void MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { + TableBuilderState st; + st.c = 1.23; + st.b = 7; + st.m = end - begin; + st.r = static_cast(ceil((st.c*st.m)/3)); + if ((st.r % 2) == 0) st.r += 1; + st.n = 3*st.r; + st.k = 1U << st.b; + st.ranktablesize = static_cast( + ceil(st.n / static_cast(st.k))); + st.graph_builder = TriGraph(st.m, st.n); // giant copy + st.edges_queue.resize(st.m) - -#define mix(a,b,c) \ -{ \ - a -= b; a -= c; a ^= (c>>13); \ - b -= c; b -= a; b ^= (a<<8); \ - c -= a; c -= b; c ^= (b>>13); \ - a -= b; a -= c; a ^= (c>>12); \ - b -= c; b -= a; b ^= (a<<16); \ - c -= a; c -= b; c ^= (b>>5); \ - a -= b; a -= c; a ^= (c>>3); \ - b -= c; b -= a; b ^= (a<<10); \ - c -= a; c -= b; c ^= (b>>15); \ + int iterations = 1000; + while (1) { + hasher hasher0 = HashFcn(); + ok = Mapping(st.graph_builder, st.edges_queue); + if (ok) break; + else --iterations; + if (iterations == 0) break; + } + if (iterations == 0) return false; + vector graph; + st.graph_builder.ExtractEdgesAndClear(&graph); + Assigning(graph, st.edges_queue); + vector().swap(st.edges_queue); + Ranking(graph); + } +template +int MPHTable::GenerateQueue( + cmph_uint32 nedges, cmph_uint32 nvertices, + TriGraph* graph, Queue* queue) { + cmph_uint32 queue_head = 0, queue_tail = 0; + // Relies on vector using 1 bit per element + vector marked_edge((nedges >> 3) + 1, false); + queue->swap(Queue(nvertices, 0)); + for (int i = 0; i < nedges; ++i) { + TriGraph::Edge e = graph.edges[i].vertices; + if (graph.vertex_degree_[e.vertices[0]] == 1 || + graph.vertex_degree_[e.vertices[1]] == 1 || + graph.vertex_degree[e.vertices[2]] == 1) { + if (!marked_edge[i]) { + (*queue)[queue_head++] = i; + marked_edge[i] = true; + } + } + } + while (queue_tail != queue_head) { + cmph_uint32 current_edge = (*queue)[queue_tail++]; + graph->RemoveEdge(current_edge); + TriGraph::Edge e = graph->edges[current_edge]; + for (int i = 0; i < 3; ++i) { + cmph_uint32 v = e.vertices[i]; + if (graph->vertex_degree[v] == 1) { + cmph_uint32 first_edge = graph->first_edge_[v]; + if (!marked_edge[first_edge) { + queue[queue_head++] = first_edge; + marked_edge[first_edge] = true; + } + } + } + } + vector().swap(marked_edge); + return queue_head - nedges; +} -static const int kMaskStepSelectTable = std::limit::max; +template +int MPHTable::Mapping(TriGraph* graph, Queue* queue) { + int cycles = 0; + graph->Reset(m, n); + for (ForwardIterator it = begin_; it != end_; ++it) { + cmph_uint32 hash_values[3]; + for (int i = 0; i < 3; ++i) { + hash_values[i] = hasher_(*it); + } + cmph_uint32 v0 = hash_values[0] % bdz->r; + cmph_uint32 v1 = hash_values[1] % bdz->r + bdz->r; + cmph_uint32 v2 = hash_values[2] % bdz->r + (bdz->r << 1); + graph->AddEdge(Edge(v0, v1, v2)); + } + cycles = GenerateQueue(bdz->m, bdz->n, queue, graph); + return cycles == 0; +} + +void MPHTable::Assigning(TriGraph* graph, Queue* queue) { +} +void MPHTable::Ranking(TriGraph* graph, Queue* queue) { +} +cmph_uint32 MPHTable::Search(const key_type& key) { +} + +cmph_uint32 MPHTable::Rank(const key_type& key) { +} diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index a72dcb6..309ce7f 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -1,83 +1,44 @@ // Minimal perfect hash abstraction implementing the BDZ algorithm +#include + #include "trigraph.h" -template +template > class MPHTable { public: typedef Key key_type; + typedef NewRandomlySeededHashFcn hasher; MPHTable(); ~MPHTable(); - template + template bool Reset(ForwardIterator begin, ForwardIterator end); cmph_uint32 index(const key_type& x) const; private: - typedef vector Queue; + typedef std::vector Queue; + template + struct TableBuilderState { + ForwardIterator begin; + ForwardIterator end; + Queue edges_queue; + TriGraph graph_builder; + double c; + cmph_uint32 m; + cmph_uint32 n; + cmph_uint32 k; + cmph_uint32 ranktablesize; + }; int GenerateQueue( cmph_uint32 nedges, cmph_uint32 nvertices, TriGraph* graph, Queue* queue); + void Assigning(TriGraph* graph, Queue* queue); + void Ranking(TriGraph* graph, Queue* queue); + cmph_uint32 Search(const StringPiece& key); + cmph_uint32 Rank(const StringPiece& key); - // Generates three hash values for k in a single pass. - static hash_vector(cmph_uint32 seed, const char* k, cmph_uint32 keylen, cmph_uint32* hashes) ; + std::vector graph_; }; -int MPHTable::GenerateQueue( - cmph_uint32 nedges, cmph_uint32 nvertices, -TriGraph* graph, Queue* queue) { - cmph_uint32 queue_head = 0, queue_tail = 0; - vector marked_edge((nedges >> 3) + 1, false); - queue->swap(Queue(nvertices, 0)); - for (int i = 0; i < nedges; ++i) { - TriGraph::Edge e = graph.edges[i].vertices; - if (graph.vertex_degree_[e.vertices[0]] == 1 || - graph.vertex_degree_[e.vertices[1]] == 1 || - graph.vertex_degree[e.vertices[2]] == 1) { - if (!marked_edge[i]) { - (*queue)[queue_head++] = i; - marked_edge[i] = true; - } - } - } - while (queue_tail != queue_head) { - cmph_uint32 current_edge = (*queue)[queue_tail++]; - graph->RemoveEdge(current_edge); - TriGraph::Edge e = graph->edges[current_edge]; - for (int i = 0; i < 3; ++i) { - cmph_uint32 v = e.vertices[i]; - if (graph->vertex_degree[v] == 1) { - cmph_uint32 first_edge = graph->first_edge_[v]; - if (!marked_edge[first_edge) { - queue[queue_head++] = first_edge; - marked_edge[first_edge] = true; - } - } - } - } - marked_edge.swap(vector()); - return queue_head - nedges; -} -int MPHTable::Mapping(TriGraph* graph, Queue* queue) { - int cycles = 0; - cmph_uint32 hl[3]; - graph->Reset(m, n); - ForwardIterator it = begin; - for (cmph_uint32 e = 0; e < end - begin; ++e) { - cmph_uint32 h0, h1, h2; - StringPiece key = *it; - hash_vector(bdz->hl, key.data(), key.len(), hl); - h0 = hl[0] % bdz->r; - h1 = hl[1] % bdz->r + bdz->r; - h2 = hl[2] % bdz->r + (bdz->r << 1); - AddEdge(graph, h0, h1, h2); - } - cycles = GenerateQueue(bdz->m, bdz->n, queue, graph); - return cycles == 0; -} - -void MPHTable::Assigning(TriGraph* graph, Queue* queue); -void MPHTable::Ranking(TriGraph* graph, Queue* queue); -cmph_uint32 MPHTable::Search(const StringPiece& key); -cmph_uint32 MPHTable::Rank(const StringPiece& key); diff --git a/cxxmph/trigraph.c b/cxxmph/trigraph.cc similarity index 54% rename from cxxmph/trigraph.c rename to cxxmph/trigraph.cc index b156416..89b6721 100644 --- a/cxxmph/trigraph.c +++ b/cxxmph/trigraph.cc @@ -1,18 +1,22 @@ +#include + #include "trigraph.h" +using std::vector; + namespace { -static const cmph_uint8 kInvalidEdge = std::limits::max; +static const cmph_uint8 kInvalidEdge = std::numeric_limits::max(); } TriGraph::TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices) : nedges_(0), - edges_(nedges, 0), + edges_(nedges), first_edge_(nvertices, kInvalidEdge), vertex_degree_(nvertices, 0) { } -void Trigraph::ExtractEdgesAndClear(vector* edges) { - first_edge_.swap(vector()); - vertex_degree_.swap(vector()); +void TriGraph::ExtractEdgesAndClear(vector* edges) { + vector().swap(first_edge_); + vector().swap(vertex_degree_); nedges_ = 0; edges->swap(edges_); } diff --git a/cxxmph/trigraph.h b/cxxmph/trigraph.h index aacf101..e4f8440 100644 --- a/cxxmph/trigraph.h +++ b/cxxmph/trigraph.h @@ -1,5 +1,10 @@ +#include + +#include "../src/cmph_types.h" + class TriGraph { struct Edge { + Edge() { } Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2); cmph_uint32 vertices[3]; }; @@ -9,13 +14,13 @@ class TriGraph { }; TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices); - void AddEdge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2); + void AddEdge(const Edge& edge); void RemoveEdge(cmph_uint32 current_edge); - void ExtractEdgesAndClear(vector* edges); + void ExtractEdgesAndClear(std::vector* edges); private: cmph_uint32 nedges_; - vector edges_; - vector first_edge_; - vector vertex_degree_; + std::vector edges_; + std::vector first_edge_; + std::vector vertex_degree_; };