diff --git a/COPYING b/COPYING index 4d2513e..e114d20 100644 --- a/COPYING +++ b/COPYING @@ -1,3 +1,5 @@ The code of the cmph library is dual licensed under the LGPL version 2 and MPL 1.1 licenses. Please refer to the LGPL-2 and MPL-1.1 files in the repository for the full description of each of the licenses. + +For cxxmph, the files stringpiece.h and MurmurHash2 are covered by the BSD and MIT licenses, respectively. diff --git a/INSTALL b/INSTALL deleted file mode 100644 index 7d1c323..0000000 --- a/INSTALL +++ /dev/null @@ -1,365 +0,0 @@ -Installation Instructions -************************* - -Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005, -2006, 2007, 2008, 2009 Free Software Foundation, Inc. - - Copying and distribution of this file, with or without modification, -are permitted in any medium without royalty provided the copyright -notice and this notice are preserved. This file is offered as-is, -without warranty of any kind. - -Basic Installation -================== - - Briefly, the shell commands `./configure; make; make install' should -configure, build, and install this package. The following -more-detailed instructions are generic; see the `README' file for -instructions specific to this package. Some packages provide this -`INSTALL' file but do not implement all of the features documented -below. The lack of an optional feature in a given package is not -necessarily a bug. More recommendations for GNU packages can be found -in *note Makefile Conventions: (standards)Makefile Conventions. - - The `configure' shell script attempts to guess correct values for -various system-dependent variables used during compilation. It uses -those values to create a `Makefile' in each directory of the package. -It may also create one or more `.h' files containing system-dependent -definitions. 
Finally, it creates a shell script `config.status' that -you can run in the future to recreate the current configuration, and a -file `config.log' containing compiler output (useful mainly for -debugging `configure'). - - It can also use an optional file (typically called `config.cache' -and enabled with `--cache-file=config.cache' or simply `-C') that saves -the results of its tests to speed up reconfiguring. Caching is -disabled by default to prevent problems with accidental use of stale -cache files. - - If you need to do unusual things to compile the package, please try -to figure out how `configure' could check whether to do them, and mail -diffs or instructions to the address given in the `README' so they can -be considered for the next release. If you are using the cache, and at -some point `config.cache' contains results you don't want to keep, you -may remove or edit it. - - The file `configure.ac' (or `configure.in') is used to create -`configure' by a program called `autoconf'. You need `configure.ac' if -you want to change it or regenerate `configure' using a newer version -of `autoconf'. - - The simplest way to compile this package is: - - 1. `cd' to the directory containing the package's source code and type - `./configure' to configure the package for your system. - - Running `configure' might take a while. While running, it prints - some messages telling which features it is checking for. - - 2. Type `make' to compile the package. - - 3. Optionally, type `make check' to run any self-tests that come with - the package, generally using the just-built uninstalled binaries. - - 4. Type `make install' to install the programs and any data files and - documentation. When installing into a prefix owned by root, it is - recommended that the package be configured and built as a regular - user, and only the `make install' phase executed with root - privileges. - - 5. 
Optionally, type `make installcheck' to repeat any self-tests, but - this time using the binaries in their final installed location. - This target does not install anything. Running this target as a - regular user, particularly if the prior `make install' required - root privileges, verifies that the installation completed - correctly. - - 6. You can remove the program binaries and object files from the - source code directory by typing `make clean'. To also remove the - files that `configure' created (so you can compile the package for - a different kind of computer), type `make distclean'. There is - also a `make maintainer-clean' target, but that is intended mainly - for the package's developers. If you use it, you may have to get - all sorts of other programs in order to regenerate files that came - with the distribution. - - 7. Often, you can also type `make uninstall' to remove the installed - files again. In practice, not all packages have tested that - uninstallation works correctly, even though it is required by the - GNU Coding Standards. - - 8. Some packages, particularly those that use Automake, provide `make - distcheck', which can by used by developers to test that all other - targets like `make install' and `make uninstall' work correctly. - This target is generally not run by end users. - -Compilers and Options -===================== - - Some systems require unusual options for compilation or linking that -the `configure' script does not know about. Run `./configure --help' -for details on some of the pertinent environment variables. - - You can give `configure' initial values for configuration parameters -by setting variables in the command line or in the environment. Here -is an example: - - ./configure CC=c99 CFLAGS=-g LIBS=-lposix - - *Note Defining Variables::, for more details. 
- -Compiling For Multiple Architectures -==================================== - - You can compile the package for more than one kind of computer at the -same time, by placing the object files for each architecture in their -own directory. To do this, you can use GNU `make'. `cd' to the -directory where you want the object files and executables to go and run -the `configure' script. `configure' automatically checks for the -source code in the directory that `configure' is in and in `..'. This -is known as a "VPATH" build. - - With a non-GNU `make', it is safer to compile the package for one -architecture at a time in the source code directory. After you have -installed the package for one architecture, use `make distclean' before -reconfiguring for another architecture. - - On MacOS X 10.5 and later systems, you can create libraries and -executables that work on multiple system types--known as "fat" or -"universal" binaries--by specifying multiple `-arch' options to the -compiler but only a single `-arch' option to the preprocessor. Like -this: - - ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ - CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \ - CPP="gcc -E" CXXCPP="g++ -E" - - This is not guaranteed to produce working output in all cases, you -may have to build one architecture at a time and combine the results -using the `lipo' tool if you have problems. - -Installation Names -================== - - By default, `make install' installs the package's commands under -`/usr/local/bin', include files under `/usr/local/include', etc. You -can specify an installation prefix other than `/usr/local' by giving -`configure' the option `--prefix=PREFIX', where PREFIX must be an -absolute file name. - - You can specify separate installation prefixes for -architecture-specific files and architecture-independent files. 
If you -pass the option `--exec-prefix=PREFIX' to `configure', the package uses -PREFIX as the prefix for installing programs and libraries. -Documentation and other data files still use the regular prefix. - - In addition, if you use an unusual directory layout you can give -options like `--bindir=DIR' to specify different values for particular -kinds of files. Run `configure --help' for a list of the directories -you can set and what kinds of files go in them. In general, the -default for these options is expressed in terms of `${prefix}', so that -specifying just `--prefix' will affect all of the other directory -specifications that were not explicitly provided. - - The most portable way to affect installation locations is to pass the -correct locations to `configure'; however, many packages provide one or -both of the following shortcuts of passing variable assignments to the -`make install' command line to change installation locations without -having to reconfigure or recompile. - - The first method involves providing an override variable for each -affected directory. For example, `make install -prefix=/alternate/directory' will choose an alternate location for all -directory configuration variables that were expressed in terms of -`${prefix}'. Any directories that were specified during `configure', -but not in terms of `${prefix}', must each be overridden at install -time for the entire installation to be relocated. The approach of -makefile variable overrides for each directory variable is required by -the GNU Coding Standards, and ideally causes no recompilation. -However, some platforms have known limitations with the semantics of -shared libraries that end up requiring recompilation when using this -method, particularly noticeable in packages that use GNU Libtool. - - The second method involves providing the `DESTDIR' variable. For -example, `make install DESTDIR=/alternate/directory' will prepend -`/alternate/directory' before all installation names. 
The approach of -`DESTDIR' overrides is not required by the GNU Coding Standards, and -does not work on platforms that have drive letters. On the other hand, -it does better at avoiding recompilation issues, and works well even -when some directory options were not specified in terms of `${prefix}' -at `configure' time. - -Optional Features -================= - - If the package supports it, you can cause programs to be installed -with an extra prefix or suffix on their names by giving `configure' the -option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. - - Some packages pay attention to `--enable-FEATURE' options to -`configure', where FEATURE indicates an optional part of the package. -They may also pay attention to `--with-PACKAGE' options, where PACKAGE -is something like `gnu-as' or `x' (for the X Window System). The -`README' should mention any `--enable-' and `--with-' options that the -package recognizes. - - For packages that use the X Window System, `configure' can usually -find the X include and library files automatically, but if it doesn't, -you can use the `configure' options `--x-includes=DIR' and -`--x-libraries=DIR' to specify their locations. - - Some packages offer the ability to configure how verbose the -execution of `make' will be. For these packages, running `./configure ---enable-silent-rules' sets the default to minimal output, which can be -overridden with `make V=1'; while running `./configure ---disable-silent-rules' sets the default to verbose, which can be -overridden with `make V=0'. - -Particular systems -================== - - On HP-UX, the default C compiler is not ANSI C compatible. If GNU -CC is not installed, it is recommended to use the following options in -order to use an ANSI C compiler: - - ./configure CC="cc -Ae -D_XOPEN_SOURCE=500" - -and if that doesn't work, install pre-built binaries of GCC for HP-UX. - - On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot -parse its `' header file. 
The option `-nodtk' can be used as -a workaround. If GNU CC is not installed, it is therefore recommended -to try - - ./configure CC="cc" - -and if that doesn't work, try - - ./configure CC="cc -nodtk" - - On Solaris, don't put `/usr/ucb' early in your `PATH'. This -directory contains several dysfunctional programs; working variants of -these programs are available in `/usr/bin'. So, if you need `/usr/ucb' -in your `PATH', put it _after_ `/usr/bin'. - - On Haiku, software installed for all users goes in `/boot/common', -not `/usr/local'. It is recommended to use the following options: - - ./configure --prefix=/boot/common - -Specifying the System Type -========================== - - There may be some features `configure' cannot figure out -automatically, but needs to determine by the type of machine the package -will run on. Usually, assuming the package is built to be run on the -_same_ architectures, `configure' can figure that out, but if it prints -a message saying it cannot guess the machine type, give it the -`--build=TYPE' option. TYPE can either be a short name for the system -type, such as `sun4', or a canonical name which has the form: - - CPU-COMPANY-SYSTEM - -where SYSTEM can have one of these forms: - - OS - KERNEL-OS - - See the file `config.sub' for the possible values of each field. If -`config.sub' isn't included in this package, then this package doesn't -need to know the machine type. - - If you are _building_ compiler tools for cross-compiling, you should -use the option `--target=TYPE' to select the type of system they will -produce code for. - - If you want to _use_ a cross compiler, that generates code for a -platform different from the build platform, you should specify the -"host" platform (i.e., that on which the generated programs will -eventually be run) with `--host=TYPE'. 
- -Sharing Defaults -================ - - If you want to set default values for `configure' scripts to share, -you can create a site shell script called `config.site' that gives -default values for variables like `CC', `cache_file', and `prefix'. -`configure' looks for `PREFIX/share/config.site' if it exists, then -`PREFIX/etc/config.site' if it exists. Or, you can set the -`CONFIG_SITE' environment variable to the location of the site script. -A warning: not all `configure' scripts look for a site script. - -Defining Variables -================== - - Variables not defined in a site shell script can be set in the -environment passed to `configure'. However, some packages may run -configure again during the build, and the customized values of these -variables may be lost. In order to avoid this problem, you should set -them in the `configure' command line, using `VAR=value'. For example: - - ./configure CC=/usr/local2/bin/gcc - -causes the specified `gcc' to be used as the C compiler (unless it is -overridden in the site shell script). - -Unfortunately, this technique does not work for `CONFIG_SHELL' due to -an Autoconf bug. Until the bug is fixed you can use this workaround: - - CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash - -`configure' Invocation -====================== - - `configure' recognizes the following options to control how it -operates. - -`--help' -`-h' - Print a summary of all of the options to `configure', and exit. - -`--help=short' -`--help=recursive' - Print a summary of the options unique to this package's - `configure', and exit. The `short' variant lists options used - only in the top level, while the `recursive' variant lists options - also present in any nested packages. - -`--version' -`-V' - Print the version of Autoconf used to generate the `configure' - script, and exit. - -`--cache-file=FILE' - Enable the cache: use and save the results of the tests in FILE, - traditionally `config.cache'. 
FILE defaults to `/dev/null' to - disable caching. - -`--config-cache' -`-C' - Alias for `--cache-file=config.cache'. - -`--quiet' -`--silent' -`-q' - Do not print messages saying which checks are being made. To - suppress all normal output, redirect it to `/dev/null' (any error - messages will still be shown). - -`--srcdir=DIR' - Look for the package's source code in directory DIR. Usually - `configure' can determine that directory automatically. - -`--prefix=DIR' - Use DIR as the installation prefix. *note Installation Names:: - for more details, including other options available for fine-tuning - the installation locations. - -`--no-create' -`-n' - Run the configure checks, but stop before creating any output - files. - -`configure' also accepts some other, not widely useful, options. Run -`configure --help' for more details. - diff --git a/Makefile.am b/Makefile.am index cc8f26f..cdf1447 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = src tests examples man +SUBDIRS = src tests examples man $(CXXMPH) EXTRA_DIST = cmph.spec configure.ac cmph.pc.in LGPL-2 MPL-1.1 pkgconfigdir = $(libdir)/pkgconfig diff --git a/NEWSLOG.t2t b/NEWSLOG.t2t index ec8e7b6..b74bf2a 100644 --- a/NEWSLOG.t2t +++ b/NEWSLOG.t2t @@ -5,6 +5,10 @@ News Log ---------------------------------------- +==News for version 1.1== + +Fixed a bug in the chd_pc algorithm and reorganized tests. + ==News for version 1.0== This is a bugfix only version, after which a revamp of the cmph code and diff --git a/README.t2t b/README.t2t index d94e70e..21d851f 100644 --- a/README.t2t +++ b/README.t2t @@ -88,6 +88,10 @@ The CMPH Library encapsulates the newest and more efficient algorithms in an eas ---------------------------------------- +==News for version 1.1== + +Fixed a bug in the chd_pc algorithm and reorganized tests. 
+ ==News for version 1.0== This is a bugfix only version, after which a revamp of the cmph code and diff --git a/acinclude.m4 b/acinclude.m4 index f216360..e926f46 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -1,9 +1,95 @@ +AC_DEFUN([AC_ENABLE_CXXMPH], [AC_ARG_ENABLE([cxxmph], + [ --enable-cxxmph enable the c++ cxxmph library ], + [case "${enableval}" in + yes) cxxmph=true ;; + no) cxxmph=false ;; + *) AC_MSG_ERROR([bad value ${enableval} for --enable-cxxmph]) ;; + esac],[cxxmph=false])]) + AC_DEFUN([AC_CHECK_SPOON], [ AC_ARG_WITH(spoon, [ --with-spoon=SPOON this is inocuous, since the truth is that there is no spoon ]) AC_MSG_CHECKING(if there is spoon) AC_MSG_RESULT(no) ]) +dnl Check for baseline language coverage in the compiler for the C++0x standard. +# AC_COMPILE_STDCXX_OX +AC_DEFUN([AC_COMPILE_STDCXX_0X], [ + AC_CACHE_CHECK(if compiler supports C++0x features without additional flags, + ac_cv_cxx_compile_cxx0x_native, + [AC_LANG_SAVE + AC_LANG_CPLUSPLUS + AC_TRY_COMPILE([ + #include + #include + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + typedef check> right_angle_brackets; + + int a; + decltype(a) b; + ],, + ac_cv_cxx_compile_cxx0x_native=yes, ac_cv_cxx_compile_cxx0x_native=no) + AC_LANG_RESTORE + ]) + + AC_CACHE_CHECK(if compiler supports C++0x features with -std=c++0x, + ac_cv_cxx_compile_cxx0x_cxx, + [AC_LANG_SAVE + AC_LANG_CPLUSPLUS + ac_save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS -std=c++0x" + AC_TRY_COMPILE([ + #include + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + typedef check> right_angle_brackets; + + int a; + decltype(a) b;],, + ac_cv_cxx_compile_cxx0x_cxx=yes, ac_cv_cxx_compile_cxx0x_cxx=no) + CXXFLAGS="$ac_save_CXXFLAGS" + AC_LANG_RESTORE + ]) + + AC_CACHE_CHECK(if compiler supports C++0x features with -std=gnu++0x, + ac_cv_cxx_compile_cxx0x_gxx, + [AC_LANG_SAVE + AC_LANG_CPLUSPLUS + ac_save_CXXFLAGS="$CXXFLAGS" + 
CXXFLAGS="$CXXFLAGS -std=gnu++0x" + AC_TRY_COMPILE([ + #include + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + typedef check> right_angle_brackets; + + int a; + decltype(a) b;],, + ac_cv_cxx_compile_cxx0x_gxx=yes, ac_cv_cxx_compile_cxx0x_gxx=no) + CXXFLAGS="$ac_save_CXXFLAGS" + AC_LANG_RESTORE + ]) + + if test "$ac_cv_cxx_compile_cxx0x_native" = yes || + test "$ac_cv_cxx_compile_cxx0x_cxx" = yes || + test "$ac_cv_cxx_compile_cxx0x_gxx" = yes; then + AC_DEFINE(HAVE_STDCXX_0X,,[Define if g++ supports C++0x features. ]) + fi +]) + dnl By default, many hosts won't let programs access large files; dnl one must use special compiler options to get large-file access to work. dnl For more details about this brain damage please see: diff --git a/configure.ac b/configure.ac index e2cbde0..172b02f 100644 --- a/configure.ac +++ b/configure.ac @@ -1,16 +1,16 @@ dnl Process this file with autoconf to produce a configure script. -AC_INIT(Makefile.am) +AC_INIT +AC_CONFIG_SRCDIR([Makefile.am]) AM_INIT_AUTOMAKE(cmph, 1.0) -AM_CONFIG_HEADER(config.h) +AC_CONFIG_HEADERS([config.h]) AC_CONFIG_MACRO_DIR([m4]) dnl Checks for programs. AC_PROG_AWK AC_PROG_CC -AC_PROG_CXX AC_PROG_INSTALL AC_PROG_LN_S -AC_PROG_LIBTOOL +LT_INIT AC_SYS_EXTRA_LARGEFILE if test "x$ac_cv_sys_largefile_CFLAGS" = "xno" ; then ac_cv_sys_largefile_CFLAGS="" @@ -25,17 +25,32 @@ CFLAGS="$CFLAGS $ac_cv_sys_largefile_CFLAGS" LDFLAGS="$LDFLAGS $ac_cv_sys_largefile_LDFLAGS" LIBS="$LIBS $ac_cv_sys_largefile_LIBS" - dnl Checks for headers AC_CHECK_HEADERS([getopt.h math.h]) dnl Checks for libraries. -AC_CHECK_LIBM +LT_LIB_M LDFLAGS="$LIBM $LDFLAGS" CFLAGS="-Wall -Werror" -dnl Checks for library functions. 
+AC_PROG_CXX +AC_ENABLE_CXXMPH +if test x$cxxmph = xtrue; then + AC_COMPILE_STDCXX_0X + if test x$ac_cv_cxx_compile_cxx0x_native = "xno"; then + if test x$ac_cv_cxx_compile_cxx0x_cxx = "xyes"; then + CXXFLAGS="$CXXFLAGS -std=c++0x" + elif test x$ac_cv_cxx_compile_cxx0x_gxx = "xyes"; then + CXXFLAGS="$CXXFLAGS -std=gnu++0x" + else + AC_MSG_ERROR([cxxmph demands a working c++0x compiler.]) + fi + fi + AC_SUBST([CXXMPH], [cxxmph]) +fi AC_CHECK_SPOON -dnl AC_OUTPUT(Makefile tests/Makefile samples/Makefile) -AC_OUTPUT(Makefile src/Makefile tests/Makefile examples/Makefile man/Makefile cmph.pc) +dnl AC_CONFIG_FILES([Makefile tests/Makefile samples/Makefile]) +dnl AC_OUTPUT must run exactly once, after the AC_CONFIG_FILES call below. +AC_CONFIG_FILES([Makefile src/Makefile cxxmph/Makefile tests/Makefile examples/Makefile man/Makefile cmph.pc]) +AC_OUTPUT diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am new file mode 100644 index 0000000..db8ffa1 --- /dev/null +++ b/cxxmph/Makefile.am @@ -0,0 +1,32 @@ +TESTS = $(check_PROGRAMS) +check_PROGRAMS = mph_bits_test hollow_iterator_test mph_map_test mph_index_test trigraph_test +noinst_PROGRAMS = bm_index bm_map +bin_PROGRAMS = cxxmph +lib_LTLIBRARIES = libcxxmph.la +libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigraph.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc mph_bits.h mph_bits.cc +libcxxmph_la_LDFLAGS = -version-info 0:0:0 +cxxmph_includedir = $(includedir)/cxxmph/ +cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h + +mph_map_test_LDADD = libcxxmph.la +mph_map_test_SOURCES = mph_map_test.cc + +mph_index_test_LDADD = libcxxmph.la +mph_index_test_SOURCES = mph_index_test.cc + +bm_index_LDADD = libcxxmph.la -lcmph +bm_index_SOURCES = bm_common.cc bm_index.cc + +trigraph_test_LDADD = libcxxmph.la +trigraph_test_SOURCES = trigraph_test.cc + +bm_map_LDADD = libcxxmph.la +bm_map_SOURCES = bm_common.cc bm_map.cc + +cxxmph_LDADD = libcxxmph.la +cxxmph_SOURCES = cxxmph.cc 
+ +hollow_iterator_test_SOURCES = hollow_iterator_test.cc +mph_bits_test_SOURCES = mph_bits_test.cc +mph_bits_test_LDADD = libcxxmph.la + diff --git a/cxxmph/MurmurHash3.cpp b/cxxmph/MurmurHash3.cpp new file mode 100644 index 0000000..09ffb26 --- /dev/null +++ b/cxxmph/MurmurHash3.cpp @@ -0,0 +1,335 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. + +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include <stdlib.h> + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE inline __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) +{ + return p[i]; +} + 
+//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const 
uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; + uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + h3 = fmix(h3); + h4 = fmix(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + 
+//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= uint64_t(tail[14]) << 48; + case 14: k2 ^= uint64_t(tail[13]) << 40; + case 13: k2 ^= uint64_t(tail[12]) << 32; + case 12: k2 ^= uint64_t(tail[11]) << 24; + case 11: k2 ^= uint64_t(tail[10]) << 16; + case 10: k2 ^= uint64_t(tail[ 9]) << 8; + case 9: k2 ^= uint64_t(tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= uint64_t(tail[ 7]) << 56; + case 7: k1 ^= uint64_t(tail[ 6]) << 48; + case 6: k1 ^= uint64_t(tail[ 5]) << 40; + case 5: k1 ^= uint64_t(tail[ 4]) << 32; + case 4: k1 ^= uint64_t(tail[ 3]) << 24; + case 3: k1 ^= uint64_t(tail[ 2]) << 16; + case 2: k1 ^= uint64_t(tail[ 1]) << 8; + case 1: k1 ^= uint64_t(tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + 
+//----------------------------------------------------------------------------- + diff --git a/cxxmph/MurmurHash3.h b/cxxmph/MurmurHash3.h new file mode 100644 index 0000000..54e9d3f --- /dev/null +++ b/cxxmph/MurmurHash3.h @@ -0,0 +1,37 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +typedef unsigned char uint8_t; +typedef unsigned long uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include <stdint.h> + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/cxxmph/benchmark.cc b/cxxmph/benchmark.cc new file mode 100644 index 0000000..70175e1 --- /dev/null +++ b/cxxmph/benchmark.cc @@ -0,0 +1,142 @@ +#include "benchmark.h" + +#include <cerrno> +#include <cstdlib> +#include <cstring> +#include <typeinfo> +#include <sys/resource.h> +#include <sys/time.h> + +#include <iomanip> +#include <iostream> +#include <sstream> +#include <vector> + +using std::cerr; +using std::cout; +using std::endl; +using std::setfill; +using std::setw; +using std::string; +using std::ostringstream; +using std::vector; + +namespace { + +/* Subtract the `struct timeval' values X and Y, + storing the result in RESULT. + Return 1 if the difference is negative, otherwise 0. 
*/ +int timeval_subtract ( + struct timeval *result, struct timeval *x, struct timeval* y) { + /* Perform the carry for the later subtraction by updating y. */ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + tv_usec is certainly positive. */ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. */ + return x->tv_sec < y->tv_sec; +} + +// C++ iostream is terrible for formatting. +string timeval_to_string(timeval tv) { + ostringstream out; + out << setfill(' ') << setw(3) << tv.tv_sec << '.'; + out << setfill('0') << setw(6) << tv.tv_usec; + return out.str(); +} + +struct rusage getrusage_or_die() { + struct rusage rs; + int ret = getrusage(RUSAGE_SELF, &rs); + if (ret != 0) { + cerr << "rusage failed: " << strerror(errno) << endl; + exit(-1); + } + return rs; +} + +struct timeval gettimeofday_or_die() { + struct timeval tv; + int ret = gettimeofday(&tv, NULL); + if (ret != 0) { + cerr << "gettimeofday failed: " << strerror(errno) << endl; + exit(-1); + } + return tv; +} + +#ifdef HAVE_CXA_DEMANGLE +string demangle(const string& name) { + char buf[1024]; + unsigned int size = 1024; + int status; + char* res = abi::__cxa_demangle( + name.c_str(), buf, &size, &status); + return res; +} +#else +string demangle(const string& name) { return name; } +#endif + + +static vector g_benchmarks; + +} // anonymous namespace + +namespace cxxmph { + +/* static */ void Benchmark::Register(Benchmark* bm) { + if (bm->name().empty()) { + string name = demangle(typeid(*bm).name()); + bm->set_name(name); + } + g_benchmarks.push_back(bm); +} + +/* static */ void Benchmark::RunAll() { + for (int i = 0; i < g_benchmarks.size(); ++i) { 
+ std::auto_ptr bm(g_benchmarks[i]); + if (!bm->SetUp()) { + cerr << "Set up phase for benchmark " + << bm->name() << " failed." << endl; + continue; + } + bm->MeasureRun(); + bm->TearDown(); + } +} + +void Benchmark::MeasureRun() { + struct timeval walltime_begin = gettimeofday_or_die(); + struct rusage begin = getrusage_or_die(); + Run(); + struct rusage end = getrusage_or_die(); + struct timeval walltime_end = gettimeofday_or_die(); + + struct timeval utime; + timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime); + struct timeval stime; + timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime); + struct timeval wtime; + timeval_subtract(&wtime, &walltime_end, &walltime_begin); + + cout << "Benchmark: " << name_ << endl; + cout << "CPU User time : " << timeval_to_string(utime) << endl; + cout << "CPU System time: " << timeval_to_string(stime) << endl; + cout << "Wall clock time: " << timeval_to_string(wtime) << endl; + cout << endl; +} + +} // namespace cxxmph diff --git a/cxxmph/benchmark.h b/cxxmph/benchmark.h new file mode 100644 index 0000000..cecbc2f --- /dev/null +++ b/cxxmph/benchmark.h @@ -0,0 +1,32 @@ +#ifndef __CXXMPH_BENCHMARK_H__ +#define __CXXMPH_BENCHMARK_H__ + +#include +#include + +namespace cxxmph { + +class Benchmark { + public: + Benchmark() {} + virtual ~Benchmark() {} + + const std::string& name() { return name_; } + void set_name(const std::string& name) { name_ = name; } + + static void Register(Benchmark* bm); + static void RunAll(); + + protected: + virtual bool SetUp() { return true; }; + virtual void Run() = 0; + virtual bool TearDown() { return true; }; + + private: + std::string name_; + void MeasureRun(); +}; + +} // namespace cxxmph + +#endif diff --git a/cxxmph/bm_common.cc b/cxxmph/bm_common.cc new file mode 100644 index 0000000..7e94dcf --- /dev/null +++ b/cxxmph/bm_common.cc @@ -0,0 +1,71 @@ +#include +#include +#include +#include +#include + +#include "bm_common.h" + +using std::cerr; +using std::endl; +using std::set; 
+using std::string; +using std::vector; + +namespace cxxmph { + +bool UrlsBenchmark::SetUp() { + vector urls; + std::ifstream f(urls_file_.c_str()); + if (!f.is_open()) { + cerr << "Failed to open urls file " << urls_file_ << endl; + return false; + } + string buffer; + while(std::getline(f, buffer)) urls.push_back(buffer); + set unique(urls.begin(), urls.end()); + if (unique.size() != urls.size()) { + cerr << "Input file has repeated keys." << endl; + return false; + } + urls.swap(urls_); + return true; +} + +bool SearchUrlsBenchmark::SetUp() { + if (!UrlsBenchmark::SetUp()) return false; + int32_t miss_ratio_int32 = std::numeric_limits::max() * miss_ratio_; + forced_miss_urls_.resize(nsearches_); + random_.resize(nsearches_); + for (int i = 0; i < nsearches_; ++i) { + random_[i] = urls_[random() % urls_.size()]; + if (random() < miss_ratio_int32) { + forced_miss_urls_[i] = random_[i].as_string() + ".force_miss"; + random_[i] = forced_miss_urls_[i]; + } + } + return true; +} + +bool Uint64Benchmark::SetUp() { + set unique; + for (int i = 0; i < count_; ++i) { + uint64_t v; + do { v = random(); } while (unique.find(v) != unique.end()); + values_.push_back(v); + unique.insert(v); + } + return true; +} + +bool SearchUint64Benchmark::SetUp() { + if (!Uint64Benchmark::SetUp()) return false; + random_.resize(nsearches_); + for (int i = 0; i < nsearches_; ++i) { + uint32_t pos = random() % values_.size(); + random_[i] = values_[pos]; + } + return true; +} + +} // namespace cxxmph diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h new file mode 100644 index 0000000..eed12df --- /dev/null +++ b/cxxmph/bm_common.h @@ -0,0 +1,69 @@ +#ifndef __CXXMPH_BM_COMMON_H__ +#define __CXXMPH_BM_COMMON_H__ + +#include "stringpiece.h" + +#include +#include +#include // std::hash +#include "MurmurHash3.h" + +#include "benchmark.h" + +namespace std { +template <> struct hash { + uint32_t operator()(const cxxmph::StringPiece& k) const { + uint32_t out; + MurmurHash3_x86_32(k.data(), 
k.length(), 1, &out); + return out; + } +}; +} // namespace std + +namespace cxxmph { + +class UrlsBenchmark : public Benchmark { + public: + UrlsBenchmark(const std::string& urls_file) : urls_file_(urls_file) { } + protected: + virtual bool SetUp(); + const std::string urls_file_; + std::vector urls_; +}; + +class SearchUrlsBenchmark : public UrlsBenchmark { + public: + SearchUrlsBenchmark(const std::string& urls_file, uint32_t nsearches, float miss_ratio) + : UrlsBenchmark(urls_file), nsearches_(nsearches), miss_ratio_(miss_ratio) {} + protected: + virtual bool SetUp(); + const uint32_t nsearches_; + float miss_ratio_; + std::vector forced_miss_urls_; + std::vector random_; +}; + +class Uint64Benchmark : public Benchmark { + public: + Uint64Benchmark(uint32_t count) : count_(count) { } + virtual void Run() {} + protected: + virtual bool SetUp(); + const uint32_t count_; + std::vector values_; +}; + +class SearchUint64Benchmark : public Uint64Benchmark { + public: + SearchUint64Benchmark(uint32_t count, uint32_t nsearches) + : Uint64Benchmark(count), nsearches_(nsearches) { } + virtual void Run() {}; + protected: + virtual bool SetUp(); + const uint32_t nsearches_; + std::vector random_; +}; + +} // namespace cxxmph + +#endif // __CXXMPH_BM_COMMON_H__ diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc new file mode 100644 index 0000000..9345a11 --- /dev/null +++ b/cxxmph/bm_index.cc @@ -0,0 +1,143 @@ +#include + +#include +#include +#include +#include + +#include "bm_common.h" +#include "stringpiece.h" +#include "mph_index.h" + +using namespace cxxmph; + +using std::string; +using std::unordered_map; + +class BM_MPHIndexCreate : public UrlsBenchmark { + public: + BM_MPHIndexCreate(const std::string& urls_file) + : UrlsBenchmark(urls_file) { } + protected: + virtual void Run() { + SimpleMPHIndex index; + index.Reset(urls_.begin(), urls_.end(), urls_.size()); + } +}; + +class BM_STLIndexCreate : public UrlsBenchmark { + public: + BM_STLIndexCreate(const 
std::string& urls_file) + : UrlsBenchmark(urls_file) { } + protected: + virtual void Run() { + unordered_map index; + int idx = 0; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + index.insert(make_pair(*it, idx++)); + } + } +}; + +class BM_MPHIndexSearch : public SearchUrlsBenchmark { + public: + BM_MPHIndexSearch(const std::string& urls_file, int nsearches) + : SearchUrlsBenchmark(urls_file, nsearches, 0) { } + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto idx = index_.index(*it); + // Collision check to be fair with STL + // if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; + } + } + protected: + virtual bool SetUp () { + if (!SearchUrlsBenchmark::SetUp()) return false; + index_.Reset(urls_.begin(), urls_.end(), urls_.size()); + return true; + } + SimpleMPHIndex index_; +}; + +class BM_CmphIndexSearch : public SearchUrlsBenchmark { + public: + BM_CmphIndexSearch(const std::string& urls_file, int nsearches) + : SearchUrlsBenchmark(urls_file, nsearches, 0) { } + ~BM_CmphIndexSearch() { if (index_) cmph_destroy(index_); } + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto idx = cmph_search(index_, it->data(), it->length()); + // Collision check to be fair with STL + if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; + } + } + protected: + virtual bool SetUp() { + if (!SearchUrlsBenchmark::SetUp()) { + cerr << "Parent class setup failed." 
<< endl; + return false; + } + FILE* f = fopen(urls_file_.c_str(), "r"); + if (!f) { + cerr << "Faied to open " << urls_file_ << endl; + return false; + } + cmph_io_adapter_t* source = cmph_io_nlfile_adapter(f); + if (!source) { + cerr << "Faied to create io adapter for " << urls_file_ << endl; + return false; + } + cmph_config_t* config = cmph_config_new(source); + if (!config) { + cerr << "Failed to create config" << endl; + return false; + } + cmph_config_set_algo(config, CMPH_BDZ); + cmph_t* mphf = cmph_new(config); + if (!mphf) { + cerr << "Failed to create mphf." << endl; + return false; + } + + cmph_config_destroy(config); + cmph_io_nlfile_adapter_destroy(source); + fclose(f); + index_ = mphf; + return true; + } + cmph_t* index_; +}; + + +class BM_STLIndexSearch : public SearchUrlsBenchmark { + public: + BM_STLIndexSearch(const std::string& urls_file, int nsearches) + : SearchUrlsBenchmark(urls_file, nsearches, 0) { } + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto idx = index_.find(*it); + } + } + protected: + virtual bool SetUp () { + if (!SearchUrlsBenchmark::SetUp()) return false; + unordered_map index; + int idx = 0; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + index.insert(make_pair(*it, idx++)); + } + index.swap(index_); + return true; + } + unordered_map index_; +}; + +int main(int argc, char** argv) { + Benchmark::Register(new BM_MPHIndexCreate("URLS100k")); + Benchmark::Register(new BM_STLIndexCreate("URLS100k")); + Benchmark::Register(new BM_MPHIndexSearch("URLS100k", 10*1000*1000)); + Benchmark::Register(new BM_STLIndexSearch("URLS100k", 10*1000*1000)); + Benchmark::Register(new BM_CmphIndexSearch("URLS100k", 10*1000*1000)); + Benchmark::RunAll(); + return 0; +} diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc new file mode 100644 index 0000000..0a0b225 --- /dev/null +++ b/cxxmph/bm_map.cc @@ -0,0 +1,101 @@ +#include +#include + +#include "bm_common.h" +#include "mph_map.h" + +using 
cxxmph::mph_map; +using std::string; +using std::unordered_map; + +namespace cxxmph { + +template +const T* myfind(const MapType& mymap, const T& k) { + auto it = mymap.find(k); + auto end = mymap.end(); + if (it == end) return NULL; + return &it->second; +} + +template +class BM_CreateUrls : public UrlsBenchmark { + public: + BM_CreateUrls(const string& urls_file) : UrlsBenchmark(urls_file) { } + virtual void Run() { + MapType mymap; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + mymap[*it] = *it; + } + } +}; + +template +class BM_SearchUrls : public SearchUrlsBenchmark { + public: + BM_SearchUrls(const std::string& urls_file, int nsearches, float miss_ratio) + : SearchUrlsBenchmark(urls_file, nsearches, miss_ratio) { } + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto v = myfind(mymap_, *it); + assert(it->ends_with(".force_miss") ^ v != NULL); + assert(!v || *v == *it); + } + } + protected: + virtual bool SetUp() { + if (!SearchUrlsBenchmark::SetUp()) return false; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + mymap_[*it] = *it; + } + mymap_.rehash(mymap_.bucket_count()); + fprintf(stderr, "Occupation: %f\n", static_cast(mymap_.size())/mymap_.bucket_count()); + return true; + } + MapType mymap_; +}; + +template +class BM_SearchUint64 : public SearchUint64Benchmark { + public: + BM_SearchUint64() : SearchUint64Benchmark(100000, 10*1000*1000) { } + virtual bool SetUp() { + if (!SearchUint64Benchmark::SetUp()) return false; + for (int i = 0; i < values_.size(); ++i) { + mymap_[values_[i]] = values_[i]; + } + mymap_.rehash(mymap_.bucket_count()); + // Double check if everything is all right + for (int i = 0; i < values_.size(); ++i) { + if (mymap_[values_[i]] != values_[i]) return false; + } + return true; + } + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto v = myfind(mymap_, *it); + if (*v != *it) { + fprintf(stderr, "Looked for %lu got %lu\n", *it, 
*v); + exit(-1); + } + } + } + MapType mymap_; +}; + +} // namespace cxxmph + +using namespace cxxmph; + +int main(int argc, char** argv) { + srandom(4); + Benchmark::Register(new BM_CreateUrls>("URLS100k")); + Benchmark::Register(new BM_CreateUrls>("URLS100k")); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + Benchmark::Register(new BM_SearchUint64>); + Benchmark::Register(new BM_SearchUint64>); + Benchmark::RunAll(); +} diff --git a/cxxmph/cxxmph.cc b/cxxmph/cxxmph.cc new file mode 100644 index 0000000..e9bffd0 --- /dev/null +++ b/cxxmph/cxxmph.cc @@ -0,0 +1,70 @@ +// Copyright 2010 Google Inc. All Rights Reserved. +// Author: davi@google.com (Davi Reis) + +#include + +#include +#include +#include +#include + +#include "mph_map.h" +#include "config.h" + +using std::cerr; +using std::cout; +using std::endl; +using std::getline; +using std::ifstream; +using std::string; +using std::vector; + +using cxxmph::mph_map; + +void usage(const char* prg) { + cerr << "usage: " << prg << " [-v] [-h] [-V] " << endl; +} +void usage_long(const char* prg) { + usage(prg); + cerr << " -h\t print this help message" << endl; + cerr << " -V\t print version number and exit" << endl; + cerr << " -v\t increase verbosity (may be used multiple times)" << endl; +} + +int main(int argc, char** argv) { + + int verbosity = 0; + while (1) { + char ch = (char)getopt(argc, argv, "hv"); + if (ch == -1) break; + switch (ch) { + case 'h': + usage_long(argv[0]); + return 0; + case 'V': + std::cout << VERSION << std::endl; + return 0; + case 'v': + ++verbosity; + break; + } + } + if (optind != argc - 1) { + usage(argv[0]); + return 1; + } + vector keys; + ifstream f(argv[optind]); + string buffer; + while (!getline(f, buffer).eof()) 
keys.push_back(buffer); + for (int i = 0; i < keys.size(); ++i) string s = keys[i]; + mph_map table; + + for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i]; + mph_map::const_iterator it = table.begin(); + mph_map::const_iterator end = table.end(); + for (int i = 0; it != end; ++it, ++i) { + cout << i << ": " << it->first + <<" -> " << it->second << endl; + } +} diff --git a/cxxmph/hollow_iterator.h b/cxxmph/hollow_iterator.h new file mode 100644 index 0000000..c650d21 --- /dev/null +++ b/cxxmph/hollow_iterator.h @@ -0,0 +1,71 @@ +#ifndef __CXXMPH_HOLLOW_ITERATOR_H__ +#define __CXXMPH_HOLLOW_ITERATOR_H__ + +#include + +namespace cxxmph { + +template +struct hollow_iterator_base + : public std::iterator { + typedef presence_type presence; + typedef container_type container; + typedef iterator_type iterator; + typedef hollow_iterator_base& self_reference; + typedef typename iterator::reference reference; + typedef typename iterator::pointer pointer; + + hollow_iterator_base(container* c, presence* p, iterator it) + : c_(c), p_(p), it_(it) { if (c_) find_present(); } + self_reference operator++() { + ++it_; find_present(); + } + reference operator*() { return *it_; } + pointer operator->() { return &(*it_); } + + // TODO find syntax to make this less permissible at compile time + template + bool operator==(const T& rhs) { return rhs.it_ == this->it_; } + template + bool operator!=(const T& rhs) { return rhs.it_ != this->it_; } + + public: // TODO find syntax to make this friend of const iterator + void find_present() { + while (it_ != c_->end() && !((*p_)[it_-c_->begin()])) ++it_; + } + container* c_; + presence* p_; + iterator it_; +}; + +template +struct hollow_iterator : public hollow_iterator_base< + container_type, std::vector, typename container_type::iterator> { + typedef hollow_iterator_base< + container_type, std::vector, typename container_type::iterator> parent_class; + hollow_iterator() : parent_class(NULL, NULL, typename 
container_type::iterator()) { } + hollow_iterator(typename parent_class::container* c, + typename parent_class::presence* p, + typename parent_class::iterator it) + : parent_class(c, p, it) { } +}; + +template +struct hollow_const_iterator : public hollow_iterator_base< + const container_type, const std::vector, typename container_type::const_iterator> { + typedef hollow_iterator_base< + const container_type, const std::vector, typename container_type::const_iterator> parent_class; + typedef hollow_const_iterator self_type; + typedef hollow_iterator non_const_type; + hollow_const_iterator(non_const_type rhs) : parent_class(rhs.c_, rhs.p_, typename container_type::const_iterator(rhs.it_)) { } + hollow_const_iterator() : parent_class(NULL, NULL, typename container_type::iterator()) { } + hollow_const_iterator(const typename parent_class::container* c, + const typename parent_class::presence* p, + typename parent_class::iterator it) + : parent_class(c, p, it) { } +}; + +} // namespace cxxmph + +#endif // __CXXMPH_HOLLOW_ITERATOR_H__ diff --git a/cxxmph/hollow_iterator_test.cc b/cxxmph/hollow_iterator_test.cc new file mode 100644 index 0000000..07963ae --- /dev/null +++ b/cxxmph/hollow_iterator_test.cc @@ -0,0 +1,38 @@ +#include +#include +#include + +#include "hollow_iterator.h" + +using std::vector; +using cxxmph::hollow_iterator; +using cxxmph::hollow_const_iterator; + +int main(int argc, char** argv) { + vector v; + vector p; + for (int i = 0; i < 100; ++i) { + v.push_back(i); + p.push_back(i % 2 == 0); + } + auto begin = hollow_iterator>(&v, &p, v.begin()); + auto end = hollow_iterator>(&v, &p, v.end()); + for (auto it = begin; it != end; ++it) { + if (((*it) % 2) != 0) exit(-1); + } + hollow_const_iterator> const_begin(begin); + hollow_const_iterator> const_end(end); + for (auto it = const_begin; it != const_end; ++it) { + if (((*it) % 2) != 0) exit(-1); + } + vector::iterator vit1 = v.begin(); + vector::const_iterator vit2 = v.begin(); + if (vit1 != vit2) 
exit(-1); + auto it1 = hollow_iterator>(&v, &p, v.begin()); + auto it2 = hollow_const_iterator>(&v, &p, v.begin()); + if (it1 != it2) exit(-1); + + hollow_iterator> default_constructed; + default_constructed = hollow_iterator>(&v, &p, v.begin()); +} + diff --git a/cxxmph/mph_bits.cc b/cxxmph/mph_bits.cc new file mode 100644 index 0000000..510572c --- /dev/null +++ b/cxxmph/mph_bits.cc @@ -0,0 +1,7 @@ +#include "mph_bits.h" + +namespace cxxmph { + +const uint8_t dynamic_2bitset::vmask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; + +} diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h new file mode 100644 index 0000000..c9eaabb --- /dev/null +++ b/cxxmph/mph_bits.h @@ -0,0 +1,75 @@ +#ifndef __CXXMPH_MPH_BITS_H__ +#define __CXXMPH_MPH_BITS_H__ + +#include // for uint32_t and friends + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cxxmph { + +class dynamic_2bitset { + public: + dynamic_2bitset() : size_(0), fill_(false) {} + dynamic_2bitset(uint32_t size, bool fill = false) + : size_(size), fill_(fill), data_(ceil(size / 4.0), ones()*fill) { + } + + const uint8_t operator[](uint32_t i) const { return get(i); } + const uint8_t get(uint32_t i) const { + assert(i < size()); + assert((i >> 2) < data_.size()); + return (data_[(i >> 2)] >> (((i & 3) << 1)) & 3); + } + uint8_t set(uint32_t i, uint8_t v) { + assert((i >> 2) < data_.size()); + data_[(i >> 2)] |= ones() ^ dynamic_2bitset::vmask[i & 3]; + data_[(i >> 2)] &= ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); + assert(v <= 3); + assert(get(i) == v); + } + void resize(uint32_t size) { + size_ = size; + data_.resize(size >> 2, fill_*ones()); + } + void swap(dynamic_2bitset& other) { + std::swap(other.size_, size_); + std::swap(other.fill_, fill_); + other.data_.swap(data_); + } + void clear() { data_.clear(); size_ = 0; } + + uint32_t size() const { return size_; } + static const uint8_t vmask[]; + const std::vector& data() const { return data_; } + private: + uint32_t 
size_; + bool fill_; + std::vector data_; + const uint8_t ones() { return std::numeric_limits::max(); } +}; + +static uint32_t nextpoweroftwo(uint32_t k) { + if (k == 0) return 1; + k--; + for (int i=1; i> i; + return k+1; +} + +// Interesting bit tricks that might end up here: +// http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord +// Fast a % (k*2^t) +// http://www.azillionmonkeys.com/qed/adiv.html +// rank and select: +// http://vigna.dsi.unimi.it/ftp/papers/Broadword.pdf + +} // namespace cxxmph + +#endif diff --git a/cxxmph/mph_bits_test.cc b/cxxmph/mph_bits_test.cc new file mode 100644 index 0000000..c1680e3 --- /dev/null +++ b/cxxmph/mph_bits_test.cc @@ -0,0 +1,57 @@ +#include +#include + +#include "mph_bits.h" + +using cxxmph::dynamic_2bitset; +int main(int argc, char** argv) { + dynamic_2bitset small(256, true); + for (int i = 0; i < small.size(); ++i) small.set(i, i % 4); + for (int i = 0; i < small.size(); ++i) { + if (small[i] != i % 4) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", small[i], i, i % 4); + exit(-1); + } + } + + int size = 256; + dynamic_2bitset bits(size, true /* fill with ones */); + for (int i = 0; i < size; ++i) { + if (bits[i] != 3) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, 3); + exit(-1); + } + } + for (int i = 0; i < size; ++i) bits.set(i, 0); + for (int i = 0; i < size; ++i) { + if (bits[i] != 0) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, 0); + exit(-1); + } + } + for (int i = 0; i < size; ++i) bits.set(i, i % 4); + for (int i = 0; i < size; ++i) { + if (bits[i] != i % 4) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, i % 4); + exit(-1); + } + } + dynamic_2bitset size_corner1(1); + if (size_corner1.size() != 1) exit(-1); + dynamic_2bitset size_corner2(2); + if (size_corner2.size() != 2) exit(-1); + (dynamic_2bitset(4, true)).swap(size_corner2); + if (size_corner2.size() != 4) exit(-1); + for (int i = 0; i < size_corner2.size(); ++i) { 
+ if (size_corner2[i] != 3) exit(-1); + } + size_corner2.clear(); + if (size_corner2.size() != 0) exit(-1); + + dynamic_2bitset empty; + empty.clear(); + dynamic_2bitset large(1000, true); + empty.swap(large); +} + + diff --git a/cxxmph/mph_index.cc b/cxxmph/mph_index.cc new file mode 100644 index 0000000..8b6baec --- /dev/null +++ b/cxxmph/mph_index.cc @@ -0,0 +1,205 @@ +#include +#include +#include + +using std::cerr; +using std::endl; + +#include "mph_index.h" + +using std::vector; + +namespace { + +static const uint8_t kUnassigned = 3; +// table used for looking up the number of assigned vertices to a 8-bit integer +static uint8_t kBdzLookupIndex[] = +{ +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 +}; + +} // anonymous namespace + +namespace cxxmph { + +MPHIndex::~MPHIndex() { + clear(); +} + +void MPHIndex::clear() { + delete [] ranktable_; + ranktable_ = NULL; + ranktable_size_ = 0; + // TODO(davi) implement me +} + +bool MPHIndex::GenerateQueue( + TriGraph* graph, vector* queue_output) { + uint32_t queue_head = 0, queue_tail = 0; + uint32_t nedges = m_; + uint32_t nvertices = n_; + // Relies on vector using 1 bit per element + vector marked_edge(nedges + 1, false); + vector queue(nvertices, 0); + for (uint32_t i = 0; i < nedges; ++i) 
{ + const TriGraph::Edge& e = graph->edges()[i]; + if (graph->vertex_degree()[e[0]] == 1 || + graph->vertex_degree()[e[1]] == 1 || + graph->vertex_degree()[e[2]] == 1) { + if (!marked_edge[i]) { + queue[queue_head++] = i; + marked_edge[i] = true; + } + } + } + /* + for (unsigned int i = 0; i < marked_edge.size(); ++i) { + cerr << "vertex with degree " << static_cast(graph->vertex_degree()[i]) << " marked " << marked_edge[i] << endl; + } + for (unsigned int i = 0; i < queue.size(); ++i) { + cerr << "vertex " << i << " queued at " << queue[i] << endl; + } + */ + // At this point queue head is the number of edges touching at least one + // vertex of degree 1. + // cerr << "Queue head " << queue_head << " Queue tail " << queue_tail << endl; + // graph->DebugGraph(); + while (queue_tail != queue_head) { + uint32_t current_edge = queue[queue_tail++]; + graph->RemoveEdge(current_edge); + const TriGraph::Edge& e = graph->edges()[current_edge]; + for (int i = 0; i < 3; ++i) { + uint32_t v = e[i]; + if (graph->vertex_degree()[v] == 1) { + uint32_t first_edge = graph->first_edge()[v]; + if (!marked_edge[first_edge]) { + queue[queue_head++] = first_edge; + marked_edge[first_edge] = true; + } + } + } + } + /* + for (unsigned int i = 0; i < queue.size(); ++i) { + cerr << "vertex " << i << " queued at " << queue[i] << endl; + } + */ + int cycles = queue_head - nedges; + if (cycles == 0) queue.swap(*queue_output); + return cycles == 0; +} + +void MPHIndex::Assigning( + const vector& edges, const vector& queue) { + uint32_t current_edge = 0; + vector marked_vertices(n_ + 1); + dynamic_2bitset().swap(g_); + // Initialize vector of half nibbles with all bits set. 
+ dynamic_2bitset g(n_, true /* set bits to 1 */); + + uint32_t nedges = m_; // for legibility + for (int i = nedges - 1; i + 1 >= 1; --i) { + current_edge = queue[i]; + const TriGraph::Edge& e = edges[current_edge]; + /* + cerr << "B: " << e[0] << " " << e[1] << " " << e[2] << " -> " + << get_2bit_value(g_, e[0]) << " " + << get_2bit_value(g_, e[1]) << " " + << get_2bit_value(g_, e[2]) << " edge " << current_edge << endl; + */ + if (!marked_vertices[e[0]]) { + if (!marked_vertices[e[1]]) { + g.set(e[1], kUnassigned); + marked_vertices[e[1]] = true; + } + if (!marked_vertices[e[2]]) { + g.set(e[2], kUnassigned); + assert(marked_vertices.size() > e[2]); + marked_vertices[e[2]] = true; + } + g.set(e[0], (6 - (g[e[1]] + g[e[2]])) % 3); + marked_vertices[e[0]] = true; + } else if (!marked_vertices[e[1]]) { + if (!marked_vertices[e[2]]) { + g.set(e[2], kUnassigned); + marked_vertices[e[2]] = true; + } + g.set(e[1], (7 - (g[e[0]] + g[e[2]])) % 3); + marked_vertices[e[1]] = true; + } else { + g.set(e[2], (8 - (g[e[0]] + g[e[1]])) % 3); + marked_vertices[e[2]] = true; + } + /* + cerr << "A: " << e[0] << " " << e[1] << " " << e[2] << " -> " + << static_cast(g[e[0]]) << " " + << static_cast(g[e[1]]) << " " + << static_cast(g[e[2]]) << " " << endl; + */ + } + g_.swap(g); +} + +void MPHIndex::Ranking() { + uint32_t nbytes_total = static_cast(ceil(n_ / 4.0)); + uint32_t size = k_ >> 2U; + ranktable_size_ = static_cast( + ceil(n_ / static_cast(k_))); + delete [] ranktable_; + ranktable_ = NULL; + uint32_t* ranktable = new uint32_t[ranktable_size_]; + memset(ranktable, 0, ranktable_size_*sizeof(uint32_t)); + uint32_t offset = 0; + uint32_t count = 0; + uint32_t i = 1; + while (1) { + if (i == ranktable_size_) break; + uint32_t nbytes = size < nbytes_total ? 
size : nbytes_total; + for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupIndex[g_[offset + j]]; + ranktable[i] = count; + offset += nbytes; + nbytes_total -= size; + ++i; + } + ranktable_ = ranktable; +} + +uint32_t MPHIndex::Rank(uint32_t vertex) const { + uint32_t index = vertex >> b_; + uint32_t base_rank = ranktable_[index]; + uint32_t beg_idx_v = index << b_; + uint32_t beg_idx_b = beg_idx_v >> 2; + uint32_t end_idx_b = vertex >> 2; + while (beg_idx_b < end_idx_b) base_rank += kBdzLookupIndex[g_.data()[beg_idx_b++]]; + beg_idx_v = beg_idx_b << 2; + // cerr << "beg_idx_v: " << beg_idx_v << endl; + // cerr << "base rank: " << base_rank << endl; + // cerr << "G: "; + // for (unsigned int i = 0; i < n_; ++i) { + // cerr << static_cast(g_[i]) << " "; + // } + // cerr << endl; + while (beg_idx_v < vertex) { + if (g_[beg_idx_v] != kUnassigned) ++base_rank; + ++beg_idx_v; + } + // cerr << "Base rank: " << base_rank << endl; + return base_rank; +} + +} // namespace cxxmph diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h new file mode 100644 index 0000000..2a217bc --- /dev/null +++ b/cxxmph/mph_index.h @@ -0,0 +1,230 @@ +#ifndef __CXXMPH_MPH_INDEX_H__ +#define __CXXMPH_MPH_INDEX_H__ + +// Minimal perfect hash abstraction implementing the BDZ algorithm +// +// This is a data structure that given a set of known keys S, will create a +// mapping from S to [0..|S|). The class is informed about S through the Reset +// method and the mapping is queried by calling index(key). +// +// This is a pretty uncommon data structure, and if you application has a real +// use case for it, chances are that it is a real win. 
If all you are doing is +// a straightforward implementation of an in-memory associative mapping data +// structure (e.g., mph_map.h), then it will probably be slower, since that the +// evaluation of index() is typically slower than the total cost of running a +// traditional hash function over a key and doing 2-3 conflict resolutions on +// 100byte-ish strings. +// +// Thesis presenting this and similar algorithms: +// http://homepages.dcc.ufmg.br/~fbotelho/en/talks/thesis2008/thesis.pdf +// +// +// Notes: +// +// Most users can use the SimpleMPHIndex wrapper instead of the MPHIndex which +// have confusing template parameters. +// This class only implements a minimal perfect hash function, it does not +// implement an associative mapping data structure. + +#include + +#include +#include +#include +#include // for std::hash +#include + +#include + +using std::cerr; +using std::endl; + +#include "seeded_hash.h" +#include "mph_bits.h" +#include "trigraph.h" + +namespace cxxmph { + +class MPHIndex { + public: + MPHIndex(double c = 1.23, uint8_t b = 7) : + c_(c), b_(b), m_(0), n_(0), k_(0), r_(1), + ranktable_(NULL), ranktable_size_(0) { } + ~MPHIndex(); + + template + bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size); + template // must agree with Reset + // Get a unique identifier for k, in the range [0;size()). If x wasn't part + // of the input in the last Reset call, returns a random value. + uint32_t index(const Key& x) const; + uint32_t size() const { return m_; } + void clear(); + + // Advanced users functions. Please avoid unless you know what you are doing. + uint32_t perfect_hash_size() const { return n_; } + template // must agree with Reset + uint32_t perfect_hash(const Key& x) const; + template // must agree with Reset + uint32_t minimal_perfect_hash(const Key& x) const; + + // Crazy functions. Ignore. 
+ template // must agree with Reset + void hash_vector(const Key& x, uint32_t* h) const; + + private: + template + bool Mapping(ForwardIterator begin, ForwardIterator end, + std::vector* edges, + std::vector* queue); + bool GenerateQueue(TriGraph* graph, std::vector* queue); + void Assigning(const std::vector& edges, + const std::vector& queue); + void Ranking(); + uint32_t Rank(uint32_t vertex) const; + + // Algorithm parameters + // Perfect hash function density. If this was a 2graph, + // then probability of having an acyclic graph would be + // sqrt(1-(2/c)^2). See section 3 for details. + // http://www.it-c.dk/people/pagh/papers/simpleperf.pdf + double c_; + uint8_t b_; // Number of bits of the kth index in the ranktable + + // Values used during generation + uint32_t m_; // edges count + uint32_t n_; // vertex count + uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ + + // Values used during search + + // Partition vertex count, derived from c parameter. + uint32_t r_; + uint32_t nest_displacement_[3]; // derived from r_ + + // The array containing the minimal perfect hash function graph. + dynamic_2bitset g_; + uint8_t threebit_mod3[10]; // speed up mod3 calculation for 3bit ints + // The table used for the rank step of the minimal perfect hash function + const uint32_t* ranktable_; + uint32_t ranktable_size_; + // The selected hash seed triplet for finding the edges in the minimal + // perfect hash function graph. + uint32_t hash_seed_[3]; +}; + +// Template method needs to go in the header file. +template +bool MPHIndex::Reset( + ForwardIterator begin, ForwardIterator end, uint32_t size) { + if (end == begin) { + clear(); + return true; + } + m_ = size; + r_ = static_cast(ceil((c_*m_)/3)); + if ((r_ % 2) == 0) r_ += 1; + // This can be used to speed mods, but increases occupation too much. 
+ // Needs to try http://gmplib.org/manual/Integer-Exponentiation.html instead + // r_ = nextpoweroftwo(r_); + nest_displacement_[0] = 0; + nest_displacement_[1] = r_; + nest_displacement_[2] = (r_ << 1); + for (int i = 0; i < sizeof(threebit_mod3); ++i) threebit_mod3[i] = i % 3; + + n_ = 3*r_; + k_ = 1U << b_; + + // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; + + int iterations = 1000; + std::vector edges; + std::vector queue; + while (1) { + // cerr << "Iterations missing: " << iterations << endl; + for (int i = 0; i < 3; ++i) hash_seed_[i] = random(); + if (Mapping(begin, end, &edges, &queue)) break; + else --iterations; + if (iterations == 0) break; + } + if (iterations == 0) return false; + Assigning(edges, queue); + std::vector().swap(edges); + Ranking(); + return true; +} + +template +bool MPHIndex::Mapping( + ForwardIterator begin, ForwardIterator end, + std::vector* edges, std::vector* queue) { + TriGraph graph(n_, m_); + for (ForwardIterator it = begin; it != end; ++it) { + uint32_t h[4]; + SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast(&h)); + // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); + uint32_t v0 = h[0] % r_; + uint32_t v1 = h[1] % r_ + r_; + uint32_t v2 = h[2] % r_ + (r_ << 1); + // cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; + graph.AddEdge(TriGraph::Edge(v0, v1, v2)); + } + if (GenerateQueue(&graph, queue)) { + graph.ExtractEdgesAndClear(edges); + return true; + } + return false; +} + +template +void MPHIndex::hash_vector(const Key& key, uint32_t* h) const { + SeededHashFcn().hash64(key, hash_seed_[0], h); +} + +template +uint32_t MPHIndex::perfect_hash(const Key& key) const { + uint32_t h[4]; + if (!g_.size()) return 0; + SeededHashFcn().hash64(key, hash_seed_[0], h); + h[0] = (h[0] % r_) + nest_displacement_[0]; + h[1] = (h[1] % r_) + nest_displacement_[1]; + h[2] = (h[2] % r_) + nest_displacement_[2]; + // h[0] = (h[0] & 
(r_-1)) + nest_displacement_[0]; + // h[1] = (h[1] & (r_-1)) + nest_displacement_[1]; + // h[2] = (h[2] & (r_-1)) + nest_displacement_[2]; + assert((h[0]) < g_.size()); + assert((h[1]) < g_.size()); + assert((h[2]) < g_.size()); + uint8_t nest = threebit_mod3[ + g_[h[0]] + g_[h[1]] + g_[h[2]]]; + uint32_t vertex = h[nest]; + return vertex; +} +template +uint32_t MPHIndex::minimal_perfect_hash(const Key& key) const { + return Rank(perfect_hash(key)); +} + +template +uint32_t MPHIndex::index(const Key& key) const { + return minimal_perfect_hash(key); +} + +// Simple wrapper around MPHIndex to simplify calling code. Please refer to the +// MPHIndex class for documentation. +template >::hash_function> +class SimpleMPHIndex : public MPHIndex { + public: + template + bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size) { + return MPHIndex::Reset(begin, end, size); + } + uint32_t index(const Key& key) const { return MPHIndex::index(key); } + uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash(key); } + uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash(key); } + void hash_vector(const Key& key, uint32_t* h) const { MPHIndex::hash_vector(key, h); } +}; + +} // namespace cxxmph + +#endif // __CXXMPH_MPH_INDEX_H__ diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc new file mode 100644 index 0000000..d414bac --- /dev/null +++ b/cxxmph/mph_index_test.cc @@ -0,0 +1,42 @@ +#include +#include +#include +#include + +#include "mph_index.h" + +using std::string; +using std::vector; +using namespace cxxmph; + +int main(int argc, char** argv) { + + srand(1); + vector keys; + keys.push_back("davi"); + keys.push_back("paulo"); + keys.push_back("joao"); + keys.push_back("maria"); + keys.push_back("bruno"); + keys.push_back("paula"); + keys.push_back("diego"); + keys.push_back("diogo"); + keys.push_back("algume"); + + SimpleMPHIndex mph_index; + if (!mph_index.Reset(keys.begin(), keys.end(), 
keys.size())) { exit(-1); } + vector ids; + for (vector::size_type i = 0; i < keys.size(); ++i) { + ids.push_back(mph_index.index(keys[i])); + cerr << " " << *(ids.end() - 1); + } + cerr << endl; + sort(ids.begin(), ids.end()); + for (vector::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast::value_type>(i)); + /* + char* serialized = new char[mph_index.serialize_bytes_needed()]; + mph_index.serialize(serialized); + SimpleMPHIndex other_mph_index; + other_mph_index.deserialize(serialized); + */ +} diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h new file mode 100644 index 0000000..9440fe8 --- /dev/null +++ b/cxxmph/mph_map.h @@ -0,0 +1,261 @@ +#ifndef __CXXMPH_MPH_MAP_H__ +#define __CXXMPH_MPH_MAP_H__ +// Implementation of the unordered associative mapping interface using a +// minimal perfect hash function. +// +// This class is about 20% to 100% slower than unordered_map (or ext/hash_map) +// and should not be used if performance is a concern. In fact, you should only +// use it for educational purposes. +// +// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl + +#include +#include +#include +#include +#include +#include +#include // for std::pair + +#include "mph_bits.h" +#include "mph_index.h" +#include "hollow_iterator.h" + +namespace cxxmph { + +using std::pair; +using std::make_pair; +using std::unordered_map; +using std::vector; + +// Save on repetitive typing. 
+#define MPH_MAP_TMPL_SPEC template +#define MPH_MAP_CLASS_SPEC mph_map +#define MPH_MAP_METHOD_DECL(r, m) MPH_MAP_TMPL_SPEC typename MPH_MAP_CLASS_SPEC::r MPH_MAP_CLASS_SPEC::m + +template , class EqualKey = std::equal_to, class Alloc = std::allocator > +class mph_map { + public: + typedef Key key_type; + typedef Data data_type; + typedef pair value_type; + typedef HashFcn hasher; + typedef EqualKey key_equal; + + typedef typename std::vector::pointer pointer; + typedef typename std::vector::reference reference; + typedef typename std::vector::const_reference const_reference; + typedef typename std::vector::size_type size_type; + typedef typename std::vector::difference_type difference_type; + + typedef hollow_iterator> iterator; + typedef hollow_const_iterator> const_iterator; + + // For making macros simpler. + typedef void void_type; + typedef bool bool_type; + typedef pair insert_return_type; + + mph_map(); + ~mph_map(); + + iterator begin(); + iterator end(); + const_iterator begin() const; + const_iterator end() const; + size_type size() const; + bool empty() const; + void clear(); + void erase(iterator pos); + void erase(const key_type& k); + pair insert(const value_type& x); + iterator find(const key_type& k) { return slow_find(k, index_.perfect_hash(k)); } + const_iterator find(const key_type& k) const { return slow_find(k, index_.perfect_hash(k)); }; + typedef int32_t my_int32_t; // help macros + int32_t index(const key_type& k) const; + data_type& operator[](const key_type &k); + const data_type& operator[](const key_type &k) const; + + size_type bucket_count() const { return index_.perfect_hash_size() + slack_.bucket_count(); } + void rehash(size_type nbuckets /*ignored*/); + + protected: // mimicking STL implementation + EqualKey equal_; + + private: + template + struct iterator_first : public iterator { + iterator_first(iterator it) : iterator(it) { } + const typename iterator::value_type::first_type& operator*() { + return 
this->iterator::operator*().first; + } + }; + + template + iterator_first make_iterator_first(iterator it) { + return iterator_first(it); + } + + iterator make_iterator(typename std::vector::iterator it) { + return hollow_iterator>(&values_, &present_, it); + } + const_iterator make_iterator(typename std::vector::const_iterator it) const { + return hollow_const_iterator>(&values_, &present_, it); + } + + // Experimental functions, not always faster + iterator fast_find(const key_type& k); + iterator slow_find(const key_type& k, uint32_t perfect_hash); + const_iterator slow_find(const key_type& k, uint32_t perfect_hash) const; + + void pack(); + std::vector values_; + std::vector present_; + SimpleMPHIndex::hash_function> index_; + // TODO(davi) optimize slack to hold 128 unique bits from hash64 as key + typedef unordered_map slack_type; + slack_type slack_; + size_type size_; + + mutable uint64_t fast_; + mutable uint64_t fast_taken_; + mutable uint64_t slow_; + mutable uint64_t very_slow_; +}; + +MPH_MAP_TMPL_SPEC +bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { + return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); +} + +MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) { + clear(); + pack(); +} + +MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { + // fprintf(stderr, "Fast taken: %d Fast: %d Slow %d very_slow %d ratio %f\n", fast_taken_, fast_, slow_, very_slow_, fast_*1.0/slow_); +} + +MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { + auto it = find(x.first); + auto it_end = end(); + if (it != it_end) return make_pair(it, false); + bool should_pack = false; + if (values_.capacity() == values_.size() && values_.size() > 256) { + should_pack = true; + } + values_.push_back(x); + present_.push_back(true); + ++size_; + slack_.insert(make_pair(x.first, values_.size() - 1)); + if (should_pack) pack(); + it = find(x.first); + slow_ = 0; + very_slow_ = 0; + fast_ = 0; + 
fast_taken_ = 0; + return make_pair(it, true); +} + +MPH_MAP_METHOD_DECL(void_type, pack)() { + // fprintf(stderr, "Paki %d values\n", values_.size()); + if (values_.empty()) return; + assert(std::unordered_set(make_iterator_first(begin()), make_iterator_first(end())).size() == size()); + bool success = index_.Reset( + make_iterator_first(begin()), + make_iterator_first(end()), size_); + assert(success); + std::vector new_values(index_.perfect_hash_size()); + new_values.reserve(new_values.size() * 2); + std::vector new_present(index_.perfect_hash_size(), false); + new_present.reserve(new_present.size() * 2); + for (iterator it = begin(), it_end = end(); it != it_end; ++it) { + size_type id = index_.perfect_hash(it->first); + assert(id < new_values.size()); + new_values[id] = *it; + new_present[id] = true; + } + // fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size()); + values_.swap(new_values); + present_.swap(new_present); + slack_type().swap(slack_); +} + +MPH_MAP_METHOD_DECL(iterator, begin)() { return make_iterator(values_.begin()); } +MPH_MAP_METHOD_DECL(iterator, end)() { return make_iterator(values_.end()); } +MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return make_iterator(values_.begin()); } +MPH_MAP_METHOD_DECL(const_iterator, end)() const { return make_iterator(values_.end()); } +MPH_MAP_METHOD_DECL(bool_type, empty)() const { return size_ == 0; } +MPH_MAP_METHOD_DECL(size_type, size)() const { return size_; } + +MPH_MAP_METHOD_DECL(void_type, clear)() { + values_.clear(); + present_.clear(); + slack_.clear(); + index_.clear(); + size_ = 0; +} + +MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { + present_[pos - begin] = false; + uint32_t h[4]; + index_.hash_vector(pos->first, &h); + *pos = value_type(); + --size_; +} +MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { + iterator it = find(k); + if (it == end()) return; + erase(it); +} + +MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t 
perfect_hash) const { + if (__builtin_expect(index_.perfect_hash_size(), 1)) { + if (__builtin_expect(present_[perfect_hash], true)) { + auto vit = values_.begin() + perfect_hash; + if (equal_(k, vit->first)) return make_iterator(vit); + } + } + if (__builtin_expect(!slack_.empty(), 0)) { + ++very_slow_; + auto sit = slack_.find(k); + if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); + } + return end(); +} + +MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) { + if (__builtin_expect(index_.perfect_hash_size(), 1)) { + if (__builtin_expect(present_[perfect_hash], true)) { + auto vit = values_.begin() + perfect_hash; + if (equal_(k, vit->first)) return make_iterator(vit); + } + } + if (__builtin_expect(!slack_.empty(), 0)) { + ++very_slow_; + auto sit = slack_.find(k); + if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); + } + return end(); +} + +MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const { + if (index_.size() == 0) return -1; + return index_.perfect_hash(k); +} + +MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { + return insert(make_pair(k, data_type())).first->second; +} +MPH_MAP_METHOD_DECL(void_type, rehash)(size_type nbuckets) { + pack(); + vector(values_.begin(), values_.end()).swap(values_); + vector(present_.begin(), present_.end()).swap(present_); + slack_type().swap(slack_); +} + + +} // namespace cxxmph + +#endif // __CXXMPH_MPH_MAP_H__ diff --git a/cxxmph/mph_map_test.cc b/cxxmph/mph_map_test.cc new file mode 100644 index 0000000..dd8eb5a --- /dev/null +++ b/cxxmph/mph_map_test.cc @@ -0,0 +1,61 @@ +#include +#include +#include +#include + +#include "mph_map.h" + +using std::make_pair; +using std::string; +using cxxmph::mph_map; + +int main(int argc, char** argv) { + mph_map b; + int32_t num_keys = 1000*10; + for (int i = 0; i < num_keys; ++i) { + b.insert(make_pair(i, i)); + } + b.rehash(b.size()); + for (int i = 0; i < 1000000; 
++i) { + auto it = b.find(i % num_keys); + if (it == b.end()) { + std::cerr << "Failed to find " << i << std::endl; + exit(-1); + } + if (it->first != it->second || it->first != i % num_keys) { + std::cerr << "Found " << it->first << " looking for " << i << std::endl; + exit(-1); + } + } + /* + mph_map h; + h.insert(std::make_pair("-1",-1)); + mph_map::const_iterator it; + for (it = h.begin(); it != h.end(); ++it) { + if (it->second != -1) exit(-1); + } + int32_t num_valid = 100; + for (int i = 0; i < num_valid; ++i) { + char buf[10]; + snprintf(buf, 10, "%d", i); + h.insert(std::make_pair(buf, i)); + } + for (int j = 0; j < 100; ++j) { + for (int i = 1000; i > 0; --i) { + char buf[10]; + snprintf(buf, 10, "%d", i - 1); + auto it = h.find(buf); + if (i < num_valid && it->second != i - 1) exit(-1); + } + } + for (int j = 0; j < 100; ++j) { + for (int i = 1000; i > 0; --i) { + char buf[10]; + int key = i*100 - 1; + snprintf(buf, 10, "%d", key); + auto it = h.find(buf); + if (key < num_valid && it->second != key) exit(-1); + } + } + */ +} diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h new file mode 100644 index 0000000..0979ef1 --- /dev/null +++ b/cxxmph/seeded_hash.h @@ -0,0 +1,154 @@ +#ifndef __CXXMPH_SEEDED_HASH_H__ +#define __CXXMPH_SEEDED_HASH_H__ + +#include // for uint32_t and friends + +#include +#include // for std::hash + +#include "MurmurHash3.h" +#include "stringpiece.h" + +// From murmur, only used naively to extend 32 bits functions to 128 bits. +uint32_t fmix ( uint32_t h ); +// Used for a quick and dirty hash function for integers. Probably a bad idea. 
+uint64_t fmix ( uint64_t h ); + +namespace cxxmph { + +template +struct seeded_hash_function { + template + uint32_t operator()(const Key& k, uint32_t seed) const { + return HashFcn()(k) ^ seed; + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + for (int i = 0; i < 4; ++i) { + out[i] = HashFcn()(k) ^ seed; + seed = fmix(seed); + } + } +}; + +struct Murmur3 { + template + uint32_t operator()(const Key& k) const { + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, out); + } +}; + +struct Murmur3StringPiece { + template + uint32_t operator()(const Key& k) const { + StringPiece s(k); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), 1 /* seed */, out); + } +}; + +struct Murmur3Fmix64bitsType { + template + uint32_t operator()(const Key& k) const { + return fmix(*reinterpret_cast(&k)); + } + template + void hash64(const Key& k, uint32_t* out) const { + *reinterpret_cast(out) = fmix(k); + *(out + 2) = fmix(*out); + } +}; + +template <> +struct seeded_hash_function { + template + uint32_t operator()(const Key& k, uint32_t seed) const { + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), seed, &out); + return out; + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), seed, out); + } +}; + +template <> +struct seeded_hash_function { + template + uint32_t operator()(const Key& k, uint32_t seed) const { + StringPiece s(k); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), seed, &out); + return out; + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) 
const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), seed, out); + } +}; + +template <> +struct seeded_hash_function { + template + uint32_t operator()(const Key& k, uint32_t seed) const { + return fmix(k + seed); + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + *reinterpret_cast(out) = fmix(k ^ seed); + *(out + 2) = fmix(*out); + } +}; + + +template struct seeded_hash +{ typedef seeded_hash_function hash_function; }; +// Use Murmur3 instead for all types defined in std::hash, plus +// std::string which is commonly extended. +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; + +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; + +} // namespace cxxmph + +#endif // __CXXMPH_SEEDED_HASH_H__ diff --git a/cxxmph/stringpiece.h b/cxxmph/stringpiece.h new file mode 100644 index 0000000..06cea3a --- /dev/null +++ b/cxxmph/stringpiece.h @@ -0,0 +1,182 @@ +// Copyright 
2001-2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// A string-like object that points to a sized piece of memory. +// +// Functions or methods may use const StringPiece& parameters to accept either +// a "const char*" or a "string" value that will be implicitly converted to +// a StringPiece. The implicit conversion means that it is often appropriate +// to include this .h file in other files rather than forward-declaring +// StringPiece as would be appropriate for most other Google classes. +// +// Systematic usage of StringPiece is encouraged as it will reduce unnecessary +// conversions from "const char*" to "string" and back again. +// +// +// Arghh! I wish C++ literals were "string". + +#ifndef CXXMPH_STRINGPIECE_H__ +#define CXXMPH_STRINGPIECE_H__ + +#include +#include +#include +#include + +namespace cxxmph { + +class StringPiece { + private: + const char* ptr_; + int length_; + + public: + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. + StringPiece() : ptr_(NULL), length_(0) { } + StringPiece(const char* str) + : ptr_(str), length_((str == NULL) ? 0 : static_cast(strlen(str))) { } + StringPiece(const std::string& str) + : ptr_(str.data()), length_(static_cast(str.size())) { } + StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { } + + // data() may return a pointer to a buffer with embedded NULs, and the + // returned buffer may or may not be null terminated. Therefore it is + // typically a mistake to pass data() to a routine that expects a NUL + // terminated string. 
+ const char* data() const { return ptr_; } + int size() const { return length_; } + int length() const { return length_; } + bool empty() const { return length_ == 0; } + + void clear() { ptr_ = NULL; length_ = 0; } + void set(const char* data, int len) { ptr_ = data; length_ = len; } + void set(const char* str) { + ptr_ = str; + if (str != NULL) + length_ = static_cast(strlen(str)); + else + length_ = 0; + } + void set(const void* data, int len) { + ptr_ = reinterpret_cast(data); + length_ = len; + } + + char operator[](int i) const { return ptr_[i]; } + + void remove_prefix(int n) { + ptr_ += n; + length_ -= n; + } + + void remove_suffix(int n) { + length_ -= n; + } + + int compare(const StringPiece& x) const { + int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_)); + if (r == 0) { + if (length_ < x.length_) r = -1; + else if (length_ > x.length_) r = +1; + } + return r; + } + + std::string as_string() const { + return std::string(data(), size()); + } + // We also define ToString() here, since many other string-like + // interfaces name the routine that converts to a C++ string + // "ToString", and it's confusing to have the method that does that + // for a StringPiece be called "as_string()". We also leave the + // "as_string()" method defined here for existing code. 
+ std::string ToString() const { + return std::string(data(), size()); + } + + void CopyToString(std::string* target) const; + void AppendToString(std::string* target) const; + + // Does "this" start with "x" + bool starts_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (memcmp(ptr_, x.ptr_, x.length_) == 0)); + } + + // Does "this" end with "x" + bool ends_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); + } + + // standard STL container boilerplate + typedef char value_type; + typedef const char* pointer; + typedef const char& reference; + typedef const char& const_reference; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + static const size_type npos; + typedef const char* const_iterator; + typedef const char* iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + iterator begin() const { return ptr_; } + iterator end() const { return ptr_ + length_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(ptr_ + length_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(ptr_); + } + // STLS says return size_type, but Google says return int + int max_size() const { return length_; } + int capacity() const { return length_; } + + int copy(char* buf, size_type n, size_type pos = 0) const; + + int find(const StringPiece& s, size_type pos = 0) const; + int find(char c, size_type pos = 0) const; + int rfind(const StringPiece& s, size_type pos = npos) const; + int rfind(char c, size_type pos = npos) const; + + StringPiece substr(size_type pos, size_type n = npos) const; +}; + +inline bool operator==(const StringPiece& x, const StringPiece& y) { + return x.length() == y.length() && memcmp(x.data(), y.data(), x.length()) == 0; +} + +inline bool operator!=(const StringPiece& x, const StringPiece& y) { + return !(x == y); +} + +inline 
bool operator<(const StringPiece& x, const StringPiece& y) { + const int r = memcmp(x.data(), y.data(), + std::min(x.size(), y.size())); + return ((r < 0) || ((r == 0) && (x.size() < y.size()))); +} + +inline bool operator>(const StringPiece& x, const StringPiece& y) { + return y < x; +} + +inline bool operator<=(const StringPiece& x, const StringPiece& y) { + return !(x > y); +} + +inline bool operator>=(const StringPiece& x, StringPiece& y) { + return !(x < y); +} + +} // namespace cxxmph + +// allow StringPiece to be logged +inline std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece) { + o << piece.as_string(); return o; +} + +#endif // CXXMPH_STRINGPIECE_H__ diff --git a/cxxmph/trigraph.cc b/cxxmph/trigraph.cc new file mode 100644 index 0000000..5e9fd66 --- /dev/null +++ b/cxxmph/trigraph.cc @@ -0,0 +1,81 @@ +#include +#include +#include + +#include "trigraph.h" + +using std::cerr; +using std::endl; +using std::vector; + +namespace { +static const uint32_t kInvalidEdge = std::numeric_limits::max(); +} + +namespace cxxmph { + +TriGraph::TriGraph(uint32_t nvertices, uint32_t nedges) + : nedges_(0), + edges_(nedges), + next_edge_(nedges), + first_edge_(nvertices, kInvalidEdge), + vertex_degree_(nvertices, 0) { } + +void TriGraph::ExtractEdgesAndClear(vector* edges) { + vector().swap(next_edge_); + vector().swap(first_edge_); + vector().swap(vertex_degree_); + nedges_ = 0; + edges->swap(edges_); +} +void TriGraph::AddEdge(const Edge& edge) { + edges_[nedges_] = edge; + assert(first_edge_.size() > edge[0]); + assert(first_edge_.size() > edge[1]); + assert(first_edge_.size() > edge[0]); + assert(first_edge_.size() > edge[1]); + assert(first_edge_.size() > edge[2]); + assert(next_edge_.size() > nedges_); + next_edge_[nedges_] = Edge( + first_edge_[edge[0]], first_edge_[edge[1]], first_edge_[edge[2]]); + first_edge_[edge[0]] = first_edge_[edge[1]] = first_edge_[edge[2]] = nedges_; + ++vertex_degree_[edge[0]]; + ++vertex_degree_[edge[1]]; + 
++vertex_degree_[edge[2]]; + ++nedges_; +} + +void TriGraph::RemoveEdge(uint32_t current_edge) { + // cerr << "Removing edge " << current_edge << " from " << nedges_ << " existing edges " << endl; + for (int i = 0; i < 3; ++i) { + uint32_t vertex = edges_[current_edge][i]; + uint32_t edge1 = first_edge_[vertex]; + uint32_t edge2 = kInvalidEdge; + uint32_t j = 0; + while (edge1 != current_edge && edge1 != kInvalidEdge) { + edge2 = edge1; + if (edges_[edge1][0] == vertex) j = 0; + else if (edges_[edge1][1] == vertex) j = 1; + else j = 2; + edge1 = next_edge_[edge1][j]; + } + assert(edge1 != kInvalidEdge); + if (edge2 != kInvalidEdge) next_edge_[edge2][j] = next_edge_[edge1][i]; + else first_edge_[vertex] = next_edge_[edge1][i]; + --vertex_degree_[vertex]; + } +} + +void TriGraph::DebugGraph() const { + int i; + for(i = 0; i < edges_.size(); i++){ + cerr << i << " " << edges_[i][0] << " " << edges_[i][1] << " " << edges_[i][2] + << " nexts " << next_edge_[i][0] << " " << next_edge_[i][1] << " " << next_edge_[i][2] << endl; + } + for(i = 0; i < first_edge_.size();i++){ + cerr << "first for vertice " < // for uint32_t and friends + +#include + +namespace cxxmph { + +class TriGraph { + public: + struct Edge { + Edge() { } + Edge(uint32_t v0, uint32_t v1, uint32_t v2) { + vertices[0] = v0; + vertices[1] = v1; + vertices[2] = v2; + } + uint32_t& operator[](uint8_t v) { return vertices[v]; } + const uint32_t& operator[](uint8_t v) const { return vertices[v]; } + uint32_t vertices[3]; + }; + TriGraph(uint32_t nedges, uint32_t nvertices); + void AddEdge(const Edge& edge); + void RemoveEdge(uint32_t edge_id); + void ExtractEdgesAndClear(std::vector* edges); + void DebugGraph() const; + + const std::vector& edges() const { return edges_; } + const std::vector& vertex_degree() const { return vertex_degree_; } + const std::vector& first_edge() const { return first_edge_; } + + private: + uint32_t nedges_; // total number of edges + std::vector edges_; + std::vector next_edge_; // 
for implementing removal + std::vector first_edge_; // the first edge for this vertex + std::vector vertex_degree_; // number of edges for this vertex +}; + +} // namespace cxxmph + +#endif // __CXXMPH_TRIGRAPH_H__ diff --git a/cxxmph/trigraph_test.cc b/cxxmph/trigraph_test.cc new file mode 100644 index 0000000..6220138 --- /dev/null +++ b/cxxmph/trigraph_test.cc @@ -0,0 +1,22 @@ +#include + +#include "trigraph.h" + +using cxxmph::TriGraph; + +int main(int argc, char** argv) { + TriGraph g(4, 2); + g.AddEdge(TriGraph::Edge(0, 1, 2)); + g.AddEdge(TriGraph::Edge(1, 3, 2)); + assert(g.vertex_degree()[0] == 1); + assert(g.vertex_degree()[1] == 2); + assert(g.vertex_degree()[2] == 2); + assert(g.vertex_degree()[3] == 1); + g.RemoveEdge(0); + assert(g.vertex_degree()[0] == 0); + assert(g.vertex_degree()[1] == 1); + assert(g.vertex_degree()[2] == 1); + assert(g.vertex_degree()[3] == 1); + std::vector edges; + g.ExtractEdgesAndClear(&edges); +} diff --git a/src/Makefile.am b/src/Makefile.am index f3896dc..0ab079a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,8 +1,9 @@ bin_PROGRAMS = cmph +noinst_PROGRAMS = bm_numbers lib_LTLIBRARIES = libcmph.la include_HEADERS = cmph.h cmph_types.h cmph_time.h chd_ph.h libcmph_la_SOURCES = hash.h hash.c \ - jenkins_hash.h jenkins_hash.c\ + jenkins_hash.h jenkins_hash.c MurmurHash2.h\ hash_state.h debug.h \ vstack.h vstack.c vqueue.h vqueue.c\ graph.h graph.c bitbool.h \ @@ -23,9 +24,14 @@ libcmph_la_SOURCES = hash.h hash.c \ select.h select.c select_lookup_tables.h \ compressed_seq.h compressed_seq.c \ compressed_rank.h compressed_rank.c \ + linear_string_map.h linear_string_map.c \ + cmph_benchmark.h cmph_benchmark.c \ cmph_time.h libcmph_la_LDFLAGS = -version-info 0:0:0 cmph_SOURCES = main.c wingetopt.h wingetopt.c cmph_LDADD = libcmph.la + +bm_numbers_SOURCES = bm_numbers.c +bm_numbers_LDADD = libcmph.la diff --git a/src/bdz.c b/src/bdz.c index e6ce700..2c0de90 100755 --- a/src/bdz.c +++ b/src/bdz.c @@ -9,7 +9,7 @@ #include 
#include #include -//#define DEBUG +// #define DEBUG #include "debug.h" #define UNASSIGNED 3U #define NULL_EDGE 0xffffffff @@ -35,9 +35,9 @@ const cmph_uint8 bdz_lookup_table[] = 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 -}; +}; -typedef struct +typedef struct { cmph_uint32 vertices[3]; cmph_uint32 next_edges[3]; @@ -54,12 +54,12 @@ static void bdz_free_queue(bdz_queue_t * queue) free(*queue); }; -typedef struct +typedef struct { cmph_uint32 nedges; bdz_edge_t * edges; cmph_uint32 * first_edge; - cmph_uint8 * vert_degree; + cmph_uint8 * vert_degree; }bdz_graph3_t; @@ -67,7 +67,7 @@ static void bdz_alloc_graph3(bdz_graph3_t * graph3, cmph_uint32 nedges, cmph_uin { graph3->edges=malloc(nedges*sizeof(bdz_edge_t)); graph3->first_edge=malloc(nvertices*sizeof(cmph_uint32)); - graph3->vert_degree=malloc((size_t)nvertices); + graph3->vert_degree=malloc((size_t)nvertices); }; static void bdz_init_graph3(bdz_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices) { @@ -115,10 +115,12 @@ static void bdz_dump_graph(bdz_graph3_t* graph3, cmph_uint32 nedges, cmph_uint32 graph3->edges[i].next_edges[1],graph3->edges[i].next_edges[2]); }; + #ifdef DEBUG for(i=0;ifirst_edge[i]); }; + #endif }; static void bdz_remove_edge(bdz_graph3_t * graph3, cmph_uint32 curr_edge) @@ -134,7 +136,7 @@ static void bdz_remove_edge(bdz_graph3_t * graph3, cmph_uint32 curr_edge) j=0; } else if(graph3->edges[edge1].vertices[1]==vert){ j=1; - } else + } else j=2; edge1=graph3->edges[edge1].next_edges[j]; }; @@ -143,16 +145,16 @@ static void bdz_remove_edge(bdz_graph3_t * graph3, cmph_uint32 curr_edge) bdz_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4); exit(-1); }; - + if(edge2!=NULL_EDGE){ - graph3->edges[edge2].next_edges[j] = + graph3->edges[edge2].next_edges[j] = graph3->edges[edge1].next_edges[i]; - } else + } else graph3->first_edge[vert]= graph3->edges[edge1].next_edges[i]; 
graph3->vert_degree[vert]--; }; - + }; static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_queue_t queue, bdz_graph3_t* graph3) @@ -168,7 +170,7 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que v0=graph3->edges[i].vertices[0]; v1=graph3->edges[i].vertices[1]; v2=graph3->edges[i].vertices[2]; - if(graph3->vert_degree[v0]==1 || + if(graph3->vert_degree[v0]==1 || graph3->vert_degree[v1]==1 || graph3->vert_degree[v2]==1){ if(!GETBIT(marked_edge,i)) { @@ -177,9 +179,14 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que } }; }; + DEBUGP("Queue head %d Queue tail %d\n", queue_head, queue_tail); + #ifdef DEBUG + bdz_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4); + #endif while(queue_tail!=queue_head){ curr_edge=queue[queue_tail++]; bdz_remove_edge(graph3,curr_edge); + DEBUGP("Removing edge %d\n", curr_edge); v0=graph3->edges[curr_edge].vertices[0]; v1=graph3->edges[curr_edge].vertices[1]; v2=graph3->edges[curr_edge].vertices[2]; @@ -189,7 +196,7 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que queue[queue_head++]=tmp_edge; SETBIT(marked_edge,tmp_edge); }; - + }; if(graph3->vert_degree[v1]==1) { tmp_edge=graph3->first_edge[v1]; @@ -197,7 +204,7 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que queue[queue_head++]=tmp_edge; SETBIT(marked_edge,tmp_edge); }; - + }; if(graph3->vert_degree[v2]==1){ tmp_edge=graph3->first_edge[v2]; @@ -220,7 +227,7 @@ bdz_config_data_t *bdz_config_new(void) { bdz_config_data_t *bdz; bdz = (bdz_config_data_t *)malloc(sizeof(bdz_config_data_t)); - assert(bdz); + if (!bdz) return NULL; memset(bdz, 0, sizeof(bdz_config_data_t)); bdz->hashfunc = CMPH_HASH_JENKINS; bdz->g = NULL; @@ -321,10 +328,10 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) fprintf(stderr, "acyclic graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } + } 
else break; } - + if (iterations == 0) { bdz_free_queue(&edges); @@ -346,7 +353,7 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) fprintf(stderr, "Entering ranking step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n); } ranking(bdz); - #ifdef CMPH_TIMING + #ifdef CMPH_TIMING ELAPSED_TIME_IN_SECONDS(&construction_time); #endif mphf = (cmph_t *)malloc(sizeof(cmph_t)); @@ -374,17 +381,17 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) } - #ifdef CMPH_TIMING + #ifdef CMPH_TIMING register cmph_uint32 space_usage = bdz_packed_size(mphf)*8; register cmph_uint32 keys_per_bucket = 1; construction_time = construction_time - construction_time_begin; fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz->m, bdz->m/(double)bdz->n, keys_per_bucket, construction_time, space_usage/(double)bdz->m); - #endif + #endif return mphf; } - + static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t queue) { cmph_uint32 e; @@ -398,15 +405,16 @@ static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t que cmph_uint32 h0, h1, h2; cmph_uint32 keylen; char *key = NULL; - mph->key_source->read(mph->key_source->data, &key, &keylen); + mph->key_source->read(mph->key_source->data, &key, &keylen); hash_vector(bdz->hl, key, keylen,hl); h0 = hl[0] % bdz->r; h1 = hl[1] % bdz->r + bdz->r; h2 = hl[2] % bdz->r + (bdz->r << 1); + DEBUGP("Key: %.*s (%u %u %u)\n", keylen, key, h0, h1, h2); mph->key_source->dispose(mph->key_source->data, key, keylen); bdz_add_edge(graph3,h0,h1,h2); } - cycles = bdz_generate_queue(bdz->m, bdz->n, queue, graph3); + cycles = bdz_generate_queue(bdz->m, bdz->n, queue, graph3); return (cycles == 0); } @@ -418,7 +426,7 @@ static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t cmph_uint32 v0,v1,v2; cmph_uint8 * marked_vertices =malloc((size_t)(bdz->n >> 3) + 1); cmph_uint32 sizeg = (cmph_uint32)ceil(bdz->n/4.0); - bdz->g = (cmph_uint8 *)calloc((size_t)(sizeg), sizeof(cmph_uint8)); + bdz->g = 
(cmph_uint8 *)calloc((size_t)(sizeg), sizeof(cmph_uint8)); memset(marked_vertices, 0, (size_t)(bdz->n >> 3) + 1); memset(bdz->g, 0xff, (size_t)(sizeg)); @@ -427,16 +435,16 @@ static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t v0=graph3->edges[curr_edge].vertices[0]; v1=graph3->edges[curr_edge].vertices[1]; v2=graph3->edges[curr_edge].vertices[2]; - DEBUGP("B:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2)); + DEBUGP("B:%u %u %u -- %u %u %u edge %u\n", v0, v1, v2, GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2), curr_edge); if(!GETBIT(marked_vertices, v0)){ if(!GETBIT(marked_vertices,v1)) { - SETVALUE1(bdz->g, v1, UNASSIGNED); + SETVALUE1(bdz->g, v1, UNASSIGNED); SETBIT(marked_vertices, v1); } if(!GETBIT(marked_vertices,v2)) { - SETVALUE1(bdz->g, v2, UNASSIGNED); + SETVALUE1(bdz->g, v2, UNASSIGNED); SETBIT(marked_vertices, v2); } SETVALUE1(bdz->g, v0, (6-(GETVALUE(bdz->g, v1) + GETVALUE(bdz->g,v2)))%3); @@ -499,7 +507,7 @@ int bdz_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->r), sizeof(cmph_uint32), (size_t)1, fd); - + cmph_uint32 sizeg = (cmph_uint32)ceil(data->n/4.0); nbytes = fwrite(data->g, sizeof(cmph_uint8)*sizeg, (size_t)1, fd); @@ -533,12 +541,12 @@ void bdz_load(FILE *f, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, f); bdz->hl = hash_state_load(buf, buflen); free(buf); - + DEBUGP("Reading m and n\n"); - nbytes = fread(&(bdz->n), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bdz->m), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bdz->r), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz->r), sizeof(cmph_uint32), (size_t)1, f); sizeg = 
(cmph_uint32)ceil(bdz->n/4.0); bdz->g = (cmph_uint8 *)calloc((size_t)(sizeg), sizeof(cmph_uint8)); nbytes = fread(bdz->g, sizeg*sizeof(cmph_uint8), (size_t)1, f); @@ -558,7 +566,7 @@ void bdz_load(FILE *f, cmph_t *mphf) #endif return; } - + static inline cmph_uint32 rank(cmph_uint32 b, cmph_uint32 * ranktable, cmph_uint8 * g, cmph_uint32 vertex) { @@ -570,15 +578,17 @@ static inline cmph_uint32 rank(cmph_uint32 b, cmph_uint32 * ranktable, cmph_uint while(beg_idx_b < end_idx_b) { base_rank += bdz_lookup_table[*(g + beg_idx_b++)]; - + } + DEBUGP("base rank %u\n", base_rank); beg_idx_v = beg_idx_b << 2; - while(beg_idx_v < vertex) + DEBUGP("beg_idx_v %u\n", beg_idx_v); + while(beg_idx_v < vertex) { if(GETVALUE(g, beg_idx_v) != UNASSIGNED) base_rank++; beg_idx_v++; } - + return base_rank; } @@ -592,6 +602,7 @@ cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) hl[1] = hl[1] % bdz->r + bdz->r; hl[2] = hl[2] % bdz->r + (bdz->r << 1); vertex = hl[(GETVALUE(bdz->g, hl[0]) + GETVALUE(bdz->g, hl[1]) + GETVALUE(bdz->g, hl[2])) % 3]; + DEBUGP("Search found vertex %u\n", vertex); return rank(bdz->b, bdz->ranktable, bdz->g, vertex); } @@ -599,7 +610,7 @@ cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) void bdz_destroy(cmph_t *mphf) { bdz_data_t *data = (bdz_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hl); free(data->ranktable); free(data); @@ -649,18 +660,18 @@ void bdz_pack(cmph_t *mphf, void *packed_mphf) * \brief Return the amount of space needed to pack mphf. 
* \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 bdz_packed_size(cmph_t *mphf) { bdz_data_t *data = (bdz_data_t *)mphf->data; - CMPH_HASH hl_type = hash_get_type(data->hl); + CMPH_HASH hl_type = hash_get_type(data->hl); return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(hl_type) + 3*sizeof(cmph_uint32) + sizeof(cmph_uint32)*(data->ranktablesize) + sizeof(cmph_uint8) + sizeof(cmph_uint8)* (cmph_uint32)(ceil(data->n/4.0))); } /** cmph_uint32 bdz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. * \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -668,13 +679,13 @@ cmph_uint32 bdz_packed_size(cmph_t *mphf) */ cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) { - + register cmph_uint32 vertex; register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; register cmph_uint32 *ranktable = (cmph_uint32*)(hl_ptr + hash_state_packed_size(hl_type)); - + register cmph_uint32 r = *ranktable++; register cmph_uint32 ranktablesize = *ranktable++; register cmph_uint8 * g = (cmph_uint8 *)(ranktable + ranktablesize); diff --git a/src/bdz_ph.c b/src/bdz_ph.c index 16257c0..ad52d78 100755 --- a/src/bdz_ph.c +++ b/src/bdz_ph.c @@ -24,7 +24,7 @@ static cmph_uint8 lookup_table[5][256] = { {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, }; -typedef struct +typedef struct { cmph_uint32 vertices[3]; cmph_uint32 next_edges[3]; @@ -41,12 +41,12 @@ static void bdz_ph_free_queue(bdz_ph_queue_t * queue) free(*queue); }; -typedef struct +typedef struct { cmph_uint32 nedges; bdz_ph_edge_t * edges; cmph_uint32 * first_edge; - cmph_uint8 * vert_degree; + cmph_uint8 * vert_degree; }bdz_ph_graph3_t; @@ -54,7 +54,7 @@ static void bdz_ph_alloc_graph3(bdz_ph_graph3_t * graph3, cmph_uint32 nedges, cm { graph3->edges=malloc(nedges*sizeof(bdz_ph_edge_t)); graph3->first_edge=malloc(nvertices*sizeof(cmph_uint32)); - graph3->vert_degree=malloc((size_t)nvertices); + graph3->vert_degree=malloc((size_t)nvertices); }; static void bdz_ph_init_graph3(bdz_ph_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices) { @@ -101,10 +101,10 @@ static void bdz_ph_dump_graph(bdz_ph_graph3_t* graph3, cmph_uint32 nedges, cmph_ printf(" nexts %d %d %d",graph3->edges[i].next_edges[0], graph3->edges[i].next_edges[1],graph3->edges[i].next_edges[2]); }; - + for(i=0;ifirst_edge[i]); - + }; }; @@ -121,7 +121,7 @@ static void bdz_ph_remove_edge(bdz_ph_graph3_t * graph3, cmph_uint32 curr_edge) j=0; } else if(graph3->edges[edge1].vertices[1]==vert){ j=1; - } else + } else j=2; edge1=graph3->edges[edge1].next_edges[j]; }; @@ -130,16 +130,16 @@ static void bdz_ph_remove_edge(bdz_ph_graph3_t * graph3, cmph_uint32 curr_edge) bdz_ph_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4); exit(-1); }; - + if(edge2!=NULL_EDGE){ - graph3->edges[edge2].next_edges[j] = + graph3->edges[edge2].next_edges[j] = graph3->edges[edge1].next_edges[i]; - } else + } else graph3->first_edge[vert]= graph3->edges[edge1].next_edges[i]; 
graph3->vert_degree[vert]--; }; - + }; static int bdz_ph_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_ph_queue_t queue, bdz_ph_graph3_t* graph3) @@ -176,7 +176,7 @@ static int bdz_ph_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_ queue[queue_head++]=tmp_edge; SETBIT(marked_edge,tmp_edge); }; - + }; if(graph3->vert_degree[v1]==1) { tmp_edge=graph3->first_edge[v1]; @@ -184,7 +184,7 @@ static int bdz_ph_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_ queue[queue_head++]=tmp_edge; SETBIT(marked_edge,tmp_edge); }; - + }; if(graph3->vert_degree[v2]==1){ tmp_edge=graph3->first_edge[v2]; @@ -229,7 +229,7 @@ void bdz_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 1) break; //bdz_ph only uses one linear hash function - bdz_ph->hashfunc = *hashptr; + bdz_ph->hashfunc = *hashptr; ++i, ++hashptr; } } @@ -251,16 +251,16 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) if (c == 0) c = 1.23; // validating restrictions over parameter c. 
DEBUGP("c: %f\n", c); - bdz_ph->m = mph->key_source->nkeys; - bdz_ph->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3); + bdz_ph->m = mph->key_source->nkeys; + bdz_ph->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3); if ((bdz_ph->r % 2) == 0) bdz_ph->r += 1; bdz_ph->n = 3*bdz_ph->r; - + bdz_ph_alloc_graph3(&graph3, bdz_ph->m, bdz_ph->n); bdz_ph_alloc_queue(&edges,bdz_ph->m); DEBUGP("Created hypergraph\n"); - + DEBUGP("m (edges): %u n (vertices): %u r: %u c: %f \n", bdz_ph->m, bdz_ph->n, bdz_ph->r, c); // Mapping step @@ -287,10 +287,10 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) fprintf(stderr, "acyclic graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } + } else break; } - + if (iterations == 0) { // free(bdz_ph->g); @@ -308,7 +308,7 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) bdz_ph_free_queue(&edges); bdz_ph_free_graph3(&graph3); - + if (mph->verbosity) { fprintf(stderr, "Starting optimization step\n"); @@ -338,23 +338,23 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) fprintf(stderr, "Successfully generated minimal perfect hash function\n"); } - #ifdef CMPH_TIMING + #ifdef CMPH_TIMING register cmph_uint32 space_usage = bdz_ph_packed_size(mphf)*8; register cmph_uint32 keys_per_bucket = 1; construction_time = construction_time - construction_time_begin; fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz_ph->m, bdz_ph->m/(double)bdz_ph->n, keys_per_bucket, construction_time, space_usage/(double)bdz_ph->m); - #endif + #endif return mphf; } - + static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue) { cmph_uint32 e; int cycles = 0; cmph_uint32 hl[3]; - + bdz_ph_config_data_t *bdz_ph = (bdz_ph_config_data_t *)mph->data; bdz_ph_init_graph3(graph3, bdz_ph->m, bdz_ph->n); mph->key_source->rewind(mph->key_source->data); @@ -363,7 +363,7 @@ static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_qu cmph_uint32 h0, h1, h2; cmph_uint32 
keylen; char *key = NULL; - mph->key_source->read(mph->key_source->data, &key, &keylen); + mph->key_source->read(mph->key_source->data, &key, &keylen); hash_vector(bdz_ph->hl, key, keylen, hl); h0 = hl[0] % bdz_ph->r; h1 = hl[1] % bdz_ph->r + bdz_ph->r; @@ -371,7 +371,7 @@ static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_qu mph->key_source->dispose(mph->key_source->data, key, keylen); bdz_ph_add_edge(graph3,h0,h1,h2); } - cycles = bdz_ph_generate_queue(bdz_ph->m, bdz_ph->n, queue, graph3); + cycles = bdz_ph_generate_queue(bdz_ph->m, bdz_ph->n, queue, graph3); return (cycles == 0); } @@ -383,7 +383,7 @@ static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz cmph_uint32 v0,v1,v2; cmph_uint8 * marked_vertices =malloc((size_t)(bdz_ph->n >> 3) + 1); cmph_uint32 sizeg = (cmph_uint32)ceil(bdz_ph->n/4.0); - bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); + bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); memset(marked_vertices, 0, (size_t)(bdz_ph->n >> 3) + 1); //memset(bdz_ph->g, 0xff, sizeg); @@ -396,14 +396,14 @@ static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz if(!GETBIT(marked_vertices, v0)){ if(!GETBIT(marked_vertices,v1)) { - //SETVALUE(bdz_ph->g, v1, UNASSIGNED); + //SETVALUE(bdz_ph->g, v1, UNASSIGNED); SETBIT(marked_vertices, v1); } if(!GETBIT(marked_vertices,v2)) { - //SETVALUE(bdz_ph->g, v2, UNASSIGNED); + //SETVALUE(bdz_ph->g, v2, UNASSIGNED); SETBIT(marked_vertices, v2); - } + } SETVALUE0(bdz_ph->g, v0, (6-(GETVALUE(bdz_ph->g, v1) + GETVALUE(bdz_ph->g,v2)))%3); SETBIT(marked_vertices, v0); } else if(!GETBIT(marked_vertices, v1)) { @@ -417,7 +417,7 @@ static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz }else { SETVALUE0(bdz_ph->g, v2, (8-(GETVALUE(bdz_ph->g,v0)+GETVALUE(bdz_ph->g, v1)))%3); SETBIT(marked_vertices, v2); - } + } DEBUGP("A:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz_ph->g, v0), 
GETVALUE(bdz_ph->g, v1), GETVALUE(bdz_ph->g, v2)); }; free(marked_vertices); @@ -428,11 +428,11 @@ static void bdz_ph_optimization(bdz_ph_config_data_t *bdz_ph) cmph_uint32 i; cmph_uint8 byte = 0; cmph_uint32 sizeg = (cmph_uint32)ceil(bdz_ph->n/5.0); - cmph_uint8 * new_g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); + cmph_uint8 * new_g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); cmph_uint8 value; cmph_uint32 idx; - for(i = 0; i < bdz_ph->n; i++) - { + for(i = 0; i < bdz_ph->n; i++) + { idx = i/5; byte = new_g[idx]; value = GETVALUE(bdz_ph->g, i); @@ -462,7 +462,7 @@ int bdz_ph_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->r), sizeof(cmph_uint32), (size_t)1, fd); - sizeg = (cmph_uint32)ceil(data->n/5.0); + sizeg = (cmph_uint32)ceil(data->n/5.0); nbytes = fwrite(data->g, sizeof(cmph_uint8)*sizeg, (size_t)1, fd); #ifdef DEBUG @@ -491,19 +491,19 @@ void bdz_ph_load(FILE *f, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, f); bdz_ph->hl = hash_state_load(buf, buflen); free(buf); - + DEBUGP("Reading m and n\n"); - nbytes = fread(&(bdz_ph->n), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bdz_ph->m), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bdz_ph->r), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz_ph->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz_ph->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz_ph->r), sizeof(cmph_uint32), (size_t)1, f); sizeg = (cmph_uint32)ceil(bdz_ph->n/5.0); bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); nbytes = fread(bdz_ph->g, sizeg*sizeof(cmph_uint8), (size_t)1, f); return; } - + cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { @@ -520,12 +520,12 @@ cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) byte0 = 
bdz_ph->g[hl[0]/5]; byte1 = bdz_ph->g[hl[1]/5]; byte2 = bdz_ph->g[hl[2]/5]; - + byte0 = lookup_table[hl[0]%5U][byte0]; byte1 = lookup_table[hl[1]%5U][byte1]; byte2 = lookup_table[hl[2]%5U][byte2]; vertex = hl[(byte0 + byte1 + byte2)%3]; - + return vertex; } @@ -533,7 +533,7 @@ cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) void bdz_ph_destroy(cmph_t *mphf) { bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hl); free(data); free(mphf); @@ -571,17 +571,17 @@ void bdz_ph_pack(cmph_t *mphf, void *packed_mphf) * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 bdz_ph_packed_size(cmph_t *mphf) { bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; - CMPH_HASH hl_type = hash_get_type(data->hl); + CMPH_HASH hl_type = hash_get_type(data->hl); cmph_uint32 sizeg = (cmph_uint32)ceil(data->n/5.0); return (cmph_uint32) (sizeof(CMPH_ALGO) + hash_state_packed_size(hl_type) + 2*sizeof(cmph_uint32) + sizeof(cmph_uint8)*sizeg); } /** cmph_uint32 bdz_ph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -589,21 +589,21 @@ cmph_uint32 bdz_ph_packed_size(cmph_t *mphf) */ cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) { - + register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; - + register cmph_uint8 * ptr = hl_ptr + hash_state_packed_size(hl_type); register cmph_uint32 r = *((cmph_uint32*) ptr); register cmph_uint8 * g = ptr + 4; - + cmph_uint32 hl[3]; register cmph_uint8 byte0, byte1, byte2; register cmph_uint32 vertex; hash_vector_packed(hl_ptr, hl_type, key, keylen, hl); - + hl[0] = hl[0] % r; hl[1] = hl[1] % r + r; hl[2] = hl[2] % r + (r << 1); @@ -611,11 +611,11 @@ cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 byte0 = g[hl[0]/5]; byte1 = g[hl[1]/5]; byte2 = g[hl[2]/5]; - + byte0 = lookup_table[hl[0]%5][byte0]; byte1 = lookup_table[hl[1]%5][byte1]; byte2 = lookup_table[hl[2]%5][byte2]; vertex = hl[(byte0 + byte1 + byte2)%3]; - + return vertex; } diff --git a/src/bm_numbers.c b/src/bm_numbers.c new file mode 100644 index 0000000..4ede2d7 --- /dev/null +++ b/src/bm_numbers.c @@ -0,0 +1,130 @@ +#include +#include + +#include "bitbool.h" +#include "cmph.h" +#include "cmph_benchmark.h" +#include "linear_string_map.h" + +// Generates a vector with random unique 32 bits integers +cmph_uint32* random_numbers_vector_new(cmph_uint32 size) { + cmph_uint32 i = 0; + cmph_uint32 dup_bits = sizeof(cmph_uint32)*size*8; + char* dup = (char*)malloc(dup_bits/8); + cmph_uint32* vec = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*size); + memset(dup, 0, dup_bits/8); + for (i = 0; i < size; ++i) { + cmph_uint32 v = random(); + while (GETBIT(dup, v % dup_bits)) { v = random(); } + SETBIT(dup, v % dup_bits); + vec[i] = v; + } + free(dup); + return vec; +} + +int cmph_uint32_cmp(const void *a, const void *b) { + return *(const 
cmph_uint32*)a - *(const cmph_uint32*)b; +} + +char* create_lsmap_key(CMPH_ALGO algo, int iters) { + char mphf_name[128]; + snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters); + return strdup(mphf_name); +} + +static cmph_uint32 g_numbers_len = 0; +static cmph_uint32 *g_numbers = NULL; +static lsmap_t *g_created_mphf = NULL; +static lsmap_t *g_expected_probes = NULL; +static lsmap_t *g_mphf_probes = NULL; + +void bm_create(CMPH_ALGO algo, int iters) { + cmph_io_adapter_t* source = NULL; + cmph_config_t* config = NULL; + cmph_t* mphf = NULL; + + if (iters > g_numbers_len) { + fprintf(stderr, "No input with proper size."); + exit(-1); + } + + source = cmph_io_struct_vector_adapter( + (void*)g_numbers, sizeof(cmph_uint32), + 0, sizeof(cmph_uint32), iters); + config = cmph_config_new(source); + cmph_config_set_algo(config, algo); + mphf = cmph_new(config); + if (!mphf) { + fprintf(stderr, "Failed to create mphf for algorithm %s with %u keys", + cmph_names[algo], iters); + exit(-1); + } + cmph_config_destroy(config); + cmph_io_struct_vector_adapter_destroy(source); + lsmap_append(g_created_mphf, create_lsmap_key(algo, iters), mphf); +} + +void bm_search(CMPH_ALGO algo, int iters) { + int i = 0; + char *mphf_name; + cmph_t* mphf = NULL; + + mphf_name = create_lsmap_key(algo, iters); + mphf = (cmph_t*)lsmap_search(g_created_mphf, mphf_name); + free(mphf_name); + + cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); + cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); + + for (i = 0; i < iters * 100; ++i) { + cmph_uint32 pos = random() % iters; + const char* buf = (const char*)(g_numbers + pos); + cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32)); + ++count[pos]; + ++hash_count[h]; + } + + // Verify correctness later. 
+ lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count); + lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count); +} + +void verify() { } + +#define DECLARE_ALGO(algo) \ + void bm_create_ ## algo(int iters) { bm_create(algo, iters); } \ + void bm_search_ ## algo(int iters) { bm_search(algo, iters); } + +DECLARE_ALGO(CMPH_BMZ); +DECLARE_ALGO(CMPH_CHM); +DECLARE_ALGO(CMPH_BRZ); +DECLARE_ALGO(CMPH_FCH); +DECLARE_ALGO(CMPH_BDZ); + +int main(int argc, char** argv) { + g_numbers_len = 1000 * 1000; + g_numbers = random_numbers_vector_new(g_numbers_len); + g_created_mphf = lsmap_new(); + g_expected_probes = lsmap_new(); + g_mphf_probes = lsmap_new(); + + BM_REGISTER(bm_create_CMPH_BMZ, 1000 * 1000); + BM_REGISTER(bm_search_CMPH_BMZ, 1000 * 1000); + BM_REGISTER(bm_create_CMPH_CHM, 1000 * 1000); + BM_REGISTER(bm_search_CMPH_CHM, 1000 * 1000); +// BM_REGISTER(bm_create_CMPH_BRZ, 1000 * 1000); +// BM_REGISTER(bm_search_CMPH_BRZ, 1000 * 1000); +// BM_REGISTER(bm_create_CMPH_FCH, 1000 * 1000); +// BM_REGISTER(bm_search_CMPH_FCH, 1000 * 1000); + BM_REGISTER(bm_create_CMPH_BDZ, 1000 * 1000); + BM_REGISTER(bm_search_CMPH_BDZ, 1000 * 1000); + run_benchmarks(argc, argv); + + verify(); + free(g_numbers); + lsmap_foreach_key(g_created_mphf, (void(*)(const char*))free); + lsmap_foreach_value(g_created_mphf, (void(*)(void*))cmph_destroy); + lsmap_destroy(g_created_mphf); + return 0; +} diff --git a/src/bmz.c b/src/bmz.c index 3eabfb7..eb3190e 100644 --- a/src/bmz.c +++ b/src/bmz.c @@ -12,7 +12,7 @@ #include #include -//#define DEBUG +// #define DEBUG #include "debug.h" static int bmz_gen_edges(cmph_config_t *mph); @@ -24,7 +24,7 @@ bmz_config_data_t *bmz_config_new(void) { bmz_config_data_t *bmz = NULL; bmz = (bmz_config_data_t *)malloc(sizeof(bmz_config_data_t)); - assert(bmz); + if (!bmz) return NULL; memset(bmz, 0, sizeof(bmz_config_data_t)); bmz->hashfuncs[0] = CMPH_HASH_JENKINS; bmz->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -49,7 +49,7 @@ void 
bmz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 2) break; //bmz only uses two hash functions - bmz->hashfuncs[i] = *hashptr; + bmz->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -68,8 +68,8 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data; if (c == 0) c = 1.15; // validating restrictions over parameter c. DEBUGP("c: %f\n", c); - bmz->m = mph->key_source->nkeys; - bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); + bmz->m = mph->key_source->nkeys; + bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz->m, bmz->n, c); bmz->graph = graph_new(bmz->n, bmz->m); DEBUGP("Created graph\n"); @@ -81,7 +81,7 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) { // Mapping step cmph_uint32 biggest_g_value = 0; - cmph_uint32 biggest_edge_value = 1; + cmph_uint32 biggest_edge_value = 1; iterations = 100; if (mph->verbosity) { @@ -109,12 +109,12 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } + } else break; } if (iterations == 0) { - graph_destroy(bmz->graph); + graph_destroy(bmz->graph); return NULL; } // Ordering step @@ -155,20 +155,26 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) } bmz_traverse_non_critical_nodes(bmz, used_edges, visited); // non_critical_nodes } - else + else { iterations_map--; if (mph->verbosity) fprintf(stderr, "Restarting mapping step. 
%u iterations remaining.\n", iterations_map); - } + } free(used_edges); free(visited); - }while(restart_mapping && iterations_map > 0); + } while(restart_mapping && iterations_map > 0); graph_destroy(bmz->graph); bmz->graph = NULL; - if (iterations_map == 0) + if (iterations_map == 0) { return NULL; } + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < bmz->n; ++i) fprintf(stderr, "%u ", bmz->g[i]); + fprintf(stderr, "\n"); + #endif + mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = mph->algo; bmzf = (bmz_data_t *)malloc(sizeof(bmz_data_t)); @@ -206,15 +212,15 @@ static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint3 while(!vqueue_is_empty(q)) { v = vqueue_remove(q); - it = graph_neighbors_it(bmz->graph, v); + it = graph_neighbors_it(bmz->graph, v); while ((u = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR) - { + { if (graph_node_is_critical(bmz->graph, u) && (!GETBIT(visited,u))) { collision = 1; while(collision) // lookahead to resolve collisions { - next_g = *biggest_g_value + 1; + next_g = *biggest_g_value + 1; it1 = graph_neighbors_it(bmz->graph, u); collision = 0; while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR) @@ -226,7 +232,7 @@ static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint3 vqueue_destroy(q); return 1; // restart mapping step. } - if (GETBIT(used_edges, (next_g + bmz->g[lav]))) + if (GETBIT(used_edges, (next_g + bmz->g[lav]))) { collision = 1; break; @@ -234,7 +240,7 @@ static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint3 } } if (next_g > *biggest_g_value) *biggest_g_value = next_g; - } + } // Marking used edges... it1 = graph_neighbors_it(bmz->graph, u); while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR) @@ -248,9 +254,9 @@ static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint3 bmz->g[u] = next_g; // Labelling vertex u. 
SETBIT(visited,u); vqueue_insert(q, u); - } + } } - + } vqueue_destroy(q); return 0; @@ -276,22 +282,22 @@ static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, while(!vqueue_is_empty(q)) { v = vqueue_remove(q); - it = graph_neighbors_it(bmz->graph, v); + it = graph_neighbors_it(bmz->graph, v); while ((u = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR) - { + { if (graph_node_is_critical(bmz->graph, u) && (!GETBIT(visited,u))) { cmph_uint32 next_g_index = 0; collision = 1; while(collision) // lookahead to resolve collisions { - if (next_g_index < nunused_g_values) + if (next_g_index < nunused_g_values) { - next_g = unused_g_values[next_g_index++]; + next_g = unused_g_values[next_g_index++]; } - else + else { - next_g = *biggest_g_value + 1; + next_g = *biggest_g_value + 1; next_g_index = UINT_MAX; } it1 = graph_neighbors_it(bmz->graph, u); @@ -306,7 +312,7 @@ static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, free(unused_g_values); return 1; // restart mapping step. } - if (GETBIT(used_edges, (next_g + bmz->g[lav]))) + if (GETBIT(used_edges, (next_g + bmz->g[lav]))) { collision = 1; break; @@ -318,13 +324,13 @@ static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, if(nunused_g_values == unused_g_values_capacity) { unused_g_values = (cmph_uint32 *)realloc(unused_g_values, (unused_g_values_capacity + BUFSIZ)*sizeof(cmph_uint32)); - unused_g_values_capacity += BUFSIZ; - } - unused_g_values[nunused_g_values++] = next_g; + unused_g_values_capacity += BUFSIZ; + } + unused_g_values[nunused_g_values++] = next_g; } if (next_g > *biggest_g_value) *biggest_g_value = next_g; - } + } next_g_index--; if (next_g_index < nunused_g_values) unused_g_values[next_g_index] = unused_g_values[--nunused_g_values]; @@ -341,13 +347,13 @@ static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, bmz->g[u] = next_g; // Labelling vertex u. 
SETBIT(visited, u); vqueue_insert(q, u); - } + } } - + } vqueue_destroy(q); free(unused_g_values); - return 0; + return 0; } static cmph_uint32 next_unused_edge(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint32 unused_edge_index) @@ -375,8 +381,8 @@ static void bmz_traverse(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_u SETBIT(visited, neighbor); (*unused_edge_index)++; bmz_traverse(bmz, used_edges, neighbor, unused_edge_index, visited); - - } + + } } static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint8 * visited) @@ -388,7 +394,7 @@ static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * { v1 = graph_vertex_id(bmz->graph, i, 0); v2 = graph_vertex_id(bmz->graph, i, 1); - if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; + if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; if(GETBIT(visited,v1)) bmz_traverse(bmz, used_edges, v1, &unused_edge_index, visited); else bmz_traverse(bmz, used_edges, v2, &unused_edge_index, visited); @@ -397,7 +403,7 @@ static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * for(i = 0; i < bmz->n; i++) { if(!GETBIT(visited,i)) - { + { bmz->g[i] = 0; SETBIT(visited, i); bmz_traverse(bmz, used_edges, i, &unused_edge_index, visited); @@ -405,14 +411,14 @@ static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * } } - + static int bmz_gen_edges(cmph_config_t *mph) { cmph_uint32 e; bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data; cmph_uint8 multiple_edges = 0; DEBUGP("Generating edges for %u vertices\n", bmz->n); - graph_clear_edges(bmz->graph); + graph_clear_edges(bmz->graph); mph->key_source->rewind(mph->key_source->data); for (e = 0; e < mph->key_source->nkeys; ++e) { @@ -420,20 +426,19 @@ static int bmz_gen_edges(cmph_config_t *mph) cmph_uint32 keylen; char *key = NULL; 
mph->key_source->read(mph->key_source->data, &key, &keylen); - -// if (key == NULL)fprintf(stderr, "key = %s -- read BMZ\n", key); + h1 = hash(bmz->hashes[0], key, keylen) % bmz->n; h2 = hash(bmz->hashes[1], key, keylen) % bmz->n; if (h1 == h2) if (++h2 >= bmz->n) h2 = 0; - if (h1 == h2) + DEBUGP("key: %.*s h1: %u h2: %u\n", keylen, key, h1, h2); + if (h1 == h2) { if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); mph->key_source->dispose(mph->key_source->data, key, keylen); return 0; } - //DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key); + DEBUGP("Adding edge: %u -> %u for key %.*s\n", h1, h2, keylen, key); mph->key_source->dispose(mph->key_source->data, key, keylen); -// fprintf(stderr, "key = %s -- dispose BMZ\n", key); multiple_edges = graph_contains_edge(bmz->graph, h1, h2); if (mph->verbosity && multiple_edges) fprintf(stderr, "A non simple graph was generated\n"); if (multiple_edges) return 0; // checking multiple edge restriction. @@ -467,7 +472,7 @@ int bmz_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); - + nbytes = fwrite(data->g, sizeof(cmph_uint32)*(data->n), (size_t)1, fd); #ifdef DEBUG cmph_uint32 i; @@ -505,8 +510,8 @@ void bmz_load(FILE *f, cmph_t *mphf) } DEBUGP("Reading m and n\n"); - nbytes = fread(&(bmz->n), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bmz->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bmz->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bmz->m), sizeof(cmph_uint32), (size_t)1, f); bmz->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*bmz->n); nbytes = fread(bmz->g, bmz->n*sizeof(cmph_uint32), (size_t)1, f); @@ -517,22 +522,22 @@ void bmz_load(FILE *f, cmph_t *mphf) #endif return; } - + cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { bmz_data_t *bmz = mphf->data; cmph_uint32 h1 = hash(bmz->hashes[0], key, keylen) % bmz->n; cmph_uint32 
h2 = hash(bmz->hashes[1], key, keylen) % bmz->n; - DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + DEBUGP("key: %.*s h1: %u h2: %u\n", keylen, key, h1, h2); if (h1 == h2 && ++h2 > bmz->n) h2 = 0; - DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, bmz->g[h1], bmz->g[h2], bmz->m); + DEBUGP("key: %.*s g[h1]: %u g[h2]: %u edges: %u\n", keylen, key, bmz->g[h1], bmz->g[h2], bmz->m); return bmz->g[h1] + bmz->g[h2]; } void bmz_destroy(cmph_t *mphf) { bmz_data_t *data = (bmz_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hashes[0]); hash_state_destroy(data->hashes[1]); free(data->hashes); @@ -543,7 +548,7 @@ void bmz_destroy(cmph_t *mphf) /** \fn void bmz_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() */ void bmz_pack(cmph_t *mphf, void *packed_mphf) { @@ -574,26 +579,26 @@ void bmz_pack(cmph_t *mphf, void *packed_mphf) ptr += sizeof(data->n); // packing g - memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); + memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); } /** \fn cmph_uint32 bmz_packed_size(cmph_t *mphf); * \brief Return the amount of space needed to pack mphf. 
* \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 bmz_packed_size(cmph_t *mphf) { bmz_data_t *data = (bmz_data_t *)mphf->data; - CMPH_HASH h1_type = hash_get_type(data->hashes[0]); - CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); - return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + 3*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n); } /** cmph_uint32 bmz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. * \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -608,13 +613,13 @@ cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); h2_ptr += 4; - + register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); - - register cmph_uint32 n = *g_ptr++; - - register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; - register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; + + register cmph_uint32 n = *g_ptr++; + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; if (h1 == h2 && ++h2 > n) h2 = 0; - return (g_ptr[h1] + g_ptr[h2]); + return (g_ptr[h1] + g_ptr[h2]); } diff --git a/src/bmz8.c b/src/bmz8.c index 4db4dfc..54ba606 100644 --- a/src/bmz8.c +++ b/src/bmz8.c @@ -23,7 +23,7 @@ bmz8_config_data_t *bmz8_config_new(void) { bmz8_config_data_t *bmz8; bmz8 = 
(bmz8_config_data_t *)malloc(sizeof(bmz8_config_data_t)); - assert(bmz8); + if (!bmz8) return NULL; memset(bmz8, 0, sizeof(bmz8_config_data_t)); bmz8->hashfuncs[0] = CMPH_HASH_JENKINS; bmz8->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -48,7 +48,7 @@ void bmz8_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 2) break; //bmz8 only uses two hash functions - bmz8->hashfuncs[i] = *hashptr; + bmz8->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -64,7 +64,7 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c) cmph_uint8 restart_mapping = 0; cmph_uint8 * visited = NULL; bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data; - + if (mph->key_source->nkeys >= 256) { if (mph->verbosity) fprintf(stderr, "The number of keys in BMZ8 must be lower than 256.\n"); @@ -72,8 +72,8 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c) } if (c == 0) c = 1.15; // validating restrictions over parameter c. DEBUGP("c: %f\n", c); - bmz8->m = (cmph_uint8) mph->key_source->nkeys; - bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys); + bmz8->m = (cmph_uint8) mph->key_source->nkeys; + bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz8->m, bmz8->n, c); bmz8->graph = graph_new(bmz8->n, bmz8->m); DEBUGP("Created graph\n"); @@ -113,8 +113,8 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c) fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } - else break; + } + else break; } if (iterations == 0) { @@ -161,19 +161,19 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c) } bmz8_traverse_non_critical_nodes(bmz8, used_edges, visited); // non_critical_nodes } - else + else { iterations_map--; if (mph->verbosity) fprintf(stderr, "Restarting mapping step. 
%u iterations remaining.\n", iterations_map); - } + } free(used_edges); free(visited); }while(restart_mapping && iterations_map > 0); - graph_destroy(bmz8->graph); + graph_destroy(bmz8->graph); bmz8->graph = NULL; - if (iterations_map == 0) + if (iterations_map == 0) { return NULL; } @@ -213,15 +213,15 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui while(!vqueue_is_empty(q)) { v = vqueue_remove(q); - it = graph_neighbors_it(bmz8->graph, v); + it = graph_neighbors_it(bmz8->graph, v); while ((u = graph_next_neighbor(bmz8->graph, &it)) != GRAPH_NO_NEIGHBOR) - { + { if (graph_node_is_critical(bmz8->graph, u) && (!GETBIT(visited,u))) { collision = 1; while(collision) // lookahead to resolve collisions { - next_g = (cmph_uint8)(*biggest_g_value + 1); + next_g = (cmph_uint8)(*biggest_g_value + 1); it1 = graph_neighbors_it(bmz8->graph, u); collision = 0; while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR) @@ -233,7 +233,7 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui vqueue_destroy(q); return 1; // restart mapping step. } - if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) + if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) { collision = 1; break; @@ -241,7 +241,7 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui } } if (next_g > *biggest_g_value) *biggest_g_value = next_g; - } + } // Marking used edges... it1 = graph_neighbors_it(bmz8->graph, u); while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR) @@ -250,16 +250,16 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui { SETBIT(used_edges,(next_g + bmz8->g[lav])); - if(next_g + bmz8->g[lav] > *biggest_edge_value) + if(next_g + bmz8->g[lav] > *biggest_edge_value) *biggest_edge_value = (cmph_uint8)(next_g + bmz8->g[lav]); } } bmz8->g[u] = next_g; // Labelling vertex u. 
SETBIT(visited,u); vqueue_insert(q, u); - } + } } - + } vqueue_destroy(q); return 0; @@ -268,8 +268,8 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited) { cmph_uint8 next_g; - cmph_uint32 u; - cmph_uint32 lav; + cmph_uint32 u; + cmph_uint32 lav; cmph_uint8 collision; cmph_uint8 * unused_g_values = NULL; cmph_uint8 unused_g_values_capacity = 0; @@ -280,27 +280,27 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz DEBUGP("Labelling critical vertices\n"); bmz8->g[v] = (cmph_uint8)(ceil ((double)(*biggest_edge_value)/2) - 1); SETBIT(visited, v); - next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2)); + next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2)); vqueue_insert(q, v); while(!vqueue_is_empty(q)) { v = vqueue_remove(q); - it = graph_neighbors_it(bmz8->graph, v); + it = graph_neighbors_it(bmz8->graph, v); while ((u = graph_next_neighbor(bmz8->graph, &it)) != GRAPH_NO_NEIGHBOR) - { + { if (graph_node_is_critical(bmz8->graph, u) && (!GETBIT(visited,u))) { cmph_uint8 next_g_index = 0; collision = 1; while(collision) // lookahead to resolve collisions { - if (next_g_index < nunused_g_values) + if (next_g_index < nunused_g_values) { next_g = unused_g_values[next_g_index++]; } - else + else { - next_g = (cmph_uint8)(*biggest_g_value + 1); + next_g = (cmph_uint8)(*biggest_g_value + 1); next_g_index = 255;//UINT_MAX; } it1 = graph_neighbors_it(bmz8->graph, u); @@ -315,7 +315,7 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz free(unused_g_values); return 1; // restart mapping step. 
} - if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) + if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) { collision = 1; break; @@ -327,14 +327,14 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz if(nunused_g_values == unused_g_values_capacity) { unused_g_values = (cmph_uint8*)realloc(unused_g_values, ((size_t)(unused_g_values_capacity + BUFSIZ))*sizeof(cmph_uint8)); - unused_g_values_capacity += (cmph_uint8)BUFSIZ; - } - unused_g_values[nunused_g_values++] = next_g; + unused_g_values_capacity += (cmph_uint8)BUFSIZ; + } + unused_g_values[nunused_g_values++] = next_g; } if (next_g > *biggest_g_value) *biggest_g_value = next_g; } - + next_g_index--; if (next_g_index < nunused_g_values) unused_g_values[next_g_index] = unused_g_values[--nunused_g_values]; @@ -345,22 +345,22 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited, lav)) { SETBIT(used_edges,(next_g + bmz8->g[lav])); - if(next_g + bmz8->g[lav] > *biggest_edge_value) + if(next_g + bmz8->g[lav] > *biggest_edge_value) *biggest_edge_value = (cmph_uint8)(next_g + bmz8->g[lav]); } } - + bmz8->g[u] = next_g; // Labelling vertex u. 
SETBIT(visited, u); vqueue_insert(q, u); - - } + + } } - + } vqueue_destroy(q); free(unused_g_values); - return 0; + return 0; } static cmph_uint8 next_unused_edge(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint32 unused_edge_index) @@ -388,8 +388,8 @@ static void bmz8_traverse(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmp SETBIT(visited, neighbor); (*unused_edge_index)++; bmz8_traverse(bmz8, used_edges, neighbor, unused_edge_index, visited); - - } + + } } static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint8 * visited) @@ -401,7 +401,7 @@ static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint { v1 = (cmph_uint8)graph_vertex_id(bmz8->graph, i, 0); v2 = (cmph_uint8)graph_vertex_id(bmz8->graph, i, 1); - if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; + if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; if(GETBIT(visited,v1)) bmz8_traverse(bmz8, used_edges, v1, &unused_edge_index, visited); else bmz8_traverse(bmz8, used_edges, v2, &unused_edge_index, visited); @@ -410,7 +410,7 @@ static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint for(i = 0; i < bmz8->n; i++) { if(!GETBIT(visited,i)) - { + { bmz8->g[i] = 0; SETBIT(visited, i); bmz8_traverse(bmz8, used_edges, i, &unused_edge_index, visited); @@ -418,14 +418,14 @@ static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint } } - + static int bmz8_gen_edges(cmph_config_t *mph) { cmph_uint8 e; bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data; cmph_uint8 multiple_edges = 0; DEBUGP("Generating edges for %u vertices\n", bmz8->n); - graph_clear_edges(bmz8->graph); + graph_clear_edges(bmz8->graph); mph->key_source->rewind(mph->key_source->data); for (e = 0; e < mph->key_source->nkeys; ++e) { @@ -433,12 +433,12 @@ static int bmz8_gen_edges(cmph_config_t *mph) 
cmph_uint32 keylen; char *key = NULL; mph->key_source->read(mph->key_source->data, &key, &keylen); - + // if (key == NULL)fprintf(stderr, "key = %s -- read BMZ\n", key); h1 = (cmph_uint8)(hash(bmz8->hashes[0], key, keylen) % bmz8->n); h2 = (cmph_uint8)(hash(bmz8->hashes[1], key, keylen) % bmz8->n); if (h1 == h2) if (++h2 >= bmz8->n) h2 = 0; - if (h1 == h2) + if (h1 == h2) { if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); mph->key_source->dispose(mph->key_source->data, key, keylen); @@ -480,7 +480,7 @@ int bmz8_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint8), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint8), (size_t)1, fd); - + nbytes = fwrite(data->g, sizeof(cmph_uint8)*(data->n), (size_t)1, fd); /* #ifdef DEBUG fprintf(stderr, "G: "); @@ -518,8 +518,8 @@ void bmz8_load(FILE *f, cmph_t *mphf) } DEBUGP("Reading m and n\n"); - nbytes = fread(&(bmz8->n), sizeof(cmph_uint8), (size_t)1, f); - nbytes = fread(&(bmz8->m), sizeof(cmph_uint8), (size_t)1, f); + nbytes = fread(&(bmz8->n), sizeof(cmph_uint8), (size_t)1, f); + nbytes = fread(&(bmz8->m), sizeof(cmph_uint8), (size_t)1, f); bmz8->g = (cmph_uint8 *)malloc(sizeof(cmph_uint8)*bmz8->n); nbytes = fread(bmz8->g, bmz8->n*sizeof(cmph_uint8), (size_t)1, f); @@ -530,7 +530,7 @@ void bmz8_load(FILE *f, cmph_t *mphf) #endif return; } - + cmph_uint8 bmz8_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { @@ -556,7 +556,7 @@ void bmz8_destroy(cmph_t *mphf) /** \fn void bmz8_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. 
The size of packed_mphf must be at least cmph_packed_size() */ void bmz8_pack(cmph_t *mphf, void *packed_mphf) { @@ -585,26 +585,26 @@ void bmz8_pack(cmph_t *mphf, void *packed_mphf) *ptr++ = data->n; // packing g - memcpy(ptr, data->g, sizeof(cmph_uint8)*data->n); + memcpy(ptr, data->g, sizeof(cmph_uint8)*data->n); } /** \fn cmph_uint32 bmz8_packed_size(cmph_t *mphf); * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 bmz8_packed_size(cmph_t *mphf) { bmz8_data_t *data = (bmz8_data_t *)mphf->data; - CMPH_HASH h1_type = hash_get_type(data->hashes[0]); - CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); - return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + 2*sizeof(cmph_uint32) + sizeof(cmph_uint8) + sizeof(cmph_uint8)*data->n); } /** cmph_uint8 bmz8_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -619,14 +619,14 @@ cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); h2_ptr += 4; - + register cmph_uint8 *g_ptr = h2_ptr + hash_state_packed_size(h2_type); - - register cmph_uint8 n = *g_ptr++; - - register cmph_uint8 h1 = (cmph_uint8)(hash_packed(h1_ptr, h1_type, key, keylen) % n); - register cmph_uint8 h2 = (cmph_uint8)(hash_packed(h2_ptr, h2_type, key, keylen) % n); + + register cmph_uint8 n = *g_ptr++; + + register cmph_uint8 h1 = (cmph_uint8)(hash_packed(h1_ptr, h1_type, key, keylen) % n); + register cmph_uint8 h2 = (cmph_uint8)(hash_packed(h2_ptr, h2_type, key, keylen) % n); DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); if (h1 == h2 && ++h2 > n) h2 = 0; - return (cmph_uint8)(g_ptr[h1] + g_ptr[h2]); + return (cmph_uint8)(g_ptr[h1] + g_ptr[h2]); } diff --git a/src/brz.c b/src/brz.c index f9c48ef..bac5bc5 100755 --- a/src/brz.c +++ b/src/brz.c @@ -26,8 +26,9 @@ static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fch static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen); brz_config_data_t *brz_config_new(void) { - brz_config_data_t *brz = NULL; + brz_config_data_t *brz = NULL; brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); + if (!brz) return NULL; brz->algo = CMPH_FCH; brz->b = 128; brz->hashfuncs[0] = CMPH_HASH_JENKINS; @@ -42,7 +43,7 @@ brz_config_data_t *brz_config_new(void) brz->memory_availability = 1024*1024; brz->tmp_dir = (cmph_uint8 *)calloc((size_t)10, sizeof(cmph_uint8)); brz->mphf_fd = NULL; - strcpy((char *)(brz->tmp_dir), "/var/tmp/"); + strcpy((char *)(brz->tmp_dir), "/var/tmp/"); assert(brz); return brz; } @@ -63,7 +64,7 @@ void brz_config_set_hashfuncs(cmph_config_t 
*mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 3) break; //brz only uses three hash functions - brz->hashfuncs[i] = *hashptr; + brz->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -84,14 +85,14 @@ void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) if(tmp_dir[len-1] != '/') { brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+2, sizeof(cmph_uint8)); - sprintf((char *)(brz->tmp_dir), "%s/", (char *)tmp_dir); + sprintf((char *)(brz->tmp_dir), "%s/", (char *)tmp_dir); } else { brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+1, sizeof(cmph_uint8)); - sprintf((char *)(brz->tmp_dir), "%s", (char *)tmp_dir); + sprintf((char *)(brz->tmp_dir), "%s", (char *)tmp_dir); } - + } } @@ -105,14 +106,14 @@ void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b) { brz_config_data_t *brz = (brz_config_data_t *)mph->data; - if(b <= 64 || b >= 175) + if(b <= 64 || b >= 175) { b = 128; } brz->b = (cmph_uint8)b; } -void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) +void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) { if (algo == CMPH_BMZ8 || algo == CMPH_FCH) // supported algorithms { @@ -147,13 +148,13 @@ cmph_t *brz_new(cmph_config_t *mph, double c) brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b)); DEBUGP("k: %u\n", brz->k); brz->size = (cmph_uint8 *) calloc((size_t)brz->k, sizeof(cmph_uint8)); - + // Clustering the keys by graph id. 
if (mph->verbosity) { - fprintf(stderr, "Partioning the set of keys.\n"); + fprintf(stderr, "Partioning the set of keys.\n"); } - + while(1) { int ok; @@ -172,17 +173,17 @@ cmph_t *brz_new(cmph_config_t *mph, double c) fprintf(stderr, "Failure: A graph with more than 255 keys was created - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } - else break; + } + else break; } - if (iterations == 0) + if (iterations == 0) { DEBUGP("Graphs with more than 255 keys were created in all 20 iterations\n"); free(brz->size); return NULL; } DEBUGP("Graphs generated\n"); - + brz->offset = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32)); for (i = 1; i < brz->k; ++i) { @@ -209,7 +210,7 @@ cmph_t *brz_new(cmph_config_t *mph, double c) brzf->m = brz->m; brzf->algo = brz->algo; mphf->data = brzf; - mphf->size = brz->m; + mphf->size = brz->m; DEBUGP("Successfully generated minimal perfect hash\n"); if (mph->verbosity) { @@ -240,7 +241,7 @@ static int brz_gen_mphf(cmph_config_t *mph) cmph_uint32 cur_bucket = 0; cmph_uint8 nkeys_vd = 0; cmph_uint8 ** keys_vd = NULL; - + mph->key_source->rewind(mph->key_source->data); DEBUGP("Generating graphs from %u keys\n", brz->m); // Partitioning @@ -249,7 +250,7 @@ static int brz_gen_mphf(cmph_config_t *mph) mph->key_source->read(mph->key_source->data, &key, &keylen); /* Buffers management */ - if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers + if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers { if(mph->verbosity) { @@ -265,8 +266,8 @@ static int brz_gen_mphf(cmph_config_t *mph) sum += value; value = buckets_size[i]; buckets_size[i] = sum; - - } + + } memory_usage = 0; keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32)); for(i = 0; i < nkeys_in_buffer; i++) @@ -298,8 +299,8 @@ static int brz_gen_mphf(cmph_config_t *mph) memcpy(buffer + memory_usage + sizeof(keylen), key, (size_t)keylen); memory_usage += keylen 
+ (cmph_uint32)sizeof(keylen); h0 = hash(brz->h0, key, keylen) % brz->k; - - if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))) + + if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))) { free(buffer); free(buckets_size); @@ -310,8 +311,8 @@ static int brz_gen_mphf(cmph_config_t *mph) nkeys_in_buffer++; mph->key_source->dispose(mph->key_source->data, key, keylen); } - if (memory_usage != 0) // flush buffers - { + if (memory_usage != 0) // flush buffers + { if(mph->verbosity) { fprintf(stderr, "Flushing %u\n", nkeys_in_buffer); @@ -370,12 +371,12 @@ static int brz_gen_mphf(cmph_config_t *mph) nbytes = fwrite(&(brz->algo), sizeof(brz->algo), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->k), sizeof(cmph_uint32), (size_t)1, brz->mphf_fd); // number of MPHFs nbytes = fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, brz->mphf_fd); - + //tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *)); buff_manager = buffer_manager_new(brz->memory_availability, nflushes); buffer_merge = (cmph_uint8 **)calloc((size_t)nflushes, sizeof(cmph_uint8 *)); buffer_h0 = (cmph_uint32 *)calloc((size_t)nflushes, sizeof(cmph_uint32)); - + memory_usage = 0; for(i = 0; i < nflushes; i++) { @@ -388,7 +389,7 @@ static int brz_gen_mphf(cmph_config_t *mph) h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k; buffer_h0[i] = h0; buffer_merge[i] = (cmph_uint8 *)key; - key = NULL; //transfer memory ownership + key = NULL; //transfer memory ownership } e = 0; keys_vd = (cmph_uint8 **)calloc((size_t)MAX_BUCKET_SIZE, sizeof(cmph_uint8 *)); @@ -429,7 +430,7 @@ static int brz_gen_mphf(cmph_config_t *mph) e++; buffer_h0[i] = UINT_MAX; } - + if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket. 
{ cmph_io_adapter_t *source = NULL; @@ -444,7 +445,7 @@ static int brz_gen_mphf(cmph_config_t *mph) //cmph_config_set_algo(config, CMPH_BMZ8); cmph_config_set_graphsize(config, brz->c); mphf_tmp = cmph_new(config); - if (mphf_tmp == NULL) + if (mphf_tmp == NULL) { if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k); error = 1; @@ -453,9 +454,9 @@ static int brz_gen_mphf(cmph_config_t *mph) cmph_io_byte_vector_adapter_destroy(source); break; } - if(mph->verbosity) + if(mph->verbosity) { - if (cur_bucket % 1000 == 0) + if (cur_bucket % 1000 == 0) { fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k); } @@ -465,7 +466,7 @@ static int brz_gen_mphf(cmph_config_t *mph) case CMPH_FCH: { fch_data_t * fchf = NULL; - fchf = (fch_data_t *)mphf_tmp->data; + fchf = (fch_data_t *)mphf_tmp->data; bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf); } break; @@ -516,7 +517,7 @@ static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fch { cmph_uint32 i = 0; cmph_uint32 buflenh1 = 0; - cmph_uint32 buflenh2 = 0; + cmph_uint32 buflenh2 = 0; char * bufh1 = NULL; char * bufh2 = NULL; char * buf = NULL; @@ -528,7 +529,7 @@ static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fch memcpy(buf, &buflenh1, sizeof(cmph_uint32)); memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1); memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32)); - memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2); for (i = 0; i < n; i++) memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2+i,(fchf->g + i), (size_t)1); free(bufh1); free(bufh2); @@ -537,7 +538,7 @@ static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fch static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, 
cmph_uint32 *buflen) { cmph_uint32 buflenh1 = 0; - cmph_uint32 buflenh2 = 0; + cmph_uint32 buflenh2 = 0; char * bufh1 = NULL; char * bufh2 = NULL; char * buf = NULL; @@ -572,7 +573,7 @@ int brz_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); free(buf); // Dumping m and the vector offset. - nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(data->offset, sizeof(cmph_uint32)*(data->k), (size_t)1, fd); return 1; } @@ -591,7 +592,7 @@ void brz_load(FILE *f, cmph_t *mphf) nbytes = fread(&(brz->algo), sizeof(brz->algo), (size_t)1, f); // Reading algo. nbytes = fread(&(brz->k), sizeof(cmph_uint32), (size_t)1, f); brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k); - nbytes = fread(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, f); + nbytes = fread(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, f); brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); brz->g = (cmph_uint8 **) calloc((size_t)brz->k, sizeof(cmph_uint8 *)); @@ -635,7 +636,7 @@ void brz_load(FILE *f, cmph_t *mphf) brz->h0 = hash_state_load(buf, buflen); free(buf); - //loading c, m, and the vector offset. + //loading c, m, and the vector offset. 
nbytes = fread(&(brz->m), sizeof(cmph_uint32), (size_t)1, f); brz->offset = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*brz->k); nbytes = fread(brz->offset, sizeof(cmph_uint32)*(brz->k), (size_t)1, f); @@ -654,9 +655,9 @@ static cmph_uint32 brz_bmz8_search(brz_data_t *brz, const char *key, cmph_uint32 register cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % n; register cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % n; register cmph_uint8 mphf_bucket; - + if (h1 == h2 && ++h2 >= n) h2 = 0; - mphf_bucket = (cmph_uint8)(brz->g[h0][h1] + brz->g[h0][h2]); + mphf_bucket = (cmph_uint8)(brz->g[h0][h1] + brz->g[h0][h2]); DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, brz->g[h0][h1], brz->g[h0][h2], brz->offset[h0], brz->m); DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h0]); @@ -722,61 +723,61 @@ void brz_destroy(cmph_t *mphf) /** \fn void brz_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. 
The size of packed_mphf must be at least cmph_packed_size() */ void brz_pack(cmph_t *mphf, void *packed_mphf) { brz_data_t *data = (brz_data_t *)mphf->data; cmph_uint8 * ptr = packed_mphf; cmph_uint32 i,n; - + // packing internal algo type memcpy(ptr, &(data->algo), sizeof(data->algo)); ptr += sizeof(data->algo); // packing h0 type - CMPH_HASH h0_type = hash_get_type(data->h0); + CMPH_HASH h0_type = hash_get_type(data->h0); memcpy(ptr, &h0_type, sizeof(h0_type)); ptr += sizeof(h0_type); // packing h0 hash_state_pack(data->h0, ptr); ptr += hash_state_packed_size(h0_type); - + // packing k memcpy(ptr, &(data->k), sizeof(data->k)); ptr += sizeof(data->k); // packing c - *((cmph_uint64 *)ptr) = (cmph_uint64)data->c; + *((cmph_uint64 *)ptr) = (cmph_uint64)data->c; ptr += sizeof(data->c); // packing h1 type - CMPH_HASH h1_type = hash_get_type(data->h1[0]); + CMPH_HASH h1_type = hash_get_type(data->h1[0]); memcpy(ptr, &h1_type, sizeof(h1_type)); ptr += sizeof(h1_type); // packing h2 type - CMPH_HASH h2_type = hash_get_type(data->h2[0]); + CMPH_HASH h2_type = hash_get_type(data->h2[0]); memcpy(ptr, &h2_type, sizeof(h2_type)); ptr += sizeof(h2_type); // packing size - memcpy(ptr, data->size, sizeof(cmph_uint8)*data->k); + memcpy(ptr, data->size, sizeof(cmph_uint8)*data->k); ptr += data->k; - + // packing offset - memcpy(ptr, data->offset, sizeof(cmph_uint32)*data->k); + memcpy(ptr, data->offset, sizeof(cmph_uint32)*data->k); ptr += sizeof(cmph_uint32)*data->k; - + #if defined (__ia64) || defined (__x86_64__) cmph_uint64 * g_is_ptr = (cmph_uint64 *)ptr; #else cmph_uint32 * g_is_ptr = (cmph_uint32 *)ptr; #endif - + cmph_uint8 * g_i = (cmph_uint8 *) (g_is_ptr + data->k); - + for(i = 0; i < data->k; i++) { #if defined (__ia64) || defined (__x86_64__) @@ -787,7 +788,7 @@ void brz_pack(cmph_t *mphf, void *packed_mphf) // packing h1[i] hash_state_pack(data->h1[i], g_i); g_i += hash_state_packed_size(h1_type); - + // packing h2[i] hash_state_pack(data->h2[i], g_i); g_i += 
hash_state_packed_size(h2_type); @@ -803,9 +804,9 @@ void brz_pack(cmph_t *mphf, void *packed_mphf) break; default: assert(0); } - memcpy(g_i, data->g[i], sizeof(cmph_uint8)*n); + memcpy(g_i, data->g[i], sizeof(cmph_uint8)*n); g_i += n; - + } } @@ -814,16 +815,16 @@ void brz_pack(cmph_t *mphf, void *packed_mphf) * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 brz_packed_size(cmph_t *mphf) { cmph_uint32 i; cmph_uint32 size = 0; brz_data_t *data = (brz_data_t *)mphf->data; - CMPH_HASH h0_type = hash_get_type(data->h0); - CMPH_HASH h1_type = hash_get_type(data->h1[0]); + CMPH_HASH h0_type = hash_get_type(data->h0); + CMPH_HASH h1_type = hash_get_type(data->h1[0]); CMPH_HASH h2_type = hash_get_type(data->h2[0]); - size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) + + size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) + sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k); // pointers to g_is #if defined (__ia64) || defined (__x86_64__) @@ -831,10 +832,10 @@ cmph_uint32 brz_packed_size(cmph_t *mphf) #else size += (cmph_uint32) sizeof(cmph_uint32)*data->k; #endif - + size += hash_state_packed_size(h1_type) * data->k; size += hash_state_packed_size(h2_type) * data->k; - + cmph_uint32 n = 0; for(i = 0; i < data->k; i++) { @@ -848,7 +849,7 @@ cmph_uint32 brz_packed_size(cmph_t *mphf) break; default: assert(0); } - size += n; + size += n; } return size; } @@ -859,28 +860,28 @@ static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char * { register CMPH_HASH h0_type = *packed_mphf++; register cmph_uint32 *h0_ptr = packed_mphf; - packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); - + packed_mphf = (cmph_uint32 *)(((cmph_uint8 
*)packed_mphf) + hash_state_packed_size(h0_type)); + register cmph_uint32 k = *packed_mphf++; register double c = (double)(*((cmph_uint64*)packed_mphf)); packed_mphf += 2; - register CMPH_HASH h1_type = *packed_mphf++; - - register CMPH_HASH h2_type = *packed_mphf++; + register CMPH_HASH h1_type = *packed_mphf++; + + register CMPH_HASH h2_type = *packed_mphf++; register cmph_uint8 * size = (cmph_uint8 *) packed_mphf; - packed_mphf = (cmph_uint32 *)(size + k); - + packed_mphf = (cmph_uint32 *)(size + k); + register cmph_uint32 * offset = packed_mphf; packed_mphf += k; register cmph_uint32 h0; - + hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint); h0 = fingerprint[2] % k; - + register cmph_uint32 m = size[h0]; register cmph_uint32 n = (cmph_uint32)ceil(c * m); @@ -889,69 +890,69 @@ static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char * #else register cmph_uint32 * g_is_ptr = packed_mphf; #endif - + register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0]; - + register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type); - + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; register cmph_uint8 mphf_bucket; - + if (h1 == h2 && ++h2 >= n) h2 = 0; - mphf_bucket = (cmph_uint8)(g[h1] + g[h2]); + mphf_bucket = (cmph_uint8)(g[h1] + g[h2]); DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); DEBUGP("Address: %u\n", mphf_bucket + offset[h0]); - return (mphf_bucket + offset[h0]); + return (mphf_bucket + offset[h0]); } static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) { register CMPH_HASH h0_type = *packed_mphf++; - + register cmph_uint32 *h0_ptr = packed_mphf; - packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); - + packed_mphf = 
(cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); + register cmph_uint32 k = *packed_mphf++; register double c = (double)(*((cmph_uint64*)packed_mphf)); packed_mphf += 2; - register CMPH_HASH h1_type = *packed_mphf++; + register CMPH_HASH h1_type = *packed_mphf++; - register CMPH_HASH h2_type = *packed_mphf++; + register CMPH_HASH h2_type = *packed_mphf++; register cmph_uint8 * size = (cmph_uint8 *) packed_mphf; - packed_mphf = (cmph_uint32 *)(size + k); - + packed_mphf = (cmph_uint32 *)(size + k); + register cmph_uint32 * offset = packed_mphf; packed_mphf += k; - + register cmph_uint32 h0; - + hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint); h0 = fingerprint[2] % k; - + register cmph_uint32 m = size[h0]; register cmph_uint32 b = fch_calc_b(c, m); register double p1 = fch_calc_p1(m); register double p2 = fch_calc_p2(b); - + #if defined (__ia64) || defined (__x86_64__) register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf; #else register cmph_uint32 * g_is_ptr = packed_mphf; #endif - + register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0]; - + register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type); - + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m; register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m; @@ -962,7 +963,7 @@ static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *k } /** cmph_uint32 brz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -970,7 +971,7 @@ static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *k */ cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) { - register cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf; + register cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf; register CMPH_ALGO algo = *ptr++; cmph_uint32 fingerprint[3]; switch(algo) @@ -982,4 +983,3 @@ cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke default: assert(0); } } - diff --git a/src/buffer_entry.c b/src/buffer_entry.c index 5dcc4d5..65ebfda 100644 --- a/src/buffer_entry.c +++ b/src/buffer_entry.c @@ -17,7 +17,7 @@ struct __buffer_entry_t buffer_entry_t * buffer_entry_new(cmph_uint32 capacity) { buffer_entry_t *buff_entry = (buffer_entry_t *)malloc(sizeof(buffer_entry_t)); - assert(buff_entry); + if (!buff_entry) return NULL; buff_entry->fd = NULL; buff_entry->buff = NULL; buff_entry->capacity = capacity; @@ -62,7 +62,7 @@ cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * free(buf); return NULL; } - if((buffer_entry->pos + lacked_bytes) > buffer_entry->nbytes) + if((buffer_entry->pos + lacked_bytes) > buffer_entry->nbytes) { copied_bytes = buffer_entry->nbytes - buffer_entry->pos; lacked_bytes = (buffer_entry->pos + lacked_bytes) - buffer_entry->nbytes; @@ -71,7 +71,7 @@ cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * } memcpy(keylen + copied_bytes, buffer_entry->buff + buffer_entry->pos, (size_t)lacked_bytes); buffer_entry->pos += lacked_bytes; - + lacked_bytes = *keylen; copied_bytes = 0; buf = (cmph_uint8 *)malloc(*keylen + sizeof(*keylen)); @@ -83,7 +83,7 @@ cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * memcpy(buf + sizeof(*keylen), buffer_entry->buff + buffer_entry->pos, (size_t)copied_bytes); } 
buffer_entry_load(buffer_entry); - } + } memcpy(buf+sizeof(*keylen)+copied_bytes, buffer_entry->buff + buffer_entry->pos, (size_t)lacked_bytes); buffer_entry->pos += lacked_bytes; return buf; @@ -97,7 +97,7 @@ void buffer_entry_destroy(buffer_entry_t * buffer_entry) buffer_entry->buff = NULL; buffer_entry->capacity = 0; buffer_entry->nbytes = 0; - buffer_entry->pos = 0; + buffer_entry->pos = 0; buffer_entry->eof = 0; free(buffer_entry); } diff --git a/src/buffer_manage.c b/src/buffer_manage.c index fdefc62..93ec327 100644 --- a/src/buffer_manage.c +++ b/src/buffer_manage.c @@ -16,7 +16,7 @@ buffer_manage_t * buffer_manage_new(cmph_uint32 memory_avail, cmph_uint32 nentri { cmph_uint32 memory_avail_entry, i; buffer_manage_t *buff_manage = (buffer_manage_t *)malloc(sizeof(buffer_manage_t)); - assert(buff_manage); + if (!buff_manage) return NULL; buff_manage->memory_avail = memory_avail; buff_manage->buffer_entries = (buffer_entry_t **)calloc((size_t)nentries, sizeof(buffer_entry_t *)); buff_manage->memory_avail_list = (cmph_uint32 *)calloc((size_t)nentries, sizeof(cmph_uint32)); @@ -26,7 +26,7 @@ buffer_manage_t * buffer_manage_new(cmph_uint32 memory_avail, cmph_uint32 nentri for(i = 0; i < buff_manage->nentries; i++) { buff_manage->buffer_entries[i] = buffer_entry_new(memory_avail_entry); - } + } return buff_manage; } @@ -54,7 +54,7 @@ cmph_uint8 * buffer_manage_read_key(buffer_manage_t * buffer_manage, cmph_uint32 } void buffer_manage_destroy(buffer_manage_t * buffer_manage) -{ +{ cmph_uint32 i; for(i = 0; i < buffer_manage->nentries; i++) { diff --git a/src/buffer_manager.c b/src/buffer_manager.c index 5a051e2..243d4d9 100644 --- a/src/buffer_manager.c +++ b/src/buffer_manager.c @@ -16,7 +16,7 @@ buffer_manager_t * buffer_manager_new(cmph_uint32 memory_avail, cmph_uint32 nent { cmph_uint32 memory_avail_entry, i; buffer_manager_t *buff_manager = (buffer_manager_t *)malloc(sizeof(buffer_manager_t)); - assert(buff_manager); + if (!buff_manager) return NULL; 
buff_manager->memory_avail = memory_avail; buff_manager->buffer_entries = (buffer_entry_t **)calloc((size_t)nentries, sizeof(buffer_entry_t *)); buff_manager->memory_avail_list = (cmph_uint32 *)calloc((size_t)nentries, sizeof(cmph_uint32)); @@ -26,7 +26,7 @@ buffer_manager_t * buffer_manager_new(cmph_uint32 memory_avail, cmph_uint32 nent for(i = 0; i < buff_manager->nentries; i++) { buff_manager->buffer_entries[i] = buffer_entry_new(memory_avail_entry); - } + } return buff_manager; } @@ -52,7 +52,7 @@ cmph_uint8 * buffer_manager_read_key(buffer_manager_t * buffer_manager, cmph_uin } void buffer_manager_destroy(buffer_manager_t * buffer_manager) -{ +{ cmph_uint32 i; for(i = 0; i < buffer_manager->nentries; i++) { diff --git a/src/chd.c b/src/chd.c index 3eec2b3..6aafdbc 100644 --- a/src/chd.c +++ b/src/chd.c @@ -18,7 +18,7 @@ chd_config_data_t *chd_config_new(cmph_config_t *mph) cmph_io_adapter_t *key_source = mph->key_source; chd_config_data_t *chd; chd = (chd_config_data_t *)malloc(sizeof(chd_config_data_t)); - assert(chd); + if (!chd) return NULL; memset(chd, 0, sizeof(chd_config_data_t)); chd->chd_ph = cmph_config_new(key_source); @@ -69,12 +69,12 @@ cmph_t *chd_new(cmph_config_t *mph, double c) chd_config_data_t *chd = (chd_config_data_t *)mph->data; chd_ph_config_data_t * chd_ph = (chd_ph_config_data_t *)chd->chd_ph->data; compressed_rank_t cr; - + register cmph_t * chd_phf = NULL; - register cmph_uint32 packed_chd_phf_size = 0; + register cmph_uint32 packed_chd_phf_size = 0; cmph_uint8 * packed_chd_phf = NULL; - - register cmph_uint32 packed_cr_size = 0; + + register cmph_uint32 packed_cr_size = 0; cmph_uint8 * packed_cr = NULL; register cmph_uint32 i, idx, nkeys, nvals, nbins; @@ -86,24 +86,24 @@ cmph_t *chd_new(cmph_config_t *mph, double c) ELAPSED_TIME_IN_SECONDS(&construction_time_begin); #endif - cmph_config_set_verbosity(chd->chd_ph, mph->verbosity); + cmph_config_set_verbosity(chd->chd_ph, mph->verbosity); cmph_config_set_graphsize(chd->chd_ph, c); - + 
if (mph->verbosity) { fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c); } - + chd_phf = cmph_new(chd->chd_ph); - - if(chd_phf == NULL) + + if(chd_phf == NULL) { return NULL; } - - packed_chd_phf_size = cmph_packed_size(chd_phf); + + packed_chd_phf_size = cmph_packed_size(chd_phf); DEBUGP("packed_chd_phf_size = %u\n", packed_chd_phf_size); - + /* Make sure that we have enough space to pack the mphf. */ packed_chd_phf = calloc((size_t)packed_chd_phf_size,(size_t)1); @@ -111,8 +111,8 @@ cmph_t *chd_new(cmph_config_t *mph, double c) cmph_pack(chd_phf, packed_chd_phf); cmph_destroy(chd_phf); - - + + if (mph->verbosity) { fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n"); @@ -121,11 +121,11 @@ cmph_t *chd_new(cmph_config_t *mph, double c) compressed_rank_init(&cr); nbins = chd_ph->n; nkeys = chd_ph->m; - nvals = nbins - nkeys; - + nvals = nbins - nkeys; + vals_table = (cmph_uint32 *)calloc(nvals, sizeof(cmph_uint32)); occup_table = (cmph_uint32 *)chd_ph->occup_table; - + for(i = 0, idx = 0; i < nbins; i++) { if(!GETBIT32(occup_table, i)) @@ -133,10 +133,10 @@ cmph_t *chd_new(cmph_config_t *mph, double c) vals_table[idx++] = i; } } - + compressed_rank_generate(&cr, vals_table, nvals); free(vals_table); - + packed_cr_size = compressed_rank_packed_size(&cr); packed_cr = (cmph_uint8 *) calloc(packed_cr_size, sizeof(cmph_uint8)); compressed_rank_pack(&cr, packed_cr); @@ -145,16 +145,16 @@ cmph_t *chd_new(cmph_config_t *mph, double c) mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = mph->algo; chdf = (chd_data_t *)malloc(sizeof(chd_data_t)); - + chdf->packed_cr = packed_cr; packed_cr = NULL; //transfer memory ownership chdf->packed_chd_phf = packed_chd_phf; packed_chd_phf = NULL; //transfer memory ownership - + chdf->packed_chd_phf_size = packed_chd_phf_size; chdf->packed_cr_size = packed_cr_size; - + mphf->data = chdf; mphf->size = nkeys; @@ -163,12 +163,12 @@ cmph_t 
*chd_new(cmph_config_t *mph, double c) { fprintf(stderr, "Successfully generated minimal perfect hash function\n"); } - #ifdef CMPH_TIMING + #ifdef CMPH_TIMING ELAPSED_TIME_IN_SECONDS(&construction_time); register cmph_uint32 space_usage = chd_packed_size(mphf)*8; construction_time = construction_time - construction_time_begin; fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", nkeys, c, chd_ph->keys_per_bucket, construction_time, space_usage/(double)nkeys); - #endif + #endif return mphf; } @@ -196,7 +196,7 @@ int chd_dump(cmph_t *mphf, FILE *fd) { register size_t nbytes; chd_data_t *data = (chd_data_t *)mphf->data; - + __cmph_dump(mphf, fd); // Dumping CHD_PH perfect hash function @@ -207,7 +207,7 @@ int chd_dump(cmph_t *mphf, FILE *fd) DEBUGP("Dumping compressed rank structure with %u bytes to disk\n", 1); nbytes = fwrite(&data->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(data->packed_cr, data->packed_cr_size, (size_t)1, fd); - + return 1; } @@ -242,10 +242,10 @@ void chd_pack(cmph_t *mphf, void *packed_mphf) // packing packed_cr_size and packed_cr *ptr = data->packed_cr_size; ptr8 = (cmph_uint8 *) (ptr + 1); - + memcpy(ptr8, data->packed_cr, data->packed_cr_size); ptr8 += data->packed_cr_size; - + ptr = (cmph_uint32 *) ptr8; *ptr = data->packed_chd_phf_size; @@ -268,5 +268,3 @@ cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 * packed_chd_phf = ((cmph_uint8 *) ptr) + packed_cr_size + sizeof(cmph_uint32); return _chd_search(packed_chd_phf, ptr, key, keylen); } - - diff --git a/src/chd_ph.c b/src/chd_ph.c index 71f83fb..d225156 100644 --- a/src/chd_ph.c +++ b/src/chd_ph.c @@ -29,7 +29,7 @@ struct _chd_ph_item_t }; typedef struct _chd_ph_item_t chd_ph_item_t; -// struct to represent the items at mapping phase only. +// struct to represent the items at mapping phase only. 
struct _chd_ph_map_item_t { cmph_uint32 f; @@ -85,7 +85,7 @@ static cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item register chd_ph_map_item_t * tmp_map_item = map_items + item_idx; register chd_ph_bucket_t * bucket = buckets + tmp_map_item->bucket_num; tmp_item = items + bucket->items_list; - + for(i = 0; i < bucket->size; i++) { if(tmp_item->f == tmp_map_item->f && tmp_item->h == tmp_map_item->h) @@ -105,7 +105,7 @@ void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets) free(buckets); } -static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, +static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size); static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items, @@ -131,7 +131,7 @@ static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, { probe_counts[disp_table[i]]++; }; - + for(i = 0; i < max_probes; i++) { if(probe_counts[i] > 0) @@ -145,9 +145,9 @@ chd_ph_config_data_t *chd_ph_config_new(void) { chd_ph_config_data_t *chd_ph; chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t)); - assert(chd_ph); + if (!chd_ph) return NULL; memset(chd_ph, 0, sizeof(chd_ph_config_data_t)); - + chd_ph->hashfunc = CMPH_HASH_JENKINS; chd_ph->cs = NULL; chd_ph->nbuckets = 0; @@ -159,7 +159,7 @@ chd_ph_config_data_t *chd_ph_config_new(void) chd_ph->keys_per_bin = 1; chd_ph->keys_per_bucket = 4; chd_ph->occup_table = 0; - + return chd_ph; } @@ -184,7 +184,7 @@ void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 1) break; //chd_ph only uses one linear hash function - chd_ph->hashfunc = *hashptr; + chd_ph->hashfunc = *hashptr; ++i, ++hashptr; } } @@ -228,24 +228,24 @@ cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_ { mapping_iterations--; if 
(chd_ph->hl) hash_state_destroy(chd_ph->hl); - chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m); + chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m); chd_ph_bucket_clean(buckets, chd_ph->nbuckets); - mph->key_source->rewind(mph->key_source->data); + mph->key_source->rewind(mph->key_source->data); for(i = 0; i < chd_ph->m; i++) { - mph->key_source->read(mph->key_source->data, &key, &keylen); + mph->key_source->read(mph->key_source->data, &key, &keylen); hash_vector(chd_ph->hl, key, keylen, hl); - + map_item = (map_items + i); g = hl[0] % chd_ph->nbuckets; map_item->f = hl[1] % chd_ph->n; map_item->h = hl[2] % (chd_ph->n - 1) + 1; map_item->bucket_num=g; - mph->key_source->dispose(mph->key_source->data, key, keylen); + mph->key_source->dispose(mph->key_source->data, key, keylen); // if(buckets[g].size == (chd_ph->keys_per_bucket << 2)) // { // DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2)); @@ -275,7 +275,7 @@ cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_ free(map_items); return 1; // SUCCESS } - + if(mapping_iterations == 0) { goto error; @@ -292,7 +292,7 @@ chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_ cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size) { chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc(max_bucket_size + 1, sizeof(chd_ph_sorted_list_t)); - + chd_ph_bucket_t * input_buckets = (*_buckets); chd_ph_bucket_t * output_buckets; chd_ph_item_t * input_items = (*_items); @@ -319,7 +319,7 @@ chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_ // Store the buckets in a new array which is sorted by bucket sizes output_buckets = calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero // non_empty_buckets = nbuckets; - + for(i = 0; i < nbuckets; i++) { bucket_size = input_buckets[i].size; @@ -338,8 +338,8 @@ 
chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_ // Return the buckets sorted in new order and free the old buckets sorted in old order free(input_buckets); (*_buckets) = output_buckets; - - + + // Store the items according to the new order of buckets. output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t)); position = 0; @@ -426,26 +426,26 @@ static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph } position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n); UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position); - + // ([position/32]^=(1<<(position%32)); item++; i--; }; }; return 0; - } + } return 1; }; -static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes, +static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes, cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size) - + { register cmph_uint32 probe0_num, probe1_num, probe_num; probe0_num = 0; probe1_num = 0; probe_num = 0; - + while(1) { if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, bucket_num,size)) @@ -469,7 +469,7 @@ static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucke }; static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items, - cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table) { register cmph_uint32 i = 0; @@ -490,8 +490,8 @@ static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_buc return 1; }; -static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, - cmph_uint32 
max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, +static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table) { register cmph_uint32 i,j, non_placed_bucket; @@ -516,10 +516,10 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc { // if bucket is successfully placed remove it from list if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i)) - { + { disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n; // DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]); - } + } else { // DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket); @@ -529,7 +529,7 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc #endif buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list; buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id; -#ifdef DEBUG +#ifdef DEBUG buckets[curr_bucket].items_list=items_list; buckets[curr_bucket].bucket_id=bucket_id; #endif @@ -557,7 +557,7 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc }; cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items , - cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table) { if(chd_ph->use_h) @@ -582,7 +582,7 @@ static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, memset(chd_ph->occup_table, 0, chd_ph->n); else memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32)); - + for(bucket_size = 1; bucket_size <= 
max_bucket_size; bucket_size++) for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++) @@ -602,7 +602,7 @@ static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, return 0; } (chd_ph->occup_table[position])++; - } + } else { if(GETBIT32(((cmph_uint32*)chd_ph->occup_table), position)) @@ -624,7 +624,7 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) cmph_t *mphf = NULL; chd_ph_data_t *chd_phf = NULL; chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; - + register double load_factor = c; register cmph_uint8 searching_success = 0; register cmph_uint32 max_probes = 1 << 20; // default value for max_probes @@ -645,24 +645,24 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) chd_ph->m = mph->key_source->nkeys; DEBUGP("m = %u\n", chd_ph->m); - + chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1; DEBUGP("nbuckets = %u\n", chd_ph->nbuckets); - + if(load_factor < 0.5 ) { load_factor = 0.5; } - + if(load_factor >= 0.99) { load_factor = 0.99; } - + DEBUGP("load_factor = %.3f\n", load_factor); - + chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1; - + //Round the number of bins to the prime immediately above if(chd_ph->n % 2 == 0) chd_ph->n++; for(;;) @@ -670,35 +670,35 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) if(check_primality(chd_ph->n) == 1) break; chd_ph->n += 2; // just odd numbers can be primes for n > 2 - + }; - + DEBUGP("n = %u \n", chd_ph->n); if(chd_ph->keys_per_bin == 1) { space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n); } - + if(mph->verbosity) { fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound); } // We allocate the working tables - buckets = chd_ph_bucket_new(chd_ph->nbuckets); + buckets = chd_ph_bucket_new(chd_ph->nbuckets); items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t)); max_probes = 
(cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes); - + if(chd_ph->keys_per_bin == 1) chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32)); else chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8)); - + disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32)); -// +// // init_genrand(time(0)); - + while(1) { iterations --; @@ -706,12 +706,12 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) { fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n); } - + if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size)) { if (mph->verbosity) { - fprintf(stderr, "Failure in mapping step\n"); + fprintf(stderr, "Failure in mapping step\n"); } failure = 1; goto cleanup; @@ -727,15 +727,15 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) } sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size); - + if (mph->verbosity) { fprintf(stderr, "Starting searching step\n"); } - + searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table); if(searching_success) break; - + // reset occup_table if(chd_ph->keys_per_bin > 1) memset(chd_ph->occup_table, 0, chd_ph->n); @@ -757,19 +757,19 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) { if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size)) { - + DEBUGP("Error for bin packing generation"); failure = 1; goto cleanup; } } #endif - + if (mph->verbosity) { fprintf(stderr, "Starting compressing step\n"); } - + if(chd_ph->cs) { free(chd_ph->cs); @@ -777,7 +777,7 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); compressed_seq_init(chd_ph->cs); compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets); - + #ifdef CMPH_TIMING ELAPSED_TIME_IN_SECONDS(&construction_time); register double entropy = 
chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes); @@ -785,11 +785,11 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) #endif cleanup: - chd_ph_bucket_destroy(buckets); + chd_ph_bucket_destroy(buckets); free(items); free(sorted_lists); free(disp_table); - if(failure) + if(failure) { if(chd_ph->hl) { @@ -802,14 +802,14 @@ cleanup: mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = mph->algo; chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t)); - + chd_phf->cs = chd_ph->cs; chd_ph->cs = NULL; //transfer memory ownership chd_phf->hl = chd_ph->hl; chd_ph->hl = NULL; //transfer memory ownership chd_phf->n = chd_ph->n; chd_phf->nbuckets = chd_ph->nbuckets; - + mphf->data = chd_phf; mphf->size = chd_ph->n; @@ -818,12 +818,12 @@ cleanup: { fprintf(stderr, "Successfully generated minimal perfect hash function\n"); } - - #ifdef CMPH_TIMING + + #ifdef CMPH_TIMING register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8; construction_time = construction_time - construction_time_begin; fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m); - #endif + #endif return mphf; } @@ -846,19 +846,19 @@ void chd_ph_load(FILE *fd, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); chd_ph->hl = hash_state_load(buf, buflen); free(buf); - + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd); DEBUGP("Compressed sequence structure has %u bytes\n", buflen); buf = (char *)malloc((size_t)buflen); nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); - chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); + chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); compressed_seq_load(chd_ph->cs, buf, buflen); free(buf); - + // loading n and nbuckets DEBUGP("Reading n and nbuckets\n"); - nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd); - nbytes = 
fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); } int chd_ph_dump(cmph_t *mphf, FILE *fd) @@ -867,7 +867,7 @@ int chd_ph_dump(cmph_t *mphf, FILE *fd) cmph_uint32 buflen; register size_t nbytes; chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; - + __cmph_dump(mphf, fd); hash_state_dump(data->hl, &buf, &buflen); @@ -906,11 +906,11 @@ cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) register cmph_uint32 disp,position; register cmph_uint32 probe0_num,probe1_num; register cmph_uint32 f,g,h; - hash_vector(chd_ph->hl, key, keylen, hl); + hash_vector(chd_ph->hl, key, keylen, hl); g = hl[0] % chd_ph->nbuckets; f = hl[1] % chd_ph->n; h = hl[2] % (chd_ph->n-1) + 1; - + disp = compressed_seq_query(chd_ph->cs, g); probe0_num = disp % chd_ph->n; probe1_num = disp/chd_ph->n; @@ -949,10 +949,10 @@ void chd_ph_pack(cmph_t *mphf, void *packed_mphf) cmph_uint32 chd_ph_packed_size(cmph_t *mphf) { register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; - register CMPH_HASH hl_type = hash_get_type(data->hl); + register CMPH_HASH hl_type = hash_get_type(data->hl); register cmph_uint32 hash_state_pack_size = hash_state_packed_size(hl_type); register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs); - + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32)); } @@ -961,28 +961,25 @@ cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 { register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; - + register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type)); register cmph_uint32 n = *ptr++; register cmph_uint32 nbuckets = *ptr++; cmph_uint32 hl[3]; - + register cmph_uint32 disp,position; register cmph_uint32 
probe0_num,probe1_num; register cmph_uint32 f,g,h; - + hash_vector_packed(hl_ptr, hl_type, key, keylen, hl); g = hl[0] % nbuckets; f = hl[1] % n; h = hl[2] % (n-1) + 1; - + disp = compressed_seq_query_packed(ptr, g); probe0_num = disp % n; probe1_num = disp/n; position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n); return position; } - - - diff --git a/src/chm.c b/src/chm.c index 9cdbf41..5c416b1 100644 --- a/src/chm.c +++ b/src/chm.c @@ -21,7 +21,7 @@ chm_config_data_t *chm_config_new(void) { chm_config_data_t *chm = NULL; chm = (chm_config_data_t *)malloc(sizeof(chm_config_data_t)); - assert(chm); + if (!chm) return NULL; memset(chm, 0, sizeof(chm_config_data_t)); chm->hashfuncs[0] = CMPH_HASH_JENKINS; chm->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -45,7 +45,7 @@ void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 2) break; //chm only uses two hash functions - chm->hashfuncs[i] = *hashptr; + chm->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -61,7 +61,7 @@ cmph_t *chm_new(cmph_config_t *mph, double c) chm_config_data_t *chm = (chm_config_data_t *)mph->data; chm->m = mph->key_source->nkeys; if (c == 0) c = 2.09; - chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); + chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c); chm->graph = graph_new(chm->n, chm->m); DEBUGP("Created graph\n"); @@ -92,12 +92,12 @@ cmph_t *chm_new(cmph_config_t *mph, double c) fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } - else break; + } + else break; } if (iterations == 0) { - graph_destroy(chm->graph); + graph_destroy(chm->graph); return NULL; } @@ -120,7 +120,7 @@ cmph_t *chm_new(cmph_config_t *mph, double c) chm_traverse(chm, visited, i); } } - graph_destroy(chm->graph); + graph_destroy(chm->graph); free(visited); chm->graph = NULL; @@ -149,7 +149,7 
@@ static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint3 graph_iterator_t it = graph_neighbors_it(chm->graph, v); cmph_uint32 neighbor = 0; SETBIT(visited,v); - + DEBUGP("Visiting vertex %u\n", v); while((neighbor = graph_next_neighbor(chm->graph, &it)) != GRAPH_NO_NEIGHBOR) { @@ -162,7 +162,7 @@ static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint3 chm_traverse(chm, visited, neighbor); } } - + static int chm_gen_edges(cmph_config_t *mph) { cmph_uint32 e; @@ -170,7 +170,7 @@ static int chm_gen_edges(cmph_config_t *mph) int cycles = 0; DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", chm->n, cmph_hash_names[chm->hashfuncs[0]], cmph_hash_names[chm->hashfuncs[1]]); - graph_clear_edges(chm->graph); + graph_clear_edges(chm->graph); mph->key_source->rewind(mph->key_source->data); for (e = 0; e < mph->key_source->nkeys; ++e) { @@ -181,7 +181,7 @@ static int chm_gen_edges(cmph_config_t *mph) h1 = hash(chm->hashes[0], key, keylen) % chm->n; h2 = hash(chm->hashes[1], key, keylen) % chm->n; if (h1 == h2) if (++h2 >= chm->n) h2 = 0; - if (h1 == h2) + if (h1 == h2) { if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); mph->key_source->dispose(mph->key_source->data, key, keylen); @@ -205,7 +205,7 @@ int chm_dump(cmph_t *mphf, FILE *fd) cmph_uint32 two = 2; //number of hash functions chm_data_t *data = (chm_data_t *)mphf->data; register size_t nbytes; - + __cmph_dump(mphf, fd); nbytes = fwrite(&two, sizeof(cmph_uint32), (size_t)1, fd); @@ -223,7 +223,7 @@ int chm_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); - + nbytes = fwrite(data->g, sizeof(cmph_uint32)*data->n, (size_t)1, fd); /* #ifdef DEBUG fprintf(stderr, "G: "); @@ -260,8 +260,8 @@ void chm_load(FILE *f, cmph_t *mphf) } DEBUGP("Reading m and n\n"); - nbytes = fread(&(chm->n), sizeof(cmph_uint32), (size_t)1, f); - 
nbytes = fread(&(chm->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(chm->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(chm->m), sizeof(cmph_uint32), (size_t)1, f); chm->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*chm->n); nbytes = fread(chm->g, chm->n*sizeof(cmph_uint32), (size_t)1, f); @@ -272,7 +272,7 @@ void chm_load(FILE *f, cmph_t *mphf) #endif return; } - + cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { @@ -287,7 +287,7 @@ cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) void chm_destroy(cmph_t *mphf) { chm_data_t *data = (chm_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hashes[0]); hash_state_destroy(data->hashes[1]); free(data->hashes); @@ -298,7 +298,7 @@ void chm_destroy(cmph_t *mphf) /** \fn void chm_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() */ void chm_pack(cmph_t *mphf, void *packed_mphf) { @@ -332,26 +332,26 @@ void chm_pack(cmph_t *mphf, void *packed_mphf) ptr += sizeof(data->m); // packing g - memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); + memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); } /** \fn cmph_uint32 chm_packed_size(cmph_t *mphf); * \brief Return the amount of space needed to pack mphf. 
* \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 chm_packed_size(cmph_t *mphf) { chm_data_t *data = (chm_data_t *)mphf->data; - CMPH_HASH h1_type = hash_get_type(data->hashes[0]); - CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); - return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + 4*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n); } /** cmph_uint32 chm_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. * \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -366,16 +366,16 @@ cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); h2_ptr += 4; - + register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); - - register cmph_uint32 n = *g_ptr++; - register cmph_uint32 m = *g_ptr++; - - register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; - register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; + + register cmph_uint32 n = *g_ptr++; + register cmph_uint32 m = *g_ptr++; + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); if (h1 == h2 && ++h2 >= n) h2 = 0; DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m); - return (g_ptr[h1] + g_ptr[h2]) % m; + return (g_ptr[h1] + 
g_ptr[h2]) % m; } diff --git a/src/cmph.c b/src/cmph.c index cba735f..f460dd0 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -1,10 +1,10 @@ #include "cmph.h" #include "cmph_structs.h" #include "chm.h" -#include "bmz.h" -#include "bmz8.h" -#include "brz.h" -#include "fch.h" +#include "bmz.h" +#include "bmz8.h" +#include "brz.h" +#include "fch.h" #include "bdz.h" #include "bdz_ph.h" #include "chd_ph.h" @@ -13,23 +13,23 @@ #include #include #include -//#define DEBUG +// #define DEBUG #include "debug.h" const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", "chd", NULL }; -typedef struct +typedef struct { void *vector; - cmph_uint32 position; // access position when data is a vector + cmph_uint32 position; // access position when data is a vector } cmph_vector_t; -/** +/** * Support a vector of struct as the source of keys. * - * E.g. The keys could be the fieldB's in a vector of struct rec where + * E.g. The keys could be the fieldB's in a vector of struct rec where * struct rec is defined as: * struct rec { * fieldA; @@ -37,7 +37,7 @@ typedef struct * fieldC; * } */ -typedef struct +typedef struct { void *vector; /* Pointer to the vector of struct */ cmph_uint32 position; /* current position */ @@ -61,7 +61,7 @@ static int key_nlfile_read(void *data, char **key, cmph_uint32 *keylen) while(1) { char buf[BUFSIZ]; - char *c = fgets(buf, BUFSIZ, fd); + char *c = fgets(buf, BUFSIZ, fd); if (c == NULL) return -1; if (feof(fd)) return -1; *key = (char *)realloc(*key, *keylen + strlen(buf) + 1); @@ -156,8 +156,12 @@ static cmph_uint32 count_nlfile_keys(FILE *fd) while(1) { char buf[BUFSIZ]; - ptr = fgets(buf, BUFSIZ, fd); + ptr = fgets(buf, BUFSIZ, fd); if (feof(fd)) break; + if (ferror(fd) || ptr == NULL) { + perror("Error reading input file"); + return 0; + } if (buf[strlen(buf) - 1] != '\n') continue; ++count; } @@ -264,12 +268,12 @@ cmph_io_adapter_t *cmph_io_struct_vector_adapter(void * vector, cmph_uint32 stru key_source->read = 
key_struct_vector_read; key_source->dispose = key_vector_dispose; key_source->rewind = key_struct_vector_rewind; - return key_source; + return key_source; } void cmph_io_struct_vector_adapter_destroy(cmph_io_adapter_t * key_source) { - cmph_io_struct_vector_destroy(key_source); + cmph_io_struct_vector_destroy(key_source); } cmph_io_adapter_t *cmph_io_vector_adapter(char ** vector, cmph_uint32 nkeys) @@ -370,7 +374,7 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) { - if (mph->algo == CMPH_BRZ) + if (mph->algo == CMPH_BRZ) { brz_config_set_tmp_dir(mph, tmp_dir); } @@ -379,7 +383,7 @@ void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) { - if (mph->algo == CMPH_BRZ) + if (mph->algo == CMPH_BRZ) { brz_config_set_mphf_fd(mph, mphf_fd); } @@ -387,19 +391,19 @@ void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b) { - if (mph->algo == CMPH_BRZ) + if (mph->algo == CMPH_BRZ) { brz_config_set_b(mph, b); } - else if (mph->algo == CMPH_BDZ) + else if (mph->algo == CMPH_BDZ) { bdz_config_set_b(mph, b); } - else if (mph->algo == CMPH_CHD_PH) + else if (mph->algo == CMPH_CHD_PH) { chd_ph_config_set_b(mph, b); } - else if (mph->algo == CMPH_CHD) + else if (mph->algo == CMPH_CHD) { chd_config_set_b(mph, b); } @@ -407,11 +411,11 @@ void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b) void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) { - if (mph->algo == CMPH_CHD_PH) + if (mph->algo == CMPH_CHD_PH) { chd_ph_config_set_keys_per_bin(mph, keys_per_bin); } - else if (mph->algo == CMPH_CHD) + else if (mph->algo == CMPH_CHD) { chd_config_set_keys_per_bin(mph, keys_per_bin); } @@ -419,7 +423,7 @@ void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) void 
cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability) { - if (mph->algo == CMPH_BRZ) + if (mph->algo == CMPH_BRZ) { brz_config_set_memory_availability(mph, memory_availability); } @@ -519,7 +523,7 @@ cmph_t *cmph_new(cmph_config_t *mph) double c = mph->c; DEBUGP("Creating mph with algorithm %s\n", cmph_names[mph->algo]); - switch (mph->algo) + switch (mph->algo) { case CMPH_CHM: DEBUGP("Creating chm hash\n"); @@ -654,28 +658,28 @@ cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) case CMPH_CHM: return chm_search(mphf, key, keylen); case CMPH_BMZ: /* included -- Fabiano */ - DEBUGP("bmz algorithm search\n"); + DEBUGP("bmz algorithm search\n"); return bmz_search(mphf, key, keylen); case CMPH_BMZ8: /* included -- Fabiano */ - DEBUGP("bmz8 algorithm search\n"); + DEBUGP("bmz8 algorithm search\n"); return bmz8_search(mphf, key, keylen); case CMPH_BRZ: /* included -- Fabiano */ - DEBUGP("brz algorithm search\n"); + DEBUGP("brz algorithm search\n"); return brz_search(mphf, key, keylen); case CMPH_FCH: /* included -- Fabiano */ - DEBUGP("fch algorithm search\n"); + DEBUGP("fch algorithm search\n"); return fch_search(mphf, key, keylen); case CMPH_BDZ: /* included -- Fabiano */ - DEBUGP("bdz algorithm search\n"); + DEBUGP("bdz algorithm search\n"); return bdz_search(mphf, key, keylen); case CMPH_BDZ_PH: /* included -- Fabiano */ - DEBUGP("bdz_ph algorithm search\n"); + DEBUGP("bdz_ph algorithm search\n"); return bdz_ph_search(mphf, key, keylen); case CMPH_CHD_PH: /* included -- Fabiano */ - DEBUGP("chd_ph algorithm search\n"); + DEBUGP("chd_ph algorithm search\n"); return chd_ph_search(mphf, key, keylen); case CMPH_CHD: /* included -- Fabiano */ - DEBUGP("chd algorithm search\n"); + DEBUGP("chd algorithm search\n"); return chd_search(mphf, key, keylen); default: assert(0); @@ -688,7 +692,7 @@ cmph_uint32 cmph_size(cmph_t *mphf) { return mphf->size; } - + void cmph_destroy(cmph_t *mphf) { switch(mphf->algo) @@ -720,7 
+724,7 @@ void cmph_destroy(cmph_t *mphf) case CMPH_CHD: /* included -- Fabiano */ chd_destroy(mphf); return; - default: + default: assert(0); } assert(0); @@ -731,12 +735,12 @@ void cmph_destroy(cmph_t *mphf) /** \fn void cmph_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() */ void cmph_pack(cmph_t *mphf, void *packed_mphf) { // packing algorithm type to be used in cmph.c - cmph_uint32 * ptr = (cmph_uint32 *) packed_mphf; + cmph_uint32 * ptr = (cmph_uint32 *) packed_mphf; *ptr++ = mphf->algo; DEBUGP("mphf->algo = %u\n", mphf->algo); switch(mphf->algo) @@ -768,7 +772,7 @@ void cmph_pack(cmph_t *mphf, void *packed_mphf) case CMPH_CHD: /* included -- Fabiano */ chd_pack(mphf, ptr); break; - default: + default: assert(0); } return; @@ -778,7 +782,7 @@ void cmph_pack(cmph_t *mphf, void *packed_mphf) * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 cmph_packed_size(cmph_t *mphf) { switch(mphf->algo) @@ -801,14 +805,14 @@ cmph_uint32 cmph_packed_size(cmph_t *mphf) return chd_ph_packed_size(mphf); case CMPH_CHD: /* included -- Fabiano */ return chd_packed_size(mphf); - default: + default: assert(0); } return 0; // FAILURE } /** cmph_uint32 cmph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -838,7 +842,7 @@ cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 k return chd_ph_search_packed(++ptr, key, keylen); case CMPH_CHD: /* included -- Fabiano */ return chd_search_packed(++ptr, key, keylen); - default: + default: assert(0); } return 0; // FAILURE diff --git a/src/cmph_benchmark.c b/src/cmph_benchmark.c new file mode 100644 index 0000000..0023e2f --- /dev/null +++ b/src/cmph_benchmark.c @@ -0,0 +1,129 @@ +// A simple benchmark tool around getrusage + +#include +#include +#include +#include +#include + +#include "cmph_benchmark.h" + +typedef struct { + const char* name; + void (*func)(int); + int iters; + struct rusage begin; + struct rusage end; +} benchmark_t; + +static benchmark_t* global_benchmarks = NULL; + +/* Subtract the `struct timeval' values X and Y, + storing the result in RESULT. + Return 1 if the difference is negative, otherwise 0. */ + +int timeval_subtract ( + struct timeval *result, struct timeval *x, struct timeval* y) { + /* Perform the carry for the later subtraction by updating y. */ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + tv_usec is certainly positive. */ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. 
*/ + return x->tv_sec < y->tv_sec; +} + +benchmark_t* find_benchmark(const char* name) { + benchmark_t* benchmark = global_benchmarks; + while (benchmark && benchmark->name != NULL) { + if (strcmp(benchmark->name, name) == 0) break; + ++benchmark; + } + if (!benchmark || !benchmark->name) return NULL; + return benchmark; +} + +int global_benchmarks_length() { + benchmark_t* benchmark = global_benchmarks; + int length = 0; + if (benchmark == NULL) return 0; + while (benchmark->name != NULL) ++length, ++benchmark; + return length; +} + +void bm_register(const char* name, void (*func)(int), int iters) { + benchmark_t benchmark; + int length = global_benchmarks_length(); + benchmark.name = name; + benchmark.func = func; + benchmark.iters = iters; + assert(!find_benchmark(name)); + global_benchmarks = realloc( + global_benchmarks, (length + 2)*sizeof(benchmark_t)); + global_benchmarks[length] = benchmark; + memset(&benchmark, 0, sizeof(benchmark_t)); // pivot + global_benchmarks[length + 1] = benchmark; +} + +void bm_start(const char* name) { + benchmark_t* benchmark; + struct rusage rs; + + benchmark = find_benchmark(name); + assert(benchmark); + int ret = getrusage(RUSAGE_SELF, &rs); + if (ret != 0) { + perror("rusage failed"); + exit(-1); + } + benchmark->begin = rs; + (*benchmark->func)(benchmark->iters); +} + +void bm_end(const char* name) { + benchmark_t* benchmark; + struct rusage rs; + + int ret = getrusage(RUSAGE_SELF, &rs); + if (ret != 0) { + perror("rusage failed"); + exit(-1); + } + + benchmark = find_benchmark(name); + benchmark->end = rs; + + struct timeval utime; + timeval_subtract(&utime, &benchmark->end.ru_utime, &benchmark->begin.ru_utime); + struct timeval stime; + timeval_subtract(&stime, &benchmark->end.ru_stime, &benchmark->begin.ru_stime); + + printf("Benchmark: %s\n", benchmark->name); + printf("User time used : %ld.%06ld\n", + utime.tv_sec, (long int)utime.tv_usec); + printf("System time used: %ld.%06ld\n", + stime.tv_sec, (long 
int)stime.tv_usec); + printf("\n"); +} + +void run_benchmarks(int argc, char** argv) { + benchmark_t* benchmark = global_benchmarks; + while (benchmark && benchmark->name != NULL) { + bm_start(benchmark->name); + bm_end(benchmark->name); + ++benchmark; + } +} + diff --git a/src/cmph_benchmark.h b/src/cmph_benchmark.h new file mode 100644 index 0000000..bd0eb78 --- /dev/null +++ b/src/cmph_benchmark.h @@ -0,0 +1,20 @@ +#ifndef __CMPH_BENCHMARK_H__ +#define __CMPH_BENCHMARK_H__ + +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define BM_REGISTER(func, iters) bm_register(#func, func, iters) +void bm_register(const char* name, void (*func)(int), int iters); +void run_benchmarks(int argc, char** argv); + +#ifdef __cplusplus +} +#endif + +#endif // __CMPH_BENCHMARK_H__ diff --git a/src/cmph_structs.c b/src/cmph_structs.c index bcd3da3..2c28bc3 100644 --- a/src/cmph_structs.c +++ b/src/cmph_structs.c @@ -28,7 +28,7 @@ void __cmph_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(cmph_names[mphf->algo], (size_t)(strlen(cmph_names[mphf->algo]) + 1), (size_t)1, fd); nbytes = fwrite(&(mphf->size), sizeof(mphf->size), (size_t)1, fd); } -cmph_t *__cmph_load(FILE *f) +cmph_t *__cmph_load(FILE *f) { cmph_t *mphf = NULL; cmph_uint32 i; @@ -36,7 +36,7 @@ cmph_t *__cmph_load(FILE *f) char *ptr = algo_name; CMPH_ALGO algo = CMPH_COUNT; register size_t nbytes; - + DEBUGP("Loading mphf\n"); while(1) { @@ -52,7 +52,7 @@ cmph_t *__cmph_load(FILE *f) algo = i; } } - if (algo == CMPH_COUNT) + if (algo == CMPH_COUNT) { DEBUGP("Algorithm %s not found\n", algo_name); return NULL; @@ -65,5 +65,3 @@ cmph_t *__cmph_load(FILE *f) return mphf; } - - diff --git a/src/djb2_hash.c b/src/djb2_hash.c index d3b4330..25f8220 100644 --- a/src/djb2_hash.c +++ b/src/djb2_hash.c @@ -4,6 +4,7 @@ djb2_state_t *djb2_state_new() { djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t)); + if (!state) return NULL; state->hashfunc = CMPH_HASH_DJB2; return state; } @@ -18,7 +19,7 @@ 
cmph_uint32 djb2_hash(djb2_state_t *state, const char *k, cmph_uint32 keylen) register cmph_uint32 hash = 5381; const unsigned char *ptr = (unsigned char *)k; cmph_uint32 i = 0; - while (i < keylen) + while (i < keylen) { hash = hash*33 ^ *ptr; ++ptr, ++i; diff --git a/src/fch.c b/src/fch.c index 67b68fb..9ca4e03 100644 --- a/src/fch.c +++ b/src/fch.c @@ -23,7 +23,7 @@ fch_config_data_t *fch_config_new() { fch_config_data_t *fch; fch = (fch_config_data_t *)malloc(sizeof(fch_config_data_t)); - assert(fch); + if (!fch) return NULL; memset(fch, 0, sizeof(fch_config_data_t)); fch->hashfuncs[0] = CMPH_HASH_JENKINS; fch->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -50,7 +50,7 @@ void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 2) break; //fch only uses two hash functions - fch->hashfuncs[i] = *hashptr; + fch->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -88,36 +88,36 @@ static fch_buckets_t * mapping(cmph_config_t *mph) fch_buckets_t *buckets = NULL; fch_config_data_t *fch = (fch_config_data_t *)mph->data; if (fch->h1) hash_state_destroy(fch->h1); - fch->h1 = hash_state_new(fch->hashfuncs[0], fch->m); + fch->h1 = hash_state_new(fch->hashfuncs[0], fch->m); fch->b = fch_calc_b(fch->c, fch->m); fch->p1 = fch_calc_p1(fch->m); fch->p2 = fch_calc_p2(fch->b); //DEBUGP("b:%u p1:%f p2:%f\n", fch->b, fch->p1, fch->p2); buckets = fch_buckets_new(fch->b); - mph->key_source->rewind(mph->key_source->data); + mph->key_source->rewind(mph->key_source->data); for(i = 0; i < fch->m; i++) { cmph_uint32 h1, keylen; char *key = NULL; - mph->key_source->read(mph->key_source->data, &key, &keylen); + mph->key_source->read(mph->key_source->data, &key, &keylen); h1 = hash(fch->h1, key, keylen) % fch->m; h1 = mixh10h11h12 (fch->b, fch->p1, fch->p2, h1); fch_buckets_insert(buckets, h1, key, keylen); key = NULL; // transger memory ownership - + } - return buckets; + return buckets; } -// returns the buckets indexes sorted by their 
sizes. +// returns the buckets indexes sorted by their sizes. static cmph_uint32 * ordering(fch_buckets_t * buckets) { return fch_buckets_get_indexes_sorted_by_size(buckets); } -/* Check whether function h2 causes collisions among the keys of each bucket */ +/* Check whether function h2 causes collisions among the keys of each bucket */ static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes) { //cmph_uint32 max_size = fch_buckets_get_max_size(buckets); @@ -146,7 +146,7 @@ static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t } static void permut(cmph_uint32 * vector, cmph_uint32 n) -{ +{ cmph_uint32 i, j, b; for (i = 0; i < n; i++) { j = (cmph_uint32) rand() % n; @@ -179,12 +179,12 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph { map_table[random_table[i]] = i; } - do { + do { if (fch->h2) hash_state_destroy(fch->h2); - fch->h2 = hash_state_new(fch->hashfuncs[1], fch->m); + fch->h2 = hash_state_new(fch->hashfuncs[1], fch->m); restart = check_for_collisions_h2(fch, buckets, sorted_indexes); filled_count = 0; - if (!restart) + if (!restart) { searching_iterations++; iteration_to_generate_h2 = 0; //DEBUGP("searching_iterations: %u\n", searching_iterations); @@ -192,7 +192,7 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph else { iteration_to_generate_h2++; //DEBUGP("iteration_to_generate_h2: %u\n", iteration_to_generate_h2); - } + } for(i = 0; (i < nbuckets) && !restart; i++) { cmph_uint32 bucketsize = fch_buckets_get_size(buckets, sorted_indexes[i]); if (bucketsize == 0) @@ -204,8 +204,8 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph for(z = 0; (z < (fch->m - filled_count)) && restart; z++) { char * key = fch_buckets_get_key(buckets, sorted_indexes[i], INDEX); cmph_uint32 keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], INDEX); - cmph_uint32 h2 = 
hash(fch->h2, key, keylen) % fch->m; - counter = 0; + cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m; + counter = 0; restart = 0; // false fch->g[sorted_indexes[i]] = (fch->m + random_table[filled_count + z] - h2) % fch->m; //DEBUGP("g[%u]: %u\n", sorted_indexes[i], fch->g[sorted_indexes[i]]); @@ -217,7 +217,7 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph h2 = hash(fch->h2, key, keylen) % fch->m; index = (h2 + fch->g[sorted_indexes[i]]) % fch->m; //DEBUGP("key:%s keylen:%u index: %u h2:%u bucketsize:%u\n", key, keylen, index, h2, bucketsize); - if (map_table[index] >= filled_count) { + if (map_table[index] >= filled_count) { cmph_uint32 y = map_table[index]; cmph_uint32 ry = random_table[y]; random_table[y] = random_table[filled_count]; @@ -225,19 +225,19 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph map_table[random_table[y]] = y; map_table[random_table[filled_count]] = filled_count; filled_count++; - counter ++; + counter ++; } - else { + else { restart = 1; // true filled_count = filled_count - counter; - counter = 0; + counter = 0; break; } j = (j + 1) % bucketsize; - } while(j % bucketsize != INDEX); + } while(j % bucketsize != INDEX); } //getchar(); - } + } } while(restart && (searching_iterations < 10) && (iteration_to_generate_h2 < 1000)); free(map_table); free(random_table); @@ -264,7 +264,7 @@ cmph_t *fch_new(cmph_config_t *mph, double c) fch->h2 = NULL; fch->g = NULL; do - { + { if (mph->verbosity) { fprintf(stderr, "Entering mapping step for mph creation of %u keys\n", fch->m); @@ -283,7 +283,7 @@ cmph_t *fch_new(cmph_config_t *mph, double c) } restart_mapping = searching(fch, buckets, sorted_indexes); iterations--; - + } while(restart_mapping && iterations > 0); if (buckets) fch_buckets_destroy(buckets); if (sorted_indexes) free (sorted_indexes); @@ -317,7 +317,7 @@ int fch_dump(cmph_t *mphf, FILE *fd) char *buf = NULL; cmph_uint32 buflen; register size_t nbytes; - + 
fch_data_t *data = (fch_data_t *)mphf->data; __cmph_dump(mphf, fd); @@ -365,7 +365,7 @@ void fch_load(FILE *f, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, f); fch->h1 = hash_state_load(buf, buflen); free(buf); - + //DEBUGP("Loading fch mphf\n"); mphf->data = fch; //DEBUGP("Reading h2\n"); @@ -376,8 +376,8 @@ void fch_load(FILE *f, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, f); fch->h2 = hash_state_load(buf, buflen); free(buf); - - + + //DEBUGP("Reading m and n\n"); nbytes = fread(&(fch->m), sizeof(cmph_uint32), (size_t)1, f); nbytes = fread(&(fch->c), sizeof(double), (size_t)1, f); @@ -418,7 +418,7 @@ void fch_destroy(cmph_t *mphf) /** \fn void fch_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() */ void fch_pack(cmph_t *mphf, void *packed_mphf) { @@ -450,37 +450,37 @@ void fch_pack(cmph_t *mphf, void *packed_mphf) // packing b *((cmph_uint32 *) ptr) = data->b; ptr += sizeof(data->b); - + // packing p1 - *((cmph_uint64 *)ptr) = (cmph_uint64)data->p1; + *((cmph_uint64 *)ptr) = (cmph_uint64)data->p1; ptr += sizeof(data->p1); // packing p2 - *((cmph_uint64 *)ptr) = (cmph_uint64)data->p2; + *((cmph_uint64 *)ptr) = (cmph_uint64)data->p2; ptr += sizeof(data->p2); // packing g - memcpy(ptr, data->g, sizeof(cmph_uint32)*(data->b)); + memcpy(ptr, data->g, sizeof(cmph_uint32)*(data->b)); } /** \fn cmph_uint32 fch_packed_size(cmph_t *mphf); * \brief Return the amount of space needed to pack mphf. 
* \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 fch_packed_size(cmph_t *mphf) { fch_data_t *data = (fch_data_t *)mphf->data; - CMPH_HASH h1_type = hash_get_type(data->h1); - CMPH_HASH h2_type = hash_get_type(data->h2); + CMPH_HASH h1_type = hash_get_type(data->h1); + CMPH_HASH h2_type = hash_get_type(data->h2); - return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + 4*sizeof(cmph_uint32) + 2*sizeof(double) + sizeof(cmph_uint32)*(data->b)); } /** cmph_uint32 fch_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. * \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -495,12 +495,12 @@ cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); h2_ptr += 4; - - register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); - - register cmph_uint32 m = *g_ptr++; - register cmph_uint32 b = *g_ptr++; + register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); + + register cmph_uint32 m = *g_ptr++; + + register cmph_uint32 b = *g_ptr++; register double p1 = (double)(*((cmph_uint64 *)g_ptr)); g_ptr += 2; @@ -508,10 +508,9 @@ cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register double p2 = (double)(*((cmph_uint64 *)g_ptr)); g_ptr += 2; - register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m; + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m; register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, 
key, keylen) % m; h1 = mixh10h11h12 (b, p1, p2, h1); return (h2 + g_ptr[h1]) % m; } - diff --git a/src/fch_buckets.c b/src/fch_buckets.c index a588f14..0c11051 100644 --- a/src/fch_buckets.c +++ b/src/fch_buckets.c @@ -20,7 +20,7 @@ typedef struct __fch_bucket_t -static void fch_bucket_new(fch_bucket_t *bucket) +static void fch_bucket_new(fch_bucket_t *bucket) { assert(bucket); bucket->size = 0; @@ -109,16 +109,16 @@ struct __fch_buckets_t { fch_bucket_t * values; cmph_uint32 nbuckets, max_size; - + }; fch_buckets_t * fch_buckets_new(cmph_uint32 nbuckets) { cmph_uint32 i; fch_buckets_t *buckets = (fch_buckets_t *)malloc(sizeof(fch_buckets_t)); - assert(buckets); + if (!buckets) return NULL; buckets->values = (fch_bucket_t *)calloc((size_t)nbuckets, sizeof(fch_bucket_t)); - for (i = 0; i < nbuckets; i++) fch_bucket_new(buckets->values + i); + for (i = 0; i < nbuckets; i++) fch_bucket_new(buckets->values + i); assert(buckets->values); buckets->nbuckets = nbuckets; buckets->max_size = 0; @@ -135,7 +135,7 @@ void fch_buckets_insert(fch_buckets_t * buckets, cmph_uint32 index, char * key, { assert(index < buckets->nbuckets); fch_bucket_insert(buckets->values + index, key, length); - if (fch_bucket_size(buckets->values + index) > buckets->max_size) + if (fch_bucket_size(buckets->values + index) > buckets->max_size) { buckets->max_size = fch_bucket_size(buckets->values + index); } @@ -170,16 +170,16 @@ cmph_uint32 fch_buckets_get_nbuckets(fch_buckets_t * buckets) return buckets->nbuckets; } -cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets) +cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets) { cmph_uint32 i = 0; cmph_uint32 sum = 0, value; cmph_uint32 *nbuckets_size = (cmph_uint32 *) calloc((size_t)buckets->max_size + 1, sizeof(cmph_uint32)); cmph_uint32 * sorted_indexes = (cmph_uint32 *) calloc((size_t)buckets->nbuckets, sizeof(cmph_uint32)); - + // collect how many buckets for each size. 
for(i = 0; i < buckets->nbuckets; i++) nbuckets_size[fch_bucket_size(buckets->values + i)] ++; - + // calculating offset considering a decreasing order of buckets size. value = nbuckets_size[buckets->max_size]; nbuckets_size[buckets->max_size] = sum; @@ -188,13 +188,13 @@ cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets) sum += value; value = nbuckets_size[i]; nbuckets_size[i] = sum; - + } - for(i = 0; i < buckets->nbuckets; i++) + for(i = 0; i < buckets->nbuckets; i++) { sorted_indexes[nbuckets_size[fch_bucket_size(buckets->values + i)]] = (cmph_uint32)i; nbuckets_size[fch_bucket_size(buckets->values + i)] ++; - } + } free(nbuckets_size); return sorted_indexes; } @@ -208,7 +208,7 @@ void fch_buckets_print(fch_buckets_t * buckets) void fch_buckets_destroy(fch_buckets_t * buckets) { cmph_uint32 i; - for (i = 0; i < buckets->nbuckets; i++) fch_bucket_destroy(buckets->values + i); + for (i = 0; i < buckets->nbuckets; i++) fch_bucket_destroy(buckets->values + i); free(buckets->values); free(buckets); } diff --git a/src/fnv_hash.c b/src/fnv_hash.c index aeaca8f..0ef1f48 100644 --- a/src/fnv_hash.c +++ b/src/fnv_hash.c @@ -4,6 +4,7 @@ fnv_state_t *fnv_state_new() { fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t)); + if (!state) return NULL; state->hashfunc = CMPH_HASH_FNV; return state; } @@ -15,13 +16,13 @@ void fnv_state_destroy(fnv_state_t *state) cmph_uint32 fnv_hash(fnv_state_t *state, const char *k, cmph_uint32 keylen) { - const unsigned char *bp = (const unsigned char *)k; - const unsigned char *be = bp + keylen; - static unsigned int hval = 0; + const unsigned char *bp = (const unsigned char *)k; + const unsigned char *be = bp + keylen; + static unsigned int hval = 0; - while (bp < be) + while (bp < be) { - + //hval *= 0x01000193; good for non-gcc compiler hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); //good for gcc @@ -41,6 +42,7 @@ void fnv_state_dump(fnv_state_t *state, char **buf, 
cmph_uint32 *buflen) fnv_state_t * fnv_state_copy(fnv_state_t *src_state) { fnv_state_t *dest_state = (fnv_state_t *)malloc(sizeof(fnv_state_t)); + if (!dest_state) return NULL; dest_state->hashfunc = src_state->hashfunc; return dest_state; } diff --git a/src/graph.c b/src/graph.c index c29fd8b..97737ad 100644 --- a/src/graph.c +++ b/src/graph.c @@ -8,7 +8,7 @@ #include "vstack.h" #include "bitbool.h" -//#define DEBUG +// #define DEBUG #include "debug.h" /* static const cmph_uint8 bitmask[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; */ @@ -77,7 +77,7 @@ void graph_print(graph_t *g) printf("%u -> %u\n", g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); } } - + } return; } @@ -130,7 +130,7 @@ static void del_edge_point(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) DEBUGP("Deleting edge point %u %u\n", v1, v2); e = g->first[v1]; - if (check_edge(g, e, v1, v2)) + if (check_edge(g, e, v1, v2)) { g->first[v1] = g->next[e]; //g->edges[e] = EMPTY; @@ -151,7 +151,7 @@ static void del_edge_point(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) DEBUGP("Deleted\n"); } - + void graph_del_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) { g->shrinking = 1; @@ -163,7 +163,7 @@ void graph_clear_edges(graph_t *g) { cmph_uint32 i; for (i = 0; i < g->nnodes; ++i) g->first[i] = EMPTY; - for (i = 0; i < g->nedges*2; ++i) + for (i = 0; i < g->nedges*2; ++i) { g->edges[i] = EMPTY; g->next[i] = EMPTY; @@ -176,9 +176,9 @@ static cmph_uint8 find_degree1_edge(graph_t *g, cmph_uint32 v, cmph_uint8 *delet { cmph_uint32 edge = g->first[v]; cmph_uint8 found = 0; - DEBUGP("Checking degree of vertex %u\n", v); + DEBUGP("Checking degree of vertex %u connected to edge %u\n", v, edge); if (edge == EMPTY) return 0; - else if (!(GETBIT(deleted, abs_edge(edge, 0)))) + else if (!(GETBIT(deleted, abs_edge(edge, 0)))) { found = 1; *e = edge; @@ -206,17 +206,17 @@ static void cyclic_del_edge(graph_t *g, cmph_uint32 v, cmph_uint8 *deleted) degree1 = find_degree1_edge(g, v1, deleted, &e); 
if (!degree1) return; - while(1) + while(1) { DEBUGP("Deleting edge %u (%u->%u)\n", e, g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); SETBIT(deleted, abs_edge(e, 0)); - + v2 = g->edges[abs_edge(e, 0)]; if (v2 == v1) v2 = g->edges[abs_edge(e, 1)]; - DEBUGP("Checking if second endpoint %u has degree 1\n", v2); + DEBUGP("Checking if second endpoint %u has degree 1\n", v2); degree1 = find_degree1_edge(g, v2, deleted, &e); - if (degree1) + if (degree1) { DEBUGP("Inspecting vertex %u\n", v2); v1 = v2; @@ -240,7 +240,7 @@ int graph_is_cyclic(graph_t *g) } for (i = 0; i < g->nedges; ++i) { - if (!(GETBIT(deleted, i))) + if (!(GETBIT(deleted, i))) { DEBUGP("Edge %u %u->%u was not deleted\n", i, g->edges[i], g->edges[i + g->nedges]); free(deleted); @@ -275,15 +275,15 @@ void graph_obtain_critical_nodes(graph_t *g) /* included -- Fabiano*/ for (i = 0; i < g->nedges; ++i) { - if (!(GETBIT(deleted,i))) + if (!(GETBIT(deleted,i))) { DEBUGP("Edge %u %u->%u belongs to the 2-core\n", i, g->edges[i], g->edges[i + g->nedges]); - if(!(GETBIT(g->critical_nodes,g->edges[i]))) + if(!(GETBIT(g->critical_nodes,g->edges[i]))) { g->ncritical_nodes ++; SETBIT(g->critical_nodes,g->edges[i]); } - if(!(GETBIT(g->critical_nodes,g->edges[i + g->nedges]))) + if(!(GETBIT(g->critical_nodes,g->edges[i + g->nedges]))) { g->ncritical_nodes ++; SETBIT(g->critical_nodes,g->edges[i + g->nedges]); @@ -328,11 +328,9 @@ graph_iterator_t graph_neighbors_it(graph_t *g, cmph_uint32 v) cmph_uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it) { cmph_uint32 ret; - if(it->edge == EMPTY) return GRAPH_NO_NEIGHBOR; + if(it->edge == EMPTY) return GRAPH_NO_NEIGHBOR; if (g->edges[it->edge] == it->vertex) ret = g->edges[it->edge + g->nedges]; else ret = g->edges[it->edge]; it->edge = g->next[it->edge]; return ret; } - - diff --git a/src/hash.c b/src/hash.c index 7ab0b04..aa8c95f 100644 --- a/src/hash.c +++ b/src/hash.c @@ -133,7 +133,7 @@ void hash_state_destroy(hash_state_t *state) * \brief Support the 
ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. * \param state points to the hash function * \param hash_packed pointer to the contiguous memory area used to store the hash function. The size of hash_packed must be at least hash_state_packed_size() - * + * * Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. * However, the hash function type must be packed outside. */ @@ -142,20 +142,20 @@ void hash_state_pack(hash_state_t *state, void *hash_packed) switch (state->hashfunc) { case CMPH_HASH_JENKINS: - // pack the jenkins hash function + // pack the jenkins hash function jenkins_state_pack((jenkins_state_t *)state, hash_packed); break; default: assert(0); } - return; + return; } /** \fn cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc) * \brief Return the amount of space needed to pack a hash function. * \param hashfunc function type * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc) { cmph_uint32 size = 0; @@ -197,7 +197,7 @@ cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cm * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. 
*/ void hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) -{ +{ switch (hashfunc) { case CMPH_HASH_JENKINS: diff --git a/src/hashtree.c b/src/hashtree.c index 2f3567e..1bfd852 100644 --- a/src/hashtree.c +++ b/src/hashtree.c @@ -41,7 +41,7 @@ void hashtree_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 3) break; //hashtree only uses three hash functions - hashtree->hashfuncs[i] = *hashptr; + hashtree->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -55,8 +55,8 @@ cmph_t *hashtree_new(cmph_config_t *mph, double c) cmph_uint32 iterations = 20; cmph_uint8 *visited = NULL; hashtree_config_data_t *hashtree = (hashtree_config_data_t *)mph->data; - hashtree->m = mph->key_source->nkeys; - hashtree->n = ceil(c * mph->key_source->nkeys); + hashtree->m = mph->key_source->nkeys; + hashtree->n = ceil(c * mph->key_source->nkeys); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", hashtree->m, hashtree->n, c); hashtree->graph = graph_new(hashtree->n, hashtree->m); DEBUGP("Created graph\n"); @@ -87,12 +87,12 @@ cmph_t *hashtree_new(cmph_config_t *mph, double c) fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } - else break; + } + else break; } if (iterations == 0) { - graph_destroy(hashtree->graph); + graph_destroy(hashtree->graph); return NULL; } @@ -115,7 +115,7 @@ cmph_t *hashtree_new(cmph_config_t *mph, double c) hashtree_traverse(hashtree, visited, i); } } - graph_destroy(hashtree->graph); + graph_destroy(hashtree->graph); free(visited); hashtree->graph = NULL; @@ -144,7 +144,7 @@ static void hashtree_traverse(hashtree_config_data_t *hashtree, cmph_uint8 *visi graph_iterator_t it = graph_neighbors_it(hashtree->graph, v); cmph_uint32 neighbor = 0; SETBIT(visited,v); - + DEBUGP("Visiting vertex %u\n", v); while((neighbor = graph_next_neighbor(hashtree->graph, &it)) != 
GRAPH_NO_NEIGHBOR) { @@ -157,7 +157,7 @@ static void hashtree_traverse(hashtree_config_data_t *hashtree, cmph_uint8 *visi hashtree_traverse(hashtree, visited, neighbor); } } - + static int hashtree_gen_edges(cmph_config_t *mph) { cmph_uint32 e; @@ -165,7 +165,7 @@ static int hashtree_gen_edges(cmph_config_t *mph) int cycles = 0; DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", hashtree->n, cmph_hash_names[hashtree->hashfuncs[0]], cmph_hash_names[hashtree->hashfuncs[1]]); - graph_clear_edges(hashtree->graph); + graph_clear_edges(hashtree->graph); mph->key_source->rewind(mph->key_source->data); for (e = 0; e < mph->key_source->nkeys; ++e) { @@ -176,7 +176,7 @@ static int hashtree_gen_edges(cmph_config_t *mph) h1 = hash(hashtree->hashes[0], key, keylen) % hashtree->n; h2 = hash(hashtree->hashes[1], key, keylen) % hashtree->n; if (h1 == h2) if (++h2 >= hashtree->n) h2 = 0; - if (h1 == h2) + if (h1 == h2) { if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); mph->key_source->dispose(mph->key_source->data, key, keylen); @@ -216,7 +216,7 @@ int hashtree_dump(cmph_t *mphf, FILE *fd) fwrite(&(data->n), sizeof(cmph_uint32), 1, fd); fwrite(&(data->m), sizeof(cmph_uint32), 1, fd); - + fwrite(data->g, sizeof(cmph_uint32)*data->n, 1, fd); #ifdef DEBUG fprintf(stderr, "G: "); @@ -253,8 +253,8 @@ void hashtree_load(FILE *f, cmph_t *mphf) } DEBUGP("Reading m and n\n"); - fread(&(hashtree->n), sizeof(cmph_uint32), 1, f); - fread(&(hashtree->m), sizeof(cmph_uint32), 1, f); + fread(&(hashtree->n), sizeof(cmph_uint32), 1, f); + fread(&(hashtree->m), sizeof(cmph_uint32), 1, f); hashtree->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*hashtree->n); fread(hashtree->g, hashtree->n*sizeof(cmph_uint32), 1, f); @@ -265,7 +265,7 @@ void hashtree_load(FILE *f, cmph_t *mphf) #endif return; } - + cmph_uint32 hashtree_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { @@ -280,7 +280,7 @@ cmph_uint32 hashtree_search(cmph_t *mphf, const char *key, 
cmph_uint32 keylen) void hashtree_destroy(cmph_t *mphf) { hashtree_data_t *data = (hashtree_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hashes[0]); hash_state_destroy(data->hashes[1]); free(data->hashes); diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c index 65cdff9..d540216 100644 --- a/src/jenkins_hash.c +++ b/src/jenkins_hash.c @@ -28,16 +28,16 @@ have at least 1/4 probability of changing. * If mix() is run forward, every bit of c will change between 1/3 and 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.) - mix() was built out of 36 single-cycle latency instructions in a + mix() was built out of 36 single-cycle latency instructions in a structure that could supported 2x parallelism, like so: - a -= b; + a -= b; a -= c; x = (c>>13); b -= c; a ^= x; b -= a; x = (a<<8); c -= a; b ^= x; c -= b; x = (b>>13); ... - Unfortunately, superscalar Pentiums and Sparcs can't take advantage + Unfortunately, superscalar Pentiums and Sparcs can't take advantage of that parallelism. They've also turned some of those single-cycle latency instructions into multi-cycle latency instructions. Still, this is the fastest good hash I could find. There were about 2^^68 @@ -87,6 +87,7 @@ acceptable. Do NOT use for cryptographic purposes. 
jenkins_state_t *jenkins_state_new(cmph_uint32 size) //size of hash table { jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); + if (!state) return NULL; DEBUGP("Initializing jenkins hash\n"); state->seed = ((cmph_uint32)rand() % size); return state; @@ -121,28 +122,28 @@ static inline void __jenkins_hash_vector(cmph_uint32 seed, const char *k, cmph_u hashes[2] += length; switch(len) /* all the case statements fall through */ { - case 11: + case 11: hashes[2] +=((cmph_uint32)k[10]<<24); - case 10: + case 10: hashes[2] +=((cmph_uint32)k[9]<<16); - case 9 : + case 9 : hashes[2] +=((cmph_uint32)k[8]<<8); /* the first byte of hashes[2] is reserved for the length */ - case 8 : + case 8 : hashes[1] +=((cmph_uint32)k[7]<<24); - case 7 : + case 7 : hashes[1] +=((cmph_uint32)k[6]<<16); - case 6 : + case 6 : hashes[1] +=((cmph_uint32)k[5]<<8); case 5 : hashes[1] +=(cmph_uint8) k[4]; - case 4 : + case 4 : hashes[0] +=((cmph_uint32)k[3]<<24); - case 3 : + case 3 : hashes[0] +=((cmph_uint32)k[2]<<16); - case 2 : + case 2 : hashes[0] +=((cmph_uint32)k[1]<<8); - case 1 : + case 1 : hashes[0] +=(cmph_uint8)k[0]; /* case 0: nothing left to add */ } @@ -158,13 +159,13 @@ cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keyl /* cmph_uint32 a, b, c; cmph_uint32 len, length; - // Set up the internal state + // Set up the internal state length = keylen; len = length; - a = b = 0x9e3779b9; // the golden ratio; an arbitrary value - c = state->seed; // the previous hash value - seed in our case + a = b = 0x9e3779b9; // the golden ratio; an arbitrary value + c = state->seed; // the previous hash value - seed in our case - // handle most of the key + // handle most of the key while (len >= 12) { a += (k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); @@ -176,37 +177,37 @@ cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keyl // handle the last 11 bytes c += length; - switch(len) /// 
all the case statements fall through + switch(len) /// all the case statements fall through { - case 11: + case 11: c +=((cmph_uint32)k[10]<<24); - case 10: + case 10: c +=((cmph_uint32)k[9]<<16); - case 9 : + case 9 : c +=((cmph_uint32)k[8]<<8); - // the first byte of c is reserved for the length - case 8 : + // the first byte of c is reserved for the length + case 8 : b +=((cmph_uint32)k[7]<<24); - case 7 : + case 7 : b +=((cmph_uint32)k[6]<<16); - case 6 : + case 6 : b +=((cmph_uint32)k[5]<<8); - case 5 : + case 5 : b +=k[4]; - case 4 : + case 4 : a +=((cmph_uint32)k[3]<<24); - case 3 : + case 3 : a +=((cmph_uint32)k[2]<<16); - case 2 : + case 2 : a +=((cmph_uint32)k[1]<<8); - case 1 : + case 1 : a +=k[0]; - // case 0: nothing left to add + // case 0: nothing left to add } mix(a,b,c); - /// report the result + /// report the result return c; */ @@ -221,7 +222,7 @@ void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen) { *buflen = sizeof(cmph_uint32); *buf = (char *)malloc(sizeof(cmph_uint32)); - if (!*buf) + if (!*buf) { *buflen = UINT_MAX; return; @@ -252,7 +253,7 @@ jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen) /** \fn void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed); * \brief Support the ability to pack a jenkins function into a preallocated contiguous memory space pointed by jenkins_packed. * \param state points to the jenkins function - * \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. The size of jenkins_packed must be at least jenkins_state_packed_size() + * \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. 
The size of jenkins_packed must be at least jenkins_state_packed_size() */ void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed) { @@ -265,7 +266,7 @@ void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed) /** \fn cmph_uint32 jenkins_state_packed_size(jenkins_state_t *state); * \brief Return the amount of space needed to pack a jenkins function. * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 jenkins_state_packed_size(void) { return sizeof(cmph_uint32); diff --git a/src/linear_string_map.c b/src/linear_string_map.c new file mode 100644 index 0000000..85f8d21 --- /dev/null +++ b/src/linear_string_map.c @@ -0,0 +1,68 @@ +#include +#include +#include + +#include "linear_string_map.h" + +struct __linear_string_map_t { + const char *key; + void *value; + struct __linear_string_map_t* next; +}; + +lsmap_t *lsmap_new() { + lsmap_t* lsmap = (lsmap_t*)malloc(sizeof(lsmap_t)); + if (!lsmap) return NULL; + lsmap->key = "dummy node"; + lsmap->next = NULL; + return lsmap; +} + +int lsmap_size(lsmap_t *lsmap) { + int size = 0; + while (lsmap->next != NULL) ++size; + return size; +} + +void lsmap_append(lsmap_t *lsmap, const char *key, void *value) { + while (lsmap->next != NULL) lsmap = lsmap->next; + lsmap->next = (lsmap_t*)malloc(sizeof(lsmap_t)); + lsmap->key = key; + lsmap->value = value; + lsmap = lsmap->next; + lsmap->key = "dummy node"; + lsmap->next = NULL; +} + +void* lsmap_search(lsmap_t *lsmap, const char *key) { + while (lsmap->next != NULL) { + if (strcmp(lsmap->key, key) == 0) { + return lsmap->value; + } + lsmap = lsmap->next; + } + return NULL; +} + +void lsmap_foreach_key(lsmap_t *lsmap, void (*f)(const char*)) { + while (lsmap->next != NULL) { + f(lsmap->key); + lsmap = lsmap->next; + } +} + +void lsmap_foreach_value(lsmap_t *lsmap, void (*f)(void*)) { + while (lsmap->next != NULL) { + f(lsmap->value); + lsmap = lsmap->next; + } +} + +void lsmap_destroy(lsmap_t *lsmap) { + while (lsmap->next != 
NULL) { + lsmap_t* freeme = lsmap; + lsmap = lsmap->next; + free(freeme); + } + free(lsmap); +} diff --git a/src/linear_string_map.h b/src/linear_string_map.h new file mode 100644 index 0000000..2e2287e --- /dev/null +++ b/src/linear_string_map.h @@ -0,0 +1,13 @@ +// A simple linked list based dynamic sized associative map from const char* to +// void*. Designed to maximize ease of use instead of performance. Should be +// used in benchmarks and tests only, not to be distributed with the cmph +// runtime headers. + +typedef struct __linear_string_map_t lsmap_t; + +lsmap_t *lsmap_new(); +void lsmap_append(lsmap_t *lsmap, const char *key, void *value); +void* lsmap_search(lsmap_t *lsmap, const char *key); +void lsmap_foreach_key(lsmap_t* lsmap, void (*f)(const char*)); +void lsmap_foreach_value(lsmap_t* lsmap, void (*f)(void*)); +void lsmap_destroy(lsmap_t* lsmap); diff --git a/src/main.c b/src/main.c index f739b32..95a75c5 100644 --- a/src/main.c +++ b/src/main.c @@ -22,13 +22,13 @@ void usage(const char *prg) { - fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); } void usage_long(const char *prg) { cmph_uint32 i; - fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); - fprintf(stderr, "Minimum perfect hashing tool\n\n"); + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] 
[-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "Minimum perfect hashing tool\n\n"); fprintf(stderr, " -h\t print this help message\n"); fprintf(stderr, " -c\t c value determines:\n"); fprintf(stderr, " \t * the number of vertices in the graph for the algorithms BMZ and CHM\n"); @@ -57,7 +57,7 @@ void usage_long(const char *prg) fprintf(stderr, " \t and its value should be an integer in the range [1,32]. Default is 4. The\n"); fprintf(stderr, " \t larger is this value, the slower is the construction of the functions.\n"); fprintf(stderr, " \t This parameter has no effect for other algorithms.\n\n"); - fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n"); + fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n"); fprintf(stderr, " \t hash function allows at most t collisions in a given bin. This parameter applies\n"); fprintf(stderr, " \t only to the CHD and CHD_PH algorithms. Its value should be an integer in the\n"); fprintf(stderr, " \t range [1,128]. Defaul is 1\n"); @@ -182,7 +182,7 @@ int main(int argc, char **argv) break; } } - if (!valid) + if (!valid) { fprintf(stderr, "Invalid mph algorithm: %s. 
It is not available in version %s\n", optarg, VERSION); return -1; @@ -204,7 +204,7 @@ int main(int argc, char **argv) break; } } - if (!valid) + if (!valid) { fprintf(stderr, "Invalid hash function: %s\n", optarg); return -1; @@ -223,7 +223,7 @@ int main(int argc, char **argv) return 1; } keys_file = argv[optind]; - + if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); srand(seed); int ret = 0; @@ -232,7 +232,7 @@ int main(int argc, char **argv) mphf_file = (char *)malloc(strlen(keys_file) + 5); memcpy(mphf_file, keys_file, strlen(keys_file)); memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5); - } + } keys_fd = fopen(keys_file, "r"); @@ -258,7 +258,7 @@ int main(int argc, char **argv) cmph_config_set_memory_availability(config, memory_availability); cmph_config_set_b(config, b); cmph_config_set_keys_per_bin(config, keys_per_bin); - + //if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15; if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15; if (c != 0) cmph_config_set_graphsize(config, c); @@ -279,8 +279,8 @@ int main(int argc, char **argv) free(mphf_file); return -1; } - cmph_dump(mphf, mphf_fd); - cmph_destroy(mphf); + cmph_dump(mphf, mphf_fd); + cmph_destroy(mphf); fclose(mphf_fd); } else @@ -329,7 +329,7 @@ int main(int argc, char **argv) } source->dispose(source->data, buf, buflen); } - + cmph_destroy(mphf); free(hashtable); } @@ -338,5 +338,5 @@ int main(int argc, char **argv) free(tmp_dir); cmph_io_nlfile_adapter_destroy(source); return ret; - + } diff --git a/src/sdbm_hash.c b/src/sdbm_hash.c index 2f706c9..3a052fd 100644 --- a/src/sdbm_hash.c +++ b/src/sdbm_hash.c @@ -4,6 +4,7 @@ sdbm_state_t *sdbm_state_new() { sdbm_state_t *state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t)); + if (!state) return NULL; state->hashfunc = CMPH_HASH_SDBM; return state; } diff --git a/src/vqueue.c b/src/vqueue.c index 0619dd7..5c90ee0 100644 --- a/src/vqueue.c +++ b/src/vqueue.c @@ -12,7 +12,7 @@ vqueue_t * vqueue_new(cmph_uint32 capacity) { size_t 
capacity_plus_one = capacity + 1; vqueue_t *q = (vqueue_t *)malloc(sizeof(vqueue_t)); - assert(q); + if (!q) return NULL; q->values = (cmph_uint32 *)calloc(capacity_plus_one, sizeof(cmph_uint32)); q->beg = q->end = 0; q->capacity = (cmph_uint32) capacity_plus_one; @@ -43,7 +43,7 @@ void vqueue_print(vqueue_t * q) cmph_uint32 i; for (i = q->beg; i != q->end; i = (i + 1)%q->capacity) fprintf(stderr, "%u\n", q->values[(i + 1)%q->capacity]); -} +} void vqueue_destroy(vqueue_t *q) { diff --git a/src/vstack.c b/src/vstack.c index 96f5380..8791550 100644 --- a/src/vstack.c +++ b/src/vstack.c @@ -76,4 +76,3 @@ void vstack_reserve(vstack_t *stack, cmph_uint32 size) DEBUGP("Increased\n"); } } - diff --git a/tests/Makefile.am b/tests/Makefile.am index a0fe694..361c67b 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,4 +1,6 @@ -noinst_PROGRAMS = graph_tests packed_mphf_tests mphf_tests select_tests compressed_seq_tests compressed_rank_tests +TESTS = $(check_PROGRAMS) +check_PROGRAMS = graph_tests select_tests compressed_seq_tests compressed_rank_tests cmph_benchmark_test +noinst_PROGRAMS = packed_mphf_tests mphf_tests INCLUDES = -I../src/ @@ -19,3 +21,6 @@ compressed_seq_tests_LDADD = ../src/libcmph.la compressed_rank_tests_SOURCES = compressed_rank_tests.c compressed_rank_tests_LDADD = ../src/libcmph.la + +cmph_benchmark_test_SOURCES = cmph_benchmark_test.c +cmph_benchmark_test_LDADD = ../src/libcmph.la diff --git a/tests/cmph_benchmark_test.c b/tests/cmph_benchmark_test.c new file mode 100644 index 0000000..4a75acf --- /dev/null +++ b/tests/cmph_benchmark_test.c @@ -0,0 +1,23 @@ +#include // for sleep +#include + +#include "cmph_benchmark.h" + +void bm_sleep(int iters) { + sleep(1); +} + +void bm_increment(int iters) { + int i, v = 0; + for (i = 0; i < INT_MAX; ++i) { + v += i; + } +} + +int main(int argc, char** argv) { + BM_REGISTER(bm_sleep, 1); + BM_REGISTER(bm_increment, 1); + run_benchmarks(argc, argv); + return 0; +} +