diff --git a/Makefile.am b/Makefile.am index aaae22b..cdc3a23 100644 --- a/Makefile.am +++ b/Makefile.am @@ -4,5 +4,6 @@ pkgconfig_DATA = cmph.pc if USE_CXXMPH pkgconfig_DATA += cxxmph.pc endif +ACLOCAL_AMFLAGS="-I m4" pkgconfigdir = $(libdir)/pkgconfig diff --git a/configure.ac b/configure.ac index fa6a58f..b0c7e08 100644 --- a/configure.ac +++ b/configure.ac @@ -31,7 +31,7 @@ AC_CHECK_HEADERS([getopt.h math.h]) dnl Checks for libraries. LT_LIB_M LDFLAGS="$LIBM $LDFLAGS" -CFLAGS="-Wall" +#CFLAGS="-Wall -g" AC_PROG_CXX CXXFLAGS="-Wall -Wno-unused-function -DNDEBUG -O3 -fomit-frame-pointer $CXXFLAGS" @@ -40,7 +40,7 @@ if test x$cxxmph = xtrue; then AC_COMPILE_STDCXX_0X if test x$ac_cv_cxx_compile_cxx0x_native = "xno"; then if test x$ac_cv_cxx_compile_cxx0x_cxx = "xyes"; then - CXXFLAGS="$CXXFLAGS -std=c++0x" + CXXFLAGS="$CXXFLAGS -std=c++11" elif test x$ac_cv_cxx_compile_cxx0x_gxx = "xyes"; then CXXFLAGS="$CXXFLAGS -std=gnu++0x" else diff --git a/cxxmph/.ycm_extra_conf.py b/cxxmph/.ycm_extra_conf.py new file mode 100644 index 0000000..91da0a6 --- /dev/null +++ b/cxxmph/.ycm_extra_conf.py @@ -0,0 +1,58 @@ +import os +import ycm_core + +flags = [ +'-Wall', +'-Wextra', +'-Werror', +'-DNDEBUG', +'-DUSE_CLANG_COMPLETER', +'-std=c++11', +'-x', +'c++', +'-isystem' + '/usr/lib/c++/v1', +'-I', +'.', +] + +def DirectoryOfThisScript(): + return os.path.dirname( os.path.abspath( __file__ ) ) + + +def MakeRelativePathsInFlagsAbsolute( flags, working_directory ): + if not working_directory: + return list( flags ) + new_flags = [] + make_next_absolute = False + path_flags = [ '-isystem', '-I', '-iquote', '--sysroot=' ] + for flag in flags: + new_flag = flag + + if make_next_absolute: + make_next_absolute = False + if not flag.startswith( '/' ): + new_flag = os.path.join( working_directory, flag ) + + for path_flag in path_flags: + if flag == path_flag: + make_next_absolute = True + break + + if flag.startswith( path_flag ): + path = flag[ len( path_flag ): ] + new_flag = path_flag 
+ os.path.join( working_directory, path ) + break + + if new_flag: + new_flags.append( new_flag ) + return new_flags + + +def FlagsForFile( filename ): + relative_to = DirectoryOfThisScript() + final_flags = MakeRelativePathsInFlagsAbsolute( flags, relative_to ) + return { + 'flags': final_flags, + 'do_cache': True + } diff --git a/cxxmph/MurmurHash3.cpp b/cxxmph/MurmurHash3.cpp index 09ffb26..dff0eaa 100644 --- a/cxxmph/MurmurHash3.cpp +++ b/cxxmph/MurmurHash3.cpp @@ -52,12 +52,12 @@ inline uint64_t rotl64 ( uint64_t x, int8_t r ) // Block read - if your platform needs to do endian-swapping or can only // handle aligned reads, do the conversion here -FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +/*FORCE_INLINE*/ uint32_t getblock ( const uint32_t * p, int i ) { return p[i]; } -FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) +/*FORCE_INLINE*/ uint64_t getblock ( const uint64_t * p, int i ) { return p[i]; } @@ -65,7 +65,7 @@ FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) //----------------------------------------------------------------------------- // Finalization mix - force all bits of a hash block to avalanche -FORCE_INLINE uint32_t fmix ( uint32_t h ) +/*FORCE_INLINE*/ uint32_t fmix ( uint32_t h ) { h ^= h >> 16; h *= 0x85ebca6b; @@ -78,7 +78,7 @@ FORCE_INLINE uint32_t fmix ( uint32_t h ) //---------- -FORCE_INLINE uint64_t fmix ( uint64_t k ) +/*FORCE_INLINE*/ uint64_t fmix ( uint64_t k ) { k ^= k >> 33; k *= BIG_CONSTANT(0xff51afd7ed558ccd); diff --git a/cxxmph/hollow_iterator_test.cc b/cxxmph/hollow_iterator_test.cc index de235c0..b3647dd 100644 --- a/cxxmph/hollow_iterator_test.cc +++ b/cxxmph/hollow_iterator_test.cc @@ -12,7 +12,7 @@ using cxxmph::hollow_iterator_base; using cxxmph::make_hollow; using cxxmph::is_empty; -int main(int argc, char** argv) { +int main(int, char**) { vector v; vector p; for (int i = 0; i < 100; ++i) { diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h index 3a3bb47..eab677e 100644 --- 
a/cxxmph/mph_bits.h +++ b/cxxmph/mph_bits.h @@ -44,7 +44,7 @@ class dynamic_2bitset { other.data_.swap(data_); } void clear() { data_.clear(); size_ = 0; } - + uint32_t size() const { return size_; } static const uint8_t vmask[]; const std::vector& data() const { return data_; } diff --git a/cxxmph/mph_index.cc b/cxxmph/mph_index.cc index fba852e..f5be3dd 100644 --- a/cxxmph/mph_index.cc +++ b/cxxmph/mph_index.cc @@ -39,6 +39,7 @@ namespace cxxmph { MPHIndex::~MPHIndex() { clear(); + } void MPHIndex::clear() { diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index db81233..5948348 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -238,7 +238,7 @@ MPH_MAP_INLINE_METHOD_DECL(my_int32_t, index)(const key_type& k) const { MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { return insert(make_pair(k, data_type())).first->second; } -MPH_MAP_METHOD_DECL(void_type, rehash)(size_type nbuckets) { +MPH_MAP_METHOD_DECL(void_type, rehash)(size_type /*nbuckets*/) { pack(); vector(values_.begin(), values_.end()).swap(values_); vector(present_.begin(), present_.end()).swap(present_); diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index 1204b6f..162521d 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -15,7 +15,7 @@ namespace cxxmph { struct h128 { const uint32_t& operator[](uint8_t i) const { return uint32[i]; } uint32_t& operator[](uint8_t i) { return uint32[i]; } - const uint64_t get64(bool second) const { return (static_cast(uint32[second << 1]) << 32) | uint32[1 + (second << 1)]; } + uint64_t get64(bool second) const { return (static_cast(uint32[second << 1]) << 32) | uint32[1 + (second << 1)]; } void set64(uint64_t v, bool second) { uint32[second << 1] = v >> 32; uint32[1+(second<<1)] = ((v << 32) >> 32); } bool operator==(const h128 rhs) const { return memcmp(uint32, rhs.uint32, sizeof(uint32)) == 0; } diff --git a/examples/Makefile.am b/examples/Makefile.am index f20e8c7..df5763f 100755 --- a/examples/Makefile.am +++ 
b/examples/Makefile.am @@ -1,6 +1,6 @@ -noinst_PROGRAMS = vector_adapter_ex1 file_adapter_ex2 struct_vector_adapter_ex3 +noinst_PROGRAMS = vector_adapter_ex1 file_adapter_ex2 struct_vector_adapter_ex3 small_set_ex4 -INCLUDES = -I../src/ +AM_CPPFLAGS = -I../src/ vector_adapter_ex1_LDADD = ../src/libcmph.la vector_adapter_ex1_SOURCES = vector_adapter_ex1.c @@ -10,3 +10,6 @@ file_adapter_ex2_SOURCES = file_adapter_ex2.c struct_vector_adapter_ex3_LDADD = ../src/libcmph.la struct_vector_adapter_ex3_SOURCES = struct_vector_adapter_ex3.c + +small_set_ex4_LDADD = ../src/libcmph.la +small_set_ex4_SOURCES = small_set_ex4.c diff --git a/examples/small_set_ex4.c b/examples/small_set_ex4.c new file mode 100644 index 0000000..dc77a05 --- /dev/null +++ b/examples/small_set_ex4.c @@ -0,0 +1,105 @@ +#include + +int test(cmph_uint32* items_to_hash, cmph_uint32 items_len, CMPH_ALGO alg_n) +{ + cmph_t *hash; + cmph_config_t *config; + cmph_io_adapter_t *source; + cmph_uint32 i; + char filename[256]; + FILE* mphf_fd = NULL; + + printf("%s (%u)\n", cmph_names[alg_n], alg_n); + + source = cmph_io_struct_vector_adapter(items_to_hash, + (cmph_uint32)sizeof(cmph_uint32), + 0, + (cmph_uint32)sizeof(cmph_uint32), + items_len); + config = cmph_config_new(source); + cmph_config_set_algo(config, alg_n); + if (alg_n == CMPH_BRZ) { + sprintf(filename, "%s_%u.mph", cmph_names[alg_n], items_len); + mphf_fd = fopen(filename, "w"); + cmph_config_set_mphf_fd(config, mphf_fd); + } + hash = cmph_new(config); + cmph_config_destroy(config); + + if (alg_n == CMPH_BRZ) { + cmph_dump(hash, mphf_fd); + cmph_destroy(hash); + fclose(mphf_fd); + mphf_fd = fopen(filename, "r"); + hash = cmph_load(mphf_fd); + } + printf("packed_size %u\n",cmph_packed_size(hash)); + + for (i=0; i %u\n", + items_to_hash[i], + cmph_search(hash, + (char*)(items_to_hash+i), + (cmph_uint32)sizeof(cmph_uint32))); + printf("\n"); + + cmph_io_struct_vector_adapter_destroy(source); /* pairs with the cmph_io_struct_vector_adapter() call that created source */ + cmph_destroy(hash); + + if (alg_n == CMPH_BRZ) { + 
fclose(mphf_fd); + } + return 0; +} + +int main (void) +{ + cmph_uint32 vec1[] = {1,2,3,4,5}; + cmph_uint32 vec1_len = 5; + + cmph_uint32 vec2[] = {7576423, 7554496}; //CMPH_FCH, CMPH_BDZ, CMPH_BDZ_PH (4,5,6) + cmph_uint32 vec2_len = 2; + cmph_uint32 vec3[] = {2184764, 1882984, 1170551}; // CMPH_CHD_PH, CMPH_CHD (7,8) + cmph_uint32 vec3_len = 3; + cmph_uint32 vec4[] = {2184764}; // CMPH_CHD_PH, CMPH_CHD (7,8) + cmph_uint32 vec4_len = 1; + cmph_uint32 i; + + // Testing with vec1 + cmph_uint32* values = (cmph_uint32*)vec1; + cmph_uint32 length = vec1_len; + printf("TESTING VECTOR WITH %u INTEGERS\n", length); + for (i = 0; i < CMPH_COUNT; i++) + { + test(values, length, i); + } + + // Testing with vec2 + values = (cmph_uint32*)vec2; + length = vec2_len; + printf("TESTING VECTOR WITH %u INTEGERS\n", length); + for (i = 0; i < CMPH_COUNT; i++) + { + test(values, length, i); + } + + // Testing with vec3 + values = (cmph_uint32*)vec3; + length = vec3_len; + printf("TESTING VECTOR WITH %u INTEGERS\n", length); + for (i = 0; i < CMPH_COUNT; i++) + { + test(values, length, i); + } + + // Testing with vec4 + values = (cmph_uint32*)vec4; + length = vec4_len; + printf("TESTING VECTOR WITH %u INTEGERS\n", length); + for (i = 0; i < CMPH_COUNT; i++) + { + test(values, length, i); + } + + return 0; +} diff --git a/src/bdz.c b/src/bdz.c index bfcb918..61e3eb2 100755 --- a/src/bdz.c +++ b/src/bdz.c @@ -288,6 +288,11 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) bdz->m = mph->key_source->nkeys; bdz->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3); if ((bdz->r % 2) == 0) bdz->r+=1; + + if (bdz->r == 1) { // workaround for small key sets + bdz->r = 3; + } + bdz->n = 3*bdz->r; bdz->k = (1U << bdz->b); diff --git a/src/bdz_ph.c b/src/bdz_ph.c index 3d8b711..532ee41 100755 --- a/src/bdz_ph.c +++ b/src/bdz_ph.c @@ -254,6 +254,11 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) bdz_ph->m = mph->key_source->nkeys; bdz_ph->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3); if 
((bdz_ph->r % 2) == 0) bdz_ph->r += 1; + + if (bdz_ph->r == 1) { // workaround for small key sets + bdz_ph->r = 3; + } + bdz_ph->n = 3*bdz_ph->r; diff --git a/src/bmz.c b/src/bmz.c index 062677c..f5f7a25 100644 --- a/src/bmz.c +++ b/src/bmz.c @@ -70,6 +70,12 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) DEBUGP("c: %f\n", c); bmz->m = mph->key_source->nkeys; bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); + + if (bmz->n < 5) // workaround for small key sets + { + bmz->n = 5; + } + DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz->m, bmz->n, c); bmz->graph = graph_new(bmz->n, bmz->m); DEBUGP("Created graph\n"); @@ -530,7 +536,7 @@ cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) cmph_uint32 h1 = hash(bmz->hashes[0], key, keylen) % bmz->n; cmph_uint32 h2 = hash(bmz->hashes[1], key, keylen) % bmz->n; DEBUGP("key: %.*s h1: %u h2: %u\n", keylen, key, h1, h2); - if (h1 == h2 && ++h2 > bmz->n) h2 = 0; + if (h1 == h2 && ++h2 >= bmz->n) h2 = 0; DEBUGP("key: %.*s g[h1]: %u g[h2]: %u edges: %u\n", keylen, key, bmz->g[h1], bmz->g[h2], bmz->m); return bmz->g[h1] + bmz->g[h2]; } @@ -620,6 +626,6 @@ cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; - if (h1 == h2 && ++h2 > n) h2 = 0; + if (h1 == h2 && ++h2 >= n) h2 = 0; return (g_ptr[h1] + g_ptr[h2]); } diff --git a/src/bmz8.c b/src/bmz8.c index dc981df..894463d 100644 --- a/src/bmz8.c +++ b/src/bmz8.c @@ -74,6 +74,12 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c) DEBUGP("c: %f\n", c); bmz8->m = (cmph_uint8) mph->key_source->nkeys; bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys); + + if (bmz8->n < 5) // workaround for small key sets + { + bmz8->n = 5; + } + DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz8->m, bmz8->n, c); bmz8->graph = graph_new(bmz8->n, bmz8->m); DEBUGP("Created graph\n"); diff --git 
a/src/brz.c b/src/brz.c index 885db9d..1a7a729 100755 --- a/src/brz.c +++ b/src/brz.c @@ -27,9 +27,9 @@ static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * b brz_config_data_t *brz_config_new(void) { brz_config_data_t *brz = NULL; - brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); - if (!brz) return NULL; - brz->algo = CMPH_FCH; + brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); + if (!brz) return NULL; + brz->algo = CMPH_FCH; brz->b = 128; brz->hashfuncs[0] = CMPH_HASH_JENKINS; brz->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -131,6 +131,15 @@ cmph_t *brz_new(cmph_config_t *mph, double c) DEBUGP("c: %f\n", c); brz_config_data_t *brz = (brz_config_data_t *)mph->data; + + // Since we keep dumping partial pieces of the MPHF as it gets created + // the caller must set the file to store the resulting MPHF before calling + // this function. + if (brz->mphf_fd == NULL) + { + return NULL; + } + switch(brz->algo) // validating restrictions over parameter c. 
{ case CMPH_BMZ8: @@ -144,6 +153,11 @@ cmph_t *brz_new(cmph_config_t *mph, double c) } brz->c = c; brz->m = mph->key_source->nkeys; + if (brz->m < 5) + { + brz->c = 5; + } + DEBUGP("m: %u\n", brz->m); brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b)); DEBUGP("k: %u\n", brz->k); @@ -364,7 +378,7 @@ static int brz_gen_mphf(cmph_config_t *mph) { fprintf(stderr, "\nMPHF generation \n"); } - /* Starting to dump to disk the resultant MPHF: __cmph_dump function */ + /* Starting to dump to disk the resulting MPHF: __cmph_dump function */ nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd); @@ -442,7 +456,7 @@ static int brz_gen_mphf(cmph_config_t *mph) source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd); config = cmph_config_new(source); cmph_config_set_algo(config, brz->algo); - //cmph_config_set_algo(config, CMPH_BMZ8); + cmph_config_set_hashfuncs(config, brz->hashfuncs); cmph_config_set_graphsize(config, brz->c); mphf_tmp = cmph_new(config); if (mphf_tmp == NULL) @@ -565,7 +579,7 @@ int brz_dump(cmph_t *mphf, FILE *fd) cmph_uint32 buflen; register size_t nbytes; DEBUGP("Dumping brzf\n"); - // The initial part of the MPHF have already been dumped to disk during construction + // The initial part of the MPHF has already been dumped to disk during construction // Dumping h0 hash_state_dump(data->h0, &buf, &buflen); DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); @@ -730,7 +744,13 @@ void brz_pack(cmph_t *mphf, void *packed_mphf) brz_data_t *data = (brz_data_t *)mphf->data; cmph_uint8 * ptr = (cmph_uint8 *)packed_mphf; cmph_uint32 i,n; - + + // This assumes that if one function pointer is NULL, + // all the others will be as well. 
+ if (data->h1 == NULL) + { + return; + } // packing internal algo type memcpy(ptr, &(data->algo), sizeof(data->algo)); ptr += sizeof(data->algo); @@ -821,9 +841,21 @@ cmph_uint32 brz_packed_size(cmph_t *mphf) cmph_uint32 i; cmph_uint32 size = 0; brz_data_t *data = (brz_data_t *)mphf->data; - CMPH_HASH h0_type = hash_get_type(data->h0); - CMPH_HASH h1_type = hash_get_type(data->h1[0]); - CMPH_HASH h2_type = hash_get_type(data->h2[0]); + CMPH_HASH h0_type; + CMPH_HASH h1_type; + CMPH_HASH h2_type; + + // This assumes that if one function pointer is NULL, + // all the others will be as well. + if (data->h1 == NULL) + { + return 0U; + } + + h0_type = hash_get_type(data->h0); + h1_type = hash_get_type(data->h1[0]); + h2_type = hash_get_type(data->h2[0]); + size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) + sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k); // pointers to g_is diff --git a/src/brz.h b/src/brz.h index 648f174..df21d77 100644 --- a/src/brz.h +++ b/src/brz.h @@ -3,6 +3,21 @@ #include "cmph.h" +/* + * The BRZ algorithm has been built so to consume the bare minimum + * amount of memory to generate the MPHFs. Thereby we decided + * to dump the resulting MPHFs to disk while creating them. Thus, + * to use the BRZ algorithm, one has to call brz_config_set_mphf_fd + * before calling brz_new. Otherwise we will fail the MPHF creation. 
+ * One side effect of this design decision is that the resulting + * MPHF cannot be used until its dumping process is finalized + * by calling brz_dump and the caller must use brz_load before + * any call to either one of the following functions is made: + * brz_search + * brz_pack + * brz_packed_size + * brz_search_packed + */ typedef struct __brz_data_t brz_data_t; typedef struct __brz_config_data_t brz_config_data_t; diff --git a/src/chd_ph.c b/src/chd_ph.c index 43b936f..fbcd517 100644 --- a/src/chd_ph.c +++ b/src/chd_ph.c @@ -627,7 +627,8 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) register double load_factor = c; register cmph_uint8 searching_success = 0; - register cmph_uint32 max_probes = 1 << 20; // default value for max_probes + register cmph_uint32 max_probes_default = 1 << 20; // default value for max_probes + register cmph_uint32 max_probes; register cmph_uint32 iterations = 100; chd_ph_bucket_t * buckets = NULL; chd_ph_item_t * items = NULL; @@ -688,7 +689,13 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) buckets = chd_ph_bucket_new(chd_ph->nbuckets); items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t)); - max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes); + max_probes = (cmph_uint32)((log(chd_ph->m)/log(2))/20); + + if (max_probes == 0) { + max_probes = max_probes_default; + } else { + max_probes = max_probes * max_probes_default; + } if(chd_ph->keys_per_bin == 1) chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32)); diff --git a/src/cmph.c b/src/cmph.c index f460dd0..7a49c45 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -96,11 +96,15 @@ static int key_struct_vector_read(void *data, char **key, cmph_uint32 *keylen) { cmph_struct_vector_t *cmph_struct_vector = (cmph_struct_vector_t *)data; char *keys_vd = (char *)cmph_struct_vector->vector; + cmph_uint64 keys_vd_offset; size_t size; *keylen = cmph_struct_vector->key_len; size = *keylen; *key = (char *)malloc(size); - 
memcpy(*key, (keys_vd + (cmph_struct_vector->position * cmph_struct_vector->struct_size) + cmph_struct_vector->key_offset), size); + keys_vd_offset = ((cmph_uint64)cmph_struct_vector->position * + (cmph_uint64)cmph_struct_vector->struct_size) + + (cmph_uint64)cmph_struct_vector->key_offset; + memcpy(*key, keys_vd + keys_vd_offset, size); cmph_struct_vector->position = cmph_struct_vector->position + 1; return (int)(*keylen); } @@ -157,11 +161,11 @@ static cmph_uint32 count_nlfile_keys(FILE *fd) { char buf[BUFSIZ]; ptr = fgets(buf, BUFSIZ, fd); - if (feof(fd)) break; - if (ferror(fd) || ptr == NULL) { - perror("Error reading input file"); - return 0; - } + if (feof(fd)) break; + if (ferror(fd) || ptr == NULL) { + perror("Error reading input file"); + return 0; + } if (buf[strlen(buf) - 1] != '\n') continue; ++count; } diff --git a/src/cmph_structs.c b/src/cmph_structs.c index fe095c1..ec0f17d 100644 --- a/src/cmph_structs.c +++ b/src/cmph_structs.c @@ -38,13 +38,18 @@ cmph_t *__cmph_load(FILE *f) register size_t nbytes; DEBUGP("Loading mphf\n"); - while(1) + for(i = 0; i < BUFSIZ; i++) { size_t c = fread(ptr, (size_t)1, (size_t)1, f); if (c != 1) return NULL; if (*ptr == 0) break; ++ptr; } + if(i == BUFSIZ) /* no terminating NUL within BUFSIZ bytes; test the counter instead of reading algo_name[BUFSIZ] */ + { + DEBUGP("Attempted buffer overflow while loading mph file\n"); + return NULL; + } for(i = 0; i < CMPH_COUNT; ++i) { if (strcmp(algo_name, cmph_names[i]) == 0) diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c index d540216..13c50d6 100644 --- a/src/jenkins_hash.c +++ b/src/jenkins_hash.c @@ -98,7 +98,7 @@ void jenkins_state_destroy(jenkins_state_t *state) } -static inline void __jenkins_hash_vector(cmph_uint32 seed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) +static inline void __jenkins_hash_vector(cmph_uint32 seed, const unsigned char *k, cmph_uint32 keylen, cmph_uint32 * hashes) { register cmph_uint32 len, length; @@ -154,7 +154,7 @@ static inline void __jenkins_hash_vector(cmph_uint32 seed, const char *k, cmph_u cmph_uint32 
jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen) { cmph_uint32 hashes[3]; - __jenkins_hash_vector(state->seed, k, keylen, hashes); + __jenkins_hash_vector(state->seed, (const unsigned char*)k, keylen, hashes); return hashes[2]; /* cmph_uint32 a, b, c; cmph_uint32 len, length; @@ -215,7 +215,7 @@ cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keyl void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) { - __jenkins_hash_vector(state->seed, k, keylen, hashes); + __jenkins_hash_vector(state->seed, (const unsigned char*)k, keylen, hashes); } void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen) @@ -282,7 +282,7 @@ cmph_uint32 jenkins_state_packed_size(void) cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen) { cmph_uint32 hashes[3]; - __jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), k, keylen, hashes); + __jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), (const unsigned char*)k, keylen, hashes); return hashes[2]; } @@ -294,5 +294,5 @@ cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 */ void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) { - __jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), k, keylen, hashes); + __jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), (const unsigned char*)k, keylen, hashes); } diff --git a/tests/Makefile.am b/tests/Makefile.am index 361c67b..285ff37 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -2,7 +2,7 @@ TESTS = $(check_PROGRAMS) check_PROGRAMS = graph_tests select_tests compressed_seq_tests compressed_rank_tests cmph_benchmark_test noinst_PROGRAMS = packed_mphf_tests mphf_tests -INCLUDES = -I../src/ +AM_CPPFLAGS = -I../src/ graph_tests_SOURCES = graph_tests.c graph_tests_LDADD = ../src/libcmph.la