From 79d250d15222d00456708e5293efeabaea0bd1ac Mon Sep 17 00:00:00 2001 From: fc_botelho Date: Wed, 18 Mar 2009 19:40:23 +0000 Subject: [PATCH] compressed hash and displace method added --- src/Makefile | 8 +- src/Makefile.am | 3 +- src/Makefile.in | 8 +- src/bdz_ph.c | 7 - src/brz.c | 4 + src/chd_ph.c | 833 ++++++++++++++++++++++++++++++++++++++ src/chd_ph.h | 59 +++ src/chd_structs_ph.h | 32 ++ src/cmph.c | 85 +++- src/cmph.h | 1 + src/cmph_types.h | 2 +- src/main.c | 29 +- src/miller_rabin.c | 67 +++ src/miller_rabin.h | 5 + tests/packed_mphf_tests.c | 5 +- 15 files changed, 1108 insertions(+), 40 deletions(-) create mode 100644 src/chd_ph.c create mode 100644 src/chd_ph.h create mode 100644 src/chd_structs_ph.h create mode 100644 src/miller_rabin.c create mode 100644 src/miller_rabin.h diff --git a/src/Makefile b/src/Makefile index 3f665fa..32c5e16 100644 --- a/src/Makefile +++ b/src/Makefile @@ -60,7 +60,8 @@ libcmph_la_LIBADD = am_libcmph_la_OBJECTS = hash.lo jenkins_hash.lo vstack.lo vqueue.lo \ graph.lo cmph.lo cmph_structs.lo chm.lo bmz.lo bmz8.lo bdz.lo \ bdz_ph.lo buffer_manager.lo buffer_entry.lo brz.lo fch.lo \ - fch_buckets.lo select.lo compressed_seq.lo + fch_buckets.lo select.lo compressed_seq.lo chd_ph.lo \ + miller_rabin.lo libcmph_la_OBJECTS = $(am_libcmph_la_OBJECTS) libcmph_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ @@ -207,7 +208,8 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\ chm.c bmz.c bmz8.c bdz.c bdz_ph.c\ buffer_manager.c buffer_entry.c\ brz.c fch.c fch_buckets.c \ - select.c compressed_seq.c + select.c compressed_seq.c \ + chd_ph.c miller_rabin.c libcmph_la_LDFLAGS = -version-info 0:0:0 cmph_SOURCES = main.c wingetopt.h wingetopt.c @@ -319,6 +321,7 @@ include ./$(DEPDIR)/bmz8.Plo include ./$(DEPDIR)/brz.Plo include ./$(DEPDIR)/buffer_entry.Plo include ./$(DEPDIR)/buffer_manager.Plo +include ./$(DEPDIR)/chd_ph.Plo include ./$(DEPDIR)/chm.Plo include ./$(DEPDIR)/cmph.Plo include ./$(DEPDIR)/cmph_structs.Plo @@ -329,6 +332,7 @@ include ./$(DEPDIR)/graph.Plo include ./$(DEPDIR)/hash.Plo include ./$(DEPDIR)/jenkins_hash.Plo include ./$(DEPDIR)/main.Po +include ./$(DEPDIR)/miller_rabin.Plo include ./$(DEPDIR)/select.Plo include ./$(DEPDIR)/vqueue.Plo include ./$(DEPDIR)/vstack.Plo diff --git a/src/Makefile.am b/src/Makefile.am index cfaf360..4202a4f 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -7,7 +7,8 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\ chm.c bmz.c bmz8.c bdz.c bdz_ph.c\ buffer_manager.c buffer_entry.c\ brz.c fch.c fch_buckets.c \ - select.c compressed_seq.c + select.c compressed_seq.c \ + chd_ph.c miller_rabin.c libcmph_la_LDFLAGS = -version-info 0:0:0 diff --git a/src/Makefile.in b/src/Makefile.in index 302b6f2..491c4bb 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -60,7 +60,8 @@ libcmph_la_LIBADD = am_libcmph_la_OBJECTS = hash.lo jenkins_hash.lo vstack.lo vqueue.lo \ graph.lo cmph.lo cmph_structs.lo chm.lo bmz.lo bmz8.lo bdz.lo \ bdz_ph.lo buffer_manager.lo buffer_entry.lo brz.lo fch.lo \ - fch_buckets.lo select.lo compressed_seq.lo + fch_buckets.lo select.lo compressed_seq.lo chd_ph.lo \ + miller_rabin.lo libcmph_la_OBJECTS = $(am_libcmph_la_OBJECTS) libcmph_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ @@ -207,7 +208,8 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\ chm.c bmz.c bmz8.c bdz.c bdz_ph.c\ buffer_manager.c buffer_entry.c\ brz.c fch.c fch_buckets.c \ - select.c compressed_seq.c + select.c compressed_seq.c \ + chd_ph.c miller_rabin.c libcmph_la_LDFLAGS = -version-info 0:0:0 cmph_SOURCES = main.c wingetopt.h wingetopt.c @@ -319,6 +321,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/brz.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffer_entry.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffer_manager.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chd_ph.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chm.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cmph.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cmph_structs.Plo@am__quote@ @@ -329,6 +332,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hash.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jenkins_hash.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/miller_rabin.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/select.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vqueue.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vstack.Plo@am__quote@ diff --git a/src/bdz_ph.c b/src/bdz_ph.c index 2c91086..933ba85 100755 --- a/src/bdz_ph.c +++ b/src/bdz_ph.c @@ -484,13 +484,6 @@ void bdz_ph_load(FILE *f, cmph_t *mphf) bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); nbytes = fread(bdz_ph->g, sizeg*sizeof(cmph_uint8), (size_t)1, f); -/* #ifdef DEBUG - cmph_uint32 i; - fprintf(stderr, "G: "); - for (i = 0; i < bdz_ph->n; ++i) fprintf(stderr, "%u ", GETVALUE(bdz_ph->g,i)); - fprintf(stderr, "\n"); - #endif -*/ return; } diff --git a/src/brz.c b/src/brz.c index e34d7f6..479630a 100755 --- a/src/brz.c +++ b/src/brz.c @@ -105,6 +105,10 @@ void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b) { brz_config_data_t *brz = (brz_config_data_t *)mph->data; + if(b <= 64 || b >= 175) + { + b = 128; + } brz->b = b; } diff --git a/src/chd_ph.c b/src/chd_ph.c new file mode 100644 index 0000000..47d44fd --- /dev/null +++ b/src/chd_ph.c @@ -0,0 +1,833 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "cmph_structs.h" +#include "chd_structs_ph.h" +#include "chd_ph.h" +#include"miller_rabin.h" + +#define DEBUG +#include "debug.h" + +// NO_ELEMENT is equivalent to null pointer +#ifndef NO_ELEMENT +#define NO_ELEMENT UINT_MAX +#endif + +// struct to represents the buckets items +struct _chd_ph_item_t +{ + cmph_uint32 f; + cmph_uint32 h; + struct _chd_ph_item_t * next; +}; +typedef struct _chd_ph_item_t chd_ph_item_t; + + +// struct to represent a bucket +struct _chd_ph_bucket_t +{ + cmph_uint32 size; + chd_ph_item_t * items_list; + cmph_uint32 next_in_list; +}; +typedef struct _chd_ph_bucket_t chd_ph_bucket_t; + +static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets); +static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets); +static inline cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 g, chd_ph_item_t * item); +static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets); + +chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets) +{ + chd_ph_bucket_t * buckets = (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t)); + return buckets; +} + +void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets) +{ + register cmph_uint32 i = 0; + assert(buckets); + for(i = 0; i < nbuckets; i++) + { + buckets[i].size = 0; + buckets[i].items_list = 0; + buckets[i].next_in_list = NO_ELEMENT; + }; +} + +cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 g, chd_ph_item_t * item) +{ + chd_ph_item_t * item1, * prior_item1; + item1 = buckets[g].items_list; + prior_item1 = 0; + while(item1 != 0 && (item1->f < item->f || (item1->f == item->f && item1->h < item->h)) ) + { + prior_item1 = item1; + item1 = item1->next; + }; + + if(item1 != 0 && item1->f == item->f && item1->h == item->h) + { + DEBUGP("Item not added\n"); + return 0; + }; + item->next = item1; + if(prior_item1 == 0) + buckets[g].items_list = item; + else + prior_item1->next = item; + + buckets[g].size++; + + return 1; +} + +void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets) +{ + free(buckets); +} + +static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, + cmph_uint32 *max_bucket_size); + +static inline cmph_uint32 * chd_ph_ordering(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 max_bucket_size); + +static inline cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size, + cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table, + cmph_uint8 * occup_table); + +static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r) +{ + double r = _r, n = _n; + return (1 + (r/n - 1.0 + 1.0/(2.0*n))*log(1 - n/r))/log(2); +}; + +/* computes the entropy of non empty buckets.*/ +static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes) +{ + register cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32)); + register cmph_uint32 i; + register double entropy = 0; + + for(i = 0; i < n; i++) + { + probe_counts[disp_table[i]]++; + }; + + for(i = 0; i < max_probes; i++) + { + if(probe_counts[i] > 0) + entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2); + }; + free(probe_counts); + return entropy; +}; + +chd_ph_config_data_t *chd_ph_config_new() +{ + chd_ph_config_data_t *chd_ph; + chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t)); + assert(chd_ph); + memset(chd_ph, 0, sizeof(chd_ph_config_data_t)); + + chd_ph->hashfunc = CMPH_HASH_JENKINS; + chd_ph->cs = NULL; + chd_ph->nbuckets = 0; + chd_ph->n = 0; + chd_ph->hl = NULL; + + chd_ph->m = 0; + chd_ph->use_h = 1; + chd_ph->keys_per_bin = 1; + chd_ph->keys_per_bucket = 4; + + //The following fields are used just for statistics + chd_ph->space_usage = 0; + chd_ph->entropy = 0.0; + return chd_ph; +} + +void chd_ph_config_destroy(cmph_config_t *mph) +{ + chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + + +void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 1) break; //chd_ph only uses one linear hash function + chd_ph->hashfunc = *hashptr; + ++i, ++hashptr; + } +} + + +void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket) +{ + assert(mph); + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + if(keys_per_bucket <= 1 || keys_per_bucket >= 15) + { + keys_per_bucket = 4; + } + chd_ph->keys_per_bucket = keys_per_bucket; +} + + +void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) +{ + assert(mph); + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + if(keys_per_bin <= 1 || keys_per_bin >= 128) + { + keys_per_bin = 1; + } + chd_ph->keys_per_bin = keys_per_bin; +} + +cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size) +{ + register cmph_uint32 i = 0, g = 0; + cmph_uint32 hl[3]; + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + char * key = NULL; + cmph_uint32 keylen = 0; + chd_ph_item_t * item; + register cmph_uint32 mapping_iterations = 1000; + *max_bucket_size = 0; + while(1) + { + mapping_iterations--; + if (chd_ph->hl) hash_state_destroy(chd_ph->hl); + chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m); + + chd_ph_bucket_clean(buckets, chd_ph->nbuckets); + + mph->key_source->rewind(mph->key_source->data); + + for(i = 0; i < chd_ph->m; i++) + { + mph->key_source->read(mph->key_source->data, &key, &keylen); + hash_vector(chd_ph->hl, key, keylen, hl); + + item = (items + i); + + g = hl[0] % chd_ph->nbuckets; + item->f = hl[1] % chd_ph->n; + item->h = hl[2] % (chd_ph->n - 1) + 1; + + mph->key_source->dispose(mph->key_source->data, key, keylen); + +// if(buckets[g].size == (chd_ph->keys_per_bucket << 2)) +// { +// DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2)); +// goto error; +// } + + if(!chd_ph_bucket_insert(buckets, chd_ph->nbuckets, g, item)) + { + break; + } + + if(buckets[g].size > *max_bucket_size) + { + *max_bucket_size = buckets[g].size; + } + } + + if(i == chd_ph->m) + { + return 1; // SUCCESS + } + + if(mapping_iterations == 0) + { + goto error; + } + } +error: + hash_state_destroy(chd_ph->hl); + chd_ph->hl = NULL; + return 0; // FAILURE + +} + +cmph_uint32 * chd_ph_ordering(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 max_bucket_size) +{ + cmph_uint32 * sorted_lists = (cmph_uint32 *) calloc(max_bucket_size + 1, sizeof(cmph_uint32)); + register cmph_uint32 i, size; + DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size); + for(i = 0; i <= max_bucket_size; i++) + { + sorted_lists[i] = NO_ELEMENT; + } + for(i = 0; i < nbuckets; i++) + { + size = buckets[i].size; + if(size == 0) + continue; + buckets[i].next_in_list = sorted_lists[size]; + sorted_lists[size] = i; + }; + + return sorted_lists; +} + +static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint8 * occup_table, + cmph_uint32 probe0_num, cmph_uint32 probe1_num, cmph_uint32 bucket_num) +{ + register cmph_uint32 i; + register cmph_uint32 size = buckets[bucket_num].size; + register chd_ph_item_t * item; + register cmph_uint32 position; + + item = buckets[bucket_num].items_list; + // try place bucket with probe_num + for(i = 0; i < size; i++) // placement + { + position = (item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n; + + if(occup_table[position] >= chd_ph->keys_per_bin) + { + break; + } + occup_table[position]++; + + item = item->next; + }; + + if(i != size) // Undo the placement + { + item = buckets[bucket_num].items_list; + while(1) + { + if(i == 0) + { + break; + } + position = (item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n; + occup_table[position]--; + item = item->next; + i--; + }; + return 0; + }; + return 1; +}; + +static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_probes, + cmph_uint32 * disp_table, cmph_uint8 * occup_table, cmph_uint32 bucket_num) + +{ + register cmph_uint32 probe0_num, probe1_num, probe_num; + probe0_num = 0; + probe1_num = 0; + probe_num = 0; + + while(1) + { + if(place_bucket_probe(chd_ph, buckets, occup_table, probe0_num, probe1_num, bucket_num)) + { + disp_table[bucket_num] = probe0_num + probe1_num * chd_ph->n; + return 1; + } + probe0_num++; + if(probe0_num >= chd_ph->n) + { + probe0_num -= chd_ph->n; + probe1_num++; + }; + probe_num++; + if(probe_num >= max_probes || probe1_num >= chd_ph->n) + { + return 0; + }; + }; + return 0; +}; + +static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size, + cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table, + cmph_uint8 * occup_table) +{ + register cmph_uint32 i = 0; + register cmph_uint32 curr_bucket = 0; + + for(i = max_bucket_size; i > 0; i--) + { + curr_bucket = sorted_lists[i]; + while(curr_bucket != NO_ELEMENT) + { + if(!place_bucket(chd_ph, buckets, max_probes, disp_table, occup_table, curr_bucket)) + { + return 0; + } + curr_bucket = buckets[curr_bucket].next_in_list; + }; + + }; + return 1; +}; + +static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size, + cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table, + cmph_uint8 * occup_table) +{ + register cmph_uint32 i; + register cmph_uint32 curr_bucket, prev_bucket; + register cmph_uint32 probe_num, probe0_num, probe1_num; + DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n"); + for(i = max_bucket_size; i > 0; i--) + { + probe_num = 0; + probe0_num = 0; + probe1_num = 0; + while(sorted_lists[i] != NO_ELEMENT) + { + prev_bucket = NO_ELEMENT; + curr_bucket = sorted_lists[i]; + while(curr_bucket != NO_ELEMENT) + { + // if bucket is successfully placed remove it from list + if(place_bucket_probe(chd_ph, buckets, occup_table, probe0_num, probe1_num, curr_bucket)) + { + disp_table[curr_bucket] = probe0_num + probe1_num * chd_ph->n; +// DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]); + if(prev_bucket == NO_ELEMENT) + { + sorted_lists[i] = buckets[curr_bucket].next_in_list; + } + else + { + buckets[prev_bucket].next_in_list = buckets[curr_bucket].next_in_list; + } + + } + else + { +// DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket); + prev_bucket = curr_bucket; + } + curr_bucket = buckets[curr_bucket].next_in_list; + }; + probe0_num++; + if(probe0_num >= chd_ph->n) + { + probe0_num -= chd_ph->n; + probe1_num++; + }; + probe_num++; + if(probe_num >= max_probes || probe1_num >= chd_ph->n) + { + return 0; + }; + }; + }; + return 1; +}; + +cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size, + cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table, + cmph_uint8 * occup_table) +{ + if(chd_ph->use_h) + { + return place_buckets2(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table); + } + else + { + return place_buckets1(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table); + } + +} + +static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, + cmph_uint32 * disp_table, cmph_uint8 * occup_table) +{ + register cmph_uint32 i, j; + register cmph_uint32 position, probe0_num, probe1_num; + register cmph_uint32 m = 0; + register chd_ph_item_t * item; + + memset(occup_table, 0, chd_ph->n); + for(i = 0; i < chd_ph->nbuckets; i++) + { + j = buckets[i].size; + item = buckets[i].items_list; + probe0_num = disp_table[i] % chd_ph->n; + probe1_num = disp_table[i] / chd_ph->n; + for(; j > 0; j--) + { + if(item == 0) + { + return 0; + } + m++; + position = (item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n; + if(occup_table[position] >= chd_ph->keys_per_bin) + { + return 0; + } + occup_table[position]++; + item = item->next; + }; + }; + DEBUGP("We were able to place m = %u keys\n", m); + return 1; +}; + + +cmph_t *chd_ph_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + chd_ph_data_t *chd_phf = NULL; + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + + register double load_factor = c; + register cmph_uint8 searching_success = 0; + register cmph_uint32 max_probes = 1 << 18; // default value for max_probes + register cmph_uint32 iterations = 100; + chd_ph_bucket_t * buckets = NULL; + chd_ph_item_t * items = NULL; + register cmph_uint8 failure = 0; + cmph_uint32 max_bucket_size = 0; + cmph_uint32 * sorted_lists = NULL; + cmph_uint32 * disp_table = NULL; + cmph_uint8 * occup_table; + + chd_ph->m = mph->key_source->nkeys; + DEBUGP("m = %u\n", chd_ph->m); + + chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1; + DEBUGP("nbuckets = %u\n", chd_ph->nbuckets); + + if(load_factor < 0.5 ) + { + load_factor = 0.5; + } + + if(load_factor >= 0.99) + { + load_factor = 0.99; + } + + DEBUGP("load_factor = %.3f\n", load_factor); + + chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1; + + //Round the number of bins to the prime immediately above + if(chd_ph->n % 2 == 0) chd_ph->n++; + for(;;) + { + if(check_primality(chd_ph->n) == 1) + break; + chd_ph->n += 2; // just odd numbers can be primes for n > 2 + + }; + + DEBUGP("n = %u \n", chd_ph->n); + + if(mph->verbosity && chd_ph->keys_per_bin == 1) + { + fprintf(stderr, "space lower bound is %.3f bits per key", chd_ph_space_lower_bound(chd_ph->m, chd_ph->n)); + } + + // We allocate the working tables + buckets = chd_ph_bucket_new(chd_ph->nbuckets); + items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t)); + + max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes); + occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8)); + disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32)); +// +// init_genrand(time(0)); + + while(1) + { + iterations --; + if (mph->verbosity) + { + fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n); + } + + if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size)) + { + if (mph->verbosity) + { + fprintf(stderr, "Failure in mapping step\n"); + } + failure = 1; + goto cleanup; + } + + if (mph->verbosity) + { + fprintf(stderr, "Starting ordering step\n"); + } + if(sorted_lists) + { + free(sorted_lists); + } + sorted_lists = chd_ph_ordering(buckets, chd_ph->nbuckets, max_bucket_size); + + if (mph->verbosity) + { + fprintf(stderr, "Starting searching step\n"); + } + + searching_success = chd_ph_searching(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table); + + if(searching_success) break; + + // reset occup_table + memset(occup_table, 0, chd_ph->n); + if(iterations == 0) + { + // Cleanup memory + if (mph->verbosity) + { + fprintf(stderr, "Failure because the max trials was exceeded\n"); + } + failure = 1; + goto cleanup; + }; + } + + #ifdef DEBUG + chd_ph->entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes); + DEBUGP("Entropy = %.4f\n", chd_ph->entropy/chd_ph->m); + + if(!chd_ph_check_bin_hashing(chd_ph, buckets, disp_table, occup_table)) + { + + DEBUGP("Error for bin packing generation"); + return NULL; + }; + #endif + + if (mph->verbosity) + { + fprintf(stderr, "Starting compressing step\n"); + } + + if(chd_ph->cs) + { + free(chd_ph->cs); + } + chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); + compressed_seq_init(chd_ph->cs); + compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets); + chd_ph->space_usage = compressed_seq_get_space_usage(chd_ph->cs); + chd_ph->space_usage += 64; + DEBUGP("space_usage/key = %.4f\n", chd_ph->space_usage/(double)chd_ph->m); + +cleanup: + chd_ph_bucket_destroy(buckets); + free(items); + free(sorted_lists); + free(disp_table); + free(occup_table); + if(failure) + { + if(chd_ph->hl) + { + hash_state_destroy(chd_ph->hl); + } + chd_ph->hl = NULL; + return NULL; + } + + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t)); + + chd_phf->cs = chd_ph->cs; + chd_ph->cs = NULL; //transfer memory ownership + chd_phf->hl = chd_ph->hl; + chd_ph->hl = NULL; //transfer memory ownership + chd_phf->n = chd_ph->n; + chd_phf->nbuckets = chd_ph->nbuckets; + + mphf->data = chd_phf; + mphf->size = chd_ph->n; + + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + + return mphf; +} + + + +void chd_ph_load(FILE *fd, cmph_t *mphf) +{ + char *buf = NULL; + cmph_uint32 buflen; + register cmph_uint32 nbytes; + chd_ph_data_t *chd_ph = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t)); + + DEBUGP("Loading chd_ph mphf\n"); + mphf->data = chd_ph; + + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); + chd_ph->hl = hash_state_load(buf, buflen); + free(buf); + + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Compressed sequence structure has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); + chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); + compressed_seq_load(chd_ph->cs, buf, buflen); + free(buf); + + // loading n and nbuckets + DEBUGP("Reading n and nbuckets\n"); + nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); +} + +int chd_ph_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + register cmph_uint32 nbytes; + chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + + __cmph_dump(mphf, fd); + + hash_state_dump(data->hl, &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + compressed_seq_dump(data->cs, &buf, &buflen); + DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + // dumping n and nbuckets + nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); + return 1; +} + +void chd_ph_destroy(cmph_t *mphf) +{ + chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + compressed_seq_destroy(data->cs); + free(data->cs); + hash_state_destroy(data->hl); + free(data); + free(mphf); + +} + +cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + register chd_ph_data_t * chd_ph = mphf->data; + cmph_uint32 hl[3]; + register cmph_uint32 disp,position; + register cmph_uint32 probe0_num,probe1_num; + register cmph_uint32 f,g,h; + hash_vector(chd_ph->hl, key, keylen, hl); + g = hl[0] % chd_ph->nbuckets; + f = hl[1] % chd_ph->n; + h = hl[2] % (chd_ph->n-1) + 1; + + disp = compressed_seq_query(chd_ph->cs, g); + probe0_num = disp % chd_ph->n; + probe1_num = disp/chd_ph->n; + position = (f + ((cmph_uint64 )h)*probe0_num + probe1_num) % chd_ph->n; + return position; +} + +void chd_ph_pack(cmph_t *mphf, void *packed_mphf) +{ + chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing hl type + CMPH_HASH hl_type = hash_get_type(data->hl); + *((cmph_uint32 *) ptr) = hl_type; + ptr += sizeof(cmph_uint32); + + // packing hl + hash_state_pack(data->hl, ptr); + ptr += hash_state_packed_size(hl_type); + + // packing n + *((cmph_uint32 *) ptr) = data->n; + ptr += sizeof(data->n); + + // packing nbuckets + *((cmph_uint32 *) ptr) = data->nbuckets; + ptr += sizeof(data->nbuckets); + + // packing cs + compressed_seq_pack(data->cs, ptr); + //ptr += compressed_seq_packed_size(data->cs); + +} + +cmph_uint32 chd_ph_packed_size(cmph_t *mphf) +{ + register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + register CMPH_HASH hl_type = hash_get_type(data->hl); + register cmph_uint32 hash_state_pack_size = hash_state_packed_size(hl_type); + register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs); + + return (sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32)); + +} + +cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; + register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; + + register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type)); + register cmph_uint32 n = *ptr++; + register cmph_uint32 nbuckets = *ptr++; + cmph_uint32 hl[3]; + + register cmph_uint32 disp,position; + register cmph_uint32 probe0_num,probe1_num; + register cmph_uint32 f,g,h; + + hash_vector_packed(hl_ptr, hl_type, key, keylen, hl); + + g = hl[0] % nbuckets; + f = hl[1] % n; + h = hl[2] % (n-1) + 1; + + disp = compressed_seq_query_packed(ptr, g); + probe0_num = disp % n; + probe1_num = disp/n; + position = (f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n; + return position; +} + + + diff --git a/src/chd_ph.h b/src/chd_ph.h new file mode 100644 index 0000000..d2bdb02 --- /dev/null +++ b/src/chd_ph.h @@ -0,0 +1,59 @@ +#ifndef _CMPH_CHD_PH_H__ +#define _CMPH_CHD_PH_H__ + +#include "cmph.h" + +typedef struct __chd_ph_data_t chd_ph_data_t; +typedef struct __chd_ph_config_data_t chd_ph_config_data_t; + +/* Config API */ +chd_ph_config_data_t *chd_ph_config_new(); +void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); + +/** \fn void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); + * \brief Allows to set the number of keys per bin. + * \param mph pointer to the configuration structure + * \param keys_per_bin value for the number of keys per bin + */ +void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); + +/** \fn void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket); + * \brief Allows to set the number of keys per bucket. + * \param mph pointer to the configuration structure + * \param keys_per_bucket value for the number of keys per bucket + */ +void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket); +void chd_ph_config_destroy(cmph_config_t *mph); + + +/* Chd algorithm API */ +cmph_t *chd_ph_new(cmph_config_t *mph, double c); +void chd_ph_load(FILE *fd, cmph_t *mphf); +int chd_ph_dump(cmph_t *mphf, FILE *fd); +void chd_ph_destroy(cmph_t *mphf); +cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void chd_ph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void chd_ph_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 chd_ph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 chd_ph_packed_size(cmph_t *mphf); + +/** cmph_uint32 chd_ph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/src/chd_structs_ph.h b/src/chd_structs_ph.h new file mode 100644 index 0000000..c93b6bd --- /dev/null +++ b/src/chd_structs_ph.h @@ -0,0 +1,32 @@ +#ifndef __CMPH_CHD_PH_STRUCTS_H__ +#define __CMPH_CHD_PH_STRUCTS_H__ + +#include "hash_state.h" +#include "compressed_seq.h" + +struct __chd_ph_data_t +{ + compressed_seq_t * cs; // compressed displacement values + cmph_uint32 nbuckets; // number of buckets + cmph_uint32 n; // number of bins + hash_state_t *hl; // linear hash function +}; + +struct __chd_ph_config_data_t +{ + CMPH_HASH hashfunc; // linear hash function to be used + compressed_seq_t * cs; // compressed displacement values + cmph_uint32 nbuckets; // number of buckets + cmph_uint32 n; // number of bins + hash_state_t *hl; // linear hash function + + cmph_uint32 m; // number of keys + cmph_uint8 use_h; // flag to indicate the of use of a heuristic (use_h = 1) + cmph_uint32 keys_per_bin;//maximum number of keys per bin + cmph_uint32 keys_per_bucket; // average number of keys per bucket + + //The following fields are used just for statistics + cmph_uint32 space_usage; + double entropy; +}; +#endif diff --git a/src/cmph.c b/src/cmph.c index 5142daf..40f16ec 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -7,6 +7,7 @@ #include "fch.h" /* included -- Fabiano */ #include "bdz.h" /* included -- Fabiano */ #include "bdz_ph.h" /* included -- Fabiano */ +#include "chd_ph.h" /* included -- Fabiano */ #include #include @@ -14,8 +15,7 @@ //#define DEBUG #include "debug.h" -const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", - "bdz_ph", NULL }; /* included -- Fabiano */ +const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", NULL }; /* included -- Fabiano */ typedef struct { @@ -322,6 +322,9 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) case CMPH_BDZ_PH: bdz_ph_config_destroy(mph); break; + case CMPH_CHD_PH: + chd_ph_config_destroy(mph); + break; default: assert(0); } @@ -348,6 +351,9 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) case CMPH_BDZ_PH: mph->data = bdz_ph_config_new(); break; + case CMPH_CHD_PH: + mph->data = chd_ph_config_new(); + break; default: assert(0); } @@ -382,6 +388,18 @@ void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b) { bdz_config_set_b(mph, b); } + else if (mph->algo == CMPH_CHD_PH) + { + chd_ph_config_set_b(mph, b); + } +} + +void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) +{ + if (mph->algo == CMPH_CHD_PH) + { + chd_ph_config_set_keys_per_bin(mph, keys_per_bin); + } } void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability) @@ -406,19 +424,22 @@ void cmph_config_destroy(cmph_config_t *mph) bmz_config_destroy(mph); break; case CMPH_BMZ8: /* included -- Fabiano */ - bmz8_config_destroy(mph); + bmz8_config_destroy(mph); break; case CMPH_BRZ: /* included -- Fabiano */ - brz_config_destroy(mph); + brz_config_destroy(mph); break; case CMPH_FCH: /* included -- Fabiano */ - fch_config_destroy(mph); + fch_config_destroy(mph); break; case CMPH_BDZ: /* included -- Fabiano */ - bdz_config_destroy(mph); + bdz_config_destroy(mph); break; case CMPH_BDZ_PH: /* included -- Fabiano */ - bdz_ph_config_destroy(mph); + bdz_ph_config_destroy(mph); + break; + case CMPH_CHD_PH: /* included -- Fabiano */ + chd_ph_config_destroy(mph); break; default: assert(0); @@ -457,6 +478,9 @@ void cmph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) case CMPH_BDZ_PH: /* included -- Fabiano */ bdz_ph_config_set_hashfuncs(mph, hashfuncs); break; + case CMPH_CHD_PH: /* included -- Fabiano */ + chd_ph_config_set_hashfuncs(mph, hashfuncs); + break; default: break; } @@ -506,6 +530,10 @@ cmph_t *cmph_new(cmph_config_t *mph) DEBUGP("Creating bdz_ph hash\n"); mphf = bdz_ph_new(mph, c); break; + case CMPH_CHD_PH: /* included -- Fabiano */ + DEBUGP("Creating chd_ph hash\n"); + mphf = chd_ph_new(mph, c); + break; default: assert(0); } @@ -519,17 +547,19 @@ int cmph_dump(cmph_t *mphf, FILE *f) case CMPH_CHM: return chm_dump(mphf, f); case CMPH_BMZ: /* included -- Fabiano */ - return bmz_dump(mphf, f); + return bmz_dump(mphf, f); case CMPH_BMZ8: /* included -- Fabiano */ - return bmz8_dump(mphf, f); + return bmz8_dump(mphf, f); case CMPH_BRZ: /* included -- Fabiano */ - return brz_dump(mphf, f); + return brz_dump(mphf, f); case CMPH_FCH: /* included -- Fabiano */ - return fch_dump(mphf, f); + return fch_dump(mphf, f); case CMPH_BDZ: /* included -- Fabiano */ - return bdz_dump(mphf, f); + return bdz_dump(mphf, f); case CMPH_BDZ_PH: /* included -- Fabiano */ - return bdz_ph_dump(mphf, f); + return bdz_ph_dump(mphf, f); + case CMPH_CHD_PH: /* included -- Fabiano */ + return chd_ph_dump(mphf, f); default: assert(0); } @@ -573,6 +603,10 @@ cmph_t *cmph_load(FILE *f) DEBUGP("Loading bdz_ph algorithm dependent parts\n"); bdz_ph_load(f, mphf); break; + case CMPH_CHD_PH: /* included -- Fabiano */ + DEBUGP("Loading chd_ph algorithm dependent parts\n"); + chd_ph_load(f, mphf); + break; default: assert(0); } @@ -606,6 +640,9 @@ cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) case CMPH_BDZ_PH: /* included -- Fabiano */ DEBUGP("bdz_ph algorithm search\n"); return bdz_ph_search(mphf, key, keylen); + case CMPH_CHD_PH: /* included -- Fabiano */ + DEBUGP("chd_ph algorithm search\n"); + return chd_ph_search(mphf, key, keylen); default: assert(0); } @@ -626,22 +663,25 @@ void cmph_destroy(cmph_t *mphf) chm_destroy(mphf); return; case CMPH_BMZ: /* included -- Fabiano */ - bmz_destroy(mphf); + bmz_destroy(mphf); return; case CMPH_BMZ8: /* included -- Fabiano */ - bmz8_destroy(mphf); + bmz8_destroy(mphf); return; case CMPH_BRZ: /* included -- Fabiano */ - brz_destroy(mphf); + brz_destroy(mphf); return; case CMPH_FCH: /* included -- Fabiano */ - fch_destroy(mphf); + fch_destroy(mphf); return; case CMPH_BDZ: /* included -- Fabiano */ - bdz_destroy(mphf); + bdz_destroy(mphf); return; case CMPH_BDZ_PH: /* included -- Fabiano */ - bdz_ph_destroy(mphf); + bdz_ph_destroy(mphf); + return; + case CMPH_CHD_PH: /* included -- Fabiano */ + chd_ph_destroy(mphf); return; default: assert(0); @@ -685,6 +725,9 @@ void cmph_pack(cmph_t *mphf, void *packed_mphf) case CMPH_BDZ_PH: /* included -- Fabiano */ bdz_ph_pack(mphf, ptr); break; + case CMPH_CHD_PH: /* included -- Fabiano */ + chd_ph_pack(mphf, ptr); + break; default: assert(0); } @@ -714,6 +757,8 @@ cmph_uint32 cmph_packed_size(cmph_t *mphf) return bdz_packed_size(mphf); case CMPH_BDZ_PH: /* included -- Fabiano */ return bdz_ph_packed_size(mphf); + case CMPH_CHD_PH: /* included -- Fabiano */ + return chd_ph_packed_size(mphf); default: assert(0); } @@ -747,6 +792,8 @@ cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 k return bdz_search_packed(++ptr, key, keylen); case CMPH_BDZ_PH: /* included -- Fabiano */ return bdz_ph_search_packed(++ptr, key, keylen); + case CMPH_CHD_PH: /* included -- Fabiano */ + return chd_ph_search_packed(++ptr, key, keylen); default: assert(0); } diff --git a/src/cmph.h b/src/cmph.h index 758f103..35a77ca 100644 --- a/src/cmph.h +++ b/src/cmph.h @@ -54,6 +54,7 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo); void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir); void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd); void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b); +void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability); void cmph_config_destroy(cmph_config_t *mph); diff --git a/src/cmph_types.h b/src/cmph_types.h index ef45c24..fdf9735 100644 --- a/src/cmph_types.h +++ b/src/cmph_types.h @@ -35,7 +35,7 @@ typedef unsigned int cmph_uint32; typedef enum { CMPH_HASH_JENKINS, CMPH_HASH_COUNT } CMPH_HASH; extern const char *cmph_hash_names[]; typedef enum { CMPH_BMZ, CMPH_BMZ8, CMPH_CHM, CMPH_BRZ, CMPH_FCH, - CMPH_BDZ, CMPH_BDZ_PH, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */ + CMPH_BDZ, CMPH_BDZ_PH, CMPH_CHD_PH, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */ extern const char *cmph_names[]; #endif diff --git a/src/main.c b/src/main.c index 7ce3186..658ee09 100644 --- a/src/main.c +++ b/src/main.c @@ -22,17 +22,18 @@ void usage(const char *prg) { - fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); } void usage_long(const char *prg) { cmph_uint32 i; - fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); fprintf(stderr, "Minimum perfect hashing tool\n\n"); fprintf(stderr, " -h\t print this help message\n"); fprintf(stderr, " -c\t c value determines:\n"); fprintf(stderr, " \t the number of vertices in the graph for the algorithms BMZ and CHM\n"); fprintf(stderr, " \t the number of bits per key required in the FCH algorithm\n"); + fprintf(stderr, " \t the load factor in the CHD_PH algorithm\n"); fprintf(stderr, " -a\t algorithm - valid values are\n"); for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_names[i]); fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n"); @@ -51,7 +52,12 @@ void usage_long(const char *prg) fprintf(stderr, " \t In this case its value should be an integer in the range [64,175].\n"); fprintf(stderr, " \t If BDZ algorithm is selected in option -a, than it is used to\n"); fprintf(stderr, " \t determine the size of some precomputed rank information and\n"); - fprintf(stderr, " \t its value should be an integer in the range [3,10]\n"); + fprintf(stderr, " \t its value should be an integer in the range [3,10].\n"); + fprintf(stderr, " \t If CHD_PH algorithm is selected in option -a, than it is used to\n"); + fprintf(stderr, " \t set average number of keys per bucket and its value should be an\n"); + fprintf(stderr, " \t an integer in the range [1,32].\n"); + fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function.\n"); + fprintf(stderr, " \t A t-perfect hashing function allows at most t collisions in a given bin.\n"); fprintf(stderr, " keysfile\t line separated file with keys\n"); } @@ -75,10 +81,11 @@ int main(int argc, char **argv) char * tmp_dir = NULL; cmph_io_adapter_t *source; cmph_uint32 memory_availability = 0; - cmph_uint32 b = 128; + cmph_uint32 b = 0; + cmph_uint32 keys_per_bin = 0; while (1) { - char ch = getopt(argc, argv, "hVvgc:k:a:M:b:f:m:d:s:"); + char ch = getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:"); if (ch == -1) break; switch (ch) { @@ -141,6 +148,16 @@ int main(int argc, char **argv) } } break; + case 't': + { + char *cptr; + keys_per_bin = strtoul(optarg, &cptr, 10); + if(*cptr != 0) { + fprintf(stderr, "Parameter t was not found: %s\n", optarg); + exit(1); + } + } + break; case 'v': ++verbosity; break; @@ -237,6 +254,8 @@ int main(int argc, char **argv) cmph_config_set_mphf_fd(config, mphf_fd); cmph_config_set_memory_availability(config, memory_availability); cmph_config_set_b(config, b); + cmph_config_set_keys_per_bin(config, keys_per_bin); + //if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15; if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15; if (c != 0) cmph_config_set_graphsize(config, c); diff --git a/src/miller_rabin.c b/src/miller_rabin.c new file mode 100644 index 0000000..17d0ed3 --- /dev/null +++ b/src/miller_rabin.c @@ -0,0 +1,67 @@ +#include "miller_rabin.h" + +static inline cmph_uint64 int_pow(cmph_uint64 a, cmph_uint64 d, cmph_uint64 n) +{ + cmph_uint64 a_pow = a; + cmph_uint64 res = 1; + while(d > 0) + { + if((d & 1) == 1) + res =(((cmph_uint64)res) * a_pow) % n; + a_pow = (((cmph_uint64)a_pow) * a_pow) % n; + d /= 2; + }; + return res; +}; + +static inline cmph_uint8 check_witness(cmph_uint64 a_exp_d, cmph_uint64 n, cmph_uint64 s) +{ + cmph_uint64 i; + cmph_uint64 a_exp = a_exp_d; + if(a_exp == 1 || a_exp == (n - 1)) + return 1; + for(i = 1; i < s; i++) + { + a_exp = (((cmph_uint64)a_exp) * a_exp) % n; + if(a_exp == (n - 1)) + return 1; + }; + return 0; +}; + +cmph_uint8 check_primality(cmph_uint64 n) +{ + cmph_uint64 a, d, s, a_exp_d; + if((n % 2) == 0) + return 0; + if((n % 3) == 0) + return 0; + if((n % 5) == 0) + return 0; + if((n % 7 ) == 0) + return 0; + //we decompoe the number n - 1 into 2^s*d + s = 0; + d = n - 1; + do + { + s++; + d /= 2; + }while((d % 2) == 0); + + a = 2; + a_exp_d = int_pow(a, d, n); + if(check_witness(a_exp_d, n, s) == 0) + return 0; + a = 7; + a_exp_d = int_pow(a, d, n); + if(check_witness(a_exp_d, n, s) == 0) + return 0; + a = 61; + a_exp_d = int_pow(a, d, n); + if(check_witness(a_exp_d, n, s) == 0) + return 0; + return 1; +}; + + diff --git a/src/miller_rabin.h b/src/miller_rabin.h new file mode 100644 index 0000000..42dc6ce --- /dev/null +++ b/src/miller_rabin.h @@ -0,0 +1,5 @@ +#ifndef _CMPH_MILLER_RABIN_H__ +#define _CMPH_MILLER_RABIN_H__ +#include "cmph_types.h" +cmph_uint8 check_primality(cmph_uint64 n); +#endif diff --git a/tests/packed_mphf_tests.c b/tests/packed_mphf_tests.c index 1c09aea..3bf161f 100644 --- a/tests/packed_mphf_tests.c +++ b/tests/packed_mphf_tests.c @@ -164,14 +164,13 @@ int main(int argc, char **argv) } source->dispose(source->data, buf, buflen); } - free(packed_mphf); - cmph_destroy(mphf); + cmph_destroy(mphf); free(hashtable); fclose(keys_fd); free(mphf_file); - cmph_io_nlfile_adapter_destroy(source); + cmph_io_nlfile_adapter_destroy(source); return ret; }