From 872c528cd995634feb1748e601a949e0f4e08f11 Mon Sep 17 00:00:00 2001 From: fc_botelho Date: Wed, 26 Mar 2008 20:26:48 +0000 Subject: [PATCH] *** empty log message *** --- src/bdz.c | 128 ++++++++++++++++++++--- src/bdz.h | 38 +++++++ src/bdz_ph.c | 121 ++++++++++++++++++++- src/bdz_ph.h | 38 +++++++ src/bmz.c | 93 +++++++++++++++++ src/bmz.h | 38 +++++++ src/bmz8.c | 94 +++++++++++++++++ src/bmz8.h | 38 +++++++ src/brz.c | 49 +++++++++ src/brz.h | 38 +++++++ src/chm.c | 101 ++++++++++++++++++ src/chm.h | 38 +++++++ src/cmph.c | 153 ++++++++++++++++++++++++++- src/cmph.h | 47 +++++++++ src/fch.c | 50 +++++++++ src/fch.h | 38 +++++++ src/hash.c | 95 ++++++++++++++++- src/hash.h | 31 ++++++ src/jenkins_hash.c | 185 +++++++++++++++++++++------------ src/jenkins_hash.h | 34 +++++- src/main.c | 3 +- tests/Makefile.am | 13 ++- tests/mphf_fingerprint_tests.c | 162 +++++++++++++++++++++++++++++ tests/mphf_tests.c | 161 ++++++++++++++++++++++++++++ tests/packed_mphf_tests.c | 177 +++++++++++++++++++++++++++++++ 25 files changed, 1875 insertions(+), 88 deletions(-) create mode 100644 tests/mphf_fingerprint_tests.c create mode 100644 tests/mphf_tests.c create mode 100644 tests/packed_mphf_tests.c diff --git a/src/bdz.c b/src/bdz.c index ed0a405..1318862 100755 --- a/src/bdz.c +++ b/src/bdz.c @@ -214,7 +214,7 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t queue); static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t queue); static void ranking(bdz_config_data_t *bdz); -static cmph_uint32 rank(bdz_data_t *bdz, cmph_uint32 vertex); +static cmph_uint32 rank(cmph_uint8 b, cmph_uint32 * ranktable, cmph_uint8 * g, cmph_uint32 vertex); bdz_config_data_t *bdz_config_new() { @@ -553,22 +553,22 @@ cmph_uint32 bdz_search_ph(cmph_t *mphf, const char *key, cmph_uint32 keylen) return vertex; } -static inline cmph_uint32 rank(bdz_data_t *bdz, 
cmph_uint32 vertex) +static inline cmph_uint32 rank(cmph_uint8 b, cmph_uint32 * ranktable, cmph_uint8 * g, cmph_uint32 vertex) { - cmph_uint32 index = vertex >> bdz->b; - cmph_uint32 base_rank = bdz->ranktable[index]; - cmph_uint32 beg_idx_v = index << bdz->b; - cmph_uint32 beg_idx_b = beg_idx_v >> 2; - cmph_uint32 end_idx_b = vertex >> 2; + register cmph_uint32 index = vertex >> b; + register cmph_uint32 base_rank = ranktable[index]; + register cmph_uint32 beg_idx_v = index << b; + register cmph_uint32 beg_idx_b = beg_idx_v >> 2; + register cmph_uint32 end_idx_b = vertex >> 2; while(beg_idx_b < end_idx_b) { - base_rank += bdz_lookup_table[*(bdz->g + beg_idx_b++)]; + base_rank += bdz_lookup_table[*(g + beg_idx_b++)]; } beg_idx_v = beg_idx_b << 2; while(beg_idx_v < vertex) { - if(GETVALUE(bdz->g, beg_idx_v) != UNASSIGNED) base_rank++; + if(GETVALUE(g, beg_idx_v) != UNASSIGNED) base_rank++; beg_idx_v++; } @@ -577,15 +577,15 @@ static inline cmph_uint32 rank(bdz_data_t *bdz, cmph_uint32 vertex) cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { - bdz_data_t *bdz = mphf->data; + register cmph_uint32 vertex; + register bdz_data_t *bdz = mphf->data; cmph_uint32 hl[3]; hash_vector(bdz->hl, key, keylen, hl); - cmph_uint32 vertex; hl[0] = hl[0] % bdz->r; hl[1] = hl[1] % bdz->r + bdz->r; hl[2] = hl[2] % bdz->r + (bdz->r << 1); vertex = hl[(GETVALUE(bdz->g, hl[0]) + GETVALUE(bdz->g, hl[1]) + GETVALUE(bdz->g, hl[2])) % 3]; - return rank(bdz, vertex); + return rank(bdz->b, bdz->ranktable, bdz->g, vertex); } @@ -598,3 +598,107 @@ void bdz_destroy(cmph_t *mphf) free(data); free(mphf); } + +/** cmph_uint32 bdz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). 
+ * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint32 bdz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register cmph_uint32 vertex; + register bdz_data_t *bdz = mphf->data; + cmph_uint32 hl[3]; + + hash_vector(bdz->hl, key, keylen, hl); + memcpy(fingerprint, hl, sizeof(hl)); + hl[0] = hl[0] % bdz->r; + hl[1] = hl[1] % bdz->r + bdz->r; + hl[2] = hl[2] % bdz->r + (bdz->r << 1); + vertex = hl[(GETVALUE(bdz->g, hl[0]) + GETVALUE(bdz->g, hl[1]) + GETVALUE(bdz->g, hl[2])) % 3]; + return rank(bdz->b, bdz->ranktable, bdz->g, vertex); +} + +/** \fn void bdz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. 
The size of packed_mphf must be at least cmph_packed_size() + */ +void bdz_pack(cmph_t *mphf, void *packed_mphf) +{ + bdz_data_t *data = (bdz_data_t *)mphf->data; + cmph_uint32 * ptr = packed_mphf; + + // packing hl + hash_state_pack(data->hl, ptr); + + + ptr += (hash_state_packed_size(data->hl) >> 2); // (hash_state_packed_size(data->hl) / 4); + + // packing r + *ptr++ = data->r; + + // packing ranktablesize + *ptr++ = data->ranktablesize; + + // packing ranktable + memcpy(ptr, data->ranktable, sizeof(cmph_uint32)*(data->ranktablesize)); + ptr += data->ranktablesize; + + cmph_uint8 * ptr8 = (cmph_uint8 *) ptr; + + // packing b + *ptr8++ = data->b; + + // packing g + memcpy(ptr8, data->g, sizeof(cmph_uint8)*((data->n >> 2) +1)); +} + +/** \fn cmph_uint32 bdz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bdz_packed_size(cmph_t *mphf) +{ + bdz_data_t *data = (bdz_data_t *)mphf->data; + return (sizeof(CMPH_ALGO) + hash_state_packed_size(data->hl) + (sizeof(cmph_uint32) << 1) + sizeof(cmph_uint32)*(data->ranktablesize) + sizeof(cmph_uint8) + sizeof(cmph_uint8)*((data->n >> 2) +1)); +} + +/** cmph_uint32 bdz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. 
+ * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint32 vertex; + register cmph_uint32 *hl_ptr = (cmph_uint32 *)packed_mphf; + register cmph_uint32 hl_size = *hl_ptr; + register cmph_uint32 *ptr = hl_ptr + (hl_size >> 2); // h2_ptr + h2_size/4 + + register cmph_uint32 r = *ptr++; + register cmph_uint32 ranktablesize = *ptr++; + register cmph_uint32 *ranktable = ptr; + ptr += ranktablesize; + + register cmph_uint8 * g = (cmph_uint8 *)ptr; + register cmph_uint8 b = *g++; + + cmph_uint32 hl[3]; + hash_vector_packed(hl_ptr, key, keylen, hl); + hl[0] = hl[0] % r; + hl[1] = hl[1] % r + r; + hl[2] = hl[2] % r + (r << 1); + vertex = hl[(GETVALUE(g, hl[0]) + GETVALUE(g, hl[1]) + GETVALUE(g, hl[2])) % 3]; + return rank(b, ranktable, g, vertex); +} diff --git a/src/bdz.h b/src/bdz.h index 0b15e01..900ef6f 100755 --- a/src/bdz.h +++ b/src/bdz.h @@ -16,4 +16,42 @@ void bdz_load(FILE *f, cmph_t *mphf); int bdz_dump(cmph_t *mphf, FILE *f); void bdz_destroy(cmph_t *mphf); cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** cmph_uint32 bdz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. 
According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint32 bdz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + +/** \fn void bdz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bdz_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 bdz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bdz_packed_size(cmph_t *mphf); + +/** cmph_uint32 bdz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. 
+ * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + #endif diff --git a/src/bdz_ph.c b/src/bdz_ph.c index 3ca4700..4a76de6 100755 --- a/src/bdz_ph.c +++ b/src/bdz_ph.c @@ -478,21 +478,23 @@ void bdz_ph_load(FILE *f, cmph_t *mphf) bdz_ph->g = (cmph_uint8 *)calloc((bdz_ph->n/5)+1, sizeof(cmph_uint8)); fread(bdz_ph->g, ((bdz_ph->n/5)+1)*sizeof(cmph_uint8), 1, f); - #ifdef DEBUG +/* #ifdef DEBUG + cmph_uint32 i; fprintf(stderr, "G: "); for (i = 0; i < bdz_ph->n; ++i) fprintf(stderr, "%u ", GETVALUE(bdz_ph->g,i)); fprintf(stderr, "\n"); #endif +*/ return; } cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { - bdz_ph_data_t *bdz_ph = mphf->data; + register bdz_ph_data_t *bdz_ph = mphf->data; cmph_uint32 hl[3]; - cmph_uint8 byte0, byte1, byte2; - cmph_uint32 vertex; + register cmph_uint8 byte0, byte1, byte2; + register cmph_uint32 vertex; hash_vector(bdz_ph->hl, key, keylen,hl); hl[0] = hl[0] % bdz_ph->r; @@ -520,3 +522,114 @@ void bdz_ph_destroy(cmph_t *mphf) free(data); free(mphf); } + +/** cmph_uint32 bdz_ph_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. 
+ */ +cmph_uint32 bdz_ph_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register bdz_ph_data_t *bdz_ph = mphf->data; + cmph_uint32 hl[3]; + register cmph_uint8 byte0, byte1, byte2; + register cmph_uint32 vertex; + + hash_vector(bdz_ph->hl, key, keylen,hl); + memcpy(fingerprint, hl, sizeof(hl)); + + hl[0] = hl[0] % bdz_ph->r; + hl[1] = hl[1] % bdz_ph->r + bdz_ph->r; + hl[2] = hl[2] % bdz_ph->r + (bdz_ph->r << 1); + + byte0 = bdz_ph->g[hl[0]/5]; + byte1 = bdz_ph->g[hl[1]/5]; + byte2 = bdz_ph->g[hl[2]/5]; + + byte0 = lookup_table[hl[0]%5][byte0]; + byte1 = lookup_table[hl[1]%5][byte1]; + byte2 = lookup_table[hl[2]%5][byte2]; + vertex = hl[(byte0 + byte1 + byte2)%3]; + + return vertex; +} + +/** \fn void bdz_ph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bdz_ph_pack(cmph_t *mphf, void *packed_mphf) +{ + bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; + cmph_uint32 * ptr = packed_mphf; + + // packing hl + hash_state_pack(data->hl, ptr); + + + ptr += (hash_state_packed_size(data->hl) >> 2); // (hash_state_packed_size(data->hl) / 4); + + // packing r + *ptr++ = data->r; + + // packing g + memcpy(ptr, data->g, sizeof(cmph_uint8)*((data->n/5)+1)); +} + +/** \fn cmph_uint32 bdz_ph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. 
+ * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bdz_ph_packed_size(cmph_t *mphf) +{ + bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; + return (sizeof(CMPH_ALGO) + hash_state_packed_size(data->hl) + sizeof(cmph_uint32) + sizeof(cmph_uint8)*((data->n/5)+1)); +} + +/** cmph_uint32 bdz_ph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + + register cmph_uint32 *hl_ptr = (cmph_uint32 *)packed_mphf; + register cmph_uint32 hl_size = *hl_ptr; + register cmph_uint32 *ptr = hl_ptr + (hl_size >> 2); // h2_ptr + h2_size/4 + + register cmph_uint32 r = *ptr++; + register cmph_uint8 * g = (cmph_uint8 *)ptr; + + cmph_uint32 hl[3]; + register cmph_uint8 byte0, byte1, byte2; + register cmph_uint32 vertex; + + hash_vector_packed(hl_ptr, key, keylen, hl); + + hl[0] = hl[0] % r; + hl[1] = hl[1] % r + r; + hl[2] = hl[2] % r + (r << 1); + + byte0 = g[hl[0]/5]; + byte1 = g[hl[1]/5]; + byte2 = g[hl[2]/5]; + + byte0 = lookup_table[hl[0]%5][byte0]; + byte1 = lookup_table[hl[1]%5][byte1]; + byte2 = lookup_table[hl[2]%5][byte2]; + vertex = hl[(byte0 + byte1 + byte2)%3]; + + return vertex; +} diff --git a/src/bdz_ph.h b/src/bdz_ph.h index fc61e11..114db14 100755 --- a/src/bdz_ph.h +++ b/src/bdz_ph.h @@ -15,4 +15,42 @@ void bdz_ph_load(FILE *f, cmph_t *mphf); int bdz_ph_dump(cmph_t *mphf, FILE *f); void bdz_ph_destroy(cmph_t *mphf); cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** cmph_uint32 bdz_ph_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be 
a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint32 bdz_ph_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + +/** \fn void bdz_ph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bdz_ph_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 bdz_ph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bdz_ph_packed_size(cmph_t *mphf); + +/** cmph_uint32 bdz_ph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. 
+ * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key length in bytes + * \return The mphf value + */ +cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + #endif diff --git a/src/bmz.c b/src/bmz.c index ba972ff..ebab981 100644 --- a/src/bmz.c +++ b/src/bmz.c @@ -538,3 +538,96 @@ void bmz_destroy(cmph_t *mphf) free(data); free(mphf); } + +/** \fn cmph_uint32 bmz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., fingerprint should be a preallocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key length in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The fingerprint pointer should be + * a preallocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. Both hash functions write into fingerprint, so on return it holds the output of hashes[1]. + */ +cmph_uint32 bmz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + bmz_data_t *bmz = mphf->data; + cmph_uint32 h1, h2; + + hash_vector(bmz->hashes[0], key, keylen, fingerprint); + h1 = fingerprint[2] % bmz->n; + + hash_vector(bmz->hashes[1], key, keylen, fingerprint); + h2 = fingerprint[2] % bmz->n; + + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 >= bmz->n) h2 = 0; // wrap to 0: valid g indices are 0..n-1 (bmz_pack copies exactly n entries), so h2 == n must not be used + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, bmz->g[h1], bmz->g[h2], bmz->m); + return bmz->g[h1] + bmz->g[h2]; +} + +/** \fn void bmz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. 
+ * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bmz_pack(cmph_t *mphf, void *packed_mphf) +{ + bmz_data_t *data = (bmz_data_t *)mphf->data; + cmph_uint32 * ptr = packed_mphf; + + // packing h1 + hash_state_pack(data->hashes[0], ptr); + + ptr += (hash_state_packed_size(data->hashes[0]) >> 2); // (hash_state_packed_size(data->hashes[0]) / 4); + + // packing h2 + hash_state_pack(data->hashes[1], ptr); + ptr += (hash_state_packed_size(data->hashes[1]) >> 2); // (hash_state_packed_size(data->hashes[1]) / 4); + + // packing n + *ptr++ = data->n; + + // packing g + memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); +} + +/** \fn cmph_uint32 bmz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf: the algorithm tag, both packed hash states, n, and the n entries of g. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bmz_packed_size(cmph_t *mphf) +{ + bmz_data_t *data = (bmz_data_t *)mphf->data; + return (sizeof(CMPH_ALGO) + hash_state_packed_size(data->hashes[0]) + hash_state_packed_size(data->hashes[1]) + sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n); // each hash state is sized separately, mirroring the two advances in bmz_pack +} + +/** \fn cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. 
+ * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key length in bytes + * \return The mphf value + */ +cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint32 *h1_ptr = (cmph_uint32 *)packed_mphf; + register cmph_uint32 h1_size = *h1_ptr; + + register cmph_uint32 *h2_ptr = h1_ptr + (h1_size >> 2); // h1_ptr + h1_size/4 + register cmph_uint32 h2_size = *h2_ptr; + + register cmph_uint32 *g_ptr = h2_ptr + (h2_size >> 2); // h2_ptr + h2_size/4 + + register cmph_uint32 n = *g_ptr++; + + register cmph_uint32 h1 = hash_packed(h1_ptr, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, key, keylen) % n; + + if (h1 == h2 && ++h2 >= n) h2 = 0; // wrap to 0: bmz_pack stores exactly n entries of g, so index n is out of range + + return (g_ptr[h1] + g_ptr[h2]); +} diff --git a/src/bmz.h b/src/bmz.h index 2d444a0..1e65a71 100644 --- a/src/bmz.h +++ b/src/bmz.h @@ -15,4 +15,42 @@ void bmz_load(FILE *f, cmph_t *mphf); int bmz_dump(cmph_t *mphf, FILE *f); void bmz_destroy(cmph_t *mphf); cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn cmph_uint32 bmz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., fingerprint should be a preallocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key length in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The fingerprint pointer should be + * a preallocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. 
+ */ +cmph_uint32 bmz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + +/** \fn void bmz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bmz_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 bmz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bmz_packed_size(cmph_t *mphf); + +/** cmph_uint32 bmz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + #endif diff --git a/src/bmz8.c b/src/bmz8.c index f18322a..0979735 100644 --- a/src/bmz8.c +++ b/src/bmz8.c @@ -547,3 +547,97 @@ void bmz8_destroy(cmph_t *mphf) free(data); free(mphf); } + +/** cmph_uint8 bmz8_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. 
You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. Both hash functions write into fingerprint, so on return it holds the output of hashes[1]. + */ +cmph_uint8 bmz8_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + bmz8_data_t *bmz8 = mphf->data; + cmph_uint8 h1, h2; + + hash_vector(bmz8->hashes[0], key, keylen, fingerprint); + h1 = fingerprint[2] % bmz8->n; + + hash_vector(bmz8->hashes[1], key, keylen, fingerprint); + h2 = fingerprint[2] % bmz8->n; + + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 >= bmz8->n) h2 = 0; // wrap to 0: valid g indices are 0..n-1 (bmz8_pack copies exactly n entries), so h2 == n must not be used + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, bmz8->g[h1], bmz8->g[h2], bmz8->m); + return bmz8->g[h1] + bmz8->g[h2]; +} + +/** \fn void bmz8_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bmz8_pack(cmph_t *mphf, void *packed_mphf) +{ + bmz8_data_t *data = (bmz8_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing h1 + hash_state_pack(data->hashes[0], ptr); + + ptr += hash_state_packed_size(data->hashes[0]); + + // packing h2 + hash_state_pack(data->hashes[1], ptr); + ptr += hash_state_packed_size(data->hashes[1]); + + // packing n + *ptr++ = data->n; + + // packing g + memcpy(ptr, data->g, sizeof(cmph_uint8)*data->n); + +} + +/** \fn cmph_uint32 bmz8_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. 
+ * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bmz8_packed_size(cmph_t *mphf) +{ + bmz8_data_t *data = (bmz8_data_t *)mphf->data; + return (sizeof(CMPH_ALGO) + hash_state_packed_size(data->hashes[0]) + hash_state_packed_size(data->hashes[1]) + sizeof(cmph_uint8) + sizeof(cmph_uint8)*data->n); // each hash state is sized separately, mirroring the two advances in bmz8_pack +} + +/** \fn cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key length in bytes + * \return The mphf value + */ +cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint32 *h1_ptr = (cmph_uint32 *)packed_mphf; + register cmph_uint32 h1_size = *h1_ptr; + + register cmph_uint32 *h2_ptr = h1_ptr + (h1_size >> 2); // h1_ptr + h1_size/4; NOTE(review): assumes the packed hash size is a multiple of 4, since bmz8_pack advances by the exact byte count - confirm + register cmph_uint32 h2_size = *h2_ptr; + + register cmph_uint8 *g_ptr = (cmph_uint8 *)(h2_ptr + (h2_size >> 2)); // h2_ptr + h2_size/4 + + register cmph_uint8 n = *g_ptr++; + + register cmph_uint8 h1 = hash_packed(h1_ptr, key, keylen) % n; + register cmph_uint8 h2 = hash_packed(h2_ptr, key, keylen) % n; + + if (h1 == h2 && ++h2 >= n) h2 = 0; // wrap to 0: bmz8_pack stores exactly n entries of g, so index n is out of range + + return (g_ptr[h1] + g_ptr[h2]); +} diff --git a/src/bmz8.h b/src/bmz8.h index cb35cd4..66faec0 100644 --- a/src/bmz8.h +++ b/src/bmz8.h @@ -15,4 +15,42 @@ void bmz8_load(FILE *f, cmph_t *mphf); int bmz8_dump(cmph_t *mphf, FILE *f); void bmz8_destroy(cmph_t *mphf); cmph_uint8 bmz8_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn cmph_uint8 bmz8_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., fingerprint should be a preallocated area to fit three 4-byte integers). 
+ * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint8 bmz8_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + +/** \fn void bmz8_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bmz8_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 bmz8_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bmz8_packed_size(cmph_t *mphf); + +/** cmph_uint8 bmz8_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. 
+ * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + #endif diff --git a/src/brz.c b/src/brz.c index c8cf2c1..8e406ad 100755 --- a/src/brz.c +++ b/src/brz.c @@ -701,3 +701,52 @@ void brz_destroy(cmph_t *mphf) free(data); free(mphf); } + +/** cmph_uint32 brz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint32 brz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + return 0; +} + +/** \fn void brz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void brz_pack(cmph_t *mphf, void *packed_mphf) +{ +} + +/** \fn cmph_uint32 brz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. 
+ * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 brz_packed_size(cmph_t *mphf) +{ + return 0; +} + +/** cmph_uint32 brz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + return 0; +} + diff --git a/src/brz.h b/src/brz.h index 1f128dd..b01f32d 100644 --- a/src/brz.h +++ b/src/brz.h @@ -20,4 +20,42 @@ void brz_load(FILE *f, cmph_t *mphf); int brz_dump(cmph_t *mphf, FILE *f); void brz_destroy(cmph_t *mphf); cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** cmph_uint32 brz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint32 brz_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + +/** \fn void brz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. 
+ * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void brz_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 brz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 brz_packed_size(cmph_t *mphf); + +/** cmph_uint32 brz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + #endif diff --git a/src/chm.c b/src/chm.c index 6caedb5..0ed591f 100644 --- a/src/chm.c +++ b/src/chm.c @@ -292,3 +292,104 @@ void chm_destroy(cmph_t *mphf) free(data); free(mphf); } + + +/** cmph_uint32 chm_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. 
+ */ +cmph_uint32 chm_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + chm_data_t *chm = mphf->data; + cmph_uint32 h1, h2; + + hash_vector(chm->hashes[0], key, keylen, fingerprint); + h1 = fingerprint[2] % chm->n; + + hash_vector(chm->hashes[1], key, keylen, fingerprint); + h2 = fingerprint[2] % chm->n; + + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 >= chm->n) h2 = 0; + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, chm->g[h1], chm->g[h2], chm->m); + return (chm->g[h1] + chm->g[h2]) % chm->m; +} + +/** \fn void chm_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void chm_pack(cmph_t *mphf, void *packed_mphf) +{ + chm_data_t *data = (chm_data_t *)mphf->data; + cmph_uint32 * ptr = packed_mphf; + + // packing h1 + hash_state_pack(data->hashes[0], ptr); + + ptr += (hash_state_packed_size(data->hashes[0]) >> 2); // (hash_state_packed_size(data->hashes[0]) / 4); + + // packing h2 + hash_state_pack(data->hashes[1], ptr); + ptr += (hash_state_packed_size(data->hashes[1]) >> 2); // (hash_state_packed_size(data->hashes[1]) / 4); + + // packing n + *ptr++ = data->n; + + // packing m + *ptr++ = data->m; + + // packing g + memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); +} + +/** \fn cmph_uint32 chm_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. 
+ * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 chm_packed_size(cmph_t *mphf) +{ + chm_data_t *data = (chm_data_t *)mphf->data; + return (sizeof(CMPH_ALGO) + 2*hash_state_packed_size(data->hashes[0]) + 2*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n); +} + +/** \fn cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key length in bytes + * \return The mphf value + */ +cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint32 *h1_ptr = (cmph_uint32 *)packed_mphf; + register cmph_uint32 h1_size = *h1_ptr; + +// fprintf(stderr, "h1_size:%u\n", h1_size); + + register cmph_uint32 *h2_ptr = h1_ptr + (h1_size >> 2); // h1_ptr + h1_size/4 + register cmph_uint32 h2_size = *h2_ptr; +// fprintf(stderr, "h2_size:%u\n", h2_size); + + register cmph_uint32 *g_ptr = h2_ptr + (h2_size >> 2); // h2_ptr + h2_size/4 + + register cmph_uint32 n = *g_ptr++; + register cmph_uint32 m = *g_ptr++; + + register cmph_uint32 h1 = hash_packed(h1_ptr, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, key, keylen) % n; + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 >= n) h2 = 0; + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m); + return (g_ptr[h1] + g_ptr[h2]) % m; +} diff --git a/src/chm.h b/src/chm.h index 0f7ac3f..ea60839 100644 --- a/src/chm.h +++ b/src/chm.h @@ -15,4 +15,42 @@ void chm_load(FILE *f, cmph_t *mphf); int chm_dump(cmph_t *mphf, FILE *f); void chm_destroy(cmph_t *mphf); cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** cmph_uint32 chm_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint
of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint32 chm_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + +/** \fn void chm_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void chm_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 chm_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 chm_packed_size(cmph_t *mphf); + +/** cmph_uint32 chm_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. 
+ * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + #endif diff --git a/src/cmph.c b/src/cmph.c index 5518c56..7084765 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -490,7 +490,7 @@ cmph_t *cmph_load(FILE *f) cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { - DEBUGP("mphf algorithm: %u \n", mphf->algo); + DEBUGP("mphf algorithm: %u \n", mphf->algo); switch(mphf->algo) { case CMPH_CHM: @@ -520,6 +520,54 @@ cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) return 0; } + + +/** cmph_uint32 cmph_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. 
+ */ +cmph_uint32 cmph_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + DEBUGP("mphf algorithm: %u \n", mphf->algo); + switch(mphf->algo) + { + case CMPH_CHM: + return chm_search_fingerprint(mphf, key, keylen, fingerprint); + case CMPH_BMZ: /* included -- Fabiano */ + DEBUGP("bmz algorithm search\n"); + return bmz_search_fingerprint(mphf, key, keylen, fingerprint); + case CMPH_BMZ8: /* included -- Fabiano */ + DEBUGP("bmz8 algorithm search\n"); + return bmz8_search_fingerprint(mphf, key, keylen, fingerprint); + case CMPH_BRZ: /* included -- Fabiano */ + DEBUGP("brz algorithm search\n"); + return brz_search_fingerprint(mphf, key, keylen, fingerprint); + case CMPH_FCH: /* included -- Fabiano */ + DEBUGP("fch algorithm search\n"); + return fch_search_fingerprint(mphf, key, keylen, fingerprint); + case CMPH_BDZ: /* included -- Fabiano */ + DEBUGP("bdz algorithm search\n"); + return bdz_search_fingerprint(mphf, key, keylen, fingerprint); + case CMPH_BDZ_PH: /* included -- Fabiano */ + DEBUGP("bdz_ph algorithm search\n"); + return bdz_ph_search_fingerprint(mphf, key, keylen, fingerprint); + default: + assert(0); + } + assert(0); + return 0; +} + + + cmph_uint32 cmph_size(cmph_t *mphf) { return mphf->size; @@ -556,3 +604,106 @@ void cmph_destroy(cmph_t *mphf) assert(0); return; } + + +/** \fn void cmph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. 
The size of packed_mphf must be at least cmph_packed_size() + */ +void cmph_pack(cmph_t *mphf, void *packed_mphf) +{ + // packing algorithm type to be used in cmph.c + cmph_uint32 * ptr = (cmph_uint32 *) packed_mphf; + *ptr++ = mphf->algo; + DEBUGP("mphf->algo = %u\n", mphf->algo); + switch(mphf->algo) + { + case CMPH_CHM: + chm_pack(mphf, ptr); + break; + case CMPH_BMZ: /* included -- Fabiano */ + bmz_pack(mphf, ptr); + break; + case CMPH_BMZ8: /* included -- Fabiano */ + bmz8_pack(mphf, ptr); + break; + case CMPH_BRZ: /* included -- Fabiano */ + brz_pack(mphf, ptr); + break; + case CMPH_FCH: /* included -- Fabiano */ + fch_pack(mphf, ptr); + break; + case CMPH_BDZ: /* included -- Fabiano */ + bdz_pack(mphf, ptr); + break; + case CMPH_BDZ_PH: /* included -- Fabiano */ + bdz_ph_pack(mphf, ptr); + break; + default: + assert(0); + } + return; +} + +/** \fn cmph_uint32 cmph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 cmph_packed_size(cmph_t *mphf) +{ + switch(mphf->algo) + { + case CMPH_CHM: + return chm_packed_size(mphf); + case CMPH_BMZ: /* included -- Fabiano */ + return bmz_packed_size(mphf); + case CMPH_BMZ8: /* included -- Fabiano */ + return bmz8_packed_size(mphf); + case CMPH_BRZ: /* included -- Fabiano */ + return brz_packed_size(mphf); + case CMPH_FCH: /* included -- Fabiano */ + return fch_packed_size(mphf); + case CMPH_BDZ: /* included -- Fabiano */ + return bdz_packed_size(mphf); + case CMPH_BDZ_PH: /* included -- Fabiano */ + return bdz_ph_packed_size(mphf); + default: + assert(0); + } + return 0; // FAILURE +} + +/** cmph_uint32 cmph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. 
+ * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key length in bytes + * \return The mphf value + */ +cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf; +// fprintf(stderr, "algo:%u\n", *ptr); + switch(*ptr) + { + case CMPH_CHM: + return chm_search_packed(++ptr, key, keylen); + case CMPH_BMZ: /* included -- Fabiano */ + return bmz_search_packed(++ptr, key, keylen); + case CMPH_BMZ8: /* included -- Fabiano */ + return bmz8_search_packed(++ptr, key, keylen); + case CMPH_BRZ: /* included -- Fabiano */ + return brz_search_packed(++ptr, key, keylen); + case CMPH_FCH: /* included -- Fabiano */ + return fch_search_packed(++ptr, key, keylen); + case CMPH_BDZ: /* included -- Fabiano */ + return bdz_search_packed(++ptr, key, keylen); + case CMPH_BDZ_PH: /* included -- Fabiano */ + return bdz_ph_search_packed(++ptr, key, keylen); + default: + assert(0); + } + return 0; // FAILURE +} diff --git a/src/cmph.h b/src/cmph.h index 235a1e2..df9992a 100644 --- a/src/cmph.h +++ b/src/cmph.h @@ -51,7 +51,31 @@ void cmph_config_destroy(cmph_config_t *mph); /** Hash API **/ cmph_t *cmph_new(cmph_config_t *mph); + +/** cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + * \brief Computes the mphf value. + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key length in bytes + * \return The mphf value + */ cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** cmph_uint32 cmph_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., fingerprint should be a preallocated area to fit three 4-byte integers).
+ * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint32 cmph_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + + cmph_uint32 cmph_size(cmph_t *mphf); void cmph_destroy(cmph_t *mphf); @@ -59,6 +83,29 @@ void cmph_destroy(cmph_t *mphf); int cmph_dump(cmph_t *mphf, FILE *f); cmph_t *cmph_load(FILE *f); +/** \fn void cmph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void cmph_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 cmph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 cmph_packed_size(cmph_t *mphf); + +/** cmph_uint32 cmph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. 
+ * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + #ifdef __cplusplus } #endif diff --git a/src/fch.c b/src/fch.c index 4e56a1d..b3a19ce 100644 --- a/src/fch.c +++ b/src/fch.c @@ -410,3 +410,53 @@ void fch_destroy(cmph_t *mphf) free(data); free(mphf); } + +/** cmph_uint32 fch_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., figerprint should be a prealocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The figerprint pointer should be + * a prealocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. According to the application, just few bits can be enough, once mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint32 fch_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + return 0; +} + +/** \fn void fch_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void fch_pack(cmph_t *mphf, void *packed_mphf) +{ +} + +/** \fn cmph_uint32 fch_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. 
+ * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 fch_packed_size(cmph_t *mphf) +{ + return 0; +} + + +/** \fn cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key length in bytes + * \return The mphf value + */ +cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + return 0; +} + diff --git a/src/fch.h b/src/fch.h index 5ce2811..4b5d197 100644 --- a/src/fch.h +++ b/src/fch.h @@ -21,4 +21,42 @@ void fch_load(FILE *f, cmph_t *mphf); int fch_dump(cmph_t *mphf, FILE *f); void fch_destroy(cmph_t *mphf); cmph_uint32 fch_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** cmph_uint32 fch_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + * \brief Computes the mphf value and a fingerprint of 12 bytes (i.e., fingerprint should be a preallocated area to fit three 4-byte integers). + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key length in bytes + * \return The mphf value + * + * Computes the mphf value and a fingerprint of 12 bytes. The fingerprint pointer should be + * a preallocated area to fit three 4-byte integers. You don't need to use all the 12 bytes + * as fingerprint. Depending on the application, just a few bits can be enough, since the mphf does + * not allow collisions for the keys previously known. + */ +cmph_uint32 fch_search_fingerprint(cmph_t *mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint); + +/** \fn void fch_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
+ * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void fch_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 fch_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 fch_packed_size(cmph_t *mphf); + +/** cmph_uint32 fch_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + #endif diff --git a/src/hash.c b/src/hash.c index 7a754d8..60f630f 100644 --- a/src/hash.c +++ b/src/hash.c @@ -43,7 +43,7 @@ void hash_vector(hash_state_t *state, const char *key, cmph_uint32 keylen, cmph_ switch (state->hashfunc) { case CMPH_HASH_JENKINS: - jenkins_hash_vector((jenkins_state_t *)state, key, keylen, hashes); + jenkins_hash_vector_((jenkins_state_t *)state, key, keylen, hashes); break; default: assert(0); @@ -123,3 +123,96 @@ void hash_state_destroy(hash_state_t *state) } return; } + +/** \fn void hash_state_pack(hash_state_t *state, void *hash_packed); + * \brief Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. + * \param state points to the hash function + * \param hash_packed pointer to the contiguous memory area used to store the hash function. 
The size of hash_packed must be at least hash_state_packed_size() + */ +void hash_state_pack(hash_state_t *state, void *hash_packed) +{ + cmph_uint32 * ptr = (cmph_uint32 *)hash_packed; + cmph_uint32 * ptr_size = ptr++; + + // Reserve space for the hash function size + *ptr_size = 0; + + // Pack the hash function type + *ptr++ = state->hashfunc; + + switch (state->hashfunc) + { + case CMPH_HASH_JENKINS: + // pack the jenkins hash function + jenkins_state_pack((jenkins_state_t *)state, ptr); + *ptr_size = sizeof(cmph_uint32) + sizeof(CMPH_HASH) + jenkins_state_packed_size(); + break; + default: + assert(0); + } + return; +} + +/** \fn cmph_uint32 hash_state_packed_size(hash_state_t *state); + * \brief Return the amount of space needed to pack a hash function. + * \param state points to a hash function + * \return the size of the packed function or zero for failures + */ +cmph_uint32 hash_state_packed_size(hash_state_t *state) +{ + cmph_uint32 size = sizeof(cmph_uint32) + sizeof(CMPH_HASH); + switch (state->hashfunc) + { + case CMPH_HASH_JENKINS: + size += jenkins_state_packed_size(); + break; + default: + assert(0); + } + return size; +} + + +/** \fn cmph_uint32 hash_packed(void *hash_packed, const char *k, cmph_uint32 keylen) + * \param hash_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \return an integer that represents a hash value of 32 bits. 
+ */ +cmph_uint32 hash_packed(void *hash_packed, const char *k, cmph_uint32 keylen) +{ + register cmph_uint32 * ptr = (((cmph_uint32 *) hash_packed) + 1); + + register CMPH_HASH hashfunc = *ptr++; + + switch (hashfunc) + { + case CMPH_HASH_JENKINS: + return jenkins_hash_packed(ptr, k, keylen); + default: + assert(0); + } + assert(0); + return 0; +} + +/** \fn hash_vector_packed(void *hash_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + * \param hash_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. + */ +void hash_vector_packed(void *hash_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) +{ + cmph_uint32 * ptr = (((cmph_uint32 *) hash_packed) + 1); + + CMPH_HASH hashfunc = *ptr++; + switch (hashfunc) + { + case CMPH_HASH_JENKINS: + jenkins_hash_vector_packed(ptr, k, keylen, hashes); + break; + default: + assert(0); + } +} diff --git a/src/hash.h b/src/hash.h index 092fe1c..62711a5 100644 --- a/src/hash.h +++ b/src/hash.h @@ -31,4 +31,35 @@ hash_state_t *hash_state_load(const char *buf, cmph_uint32 buflen); void hash_state_destroy(hash_state_t *state); +/** \fn void hash_state_pack(hash_state_t *state, void *hash_packed); + * \brief Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. + * \param state points to the hash function + * \param hash_packed pointer to the contiguous memory area used to store the hash function. The size of hash_packed must be at least hash_state_packed_size() + */ +void hash_state_pack(hash_state_t *state, void *hash_packed); + +/** \fn cmph_uint32 hash_state_packed_size(hash_state_t *state); + * \brief Return the amount of space needed to pack a hash function. 
+ * \param state points to a hash function + * \return the size of the packed function or zero for failures + */ +cmph_uint32 hash_state_packed_size(hash_state_t *state); + + +/** \fn cmph_uint32 hash_packed(void *hash_packed, const char *k, cmph_uint32 keylen); + * \param hash_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \return an integer that represents a hash value of 32 bits. + */ +cmph_uint32 hash_packed(void *hash_packed, const char *k, cmph_uint32 keylen); + +/** \fn hash_vector_packed(void *hash_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + * \param hash_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. + */ +void hash_vector_packed(void *hash_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + #endif diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c index 934dd0c..373d89f 100644 --- a/src/jenkins_hash.c +++ b/src/jenkins_hash.c @@ -96,78 +96,16 @@ void jenkins_state_destroy(jenkins_state_t *state) free(state); } -cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen) + +inline void __jenkins_hash_vector(cmph_uint32 seed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) { - cmph_uint32 a, b, c; - cmph_uint32 len, length; - - /* Set up the internal state */ - length = keylen; - len = length; - a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ - c = state->seed; /* the previous hash value - seed in our case */ - - /*---------------------------------------- handle most of the key */ - while (len >= 12) - { - a += (k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); - b += (k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24)); - c += (k[8] +((cmph_uint32)k[9]<<8) 
+((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24)); - mix(a,b,c); - k += 12; len -= 12; - } - - /*------------------------------------- handle the last 11 bytes */ - c += length; - switch(len) /* all the case statements fall through */ - { - case 11: - c +=((cmph_uint32)k[10]<<24); - case 10: - c +=((cmph_uint32)k[9]<<16); - case 9 : - c +=((cmph_uint32)k[8]<<8); - /* the first byte of c is reserved for the length */ - case 8 : - b +=((cmph_uint32)k[7]<<24); - case 7 : - b +=((cmph_uint32)k[6]<<16); - case 6 : - b +=((cmph_uint32)k[5]<<8); - case 5 : - b +=k[4]; - case 4 : - a +=((cmph_uint32)k[3]<<24); - case 3 : - a +=((cmph_uint32)k[2]<<16); - case 2 : - a +=((cmph_uint32)k[1]<<8); - case 1 : - a +=k[0]; - /* case 0: nothing left to add */ - } - - mix(a,b,c); - - /*-------------------------------------------- report the result */ - - //c = (c & hashmask(state->size)); - //c = (c >= state->size) ? c ^ state->size: c; - - //state->last_hash = c; Do not update last_hash because we use a fixed - //seed - return c; -} - -void jenkins_hash_vector(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) -{ - cmph_uint32 len, length; + register cmph_uint32 len, length; /* Set up the internal state */ length = keylen; len = length; hashes[0] = hashes[1] = 0x9e3779b9; /* the golden ratio; an arbitrary value */ - hashes[2] = state->seed; /* the previous hash value - seed in our case */ + hashes[2] = seed; /* the previous hash value - seed in our case */ /*---------------------------------------- handle most of the key */ while (len >= 12) @@ -212,6 +150,73 @@ void jenkins_hash_vector(jenkins_state_t *state, const char *k, cmph_uint32 keyl mix(hashes[0],hashes[1],hashes[2]); } +cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen) +{ + cmph_uint32 hashes[3]; + __jenkins_hash_vector(state->seed, k, keylen, hashes); + return hashes[2]; +/* cmph_uint32 a, b, c; + cmph_uint32 len, length; + + // Set up the internal state + 
length = keylen; + len = length; + a = b = 0x9e3779b9; // the golden ratio; an arbitrary value + c = state->seed; // the previous hash value - seed in our case + + // handle most of the key + while (len >= 12) + { + a += (k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); + b += (k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24)); + c += (k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24)); + mix(a,b,c); + k += 12; len -= 12; + } + + // handle the last 11 bytes + c += length; + switch(len) /// all the case statements fall through + { + case 11: + c +=((cmph_uint32)k[10]<<24); + case 10: + c +=((cmph_uint32)k[9]<<16); + case 9 : + c +=((cmph_uint32)k[8]<<8); + // the first byte of c is reserved for the length + case 8 : + b +=((cmph_uint32)k[7]<<24); + case 7 : + b +=((cmph_uint32)k[6]<<16); + case 6 : + b +=((cmph_uint32)k[5]<<8); + case 5 : + b +=k[4]; + case 4 : + a +=((cmph_uint32)k[3]<<24); + case 3 : + a +=((cmph_uint32)k[2]<<16); + case 2 : + a +=((cmph_uint32)k[1]<<8); + case 1 : + a +=k[0]; + // case 0: nothing left to add + } + + mix(a,b,c); + + /// report the result + + return c; + */ +} + +void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) +{ + __jenkins_hash_vector(state->seed, k, keylen, hashes); +} + void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen) { *buflen = sizeof(cmph_uint32); @@ -242,3 +247,51 @@ jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen) DEBUGP("Loaded jenkins state with seed %u\n", state->seed); return state; } + + +/** \fn void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed); + * \brief Support the ability to pack a jenkins function into a preallocated contiguous memory space pointed by jenkins_packed. 
+ * \param state points to the jenkins function + * \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. The size of jenkins_packed must be at least jenkins_state_packed_size() + */ +void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed) +{ + if (state && jenkins_packed) + { + memcpy(jenkins_packed, &(state->seed), sizeof(cmph_uint32)); + } +} + +/** \fn cmph_uint32 jenkins_state_packed_size(); + * \brief Return the amount of space needed to pack a jenkins function. + * \return the size of the packed function or zero for failures + */ +cmph_uint32 jenkins_state_packed_size() +{ + return sizeof(cmph_uint32); +} + + +/** \fn cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen); + * \param jenkins_packed is a pointer to a contiguous memory area + * \param k is a pointer to a key + * \param keylen is the key length + * \return an integer that represents a hash value of 32 bits. + */ +cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen) +{ + cmph_uint32 hashes[3]; + __jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), k, keylen, hashes); + return hashes[2]; +} + +/** \fn void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + * \param jenkins_packed is a pointer to a contiguous memory area + * \param k is a pointer to a key + * \param keylen is the key length + * \param hashes is a pointer to a memory area large enough to fit three 32-bit integers.
+ */ +void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) +{ + __jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), k, keylen, hashes); +} diff --git a/src/jenkins_hash.h b/src/jenkins_hash.h index df04627..fb422d4 100644 --- a/src/jenkins_hash.h +++ b/src/jenkins_hash.h @@ -19,17 +19,47 @@ jenkins_state_t *jenkins_state_new(cmph_uint32 size); //size of hash table */ cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen); -/** \fn void jenkins_hash_vector(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); +/** \fn void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); * \param state is a pointer to a jenkins_state_t structure * \param key is a pointer to a key * \param keylen is the key length * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. */ -void jenkins_hash_vector(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); +void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen); jenkins_state_t *jenkins_state_copy(jenkins_state_t *src_state); jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen); void jenkins_state_destroy(jenkins_state_t *state); +/** \fn void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed); + * \brief Support the ability to pack a jenkins function into a preallocated contiguous memory space pointed by jenkins_packed. + * \param state points to the jenkins function + * \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. 
The size of jenkins_packed must be at least jenkins_state_packed_size()
+ */
+void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed);
+
+/** \fn cmph_uint32 jenkins_state_packed_size(void);
+ * \brief Return the amount of space needed to pack a jenkins function.
+ * \return the size of the packed function, i.e. sizeof(cmph_uint32)
+ */
+cmph_uint32 jenkins_state_packed_size(void);
+
+
+/** \fn cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen);
+ * \param jenkins_packed is a pointer to a contiguous memory area
+ * \param k is a pointer to a key
+ * \param keylen is the key length
+ * \return an integer that represents a hash value of 32 bits.
+ */
+cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen);
+
+/** \fn void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes);
+ * \param jenkins_packed is a pointer to a contiguous memory area
+ * \param k is a pointer to a key
+ * \param keylen is the key length
+ * \param hashes is a pointer to a memory large enough to fit three 32-bit integers.
+ */ +void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + #endif diff --git a/src/main.c b/src/main.c index a39dbd4..fbf3806 100644 --- a/src/main.c +++ b/src/main.c @@ -14,7 +14,7 @@ #include "hash.h" #ifdef WIN32 -#define VERSION "0.2" +#define VERSION "0.8" #else #include "config.h" #endif @@ -305,6 +305,7 @@ int main(int argc, char **argv) } source->dispose(source->data, buf, buflen); } + cmph_destroy(mphf); free(hashtable); } diff --git a/tests/Makefile.am b/tests/Makefile.am index 50cfcd6..5242d4e 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,4 +1,15 @@ -noinst_PROGRAMS = graph_tests +noinst_PROGRAMS = graph_tests packed_mphf_tests mphf_tests mphf_fingerprint_tests + +INCLUDES = -I../src/ graph_tests_SOURCES = graph_tests.c graph_tests_LDADD = ../src/libcmph.la + +packed_mphf_tests_SOURCES = packed_mphf_tests.c +packed_mphf_tests_LDADD = ../src/libcmph.la + +mphf_tests_SOURCES = mphf_tests.c +mphf_tests_LDADD = ../src/libcmph.la + +mphf_fingerprint_tests_SOURCES = mphf_fingerprint_tests.c +mphf_fingerprint_tests_LDADD = ../src/libcmph.la diff --git a/tests/mphf_fingerprint_tests.c b/tests/mphf_fingerprint_tests.c new file mode 100644 index 0000000..671b453 --- /dev/null +++ b/tests/mphf_fingerprint_tests.c @@ -0,0 +1,162 @@ +#ifdef WIN32 +#include "../wingetopt.h" +#else +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef WIN32 +#define VERSION "0.8" +#else +#include "config.h" +#endif + + +void usage(const char *prg) +{ + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-m file.mph] keysfile\n", prg); +} +void usage_long(const char *prg) +{ + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "Packed MPHFs testing tool\n\n"); + fprintf(stderr, " -h\t print this help message\n"); + fprintf(stderr, " -V\t print version number and exit\n"); + fprintf(stderr, " -v\t increase 
verbosity (may be used multiple times)\n");
+	fprintf(stderr, " -k\t number of keys\n");
+	fprintf(stderr, " -m\t minimum perfect hash function file \n");
+	fprintf(stderr, " keysfile\t line separated file with keys\n");
+}
+
+int main(int argc, char **argv)
+{
+	char verbosity = 0;
+	char *mphf_file = NULL;
+	const char *keys_file = NULL;
+	FILE *mphf_fd = stdout;
+	FILE *keys_fd;
+	cmph_uint32 nkeys = UINT_MAX;
+	cmph_uint32 i = 0;
+	cmph_t *mphf = NULL;
+	cmph_io_adapter_t *source;
+	cmph_uint32 fingerprint[3];
+	while (1)
+	{
+		int ch = getopt(argc, argv, "hVvk:m:"); /* getopt() returns int; a plain char may be unsigned and never equal -1 */
+		if (ch == -1) break;
+		switch (ch)
+		{
+			case 'k':
+				{
+					char *endptr;
+					nkeys = strtoul(optarg, &endptr, 10);
+					if(*endptr != 0) {
+						fprintf(stderr, "Invalid number of keys %s\n", optarg);
+						exit(1);
+					}
+				}
+				break;
+			case 'm':
+				mphf_file = strdup(optarg);
+				break;
+			case 'v':
+				++verbosity;
+				break;
+			case 'V':
+				printf("%s\n", VERSION);
+				return 0;
+			case 'h':
+				usage_long(argv[0]);
+				return 0;
+			default:
+				usage(argv[0]);
+				return 1;
+		}
+	}
+
+	if (optind != argc - 1)
+	{
+		usage(argv[0]);
+		return 1;
+	}
+	keys_file = argv[optind];
+
+	int ret = 0;
+	if (mphf_file == NULL)
+	{
+		mphf_file = (char *)malloc(strlen(keys_file) + 5); /* room for ".mph" plus the terminator */
+		memcpy(mphf_file, keys_file, strlen(keys_file));
+		memcpy(mphf_file + strlen(keys_file), ".mph\0", 5);
+	}
+
+	keys_fd = fopen(keys_file, "r");
+
+	if (keys_fd == NULL)
+	{
+		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
+		return -1;
+	}
+
+	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
+	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
+
+	cmph_uint8 * hashtable = NULL;
+	mphf_fd = fopen(mphf_file, "r");
+	if (mphf_fd == NULL)
+	{
+		fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
+		free(mphf_file);
+		return -1;
+	}
+	mphf = cmph_load(mphf_fd);
+	fclose(mphf_fd);
+	if (!mphf)
+	{
+		fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
+		free(mphf_file);
+		return -1;
+	}
+	cmph_uint32 siz = cmph_size(mphf);
+	hashtable = (cmph_uint8*)malloc(siz*sizeof(cmph_uint8));
+	memset(hashtable, 0, siz);
+	// check all keys: each one must map to a distinct slot below siz
+	for (i = 0; i < source->nkeys; ++i)
+	{
+		cmph_uint32 h;
+		char *buf;
+		cmph_uint32 buflen = 0; /* passed as the (int) precision of %.*s below */
+		source->read(source->data, &buf, &buflen);
+		h = cmph_search_fingerprint(mphf, buf, buflen, fingerprint);
+		if (!(h < siz))
+		{
+			fprintf(stderr, "Unknown key %.*s in the input.\n", (int)buflen, buf);
+			ret = 1;
+		} else if(hashtable[h])
+		{
+			fprintf(stderr, "Duplicated or unknown key %.*s in the input\n", (int)buflen, buf);
+			ret = 1;
+		} else hashtable[h] = 1;
+
+		if (verbosity)
+		{
+			printf("%s -> %u -- fingerprint: %u %u %u\n", buf, h, fingerprint[0], fingerprint[1], fingerprint[2]);
+		}
+		source->dispose(source->data, buf, buflen);
+	}
+
+	cmph_destroy(mphf);
+	free(hashtable);
+
+	fclose(keys_fd);
+	free(mphf_file);
+	cmph_io_nlfile_adapter_destroy(source);
+	return ret;
+
+}
diff --git a/tests/mphf_tests.c b/tests/mphf_tests.c
new file mode 100644
index 0000000..74486f3
--- /dev/null
+++ b/tests/mphf_tests.c
@@ -0,0 +1,161 @@
+#ifdef WIN32
+#include "../wingetopt.h"
+#else
+#include
+#endif
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef WIN32
+#define VERSION "0.8"
+#else
+#include "config.h"
+#endif
+
+
+void usage(const char *prg)
+{
+	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-m file.mph] keysfile\n", prg);
+}
+void usage_long(const char *prg)
+{
+	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-m file.mph] keysfile\n", prg);
+	fprintf(stderr, "Packed MPHFs testing tool\n\n");
+	fprintf(stderr, " -h\t print this help message\n");
+	fprintf(stderr, " -V\t print version number and exit\n");
+	fprintf(stderr, " -v\t increase verbosity (may be used multiple times)\n");
+	fprintf(stderr, " -k\t number of keys\n");
+	fprintf(stderr, " -m\t minimum perfect hash function file \n");
+	fprintf(stderr, " keysfile\t line separated file with keys\n");
+}
+
+int main(int 
argc, char **argv)
+{
+	char verbosity = 0;
+	char *mphf_file = NULL;
+	const char *keys_file = NULL;
+	FILE *mphf_fd = stdout;
+	FILE *keys_fd;
+	cmph_uint32 nkeys = UINT_MAX;
+	cmph_uint32 i = 0;
+	cmph_t *mphf = NULL;
+	cmph_io_adapter_t *source;
+	while (1)
+	{
+		int ch = getopt(argc, argv, "hVvk:m:"); /* getopt() returns int; a plain char may be unsigned and never equal -1 */
+		if (ch == -1) break;
+		switch (ch)
+		{
+			case 'k':
+				{
+					char *endptr;
+					nkeys = strtoul(optarg, &endptr, 10);
+					if(*endptr != 0) {
+						fprintf(stderr, "Invalid number of keys %s\n", optarg);
+						exit(1);
+					}
+				}
+				break;
+			case 'm':
+				mphf_file = strdup(optarg);
+				break;
+			case 'v':
+				++verbosity;
+				break;
+			case 'V':
+				printf("%s\n", VERSION);
+				return 0;
+			case 'h':
+				usage_long(argv[0]);
+				return 0;
+			default:
+				usage(argv[0]);
+				return 1;
+		}
+	}
+
+	if (optind != argc - 1)
+	{
+		usage(argv[0]);
+		return 1;
+	}
+	keys_file = argv[optind];
+
+	int ret = 0;
+	if (mphf_file == NULL)
+	{
+		mphf_file = (char *)malloc(strlen(keys_file) + 5); /* room for ".mph" plus the terminator */
+		memcpy(mphf_file, keys_file, strlen(keys_file));
+		memcpy(mphf_file + strlen(keys_file), ".mph\0", 5);
+	}
+
+	keys_fd = fopen(keys_file, "r");
+
+	if (keys_fd == NULL)
+	{
+		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
+		return -1;
+	}
+
+	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
+	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
+
+	cmph_uint8 * hashtable = NULL;
+	mphf_fd = fopen(mphf_file, "r");
+	if (mphf_fd == NULL)
+	{
+		fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
+		free(mphf_file);
+		return -1;
+	}
+	mphf = cmph_load(mphf_fd);
+	fclose(mphf_fd);
+	if (!mphf)
+	{
+		fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
+		free(mphf_file);
+		return -1;
+	}
+	cmph_uint32 siz = cmph_size(mphf);
+	hashtable = (cmph_uint8*)malloc(siz*sizeof(cmph_uint8));
+	memset(hashtable, 0, siz);
+	// check all keys: each one must map to a distinct slot below siz
+	for (i = 0; i < source->nkeys; ++i)
+	{
+		cmph_uint32 h;
+		char *buf;
+		cmph_uint32 buflen = 0; /* passed as the (int) precision of %.*s below */
+		source->read(source->data, &buf, &buflen);
+		h = cmph_search(mphf, buf, buflen);
+		if (!(h < siz))
+		{
+			fprintf(stderr, "Unknown key %.*s in the input.\n", (int)buflen, buf);
+			ret = 1;
+		} else if(hashtable[h])
+		{
+			fprintf(stderr, "Duplicated or unknown key %.*s in the input\n", (int)buflen, buf);
+			ret = 1;
+		} else hashtable[h] = 1;
+
+		if (verbosity)
+		{
+			printf("%s -> %u\n", buf, h);
+		}
+		source->dispose(source->data, buf, buflen);
+	}
+
+	cmph_destroy(mphf);
+	free(hashtable);
+
+	fclose(keys_fd);
+	free(mphf_file);
+	cmph_io_nlfile_adapter_destroy(source);
+	return ret;
+
+}
diff --git a/tests/packed_mphf_tests.c b/tests/packed_mphf_tests.c
new file mode 100644
index 0000000..429e507
--- /dev/null
+++ b/tests/packed_mphf_tests.c
@@ -0,0 +1,177 @@
+#ifdef WIN32
+#include "../wingetopt.h"
+#else
+#include
+#endif
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+//#include "hash.h"
+
+#ifdef WIN32
+#define VERSION "0.8"
+#else
+#include "config.h"
+#endif
+
+
+void usage(const char *prg)
+{
+	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-m file.mph] keysfile\n", prg);
+}
+void usage_long(const char *prg)
+{
+	fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-m file.mph] keysfile\n", prg);
+	fprintf(stderr, "Packed MPHFs testing tool\n\n");
+	fprintf(stderr, " -h\t print this help message\n");
+	fprintf(stderr, " -V\t print version number and exit\n");
+	fprintf(stderr, " -v\t increase verbosity (may be used multiple times)\n");
+	fprintf(stderr, " -k\t number of keys\n");
+	fprintf(stderr, " -m\t minimum perfect hash function file \n");
+	fprintf(stderr, " keysfile\t line separated file with keys\n");
+}
+
+int main(int argc, char **argv)
+{
+	char verbosity = 0;
+	char *mphf_file = NULL;
+	const char *keys_file = NULL;
+	FILE *mphf_fd = stdout;
+	FILE *keys_fd;
+	cmph_uint32 nkeys = UINT_MAX;
+	cmph_uint32 i = 0;
+	cmph_t *mphf = NULL;
+	cmph_io_adapter_t *source;
+	while (1)
+	{
+		int ch = getopt(argc, argv, "hVvk:m:"); /* getopt() returns int; a plain char may be unsigned and never equal -1 */
+		if (ch == -1) break;
+		switch (ch)
+		{
+			case 'k':
+				{
+					char *endptr;
+					nkeys = strtoul(optarg, &endptr, 10);
+					if(*endptr != 0) {
+						fprintf(stderr, "Invalid number of keys %s\n", optarg);
+						exit(1);
+					}
+				}
+				break;
+			case 'm':
+				mphf_file = strdup(optarg);
+				break;
+			case 'v':
+				++verbosity;
+				break;
+			case 'V':
+				printf("%s\n", VERSION);
+				return 0;
+			case 'h':
+				usage_long(argv[0]);
+				return 0;
+			default:
+				usage(argv[0]);
+				return 1;
+		}
+	}
+
+	if (optind != argc - 1)
+	{
+		usage(argv[0]);
+		return 1;
+	}
+	keys_file = argv[optind];
+
+	int ret = 0;
+	if (mphf_file == NULL)
+	{
+		mphf_file = (char *)malloc(strlen(keys_file) + 5); /* room for ".mph" plus the terminator */
+		memcpy(mphf_file, keys_file, strlen(keys_file));
+		memcpy(mphf_file + strlen(keys_file), ".mph\0", 5);
+	}
+
+	keys_fd = fopen(keys_file, "r");
+
+	if (keys_fd == NULL)
+	{
+		fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno));
+		return -1;
+	}
+
+	if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd);
+	else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys);
+
+	cmph_uint8 * hashtable = NULL;
+	mphf_fd = fopen(mphf_file, "r");
+	if (mphf_fd == NULL)
+	{
+		fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno));
+		free(mphf_file);
+		return -1;
+	}
+	mphf = cmph_load(mphf_fd);
+	fclose(mphf_fd);
+	if (!mphf)
+	{
+		fprintf(stderr, "Unable to parser input file %s\n", mphf_file);
+		free(mphf_file);
+		return -1;
+	}
+	cmph_uint32 siz = cmph_size(mphf);
+	hashtable = (cmph_uint8*)malloc(siz*sizeof(cmph_uint8));
+	memset(hashtable, 0, siz);
+
+	// packing the function
+	/* Determine how much space is needed to pack the mphf. */
+	cmph_uint32 packed_size = cmph_packed_size(mphf);
+	fprintf(stderr, "packed_size = %u\n", packed_size);
+
+	/* Make sure that we have enough space to pack the mphf. */
+	cmph_uint8 * packed_mphf = calloc(packed_size,1);
+
+	/* Pack the mphf. */
+	cmph_pack(mphf, packed_mphf);
+
+	// testing the packed function
+	// check all keys: each one must map to a distinct slot below siz
+	for (i = 0; i < source->nkeys; ++i)
+	{
+		cmph_uint32 h;
+		char *buf;
+		cmph_uint32 buflen = 0; /* passed as the (int) precision of %.*s below */
+		source->read(source->data, &buf, &buflen);
+		h = cmph_search_packed(packed_mphf, buf, buflen);
+
+		if (!(h < siz))
+		{
+			fprintf(stderr, "Unknown key %.*s in the input.\n", (int)buflen, buf);
+			ret = 1;
+		} else if(hashtable[h])
+		{
+			fprintf(stderr, "Duplicated or unknown key %.*s in the input\n", (int)buflen, buf);
+			ret = 1;
+		} else hashtable[h] = 1;
+
+		if (verbosity)
+		{
+			printf("%s -> %u\n", buf, h);
+		}
+		source->dispose(source->data, buf, buflen);
+	}
+
+	free(packed_mphf);
+	cmph_destroy(mphf);
+	free(hashtable);
+
+	fclose(keys_fd);
+	free(mphf_file);
+	cmph_io_nlfile_adapter_destroy(source);
+	return ret;
+
+}