compressed hash and displace method added
This commit is contained in:
parent
83ce8f171a
commit
79d250d152
|
@ -60,7 +60,8 @@ libcmph_la_LIBADD =
|
|||
am_libcmph_la_OBJECTS = hash.lo jenkins_hash.lo vstack.lo vqueue.lo \
|
||||
graph.lo cmph.lo cmph_structs.lo chm.lo bmz.lo bmz8.lo bdz.lo \
|
||||
bdz_ph.lo buffer_manager.lo buffer_entry.lo brz.lo fch.lo \
|
||||
fch_buckets.lo select.lo compressed_seq.lo
|
||||
fch_buckets.lo select.lo compressed_seq.lo chd_ph.lo \
|
||||
miller_rabin.lo
|
||||
libcmph_la_OBJECTS = $(am_libcmph_la_OBJECTS)
|
||||
libcmph_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
||||
|
@ -207,7 +208,8 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\
|
|||
chm.c bmz.c bmz8.c bdz.c bdz_ph.c\
|
||||
buffer_manager.c buffer_entry.c\
|
||||
brz.c fch.c fch_buckets.c \
|
||||
select.c compressed_seq.c
|
||||
select.c compressed_seq.c \
|
||||
chd_ph.c miller_rabin.c
|
||||
|
||||
libcmph_la_LDFLAGS = -version-info 0:0:0
|
||||
cmph_SOURCES = main.c wingetopt.h wingetopt.c
|
||||
|
@ -319,6 +321,7 @@ include ./$(DEPDIR)/bmz8.Plo
|
|||
include ./$(DEPDIR)/brz.Plo
|
||||
include ./$(DEPDIR)/buffer_entry.Plo
|
||||
include ./$(DEPDIR)/buffer_manager.Plo
|
||||
include ./$(DEPDIR)/chd_ph.Plo
|
||||
include ./$(DEPDIR)/chm.Plo
|
||||
include ./$(DEPDIR)/cmph.Plo
|
||||
include ./$(DEPDIR)/cmph_structs.Plo
|
||||
|
@ -329,6 +332,7 @@ include ./$(DEPDIR)/graph.Plo
|
|||
include ./$(DEPDIR)/hash.Plo
|
||||
include ./$(DEPDIR)/jenkins_hash.Plo
|
||||
include ./$(DEPDIR)/main.Po
|
||||
include ./$(DEPDIR)/miller_rabin.Plo
|
||||
include ./$(DEPDIR)/select.Plo
|
||||
include ./$(DEPDIR)/vqueue.Plo
|
||||
include ./$(DEPDIR)/vstack.Plo
|
||||
|
|
|
@ -7,7 +7,8 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\
|
|||
chm.c bmz.c bmz8.c bdz.c bdz_ph.c\
|
||||
buffer_manager.c buffer_entry.c\
|
||||
brz.c fch.c fch_buckets.c \
|
||||
select.c compressed_seq.c
|
||||
select.c compressed_seq.c \
|
||||
chd_ph.c miller_rabin.c
|
||||
|
||||
libcmph_la_LDFLAGS = -version-info 0:0:0
|
||||
|
||||
|
|
|
@ -60,7 +60,8 @@ libcmph_la_LIBADD =
|
|||
am_libcmph_la_OBJECTS = hash.lo jenkins_hash.lo vstack.lo vqueue.lo \
|
||||
graph.lo cmph.lo cmph_structs.lo chm.lo bmz.lo bmz8.lo bdz.lo \
|
||||
bdz_ph.lo buffer_manager.lo buffer_entry.lo brz.lo fch.lo \
|
||||
fch_buckets.lo select.lo compressed_seq.lo
|
||||
fch_buckets.lo select.lo compressed_seq.lo chd_ph.lo \
|
||||
miller_rabin.lo
|
||||
libcmph_la_OBJECTS = $(am_libcmph_la_OBJECTS)
|
||||
libcmph_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
||||
|
@ -207,7 +208,8 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\
|
|||
chm.c bmz.c bmz8.c bdz.c bdz_ph.c\
|
||||
buffer_manager.c buffer_entry.c\
|
||||
brz.c fch.c fch_buckets.c \
|
||||
select.c compressed_seq.c
|
||||
select.c compressed_seq.c \
|
||||
chd_ph.c miller_rabin.c
|
||||
|
||||
libcmph_la_LDFLAGS = -version-info 0:0:0
|
||||
cmph_SOURCES = main.c wingetopt.h wingetopt.c
|
||||
|
@ -319,6 +321,7 @@ distclean-compile:
|
|||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/brz.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffer_entry.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/buffer_manager.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chd_ph.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chm.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cmph.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cmph_structs.Plo@am__quote@
|
||||
|
@ -329,6 +332,7 @@ distclean-compile:
|
|||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/hash.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/jenkins_hash.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/main.Po@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/miller_rabin.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/select.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vqueue.Plo@am__quote@
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/vstack.Plo@am__quote@
|
||||
|
|
|
@ -484,13 +484,6 @@ void bdz_ph_load(FILE *f, cmph_t *mphf)
|
|||
bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8));
|
||||
nbytes = fread(bdz_ph->g, sizeg*sizeof(cmph_uint8), (size_t)1, f);
|
||||
|
||||
/* #ifdef DEBUG
|
||||
cmph_uint32 i;
|
||||
fprintf(stderr, "G: ");
|
||||
for (i = 0; i < bdz_ph->n; ++i) fprintf(stderr, "%u ", GETVALUE(bdz_ph->g,i));
|
||||
fprintf(stderr, "\n");
|
||||
#endif
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -105,6 +105,10 @@ void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd)
|
|||
/**
 * \brief Stores the parameter b in the BRZ configuration.
 * \param mph configuration whose data field points to a brz_config_data_t
 * \param b requested value; anything at or below 64, or at or above 175, is replaced by 128
 */
void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b)
{
	brz_config_data_t *cfg = (brz_config_data_t *)mph->data;

	/* Only 65..174 is accepted as given; every other value falls back to 128. */
	cfg->b = (b > 64 && b < 175) ? b : 128;
}
|
||||
|
||||
|
|
|
@ -0,0 +1,833 @@
|
|||
#include<stdio.h>
|
||||
#include<stdlib.h>
|
||||
#include<string.h>
|
||||
#include<math.h>
|
||||
#include<time.h>
|
||||
#include<assert.h>
|
||||
#include<limits.h>
|
||||
|
||||
#include "cmph_structs.h"
|
||||
#include "chd_structs_ph.h"
|
||||
#include "chd_ph.h"
|
||||
#include"miller_rabin.h"
|
||||
|
||||
#define DEBUG
|
||||
#include "debug.h"
|
||||
|
||||
// NO_ELEMENT is equivalent to null pointer
|
||||
#ifndef NO_ELEMENT
|
||||
#define NO_ELEMENT UINT_MAX
|
||||
#endif
|
||||
|
||||
// struct to represents the buckets items
|
||||
struct _chd_ph_item_t
|
||||
{
|
||||
cmph_uint32 f;
|
||||
cmph_uint32 h;
|
||||
struct _chd_ph_item_t * next;
|
||||
};
|
||||
typedef struct _chd_ph_item_t chd_ph_item_t;
|
||||
|
||||
|
||||
// struct to represent a bucket
|
||||
struct _chd_ph_bucket_t
|
||||
{
|
||||
cmph_uint32 size;
|
||||
chd_ph_item_t * items_list;
|
||||
cmph_uint32 next_in_list;
|
||||
};
|
||||
typedef struct _chd_ph_bucket_t chd_ph_bucket_t;
|
||||
|
||||
static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets);
|
||||
static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets);
|
||||
static inline cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 g, chd_ph_item_t * item);
|
||||
static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets);
|
||||
|
||||
/**
 * \brief Allocates an array of nbuckets zero-initialised buckets.
 * \param nbuckets number of buckets to allocate
 * \return the calloc result, passed through unchecked
 */
chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets)
{
	return (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t));
}
|
||||
|
||||
/**
 * \brief Puts every bucket back into the empty state.
 * \param buckets bucket array (must not be NULL; asserted)
 * \param nbuckets number of entries in buckets
 */
void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets)
{
	chd_ph_bucket_t *cur;
	chd_ph_bucket_t *end;

	assert(buckets);
	end = buckets + nbuckets;
	for (cur = buckets; cur != end; ++cur)
	{
		cur->size = 0;
		cur->items_list = 0;
		cur->next_in_list = NO_ELEMENT;
	}
}
|
||||
|
||||
/**
 * \brief Inserts item into bucket g, keeping the bucket's list sorted by (f, h).
 * \param buckets bucket array
 * \param nbuckets number of buckets (not used by this function)
 * \param g index of the destination bucket
 * \param item item to link in; its next field is overwritten
 * \return 1 when the item was linked in, 0 when an item with the same f and h is already present
 */
cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 g, chd_ph_item_t * item)
{
	/* link always points at the pointer that would have to reference item. */
	chd_ph_item_t **link = &buckets[g].items_list;

	/* Skip every node that is strictly smaller than item in (f, h) order. */
	while (*link != 0)
	{
		chd_ph_item_t *node = *link;
		if (node->f > item->f || (node->f == item->f && node->h >= item->h))
		{
			break;
		}
		link = &node->next;
	}

	/* The first node that is not smaller may be an exact duplicate. */
	if (*link != 0 && (*link)->f == item->f && (*link)->h == item->h)
	{
		DEBUGP("Item not added\n");
		return 0;
	}

	item->next = *link;
	*link = item;
	buckets[g].size++;
	return 1;
}
|
||||
|
||||
/**
 * \brief Releases the bucket array.
 * \param buckets array to free; the items linked from it are not freed here
 */
void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets)
{
	free(buckets);
}
|
||||
|
||||
static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items,
|
||||
cmph_uint32 *max_bucket_size);
|
||||
|
||||
static inline cmph_uint32 * chd_ph_ordering(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 max_bucket_size);
|
||||
|
||||
static inline cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size,
|
||||
cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table,
|
||||
cmph_uint8 * occup_table);
|
||||
|
||||
/**
 * \brief Evaluates (1 + (r/n - 1 + 1/(2n)) * ln(1 - n/r)) / ln 2.
 * \param _n first quantity of the formula (n)
 * \param _r second quantity of the formula (r)
 * \return the value of the expression as a double
 */
static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r)
{
	const double n = _n;
	const double r = _r;
	const double factor = r/n - 1.0 + 1.0/(2.0*n);
	const double log_term = log(1 - n/r);

	return (1 + factor*log_term)/log(2);
}
|
||||
|
||||
/* computes the entropy of non empty buckets.*/
|
||||
static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes)
|
||||
{
|
||||
register cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32));
|
||||
register cmph_uint32 i;
|
||||
register double entropy = 0;
|
||||
|
||||
for(i = 0; i < n; i++)
|
||||
{
|
||||
probe_counts[disp_table[i]]++;
|
||||
};
|
||||
|
||||
for(i = 0; i < max_probes; i++)
|
||||
{
|
||||
if(probe_counts[i] > 0)
|
||||
entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2);
|
||||
};
|
||||
free(probe_counts);
|
||||
return entropy;
|
||||
};
|
||||
|
||||
chd_ph_config_data_t *chd_ph_config_new()
|
||||
{
|
||||
chd_ph_config_data_t *chd_ph;
|
||||
chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t));
|
||||
assert(chd_ph);
|
||||
memset(chd_ph, 0, sizeof(chd_ph_config_data_t));
|
||||
|
||||
chd_ph->hashfunc = CMPH_HASH_JENKINS;
|
||||
chd_ph->cs = NULL;
|
||||
chd_ph->nbuckets = 0;
|
||||
chd_ph->n = 0;
|
||||
chd_ph->hl = NULL;
|
||||
|
||||
chd_ph->m = 0;
|
||||
chd_ph->use_h = 1;
|
||||
chd_ph->keys_per_bin = 1;
|
||||
chd_ph->keys_per_bucket = 4;
|
||||
|
||||
//The following fields are used just for statistics
|
||||
chd_ph->space_usage = 0;
|
||||
chd_ph->entropy = 0.0;
|
||||
return chd_ph;
|
||||
}
|
||||
|
||||
/**
 * \brief Frees the algorithm-specific data attached to a configuration.
 * \param mph configuration whose data pointer is released (the pointer itself is not cleared)
 */
void chd_ph_config_destroy(cmph_config_t *mph)
{
	chd_ph_config_data_t *cfg = (chd_ph_config_data_t *) mph->data;

	DEBUGP("Destroying algorithm dependent data\n");
	free(cfg);
}
|
||||
|
||||
|
||||
/**
 * \brief Selects the hash function used by CHD_PH.
 *
 * Only the first entry of the list is used, since chd_ph works with a single hash
 * function; further entries are ignored.
 *
 * \param mph configuration whose data field points to a chd_ph_config_data_t
 * \param hashfuncs list of hash function ids terminated by CMPH_HASH_COUNT
 */
void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;

	/* An empty list (terminator first) leaves the current choice untouched. */
	if (hashfuncs[0] != CMPH_HASH_COUNT)
	{
		chd_ph->hashfunc = hashfuncs[0];
	}
}
|
||||
|
||||
|
||||
/**
 * \brief Sets the number of keys per bucket.
 * \param mph configuration (must not be NULL; asserted)
 * \param keys_per_bucket requested value; only 2..14 is taken as given, anything else becomes 4
 */
void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket)
{
	chd_ph_config_data_t *cfg;

	assert(mph);
	cfg = (chd_ph_config_data_t *)mph->data;
	cfg->keys_per_bucket = (keys_per_bucket > 1 && keys_per_bucket < 15) ? keys_per_bucket : 4;
}
|
||||
|
||||
|
||||
/**
 * \brief Sets the number of keys per bin.
 * \param mph configuration (must not be NULL; asserted)
 * \param keys_per_bin requested value; only 2..127 is taken as given, anything else becomes 1
 */
void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
{
	chd_ph_config_data_t *cfg;

	assert(mph);
	cfg = (chd_ph_config_data_t *)mph->data;
	cfg->keys_per_bin = (keys_per_bin > 1 && keys_per_bin < 128) ? keys_per_bin : 1;
}
|
||||
|
||||
/**
 * \brief Mapping step: distributes all keys over the buckets.
 *
 * Each attempt creates a fresh hash state, empties the buckets, rewinds the key
 * source and inserts one item per key. An attempt is abandoned as soon as
 * chd_ph_bucket_insert rejects an item (same f and h already in that bucket); up to
 * 1000 attempts are made.
 *
 * \param mph configuration; provides the key source and the chd_ph_config_data_t
 * \param buckets bucket array with chd_ph->nbuckets entries
 * \param items item storage with room for chd_ph->m items
 * \param max_bucket_size out: size of the largest bucket of the successful attempt
 * \return 1 on success (chd_ph->hl holds the hash state that was used),
 *         0 on failure (chd_ph->hl is destroyed and set to NULL)
 */
cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size)
{
	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
	cmph_uint32 attempts_left = 1000;
	cmph_uint32 hl[3];
	cmph_uint32 i = 0, g = 0;
	char * key = NULL;
	cmph_uint32 keylen = 0;
	chd_ph_item_t * item;

	*max_bucket_size = 0;
	while(attempts_left-- > 0)
	{
		if (chd_ph->hl) hash_state_destroy(chd_ph->hl);
		chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m);

		chd_ph_bucket_clean(buckets, chd_ph->nbuckets);
		/* The buckets start empty again, so the maximum seen during an abandoned
		 * attempt must not leak into this one. */
		*max_bucket_size = 0;

		mph->key_source->rewind(mph->key_source->data);

		for(i = 0; i < chd_ph->m; i++)
		{
			mph->key_source->read(mph->key_source->data, &key, &keylen);
			hash_vector(chd_ph->hl, key, keylen, hl);

			item = (items + i);

			g = hl[0] % chd_ph->nbuckets;          /* bucket of the key */
			item->f = hl[1] % chd_ph->n;           /* value in [0, n - 1] */
			item->h = hl[2] % (chd_ph->n - 1) + 1; /* value in [1, n - 1] */

			mph->key_source->dispose(mph->key_source->data, key, keylen);

			if(!chd_ph_bucket_insert(buckets, chd_ph->nbuckets, g, item))
			{
				/* Two keys of this bucket share (f, h): retry with a new hash state. */
				break;
			}

			if(buckets[g].size > *max_bucket_size)
			{
				*max_bucket_size = buckets[g].size;
			}
		}

		if(i == chd_ph->m)
		{
			return 1; // SUCCESS
		}
	}

	/* Every attempt failed. */
	hash_state_destroy(chd_ph->hl);
	chd_ph->hl = NULL;
	return 0; // FAILURE
}
|
||||
|
||||
/**
 * \brief Ordering step: threads the non-empty buckets into one list per bucket size.
 *
 * The returned array has max_bucket_size + 1 entries; entry s holds the index of the
 * first bucket of size s (or NO_ELEMENT), and the list continues through the buckets'
 * next_in_list fields.
 *
 * \param buckets bucket array; next_in_list of every non-empty bucket is overwritten
 * \param nbuckets number of buckets
 * \param max_bucket_size largest value any buckets[i].size may have
 * \return newly allocated array of list heads, to be freed by the caller
 */
cmph_uint32 * chd_ph_ordering(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 max_bucket_size)
{
	cmph_uint32 * heads = (cmph_uint32 *) calloc(max_bucket_size + 1, sizeof(cmph_uint32));
	cmph_uint32 s, b;

	DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size);

	/* Start with all lists empty. */
	for(s = 0; s <= max_bucket_size; s++)
	{
		heads[s] = NO_ELEMENT;
	}

	/* Push each non-empty bucket onto the front of the list for its size. */
	for(b = 0; b < nbuckets; b++)
	{
		if(buckets[b].size != 0)
		{
			s = buckets[b].size;
			buckets[b].next_in_list = heads[s];
			heads[s] = b;
		}
	}

	return heads;
}
|
||||
|
||||
/**
 * \brief Tries to place all items of one bucket using the displacement (probe0_num, probe1_num).
 *
 * An item lands in bin (f + h * probe0_num + probe1_num) mod n. The attempt stops at the
 * first bin that already holds keys_per_bin keys; the increments made so far are then undone.
 *
 * \param chd_ph configuration (n and keys_per_bin are read)
 * \param buckets bucket array
 * \param occup_table per-bin occupancy counters, updated on success
 * \param probe0_num multiplier applied to h
 * \param probe1_num constant offset
 * \param bucket_num index of the bucket to place
 * \return 1 when every item was placed, 0 when the attempt was rolled back
 */
static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint8 * occup_table,
                                            cmph_uint32 probe0_num, cmph_uint32 probe1_num, cmph_uint32 bucket_num)
{
	const cmph_uint32 size = buckets[bucket_num].size;
	chd_ph_item_t * item = buckets[bucket_num].items_list;
	cmph_uint32 placed = 0;
	cmph_uint32 position;

	/* Placement: claim one bin per item until a full bin is hit. */
	while(placed < size)
	{
		position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
		if(occup_table[position] >= chd_ph->keys_per_bin)
		{
			break;
		}
		occup_table[position]++;
		item = item->next;
		placed++;
	}

	if(placed == size)
	{
		return 1;
	}

	/* Roll back: release the bins claimed by the first `placed` items. */
	for(item = buckets[bucket_num].items_list; placed > 0; placed--)
	{
		position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
		occup_table[position]--;
		item = item->next;
	}
	return 0;
}
|
||||
|
||||
/**
 * \brief Searches a displacement for one bucket by trying the probes in sequence.
 *
 * Probes are enumerated as (0,0), (1,0), ..., (n-1,0), (0,1), ...; the first one for which
 * place_bucket_probe succeeds is stored as probe0_num + probe1_num * n.
 *
 * \param chd_ph configuration (n is read)
 * \param buckets bucket array
 * \param max_probes the search gives up once this many probes have been counted
 * \param disp_table receives the displacement of bucket_num on success
 * \param occup_table per-bin occupancy counters
 * \param bucket_num index of the bucket to place
 * \return 1 when the bucket was placed, 0 when the probe budget ran out
 */
static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_probes,
                                      cmph_uint32 * disp_table, cmph_uint8 * occup_table, cmph_uint32 bucket_num)
{
	cmph_uint32 probe0_num = 0;
	cmph_uint32 probe1_num = 0;
	cmph_uint32 tried = 0;

	do
	{
		if(place_bucket_probe(chd_ph, buckets, occup_table, probe0_num, probe1_num, bucket_num))
		{
			disp_table[bucket_num] = probe0_num + probe1_num * chd_ph->n;
			return 1;
		}

		/* Advance to the next probe, carrying from probe0_num into probe1_num. */
		if(++probe0_num >= chd_ph->n)
		{
			probe0_num -= chd_ph->n;
			probe1_num++;
		}
		tried++;
	} while(tried < max_probes && probe1_num < chd_ph->n);

	return 0;
}
|
||||
|
||||
/**
 * \brief Places the buckets one at a time, largest size first.
 *
 * Each bucket gets its own probe search through place_bucket.
 *
 * \param chd_ph configuration
 * \param buckets bucket array
 * \param max_bucket_size largest bucket size; index of the last entry of sorted_lists
 * \param sorted_lists per-size list heads as built by chd_ph_ordering
 * \param max_probes probe budget per bucket
 * \param disp_table receives one displacement per placed bucket
 * \param occup_table per-bin occupancy counters
 * \return 1 when all buckets were placed, 0 at the first bucket that could not be placed
 */
static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size,
                                        cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table,
                                        cmph_uint8 * occup_table)
{
	cmph_uint32 size;
	cmph_uint32 b;

	for(size = max_bucket_size; size > 0; size--)
	{
		for(b = sorted_lists[size]; b != NO_ELEMENT; b = buckets[b].next_in_list)
		{
			if(!place_bucket(chd_ph, buckets, max_probes, disp_table, occup_table, b))
			{
				return 0;
			}
		}
	}
	return 1;
}
|
||||
|
||||
/**
 * \brief Places the buckets size class by size class, sharing one probe sequence per class.
 *
 * For each size (largest first) the probes are tried in sequence; with every probe one pass
 * is made over the buckets of that size that are still unplaced, and each bucket that fits
 * is unlinked from the list. The class is done when its list is empty.
 *
 * \param chd_ph configuration (n is read)
 * \param buckets bucket array; next_in_list links are rewritten while buckets are unlinked
 * \param max_bucket_size largest bucket size; index of the last entry of sorted_lists
 * \param sorted_lists per-size list heads; emptied as buckets get placed
 * \param max_probes probe budget per size class
 * \param disp_table receives probe0_num + probe1_num * n for every placed bucket
 * \param occup_table per-bin occupancy counters
 * \return 1 when every size class was emptied, 0 when a probe budget was exhausted
 */
static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size,
                                        cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table,
                                        cmph_uint8 * occup_table)
{
	cmph_uint32 size;
	cmph_uint32 probe_num, probe0_num, probe1_num;

	DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n");
	for(size = max_bucket_size; size > 0; size--)
	{
		probe_num = 0;
		probe0_num = 0;
		probe1_num = 0;
		while(sorted_lists[size] != NO_ELEMENT)
		{
			/* link points at the slot (list head or a next_in_list field) that
			 * references the bucket under consideration. */
			cmph_uint32 *link = &sorted_lists[size];
			while(*link != NO_ELEMENT)
			{
				const cmph_uint32 b = *link;
				if(place_bucket_probe(chd_ph, buckets, occup_table, probe0_num, probe1_num, b))
				{
					/* Placed: record the displacement and unlink the bucket. */
					disp_table[b] = probe0_num + probe1_num * chd_ph->n;
					*link = buckets[b].next_in_list;
				}
				else
				{
					/* Not placed: keep it in the list and move on. */
					link = &buckets[b].next_in_list;
				}
			}

			/* Advance to the next probe, carrying from probe0_num into probe1_num. */
			if(++probe0_num >= chd_ph->n)
			{
				probe0_num -= chd_ph->n;
				probe1_num++;
			}
			probe_num++;
			/* NOTE(review): the budget is checked even when the pass above just emptied
			 * the list, so a class finished on its last allowed probe counts as failure. */
			if(probe_num >= max_probes || probe1_num >= chd_ph->n)
			{
				return 0;
			}
		}
	}
	return 1;
}
|
||||
|
||||
/**
 * \brief Searching step: dispatches to one of the two bucket placement strategies.
 *
 * place_buckets2 is used when chd_ph->use_h is non-zero, place_buckets1 otherwise; all
 * arguments are forwarded unchanged.
 *
 * \return the result of the selected strategy (1 on success, 0 on failure)
 */
cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size,
                            cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table,
                            cmph_uint8 * occup_table)
{
	return chd_ph->use_h
		? place_buckets2(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table)
		: place_buckets1(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table);
}
|
||||
|
||||
static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets,
|
||||
cmph_uint32 * disp_table, cmph_uint8 * occup_table)
|
||||
{
|
||||
register cmph_uint32 i, j;
|
||||
register cmph_uint32 position, probe0_num, probe1_num;
|
||||
register cmph_uint32 m = 0;
|
||||
register chd_ph_item_t * item;
|
||||
|
||||
memset(occup_table, 0, chd_ph->n);
|
||||
for(i = 0; i < chd_ph->nbuckets; i++)
|
||||
{
|
||||
j = buckets[i].size;
|
||||
item = buckets[i].items_list;
|
||||
probe0_num = disp_table[i] % chd_ph->n;
|
||||
probe1_num = disp_table[i] / chd_ph->n;
|
||||
for(; j > 0; j--)
|
||||
{
|
||||
if(item == 0)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
m++;
|
||||
position = (item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n;
|
||||
if(occup_table[position] >= chd_ph->keys_per_bin)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
occup_table[position]++;
|
||||
item = item->next;
|
||||
};
|
||||
};
|
||||
DEBUGP("We were able to place m = %u keys\n", m);
|
||||
return 1;
|
||||
};
|
||||
|
||||
|
||||
/**
 * \brief Builds a CHD_PH hash function for the keys of mph->key_source.
 *
 * The number of buckets and bins is derived from the key count, keys_per_bucket,
 * keys_per_bin and the load factor (the bin count is rounded up to a prime). The
 * mapping, ordering and searching steps are then repeated, at most 100 times, until
 * every bucket has a displacement; finally the displacement table is compressed.
 *
 * \param mph configuration; mph->data must point to a chd_ph_config_data_t
 * \param c load factor; values below 0.5 become 0.5, values of 0.99 or more become 0.99
 * \return a newly allocated cmph_t that owns the hash state and the compressed
 *         displacement sequence (release it with chd_ph_destroy), or NULL on failure
 */
cmph_t *chd_ph_new(cmph_config_t *mph, double c)
{
	cmph_t *mphf = NULL;
	chd_ph_data_t *chd_phf = NULL;
	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;

	double load_factor = c;
	cmph_uint8 searching_success = 0;
	cmph_uint32 max_probes = 1 << 18; // default value for max_probes
	cmph_uint32 iterations = 100;
	chd_ph_bucket_t * buckets = NULL;
	chd_ph_item_t * items = NULL;
	cmph_uint8 failure = 0;
	cmph_uint32 max_bucket_size = 0;
	cmph_uint32 * sorted_lists = NULL;
	cmph_uint32 * disp_table = NULL;
	cmph_uint8 * occup_table = NULL;

	chd_ph->m = mph->key_source->nkeys;
	DEBUGP("m = %u\n", chd_ph->m);

	chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1;
	DEBUGP("nbuckets = %u\n", chd_ph->nbuckets);

	if(load_factor < 0.5 )
	{
		load_factor = 0.5;
	}

	if(load_factor >= 0.99)
	{
		load_factor = 0.99;
	}

	DEBUGP("load_factor = %.3f\n", load_factor);

	chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1;

	// Round the number of bins up to the next prime (only odd candidates are tested)
	if(chd_ph->n % 2 == 0) chd_ph->n++;
	while(check_primality(chd_ph->n) != 1)
	{
		chd_ph->n += 2;
	}

	DEBUGP("n = %u \n", chd_ph->n);

	if(mph->verbosity && chd_ph->keys_per_bin == 1)
	{
		fprintf(stderr, "space lower bound is %.3f bits per key\n", chd_ph_space_lower_bound(chd_ph->m, chd_ph->n));
	}

	// We allocate the working tables
	buckets = chd_ph_bucket_new(chd_ph->nbuckets);
	items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));

	// The probe budget grows with log2(m): it is 2^18 when m = 2^20
	max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes);
	occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
	disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32));

	// buckets, occup_table and disp_table are never requested with size 0
	// (nbuckets >= 1, n >= 1); items may legitimately be empty when m == 0
	if(buckets == NULL || occup_table == NULL || disp_table == NULL || (items == NULL && chd_ph->m > 0))
	{
		if (mph->verbosity)
		{
			fprintf(stderr, "Failure allocating the working tables\n");
		}
		failure = 1;
		goto cleanup;
	}

	while(1)
	{
		iterations --;
		if (mph->verbosity)
		{
			fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n);
		}

		if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size))
		{
			if (mph->verbosity)
			{
				fprintf(stderr, "Failure in mapping step\n");
			}
			failure = 1;
			goto cleanup;
		}

		if (mph->verbosity)
		{
			fprintf(stderr, "Starting ordering step\n");
		}
		// Drop the lists of the previous round before building new ones
		free(sorted_lists);
		sorted_lists = chd_ph_ordering(buckets, chd_ph->nbuckets, max_bucket_size);

		if (mph->verbosity)
		{
			fprintf(stderr, "Starting searching step\n");
		}

		searching_success = chd_ph_searching(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table);

		if(searching_success) break;

		// reset occup_table
		memset(occup_table, 0, chd_ph->n);
		if(iterations == 0)
		{
			if (mph->verbosity)
			{
				fprintf(stderr, "Failure because the max trials was exceeded\n");
			}
			failure = 1;
			goto cleanup;
		}
	}

#ifdef DEBUG
	chd_ph->entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
	DEBUGP("Entropy = %.4f\n", chd_ph->entropy/chd_ph->m);

	if(!chd_ph_check_bin_hashing(chd_ph, buckets, disp_table, occup_table))
	{
		DEBUGP("Error for bin packing generation");
		// Leave through the common cleanup path so the working tables are released
		failure = 1;
		goto cleanup;
	}
#endif

	if (mph->verbosity)
	{
		fprintf(stderr, "Starting compressing step\n");
	}

	free(chd_ph->cs);
	chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
	if(chd_ph->cs == NULL)
	{
		failure = 1;
		goto cleanup;
	}
	compressed_seq_init(chd_ph->cs);
	compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets);
	chd_ph->space_usage = compressed_seq_get_space_usage(chd_ph->cs);
	chd_ph->space_usage += 64;
	DEBUGP("space_usage/key = %.4f\n", chd_ph->space_usage/(double)chd_ph->m);

cleanup:
	chd_ph_bucket_destroy(buckets);
	free(items);
	free(sorted_lists);
	free(disp_table);
	free(occup_table);
	if(failure)
	{
		if(chd_ph->hl)
		{
			hash_state_destroy(chd_ph->hl);
		}
		chd_ph->hl = NULL;
		return NULL;
	}

	mphf = (cmph_t *)malloc(sizeof(cmph_t));
	chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
	if(mphf == NULL || chd_phf == NULL)
	{
		// Same state as the other failure paths: no hash state is left behind
		free(mphf);
		free(chd_phf);
		if(chd_ph->hl)
		{
			hash_state_destroy(chd_ph->hl);
		}
		chd_ph->hl = NULL;
		return NULL;
	}

	mphf->algo = mph->algo;

	chd_phf->cs = chd_ph->cs;
	chd_ph->cs = NULL; //transfer memory ownership
	chd_phf->hl = chd_ph->hl;
	chd_ph->hl = NULL; //transfer memory ownership
	chd_phf->n = chd_ph->n;
	chd_phf->nbuckets = chd_ph->nbuckets;

	mphf->data = chd_phf;
	mphf->size = chd_ph->n;

	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}

	return mphf;
}
|
||||
|
||||
|
||||
|
||||
/**
 * \brief Reads a CHD_PH function from fd and attaches it to mphf->data.
 *
 * The stream is read in this order: length and bytes of the hash state, length and bytes
 * of the compressed sequence, then n and nbuckets (each a cmph_uint32).
 *
 * NOTE(review): the results of fread and malloc are not checked here.
 *
 * \param fd stream positioned at the start of the algorithm-specific data
 * \param mphf function object whose data field receives the loaded chd_ph_data_t
 */
void chd_ph_load(FILE *fd, cmph_t *mphf)
{
	chd_ph_data_t *data = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
	char *blob = NULL;
	cmph_uint32 bloblen;
	register cmph_uint32 nbytes;

	DEBUGP("Loading chd_ph mphf\n");
	mphf->data = data;

	/* Hash state: length prefix followed by the serialized state. */
	nbytes = fread(&bloblen, sizeof(cmph_uint32), (size_t)1, fd);
	DEBUGP("Hash state has %u bytes\n", bloblen);
	blob = (char *)malloc((size_t)bloblen);
	nbytes = fread(blob, (size_t)bloblen, (size_t)1, fd);
	data->hl = hash_state_load(blob, bloblen);
	free(blob);

	/* Compressed sequence: length prefix followed by the serialized structure. */
	nbytes = fread(&bloblen, sizeof(cmph_uint32), (size_t)1, fd);
	DEBUGP("Compressed sequence structure has %u bytes\n", bloblen);
	blob = (char *)malloc((size_t)bloblen);
	nbytes = fread(blob, (size_t)bloblen, (size_t)1, fd);
	data->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
	compressed_seq_load(data->cs, blob, bloblen);
	free(blob);

	/* Trailing scalars: n, then nbuckets. */
	DEBUGP("Reading n and nbuckets\n");
	nbytes = fread(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
	nbytes = fread(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
}
|
||||
|
||||
/**
 * \brief Writes a CHD_PH function to fd.
 *
 * After the common header written by __cmph_dump, the stream receives: length and bytes of
 * the hash state, length and bytes of the compressed sequence, then n and nbuckets.
 *
 * NOTE(review): the results of fwrite are not checked here.
 *
 * \param mphf function to serialize; mphf->data must point to a chd_ph_data_t
 * \param fd destination stream
 * \return always 1
 */
int chd_ph_dump(cmph_t *mphf, FILE *fd)
{
	chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
	char *blob = NULL;
	cmph_uint32 bloblen;
	register cmph_uint32 nbytes;

	__cmph_dump(mphf, fd);

	/* Hash state: length prefix followed by the serialized state. */
	hash_state_dump(data->hl, &blob, &bloblen);
	DEBUGP("Dumping hash state with %u bytes to disk\n", bloblen);
	nbytes = fwrite(&bloblen, sizeof(cmph_uint32), (size_t)1, fd);
	nbytes = fwrite(blob, (size_t)bloblen, (size_t)1, fd);
	free(blob);

	/* Compressed sequence: length prefix followed by the serialized structure. */
	compressed_seq_dump(data->cs, &blob, &bloblen);
	DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", bloblen);
	nbytes = fwrite(&bloblen, sizeof(cmph_uint32), (size_t)1, fd);
	nbytes = fwrite(blob, (size_t)bloblen, (size_t)1, fd);
	free(blob);

	/* Trailing scalars: n, then nbuckets. */
	nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
	nbytes = fwrite(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
	return 1;
}
|
||||
|
||||
/**
 * \brief Releases a CHD_PH function together with everything it owns.
 * \param mphf function to destroy; mphf->data must point to a chd_ph_data_t
 */
void chd_ph_destroy(cmph_t *mphf)
{
	chd_ph_data_t *owned = (chd_ph_data_t *)mphf->data;

	/* The compressed sequence is torn down first, then its own storage is freed. */
	compressed_seq_destroy(owned->cs);
	free(owned->cs);

	hash_state_destroy(owned->hl);

	free(owned);
	free(mphf);
}
|
||||
|
||||
/**
 * \brief Evaluates the CHD_PH function for a key.
 *
 * The key is hashed into a bucket index g and the pair (f, h); the displacement of bucket g
 * is split into probe0 = disp mod n and probe1 = disp / n, and the result is
 * (f + h * probe0 + probe1) mod n.
 *
 * \param mphf function to evaluate; mphf->data must point to a chd_ph_data_t
 * \param key key bytes
 * \param keylen key length in bytes
 * \return the bin assigned to the key, a value below n
 */
cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	chd_ph_data_t * data = mphf->data;
	cmph_uint32 hl[3];
	cmph_uint32 g, f, h;
	cmph_uint32 disp;

	hash_vector(data->hl, key, keylen, hl);
	g = hl[0] % data->nbuckets;
	f = hl[1] % data->n;
	h = hl[2] % (data->n-1) + 1;

	disp = compressed_seq_query(data->cs, g);

	/* The product is taken in 64 bits so that h * probe0 cannot wrap. */
	return (cmph_uint32)((f + ((cmph_uint64)h)*(disp % data->n) + disp/data->n) % data->n);
}
|
||||
|
||||
/**
 * \brief Serializes a CHD_PH function into a caller-provided memory area.
 *
 * Layout, in order: hash type (cmph_uint32), packed hash state, n, nbuckets, packed
 * compressed sequence.
 *
 * \param mphf function to pack; mphf->data must point to a chd_ph_data_t
 * \param packed_mphf destination buffer, written from its first byte onward
 */
void chd_ph_pack(cmph_t *mphf, void *packed_mphf)
{
	chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
	cmph_uint8 * cursor = packed_mphf;
	CMPH_HASH hl_type = hash_get_type(data->hl);

	/* Hash type, so the reader knows how large the hash state is. */
	*((cmph_uint32 *) cursor) = hl_type;
	cursor += sizeof(cmph_uint32);

	/* Hash state. */
	hash_state_pack(data->hl, cursor);
	cursor += hash_state_packed_size(hl_type);

	/* Number of bins. */
	*((cmph_uint32 *) cursor) = data->n;
	cursor += sizeof(data->n);

	/* Number of buckets. */
	*((cmph_uint32 *) cursor) = data->nbuckets;
	cursor += sizeof(data->nbuckets);

	/* Compressed displacement sequence, last. */
	compressed_seq_pack(data->cs, cursor);
}
|
||||
|
||||
/**
 * \brief Computes the size reported for the packed form of a CHD_PH function.
 *
 * The total is sizeof(CMPH_ALGO), plus the packed hash state, plus the packed compressed
 * sequence, plus three cmph_uint32 values.
 *
 * \param mphf function to measure; mphf->data must point to a chd_ph_data_t
 * \return the size in bytes
 */
cmph_uint32 chd_ph_packed_size(cmph_t *mphf)
{
	chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
	CMPH_HASH hl_type = hash_get_type(data->hl);
	cmph_uint32 hl_bytes = hash_state_packed_size(hl_type);
	cmph_uint32 cs_bytes = compressed_seq_packed_size(data->cs);

	return (cmph_uint32)(sizeof(CMPH_ALGO) + hl_bytes + cs_bytes + 3*sizeof(cmph_uint32));
}
|
||||
|
||||
/**
 * \brief Evaluates a CHD_PH function directly on its packed form.
 *
 * The buffer is read as: hash type (4 bytes), packed hash state, n, nbuckets, packed
 * compressed sequence. The computation is (f + h * probe0 + probe1) mod n with
 * probe0 = disp mod n and probe1 = disp / n.
 *
 * \param packed_mphf pointer to the packed function
 * \param key key bytes
 * \param keylen key length in bytes
 * \return the bin assigned to the key, a value below n
 */
cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf;
	cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4;

	/* The two scalars follow the hash state; the compressed sequence follows them. */
	cmph_uint32 *words = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type));
	cmph_uint32 n = words[0];
	cmph_uint32 nbuckets = words[1];
	cmph_uint32 *cs_ptr = words + 2;

	cmph_uint32 hl[3];
	cmph_uint32 g, f, h;
	cmph_uint32 disp;

	hash_vector_packed(hl_ptr, hl_type, key, keylen, hl);

	g = hl[0] % nbuckets;
	f = hl[1] % n;
	h = hl[2] % (n-1) + 1;

	disp = compressed_seq_query_packed(cs_ptr, g);

	/* The product is taken in 64 bits so that h * probe0 cannot wrap. */
	return (cmph_uint32)((f + ((cmph_uint64)h)*(disp % n) + disp/n) % n);
}
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
#ifndef _CMPH_CHD_PH_H__
|
||||
#define _CMPH_CHD_PH_H__
|
||||
|
||||
#include "cmph.h"
|
||||
|
||||
typedef struct __chd_ph_data_t chd_ph_data_t;
|
||||
typedef struct __chd_ph_config_data_t chd_ph_config_data_t;
|
||||
|
||||
/* Config API */
|
||||
chd_ph_config_data_t *chd_ph_config_new();
|
||||
void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
|
||||
|
||||
/** \fn void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
|
||||
* \brief Allows to set the number of keys per bin.
|
||||
* \param mph pointer to the configuration structure
|
||||
* \param keys_per_bin value for the number of keys per bin
|
||||
*/
|
||||
void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
|
||||
|
||||
/** \fn void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket);
|
||||
* \brief Allows to set the number of keys per bucket.
|
||||
* \param mph pointer to the configuration structure
|
||||
* \param keys_per_bucket value for the number of keys per bucket
|
||||
*/
|
||||
void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket);
|
||||
void chd_ph_config_destroy(cmph_config_t *mph);
|
||||
|
||||
|
||||
/* Chd algorithm API */
|
||||
cmph_t *chd_ph_new(cmph_config_t *mph, double c);
|
||||
void chd_ph_load(FILE *fd, cmph_t *mphf);
|
||||
int chd_ph_dump(cmph_t *mphf, FILE *fd);
|
||||
void chd_ph_destroy(cmph_t *mphf);
|
||||
cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
|
||||
|
||||
/** \fn void chd_ph_pack(cmph_t *mphf, void *packed_mphf);
|
||||
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
|
||||
* \param mphf pointer to the resulting mphf
|
||||
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
|
||||
*/
|
||||
void chd_ph_pack(cmph_t *mphf, void *packed_mphf);
|
||||
|
||||
/** \fn cmph_uint32 chd_ph_packed_size(cmph_t *mphf);
|
||||
* \brief Return the amount of space needed to pack mphf.
|
||||
* \param mphf pointer to a mphf
|
||||
* \return the size of the packed function or zero for failures
|
||||
*/
|
||||
cmph_uint32 chd_ph_packed_size(cmph_t *mphf);
|
||||
|
||||
/** \fn cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
|
||||
* \brief Use the packed mphf to do a search.
|
||||
* \param packed_mphf pointer to the packed mphf
|
||||
* \param key key to be hashed
|
||||
* \param keylen key length in bytes
|
||||
* \return The mphf value
|
||||
*/
|
||||
cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
|
||||
|
||||
#endif
|
|
@ -0,0 +1,32 @@
|
|||
#ifndef __CMPH_CHD_PH_STRUCTS_H__
|
||||
#define __CMPH_CHD_PH_STRUCTS_H__
|
||||
|
||||
#include "hash_state.h"
|
||||
#include "compressed_seq.h"
|
||||
|
||||
/* State of a constructed chd_ph function: the hash state, the bin and bucket
 * counts, and the compressed per-bucket displacement values. */
struct __chd_ph_data_t
|
||||
{
|
||||
compressed_seq_t * cs; // compressed displacement values (one entry is queried per bucket)
|
||||
cmph_uint32 nbuckets; // number of buckets
|
||||
cmph_uint32 n; // number of bins
|
||||
hash_state_t *hl; // linear hash function
|
||||
};
|
||||
|
||||
/* Configuration and working state for the chd_ph algorithm: the user-tunable
 * parameters (hash function, keys per bin, keys per bucket), the same
 * cs/nbuckets/n/hl fields that struct __chd_ph_data_t carries, and two
 * statistics fields. */
struct __chd_ph_config_data_t
|
||||
{
|
||||
CMPH_HASH hashfunc; // linear hash function to be used
|
||||
compressed_seq_t * cs; // compressed displacement values
|
||||
cmph_uint32 nbuckets; // number of buckets
|
||||
cmph_uint32 n; // number of bins
|
||||
hash_state_t *hl; // linear hash function
|
||||
|
||||
cmph_uint32 m; // number of keys
|
||||
cmph_uint8 use_h; // flag to indicate the use of a heuristic (use_h = 1)
|
||||
cmph_uint32 keys_per_bin;// maximum number of keys per bin
|
||||
cmph_uint32 keys_per_bucket; // average number of keys per bucket
|
||||
|
||||
// The following fields are used just for statistics
|
||||
cmph_uint32 space_usage;
|
||||
double entropy;
|
||||
};
|
||||
#endif
|
85
src/cmph.c
85
src/cmph.c
|
@ -7,6 +7,7 @@
|
|||
#include "fch.h" /* included -- Fabiano */
|
||||
#include "bdz.h" /* included -- Fabiano */
|
||||
#include "bdz_ph.h" /* included -- Fabiano */
|
||||
#include "chd_ph.h" /* included -- Fabiano */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
|
@ -14,8 +15,7 @@
|
|||
//#define DEBUG
|
||||
#include "debug.h"
|
||||
|
||||
const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz",
|
||||
"bdz_ph", NULL }; /* included -- Fabiano */
|
||||
const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", NULL }; /* included -- Fabiano */
|
||||
|
||||
typedef struct
|
||||
{
|
||||
|
@ -322,6 +322,9 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo)
|
|||
case CMPH_BDZ_PH:
|
||||
bdz_ph_config_destroy(mph);
|
||||
break;
|
||||
case CMPH_CHD_PH:
|
||||
chd_ph_config_destroy(mph);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
@ -348,6 +351,9 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo)
|
|||
case CMPH_BDZ_PH:
|
||||
mph->data = bdz_ph_config_new();
|
||||
break;
|
||||
case CMPH_CHD_PH:
|
||||
mph->data = chd_ph_config_new();
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
@ -382,6 +388,18 @@ void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b)
|
|||
{
|
||||
bdz_config_set_b(mph, b);
|
||||
}
|
||||
else if (mph->algo == CMPH_CHD_PH)
|
||||
{
|
||||
chd_ph_config_set_b(mph, b);
|
||||
}
|
||||
}
|
||||
|
||||
void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
|
||||
{
|
||||
if (mph->algo == CMPH_CHD_PH)
|
||||
{
|
||||
chd_ph_config_set_keys_per_bin(mph, keys_per_bin);
|
||||
}
|
||||
}
|
||||
|
||||
void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability)
|
||||
|
@ -406,19 +424,22 @@ void cmph_config_destroy(cmph_config_t *mph)
|
|||
bmz_config_destroy(mph);
|
||||
break;
|
||||
case CMPH_BMZ8: /* included -- Fabiano */
|
||||
bmz8_config_destroy(mph);
|
||||
bmz8_config_destroy(mph);
|
||||
break;
|
||||
case CMPH_BRZ: /* included -- Fabiano */
|
||||
brz_config_destroy(mph);
|
||||
brz_config_destroy(mph);
|
||||
break;
|
||||
case CMPH_FCH: /* included -- Fabiano */
|
||||
fch_config_destroy(mph);
|
||||
fch_config_destroy(mph);
|
||||
break;
|
||||
case CMPH_BDZ: /* included -- Fabiano */
|
||||
bdz_config_destroy(mph);
|
||||
bdz_config_destroy(mph);
|
||||
break;
|
||||
case CMPH_BDZ_PH: /* included -- Fabiano */
|
||||
bdz_ph_config_destroy(mph);
|
||||
bdz_ph_config_destroy(mph);
|
||||
break;
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
chd_ph_config_destroy(mph);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
|
@ -457,6 +478,9 @@ void cmph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
|
|||
case CMPH_BDZ_PH: /* included -- Fabiano */
|
||||
bdz_ph_config_set_hashfuncs(mph, hashfuncs);
|
||||
break;
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
chd_ph_config_set_hashfuncs(mph, hashfuncs);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
@ -506,6 +530,10 @@ cmph_t *cmph_new(cmph_config_t *mph)
|
|||
DEBUGP("Creating bdz_ph hash\n");
|
||||
mphf = bdz_ph_new(mph, c);
|
||||
break;
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
DEBUGP("Creating chd_ph hash\n");
|
||||
mphf = chd_ph_new(mph, c);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
@ -519,17 +547,19 @@ int cmph_dump(cmph_t *mphf, FILE *f)
|
|||
case CMPH_CHM:
|
||||
return chm_dump(mphf, f);
|
||||
case CMPH_BMZ: /* included -- Fabiano */
|
||||
return bmz_dump(mphf, f);
|
||||
return bmz_dump(mphf, f);
|
||||
case CMPH_BMZ8: /* included -- Fabiano */
|
||||
return bmz8_dump(mphf, f);
|
||||
return bmz8_dump(mphf, f);
|
||||
case CMPH_BRZ: /* included -- Fabiano */
|
||||
return brz_dump(mphf, f);
|
||||
return brz_dump(mphf, f);
|
||||
case CMPH_FCH: /* included -- Fabiano */
|
||||
return fch_dump(mphf, f);
|
||||
return fch_dump(mphf, f);
|
||||
case CMPH_BDZ: /* included -- Fabiano */
|
||||
return bdz_dump(mphf, f);
|
||||
return bdz_dump(mphf, f);
|
||||
case CMPH_BDZ_PH: /* included -- Fabiano */
|
||||
return bdz_ph_dump(mphf, f);
|
||||
return bdz_ph_dump(mphf, f);
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
return chd_ph_dump(mphf, f);
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
@ -573,6 +603,10 @@ cmph_t *cmph_load(FILE *f)
|
|||
DEBUGP("Loading bdz_ph algorithm dependent parts\n");
|
||||
bdz_ph_load(f, mphf);
|
||||
break;
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
DEBUGP("Loading chd_ph algorithm dependent parts\n");
|
||||
chd_ph_load(f, mphf);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
@ -606,6 +640,9 @@ cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
|
|||
case CMPH_BDZ_PH: /* included -- Fabiano */
|
||||
DEBUGP("bdz_ph algorithm search\n");
|
||||
return bdz_ph_search(mphf, key, keylen);
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
DEBUGP("chd_ph algorithm search\n");
|
||||
return chd_ph_search(mphf, key, keylen);
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
@ -626,22 +663,25 @@ void cmph_destroy(cmph_t *mphf)
|
|||
chm_destroy(mphf);
|
||||
return;
|
||||
case CMPH_BMZ: /* included -- Fabiano */
|
||||
bmz_destroy(mphf);
|
||||
bmz_destroy(mphf);
|
||||
return;
|
||||
case CMPH_BMZ8: /* included -- Fabiano */
|
||||
bmz8_destroy(mphf);
|
||||
bmz8_destroy(mphf);
|
||||
return;
|
||||
case CMPH_BRZ: /* included -- Fabiano */
|
||||
brz_destroy(mphf);
|
||||
brz_destroy(mphf);
|
||||
return;
|
||||
case CMPH_FCH: /* included -- Fabiano */
|
||||
fch_destroy(mphf);
|
||||
fch_destroy(mphf);
|
||||
return;
|
||||
case CMPH_BDZ: /* included -- Fabiano */
|
||||
bdz_destroy(mphf);
|
||||
bdz_destroy(mphf);
|
||||
return;
|
||||
case CMPH_BDZ_PH: /* included -- Fabiano */
|
||||
bdz_ph_destroy(mphf);
|
||||
bdz_ph_destroy(mphf);
|
||||
return;
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
chd_ph_destroy(mphf);
|
||||
return;
|
||||
default:
|
||||
assert(0);
|
||||
|
@ -685,6 +725,9 @@ void cmph_pack(cmph_t *mphf, void *packed_mphf)
|
|||
case CMPH_BDZ_PH: /* included -- Fabiano */
|
||||
bdz_ph_pack(mphf, ptr);
|
||||
break;
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
chd_ph_pack(mphf, ptr);
|
||||
break;
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
@ -714,6 +757,8 @@ cmph_uint32 cmph_packed_size(cmph_t *mphf)
|
|||
return bdz_packed_size(mphf);
|
||||
case CMPH_BDZ_PH: /* included -- Fabiano */
|
||||
return bdz_ph_packed_size(mphf);
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
return chd_ph_packed_size(mphf);
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
@ -747,6 +792,8 @@ cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 k
|
|||
return bdz_search_packed(++ptr, key, keylen);
|
||||
case CMPH_BDZ_PH: /* included -- Fabiano */
|
||||
return bdz_ph_search_packed(++ptr, key, keylen);
|
||||
case CMPH_CHD_PH: /* included -- Fabiano */
|
||||
return chd_ph_search_packed(++ptr, key, keylen);
|
||||
default:
|
||||
assert(0);
|
||||
}
|
||||
|
|
|
@ -54,6 +54,7 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo);
|
|||
void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir);
|
||||
void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd);
|
||||
void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b);
|
||||
void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
|
||||
void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability);
|
||||
void cmph_config_destroy(cmph_config_t *mph);
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ typedef unsigned int cmph_uint32;
|
|||
typedef enum { CMPH_HASH_JENKINS, CMPH_HASH_COUNT } CMPH_HASH;
|
||||
extern const char *cmph_hash_names[];
|
||||
typedef enum { CMPH_BMZ, CMPH_BMZ8, CMPH_CHM, CMPH_BRZ, CMPH_FCH,
|
||||
CMPH_BDZ, CMPH_BDZ_PH, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */
|
||||
CMPH_BDZ, CMPH_BDZ_PH, CMPH_CHD_PH, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */
|
||||
extern const char *cmph_names[];
|
||||
|
||||
#endif
|
||||
|
|
29
src/main.c
29
src/main.c
|
@ -22,17 +22,18 @@
|
|||
|
||||
void usage(const char *prg)
|
||||
{
|
||||
fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
|
||||
fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
|
||||
}
|
||||
void usage_long(const char *prg)
|
||||
{
|
||||
cmph_uint32 i;
|
||||
fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b BRZ_parameter] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
|
||||
fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg);
|
||||
fprintf(stderr, "Minimum perfect hashing tool\n\n");
|
||||
fprintf(stderr, " -h\t print this help message\n");
|
||||
fprintf(stderr, " -c\t c value determines:\n");
|
||||
fprintf(stderr, " \t the number of vertices in the graph for the algorithms BMZ and CHM\n");
|
||||
fprintf(stderr, " \t the number of bits per key required in the FCH algorithm\n");
|
||||
fprintf(stderr, " \t the load factor in the CHD_PH algorithm\n");
|
||||
fprintf(stderr, " -a\t algorithm - valid values are\n");
|
||||
for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_names[i]);
|
||||
fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n");
|
||||
|
@ -51,7 +52,12 @@ void usage_long(const char *prg)
|
|||
fprintf(stderr, " \t In this case its value should be an integer in the range [64,175].\n");
|
||||
fprintf(stderr, " \t If BDZ algorithm is selected in option -a, than it is used to\n");
|
||||
fprintf(stderr, " \t determine the size of some precomputed rank information and\n");
|
||||
fprintf(stderr, " \t its value should be an integer in the range [3,10]\n");
|
||||
fprintf(stderr, " \t its value should be an integer in the range [3,10].\n");
|
||||
fprintf(stderr, " \t If CHD_PH algorithm is selected in option -a, than it is used to\n");
|
||||
fprintf(stderr, " \t set average number of keys per bucket and its value should be an\n");
|
||||
fprintf(stderr, " \t an integer in the range [1,32].\n");
|
||||
fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function.\n");
|
||||
fprintf(stderr, " \t A t-perfect hashing function allows at most t collisions in a given bin.\n");
|
||||
fprintf(stderr, " keysfile\t line separated file with keys\n");
|
||||
}
|
||||
|
||||
|
@ -75,10 +81,11 @@ int main(int argc, char **argv)
|
|||
char * tmp_dir = NULL;
|
||||
cmph_io_adapter_t *source;
|
||||
cmph_uint32 memory_availability = 0;
|
||||
cmph_uint32 b = 128;
|
||||
cmph_uint32 b = 0;
|
||||
cmph_uint32 keys_per_bin = 0;
|
||||
while (1)
|
||||
{
|
||||
char ch = getopt(argc, argv, "hVvgc:k:a:M:b:f:m:d:s:");
|
||||
char ch = getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:");
|
||||
if (ch == -1) break;
|
||||
switch (ch)
|
||||
{
|
||||
|
@ -141,6 +148,16 @@ int main(int argc, char **argv)
|
|||
}
|
||||
}
|
||||
break;
|
||||
case 't':
|
||||
{
|
||||
char *cptr;
|
||||
keys_per_bin = strtoul(optarg, &cptr, 10);
|
||||
if(*cptr != 0) {
|
||||
fprintf(stderr, "Parameter t was not found: %s\n", optarg);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
break;
|
||||
case 'v':
|
||||
++verbosity;
|
||||
break;
|
||||
|
@ -237,6 +254,8 @@ int main(int argc, char **argv)
|
|||
cmph_config_set_mphf_fd(config, mphf_fd);
|
||||
cmph_config_set_memory_availability(config, memory_availability);
|
||||
cmph_config_set_b(config, b);
|
||||
cmph_config_set_keys_per_bin(config, keys_per_bin);
|
||||
|
||||
//if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15;
|
||||
if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15;
|
||||
if (c != 0) cmph_config_set_graphsize(config, c);
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
#include "miller_rabin.h"
|
||||
|
||||
/* Modular exponentiation by square-and-multiply: returns a^d reduced modulo n
 * (1 when d == 0, since no reduction is then performed).
 * Intermediate products are formed in cmph_uint64, so the result is only
 * exact while those products fit in that type. */
static inline cmph_uint64 int_pow(cmph_uint64 a, cmph_uint64 d, cmph_uint64 n)
{
	cmph_uint64 result = 1;
	cmph_uint64 square = a; /* a^(2^k) mod n for the exponent bit being examined */
	cmph_uint64 bits;

	for (bits = d; bits != 0; bits >>= 1)
	{
		if (bits & 1)
			result = (result * square) % n;
		square = (square * square) % n;
	}
	return result;
}
|
||||
|
||||
/* One Miller-Rabin round for a single base.
 * a_exp_d is the base raised to d modulo n, where n - 1 = 2^s * d.
 * Returns 1 when the base does not prove n composite: a_exp_d is 1 or n - 1,
 * or one of up to s - 1 successive squarings reaches n - 1. Returns 0 otherwise. */
static inline cmph_uint8 check_witness(cmph_uint64 a_exp_d, cmph_uint64 n, cmph_uint64 s)
{
	const cmph_uint64 minus_one = n - 1;
	cmph_uint64 x = a_exp_d;
	cmph_uint64 round;

	if (x == 1 || x == minus_one)
		return 1;

	for (round = 1; round < s; ++round)
	{
		x = (x * x) % n;
		if (x == minus_one)
			return 1;
	}
	return 0;
}
|
||||
|
||||
/* Miller-Rabin primality test with the fixed bases 2, 7 and 61.
 *
 * Returns 1 if n is considered prime and 0 otherwise.
 *
 * - 0 and 1 are rejected up front. Previously n == 1 never left the
 *   decomposition loop below, because d started at 0 and stayed 0.
 * - 2, 3, 5 and 7 are reported as prime; any other multiple of them is
 *   rejected by trial division.
 * - A base that is a multiple of n (only 61 when n == 61) carries no
 *   information and is skipped instead of being treated as proof of
 *   compositeness.
 *
 * The bases {2, 7, 61} are known to give a deterministic answer for
 * n < 4,759,123,141. The helpers int_pow() and check_witness() multiply
 * residues in cmph_uint64, so results are presumably only reliable while
 * n * n fits in that type (n < 2^32 for a 64-bit cmph_uint64).
 */
cmph_uint8 check_primality(cmph_uint64 n)
{
	static const cmph_uint64 small_primes[] = {2, 3, 5, 7};
	static const cmph_uint64 witnesses[] = {2, 7, 61};
	cmph_uint64 d, s;
	unsigned int i;

	if (n < 2)
		return 0;

	for (i = 0; i < sizeof(small_primes) / sizeof(small_primes[0]); i++)
	{
		if (n == small_primes[i])
			return 1;
		if ((n % small_primes[i]) == 0)
			return 0;
	}

	/* Here n is odd and >= 11, so n - 1 is even and non-zero.
	 * Decompose n - 1 into 2^s * d with d odd. */
	s = 0;
	d = n - 1;
	do
	{
		s++;
		d /= 2;
	} while ((d % 2) == 0);

	for (i = 0; i < sizeof(witnesses) / sizeof(witnesses[0]); i++)
	{
		cmph_uint64 a = witnesses[i];

		if ((a % n) == 0)
			continue; /* base is 0 modulo n: cannot act as a witness */
		if (check_witness(int_pow(a, d, n), n, s) == 0)
			return 0;
	}
	return 1;
}
|
||||
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
#ifndef _CMPH_MILLER_RABIN_H__
|
||||
#define _CMPH_MILLER_RABIN_H__
|
||||
#include "cmph_types.h"
|
||||
cmph_uint8 check_primality(cmph_uint64 n);
|
||||
#endif
|
|
@ -164,14 +164,13 @@ int main(int argc, char **argv)
|
|||
}
|
||||
source->dispose(source->data, buf, buflen);
|
||||
}
|
||||
|
||||
free(packed_mphf);
|
||||
cmph_destroy(mphf);
|
||||
cmph_destroy(mphf);
|
||||
free(hashtable);
|
||||
|
||||
fclose(keys_fd);
|
||||
free(mphf_file);
|
||||
cmph_io_nlfile_adapter_destroy(source);
|
||||
cmph_io_nlfile_adapter_destroy(source);
|
||||
return ret;
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue