*** empty log message ***

This commit is contained in:
fc_botelho 2009-04-07 23:16:40 +00:00
parent b8d0614a2d
commit 1a11b02e71
25 changed files with 989 additions and 99 deletions

View File

@ -8,7 +8,8 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\
buffer_manager.c buffer_entry.c\
brz.c fch.c fch_buckets.c \
select.c compressed_seq.c \
chd_ph.c miller_rabin.c
chd.c chd_ph.c miller_rabin.c \
compressed_rank.c
libcmph_la_LDFLAGS = -version-info 0:0:0

View File

@ -269,6 +269,13 @@ cmph_t *bdz_new(cmph_config_t *mph, double c)
bdz_queue_t edges;
bdz_graph3_t graph3;
bdz_config_data_t *bdz = (bdz_config_data_t *)mph->data;
#ifdef CMPH_TIMING
double construction_time_begin = 0.0;
double construction_time = 0.0;
ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
#endif
if (c == 0) c = 1.23; // validating restrictions over parameter c.
DEBUGP("c: %f\n", c);
bdz->m = mph->key_source->nkeys;
@ -338,7 +345,9 @@ cmph_t *bdz_new(cmph_config_t *mph, double c)
fprintf(stderr, "Entering ranking step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n);
}
ranking(bdz);
#ifdef CMPH_TIMING
ELAPSED_TIME_IN_SECONDS(&construction_time);
#endif
mphf = (cmph_t *)malloc(sizeof(cmph_t));
mphf->algo = mph->algo;
bdzf = (bdz_data_t *)malloc(sizeof(bdz_data_t));
@ -363,6 +372,14 @@ cmph_t *bdz_new(cmph_config_t *mph, double c)
fprintf(stderr, "Successfully generated minimal perfect hash function\n");
}
#ifdef CMPH_TIMING
register cmph_uint32 space_usage = bdz_packed_size(mphf)*8;
register cmph_uint32 keys_per_bucket = 1;
construction_time = construction_time - construction_time_begin;
fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz->m, bdz->m/(double)bdz->n, keys_per_bucket, construction_time, space_usage/(double)bdz->m);
#endif
return mphf;
}

View File

@ -242,6 +242,12 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c)
bdz_ph_queue_t edges;
bdz_ph_graph3_t graph3;
bdz_ph_config_data_t *bdz_ph = (bdz_ph_config_data_t *)mph->data;
#ifdef CMPH_TIMING
double construction_time_begin = 0.0;
double construction_time = 0.0;
ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
#endif
if (c == 0) c = 1.23; // validating restrictions over parameter c.
DEBUGP("c: %f\n", c);
@ -309,6 +315,9 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c)
bdz_ph_optimization(bdz_ph);
#ifdef CMPH_TIMING
ELAPSED_TIME_IN_SECONDS(&construction_time);
#endif
mphf = (cmph_t *)malloc(sizeof(cmph_t));
mphf->algo = mph->algo;
bdz_phf = (bdz_ph_data_t *)malloc(sizeof(bdz_ph_data_t));
@ -328,6 +337,13 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c)
fprintf(stderr, "Successfully generated minimal perfect hash function\n");
}
#ifdef CMPH_TIMING
register cmph_uint32 space_usage = bdz_ph_packed_size(mphf)*8;
register cmph_uint32 keys_per_bucket = 1;
construction_time = construction_time - construction_time_begin;
fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz_ph->m, bdz_ph->m/(double)bdz_ph->n, keys_per_bucket, construction_time, space_usage/(double)bdz_ph->m);
#endif
return mphf;
}

View File

@ -615,8 +615,6 @@ cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke
register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n;
register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n;
DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
if (h1 == h2 && ++h2 > n) h2 = 0;
DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m);
return (g_ptr[h1] + g_ptr[h2]);
}

View File

@ -479,11 +479,11 @@ int bmz8_dump(cmph_t *mphf, FILE *fd)
nbytes = fwrite(&(data->m), sizeof(cmph_uint8), (size_t)1, fd);
nbytes = fwrite(data->g, sizeof(cmph_uint8)*(data->n), (size_t)1, fd);
#ifdef DEBUG
/* #ifdef DEBUG
fprintf(stderr, "G: ");
for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
fprintf(stderr, "\n");
#endif
#endif*/
return 1;
}
@ -625,6 +625,5 @@ cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke
register cmph_uint8 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n;
DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
if (h1 == h2 && ++h2 > n) h2 = 0;
DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m);
return (g_ptr[h1] + g_ptr[h2]);
}

View File

@ -904,7 +904,6 @@ static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char *
if (h1 == h2 && ++h2 >= n) h2 = 0;
mphf_bucket = g[h1] + g[h2];
DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0);
DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, g[h1], g[h2], >offset[h0], m);
DEBUGP("Address: %u\n", mphf_bucket + offset[h0]);
return (mphf_bucket + offset[h0]);
}

271
src/chd.c Normal file
View File

@ -0,0 +1,271 @@
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<math.h>
#include<time.h>
#include<assert.h>
#include<limits.h>
#include "cmph_structs.h"
#include "chd_structs.h"
#include "chd.h"
//#define DEBUG
#include "debug.h"
/**
 * Create the algorithm-specific configuration for CHD.
 *
 * The returned block wraps a nested cmph configuration, built over the same
 * key source as mph and switched to the CMPH_CHD_PH algorithm.
 *
 * \param mph generic configuration; only its key_source is read here
 * \return newly allocated configuration, released by chd_config_destroy()
 */
chd_config_data_t *chd_config_new(cmph_config_t *mph)
{
	/* calloc returns zero-filled storage, so no separate memset is required. */
	chd_config_data_t *cfg = (chd_config_data_t *)calloc((size_t)1, sizeof(chd_config_data_t));

	assert(cfg);
	cfg->chd_ph = cmph_config_new(mph->key_source);
	cmph_config_set_algo(cfg->chd_ph, CMPH_CHD_PH);
	return cfg;
}
/**
 * Release the CHD-specific configuration stored in mph->data, including the
 * nested CHD_PH configuration it holds (when one is present).
 *
 * \param mph generic configuration whose data field is a chd_config_data_t
 */
void chd_config_destroy(cmph_config_t *mph)
{
	chd_config_data_t *cfg = (chd_config_data_t *) mph->data;

	DEBUGP("Destroying algorithm dependent data\n");
	if (cfg->chd_ph != NULL)
	{
		cmph_config_destroy(cfg->chd_ph);
	}
	cfg->chd_ph = NULL;
	free(cfg);
}
/**
 * Forward the hash function selection to the nested CHD_PH configuration.
 *
 * \param mph       generic configuration whose data field is a chd_config_data_t
 * \param hashfuncs hash function list, passed through unchanged
 */
void chd_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	cmph_config_t *inner = ((chd_config_data_t *) mph->data)->chd_ph;

	cmph_config_set_hashfuncs(inner, hashfuncs);
}
/**
 * Forward the keys-per-bucket setting to the nested CHD_PH configuration.
 *
 * \param mph             generic configuration whose data field is a chd_config_data_t
 * \param keys_per_bucket value passed through unchanged
 */
void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket)
{
	cmph_config_t *inner = ((chd_config_data_t *) mph->data)->chd_ph;

	cmph_config_set_b(inner, keys_per_bucket);
}
/**
 * Forward the keys-per-bin setting to the nested CHD_PH configuration.
 *
 * \param mph          generic configuration whose data field is a chd_config_data_t
 * \param keys_per_bin value passed through unchanged
 */
void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
{
	cmph_config_t *inner = ((chd_config_data_t *) mph->data)->chd_ph;

	cmph_config_set_keys_per_bin(inner, keys_per_bin);
}
/**
 * Build a CHD function on top of a CHD_PH perfect hash function.
 *
 * Steps performed:
 *   1. generate a CHD_PH function through the nested configuration and keep
 *      only its packed form;
 *   2. collect the indexes of the bins whose occupancy counter is zero;
 *   3. build a compressed rank structure over those indexes and pack it.
 * chd_search() later subtracts the rank of a bin from the bin index.
 *
 * \param mph generic configuration whose data field is a chd_config_data_t
 * \param c   load factor forwarded to the CHD_PH construction
 * \return the new function (to be released with chd_destroy()), or NULL when
 *         the CHD_PH construction or any memory allocation fails
 */
cmph_t *chd_new(cmph_config_t *mph, double c)
{
	cmph_t *mphf = NULL;
	chd_data_t *chdf = NULL;
	chd_config_data_t *chd = (chd_config_data_t *)mph->data;
	chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)chd->chd_ph->data;
	compressed_rank_t cr;
	cmph_t *chd_phf = NULL;
	cmph_uint32 packed_chd_phf_size = 0;
	cmph_uint8 *packed_chd_phf = NULL;
	cmph_uint32 packed_cr_size = 0;
	cmph_uint8 *packed_cr = NULL;
	cmph_uint32 i, idx, nkeys, nvals, nbins;
	cmph_uint32 *vals_table = NULL;
	cmph_uint8 *occup_table = NULL;
#ifdef CMPH_TIMING
	double construction_time_begin = 0.0;
	double construction_time = 0.0;
	ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
#endif

	cmph_config_set_verbosity(chd->chd_ph, mph->verbosity);
	cmph_config_set_graphsize(chd->chd_ph, c);

	if (mph->verbosity)
	{
		fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c);
	}

	chd_phf = cmph_new(chd->chd_ph);
	if (chd_phf == NULL)
	{
		return NULL;
	}

	packed_chd_phf_size = cmph_packed_size(chd_phf);
	DEBUGP("packed_chd_phf_size = %u\n", packed_chd_phf_size);

	/* Make sure that we have enough space to pack the mphf. */
	packed_chd_phf = calloc((size_t)packed_chd_phf_size, (size_t)1);
	if (packed_chd_phf == NULL)
	{
		cmph_destroy(chd_phf);
		return NULL;
	}

	/* Pack the mphf; from here on only the packed copy is needed. */
	cmph_pack(chd_phf, packed_chd_phf);
	cmph_destroy(chd_phf);

	if (mph->verbosity)
	{
		fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n");
	}

	compressed_rank_init(&cr);
	nbins = chd_ph->n;
	nkeys = chd_ph->m;
	occup_table = chd_ph->occup_table;

	/* Size the table from the actual number of empty bins. The previous
	 * "nbins - nkeys" estimate only matches when every key occupies its own
	 * bin; with more than one key per bin there are more empty bins than that
	 * (and the subtraction can even wrap around), which overflowed vals_table. */
	nvals = 0;
	for (i = 0; i < nbins; i++)
	{
		if (occup_table[i] == 0)
		{
			nvals++;
		}
	}

	vals_table = (cmph_uint32 *)calloc(nvals, sizeof(cmph_uint32));
	if (vals_table == NULL && nvals > 0)
	{
		compressed_rank_destroy(&cr);
		free(packed_chd_phf);
		return NULL;
	}

	for (i = 0, idx = 0; i < nbins; i++)
	{
		if (occup_table[i] == 0)
		{
			vals_table[idx++] = i;
		}
	}

	compressed_rank_generate(&cr, vals_table, nvals);
	free(vals_table);
	vals_table = NULL;

	packed_cr_size = compressed_rank_packed_size(&cr);
	packed_cr = (cmph_uint8 *) calloc(packed_cr_size, sizeof(cmph_uint8));
	if (packed_cr == NULL && packed_cr_size > 0)
	{
		compressed_rank_destroy(&cr);
		free(packed_chd_phf);
		return NULL;
	}
	compressed_rank_pack(&cr, packed_cr);
	compressed_rank_destroy(&cr);

	mphf = (cmph_t *)malloc(sizeof(cmph_t));
	chdf = (chd_data_t *)malloc(sizeof(chd_data_t));
	if (mphf == NULL || chdf == NULL)
	{
		free(mphf);
		free(chdf);
		free(packed_cr);
		free(packed_chd_phf);
		return NULL;
	}

	mphf->algo = mph->algo;

	chdf->packed_cr = packed_cr;
	packed_cr = NULL; //transfer memory ownership

	chdf->packed_chd_phf = packed_chd_phf;
	packed_chd_phf = NULL; //transfer memory ownership

	chdf->packed_chd_phf_size = packed_chd_phf_size;
	chdf->packed_cr_size = packed_cr_size;

	mphf->data = chdf;
	mphf->size = nkeys;

	DEBUGP("Successfully generated minimal perfect hash\n");
	if (mph->verbosity)
	{
		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
	}
#ifdef CMPH_TIMING
	ELAPSED_TIME_IN_SECONDS(&construction_time);
	register cmph_uint32 space_usage = chd_packed_size(mphf)*8;
	construction_time = construction_time - construction_time_begin;
	fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", nkeys, c, chd_ph->keys_per_bucket, construction_time, space_usage/(double)nkeys);
#endif

	return mphf;
}
/**
 * Read the CHD-specific part of a dumped function from fd into mphf->data.
 *
 * The layout read is the one written by chd_dump(): the size of the packed
 * CHD_PH function, its bytes, the size of the packed compressed rank
 * structure, and its bytes.
 *
 * The function returns void, so failures cannot be reported to the caller.
 * On a short read or allocation failure a message is written to stderr, any
 * partially loaded buffers are released and the chd_data_t is left zeroed
 * (sizes 0, pointers NULL), so that chd_destroy() remains safe to call.
 * If the chd_data_t itself cannot be allocated, mphf->data is set to NULL.
 *
 * \param fd   stream positioned at the CHD-specific data
 * \param mphf function object whose data field is filled in
 */
void chd_load(FILE *fd, cmph_t *mphf)
{
	/* Zero-initialised so no field is ever read uninitialised on a short read. */
	chd_data_t *chd = (chd_data_t *)calloc((size_t)1, sizeof(chd_data_t));

	DEBUGP("Loading chd mphf\n");
	mphf->data = chd;
	if (chd == NULL)
	{
		fprintf(stderr, "chd_load: unable to allocate memory for the chd data\n");
		return;
	}

	if (fread(&chd->packed_chd_phf_size, sizeof(cmph_uint32), (size_t)1, fd) != 1)
	{
		goto fail;
	}
	DEBUGP("Loading CHD_PH perfect hash function with %u bytes from disk\n", chd->packed_chd_phf_size);
	chd->packed_chd_phf = (cmph_uint8 *) calloc((size_t)chd->packed_chd_phf_size, (size_t)1);
	/* fread() returns 0 for zero-sized items, so only check non-empty payloads. */
	if (chd->packed_chd_phf_size > 0)
	{
		if (chd->packed_chd_phf == NULL)
		{
			goto fail;
		}
		if (fread(chd->packed_chd_phf, chd->packed_chd_phf_size, (size_t)1, fd) != 1)
		{
			goto fail;
		}
	}

	if (fread(&chd->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd) != 1)
	{
		goto fail;
	}
	DEBUGP("Loading Compressed rank structure, which has %u bytes\n", chd->packed_cr_size);
	chd->packed_cr = (cmph_uint8 *) calloc((size_t)chd->packed_cr_size, (size_t)1);
	if (chd->packed_cr_size > 0)
	{
		if (chd->packed_cr == NULL)
		{
			goto fail;
		}
		if (fread(chd->packed_cr, chd->packed_cr_size, (size_t)1, fd) != 1)
		{
			goto fail;
		}
	}
	return;

fail:
	fprintf(stderr, "chd_load: unable to load the chd function (short read or out of memory)\n");
	free(chd->packed_chd_phf);
	free(chd->packed_cr);
	memset(chd, 0, sizeof(chd_data_t));
}
/**
 * Write the function to fd: the generic header (via __cmph_dump) followed by
 * the size and bytes of the packed CHD_PH function and the size and bytes of
 * the packed compressed rank structure.
 *
 * \param mphf function to dump; its data field is a chd_data_t
 * \param fd   destination stream
 * \return 1 on success, 0 when one of the writes of the CHD-specific part fails
 */
int chd_dump(cmph_t *mphf, FILE *fd)
{
	chd_data_t *data = (chd_data_t *)mphf->data;

	__cmph_dump(mphf, fd);

	// Dumping CHD_PH perfect hash function
	DEBUGP("Dumping CHD_PH perfect hash function with %u bytes to disk\n", data->packed_chd_phf_size);
	if (fwrite(&data->packed_chd_phf_size, sizeof(cmph_uint32), (size_t)1, fd) != 1)
	{
		return 0;
	}
	/* fwrite() returns 0 for zero-sized items, so only check non-empty payloads. */
	if (data->packed_chd_phf_size > 0 &&
	    fwrite(data->packed_chd_phf, data->packed_chd_phf_size, (size_t)1, fd) != 1)
	{
		return 0;
	}

	// Dumping compressed rank structure (the old message referenced an undeclared 'buflen')
	DEBUGP("Dumping compressed rank structure with %u bytes to disk\n", data->packed_cr_size);
	if (fwrite(&data->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd) != 1)
	{
		return 0;
	}
	if (data->packed_cr_size > 0 &&
	    fwrite(data->packed_cr, data->packed_cr_size, (size_t)1, fd) != 1)
	{
		return 0;
	}

	return 1;
}
/**
 * Free a function created by chd_new() or filled in by chd_load(): both
 * packed buffers, the chd_data_t that owns them, and the cmph_t itself.
 *
 * \param mphf function to release; must not be used afterwards
 */
void chd_destroy(cmph_t *mphf)
{
	chd_data_t *chdf = (chd_data_t *)mphf->data;

	free(chdf->packed_cr);
	free(chdf->packed_chd_phf);
	free(chdf);
	free(mphf);
}
/**
 * Shared lookup used by chd_search() and chd_search_packed().
 *
 * Queries the packed CHD_PH function for the bin of the key, then subtracts
 * the value the packed compressed rank structure reports for that bin.
 * NOTE(review): the rank is presumably the number of empty bins preceding
 * the bin (chd_new() feeds the empty-bin indexes to the rank structure) --
 * confirm against compressed_rank_query_packed().
 */
static inline cmph_uint32 _chd_search(void * packed_chd_phf, void * packed_cr, const char *key, cmph_uint32 keylen)
{
	const cmph_uint32 bin = cmph_search_packed(packed_chd_phf, key, keylen);
	const cmph_uint32 bin_rank = compressed_rank_query_packed(packed_cr, bin);

	return bin - bin_rank;
}
/**
 * Evaluate the function for a key, using the packed buffers owned by mphf.
 *
 * \param mphf   function whose data field is a chd_data_t
 * \param key    key to be hashed
 * \param keylen key length in bytes
 * \return the value computed by _chd_search() for the key
 */
cmph_uint32 chd_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	chd_data_t *chdf = mphf->data;
	void *phf_area = chdf->packed_chd_phf;
	void *cr_area = chdf->packed_cr;

	return _chd_search(phf_area, cr_area, key, keylen);
}
/**
 * Serialise the function into the caller-provided buffer packed_mphf.
 *
 * Layout written, in order:
 *   cmph_uint32 packed_cr_size | packed_cr bytes |
 *   cmph_uint32 packed_chd_phf_size | packed_chd_phf bytes
 *
 * \param mphf        function whose data field is a chd_data_t
 * \param packed_mphf destination buffer, large enough for the layout above
 */
void chd_pack(cmph_t *mphf, void *packed_mphf)
{
	chd_data_t *chdf = (chd_data_t *)mphf->data;
	cmph_uint8 *cursor = (cmph_uint8 *)packed_mphf;

	// packing packed_cr_size and packed_cr
	memcpy(cursor, &chdf->packed_cr_size, sizeof(cmph_uint32));
	cursor += sizeof(cmph_uint32);
	memcpy(cursor, chdf->packed_cr, chdf->packed_cr_size);
	cursor += chdf->packed_cr_size;

	// packing packed_chd_phf_size and packed_chd_phf
	memcpy(cursor, &chdf->packed_chd_phf_size, sizeof(cmph_uint32));
	cursor += sizeof(cmph_uint32);
	memcpy(cursor, chdf->packed_chd_phf, chdf->packed_chd_phf_size);
}
/**
 * Number of bytes reported as needed to pack mphf: one CMPH_ALGO tag, the two
 * cmph_uint32 length words, and both packed payloads.
 *
 * \param mphf function whose data field is a chd_data_t
 * \return the byte count, converted to cmph_uint32
 */
cmph_uint32 chd_packed_size(cmph_t *mphf)
{
	chd_data_t *chdf = (chd_data_t *)mphf->data;
	size_t total = sizeof(CMPH_ALGO);

	total += 2*sizeof(cmph_uint32);
	total += chdf->packed_cr_size;
	total += chdf->packed_chd_phf_size;
	return (cmph_uint32)total;
}
/**
 * Evaluate the function directly on a buffer laid out as chd_pack() writes it.
 *
 * The first word holds the size of the packed compressed rank structure,
 * which starts right after it; the packed CHD_PH function follows that
 * structure and one further length word.
 *
 * \param packed_mphf pointer to the packed CHD data
 * \param key         key to be hashed
 * \param keylen      key length in bytes
 * \return the value computed by _chd_search() for the key
 */
cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
{
	cmph_uint32 *words = packed_mphf;
	const cmph_uint32 cr_bytes = words[0];
	cmph_uint32 *cr_area = words + 1;
	/* Skip the rank structure and the length word stored after it. */
	cmph_uint8 *phf_area = ((cmph_uint8 *) cr_area) + cr_bytes + sizeof(cmph_uint32);

	return _chd_search(phf_area, cr_area, key, keylen);
}

59
src/chd.h Normal file
View File

@ -0,0 +1,59 @@
#ifndef _CMPH_CHD_H__
#define _CMPH_CHD_H__
#include "cmph.h"
typedef struct __chd_data_t chd_data_t;
typedef struct __chd_config_data_t chd_config_data_t;
/* Config API */
chd_config_data_t *chd_config_new(cmph_config_t * mph);
void chd_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
/** \fn void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
* \brief Allows to set the number of keys per bin.
* \param mph pointer to the configuration structure
* \param keys_per_bin value for the number of keys per bin
*/
void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin);
/** \fn void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket);
* \brief Allows to set the number of keys per bucket.
* \param mph pointer to the configuration structure
* \param keys_per_bucket value for the number of keys per bucket
*/
void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket);
void chd_config_destroy(cmph_config_t *mph);
/* Chd algorithm API */
cmph_t *chd_new(cmph_config_t *mph, double c);
void chd_load(FILE *fd, cmph_t *mphf);
int chd_dump(cmph_t *mphf, FILE *fd);
void chd_destroy(cmph_t *mphf);
cmph_uint32 chd_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
/** \fn void chd_pack(cmph_t *mphf, void *packed_mphf);
* \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf.
* \param mphf pointer to the resulting mphf
* \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size()
*/
void chd_pack(cmph_t *mphf, void *packed_mphf);
/** \fn cmph_uint32 chd_packed_size(cmph_t *mphf);
* \brief Return the amount of space needed to pack mphf.
* \param mphf pointer to a mphf
* \return the size of the packed function or zero for failures
*/
cmph_uint32 chd_packed_size(cmph_t *mphf);
/** \fn cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
* \brief Use the packed mphf to do a search.
* \param packed_mphf pointer to the packed mphf
* \param key key to be hashed
* \param keylen key length in bytes
* \return The mphf value
*/
cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
#endif

View File

@ -11,7 +11,7 @@
#include "chd_ph.h"
#include"miller_rabin.h"
#define DEBUG
//#define DEBUG
#include "debug.h"
// NO_ELEMENT is equivalent to null pointer
@ -99,8 +99,7 @@ static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * bu
static inline cmph_uint32 * chd_ph_ordering(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, cmph_uint32 max_bucket_size);
static inline cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size,
cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table,
cmph_uint8 * occup_table);
cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table);
static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r)
{
@ -146,10 +145,8 @@ chd_ph_config_data_t *chd_ph_config_new()
chd_ph->use_h = 1;
chd_ph->keys_per_bin = 1;
chd_ph->keys_per_bucket = 4;
chd_ph->occup_table = 0;
//The following fields are used just for statistics
chd_ph->space_usage = 0;
chd_ph->entropy = 0.0;
return chd_ph;
}
@ -157,6 +154,11 @@ void chd_ph_config_destroy(cmph_config_t *mph)
{
chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data;
DEBUGP("Destroying algorithm dependent data\n");
if(data->occup_table)
{
free(data->occup_table);
data->occup_table = NULL;
}
free(data);
}
@ -286,8 +288,8 @@ cmph_uint32 * chd_ph_ordering(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets, c
return sorted_lists;
}
static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint8 * occup_table,
cmph_uint32 probe0_num, cmph_uint32 probe1_num, cmph_uint32 bucket_num)
static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 probe0_num,
cmph_uint32 probe1_num, cmph_uint32 bucket_num)
{
register cmph_uint32 i;
register cmph_uint32 size = buckets[bucket_num].size;
@ -300,11 +302,11 @@ static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph
{
position = (item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n;
if(occup_table[position] >= chd_ph->keys_per_bin)
if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
{
break;
}
occup_table[position]++;
(chd_ph->occup_table[position])++;
item = item->next;
};
@ -319,7 +321,7 @@ static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph
break;
}
position = (item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n;
occup_table[position]--;
(chd_ph->occup_table[position])--;
item = item->next;
i--;
};
@ -329,7 +331,7 @@ static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph
};
static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_probes,
cmph_uint32 * disp_table, cmph_uint8 * occup_table, cmph_uint32 bucket_num)
cmph_uint32 * disp_table, cmph_uint32 bucket_num)
{
register cmph_uint32 probe0_num, probe1_num, probe_num;
@ -339,7 +341,7 @@ static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucke
while(1)
{
if(place_bucket_probe(chd_ph, buckets, occup_table, probe0_num, probe1_num, bucket_num))
if(place_bucket_probe(chd_ph, buckets, probe0_num, probe1_num, bucket_num))
{
disp_table[bucket_num] = probe0_num + probe1_num * chd_ph->n;
return 1;
@ -360,8 +362,7 @@ static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucke
};
static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size,
cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table,
cmph_uint8 * occup_table)
cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table)
{
register cmph_uint32 i = 0;
register cmph_uint32 curr_bucket = 0;
@ -371,7 +372,7 @@ static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_buc
curr_bucket = sorted_lists[i];
while(curr_bucket != NO_ELEMENT)
{
if(!place_bucket(chd_ph, buckets, max_probes, disp_table, occup_table, curr_bucket))
if(!place_bucket(chd_ph, buckets, max_probes, disp_table, curr_bucket))
{
return 0;
}
@ -383,8 +384,7 @@ static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_buc
};
static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size,
cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table,
cmph_uint8 * occup_table)
cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table)
{
register cmph_uint32 i;
register cmph_uint32 curr_bucket, prev_bucket;
@ -402,7 +402,7 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc
while(curr_bucket != NO_ELEMENT)
{
// if bucket is successfully placed remove it from list
if(place_bucket_probe(chd_ph, buckets, occup_table, probe0_num, probe1_num, curr_bucket))
if(place_bucket_probe(chd_ph, buckets, probe0_num, probe1_num, curr_bucket))
{
disp_table[curr_bucket] = probe0_num + probe1_num * chd_ph->n;
// DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]);
@ -440,29 +440,28 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc
};
cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, cmph_uint32 max_bucket_size,
cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table,
cmph_uint8 * occup_table)
cmph_uint32 *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table)
{
if(chd_ph->use_h)
{
return place_buckets2(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table);
return place_buckets2(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table);
}
else
{
return place_buckets1(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table);
return place_buckets1(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table);
}
}
static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets,
cmph_uint32 * disp_table, cmph_uint8 * occup_table)
cmph_uint32 * disp_table)
{
register cmph_uint32 i, j;
register cmph_uint32 position, probe0_num, probe1_num;
register cmph_uint32 m = 0;
register chd_ph_item_t * item;
memset(occup_table, 0, chd_ph->n);
memset(chd_ph->occup_table, 0, chd_ph->n);
for(i = 0; i < chd_ph->nbuckets; i++)
{
j = buckets[i].size;
@ -477,11 +476,11 @@ static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph,
}
m++;
position = (item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n;
if(occup_table[position] >= chd_ph->keys_per_bin)
if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
{
return 0;
}
occup_table[position]++;
(chd_ph->occup_table[position])++;
item = item->next;
};
};
@ -498,7 +497,7 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c)
register double load_factor = c;
register cmph_uint8 searching_success = 0;
register cmph_uint32 max_probes = 1 << 18; // default value for max_probes
register cmph_uint32 max_probes = 1 << 20; // default value for max_probes
register cmph_uint32 iterations = 100;
chd_ph_bucket_t * buckets = NULL;
chd_ph_item_t * items = NULL;
@ -506,7 +505,13 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c)
cmph_uint32 max_bucket_size = 0;
cmph_uint32 * sorted_lists = NULL;
cmph_uint32 * disp_table = NULL;
cmph_uint8 * occup_table;
register double space_lower_bound = 0;
#ifdef CMPH_TIMING
double construction_time_begin = 0.0;
double construction_time = 0.0;
ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
#endif
chd_ph->m = mph->key_source->nkeys;
DEBUGP("m = %u\n", chd_ph->m);
@ -539,10 +544,14 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c)
};
DEBUGP("n = %u \n", chd_ph->n);
if(mph->verbosity && chd_ph->keys_per_bin == 1)
if(chd_ph->keys_per_bin == 1)
{
fprintf(stderr, "space lower bound is %.3f bits per key\n", chd_ph_space_lower_bound(chd_ph->m, chd_ph->n));
space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n);
}
if(mph->verbosity)
{
fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound);
}
// We allocate the working tables
@ -550,7 +559,7 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c)
items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));
max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes);
occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32));
//
// init_genrand(time(0));
@ -588,12 +597,12 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c)
fprintf(stderr, "Starting searching step\n");
}
searching_success = chd_ph_searching(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table, occup_table);
searching_success = chd_ph_searching(chd_ph, buckets, max_bucket_size, sorted_lists, max_probes, disp_table);
if(searching_success) break;
// reset occup_table
memset(occup_table, 0, chd_ph->n);
memset(chd_ph->occup_table, 0, chd_ph->n);
if(iterations == 0)
{
// Cleanup memory
@ -607,15 +616,14 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c)
}
#ifdef DEBUG
chd_ph->entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
DEBUGP("Entropy = %.4f\n", chd_ph->entropy/chd_ph->m);
if(!chd_ph_check_bin_hashing(chd_ph, buckets, disp_table, occup_table))
{
if(!chd_ph_check_bin_hashing(chd_ph, buckets, disp_table))
{
DEBUGP("Error for bin packing generation");
return NULL;
};
DEBUGP("Error for bin packing generation");
return NULL;
}
}
#endif
if (mph->verbosity)
@ -630,16 +638,18 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c)
chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
compressed_seq_init(chd_ph->cs);
compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets);
chd_ph->space_usage = compressed_seq_get_space_usage(chd_ph->cs);
chd_ph->space_usage += 64;
DEBUGP("space_usage/key = %.4f\n", chd_ph->space_usage/(double)chd_ph->m);
#ifdef CMPH_TIMING
ELAPSED_TIME_IN_SECONDS(&construction_time);
register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m);
#endif
cleanup:
chd_ph_bucket_destroy(buckets);
free(items);
free(sorted_lists);
free(disp_table);
free(occup_table);
if(failure)
{
if(chd_ph->hl)
@ -670,6 +680,12 @@ cleanup:
fprintf(stderr, "Successfully generated minimal perfect hash function\n");
}
#ifdef CMPH_TIMING
register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8;
construction_time = construction_time - construction_time_begin;
fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m);
#endif
return mphf;
}

21
src/chd_structs.h Normal file
View File

@ -0,0 +1,21 @@
#ifndef __CMPH_CHD_STRUCTS_H__
#define __CMPH_CHD_STRUCTS_H__
#include "chd_structs_ph.h"
#include "chd_ph.h"
#include "compressed_rank.h"
/* Data held by a generated CHD function: two packed byte buffers and their sizes. */
struct __chd_data_t
{
cmph_uint32 packed_cr_size; // size in bytes of packed_cr
cmph_uint8 * packed_cr; // packed compressed rank structure to control the number of zeros in a bit vector
cmph_uint32 packed_chd_phf_size; // size in bytes of packed_chd_phf
cmph_uint8 * packed_chd_phf; // packed form of the underlying CHD_PH perfect hash function
};
/* CHD-specific configuration: a wrapper around a nested cmph configuration. */
struct __chd_config_data_t
{
cmph_config_t *chd_ph; // chd_ph algorithm must be used here
};
#endif

View File

@ -24,9 +24,6 @@ struct __chd_ph_config_data_t
cmph_uint8 use_h; // flag to indicate the of use of a heuristic (use_h = 1)
cmph_uint32 keys_per_bin;//maximum number of keys per bin
cmph_uint32 keys_per_bucket; // average number of keys per bucket
//The following fields are used just for statistics
cmph_uint32 space_usage;
double entropy;
cmph_uint8 *occup_table; // table that indicates occupied positions
};
#endif

View File

@ -225,11 +225,11 @@ int chm_dump(cmph_t *mphf, FILE *fd)
nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd);
nbytes = fwrite(data->g, sizeof(cmph_uint32)*data->n, (size_t)1, fd);
#ifdef DEBUG
/* #ifdef DEBUG
fprintf(stderr, "G: ");
for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
fprintf(stderr, "\n");
#endif
#endif*/
return 1;
}

View File

@ -8,6 +8,7 @@
#include "bdz.h" /* included -- Fabiano */
#include "bdz_ph.h" /* included -- Fabiano */
#include "chd_ph.h" /* included -- Fabiano */
#include "chd.h" /* included -- Fabiano */
#include <stdlib.h>
#include <assert.h>
@ -15,7 +16,7 @@
//#define DEBUG
#include "debug.h"
const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", NULL }; /* included -- Fabiano */
const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", "chd", NULL }; /* included -- Fabiano */
typedef struct
{
@ -325,6 +326,9 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo)
case CMPH_CHD_PH:
chd_ph_config_destroy(mph);
break;
case CMPH_CHD:
chd_config_destroy(mph);
break;
default:
assert(0);
}
@ -354,6 +358,9 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo)
case CMPH_CHD_PH:
mph->data = chd_ph_config_new();
break;
case CMPH_CHD:
mph->data = chd_config_new(mph);
break;
default:
assert(0);
}
@ -392,6 +399,10 @@ void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b)
{
chd_ph_config_set_b(mph, b);
}
else if (mph->algo == CMPH_CHD)
{
chd_config_set_b(mph, b);
}
}
void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
@ -400,6 +411,10 @@ void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
{
chd_ph_config_set_keys_per_bin(mph, keys_per_bin);
}
else if (mph->algo == CMPH_CHD)
{
chd_config_set_keys_per_bin(mph, keys_per_bin);
}
}
void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability)
@ -441,6 +456,9 @@ void cmph_config_destroy(cmph_config_t *mph)
case CMPH_CHD_PH: /* included -- Fabiano */
chd_ph_config_destroy(mph);
break;
case CMPH_CHD: /* included -- Fabiano */
chd_config_destroy(mph);
break;
default:
assert(0);
}
@ -481,6 +499,9 @@ void cmph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
case CMPH_CHD_PH: /* included -- Fabiano */
chd_ph_config_set_hashfuncs(mph, hashfuncs);
break;
case CMPH_CHD: /* included -- Fabiano */
chd_config_set_hashfuncs(mph, hashfuncs);
break;
default:
break;
}
@ -534,6 +555,10 @@ cmph_t *cmph_new(cmph_config_t *mph)
DEBUGP("Creating chd_ph hash\n");
mphf = chd_ph_new(mph, c);
break;
case CMPH_CHD: /* included -- Fabiano */
DEBUGP("Creating chd hash\n");
mphf = chd_new(mph, c);
break;
default:
assert(0);
}
@ -560,6 +585,8 @@ int cmph_dump(cmph_t *mphf, FILE *f)
return bdz_ph_dump(mphf, f);
case CMPH_CHD_PH: /* included -- Fabiano */
return chd_ph_dump(mphf, f);
case CMPH_CHD: /* included -- Fabiano */
return chd_dump(mphf, f);
default:
assert(0);
}
@ -607,6 +634,10 @@ cmph_t *cmph_load(FILE *f)
DEBUGP("Loading chd_ph algorithm dependent parts\n");
chd_ph_load(f, mphf);
break;
case CMPH_CHD: /* included -- Fabiano */
DEBUGP("Loading chd algorithm dependent parts\n");
chd_load(f, mphf);
break;
default:
assert(0);
}
@ -643,6 +674,9 @@ cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
case CMPH_CHD_PH: /* included -- Fabiano */
DEBUGP("chd_ph algorithm search\n");
return chd_ph_search(mphf, key, keylen);
case CMPH_CHD: /* included -- Fabiano */
DEBUGP("chd algorithm search\n");
return chd_search(mphf, key, keylen);
default:
assert(0);
}
@ -683,6 +717,9 @@ void cmph_destroy(cmph_t *mphf)
case CMPH_CHD_PH: /* included -- Fabiano */
chd_ph_destroy(mphf);
return;
case CMPH_CHD: /* included -- Fabiano */
chd_destroy(mphf);
return;
default:
assert(0);
}
@ -728,6 +765,9 @@ void cmph_pack(cmph_t *mphf, void *packed_mphf)
case CMPH_CHD_PH: /* included -- Fabiano */
chd_ph_pack(mphf, ptr);
break;
case CMPH_CHD: /* included -- Fabiano */
chd_pack(mphf, ptr);
break;
default:
assert(0);
}
@ -759,6 +799,8 @@ cmph_uint32 cmph_packed_size(cmph_t *mphf)
return bdz_ph_packed_size(mphf);
case CMPH_CHD_PH: /* included -- Fabiano */
return chd_ph_packed_size(mphf);
case CMPH_CHD: /* included -- Fabiano */
return chd_packed_size(mphf);
default:
assert(0);
}
@ -794,6 +836,8 @@ cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 k
return bdz_ph_search_packed(++ptr, key, keylen);
case CMPH_CHD_PH: /* included -- Fabiano */
return chd_ph_search_packed(++ptr, key, keylen);
case CMPH_CHD: /* included -- Fabiano */
return chd_search_packed(++ptr, key, keylen);
default:
assert(0);
}

View File

@ -101,6 +101,10 @@ cmph_uint32 cmph_packed_size(cmph_t *mphf);
*/
cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen);
// TIMING functions. To use the macro CMPH_TIMING must be defined
#include "cmph_time.h"
#ifdef __cplusplus
}
#endif

62
src/cmph_time.h Normal file
View File

@ -0,0 +1,62 @@
#ifdef ELAPSED_TIME_IN_SECONDS
#undef ELAPSED_TIME_IN_SECONDS
#endif
#ifdef ELAPSED_TIME_IN_uSECONDS
#undef ELAPSED_TIME_IN_uSECONDS
#endif
#ifdef WIN32
// include headers to use gettimeofday
#else
#ifdef __GNUC__
#include <sys/time.h>
#include <sys/resource.h>
#endif
#endif
#ifdef __GNUC__
#ifndef __CMPH_TIME_H__
#define __CMPH_TIME_H__
/**
 * Store the current gettimeofday() reading, expressed in seconds with a
 * fractional microsecond part, into *elapsed_time.
 * When gettimeofday() fails, *elapsed_time is left untouched.
 */
static inline void elapsed_time_in_seconds(double * elapsed_time)
{
	struct timeval now;

	if (gettimeofday(&now, NULL) >= 0)
	{
		const double whole_seconds = (double)now.tv_sec;
		const double fraction = (double)now.tv_usec/1000000.0;

		*elapsed_time = whole_seconds + fraction;
	}
}
/* No-op stand-in for elapsed_time_in_seconds(); the non-__GNUC__ branches of
 * the ELAPSED_TIME_IN_SECONDS macro selection expand to this name.
 * The empty parameter list is deliberate: call sites pass a pointer argument.
 * NOTE(review): this definition sits inside the #ifdef __GNUC__ region, while
 * the macros that use it are only selected when __GNUC__ is NOT defined --
 * confirm that non-GNU builds actually see a definition. */
static inline void dummy_elapsed_time_in_seconds()
{
}
/*
 * Stores the current time of day, expressed in microseconds, into
 * *elapsed_time.
 * If gettimeofday() reports an error, *elapsed_time is left untouched.
 */
static inline void elapsed_time_in_useconds(cmph_uint64 * elapsed_time)
{
	struct timeval e_time;

	if (gettimeofday(&e_time, NULL) < 0) {
		return;
	}
	/*
	 * Widen to 64 bits before multiplying: tv_sec has type time_t, and on
	 * platforms where that is a 32-bit type tv_sec*1000000 overflows.
	 */
	*elapsed_time = (cmph_uint64)e_time.tv_sec * 1000000 + (cmph_uint64)e_time.tv_usec;
}
/*
 * No-op stand-in for elapsed_time_in_useconds(): does nothing and returns.
 *
 * NOTE(review): the parameter list is empty, yet the ELAPSED_TIME_IN_uSECONDS
 * macro is presumably invoked with one pointer argument like its seconds
 * counterpart -- confirm, and give this function a matching parameter if so.
 */
static inline void dummy_elapsed_time_in_useconds()
{
	return;
}
#endif
#endif
/*
 * Choose what ELAPSED_TIME_IN_SECONDS / ELAPSED_TIME_IN_uSECONDS expand to.
 * Any earlier definition of either macro was #undef'ed at the top of this
 * header, so the selection is redone on every inclusion.
 *
 * NOTE(review): in this header the dummy_* functions are defined inside the
 * same "#ifdef __GNUC__" region as the real timers, so the two non-GNU
 * branches below name functions that are not defined for that configuration.
 */
#ifdef CMPH_TIMING
#ifdef __GNUC__
/* Timing requested, GNU-compatible compiler: use the gettimeofday-based timers. */
#define ELAPSED_TIME_IN_SECONDS elapsed_time_in_seconds
#define ELAPSED_TIME_IN_uSECONDS elapsed_time_in_useconds
#else
/* Timing requested, other compilers: map to the do-nothing functions. */
#define ELAPSED_TIME_IN_SECONDS dummy_elapsed_time_in_seconds
#define ELAPSED_TIME_IN_uSECONDS dummy_elapsed_time_in_useconds
#endif
#else
#ifdef __GNUC__
/*
 * Timing not requested, GNU-compatible compiler: the macros expand to nothing,
 * so a use such as ELAPSED_TIME_IN_SECONDS(&t); is left as the expression
 * statement (&t); which has no effect.
 */
#define ELAPSED_TIME_IN_SECONDS
#define ELAPSED_TIME_IN_uSECONDS
#else
/* Timing not requested, other compilers: map to the do-nothing functions. */
#define ELAPSED_TIME_IN_SECONDS dummy_elapsed_time_in_seconds
#define ELAPSED_TIME_IN_uSECONDS dummy_elapsed_time_in_useconds
#endif
#endif

View File

@ -35,7 +35,7 @@ typedef unsigned int cmph_uint32;
typedef enum { CMPH_HASH_JENKINS, CMPH_HASH_COUNT } CMPH_HASH;
extern const char *cmph_hash_names[];
typedef enum { CMPH_BMZ, CMPH_BMZ8, CMPH_CHM, CMPH_BRZ, CMPH_FCH,
CMPH_BDZ, CMPH_BDZ_PH, CMPH_CHD_PH, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */
CMPH_BDZ, CMPH_BDZ_PH, CMPH_CHD_PH, CMPH_CHD, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */
extern const char *cmph_names[];
#endif

321
src/compressed_rank.c Normal file
View File

@ -0,0 +1,321 @@
#include<stdlib.h>
#include<stdio.h>
#include<limits.h>
#include<string.h>
#include"compressed_rank.h"
#include"bitbool.h"
// #define DEBUG
#include"debug.h"
/*
 * Integer base-2 logarithm: returns how many times x can be shifted right by
 * one bit while it is still greater than 1, i.e. floor(log2(x)) for x >= 1.
 * Returns 0 for x == 0 as well.
 */
static inline cmph_uint32 compressed_rank_i_log2(cmph_uint32 x)
{
	cmph_uint32 shifts;

	for (shifts = 0; x > 1; x >>= 1)
	{
		shifts++;
	}
	return shifts;
}
/*
 * Puts cr into its empty state: all scalar fields are zeroed, the remainder
 * table pointer is cleared and the embedded select structure is initialized
 * through select_init().
 */
void compressed_rank_init(compressed_rank_t * cr)
{
	/* no remainder table allocated yet */
	cr->vals_rems = NULL;

	cr->rem_r = 0;
	cr->n = 0;
	cr->max_val = 0;

	select_init(&cr->sel);
}
/*
 * Releases what cr owns: the remainder table (the pointer is cleared so that
 * a later destroy or load does not free it again) and the embedded select
 * structure. The compressed_rank_t object itself is not freed.
 */
void compressed_rank_destroy(compressed_rank_t * cr)
{
	cmph_uint32 * rems = cr->vals_rems;

	cr->vals_rems = NULL;
	free(rems);

	select_destroy(&cr->sel);
}
/*
 * Builds the compressed rank structure for the n values in vals_table.
 *
 * Every value is split in two parts: the quotient (value >> rem_r) is encoded
 * in the select structure cr->sel, and the remainder (the rem_r low bits) is
 * stored in the packed table cr->vals_rems. rem_r is derived from
 * max_val / n and is forced to be at least 1.
 *
 * Preconditions (not checked here): n >= 1, because vals_table[n - 1] is read
 * as the maximum value and n is used as a divisor. The loop that fills
 * select_vec also relies on no value having a larger quotient than
 * vals_table[n - 1].
 * NOTE(review): vals_table is presumably sorted in non-decreasing order --
 * confirm against the callers.
 * NOTE(review): allocation failures are not detected, and a vals_rems table
 * left in cr by an earlier call is overwritten without being freed.
 *
 * \param cr points to the structure to fill
 * \param vals_table the values to index
 * \param n number of entries in vals_table
 */
void compressed_rank_generate(compressed_rank_t * cr, cmph_uint32 * vals_table, cmph_uint32 n)
{
	cmph_uint32 i, j;
	cmph_uint32 rems_mask;
	cmph_uint32 * select_vec = 0;

	cr->n = n;
	cr->max_val = vals_table[cr->n - 1];
	cr->rem_r = compressed_rank_i_log2(cr->max_val/cr->n);
	if(cr->rem_r == 0)
	{
		cr->rem_r = 1;
	}

	select_vec = (cmph_uint32 *) calloc(cr->max_val >> cr->rem_r, sizeof(cmph_uint32));
	cr->vals_rems = (cmph_uint32 *) calloc(BITS_TABLE_SIZE(cr->n, cr->rem_r), sizeof(cmph_uint32));

	/*
	 * Unsigned shift: rem_r can reach 31 (n == 1 with max_val >= 2^31), and
	 * shifting the signed constant 1 by 31 bits is undefined behaviour.
	 */
	rems_mask = (1U << cr->rem_r) - 1U;

	/* store the rem_r low bits of every value */
	for(i = 0; i < cr->n; i++)
	{
		set_bits_value(cr->vals_rems, i, vals_table[i] & rems_mask, cr->rem_r, rems_mask);
	}

	/* select_vec[i - 1] = index of the first value whose quotient is >= i */
	for(i = 1, j = 0; i <= cr->max_val >> cr->rem_r; i++)
	{
		while(i > (vals_table[j] >> cr->rem_r))
		{
			j++;
		}
		select_vec[i - 1] = j;
	}

	// FABIANO: before it was (cr->total_length >> cr->rem_r) + 1. But I wiped out the + 1 because
	// I changed the select structure to work up to m, instead of up to m - 1.
	select_generate(&cr->sel, select_vec, cr->max_val >> cr->rem_r, cr->n);

	free(select_vec);
}
/*
 * Computes the rank of idx in the structure built by
 * compressed_rank_generate().
 *
 * idx is split into a quotient (idx >> rem_r) and a remainder (its rem_r low
 * bits). The select structure gives the starting position for the quotient;
 * from there the stored remainders are scanned until a set bit is found in
 * the select bit vector or a stored remainder >= the remainder of idx is met.
 * NOTE(review): the result looks like the number of stored values that are
 * smaller than idx -- confirm against the callers.
 *
 * \param cr points to a generated (or loaded) compressed rank structure
 * \param idx the value whose rank is requested
 * \return cr->n when idx is greater than cr->max_val, otherwise the rank
 */
cmph_uint32 compressed_rank_query(compressed_rank_t * cr, cmph_uint32 idx)
{
	cmph_uint32 rems_mask;
	cmph_uint32 val_quot, val_rem;
	cmph_uint32 sel_res, rank;

	if(idx > cr->max_val)
	{
		return cr->n;
	}

	val_quot = idx >> cr->rem_r;
	/* unsigned shift: "1 << 31" on a signed int would be undefined behaviour */
	rems_mask = (1U << cr->rem_r) - 1U;
	val_rem = idx & rems_mask;

	if(val_quot == 0)
	{
		rank = sel_res = 0;
	}
	else
	{
		sel_res = select_query(&cr->sel, val_quot - 1) + 1;
		rank = sel_res - val_quot;
	}

	for(;;)
	{
		/* a set bit ends the run of values sharing this quotient */
		if(GETBIT32(cr->sel.bits_vec, sel_res))
		{
			break;
		}
		if(get_bits_value(cr->vals_rems, rank, cr->rem_r, rems_mask) >= val_rem)
		{
			break;
		}
		sel_res++;
		rank++;
	}

	return rank;
}
/*
 * Returns the space used by cr, in bits: what select_get_space_usage()
 * reports for the select structure, plus the remainder table, plus three
 * cmph_uint32 words.
 */
cmph_uint32 compressed_rank_get_space_usage(compressed_rank_t * cr)
{
	const size_t header_bits = 3*sizeof(cmph_uint32)*8;
	const size_t rems_bits = BITS_TABLE_SIZE(cr->n, cr->rem_r)*sizeof(cmph_uint32)*8;
	cmph_uint32 total_bits = select_get_space_usage(&cr->sel);

	total_bits += rems_bits;
	total_bits += header_bits;
	return total_bits;
}
/*
 * Serializes cr into a newly allocated buffer.
 *
 * The buffer is returned through *buf and its size in bytes through *buflen.
 * Layout written, in order: max_val, n, rem_r and buflen_sel (one cmph_uint32
 * each), then the buflen_sel bytes produced by select_dump(), then the
 * vals_rems table.
 *
 * If the buffer cannot be allocated, *buf is NULL, *buflen is set to UINT_MAX
 * and nothing else is done. On success the buffer is to be released with
 * free() (compressed_rank_pack does so).
 *
 * NOTE(review): the buffer is sized with select_packed_size() while the bytes
 * copied come from select_dump(); this assumes both report the same length --
 * confirm. The result of select_dump() is not checked either.
 */
void compressed_rank_dump(compressed_rank_t * cr, char **buf, cmph_uint32 *buflen)
{
register cmph_uint32 sel_size = select_packed_size(&(cr->sel));
register cmph_uint32 vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r) * sizeof(cmph_uint32);
register cmph_uint32 pos = 0;
char * buf_sel = 0;
cmph_uint32 buflen_sel = 0;
// four header words (max_val, n, rem_r, buflen_sel) + select bytes + remainder table
*buflen = 4*sizeof(cmph_uint32) + sel_size + vals_rems_size;
DEBUGP("sel_size = %u\n", sel_size);
DEBUGP("vals_rems_size = %u\n", vals_rems_size);
*buf = (char *)calloc(*buflen, sizeof(char));
if (!*buf)
{
// allocation failed: report it through the UINT_MAX sentinel
*buflen = UINT_MAX;
return;
}
// dumping max_val, n and rem_r
memcpy(*buf, &(cr->max_val), sizeof(cmph_uint32));
pos += sizeof(cmph_uint32);
DEBUGP("max_val = %u\n", cr->max_val);
memcpy(*buf + pos, &(cr->n), sizeof(cmph_uint32));
pos += sizeof(cmph_uint32);
DEBUGP("n = %u\n", cr->n);
memcpy(*buf + pos, &(cr->rem_r), sizeof(cmph_uint32));
pos += sizeof(cmph_uint32);
DEBUGP("rem_r = %u\n", cr->rem_r);
// dumping sel: its length in bytes first, then the bytes themselves
select_dump(&cr->sel, &buf_sel, &buflen_sel);
memcpy(*buf + pos, &buflen_sel, sizeof(cmph_uint32));
pos += sizeof(cmph_uint32);
DEBUGP("buflen_sel = %u\n", buflen_sel);
memcpy(*buf + pos, buf_sel, buflen_sel);
#ifdef DEBUG
// i is declared here and reused by the second DEBUG loop below
cmph_uint32 i = 0;
for(i = 0; i < buflen_sel; i++)
{
DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(*buf + pos + i));
}
#endif
pos += buflen_sel;
// the temporary buffer filled by select_dump is no longer needed
free(buf_sel);
// dumping vals_rems
memcpy(*buf + pos, cr->vals_rems, vals_rems_size);
#ifdef DEBUG
for(i = 0; i < vals_rems_size; i++)
{
DEBUGP("pos = %u -- vals_rems_size = %u -- vals_rems[%u] = %u\n", pos, vals_rems_size, i, *(*buf + pos + i));
}
#endif
pos += vals_rems_size;
DEBUGP("Dumped compressed rank structure with size %u bytes\n", *buflen);
}
/*
 * Rebuilds cr from a buffer laid out the way compressed_rank_dump() writes
 * it: max_val, n, rem_r and buflen_sel (one cmph_uint32 each), then
 * buflen_sel bytes handed to select_load(), then the vals_rems table.
 *
 * A non-null cr->vals_rems is freed before the new table is allocated, so cr
 * must hold either a null pointer (e.g. after compressed_rank_init) or a
 * table it owns.
 *
 * buflen is only used in the final debug message: the reads from buf are not
 * bounds-checked against it, and the result of calloc is not checked.
 */
void compressed_rank_load(compressed_rank_t * cr, const char *buf, cmph_uint32 buflen)
{
register cmph_uint32 pos = 0;
cmph_uint32 buflen_sel = 0;
register cmph_uint32 vals_rems_size = 0;
// loading max_val, n, and rem_r
memcpy(&(cr->max_val), buf, sizeof(cmph_uint32));
pos += sizeof(cmph_uint32);
DEBUGP("max_val = %u\n", cr->max_val);
memcpy(&(cr->n), buf + pos, sizeof(cmph_uint32));
pos += sizeof(cmph_uint32);
DEBUGP("n = %u\n", cr->n);
memcpy(&(cr->rem_r), buf + pos, sizeof(cmph_uint32));
pos += sizeof(cmph_uint32);
DEBUGP("rem_r = %u\n", cr->rem_r);
// loading sel: its length in bytes comes first, then the bytes themselves
memcpy(&buflen_sel, buf + pos, sizeof(cmph_uint32));
pos += sizeof(cmph_uint32);
DEBUGP("buflen_sel = %u\n", buflen_sel);
select_load(&cr->sel, buf + pos, buflen_sel);
#ifdef DEBUG
// i is declared here and reused by the second DEBUG loop below
cmph_uint32 i = 0;
for(i = 0; i < buflen_sel; i++)
{
DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(buf + pos + i));
}
#endif
pos += buflen_sel;
// loading vals_rems
if(cr->vals_rems)
{
free(cr->vals_rems);
}
// size in 32-bit words for the allocation...
vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r);
cr->vals_rems = (cmph_uint32 *) calloc(vals_rems_size, sizeof(cmph_uint32));
// ...then in bytes (4 per word) for the copy
vals_rems_size *= 4;
memcpy(cr->vals_rems, buf + pos, vals_rems_size);
#ifdef DEBUG
for(i = 0; i < vals_rems_size; i++)
{
DEBUGP("pos = %u -- vals_rems_size = %u -- vals_rems[%u] = %u\n", pos, vals_rems_size, i, *(buf + pos + i));
}
#endif
pos += vals_rems_size;
DEBUGP("Loaded compressed rank structure with size %u bytes\n", buflen);
}
/*
 * Packs cr into the preallocated memory area cr_packed, which must be at
 * least compressed_rank_packed_size(cr) bytes long.
 *
 * Nothing is written when cr or cr_packed is NULL, or when the temporary
 * buffer cannot be allocated by compressed_rank_dump().
 */
void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed)
{
	char *buf = NULL;
	cmph_uint32 buflen = 0;

	if (!cr || !cr_packed)
	{
		return;
	}

	compressed_rank_dump(cr, &buf, &buflen);
	if (!buf)
	{
		/*
		 * The dump could not allocate its buffer. In that case buflen is
		 * the UINT_MAX failure marker, not a length, so copying would read
		 * from a null pointer.
		 */
		return;
	}

	memcpy(cr_packed, buf, buflen);
	free(buf);
}
/*
 * Returns the number of bytes needed to pack cr: four cmph_uint32 header
 * words, the packed select structure and the remainder table.
 */
cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr)
{
	cmph_uint32 total = 4 * sizeof(cmph_uint32);

	total += select_packed_size(&cr->sel);
	total += BITS_TABLE_SIZE(cr->n, cr->rem_r) * sizeof(cmph_uint32);
	return total;
}
/*
 * Same computation as compressed_rank_query(), performed directly on a packed
 * structure without unpacking it into a compressed_rank_t.
 *
 * Packed layout read here: max_val, n, rem_r, buflen_sel (one cmph_uint32
 * each), then the packed select structure (buflen_sel bytes, its bit vector
 * starting two words in), then the remainder table.
 * NOTE(review): cr_packed is accessed as an array of cmph_uint32, so it is
 * presumably expected to be suitably aligned -- confirm.
 *
 * \param cr_packed is a pointer to a contiguous memory area
 * \param idx is an index to compute the rank
 * \return n when idx is greater than max_val, otherwise the rank of idx
 */
cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx)
{
	// unpacking cr_packed
	cmph_uint32 *ptr = (cmph_uint32 *)cr_packed;
	cmph_uint32 max_val = *ptr++;
	cmph_uint32 n = *ptr++;
	cmph_uint32 rem_r = *ptr++;
	cmph_uint32 buflen_sel = *ptr++;
	cmph_uint32 * sel_packed = ptr;
	cmph_uint32 * bits_vec = sel_packed + 2; // skipping n and m
	// buflen_sel is in bytes; the remainder table follows the select data
	cmph_uint32 * vals_rems = sel_packed + (buflen_sel >> 2);

	// compressed rank query computation
	cmph_uint32 rems_mask;
	cmph_uint32 val_quot, val_rem;
	cmph_uint32 sel_res, rank;

	if(idx > max_val)
	{
		return n;
	}

	val_quot = idx >> rem_r;
	/* unsigned shift: "1 << 31" on a signed int would be undefined behaviour */
	rems_mask = (1U << rem_r) - 1U;
	val_rem = idx & rems_mask;

	if(val_quot == 0)
	{
		rank = sel_res = 0;
	}
	else
	{
		sel_res = select_query_packed(sel_packed, val_quot - 1) + 1;
		rank = sel_res - val_quot;
	}

	for(;;)
	{
		/* a set bit ends the run of values sharing this quotient */
		if(GETBIT32(bits_vec, sel_res))
		{
			break;
		}
		if(get_bits_value(vals_rems, rank, rem_r, rems_mask) >= val_rem)
		{
			break;
		}
		sel_res++;
		rank++;
	}

	return rank;
}

55
src/compressed_rank.h Normal file
View File

@ -0,0 +1,55 @@
#ifndef __CMPH_COMPRESSED_RANK_H__
#define __CMPH_COMPRESSED_RANK_H__
#include "select.h"
/*
 * Compressed rank structure over a table of n values.
 * Each value is decomposed into two components: its quotient (value >> rem_r)
 * is encoded in the select structure sel, and its rem_r least significant
 * bits are stored in the table vals_rems.
 */
struct _compressed_rank_t
{
cmph_uint32 max_val; // last entry of the table the structure was generated from; queries above it return n
cmph_uint32 n; // number of values stored in vals_rems
// Width in bits of each cell of vals_rems, i.e. how many least significant bits
// of every value are kept there. compressed_rank_generate never sets it below 1.
cmph_uint32 rem_r;
select_t sel; // select structure holding the quotients (value >> rem_r)
cmph_uint32 * vals_rems; // table of n cells, each one of rem_r bits
};
typedef struct _compressed_rank_t compressed_rank_t;
/** \fn void compressed_rank_init(compressed_rank_t * cr);
 * \brief Resets every field of the structure and initializes its select structure.
 * \param cr points to the compressed_rank structure
 */
void compressed_rank_init(compressed_rank_t * cr);
/** \fn void compressed_rank_destroy(compressed_rank_t * cr);
 * \brief Frees the remainder table and destroys the select structure. The structure itself is not freed.
 * \param cr points to the compressed_rank structure
 */
void compressed_rank_destroy(compressed_rank_t * cr);
/** \fn void compressed_rank_generate(compressed_rank_t * cr, cmph_uint32 * vals_table, cmph_uint32 n);
 * \brief Builds the structure from a table of values. The last entry of the table is taken as the maximum value.
 * \param cr points to the compressed_rank structure
 * \param vals_table the values to be indexed
 * \param n number of entries in vals_table; it is used as a divisor, so it must not be zero
 */
void compressed_rank_generate(compressed_rank_t * cr, cmph_uint32 * vals_table, cmph_uint32 n);
/** \fn cmph_uint32 compressed_rank_query(compressed_rank_t * cr, cmph_uint32 idx);
 * \param cr points to the compressed_rank structure
 * \param idx is an index to compute the rank
 * \return the number of stored values when idx is greater than the maximum value, otherwise the rank of idx
 */
cmph_uint32 compressed_rank_query(compressed_rank_t * cr, cmph_uint32 idx);
/** \fn cmph_uint32 compressed_rank_get_space_usage(compressed_rank_t * cr);
 * \param cr points to the compressed_rank structure
 * \return the space used by the structure, in bits
 */
cmph_uint32 compressed_rank_get_space_usage(compressed_rank_t * cr);
/** \fn void compressed_rank_dump(compressed_rank_t * cr, char **buf, cmph_uint32 *buflen);
 * \brief Serializes the structure into a newly allocated buffer, to be released with free().
 * \param cr points to the compressed_rank structure
 * \param buf receives the allocated buffer, or NULL if the allocation failed
 * \param buflen receives the size of the buffer in bytes, or UINT_MAX if the allocation failed
 */
void compressed_rank_dump(compressed_rank_t * cr, char **buf, cmph_uint32 *buflen);
/** \fn void compressed_rank_load(compressed_rank_t * cr, const char *buf, cmph_uint32 buflen);
 * \brief Rebuilds the structure from a buffer written by compressed_rank_dump.
 * \param cr points to the compressed_rank structure; a remainder table it already holds is freed
 * \param buf the serialized structure
 * \param buflen size of buf in bytes (only used for debug output)
 */
void compressed_rank_load(compressed_rank_t * cr, const char *buf, cmph_uint32 buflen);
/** \fn void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed);
* \brief Support the ability to pack a compressed_rank structure into a preallocated contiguous memory space pointed by cr_packed.
* \param cr points to the compressed_rank structure
* \param cr_packed pointer to the contiguous memory area used to store the compressed_rank structure. The size of cr_packed must be at least @see compressed_rank_packed_size
*/
void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed);
/** \fn cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr);
* \brief Return the amount of space needed to pack a compressed_rank structure.
* \return the size of the packed compressed_rank structure or zero for failures
*/
cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr);
/** \fn cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx);
* \param cr_packed is a pointer to a contiguous memory area
* \param idx is an index to compute the rank
* \return an integer that represents the compressed_rank value.
*/
cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx);
#endif

View File

@ -10,7 +10,7 @@
// #define DEBUG
#include "debug.h"
static inline cmph_uint32 i_log2(cmph_uint32 x)
static inline cmph_uint32 compressed_seq_i_log2(cmph_uint32 x)
{
register cmph_uint32 res = 0;
@ -61,7 +61,7 @@ void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cm
}
else
{
lengths[i] = i_log2(vals_table[i] + 1);
lengths[i] = compressed_seq_i_log2(vals_table[i] + 1);
cs->total_length += lengths[i];
};
};
@ -82,7 +82,12 @@ void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cm
cs->total_length += lengths[i];
};
cs->rem_r = i_log2(cs->total_length/cs->n);
cs->rem_r = compressed_seq_i_log2(cs->total_length/cs->n);
if(cs->rem_r == 0)
{
cs->rem_r = 1;
}
if(cs->length_rems)
{
@ -118,7 +123,7 @@ cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs)
return 4 * sizeof(cmph_uint32) * 8 + space_usage;
}
cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx)
cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx)
{
register cmph_uint32 enc_idx, enc_length;
register cmph_uint32 rems_mask;
@ -156,7 +161,7 @@ cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx)
void compressed_seq_dump(compressed_seq_t * cs, char ** buf, cmph_uint32 * buflen)
{
register cmph_uint32 sel_size = select_get_space_usage(&cs->sel) >> 3;
register cmph_uint32 sel_size = select_packed_size(&(cs->sel));
register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r) * 4;
register cmph_uint32 store_table_size = ((cs->total_length + 31) >> 5) * 4;
register cmph_uint32 pos = 0;
@ -325,7 +330,7 @@ cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs)
}
cmph_int32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx)
cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx)
{
// unpacking cs_packed
register cmph_uint32 *ptr = (cmph_uint32 *)cs_packed;

View File

@ -38,13 +38,13 @@ void compressed_seq_destroy(compressed_seq_t * cs);
void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n);
/** \fn cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx);
/** \fn cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx);
* \brief Returns the value stored at index @see idx of the compressed sequence structure.
* \param cs points to the compressed sequence structure
* \param idx index to retrieve the value from
* \return the value stored at index @see idx of the compressed sequence structure
*/
cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx);
cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx);
/** \fn cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs);
@ -73,12 +73,12 @@ void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed);
cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs);
/** \fn cmph_int32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx);
/** \fn cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx);
* \brief Returns the value stored at index @see idx of the packed compressed sequence structure.
* \param cs_packed is a pointer to a contiguous memory area
* \param idx is the index to retrieve the value from
* \return the value stored at index @see idx of the packed compressed sequence structure
*/
cmph_int32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx);
cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx);
#endif

View File

@ -164,7 +164,7 @@ void select_generate(select_t * sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph
select_generate_sel_table(sel);
};
static inline cmph_int32 _select_query(cmph_uint8 * bits_table, cmph_uint32 * select_table, cmph_uint32 one_idx)
static inline cmph_uint32 _select_query(cmph_uint8 * bits_table, cmph_uint32 * select_table, cmph_uint32 one_idx)
{
register cmph_uint32 vec_bit_idx ,vec_byte_idx;
register cmph_uint32 part_sum, old_part_sum;
@ -187,13 +187,13 @@ static inline cmph_int32 _select_query(cmph_uint8 * bits_table, cmph_uint32 * se
return select_lookup_table[bits_table[vec_byte_idx - 1]][one_idx - old_part_sum] + ((vec_byte_idx-1) << 3);
}
cmph_int32 select_query(select_t * sel, cmph_uint32 one_idx)
cmph_uint32 select_query(select_t * sel, cmph_uint32 one_idx)
{
return _select_query((cmph_uint8 *)sel->bits_vec, sel->select_table, one_idx);
};
static inline cmph_int32 _select_next_query(cmph_uint8 * bits_table, cmph_uint32 vec_bit_idx)
static inline cmph_uint32 _select_next_query(cmph_uint8 * bits_table, cmph_uint32 vec_bit_idx)
{
register cmph_uint32 vec_byte_idx, one_idx;
register cmph_uint32 part_sum, old_part_sum;
@ -214,7 +214,7 @@ static inline cmph_int32 _select_next_query(cmph_uint8 * bits_table, cmph_uint32
return select_lookup_table[bits_table[(vec_byte_idx - 1)]][(one_idx - old_part_sum)] + ((vec_byte_idx - 1) << 3);
}
cmph_int32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx)
cmph_uint32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx)
{
return _select_next_query((cmph_uint8 *)sel->bits_vec, vec_bit_idx);
};
@ -315,12 +315,7 @@ cmph_uint32 select_packed_size(select_t *sel)
/** \fn cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 idx);
* \param sel_packed is a pointer to a contiguous memory area
* \param idx is the rank for which we want to calculate the inverse function select
* \return an integer that represents the select value of rank idx.
*/
cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 one_idx)
cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx)
{
register cmph_uint32 *ptr = (cmph_uint32 *)sel_packed;
register cmph_uint32 n = *ptr++;
@ -334,12 +329,7 @@ cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 one_idx)
}
/** \fn cmph_int32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx);
* \param sel_packed is a pointer to a contiguous memory area
* \param vec_bit_idx is a value prior computed by @see select_query_packed
* \return an integer that represents the next select value greater than @see vec_bit_idx.
*/
cmph_int32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx)
cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx)
{
register cmph_uint8 * bits_vec = (cmph_uint8 *)sel_packed;
bits_vec += 8; // skipping n and m

View File

@ -18,9 +18,9 @@ void select_destroy(select_t * sel);
void select_generate(select_t * sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph_uint32 m);
cmph_int32 select_query(select_t * sel, cmph_uint32 one_idx);
cmph_uint32 select_query(select_t * sel, cmph_uint32 one_idx);
cmph_int32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx);
cmph_uint32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx);
cmph_uint32 select_get_space_usage(select_t * sel);
@ -43,19 +43,19 @@ void select_pack(select_t *sel, void *sel_packed);
cmph_uint32 select_packed_size(select_t *sel);
/** \fn cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 idx);
/** \fn cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx);
* \param sel_packed is a pointer to a contiguous memory area
* \param one_idx is the rank for which we want to calculate the inverse function select
* \return an integer that represents the select value of rank idx.
*/
cmph_int32 select_query_packed(void * sel_packed, cmph_uint32 one_idx);
cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx);
/** \fn cmph_int32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx);
/** \fn cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx);
* \param sel_packed is a pointer to a contiguous memory area
* \param vec_bit_idx is a value prior computed by @see select_query_packed
* \return an integer that represents the next select value greater than @see vec_bit_idx.
*/
cmph_int32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx);
cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx);
#endif

View File

@ -1,4 +1,4 @@
noinst_PROGRAMS = graph_tests packed_mphf_tests mphf_tests select_tests compressed_seq_tests
noinst_PROGRAMS = graph_tests packed_mphf_tests mphf_tests select_tests compressed_seq_tests compressed_rank_tests
INCLUDES = -I../src/
@ -16,3 +16,6 @@ select_tests_LDADD = ../src/libcmph.la
compressed_seq_tests_SOURCES = compressed_seq_tests.c
compressed_seq_tests_LDADD = ../src/libcmph.la
compressed_rank_tests_SOURCES = compressed_rank_tests.c
compressed_rank_tests_LDADD = ../src/libcmph.la

View File

@ -27,7 +27,7 @@ int main(int argc, char **argv)
cmph_uint32 i = 0;
cmph_uint32 n = 20;
cmph_uint32 keys_vec[] = { 0, 1, 1, 1, 2, 2, 2, 3, 5, 5,
6, 6, 9, 9, 9, 12, 12, 13, 17, 10017};
6, 6, 9, 9, 9, 12, 12, 13, 17, 1077};
char *buf = NULL;
cmph_uint32 buflen = 0;
char * cs_packed = NULL;

View File

@ -154,6 +154,12 @@ int main(int argc, char **argv)
// testing the packed function
//check all keys
#ifdef CMPH_TIMING
double evaluation_time_begin = 0.0;
double evaluation_time = 0.0;
ELAPSED_TIME_IN_SECONDS(&evaluation_time_begin);
#endif
for (i = 0; i < source->nkeys; ++i)
{
cmph_uint32 h;
@ -179,6 +185,12 @@ int main(int argc, char **argv)
}
source->dispose(source->data, buf, buflen);
}
#ifdef CMPH_TIMING
ELAPSED_TIME_IN_SECONDS(&evaluation_time);
evaluation_time = evaluation_time - evaluation_time_begin;
fprintf(stdout, "%u\t%.2f\n", source->nkeys, evaluation_time);
#endif
free(packed_mphf);
cmph_destroy(mphf);
free(hashtable);