1
Fork 0

Changed the prefix czech to chm

This commit is contained in:
fc_botelho 2005-01-25 21:06:58 +00:00
parent 18516c9a11
commit d7ea6d6a3e
7 changed files with 381 additions and 22 deletions

View File

@ -14,7 +14,7 @@ libcmph_la_SOURCES = debug.h\
graph.h graph.c\ graph.h graph.c\
cmph.h cmph.c\ cmph.h cmph.c\
cmph_structs.h cmph_structs.c\ cmph_structs.h cmph_structs.c\
czech.h czech_structs.h czech.c\ chm.h chm_structs.h chm.c\
bmz.h bmz_structs.h bmz.c bmz.h bmz_structs.h bmz.c
libcmph_la_LDFLAGS = -version-info 0:0:0 libcmph_la_LDFLAGS = -version-info 0:0:0

317
src/chm.c Normal file
View File

@ -0,0 +1,317 @@
#include "graph.h"
#include "chm.h"
#include "cmph_structs.h"
#include "chm_structs.h"
#include "hash.h"
#include "bitbool.h"
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <netinet/in.h>
//#define DEBUG
#include "debug.h"
/* static const char bitmask[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; */
/* #define GETBIT(array, i) (array[(i) / 8] & bitmask[(i) % 8]) */
/* #define SETBIT(array, i) (array[(i) / 8] |= bitmask[(i) % 8]) */
/* #define UNSETBIT(array, i) (array[(i) / 8] &= (~(bitmask[(i) % 8]))) */
/* Forward declarations of the file-local helpers defined further down. */
/* Adds one graph edge per key; returns nonzero only if the resulting graph is acyclic. */
static int chm_gen_edges(cmph_config_t *mph);
/* Recursive walk from vertex v that fills chm->g for every not-yet-visited neighbor. */
static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v);
/*
 * Allocates the CHM-specific configuration record.
 *
 * Both hash functions default to CMPH_HASH_JENKINS and the graph, g and
 * hashes pointers start out NULL.  key_source is accepted for interface
 * symmetry but is not used here.
 *
 * Returns the new record, or NULL if the allocation fails (in builds
 * where assert() is compiled out).  The caller releases it with
 * chm_config_destroy().
 */
chm_config_data_t *chm_config_new(cmph_io_adapter_t *key_source)
{
    chm_config_data_t *chm = malloc(sizeof(*chm));

    (void)key_source;
    /* Check the allocation BEFORE touching the memory. */
    assert(chm);
    if (chm == NULL) return NULL;

    chm->hashfuncs[0] = CMPH_HASH_JENKINS;
    chm->hashfuncs[1] = CMPH_HASH_JENKINS;
    chm->g = NULL;
    chm->graph = NULL;
    chm->hashes = NULL;
    return chm;
}
/*
 * Releases the algorithm-dependent configuration stored in mph->data.
 * Only the record itself is freed; nothing it points to is touched.
 */
void chm_config_destroy(cmph_config_t *mph)
{
    chm_config_data_t *cfg = (chm_config_data_t *)mph->data;

    DEBUGP("Destroying algorithm dependent data\n");
    free(cfg);
}
/*
 * Copies hash function selectors from hashfuncs into the CHM
 * configuration.  Copying stops at the CMPH_HASH_COUNT terminator or
 * after two entries, whichever comes first, because chm only uses two
 * hash functions.
 */
void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
    chm_config_data_t *cfg = (chm_config_data_t *)mph->data;
    cmph_uint32 slot;

    /* The terminator is tested first, in the same order as the copy itself. */
    for (slot = 0; hashfuncs[slot] != CMPH_HASH_COUNT && slot < 2; ++slot)
    {
        cfg->hashfuncs[slot] = hashfuncs[slot];
    }
}
/*
 * Builds a CHM minimal perfect hash function for the keys in
 * mph->key_source.
 *
 * mph  configuration whose data field holds a chm_config_data_t.
 * c    ratio of graph vertices to keys; the graph gets ceil(c * nkeys)
 *      vertices and nkeys edges.
 *
 * Mapping step: two hash functions are drawn and one edge per key is
 * added to the graph.  If the graph has a self loop or a cycle the hash
 * functions are discarded and redrawn, up to 20 times.
 * Assignment step: every connected component is traversed and g[] is
 * filled so that the two endpoint values of an edge encode the edge id.
 *
 * Returns a newly allocated cmph_t that owns g and the hash states, or
 * NULL if no acyclic graph could be generated.
 */
cmph_t *chm_new(cmph_config_t *mph, float c)
{
    cmph_t *mphf = NULL;
    chm_data_t *chmf = NULL;
    cmph_uint32 i;
    cmph_uint32 iterations = 20; /* attempts left to draw an acyclic graph */
    cmph_uint8 *visited = NULL;
    chm_config_data_t *chm = (chm_config_data_t *)mph->data;

    chm->m = mph->key_source->nkeys;
    chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys);
    DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c);
    chm->graph = graph_new(chm->n, chm->m);
    DEBUGP("Created graph\n");

    /* Three slots: two hash states plus a NULL terminator. */
    chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *) * 3);
    assert(chm->hashes);
    for (i = 0; i < 3; ++i) chm->hashes[i] = NULL;

    //Mapping step
    if (mph->verbosity)
    {
        fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", chm->m, chm->n);
    }
    while (1)
    {
        int ok;
        chm->hashes[0] = hash_state_new(chm->hashfuncs[0], chm->n);
        chm->hashes[1] = hash_state_new(chm->hashfuncs[1], chm->n);
        ok = chm_gen_edges(mph);
        if (ok) break;

        --iterations;
        hash_state_destroy(chm->hashes[0]);
        chm->hashes[0] = NULL;
        hash_state_destroy(chm->hashes[1]);
        chm->hashes[1] = NULL;
        DEBUGP("%u iterations remaining\n", iterations);
        if (mph->verbosity)
        {
            fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations);
        }
        if (iterations == 0) break;
    }
    if (iterations == 0)
    {
        /* Release everything allocated above; chm_config_destroy() only
         * frees the record itself, so these would otherwise leak. */
        free(chm->hashes);
        chm->hashes = NULL;
        graph_destroy(chm->graph);
        chm->graph = NULL;
        return NULL;
    }

    //Assignment step
    if (mph->verbosity)
    {
        fprintf(stderr, "Starting assignment step\n");
    }
    DEBUGP("Assignment step\n");
    /* One bit per vertex, zero-initialised. */
    visited = (cmph_uint8 *)calloc(chm->n / 8 + 1, 1);
    assert(visited);
    free(chm->g);
    chm->g = (cmph_uint32 *)malloc(chm->n * sizeof(cmph_uint32));
    assert(chm->g);
    for (i = 0; i < chm->n; ++i)
    {
        if (!GETBIT(visited, i))
        {
            /* Root of a new component: anchor it at 0 and propagate. */
            chm->g[i] = 0;
            chm_traverse(chm, visited, i);
        }
    }
    graph_destroy(chm->graph);
    free(visited);
    chm->graph = NULL;

    mphf = (cmph_t *)malloc(sizeof(cmph_t));
    assert(mphf);
    mphf->algo = mph->algo;
    /* The result record is a chm_data_t, not a chm_config_data_t. */
    chmf = (chm_data_t *)malloc(sizeof(chm_data_t));
    assert(chmf);
    chmf->g = chm->g;
    chm->g = NULL; //transfer memory ownership
    chmf->hashes = chm->hashes;
    chm->hashes = NULL; //transfer memory ownership
    chmf->n = chm->n;
    chmf->m = chm->m;
    mphf->data = chmf;
    mphf->size = chm->m;
    DEBUGP("Successfully generated minimal perfect hash\n");
    if (mph->verbosity)
    {
        fprintf(stderr, "Successfully generated minimal perfect hash function\n");
    }
    return mphf;
}
/*
 * Depth-first walk used by the assignment step.
 *
 * Marks v as visited, then for every neighbor that has not been visited
 * yet sets g[neighbor] = edge_id(v, neighbor) - g[v] and recurses into
 * that neighbor.  g[v] must already be set by the caller.
 */
static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v)
{
    graph_iterator_t it = graph_neighbors_it(chm->graph, v);
    cmph_uint32 neighbor = 0;

    SETBIT(visited,v);
    DEBUGP("Visiting vertex %u\n", v);
    for (;;)
    {
        neighbor = graph_next_neighbor(chm->graph, &it);
        if (neighbor == GRAPH_NO_NEIGHBOR) break;

        DEBUGP("Visiting neighbor %u\n", neighbor);
        if (!GETBIT(visited,neighbor))
        {
            DEBUGP("Visiting neighbor %u\n", neighbor);
            DEBUGP("Visiting edge %u->%u with id %u\n", v, neighbor, graph_edge_id(chm->graph, v, neighbor));
            chm->g[neighbor] = graph_edge_id(chm->graph, v, neighbor) - chm->g[v];
            DEBUGP("g is %u (%u - %u mod %u)\n", chm->g[neighbor], graph_edge_id(chm->graph, v, neighbor), chm->g[v], chm->m);
            chm_traverse(chm, visited, neighbor);
        }
    }
}
/*
 * Mapping step helper: clears the graph, rewinds the key source and adds
 * one edge (h1, h2) per key, where h1 and h2 are the two hash values
 * reduced modulo the vertex count.
 *
 * When both hashes land on the same vertex, h2 is moved to the next
 * vertex (wrapping to 0).  If the endpoints are still equal the attempt
 * is abandoned.
 *
 * Returns 1 if every key produced an edge and the graph is acyclic,
 * 0 on a self loop or a cyclic graph.
 */
static int chm_gen_edges(cmph_config_t *mph)
{
    cmph_uint32 e;
    chm_config_data_t *chm = (chm_config_data_t *)mph->data;
    int cycles = 0;

    DEBUGP("Generating edges for %u vertices\n", chm->n);
    graph_clear_edges(chm->graph);
    mph->key_source->rewind(mph->key_source->data);
    for (e = 0; e < mph->key_source->nkeys; ++e)
    {
        cmph_uint32 h1, h2;
        cmph_uint32 keylen;
        char *key;

        mph->key_source->read(mph->key_source->data, &key, &keylen);
        h1 = hash(chm->hashes[0], key, keylen) % chm->n;
        h2 = hash(chm->hashes[1], key, keylen) % chm->n;
        if (h1 == h2)
        {
            /* Nudge h2 to the following vertex, wrapping around. */
            if (++h2 >= chm->n) h2 = 0;
        }
        if (h1 == h2)
        {
            /* e is an unsigned key index, so it is printed with %u. */
            if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e);
            mph->key_source->dispose(mph->key_source->data, key, keylen);
            return 0;
        }
        DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key);
        mph->key_source->dispose(mph->key_source->data, key, keylen);
        graph_add_edge(chm->graph, h1, h2);
    }
    cycles = graph_is_cyclic(chm->graph);
    if (mph->verbosity && cycles) fprintf(stderr, "Cyclic graph generated\n");
    DEBUGP("Looking for cycles: %u\n", cycles);
    return ! cycles;
}
int chm_dump(cmph_t *mphf, FILE *fd)
{
char *buf = NULL;
cmph_uint32 buflen;
cmph_uint32 nbuflen;
cmph_uint32 i;
cmph_uint32 two = htonl(2); //number of hash functions
chm_data_t *data = (chm_data_t *)mphf->data;
cmph_uint32 nn, nm;
__cmph_dump(mphf, fd);
fwrite(&two, sizeof(cmph_uint32), 1, fd);
hash_state_dump(data->hashes[0], &buf, &buflen);
DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
nbuflen = htonl(buflen);
fwrite(&nbuflen, sizeof(cmph_uint32), 1, fd);
fwrite(buf, buflen, 1, fd);
free(buf);
hash_state_dump(data->hashes[1], &buf, &buflen);
DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
nbuflen = htonl(buflen);
fwrite(&nbuflen, sizeof(cmph_uint32), 1, fd);
fwrite(buf, buflen, 1, fd);
free(buf);
nn = htonl(data->n);
fwrite(&nn, sizeof(cmph_uint32), 1, fd);
nm = htonl(data->m);
fwrite(&nm, sizeof(cmph_uint32), 1, fd);
for (i = 0; i < data->n; ++i)
{
cmph_uint32 ng = htonl(data->g[i]);
fwrite(&ng, sizeof(cmph_uint32), 1, fd);
}
#ifdef DEBUG
fprintf(stderr, "G: ");
for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
fprintf(stderr, "\n");
#endif
return 1;
}
/*
 * Reads the CHM-specific part of a dumped function from f and attaches
 * it to mphf->data.
 *
 * Expected layout (all integers in network byte order): hash function
 * count, each hash state as <length><bytes>, n, m, then n entries of g.
 * The hashes array gets one extra slot holding a NULL terminator.
 *
 * NOTE(review): the function returns void, so short reads cannot be
 * reported to the caller.  Counters are zero-initialised so that a
 * failed read leaves defined values instead of indeterminate ones.
 */
void chm_load(FILE *f, cmph_t *mphf)
{
    cmph_uint32 nhashes = 0;
    cmph_uint32 buflen = 0;
    cmph_uint32 i;
    char *buf = NULL;
    chm_data_t *chm = (chm_data_t *)malloc(sizeof(chm_data_t));

    assert(chm);
    chm->n = 0;
    chm->m = 0;
    DEBUGP("Loading chm mphf\n");
    mphf->data = chm;

    fread(&nhashes, sizeof(cmph_uint32), 1, f);
    nhashes = ntohl(nhashes);
    chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1));
    assert(chm->hashes);
    chm->hashes[nhashes] = NULL;
    DEBUGP("Reading %u hashes\n", nhashes);
    for (i = 0; i < nhashes; ++i)
    {
        hash_state_t *state = NULL;

        buflen = 0;
        fread(&buflen, sizeof(cmph_uint32), 1, f);
        buflen = ntohl(buflen);
        DEBUGP("Hash state has %u bytes\n", buflen);
        buf = (char *)malloc(buflen);
        fread(buf, buflen, 1, f);
        state = hash_state_load(buf, buflen);
        chm->hashes[i] = state;
        free(buf);
    }

    DEBUGP("Reading m and n\n");
    fread(&(chm->n), sizeof(cmph_uint32), 1, f);
    chm->n = ntohl(chm->n);
    fread(&(chm->m), sizeof(cmph_uint32), 1, f);
    chm->m = ntohl(chm->m);

    chm->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*chm->n);
    fread(chm->g, chm->n*sizeof(cmph_uint32), 1, f);
    /* g was stored big-endian; convert each entry to host order. */
    for (i = 0; i < chm->n; ++i) chm->g[i] = ntohl(chm->g[i]);
#ifdef DEBUG
    fprintf(stderr, "G: ");
    for (i = 0; i < chm->n; ++i) fprintf(stderr, "%u ", chm->g[i]);
    fprintf(stderr, "\n");
#endif
    return;
}
/*
 * Evaluates the CHM function for key.
 *
 * key     bytes of the key.
 * keylen  number of bytes in key.
 *
 * The two hash values select vertices h1 and h2; the result is
 * (g[h1] + g[h2]) % m.  When both hashes collide on the same vertex, h2
 * is moved to the next vertex and wraps to 0 at n, exactly as
 * chm_gen_edges() does while building the graph, so that lookup and
 * construction agree and h2 never indexes past the end of g.
 */
cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
    chm_data_t *chm = mphf->data;
    cmph_uint32 h1 = hash(chm->hashes[0], key, keylen) % chm->n;
    cmph_uint32 h2 = hash(chm->hashes[1], key, keylen) % chm->n;

    DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2);
    /* Valid vertex indices are 0..n-1, so wrap as soon as h2 reaches n. */
    if (h1 == h2 && ++h2 >= chm->n) h2 = 0;
    DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, chm->g[h1], chm->g[h2], chm->m);
    return (chm->g[h1] + chm->g[h2]) % chm->m;
}
/*
 * Frees a CHM function: the g array, both hash states, the hashes
 * array, the CHM data record and finally mphf itself.
 */
void chm_destroy(cmph_t *mphf)
{
    chm_data_t *chmf = (chm_data_t *)mphf->data;
    cmph_uint32 k;

    free(chmf->g);
    /* chm uses exactly two hash states. */
    for (k = 0; k < 2; ++k)
    {
        hash_state_destroy(chmf->hashes[k]);
    }
    free(chmf->hashes);
    free(chmf);
    free(mphf);
}

18
src/chm.h Normal file
View File

@ -0,0 +1,18 @@
// Public interface of the chm minimal perfect hash algorithm (implemented in chm.c).
#ifndef __CMPH_CHM_H__
#define __CMPH_CHM_H__
#include "cmph.h"
// Opaque handles; the struct layouts are defined in chm_structs.h.
typedef struct __chm_data_t chm_data_t;
typedef struct __chm_config_data_t chm_config_data_t;
// Allocates a configuration with both hash functions set to CMPH_HASH_JENKINS.
chm_config_data_t *chm_config_new(cmph_io_adapter_t *key_source);
// Copies at most two entries from hashfuncs, stopping at CMPH_HASH_COUNT.
void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs);
// Frees the configuration stored in mph->data.
void chm_config_destroy(cmph_config_t *mph);
// Builds the function over mph->key_source with ceil(c * nkeys) graph vertices;
// returns NULL if no acyclic graph could be generated.
cmph_t *chm_new(cmph_config_t *mph, float c);
// Reads the chm-specific data from f and stores it in mphf->data.
void chm_load(FILE *f, cmph_t *mphf);
// Writes mphf to f using network byte order for integers; returns nonzero on success.
int chm_dump(cmph_t *mphf, FILE *f);
// Frees mphf and everything its chm data owns.
void chm_destroy(cmph_t *mphf);
// Returns the hash value of key, reduced modulo the number of keys m.
cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen);
#endif

24
src/chm_structs.h Normal file
View File

@ -0,0 +1,24 @@
#ifndef __CMPH_CHM_STRUCTS_H__
#define __CMPH_CHM_STRUCTS_H__
#include "hash_state.h"
// Data of a finished chm function, as produced by chm_new() or chm_load()
// and consumed by chm_search(), chm_dump() and chm_destroy().
struct __chm_data_t
{
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
// n entries, one per vertex; chm_search() returns (g[h1] + g[h2]) % m
cmph_uint32 *g;
// hash states; chm_search() uses hashes[0] and hashes[1]
hash_state_t **hashes;
};
// Working state used by chm_new() while it constructs a function.
// NOTE(review): graph_t is not declared by anything this header includes
// directly; chm.c includes graph.h before this header — confirm other users do too.
struct __chm_config_data_t
{
// selected hash function ids; chm_config_new() defaults both to CMPH_HASH_JENKINS
CMPH_HASH hashfuncs[2];
cmph_uint32 m; //edges (words) count
cmph_uint32 n; //vertex count
// graph with n vertices and m edges; created and destroyed inside chm_new()
graph_t *graph;
// per-vertex values filled by the assignment step; ownership moves to the result
cmph_uint32 *g;
// hash states drawn in the mapping step; ownership moves to the result
hash_state_t **hashes;
};
#endif

View File

@ -1,6 +1,6 @@
#include "cmph.h" #include "cmph.h"
#include "cmph_structs.h" #include "cmph_structs.h"
#include "czech.h" #include "chm.h"
#include "bmz.h" #include "bmz.h"
//#include "bmz.h" /* included -- Fabiano */ //#include "bmz.h" /* included -- Fabiano */
@ -10,7 +10,7 @@
//#define DEBUG //#define DEBUG
#include "debug.h" #include "debug.h"
const char *cmph_names[] = { "bmz", "czech", NULL }; /* included -- Fabiano */ const char *cmph_names[] = { "bmz", "chm", NULL }; /* included -- Fabiano */
static int key_nlfile_read(void *data, char **key, cmph_uint32 *keylen) static int key_nlfile_read(void *data, char **key, cmph_uint32 *keylen)
{ {
@ -97,7 +97,7 @@ cmph_config_t *cmph_config_new(cmph_io_adapter_t *key_source)
cmph_config_t *mph = NULL; cmph_config_t *mph = NULL;
mph = __config_new(key_source); mph = __config_new(key_source);
assert(mph); assert(mph);
mph->algo = CMPH_CZECH; // default value mph->algo = CMPH_CHM; // default value
return mph; return mph;
} }
@ -111,8 +111,8 @@ void cmph_config_destroy(cmph_config_t *mph)
DEBUGP("Destroying mph with algo %s\n", cmph_names[mph->algo]); DEBUGP("Destroying mph with algo %s\n", cmph_names[mph->algo]);
switch (mph->algo) switch (mph->algo)
{ {
case CMPH_CZECH: case CMPH_CHM:
czech_config_destroy(mph); chm_config_destroy(mph);
break; break;
case CMPH_BMZ: /* included -- Fabiano */ case CMPH_BMZ: /* included -- Fabiano */
bmz_config_destroy(mph); bmz_config_destroy(mph);
@ -132,8 +132,8 @@ void cmph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{ {
switch (mph->algo) switch (mph->algo)
{ {
case CMPH_CZECH: case CMPH_CHM:
czech_config_set_hashfuncs(mph, hashfuncs); chm_config_set_hashfuncs(mph, hashfuncs);
break; break;
case CMPH_BMZ: /* included -- Fabiano */ case CMPH_BMZ: /* included -- Fabiano */
bmz_config_set_hashfuncs(mph, hashfuncs); bmz_config_set_hashfuncs(mph, hashfuncs);
@ -157,11 +157,11 @@ cmph_t *cmph_new(cmph_config_t *mph)
DEBUGP("Creating mph with algorithm %s\n", cmph_names[mph->algo]); DEBUGP("Creating mph with algorithm %s\n", cmph_names[mph->algo]);
switch (mph->algo) switch (mph->algo)
{ {
case CMPH_CZECH: case CMPH_CHM:
DEBUGP("Creating czech hash\n"); DEBUGP("Creating chm hash\n");
mph->data = czech_config_new(mph->key_source); mph->data = chm_config_new(mph->key_source);
if (c == 0) c = 2.09; if (c == 0) c = 2.09;
mphf = czech_new(mph, c); mphf = chm_new(mph, c);
break; break;
case CMPH_BMZ: /* included -- Fabiano */ case CMPH_BMZ: /* included -- Fabiano */
DEBUGP("Creating bmz hash\n"); DEBUGP("Creating bmz hash\n");
@ -179,8 +179,8 @@ int cmph_dump(cmph_t *mphf, FILE *f)
{ {
switch (mphf->algo) switch (mphf->algo)
{ {
case CMPH_CZECH: case CMPH_CHM:
return czech_dump(mphf, f); return chm_dump(mphf, f);
break; break;
case CMPH_BMZ: /* included -- Fabiano */ case CMPH_BMZ: /* included -- Fabiano */
return bmz_dump(mphf, f); return bmz_dump(mphf, f);
@ -201,8 +201,8 @@ cmph_t *cmph_load(FILE *f)
switch (mphf->algo) switch (mphf->algo)
{ {
case CMPH_CZECH: case CMPH_CHM:
czech_load(f, mphf); chm_load(f, mphf);
break; break;
case CMPH_BMZ: /* included -- Fabiano */ case CMPH_BMZ: /* included -- Fabiano */
DEBUGP("Loading bmz algorithm dependent parts\n"); DEBUGP("Loading bmz algorithm dependent parts\n");
@ -221,8 +221,8 @@ cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
DEBUGP("mphf algorithm: %u \n", mphf->algo); DEBUGP("mphf algorithm: %u \n", mphf->algo);
switch(mphf->algo) switch(mphf->algo)
{ {
case CMPH_CZECH: case CMPH_CHM:
return czech_search(mphf, key, keylen); return chm_search(mphf, key, keylen);
case CMPH_BMZ: /* included -- Fabiano */ case CMPH_BMZ: /* included -- Fabiano */
DEBUGP("bmz algorithm search\n"); DEBUGP("bmz algorithm search\n");
return bmz_search(mphf, key, keylen); return bmz_search(mphf, key, keylen);
@ -242,8 +242,8 @@ void cmph_destroy(cmph_t *mphf)
{ {
switch(mphf->algo) switch(mphf->algo)
{ {
case CMPH_CZECH: case CMPH_CHM:
czech_destroy(mphf); chm_destroy(mphf);
return; return;
case CMPH_BMZ: /* included -- Fabiano */ case CMPH_BMZ: /* included -- Fabiano */
bmz_destroy(mphf); bmz_destroy(mphf);

View File

@ -8,7 +8,7 @@ typedef unsigned int cmph_uint32;
typedef enum { CMPH_HASH_DJB2, CMPH_HASH_FNV, CMPH_HASH_GLIB, CMPH_HASH_JENKINS, typedef enum { CMPH_HASH_DJB2, CMPH_HASH_FNV, CMPH_HASH_GLIB, CMPH_HASH_JENKINS,
CMPH_HASH_PJW, CMPH_HASH_SDBM, CMPH_HASH_COUNT } CMPH_HASH; CMPH_HASH_PJW, CMPH_HASH_SDBM, CMPH_HASH_COUNT } CMPH_HASH;
extern const char *cmph_hash_names[]; extern const char *cmph_hash_names[];
typedef enum { CMPH_BMZ, CMPH_CZECH, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */ typedef enum { CMPH_BMZ, CMPH_CHM, CMPH_COUNT } CMPH_ALGO; /* included -- Fabiano */
extern const char *cmph_names[]; extern const char *cmph_names[];
#endif #endif

View File

@ -54,7 +54,7 @@ int main(int argc, char **argv)
CMPH_HASH *hashes = NULL; CMPH_HASH *hashes = NULL;
cmph_uint32 nhashes = 0; cmph_uint32 nhashes = 0;
cmph_uint32 i; cmph_uint32 i;
CMPH_ALGO mph_algo = CMPH_CZECH; CMPH_ALGO mph_algo = CMPH_CHM;
float c = 2.09; float c = 2.09;
cmph_config_t *config = NULL; cmph_config_t *config = NULL;
cmph_t *mphf = NULL; cmph_t *mphf = NULL;