2005-09-24 00:43:33 +03:00
# include "graph.h"
2005-09-24 01:31:02 +03:00
# include "hashtree.h"
2005-09-24 00:43:33 +03:00
# include "cmph_structs.h"
2005-09-24 01:31:02 +03:00
# include "hastree_structs.h"
2005-09-24 00:43:33 +03:00
# include "hash.h"
# include "bitbool.h"
# include <math.h>
# include <stdlib.h>
# include <stdio.h>
# include <assert.h>
# include <string.h>
//#define DEBUG
# include "debug.h"
2005-09-24 01:31:02 +03:00
hashtree_config_data_t * hashtree_config_new ( )
2005-09-24 00:43:33 +03:00
{
2005-09-24 01:31:02 +03:00
hashtree_config_data_t * hashtree ;
hashtree = ( hashtree_config_data_t * ) malloc ( sizeof ( hashtree_config_data_t ) ) ;
if ( ! hashtree ) return NULL ;
memset ( hashtree , 0 , sizeof ( hashtree_config_data_t ) ) ;
hashtree - > hashfuncs [ 0 ] = CMPH_HASH_JENKINS ;
hashtree - > hashfuncs [ 1 ] = CMPH_HASH_JENKINS ;
hashtree - > hashfuncs [ 2 ] = CMPH_HASH_JENKINS ;
hashtree - > memory = 32 * 1024 * 1024 ;
return hashtree ;
2005-09-24 00:43:33 +03:00
}
2005-09-24 01:31:02 +03:00
/*
 * Release the algorithm dependent configuration stored in mph->data.
 *
 * Only the configuration object itself is freed; mph->data is not reset,
 * so it must not be dereferenced after this call.
 */
void hashtree_config_destroy(cmph_config_t *mph)
{
	DEBUGP(" Destroying algorithm dependent data \n ");
	free(mph->data);
}
2005-09-24 01:31:02 +03:00
/*
 * Copy the caller's hash function selection into the hashtree
 * configuration held in mph->data.
 *
 * hashfuncs is an array terminated by CMPH_HASH_COUNT. At most the first
 * three entries are used; slots beyond the supplied entries keep their
 * previous value.
 */
void hashtree_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
{
	hashtree_config_data_t *hashtree = (hashtree_config_data_t *)mph->data;
	cmph_uint32 i;

	// hashtree only uses three hash functions; the bound is tested before
	// the element is read so that hashfuncs[3] is never touched when the
	// caller passes exactly three entries without a terminator
	for (i = 0; i < 3 && hashfuncs[i] != CMPH_HASH_COUNT; ++i)
	{
		hashtree->hashfuncs[i] = hashfuncs[i];
	}
}
2005-09-24 01:31:02 +03:00
cmph_t * hashtree_new ( cmph_config_t * mph , float c )
2005-09-24 00:43:33 +03:00
{
cmph_t * mphf = NULL ;
2005-09-24 01:31:02 +03:00
hashtree_data_t * hashtreef = NULL ;
2005-09-24 00:43:33 +03:00
cmph_uint32 i ;
cmph_uint32 iterations = 20 ;
cmph_uint8 * visited = NULL ;
2005-09-24 01:31:02 +03:00
hashtree_config_data_t * hashtree = ( hashtree_config_data_t * ) mph - > data ;
hashtree - > m = mph - > key_source - > nkeys ;
hashtree - > n = ceil ( c * mph - > key_source - > nkeys ) ;
DEBUGP ( " m (edges): %u n (vertices): %u c: %f \n " , hashtree - > m , hashtree - > n , c ) ;
hashtree - > graph = graph_new ( hashtree - > n , hashtree - > m ) ;
2005-09-24 00:43:33 +03:00
DEBUGP ( " Created graph \n " ) ;
2005-09-24 01:31:02 +03:00
hashtree - > hashes = ( hash_state_t * * ) malloc ( sizeof ( hash_state_t * ) * 3 ) ;
for ( i = 0 ; i < 3 ; + + i ) hashtree - > hashes [ i ] = NULL ;
2005-09-24 00:43:33 +03:00
//Mapping step
if ( mph - > verbosity )
{
2005-09-24 01:31:02 +03:00
fprintf ( stderr , " Entering mapping step for mph creation of %u keys with graph sized %u \n " , hashtree - > m , hashtree - > n ) ;
2005-09-24 00:43:33 +03:00
}
while ( 1 )
{
int ok ;
2005-09-24 01:31:02 +03:00
hashtree - > hashes [ 0 ] = hash_state_new ( hashtree - > hashfuncs [ 0 ] , hashtree - > n ) ;
hashtree - > hashes [ 1 ] = hash_state_new ( hashtree - > hashfuncs [ 1 ] , hashtree - > n ) ;
ok = hashtree_gen_edges ( mph ) ;
2005-09-24 00:43:33 +03:00
if ( ! ok )
{
- - iterations ;
2005-09-24 01:31:02 +03:00
hash_state_destroy ( hashtree - > hashes [ 0 ] ) ;
hashtree - > hashes [ 0 ] = NULL ;
hash_state_destroy ( hashtree - > hashes [ 1 ] ) ;
hashtree - > hashes [ 1 ] = NULL ;
2005-09-24 00:43:33 +03:00
DEBUGP ( " %u iterations remaining \n " , iterations ) ;
if ( mph - > verbosity )
{
fprintf ( stderr , " Acyclic graph creation failure - %u iterations remaining \n " , iterations ) ;
}
if ( iterations = = 0 ) break ;
}
else break ;
}
if ( iterations = = 0 )
{
2005-09-24 01:31:02 +03:00
graph_destroy ( hashtree - > graph ) ;
2005-09-24 00:43:33 +03:00
return NULL ;
}
//Assignment step
if ( mph - > verbosity )
{
fprintf ( stderr , " Starting assignment step \n " ) ;
}
DEBUGP ( " Assignment step \n " ) ;
2005-09-24 01:31:02 +03:00
visited = ( char * ) malloc ( hashtree - > n / 8 + 1 ) ;
memset ( visited , 0 , hashtree - > n / 8 + 1 ) ;
free ( hashtree - > g ) ;
hashtree - > g = malloc ( hashtree - > n * sizeof ( cmph_uint32 ) ) ;
assert ( hashtree - > g ) ;
for ( i = 0 ; i < hashtree - > n ; + + i )
2005-09-24 00:43:33 +03:00
{
if ( ! GETBIT ( visited , i ) )
{
2005-09-24 01:31:02 +03:00
hashtree - > g [ i ] = 0 ;
hashtree_traverse ( hashtree , visited , i ) ;
2005-09-24 00:43:33 +03:00
}
}
2005-09-24 01:31:02 +03:00
graph_destroy ( hashtree - > graph ) ;
2005-09-24 00:43:33 +03:00
free ( visited ) ;
2005-09-24 01:31:02 +03:00
hashtree - > graph = NULL ;
2005-09-24 00:43:33 +03:00
mphf = ( cmph_t * ) malloc ( sizeof ( cmph_t ) ) ;
mphf - > algo = mph - > algo ;
2005-09-24 01:31:02 +03:00
hashtreef = ( hashtree_data_t * ) malloc ( sizeof ( hashtree_data_t ) ) ;
hashtreef - > g = hashtree - > g ;
hashtree - > g = NULL ; //transfer memory ownership
hashtreef - > hashes = hashtree - > hashes ;
hashtree - > hashes = NULL ; //transfer memory ownership
hashtreef - > n = hashtree - > n ;
hashtreef - > m = hashtree - > m ;
mphf - > data = hashtreef ;
mphf - > size = hashtree - > m ;
2005-09-24 00:43:33 +03:00
DEBUGP ( " Successfully generated minimal perfect hash \n " ) ;
if ( mph - > verbosity )
{
fprintf ( stderr , " Successfully generated minimal perfect hash function \n " ) ;
}
return mphf ;
}
2005-09-24 01:31:02 +03:00
/*
 * Depth-first walk over the graph starting at vertex v.
 *
 * v is marked in the visited bit set; for every neighbor that is not yet
 * marked, g[neighbor] is set to (id of edge v-neighbor) - g[v] and the
 * walk recurses into that neighbor. g[v] must already hold its final
 * value when this function is entered.
 */
static void hashtree_traverse(hashtree_config_data_t *hashtree, cmph_uint8 *visited, cmph_uint32 v)
{
	graph_iterator_t it = graph_neighbors_it(hashtree->graph, v);
	cmph_uint32 neighbor = 0;

	SETBIT(visited, v);
	DEBUGP(" Visiting vertex %u \n ", v);
	for (;;)
	{
		cmph_uint32 edge;

		neighbor = graph_next_neighbor(hashtree->graph, &it);
		if (neighbor == GRAPH_NO_NEIGHBOR) break;

		DEBUGP(" Visiting neighbor %u \n ", neighbor);
		if (!GETBIT(visited, neighbor))
		{
			DEBUGP(" Visiting neighbor %u \n ", neighbor);
			edge = graph_edge_id(hashtree->graph, v, neighbor);
			DEBUGP(" Visiting edge %u->%u with id %u \n ", v, neighbor, edge);
			hashtree->g[neighbor] = edge - hashtree->g[v];
			DEBUGP(" g is %u (%u - %u mod %u) \n ", hashtree->g[neighbor], edge, hashtree->g[v], hashtree->m);
			hashtree_traverse(hashtree, visited, neighbor);
		}
	}
}
2005-09-24 01:31:02 +03:00
/*
 * Rebuild the graph edges from the key source.
 *
 * The graph is cleared, the key source is rewound, and each key is hashed
 * with hashes[0] and hashes[1] (both reduced modulo n) to obtain the two
 * end points of its edge. Equal end points are separated by moving the
 * second one to the next vertex, wrapping to 0 at n.
 *
 * Returns 1 when all edges were added and graph_is_cyclic() reports no
 * cycle, 0 when a self loop could not be avoided or the graph is cyclic.
 */
static int hashtree_gen_edges(cmph_config_t *mph)
{
	hashtree_config_data_t *cfg = (hashtree_config_data_t *)mph->data;
	cmph_uint32 edge;
	int cycles = 0;

	DEBUGP(" Generating edges for %u vertices with hash functions %s and %s \n ", cfg->n, cmph_hash_names[cfg->hashfuncs[0]], cmph_hash_names[cfg->hashfuncs[1]]);
	graph_clear_edges(cfg->graph);
	mph->key_source->rewind(mph->key_source->data);

	for (edge = 0; edge < mph->key_source->nkeys; ++edge)
	{
		char *key;
		cmph_uint32 keylen;
		cmph_uint32 h1, h2;

		mph->key_source->read(mph->key_source->data, &key, &keylen);
		h1 = hash(cfg->hashes[0], key, keylen) % cfg->n;
		h2 = hash(cfg->hashes[1], key, keylen) % cfg->n;

		// nudge the second end point away from the first one
		if (h1 == h2 && ++h2 >= cfg->n) h2 = 0;

		// still equal: only possible when there is a single vertex
		if (h1 == h2)
		{
			if (mph->verbosity) fprintf(stderr, " Self loop for key %u \n ", edge);
			mph->key_source->dispose(mph->key_source->data, key, keylen);
			return 0;
		}

		DEBUGP(" Adding edge: %u -> %u for key %s \n ", h1, h2, key);
		mph->key_source->dispose(mph->key_source->data, key, keylen);
		graph_add_edge(cfg->graph, h1, h2);
	}

	cycles = graph_is_cyclic(cfg->graph);
	if (mph->verbosity && cycles) fprintf(stderr, " Cyclic graph generated \n ");
	DEBUGP(" Looking for cycles: %u \n ", cycles);
	return !cycles;
}
2005-09-24 01:31:02 +03:00
/*
 * Serialize a hashtree minimal perfect hash function to fd.
 *
 * After the common header written by __cmph_dump() the layout is:
 *   cmph_uint32          number of hash states (always 2)
 *   per hash state:      cmph_uint32 length, then that many bytes
 *   cmph_uint32          n
 *   cmph_uint32          m
 *   cmph_uint32[n]       g
 *
 * Returns 1 when every write succeeded and 0 otherwise. Writing continues
 * after a failed write, so the bytes emitted match the order above even
 * when an error is reported.
 */
int hashtree_dump(cmph_t *mphf, FILE *fd)
{
	char *buf = NULL;
	cmph_uint32 buflen = 0;
	cmph_uint32 two = 2; //number of hash functions
	cmph_uint32 i;
	int ok = 1;
	hashtree_data_t *data = (hashtree_data_t *)mphf->data;

	__cmph_dump(mphf, fd);
	if (fwrite(&two, sizeof(cmph_uint32), 1, fd) != 1) ok = 0;

	for (i = 0; i < two; ++i)
	{
		hash_state_dump(data->hashes[i], &buf, &buflen);
		DEBUGP(" Dumping hash state with %u bytes to disk \n ", buflen);
		if (fwrite(&buflen, sizeof(cmph_uint32), 1, fd) != 1) ok = 0;
		// byte-wise item count so that an empty state is not mistaken
		// for a short write
		if (fwrite(buf, 1, buflen, fd) != buflen) ok = 0;
		free(buf);
		buf = NULL;
	}

	if (fwrite(&(data->n), sizeof(cmph_uint32), 1, fd) != 1) ok = 0;
	if (fwrite(&(data->m), sizeof(cmph_uint32), 1, fd) != 1) ok = 0;
	if (fwrite(data->g, sizeof(cmph_uint32), data->n, fd) != data->n) ok = 0;
#ifdef DEBUG
	fprintf(stderr, " G: ");
	for (i = 0; i < data->n; ++i) fprintf(stderr, " %u ", data->g[i]);
	fprintf(stderr, " \n ");
#endif
	return ok;
}
2005-09-24 01:31:02 +03:00
/*
 * Read a hashtree minimal perfect hash function from f, in the layout
 * produced by hashtree_dump() (the part after the common header), and
 * attach it to mphf->data.
 *
 * The function has no return value, so failure is reported through
 * mphf->data: it is left NULL when an allocation fails, the file is
 * truncated, a hash state cannot be loaded, or the stored number of hash
 * states is below the two that hashtree_search() and hashtree_destroy()
 * rely on. Everything allocated up to that point is released.
 */
void hashtree_load(FILE *f, cmph_t *mphf)
{
	cmph_uint32 nhashes = 0;
	char *buf = NULL;
	cmph_uint32 buflen = 0;
	cmph_uint32 i;
	hashtree_data_t *hashtree = (hashtree_data_t *)calloc(1, sizeof(hashtree_data_t));

	DEBUGP(" Loading hashtree mphf \n ");
	mphf->data = hashtree;
	if (!hashtree) return;
	hashtree->hashes = NULL;
	hashtree->g = NULL;

	if (fread(&nhashes, sizeof(cmph_uint32), 1, f) != 1) goto fail;
	// the count comes from the file: reject values the rest of this file
	// cannot work with, and the one value for which nhashes + 1 wraps
	if (nhashes < 2 || nhashes == (cmph_uint32)-1) goto fail;
	hashtree->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *) * ((size_t)nhashes + 1));
	if (!hashtree->hashes) goto fail;
	for (i = 0; i <= nhashes; ++i) hashtree->hashes[i] = NULL;
	DEBUGP(" Reading %u hashes \n ", nhashes);
	for (i = 0; i < nhashes; ++i)
	{
		hash_state_t *state = NULL;
		if (fread(&buflen, sizeof(cmph_uint32), 1, f) != 1) goto fail;
		DEBUGP(" Hash state has %u bytes \n ", buflen);
		buf = (char *)malloc(buflen);
		if (!buf && buflen > 0) goto fail;
		if (fread(buf, 1, buflen, f) != buflen)
		{
			free(buf);
			goto fail;
		}
		state = hash_state_load(buf, buflen);
		free(buf);
		buf = NULL;
		if (!state) goto fail;
		hashtree->hashes[i] = state;
	}

	DEBUGP(" Reading m and n \n ");
	if (fread(&(hashtree->n), sizeof(cmph_uint32), 1, f) != 1) goto fail;
	if (fread(&(hashtree->m), sizeof(cmph_uint32), 1, f) != 1) goto fail;

	// calloc checks the n * sizeof multiplication for overflow
	hashtree->g = (cmph_uint32 *)calloc(hashtree->n, sizeof(cmph_uint32));
	if (!hashtree->g && hashtree->n > 0) goto fail;
	if (fread(hashtree->g, sizeof(cmph_uint32), hashtree->n, f) != hashtree->n) goto fail;
#ifdef DEBUG
	fprintf(stderr, " G: ");
	for (i = 0; i < hashtree->n; ++i) fprintf(stderr, " %u ", hashtree->g[i]);
	fprintf(stderr, " \n ");
#endif
	return;

fail:
	if (hashtree->hashes)
	{
		for (i = 0; i < nhashes; ++i)
		{
			if (hashtree->hashes[i]) hash_state_destroy(hashtree->hashes[i]);
		}
		free(hashtree->hashes);
	}
	free(hashtree->g);
	free(hashtree);
	mphf->data = NULL;
}
2005-09-24 01:31:02 +03:00
/*
 * Evaluate the minimal perfect hash function for a key.
 *
 * The key is hashed with hashes[0] and hashes[1], both reduced modulo n;
 * when the two vertices coincide the second one is moved to the next
 * vertex (wrapping to 0 at n), mirroring what hashtree_gen_edges() does
 * when the function is built. The result is (g[h1] + g[h2]) mod m.
 */
cmph_uint32 hashtree_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
{
	hashtree_data_t *data = mphf->data;
	cmph_uint32 h1;
	cmph_uint32 h2;

	h1 = hash(data->hashes[0], key, keylen) % data->n;
	h2 = hash(data->hashes[1], key, keylen) % data->n;
	DEBUGP(" key: %s h1: %u h2: %u \n ", key, h1, h2);
	if (h1 == h2)
	{
		++h2;
		if (h2 >= data->n) h2 = 0;
	}
	DEBUGP(" key: %s g[h1]: %u g[h2]: %u edges: %u \n ", key, data->g[h1], data->g[h2], data->m);
	return (data->g[h1] + data->g[h2]) % data->m;
}
2005-09-24 01:31:02 +03:00
/*
 * Release a hashtree minimal perfect hash function: the g array, the
 * first two hash states, the array holding them, the algorithm data and
 * finally the cmph_t itself. mphf must not be used afterwards.
 */
void hashtree_destroy(cmph_t *mphf)
{
	hashtree_data_t *hashtree = (hashtree_data_t *)mphf->data;
	cmph_uint32 slot;

	free(hashtree->g);
	for (slot = 0; slot < 2; ++slot)
	{
		hash_state_destroy(hashtree->hashes[slot]);
	}
	free(hashtree->hashes);
	free(hashtree);
	free(mphf);
}