From f04df98f919ee2a5ed868c13c02f7f3310c710dc Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Fri, 10 Sep 2010 00:07:06 -0700 Subject: [PATCH 01/89] Extracting graph code. --- cxxmph/mphtable.h | 342 +++++++++------------------------------------- cxxmph/trigraph.c | 20 +++ cxxmph/trigraph.h | 21 +++ 3 files changed, 104 insertions(+), 279 deletions(-) create mode 100644 cxxmph/trigraph.c create mode 100644 cxxmph/trigraph.h diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index 50e2897..a72dcb6 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -1,299 +1,83 @@ +// Minimal perfect hash abstraction implementing the BDZ algorithm + +#include "trigraph.h" template class MPHTable { public: typedef Key key_type; - - MPHTable(cmph_t* mphf); + MPHTable(); ~MPHTable(); template - bool reset(Iterator begin, Iterator end); - + bool Reset(ForwardIterator begin, ForwardIterator end); cmph_uint32 index(const key_type& x) const; private: - MPHTable(); - cmph_t* mphf; + typedef vector Queue; + int GenerateQueue( + cmph_uint32 nedges, cmph_uint32 nvertices, + TriGraph* graph, Queue* queue); // Generates three hash values for k in a single pass. 
static hash_vector(cmph_uint32 seed, const char* k, cmph_uint32 keylen, cmph_uint32* hashes) ; }; -unsigned int MPHTable::index(const key_type& key) const { - cmph_uint32 keylen = sizeof(key); - chd_ph_data_t* chd = reinterpret_cast(mphf->data); +int MPHTable::GenerateQueue( + cmph_uint32 nedges, cmph_uint32 nvertices, +TriGraph* graph, Queue* queue) { + cmph_uint32 queue_head = 0, queue_tail = 0; + vector marked_edge((nedges >> 3) + 1, false); + queue->swap(Queue(nvertices, 0)); + for (int i = 0; i < nedges; ++i) { + TriGraph::Edge e = graph.edges[i].vertices; + if (graph.vertex_degree_[e.vertices[0]] == 1 || + graph.vertex_degree_[e.vertices[1]] == 1 || + graph.vertex_degree[e.vertices[2]] == 1) { + if (!marked_edge[i]) { + (*queue)[queue_head++] = i; + marked_edge[i] = true; + } + } + } + while (queue_tail != queue_head) { + cmph_uint32 current_edge = (*queue)[queue_tail++]; + graph->RemoveEdge(current_edge); + TriGraph::Edge e = graph->edges[current_edge]; + for (int i = 0; i < 3; ++i) { + cmph_uint32 v = e.vertices[i]; + if (graph->vertex_degree[v] == 1) { + cmph_uint32 first_edge = graph->first_edge_[v]; + if (!marked_edge[first_edge) { + queue[queue_head++] = first_edge; + marked_edge[first_edge] = true; + } + } + } + } + marked_edge.swap(vector()); + return queue_head - nedges; +} + +int MPHTable::Mapping(TriGraph* graph, Queue* queue) { + int cycles = 0; cmph_uint32 hl[3]; - cmph_uint32 dispatch, position; - cmph_uint32 probe0, probe1; - cmph_uint32 f,g,h; - hash_vector(chd_ph->hl, reinterpret_cast(&key), keylen, hl); - g = hl[0] % chd_ph->nbuckets; - f = hl[1] % chd_ph->n; - g = hl[2] % (chd_ph->n - 1) + 1; - dispatch = compressed_seq_query(chd_ph->cs, g); - probe0_num = disp % chd_ph->n; - probe1_num = disp / chd_ph->n; - position = (cmph_uint32)((f + ((cmph_uint64)h)*probe0 + probe1) % chd_ph->n); - return position; -} - -void MPHTable::hash_vector(cmph_uint32 seed, const char* k, cmph_uint32 keylen, - cmph_uint32* hashes) { - cmph_uint32 len = keylen, 
length = keylen; - hashes[0] = hashes[1] = 0x9e3779b9; // the golden ratio; an arbitrary value - hashes[2] = seed; // the previous hash value - seed in our case - // consume most of the key - while (len >= 12) { - hashes[0] += ((cmph_uint32)k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); - hashes[1] += ((cmph_uint32)k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24)); - hashes[2] += ((cmph_uint32)k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24)); - mix(hashes[0],hashes[1],hashes[2]); - k += 12; len -= 12; + graph->Reset(m, n); + ForwardIterator it = begin; + for (cmph_uint32 e = 0; e < end - begin; ++e) { + cmph_uint32 h0, h1, h2; + StringPiece key = *it; + hash_vector(bdz->hl, key.data(), key.len(), hl); + h0 = hl[0] % bdz->r; + h1 = hl[1] % bdz->r + bdz->r; + h2 = hl[2] % bdz->r + (bdz->r << 1); + AddEdge(graph, h0, h1, h2); } - // Consumes the remaining 11 bytes - hashes[2] += length; - switch(len) { // all the case statements fall through - case 11: - hashes[2] +=((cmph_uint32)k[10]<<24); - case 10: - hashes[2] +=((cmph_uint32)k[9]<<16); - case 9: - hashes[2] +=((cmph_uint32)k[8]<<8); - /* the first byte of hashes[2] is reserved for the length */ - case 8: - hashes[1] +=((cmph_uint32)k[7]<<24); - case 7: - hashes[1] +=((cmph_uint32)k[6]<<16); - case 6: - hashes[1] +=((cmph_uint32)k[5]<<8); - case 5: - hashes[1] +=(cmph_uint8) k[4]; - case 4: - hashes[0] +=((cmph_uint32)k[3]<<24); - case 3: - hashes[0] +=((cmph_uint32)k[2]<<16); - case 2: - hashes[0] +=((cmph_uint32)k[1]<<8); - case 1: - hashes[0] +=(cmph_uint8)k[0]; - /* case 0: nothing left to add */ - } - mix(hashes[0],hashes[1],hashes[2]); + cycles = GenerateQueue(bdz->m, bdz->n, queue, graph); + return cycles == 0; } -cmph_uint32 MPHTable::select_query(select_t* sel, cmph_uint32 one_idx) { - cmph_uint8* bits_table = sel->bits_vec; - cmph_uint32* select_table = sel->select_table; - - cmph_uint32 vec_bit_idx, 
vec_byte_idx; - cmph_uint32 part_sum, old_part_sum; - - vec_bit_idx = select_table[one_idx >> NBITS_STEP_SELECT_TABLE]; // one_idx >> NBITS_STEP_SELECT_TABLE = one_idx/STEP_SELECT_TABLE - vec_byte_idx = vec_bit_idx >> 3; // vec_bit_idx / 8 - - one_idx &= MASK_STEP_SELECT_TABLE; // one_idx %= STEP_SELECT_TABLE == one_idx &= MASK_STEP_SELECT_TABLE - one_idx += rank_lookup_table[bits_table[vec_byte_idx] & ((1 << (vec_bit_idx & 0x7)) - 1)]; - part_sum = 0; - - do { - old_part_sum = part_sum; - part_sum += rank_lookup_table[bits_table[vec_byte_idx]]; - vec_byte_idx++; - } while (part_sum <= one_idx); - return select_lookup_table[bits_table[vec_byte_idx - 1]][one_idx - old_part_sum] + ((vec_byte_idx-1) << 3); -} - -/* -rank_lookup_table[i] simply gives the number of bits set to one in the byte of value i. -For example if i = 01010101 in binary then we have : -rank_lookup_table[i] = 4 -*/ - -static cmph_uint8 rank_lookup_table[256] ={ - 0 , 1 , 1 , 2 , 1 , 2 , 2 , 3 , 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 -, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 -, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 -, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 -, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 -, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 -, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 -, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 -, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 -, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 -, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 -, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 -, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 -, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 -, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 -, 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 , 5 , 6 , 6 , 7 , 6 , 7 , 7 , 8 - }; - -/* 
-select_lookup_table[i][j] simply gives the index of the j'th bit set to one in the byte of value i. -For example if i=01010101 in binary then we have : -select_lookup_table[i][0] = 0, the first bit set to one is at position 0 -select_lookup_table[i][1] = 2, the second bit set to one is at position 2 -select_lookup_table[i][2] = 4, the third bit set to one is at position 4 -select_lookup_table[i][3] = 6, the fourth bit set to one is at position 6 -select_lookup_table[i][4] = 255, there is no more than 4 bits set to one in i, so we return escape value 255. -*/ -static cmph_uint8 select_lookup_table[256][8]={ -{ 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 255 , 255 , 255 , 255 , 255 , 255 } , -{ 2 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 255 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 255 , 255 , 255 , 255 , 255 } , -{ 3 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 255 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 255 , 255 , 255 , 255 , 255 } , -{ 2 , 3 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 255 , 255 , 255 , 255 } , -{ 4 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 255 , 255 , 255 , 255 , 255 } , -{ 2 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 4 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 255 , 255 , 255 , 255 } , -{ 3 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 4 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 255 , 255 , 255 , 255 } , -{ 2 , 3 , 4 , 255 , 255 , 255 , 255 , 255 
} , { 0 , 2 , 3 , 4 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 4 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 255 , 255 , 255 } , -{ 5 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 255 , 255 , 255 , 255 , 255 } , -{ 2 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 255 , 255 , 255 , 255 } , -{ 3 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 255 , 255 , 255 , 255 } , -{ 2 , 3 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 5 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 255 , 255 , 255 } , -{ 4 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 255 , 255 , 255 , 255 } , -{ 2 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 4 , 5 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 255 , 255 , 255 } , -{ 3 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 4 , 5 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 255 , 255 , 255 } , -{ 2 , 3 , 4 , 5 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 4 , 5 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 255 , 255 } , -{ 6 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 6 , 255 , 255 , 255 , 255 , 255 } , -{ 2 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 6 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 6 , 255 , 255 , 255 , 255 } , -{ 3 , 6 , 255 , 255 , 255 , 
255 , 255 , 255 } , { 0 , 3 , 6 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 6 , 255 , 255 , 255 , 255 } , -{ 2 , 3 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 6 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 6 , 255 , 255 , 255 } , -{ 4 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 6 , 255 , 255 , 255 , 255 } , -{ 2 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 6 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 4 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 6 , 255 , 255 , 255 } , -{ 3 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 6 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 4 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 6 , 255 , 255 , 255 } , -{ 2 , 3 , 4 , 6 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 6 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 4 , 6 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 6 , 255 , 255 } , -{ 5 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 6 , 255 , 255 , 255 , 255 } , -{ 2 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 6 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 6 , 255 , 255 , 255 } , -{ 3 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 6 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 6 , 255 , 255 , 255 } , -{ 2 , 3 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 6 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 5 , 6 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 6 , 255 , 255 } , -{ 4 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , -{ 1 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 6 , 255 , 255 , 255 } , -{ 2 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 
6 , 255 , 255 , 255 } , -{ 1 , 2 , 4 , 5 , 6 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 6 , 255 , 255 } , -{ 3 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 6 , 255 , 255 , 255 } , -{ 1 , 3 , 4 , 5 , 6 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 6 , 255 , 255 } , -{ 2 , 3 , 4 , 5 , 6 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 6 , 255 , 255 } , -{ 1 , 2 , 3 , 4 , 5 , 6 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 6 , 255 } , -{ 7 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 7 , 255 , 255 , 255 , 255 , 255 } , -{ 2 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 7 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 7 , 255 , 255 , 255 , 255 } , -{ 3 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 7 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 7 , 255 , 255 , 255 , 255 } , -{ 2 , 3 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 7 , 255 , 255 , 255 } , -{ 4 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 7 , 255 , 255 , 255 , 255 } , -{ 2 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 4 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 7 , 255 , 255 , 255 } , -{ 3 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 4 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 7 , 255 , 255 , 255 } , -{ 2 , 3 , 4 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 7 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 4 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 7 , 255 , 255 } , -{ 5 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 5 
, 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 7 , 255 , 255 , 255 , 255 } , -{ 2 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 7 , 255 , 255 , 255 } , -{ 3 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 7 , 255 , 255 , 255 } , -{ 2 , 3 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 7 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 5 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 7 , 255 , 255 } , -{ 4 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 7 , 255 , 255 , 255 } , -{ 2 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 7 , 255 , 255 , 255 } , -{ 1 , 2 , 4 , 5 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 7 , 255 , 255 } , -{ 3 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 7 , 255 , 255 , 255 } , -{ 1 , 3 , 4 , 5 , 7 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 7 , 255 , 255 } , -{ 2 , 3 , 4 , 5 , 7 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 7 , 255 , 255 } , -{ 1 , 2 , 3 , 4 , 5 , 7 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 7 , 255 } , -{ 6 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , -{ 1 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 6 , 7 , 255 , 255 , 255 , 255 } , -{ 2 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 6 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 2 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 6 , 7 , 255 , 255 , 255 } , -{ 3 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 6 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 3 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 6 , 7 , 255 , 255 , 255 } , -{ 2 , 3 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 6 , 7 , 255 , 255 , 255 } , -{ 1 , 2 , 3 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 6 , 7 , 255 , 255 } , 
-{ 4 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 6 , 7 , 255 , 255 , 255 } , -{ 2 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 6 , 7 , 255 , 255 , 255 } , -{ 1 , 2 , 4 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 6 , 7 , 255 , 255 } , -{ 3 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 6 , 7 , 255 , 255 , 255 } , -{ 1 , 3 , 4 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 6 , 7 , 255 , 255 } , -{ 2 , 3 , 4 , 6 , 7 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 6 , 7 , 255 , 255 } , -{ 1 , 2 , 3 , 4 , 6 , 7 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 6 , 7 , 255 } , -{ 5 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , -{ 1 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 6 , 7 , 255 , 255 , 255 } , -{ 2 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 6 , 7 , 255 , 255 , 255 } , -{ 1 , 2 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 6 , 7 , 255 , 255 } , -{ 3 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 6 , 7 , 255 , 255 , 255 } , -{ 1 , 3 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 6 , 7 , 255 , 255 } , -{ 2 , 3 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 6 , 7 , 255 , 255 } , -{ 1 , 2 , 3 , 5 , 6 , 7 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 6 , 7 , 255 } , -{ 4 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , -{ 1 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 6 , 7 , 255 , 255 } , -{ 2 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 6 , 7 , 255 , 255 } , -{ 1 , 2 , 4 , 5 , 6 , 7 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 6 , 7 , 255 } , -{ 3 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 6 , 7 , 255 , 255 } , -{ 1 , 3 , 4 , 5 , 6 , 7 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 6 , 7 , 255 } , -{ 2 , 3 , 4 , 5 , 6 , 7 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 6 , 7 , 255 } , -{ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 6 
, 7 } }; - -cmph_uint32 MPHTable::compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx) -{ - cmph_uint32 enc_idx, enc_length; - cmph_uint32 rems_mask; - cmph_uint32 stored_value; - cmph_uint32 sel_res; - - assert(idx < cs->n); // FABIANO ADDED - - rems_mask = (1U << cs->rem_r) - 1U; - - if(idx == 0) { - enc_idx = 0; - sel_res = select_query(&cs->sel, idx); - } else { - sel_res = select_query(&cs->sel, idx - 1); - enc_idx = (sel_res - (idx - 1)) << cs->rem_r; - enc_idx += get_bits_value(cs->length_rems, idx-1, cs->rem_r, rems_mask); - sel_res = select_next_query(&cs->sel, sel_res); - }; - - enc_length = (sel_res - idx) << cs->rem_r; - enc_length += get_bits_value(cs->length_rems, idx, cs->rem_r, rems_mask); - enc_length -= enc_idx; - if(enc_length == 0) return 0; - - stored_value = get_bits_at_pos(cs->store_table, enc_idx, enc_length); - return stored_value + ((1U << enc_length) - 1U); -}; +void MPHTable::Assigning(TriGraph* graph, Queue* queue); +void MPHTable::Ranking(TriGraph* graph, Queue* queue); +cmph_uint32 MPHTable::Search(const StringPiece& key); +cmph_uint32 MPHTable::Rank(const StringPiece& key); diff --git a/cxxmph/trigraph.c b/cxxmph/trigraph.c new file mode 100644 index 0000000..b156416 --- /dev/null +++ b/cxxmph/trigraph.c @@ -0,0 +1,20 @@ +#include "trigraph.h" + +namespace { +static const cmph_uint8 kInvalidEdge = std::limits::max; +} + +TriGraph::TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices) + : nedges_(0), + edges_(nedges, 0), + first_edge_(nvertices, kInvalidEdge), + vertex_degree_(nvertices, 0) { } + +void Trigraph::ExtractEdgesAndClear(vector* edges) { + first_edge_.swap(vector()); + vertex_degree_.swap(vector()); + nedges_ = 0; + edges->swap(edges_); +} +void TriGraph::AddEdge(const Edge& edge) { } +void TriGraph::RemoveEdge(cmph_uint32 current_edge) { } diff --git a/cxxmph/trigraph.h b/cxxmph/trigraph.h new file mode 100644 index 0000000..aacf101 --- /dev/null +++ b/cxxmph/trigraph.h @@ -0,0 +1,21 @@ +class TriGraph { + struct 
Edge { + Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2); + cmph_uint32 vertices[3]; + }; + struct ConnectedEdge { + Edge current; + Edge next; + }; + + TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices); + void AddEdge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2); + void RemoveEdge(cmph_uint32 current_edge); + void ExtractEdgesAndClear(vector* edges); + + private: + cmph_uint32 nedges_; + vector edges_; + vector first_edge_; + vector vertex_degree_; +}; From bf0c5892d85ceab47105bcd2d0b8a9af16ff5725 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 5 Oct 2010 11:51:17 -0300 Subject: [PATCH 02/89] Lots of work. --- INSTALL | 238 ++++++++++++++++++++++++++++- Makefile.am | 2 +- configure.ac | 2 +- cxxmph/Makefile.am | 6 +- cxxmph/cmph_hash_map.h | 2 +- cxxmph/mphtable.cc | 124 +++++++++++---- cxxmph/mphtable.h | 85 +++-------- cxxmph/{trigraph.c => trigraph.cc} | 14 +- cxxmph/trigraph.h | 15 +- 9 files changed, 378 insertions(+), 110 deletions(-) rename cxxmph/{trigraph.c => trigraph.cc} (54%) diff --git a/INSTALL b/INSTALL index 1c1a83c..5458714 100644 --- a/INSTALL +++ b/INSTALL @@ -1,6 +1,234 @@ -Run the commands below or refer to the autotools documentation for more -sophisticated options. +Installation Instructions +************************* + +Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005, +2006 Free Software Foundation, Inc. + +This file is free documentation; the Free Software Foundation gives +unlimited permission to copy, distribute and modify it. + +Basic Installation +================== + +Briefly, the shell commands `./configure; make; make install' should +configure, build, and install this package. The following +more-detailed instructions are generic; see the `README' file for +instructions specific to this package. + + The `configure' shell script attempts to guess correct values for +various system-dependent variables used during compilation. 
It uses +those values to create a `Makefile' in each directory of the package. +It may also create one or more `.h' files containing system-dependent +definitions. Finally, it creates a shell script `config.status' that +you can run in the future to recreate the current configuration, and a +file `config.log' containing compiler output (useful mainly for +debugging `configure'). + + It can also use an optional file (typically called `config.cache' +and enabled with `--cache-file=config.cache' or simply `-C') that saves +the results of its tests to speed up reconfiguring. Caching is +disabled by default to prevent problems with accidental use of stale +cache files. + + If you need to do unusual things to compile the package, please try +to figure out how `configure' could check whether to do them, and mail +diffs or instructions to the address given in the `README' so they can +be considered for the next release. If you are using the cache, and at +some point `config.cache' contains results you don't want to keep, you +may remove or edit it. + + The file `configure.ac' (or `configure.in') is used to create +`configure' by a program called `autoconf'. You need `configure.ac' if +you want to change it or regenerate `configure' using a newer version +of `autoconf'. + +The simplest way to compile this package is: + + 1. `cd' to the directory containing the package's source code and type + `./configure' to configure the package for your system. + + Running `configure' might take a while. While running, it prints + some messages telling which features it is checking for. + + 2. Type `make' to compile the package. + + 3. Optionally, type `make check' to run any self-tests that come with + the package. + + 4. Type `make install' to install the programs and any data files and + documentation. + + 5. You can remove the program binaries and object files from the + source code directory by typing `make clean'. 
To also remove the + files that `configure' created (so you can compile the package for + a different kind of computer), type `make distclean'. There is + also a `make maintainer-clean' target, but that is intended mainly + for the package's developers. If you use it, you may have to get + all sorts of other programs in order to regenerate files that came + with the distribution. + +Compilers and Options +===================== + +Some systems require unusual options for compilation or linking that the +`configure' script does not know about. Run `./configure --help' for +details on some of the pertinent environment variables. + + You can give `configure' initial values for configuration parameters +by setting variables in the command line or in the environment. Here +is an example: + + ./configure CC=c99 CFLAGS=-g LIBS=-lposix + + *Note Defining Variables::, for more details. + +Compiling For Multiple Architectures +==================================== + +You can compile the package for more than one kind of computer at the +same time, by placing the object files for each architecture in their +own directory. To do this, you can use GNU `make'. `cd' to the +directory where you want the object files and executables to go and run +the `configure' script. `configure' automatically checks for the +source code in the directory that `configure' is in and in `..'. + + With a non-GNU `make', it is safer to compile the package for one +architecture at a time in the source code directory. After you have +installed the package for one architecture, use `make distclean' before +reconfiguring for another architecture. + +Installation Names +================== + +By default, `make install' installs the package's commands under +`/usr/local/bin', include files under `/usr/local/include', etc. You +can specify an installation prefix other than `/usr/local' by giving +`configure' the option `--prefix=PREFIX'. 
+ + You can specify separate installation prefixes for +architecture-specific files and architecture-independent files. If you +pass the option `--exec-prefix=PREFIX' to `configure', the package uses +PREFIX as the prefix for installing programs and libraries. +Documentation and other data files still use the regular prefix. + + In addition, if you use an unusual directory layout you can give +options like `--bindir=DIR' to specify different values for particular +kinds of files. Run `configure --help' for a list of the directories +you can set and what kinds of files go in them. + + If the package supports it, you can cause programs to be installed +with an extra prefix or suffix on their names by giving `configure' the +option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. + +Optional Features +================= + +Some packages pay attention to `--enable-FEATURE' options to +`configure', where FEATURE indicates an optional part of the package. +They may also pay attention to `--with-PACKAGE' options, where PACKAGE +is something like `gnu-as' or `x' (for the X Window System). The +`README' should mention any `--enable-' and `--with-' options that the +package recognizes. + + For packages that use the X Window System, `configure' can usually +find the X include and library files automatically, but if it doesn't, +you can use the `configure' options `--x-includes=DIR' and +`--x-libraries=DIR' to specify their locations. + +Specifying the System Type +========================== + +There may be some features `configure' cannot figure out automatically, +but needs to determine by the type of machine the package will run on. +Usually, assuming the package is built to be run on the _same_ +architectures, `configure' can figure that out, but if it prints a +message saying it cannot guess the machine type, give it the +`--build=TYPE' option. 
TYPE can either be a short name for the system +type, such as `sun4', or a canonical name which has the form: + + CPU-COMPANY-SYSTEM + +where SYSTEM can have one of these forms: + + OS KERNEL-OS + + See the file `config.sub' for the possible values of each field. If +`config.sub' isn't included in this package, then this package doesn't +need to know the machine type. + + If you are _building_ compiler tools for cross-compiling, you should +use the option `--target=TYPE' to select the type of system they will +produce code for. + + If you want to _use_ a cross compiler, that generates code for a +platform different from the build platform, you should specify the +"host" platform (i.e., that on which the generated programs will +eventually be run) with `--host=TYPE'. + +Sharing Defaults +================ + +If you want to set default values for `configure' scripts to share, you +can create a site shell script called `config.site' that gives default +values for variables like `CC', `cache_file', and `prefix'. +`configure' looks for `PREFIX/share/config.site' if it exists, then +`PREFIX/etc/config.site' if it exists. Or, you can set the +`CONFIG_SITE' environment variable to the location of the site script. +A warning: not all `configure' scripts look for a site script. + +Defining Variables +================== + +Variables not defined in a site shell script can be set in the +environment passed to `configure'. However, some packages may run +configure again during the build, and the customized values of these +variables may be lost. In order to avoid this problem, you should set +them in the `configure' command line, using `VAR=value'. For example: + + ./configure CC=/usr/local2/bin/gcc + +causes the specified `gcc' to be used as the C compiler (unless it is +overridden in the site shell script). + +Unfortunately, this technique does not work for `CONFIG_SHELL' due to +an Autoconf bug. 
Until the bug is fixed you can use this workaround: + + CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash + +`configure' Invocation +====================== + +`configure' recognizes the following options to control how it operates. + +`--help' +`-h' + Print a summary of the options to `configure', and exit. + +`--version' +`-V' + Print the version of Autoconf used to generate the `configure' + script, and exit. + +`--cache-file=FILE' + Enable the cache: use and save the results of the tests in FILE, + traditionally `config.cache'. FILE defaults to `/dev/null' to + disable caching. + +`--config-cache' +`-C' + Alias for `--cache-file=config.cache'. + +`--quiet' +`--silent' +`-q' + Do not print messages saying which checks are being made. To + suppress all normal output, redirect it to `/dev/null' (any error + messages will still be shown). + +`--srcdir=DIR' + Look for the package's source code in directory DIR. Usually + `configure' can determine that directory automatically. + +`configure' also accepts some other, not widely useful, options. Run +`configure --help' for more details. -./configure --prefix=/usr -make -sudo make install diff --git a/Makefile.am b/Makefile.am index fc9a62a..0569dc0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = src tests examples man +SUBDIRS = src tests examples cxxmph man EXTRA_DIST = cmph.spec configure.ac cmph.pc.in pkgconfigdir = $(libdir)/pkgconfig diff --git a/configure.ac b/configure.ac index 01c3343..7f0e2a2 100644 --- a/configure.ac +++ b/configure.ac @@ -37,4 +37,4 @@ dnl Checks for library functions. 
AC_CHECK_SPOON dnl AC_OUTPUT(Makefile tests/Makefile samples/Makefile) -AC_OUTPUT(Makefile src/Makefile tests/Makefile examples/Makefile man/Makefile cmph.pc) +AC_OUTPUT(Makefile src/Makefile cxxmph/Makefile tests/Makefile examples/Makefile man/Makefile cmph.pc) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index fda6742..e29b81e 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,6 +1,8 @@ bin_PROGRAMS = cmph_hash_map_test +lib_LTLIBRARIES = libcxxmph.la -INCLUDES = -I../src/ +libcxxmph_la_SOURCES = trigragh.h trigraph.cc +libcxxmph_la_LDFLAGS = -version-info 0:0:0 -cmph_hash_map_test_LDADD = ../src/libcmph.la +cmph_hash_map_test_LDADD = libcxxmph.la cmph_hash_map_test_SOURCES = cmph_hash_map_test.cc diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 55ef648..3923dc8 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -1,4 +1,4 @@ -#include +#include #include #include // for std::pair diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index b4de79d..7b79d0d 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -1,37 +1,105 @@ #include -template struct bitcount { - enum { value = (n & mask ? 
1:0) + bitcount> 1>::value }; -}; -template struct bitcount { enum { value = 0 }; }; +#include "mphtable.h" -template struct bitposition { - enum +using std::vector; -template class CompileTimeByteTable { - public: - CompileTimeByteTable : current(op::value) { } - int operator[] (int i) { return *(¤t + i); } -private: - unsigned char current; - CompileTimeByteTable next; -}; +template +template -static CompileTimeByteTable<256, bitcount> BitcountTable; +void MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { + TableBuilderState st; + st.c = 1.23; + st.b = 7; + st.m = end - begin; + st.r = static_cast(ceil((st.c*st.m)/3)); + if ((st.r % 2) == 0) st.r += 1; + st.n = 3*st.r; + st.k = 1U << st.b; + st.ranktablesize = static_cast( + ceil(st.n / static_cast(st.k))); + st.graph_builder = TriGraph(st.m, st.n); // giant copy + st.edges_queue.resize(st.m) - -#define mix(a,b,c) \ -{ \ - a -= b; a -= c; a ^= (c>>13); \ - b -= c; b -= a; b ^= (a<<8); \ - c -= a; c -= b; c ^= (b>>13); \ - a -= b; a -= c; a ^= (c>>12); \ - b -= c; b -= a; b ^= (a<<16); \ - c -= a; c -= b; c ^= (b>>5); \ - a -= b; a -= c; a ^= (c>>3); \ - b -= c; b -= a; b ^= (a<<10); \ - c -= a; c -= b; c ^= (b>>15); \ + int iterations = 1000; + while (1) { + hasher hasher0 = HashFcn(); + ok = Mapping(st.graph_builder, st.edges_queue); + if (ok) break; + else --iterations; + if (iterations == 0) break; + } + if (iterations == 0) return false; + vector graph; + st.graph_builder.ExtractEdgesAndClear(&graph); + Assigning(graph, st.edges_queue); + vector().swap(st.edges_queue); + Ranking(graph); + } +template +int MPHTable::GenerateQueue( + cmph_uint32 nedges, cmph_uint32 nvertices, + TriGraph* graph, Queue* queue) { + cmph_uint32 queue_head = 0, queue_tail = 0; + // Relies on vector using 1 bit per element + vector marked_edge((nedges >> 3) + 1, false); + queue->swap(Queue(nvertices, 0)); + for (int i = 0; i < nedges; ++i) { + TriGraph::Edge e = graph.edges[i].vertices; + if 
(graph.vertex_degree_[e.vertices[0]] == 1 || + graph.vertex_degree_[e.vertices[1]] == 1 || + graph.vertex_degree[e.vertices[2]] == 1) { + if (!marked_edge[i]) { + (*queue)[queue_head++] = i; + marked_edge[i] = true; + } + } + } + while (queue_tail != queue_head) { + cmph_uint32 current_edge = (*queue)[queue_tail++]; + graph->RemoveEdge(current_edge); + TriGraph::Edge e = graph->edges[current_edge]; + for (int i = 0; i < 3; ++i) { + cmph_uint32 v = e.vertices[i]; + if (graph->vertex_degree[v] == 1) { + cmph_uint32 first_edge = graph->first_edge_[v]; + if (!marked_edge[first_edge) { + queue[queue_head++] = first_edge; + marked_edge[first_edge] = true; + } + } + } + } + vector().swap(marked_edge); + return queue_head - nedges; +} -static const int kMaskStepSelectTable = std::limit::max; +template +int MPHTable::Mapping(TriGraph* graph, Queue* queue) { + int cycles = 0; + graph->Reset(m, n); + for (ForwardIterator it = begin_; it != end_; ++it) { + cmph_uint32 hash_values[3]; + for (int i = 0; i < 3; ++i) { + hash_values[i] = hasher_(*it); + } + cmph_uint32 v0 = hash_values[0] % bdz->r; + cmph_uint32 v1 = hash_values[1] % bdz->r + bdz->r; + cmph_uint32 v2 = hash_values[2] % bdz->r + (bdz->r << 1); + graph->AddEdge(Edge(v0, v1, v2)); + } + cycles = GenerateQueue(bdz->m, bdz->n, queue, graph); + return cycles == 0; +} + +void MPHTable::Assigning(TriGraph* graph, Queue* queue) { +} +void MPHTable::Ranking(TriGraph* graph, Queue* queue) { +} +cmph_uint32 MPHTable::Search(const key_type& key) { +} + +cmph_uint32 MPHTable::Rank(const key_type& key) { +} diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index a72dcb6..309ce7f 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -1,83 +1,44 @@ // Minimal perfect hash abstraction implementing the BDZ algorithm +#include + #include "trigraph.h" -template +template > class MPHTable { public: typedef Key key_type; + typedef NewRandomlySeededHashFcn hasher; MPHTable(); ~MPHTable(); - template + template bool 
Reset(ForwardIterator begin, ForwardIterator end); cmph_uint32 index(const key_type& x) const; private: - typedef vector Queue; + typedef std::vector Queue; + template + struct TableBuilderState { + ForwardIterator begin; + ForwardIterator end; + Queue edges_queue; + TriGraph graph_builder; + double c; + cmph_uint32 m; + cmph_uint32 n; + cmph_uint32 k; + cmph_uint32 ranktablesize; + }; int GenerateQueue( cmph_uint32 nedges, cmph_uint32 nvertices, TriGraph* graph, Queue* queue); + void Assigning(TriGraph* graph, Queue* queue); + void Ranking(TriGraph* graph, Queue* queue); + cmph_uint32 Search(const StringPiece& key); + cmph_uint32 Rank(const StringPiece& key); - // Generates three hash values for k in a single pass. - static hash_vector(cmph_uint32 seed, const char* k, cmph_uint32 keylen, cmph_uint32* hashes) ; + std::vector graph_; }; -int MPHTable::GenerateQueue( - cmph_uint32 nedges, cmph_uint32 nvertices, -TriGraph* graph, Queue* queue) { - cmph_uint32 queue_head = 0, queue_tail = 0; - vector marked_edge((nedges >> 3) + 1, false); - queue->swap(Queue(nvertices, 0)); - for (int i = 0; i < nedges; ++i) { - TriGraph::Edge e = graph.edges[i].vertices; - if (graph.vertex_degree_[e.vertices[0]] == 1 || - graph.vertex_degree_[e.vertices[1]] == 1 || - graph.vertex_degree[e.vertices[2]] == 1) { - if (!marked_edge[i]) { - (*queue)[queue_head++] = i; - marked_edge[i] = true; - } - } - } - while (queue_tail != queue_head) { - cmph_uint32 current_edge = (*queue)[queue_tail++]; - graph->RemoveEdge(current_edge); - TriGraph::Edge e = graph->edges[current_edge]; - for (int i = 0; i < 3; ++i) { - cmph_uint32 v = e.vertices[i]; - if (graph->vertex_degree[v] == 1) { - cmph_uint32 first_edge = graph->first_edge_[v]; - if (!marked_edge[first_edge) { - queue[queue_head++] = first_edge; - marked_edge[first_edge] = true; - } - } - } - } - marked_edge.swap(vector()); - return queue_head - nedges; -} -int MPHTable::Mapping(TriGraph* graph, Queue* queue) { - int cycles = 0; - cmph_uint32 
hl[3]; - graph->Reset(m, n); - ForwardIterator it = begin; - for (cmph_uint32 e = 0; e < end - begin; ++e) { - cmph_uint32 h0, h1, h2; - StringPiece key = *it; - hash_vector(bdz->hl, key.data(), key.len(), hl); - h0 = hl[0] % bdz->r; - h1 = hl[1] % bdz->r + bdz->r; - h2 = hl[2] % bdz->r + (bdz->r << 1); - AddEdge(graph, h0, h1, h2); - } - cycles = GenerateQueue(bdz->m, bdz->n, queue, graph); - return cycles == 0; -} - -void MPHTable::Assigning(TriGraph* graph, Queue* queue); -void MPHTable::Ranking(TriGraph* graph, Queue* queue); -cmph_uint32 MPHTable::Search(const StringPiece& key); -cmph_uint32 MPHTable::Rank(const StringPiece& key); diff --git a/cxxmph/trigraph.c b/cxxmph/trigraph.cc similarity index 54% rename from cxxmph/trigraph.c rename to cxxmph/trigraph.cc index b156416..89b6721 100644 --- a/cxxmph/trigraph.c +++ b/cxxmph/trigraph.cc @@ -1,18 +1,22 @@ +#include + #include "trigraph.h" +using std::vector; + namespace { -static const cmph_uint8 kInvalidEdge = std::limits::max; +static const cmph_uint8 kInvalidEdge = std::numeric_limits::max(); } TriGraph::TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices) : nedges_(0), - edges_(nedges, 0), + edges_(nedges), first_edge_(nvertices, kInvalidEdge), vertex_degree_(nvertices, 0) { } -void Trigraph::ExtractEdgesAndClear(vector* edges) { - first_edge_.swap(vector()); - vertex_degree_.swap(vector()); +void TriGraph::ExtractEdgesAndClear(vector* edges) { + vector().swap(first_edge_); + vector().swap(vertex_degree_); nedges_ = 0; edges->swap(edges_); } diff --git a/cxxmph/trigraph.h b/cxxmph/trigraph.h index aacf101..e4f8440 100644 --- a/cxxmph/trigraph.h +++ b/cxxmph/trigraph.h @@ -1,5 +1,10 @@ +#include + +#include "../src/cmph_types.h" + class TriGraph { struct Edge { + Edge() { } Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2); cmph_uint32 vertices[3]; }; @@ -9,13 +14,13 @@ class TriGraph { }; TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices); - void AddEdge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2); 
+ void AddEdge(const Edge& edge); void RemoveEdge(cmph_uint32 current_edge); - void ExtractEdgesAndClear(vector* edges); + void ExtractEdgesAndClear(std::vector* edges); private: cmph_uint32 nedges_; - vector edges_; - vector first_edge_; - vector vertex_degree_; + std::vector edges_; + std::vector first_edge_; + std::vector vertex_degree_; }; From 724e716d673087757f950ee5b98b33a44d426e03 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 24 Oct 2010 19:12:47 -0700 Subject: [PATCH 03/89] Added murmur hash and finished porting all c code. --- cxxmph/Makefile.am | 7 +- cxxmph/MurmurHash2.h | 64 ++++++++++ cxxmph/cmph_hash_map.h | 2 - cxxmph/mphtable.cc | 234 +++++++++++++++++++++++++--------- cxxmph/mphtable.h | 63 +++++---- cxxmph/mphtable_test.cc | 22 ++++ cxxmph/randomly_seeded_hash.h | 24 ++++ cxxmph/stringpiece.h | 177 +++++++++++++++++++++++++ cxxmph/trigraph.cc | 41 +++++- cxxmph/trigraph.h | 39 ++++-- 10 files changed, 569 insertions(+), 104 deletions(-) create mode 100644 cxxmph/MurmurHash2.h create mode 100644 cxxmph/mphtable_test.cc create mode 100644 cxxmph/randomly_seeded_hash.h create mode 100644 cxxmph/stringpiece.h diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index e29b81e..da7fa84 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,8 +1,11 @@ -bin_PROGRAMS = cmph_hash_map_test +bin_PROGRAMS = cmph_hash_map_test mphtable_test lib_LTLIBRARIES = libcxxmph.la -libcxxmph_la_SOURCES = trigragh.h trigraph.cc +libcxxmph_la_SOURCES = stringpiece.h MurmurHash2.h randomly_seeded_hash.h trigragh.h trigraph.cc mphtable.h mphtable.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cmph_hash_map_test_LDADD = libcxxmph.la cmph_hash_map_test_SOURCES = cmph_hash_map_test.cc + +mphtable_test_LDADD = libcxxmph.la +mphtable_test_SOURCES = mphtable_test.cc diff --git a/cxxmph/MurmurHash2.h b/cxxmph/MurmurHash2.h new file mode 100644 index 0000000..81051fe --- /dev/null +++ b/cxxmph/MurmurHash2.h @@ -0,0 +1,64 @@ 
+//----------------------------------------------------------------------------- +// MurmurHash2, by Austin Appleby + +// Note - This code makes a few assumptions about how your machine behaves - + +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 + +// And it has a few limitations - + +// 1. It will not work incrementally. +// 2. It will not produce the same results on little-endian and big-endian +// machines. + +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 3923dc8..ac061ea 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -2,8 +2,6 @@ #include #include // for std::pair -#include - // Save on repetitive typing. 
#define CMPH_TMPL_SPEC template #define CMPH_CLASS_SPEC cmph_hash_map diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index 7b79d0d..2c5ba32 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -1,105 +1,213 @@ -#include +#include #include "mphtable.h" using std::vector; +namespace cxxmph { + template template - -void MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { - TableBuilderState st; - st.c = 1.23; - st.b = 7; - st.m = end - begin; - st.r = static_cast(ceil((st.c*st.m)/3)); - if ((st.r % 2) == 0) st.r += 1; - st.n = 3*st.r; - st.k = 1U << st.b; - st.ranktablesize = static_cast( - ceil(st.n / static_cast(st.k))); - st.graph_builder = TriGraph(st.m, st.n); // giant copy - st.edges_queue.resize(st.m) +bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { + TableBuilderState st; + m_ = end - begin; + r_ = static_cast(ceil((c_*m_)/3)); + if (r_ % 2) == 0) r_ += 1; + n_ = 3*r_; + k_ = 1U << b_; int iterations = 1000; while (1) { - hasher hasher0 = HashFcn(); - ok = Mapping(st.graph_builder, st.edges_queue); - if (ok) break; + for (int i = 0; i < 3; ++i) hash_function_[i] = hasher(); + vector edges; + vector queue; + if (Mapping(begin, end, &edges, &queue)) break; else --iterations; if (iterations == 0) break; } if (iterations == 0) return false; - vector graph; - st.graph_builder.ExtractEdgesAndClear(&graph); - Assigning(graph, st.edges_queue); - vector().swap(st.edges_queue); - Ranking(graph); + vector& edges; + graph->ExtractEdgesAndClear(&edges); + Assigning(queue, edges); + vector().swap(edges); + Ranking(); } template -int MPHTable::GenerateQueue( - cmph_uint32 nedges, cmph_uint32 nvertices, - TriGraph* graph, Queue* queue) { +bool MPHTable::GenerateQueue( + TriGraph* graph, vector* queue_output) { cmph_uint32 queue_head = 0, queue_tail = 0; + cmph_uint32 nedges = n_; + cmph_uint32 nvertices = m_; // Relies on vector using 1 bit per element vector marked_edge((nedges >> 3) + 1, false); - queue->swap(Queue(nvertices, 0)); + 
Queue queue(nvertices, 0); for (int i = 0; i < nedges; ++i) { - TriGraph::Edge e = graph.edges[i].vertices; - if (graph.vertex_degree_[e.vertices[0]] == 1 || - graph.vertex_degree_[e.vertices[1]] == 1 || - graph.vertex_degree[e.vertices[2]] == 1) { + const TriGraph::Edge& e = graph->edges()[i]; + if (graph->vertex_degree()[e[0]] == 1 || + graph->vertex_degree()[e[1]] == 1 || + graph->vertex_degree()[e[2]] == 1) { if (!marked_edge[i]) { - (*queue)[queue_head++] = i; + queue[queue_head++] = i; marked_edge[i] = true; } } } while (queue_tail != queue_head) { - cmph_uint32 current_edge = (*queue)[queue_tail++]; + cmph_uint32 current_edge = queue[queue_tail++]; graph->RemoveEdge(current_edge); - TriGraph::Edge e = graph->edges[current_edge]; + const TriGraph::Edge& e = graph->edges()[current_edge]; for (int i = 0; i < 3; ++i) { - cmph_uint32 v = e.vertices[i]; - if (graph->vertex_degree[v] == 1) { - cmph_uint32 first_edge = graph->first_edge_[v]; - if (!marked_edge[first_edge) { + cmph_uint32 v = e[i]; + if (graph->vertex_degree()[v] == 1) { + cmph_uint32 first_edge = graph->first_edge()[v]; + if (!marked_edge[first_edge]) { queue[queue_head++] = first_edge; marked_edge[first_edge] = true; } } } } - vector().swap(marked_edge); - return queue_head - nedges; -} - -template -int MPHTable::Mapping(TriGraph* graph, Queue* queue) { - int cycles = 0; - graph->Reset(m, n); - for (ForwardIterator it = begin_; it != end_; ++it) { - cmph_uint32 hash_values[3]; - for (int i = 0; i < 3; ++i) { - hash_values[i] = hasher_(*it); - } - cmph_uint32 v0 = hash_values[0] % bdz->r; - cmph_uint32 v1 = hash_values[1] % bdz->r + bdz->r; - cmph_uint32 v2 = hash_values[2] % bdz->r + (bdz->r << 1); - graph->AddEdge(Edge(v0, v1, v2)); - } - cycles = GenerateQueue(bdz->m, bdz->n, queue, graph); + int cycles = queue_head - nedges; + if (cycles == 0) queue.swap(*queue_output); return cycles == 0; } -void MPHTable::Assigning(TriGraph* graph, Queue* queue) { -} -void MPHTable::Ranking(TriGraph* graph, 
Queue* queue) { -} -cmph_uint32 MPHTable::Search(const key_type& key) { +template +template +bool MPHTable::Mapping( + ForwardIterator begin, ForwardIterator end, + vector* edges, vector queue) { + int cycles = 0; + TriGraph graph(m, n); + for (ForwardIterator it = begin; it != end; ++it) { + cmph_uint32 h[3]; + for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); + cmph_uint32 v0 = h[0] % r_; + cmph_uint32 v1 = h[1] % r_ + r_; + cmph_uint32 v2 = h[2] % r_ + (r_ << 1); + graph.AddEdge(Edge(v0, v1, v2)); + } + if (GenerateQueue(&graph, queue)) { + graph.ExtractEdgesAndClear(edges); + return true; + } + return false; } -cmph_uint32 MPHTable::Rank(const key_type& key) { +template +void MPHTable::Assigning( + const vector& edges, const vector& queue) { + cmph_uint32 nedges = n_; + cmph_uint32 current_edge = 0; + vector marked_vertices(nedges + 1); + // TODO(davi) use half nibbles instead + // vector g(static_cast(ceil(nedges / 4.0)), + // std::numerical_limits::max()); + static const cmph_uint8 kUnassigned = 3; + vector(nedges, kUnassigned).swap(g_); + for (int i = nedges - 1; i + 1 >= 1; --i) { + current_edge = queue[i]; + const TriGraph::Edge& e = edges[current_edge]; + if (!marked_vertices[e[0]]) { + if (!marked_vertices[e[1]]) { + g_[e[1]] = kUnassigned; + marked_vertices[e[1]] = true; + } + if (!marked_vertices[e[2]]) { + g_[e[2]] = kUnassigned; + marked_vertices[e[2]] = true; + } + g_[e[0]] = (6 - g_[e[1]] + g_[e2]) % 3; + marked_vertices[e[0]] = true; + } else if (!marked_vertices[e[1]])) { + if (!marked_vertices[e[2]])) { + g_[e[2]] = kUnassigned; + marked_vertices[e[2]] = true; + } + g_[e[1]] = 7 - (g_[e[0]] + g_[e[2]]) % 3; + marked_vertices[e[1]] = true; + } else { + g_[e[2]] = (8 - g_[e[0]] + g_[e[1]]) % 3; + marked_vertices[e[2]] = true; + } + } } + +// table used for looking up the number of assigned vertices to a 8-bit integer +static cmph_uint8 kBdzLookupTable[] = +{ +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 
4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 +}; + +template +void MPHTable::Ranking() { + cmph_uint32 nbytes_total = static_cast(ceil(st->n / 4.0)); + cmph_uint32 size = k_ >> 2U; + ranktablesize = static_cast(ceil(n_ / static_cast(k_))); + // TODO(davi) Change swap of member classes for resize + memset to avoid fragmentation + vector (ranktablesize).swap(ranktable_);; + cmph_uint32 offset = 0; + cmph_uint32 count = 0; + cmph_uint32 i = 0; + while (1) { + if (i == ranktable.size()) break; + cmph_uint32 nbytes = size < nbytes_total ? 
size : nbytes_total; + for (j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; + ranktable_[i] = count; + offset += nbytes; + nbytes_total -= size; + ++i; + } +} + +template +cmph_uint32 MPHTable::Search(const key_type& key) const { + cmph_uint32 vertex; + cmph_uint32 h[3]; + for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key); + h[0] = h[0] % st->r; + h[1] = h[1] % st->r + st->r; + h[2] = h[2] % st->r + (st->r << 1); + cmph_uint32 vertex = h[(h[g_[h[0]] + g_[h[1]] + g_[h[2]]) % 3]; + return Rank(st->b, st->ranktable, vertex); +} + +template +cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { + cmph_uint32 index = vertex >> b_; + cmph_uint32 base_rank = ranktable_[index]; + cmph_uint32 beg_idx_v = index << b; + cmph_uint32 beg_idx_b = index >> 2 + cmph_uint32 end_idx_b = index >> 2 + while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; + beg_idx_v = beg_idx_b << 2; + while (beg_idx_v < vertex) { + if (g_[beg_idx_v) != kUnassigned) ++base_rank; + ++beg_idx_v; + } + return base_rank; +} + +template +cmph_uint32 MPHTable::index(const key_type& key) const { + return Search(key); +} + +} // namespace cxxmph diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index 309ce7f..eccff61 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -1,15 +1,22 @@ +#ifndef __CXXMPH_MPHTABLE_H__ +#define __CXXMPH_MPHTABLE_H__ + // Minimal perfect hash abstraction implementing the BDZ algorithm #include +#include "randomly_seeded_hash.h" +#include "stringpiece.h" #include "trigraph.h" -template > +namespace cxxmph { + +template class MPHTable { public: typedef Key key_type; typedef NewRandomlySeededHashFcn hasher; - MPHTable(); + MPHTable(double c = 1.23, cmph_uint8 b = 7) : c_(c), b_(b) { } ~MPHTable(); template @@ -17,28 +24,38 @@ class MPHTable { cmph_uint32 index(const key_type& x) const; private: - typedef std::vector Queue; - template - struct TableBuilderState { - ForwardIterator begin; - ForwardIterator end; - Queue edges_queue; 
- TriGraph graph_builder; - double c; - cmph_uint32 m; - cmph_uint32 n; - cmph_uint32 k; - cmph_uint32 ranktablesize; - }; - int GenerateQueue( - cmph_uint32 nedges, cmph_uint32 nvertices, - TriGraph* graph, Queue* queue); - void Assigning(TriGraph* graph, Queue* queue); - void Ranking(TriGraph* graph, Queue* queue); - cmph_uint32 Search(const StringPiece& key); - cmph_uint32 Rank(const StringPiece& key); + template + bool Mapping(ForwardIterator begin, ForwardIterator end, + vector* edges, vector queue); + bool GenerateQueue(TriGraph* graph, vector* queue); + void Assigning(TriGraph* graph_builder, Queue* queue); + void Ranking(TriGraph* graph_builder, Queue* queue); + cmph_uint32 Search(const StringPiece& key); + cmph_uint32 Rank(const StringPiece& key); - std::vector graph_; + // Algorithm parameters + cmph_uint8 b_; // Number of bits of the kth index in the ranktable + double c_; // Number of bits per key (? is it right) + + // Values used during generation + cmph_uint32 m_; // edges count + cmph_uint32 n_; // vertex count + cmph_uint32 k_ // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ + + // Values used during search + + // Partition vertex count, derived from c parameter. + cmph_uint32 r_; + // The array containing the minimal perfect hash function graph. + std::vector g_; + // The table used for the rank step of the minimal perfect hash function + std::vector ranktable_; + // The selected hash function triplet for finding the edges in the minimal + // perfect hash function graph. 
+ hasher hash_function_[3]; + }; +} // namespace cxxmph +#define // __CXXMPH_MPHTABLE_H__ diff --git a/cxxmph/mphtable_test.cc b/cxxmph/mphtable_test.cc new file mode 100644 index 0000000..e18b34d --- /dev/null +++ b/cxxmph/mphtable_test.cc @@ -0,0 +1,22 @@ +#include +#include + +#include "mphtable.h" + +using std::vector; +using cxxmph::MPHTable; + +int main(int argc, char** argv) { + vector keys; + keys.push_back(10); + keys.push_back(4); + keys.push_back(3); + + MPHTable mphtable; + assert(mphtable.Reset(keys.begin(), keys.end())); + vector ids; + for (int i = 0; i < keys.size(); ++i) ids.push_back(mphtable.index(keys[i])); + sort(ids.begin(), ids.end()); + for (int i = 0; i < ids.size(); ++i) assert(ids[i] == i); +} + diff --git a/cxxmph/randomly_seeded_hash.h b/cxxmph/randomly_seeded_hash.h new file mode 100644 index 0000000..69db56a --- /dev/null +++ b/cxxmph/randomly_seeded_hash.h @@ -0,0 +1,24 @@ +#ifndef __CXXMPH_RANDOMLY_SEEDED_HASH__ +#define __CXXMPH_RANDOMLY_SEEDED_HASH__ + +// Helper to create randomly seeded hash functions out of existing hash +// functions that take a seed as a parameter. + +#include + +#include "../src/cmph_types.h" +#include "MurmurHash2.h" + +namespace cxxmph { + +struct RandomlySeededMurmur2 { + RandomlySeededHashFunction() : seed(random()) { } + cmph_uint32 operator()(const StringPiece& key) { + return MurmurHash2(key.data(), key.length(), seed); + } + cmph_uint32 seed; +}; + +} // namespace cxxmph + +#endif // __CXXMPH_RANDOMLY_SEEDED_HASH__ diff --git a/cxxmph/stringpiece.h b/cxxmph/stringpiece.h new file mode 100644 index 0000000..fdd8f75 --- /dev/null +++ b/cxxmph/stringpiece.h @@ -0,0 +1,177 @@ +// Copyright 2001-2010 The RE2 Authors. All Rights Reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// A string-like object that points to a sized piece of memory. 
+// +// Functions or methods may use const StringPiece& parameters to accept either +// a "const char*" or a "string" value that will be implicitly converted to +// a StringPiece. The implicit conversion means that it is often appropriate +// to include this .h file in other files rather than forward-declaring +// StringPiece as would be appropriate for most other Google classes. +// +// Systematic usage of StringPiece is encouraged as it will reduce unnecessary +// conversions from "const char*" to "string" and back again. +// +// +// Arghh! I wish C++ literals were "string". + +#ifndef CXXMPH_STRINGPIECE_H__ +#define CXXMPH_STRINGPIECE_H__ + +#include +#include +#include + +namespace cxxmph { + +class StringPiece { + private: + const char* ptr_; + int length_; + + public: + // We provide non-explicit singleton constructors so users can pass + // in a "const char*" or a "string" wherever a "StringPiece" is + // expected. + StringPiece() : ptr_(NULL), length_(0) { } + StringPiece(const char* str) + : ptr_(str), length_((str == NULL) ? 0 : static_cast(strlen(str))) { } + StringPiece(const std::string& str) + : ptr_(str.data()), length_(static_cast(str.size())) { } + StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { } + + // data() may return a pointer to a buffer with embedded NULs, and the + // returned buffer may or may not be null terminated. Therefore it is + // typically a mistake to pass data() to a routine that expects a NUL + // terminated string. 
+ const char* data() const { return ptr_; } + int size() const { return length_; } + int length() const { return length_; } + bool empty() const { return length_ == 0; } + + void clear() { ptr_ = NULL; length_ = 0; } + void set(const char* data, int len) { ptr_ = data; length_ = len; } + void set(const char* str) { + ptr_ = str; + if (str != NULL) + length_ = static_cast(strlen(str)); + else + length_ = 0; + } + void set(const void* data, int len) { + ptr_ = reinterpret_cast(data); + length_ = len; + } + + char operator[](int i) const { return ptr_[i]; } + + void remove_prefix(int n) { + ptr_ += n; + length_ -= n; + } + + void remove_suffix(int n) { + length_ -= n; + } + + int compare(const StringPiece& x) const { + int r = memcmp(ptr_, x.ptr_, std::min(length_, x.length_)); + if (r == 0) { + if (length_ < x.length_) r = -1; + else if (length_ > x.length_) r = +1; + } + return r; + } + + std::string as_string() const { + return std::string(data(), size()); + } + // We also define ToString() here, since many other string-like + // interfaces name the routine that converts to a C++ string + // "ToString", and it's confusing to have the method that does that + // for a StringPiece be called "as_string()". We also leave the + // "as_string()" method defined here for existing code. 
+ std::string ToString() const { + return std::string(data(), size()); + } + + void CopyToString(std::string* target) const; + void AppendToString(std::string* target) const; + + // Does "this" start with "x" + bool starts_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (memcmp(ptr_, x.ptr_, x.length_) == 0)); + } + + // Does "this" end with "x" + bool ends_with(const StringPiece& x) const { + return ((length_ >= x.length_) && + (memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0)); + } + + // standard STL container boilerplate + typedef char value_type; + typedef const char* pointer; + typedef const char& reference; + typedef const char& const_reference; + typedef size_t size_type; + typedef ptrdiff_t difference_type; + static const size_type npos; + typedef const char* const_iterator; + typedef const char* iterator; + typedef std::reverse_iterator const_reverse_iterator; + typedef std::reverse_iterator reverse_iterator; + iterator begin() const { return ptr_; } + iterator end() const { return ptr_ + length_; } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(ptr_ + length_); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(ptr_); + } + // STLS says return size_type, but Google says return int + int max_size() const { return length_; } + int capacity() const { return length_; } + + int copy(char* buf, size_type n, size_type pos = 0) const; + + int find(const StringPiece& s, size_type pos = 0) const; + int find(char c, size_type pos = 0) const; + int rfind(const StringPiece& s, size_type pos = npos) const; + int rfind(char c, size_type pos = npos) const; + + StringPiece substr(size_type pos, size_type n = npos) const; +}; + +} // namespace cxxmph + +bool operator==(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y); + +inline bool operator!=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { + return !(x == y); +} + +inline bool operator<(const 
cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { + const int r = memcmp(x.data(), y.data(), + std::min(x.size(), y.size())); + return ((r < 0) || ((r == 0) && (x.size() < y.size()))); +} + +inline bool operator>(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { + return y < x; +} + +inline bool operator<=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { + return !(x > y); +} + +inline bool operator>=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { + return !(x < y); +} + +// allow StringPiece to be logged +extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece); + +#endif // CXXMPH_STRINGPIECE_H__ diff --git a/cxxmph/trigraph.cc b/cxxmph/trigraph.cc index 89b6721..63c36e1 100644 --- a/cxxmph/trigraph.cc +++ b/cxxmph/trigraph.cc @@ -1,3 +1,4 @@ +#include #include #include "trigraph.h" @@ -8,17 +9,51 @@ namespace { static const cmph_uint8 kInvalidEdge = std::numeric_limits::max(); } +namespace cxxmph { + TriGraph::TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices) : nedges_(0), edges_(nedges), first_edge_(nvertices, kInvalidEdge), vertex_degree_(nvertices, 0) { } -void TriGraph::ExtractEdgesAndClear(vector* edges) { +void TriGraph::ExtractEdgesAndClear(vector* edges) { + vector().swap(next_edge_); vector().swap(first_edge_); vector().swap(vertex_degree_); nedges_ = 0; edges->swap(edges_); } -void TriGraph::AddEdge(const Edge& edge) { } -void TriGraph::RemoveEdge(cmph_uint32 current_edge) { } +void TriGraph::AddEdge(const Edge& edge) { + edges_[nedges_] = edge; + next_edge_[nedges_] = Edge( + first_edge_[edge[0]], first_edge_[edge[1]], first_edge_[edge[2]]); + first_edge_[edge[0]] = first_edge_[edge[1]] = first_edge_[edge[2]] = nedges_; + ++vertex_degree_[edge[0]]; + ++vertex_degree_[edge[1]]; + ++vertex_degree_[edge[2]]; + ++nedges_; +} + +void TriGraph::RemoveEdge(cmph_uint32 current_edge) { + cmph_uint32 vertex, edge1, edge2; + for (int i = 0; i < 3; ++i) { + cmph_uint32 vertex = 
edges_[current_edge][i]; + cmph_uint32 edge1 = first_edge_[vertex]; + cmph_uint32 edge2 = kInvalidEdge; + cmph_uint32 j = 0; + while (edge1 != current_edge && edge1 != kInvalidEdge) { + edge2 = edge1; + if (edges_[edge1][0] == vertex) j = 0; + else if (edges_[edge1][1] == vertex) j = 1; + else j = 2; + edge1 = next_edge_[edge1][j]; + } + assert(edge1 != kInvalidEdge); + if (edge2 != kInvalidEdge) next_edge_[edge2][j] = next_edge_[edge1][i]; + else first_edge_[vertex] = next_edge_[edge1][i]; + --vertex_degree_[vertex]; + } +} + +} // namespace cxxmph diff --git a/cxxmph/trigraph.h b/cxxmph/trigraph.h index e4f8440..9d60151 100644 --- a/cxxmph/trigraph.h +++ b/cxxmph/trigraph.h @@ -1,26 +1,43 @@ +#ifndef __CXXMPH_TRIGRAPH_H__ +#define __CXXMPH_TRIGRAPH_H__ +// Build a trigraph using a memory efficient representation. +// +// Prior knowledge of the number of edges and vertices for the graph is +// required. For each vertex, we store how many edges touch it (degree) and the +// index of the first edge in the vector of triples representing the edges. 
+ + #include #include "../src/cmph_types.h" +namespace cxxmph { + class TriGraph { struct Edge { Edge() { } Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2); + cmph_uint32& operator[](cmph_uint8 v) { return vertices[v]; } + const cmph_uint32& operator[](cmph_uint8 v) const { return vertices[v]; } cmph_uint32 vertices[3]; }; - struct ConnectedEdge { - Edge current; - Edge next; - }; - TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices); void AddEdge(const Edge& edge); - void RemoveEdge(cmph_uint32 current_edge); - void ExtractEdgesAndClear(std::vector* edges); + void RemoveEdge(cmph_uint32 edge_id); + void ExtractEdgesAndClear(std::vector* edges); + + const std::vector& edges() const { return edges_; } + const std::vector& vertex_degree() const { return vertex_degree_; } + const std::vector& first_edge() const { return first_edge_; } private: - cmph_uint32 nedges_; - std::vector edges_; - std::vector first_edge_; - std::vector vertex_degree_; + cmph_uint32 nedges_; // total number of edges + std::vector edges_; + std::vector next_edge_; // for implementing removal + std::vector first_edge_; // the first edge for this vertex + std::vector vertex_degree_; // number of edges for this vertex }; + +} // namespace cxxmph + +#endif // __CXXMPH_TRIGRAPH_H__ From 385ce27a109e0daebb854a7f727b187881b82814 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Wed, 27 Oct 2010 17:17:09 -0700 Subject: [PATCH 04/89] Added half nibble code. 
--- cxxmph/Makefile.am | 5 +- cxxmph/MurmurHash2.h | 9 ++ cxxmph/mphtable.cc | 187 +++++++++++++--------------------- cxxmph/mphtable.h | 87 +++++++++++++--- cxxmph/mphtable_test.cc | 22 ++-- cxxmph/randomly_seeded_hash.h | 22 +++- cxxmph/trigraph.cc | 16 ++- cxxmph/trigraph.h | 7 +- 8 files changed, 213 insertions(+), 142 deletions(-) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index da7fa84..10bd278 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cmph_hash_map_test mphtable_test +bin_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test lib_LTLIBRARIES = libcxxmph.la libcxxmph_la_SOURCES = stringpiece.h MurmurHash2.h randomly_seeded_hash.h trigragh.h trigraph.cc mphtable.h mphtable.cc @@ -9,3 +9,6 @@ cmph_hash_map_test_SOURCES = cmph_hash_map_test.cc mphtable_test_LDADD = libcxxmph.la mphtable_test_SOURCES = mphtable_test.cc + +trigraph_test_LDADD = libcxxmph.la +trigraph_test_SOURCES = trigraph_test.cc diff --git a/cxxmph/MurmurHash2.h b/cxxmph/MurmurHash2.h index 81051fe..aa9338f 100644 --- a/cxxmph/MurmurHash2.h +++ b/cxxmph/MurmurHash2.h @@ -1,3 +1,6 @@ +#ifndef __CXXMPH_MURMUR_HASH2__ +#define __CXXMPH_MURMUR_HASH2__ + //----------------------------------------------------------------------------- // MurmurHash2, by Austin Appleby @@ -12,6 +15,8 @@ // 2. It will not produce the same results on little-endian and big-endian // machines. +namespace { + unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) { // 'm' and 'r' are mixing constants generated offline. 
@@ -62,3 +67,7 @@ unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) return h; } + +} + +#endif // __CXXMPH_MURMUR_HASH2__ diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index 2c5ba32..88ab6ed 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -1,49 +1,58 @@ #include +#include + +using std::cerr; +using std::endl; #include "mphtable.h" using std::vector; -namespace cxxmph { +namespace { -template -template -bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { - TableBuilderState st; - m_ = end - begin; - r_ = static_cast(ceil((c_*m_)/3)); - if (r_ % 2) == 0) r_ += 1; - n_ = 3*r_; - k_ = 1U << b_; +static const cmph_uint8 kUnassigned = 3; +// table used for looking up the number of assigned vertices to a 8-bit integer +static cmph_uint8 kBdzLookupTable[] = +{ +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 +}; - int iterations = 1000; - while (1) { - for (int i = 0; i < 3; ++i) hash_function_[i] = hasher(); - vector edges; - vector queue; - if (Mapping(begin, end, &edges, &queue)) break; - else --iterations; - if (iterations == 0) break; - } - if (iterations == 0) return false; - vector& edges; - graph->ExtractEdgesAndClear(&edges); - Assigning(queue, edges); - vector().swap(edges); - 
Ranking(); - +static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; +void set_2bit_value(vector *d, cmph_uint8 i, cmph_uint8 v) { + (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; +} +cmph_uint8 get_2bit_value(const vector& d, cmph_uint8 i) { + return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; } -template -bool MPHTable::GenerateQueue( +} // anonymous namespace + +namespace cxxmph { + +bool MPHTable::GenerateQueue( TriGraph* graph, vector* queue_output) { cmph_uint32 queue_head = 0, queue_tail = 0; - cmph_uint32 nedges = n_; - cmph_uint32 nvertices = m_; + cmph_uint32 nedges = m_; + cmph_uint32 nvertices = n_; // Relies on vector using 1 bit per element vector marked_edge((nedges >> 3) + 1, false); - Queue queue(nvertices, 0); - for (int i = 0; i < nedges; ++i) { + vector queue(nvertices, 0); + for (cmph_uint32 i = 0; i < nedges; ++i) { const TriGraph::Edge& e = graph->edges()[i]; if (graph->vertex_degree()[e[0]] == 1 || graph->vertex_degree()[e[1]] == 1 || @@ -74,102 +83,56 @@ bool MPHTable::GenerateQueue( return cycles == 0; } -template -template -bool MPHTable::Mapping( - ForwardIterator begin, ForwardIterator end, - vector* edges, vector queue) { - int cycles = 0; - TriGraph graph(m, n); - for (ForwardIterator it = begin; it != end; ++it) { - cmph_uint32 h[3]; - for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); - cmph_uint32 v0 = h[0] % r_; - cmph_uint32 v1 = h[1] % r_ + r_; - cmph_uint32 v2 = h[2] % r_ + (r_ << 1); - graph.AddEdge(Edge(v0, v1, v2)); - } - if (GenerateQueue(&graph, queue)) { - graph.ExtractEdgesAndClear(edges); - return true; - } - return false; -} - -template -void MPHTable::Assigning( - const vector& edges, const vector& queue) { +void MPHTable::Assigning( + const vector& edges, const vector& queue) { cmph_uint32 nedges = n_; cmph_uint32 current_edge = 0; vector marked_vertices(nedges + 1); - // TODO(davi) use half nibbles instead - // vector g(static_cast(ceil(nedges / 4.0)), - // std::numerical_limits::max()); - 
static const cmph_uint8 kUnassigned = 3; - vector(nedges, kUnassigned).swap(g_); + // Initialize vector of half nibbles with all bits set. + vector(nedges, std::numeric_limits::max()).swap(g_); for (int i = nedges - 1; i + 1 >= 1; --i) { current_edge = queue[i]; const TriGraph::Edge& e = edges[current_edge]; if (!marked_vertices[e[0]]) { if (!marked_vertices[e[1]]) { - g_[e[1]] = kUnassigned; + set_2bit_value(&g_, e[1], kUnassigned); marked_vertices[e[1]] = true; } if (!marked_vertices[e[2]]) { - g_[e[2]] = kUnassigned; + set_2bit_value(&g_, e[2], kUnassigned); marked_vertices[e[2]] = true; } - g_[e[0]] = (6 - g_[e[1]] + g_[e2]) % 3; + set_2bit_value(&g_, e[0], (6 - (get_2bit_value(g_, e[1]) + get_2bit_value(g_, e[2]))) % 3); marked_vertices[e[0]] = true; - } else if (!marked_vertices[e[1]])) { - if (!marked_vertices[e[2]])) { - g_[e[2]] = kUnassigned; + } else if (!marked_vertices[e[1]]) { + if (!marked_vertices[e[2]]) { + set_2bit_value(&g_, e[2], kUnassigned); marked_vertices[e[2]] = true; } - g_[e[1]] = 7 - (g_[e[0]] + g_[e[2]]) % 3; + set_2bit_value(&g_, e[1], (7 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[2]))) % 3); marked_vertices[e[1]] = true; } else { - g_[e[2]] = (8 - g_[e[0]] + g_[e[1]]) % 3; + set_2bit_value(&g_, e[2], (8 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[1]))) % 3); marked_vertices[e[2]] = true; } } } -// table used for looking up the number of assigned vertices to a 8-bit integer -static cmph_uint8 kBdzLookupTable[] = -{ -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, -4, 4, 4, 3, 4, 4, 4, 3, 
4, 4, 4, 3, 3, 3, 3, 2, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, -2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 -}; - -template -void MPHTable::Ranking() { - cmph_uint32 nbytes_total = static_cast(ceil(st->n / 4.0)); +void MPHTable::Ranking() { + cmph_uint32 nbytes_total = static_cast(ceil(n_ / 4.0)); cmph_uint32 size = k_ >> 2U; - ranktablesize = static_cast(ceil(n_ / static_cast(k_))); - // TODO(davi) Change swap of member classes for resize + memset to avoid fragmentation + cmph_uint32 ranktablesize = static_cast( + ceil(n_ / static_cast(k_))); + // TODO(davi) Change swap of member classes for resize + memset to avoid + // fragmentation vector (ranktablesize).swap(ranktable_);; cmph_uint32 offset = 0; cmph_uint32 count = 0; cmph_uint32 i = 0; while (1) { - if (i == ranktable.size()) break; + if (i == ranktable_.size()) break; cmph_uint32 nbytes = size < nbytes_total ? 
size : nbytes_total; - for (j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; + for (cmph_uint32 j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; ranktable_[i] = count; offset += nbytes; nbytes_total -= size; @@ -177,36 +140,32 @@ void MPHTable::Ranking() { } } -template -cmph_uint32 MPHTable::Search(const key_type& key) const { - cmph_uint32 vertex; +cmph_uint32 MPHTable::Search(const key_type& key) const { cmph_uint32 h[3]; for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key); - h[0] = h[0] % st->r; - h[1] = h[1] % st->r + st->r; - h[2] = h[2] % st->r + (st->r << 1); - cmph_uint32 vertex = h[(h[g_[h[0]] + g_[h[1]] + g_[h[2]]) % 3]; - return Rank(st->b, st->ranktable, vertex); + h[0] = h[0] % r_; + h[1] = h[1] % r_ + r_; + h[2] = h[2] % r_ + (r_ << 1); + cmph_uint32 vertex = h[(g_[h[0]] + g_[h[1]] + g_[h[2]]) % 3]; + return Rank(vertex); } -template -cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { +cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { cmph_uint32 index = vertex >> b_; cmph_uint32 base_rank = ranktable_[index]; - cmph_uint32 beg_idx_v = index << b; - cmph_uint32 beg_idx_b = index >> 2 - cmph_uint32 end_idx_b = index >> 2 + cmph_uint32 beg_idx_v = index << b_; + cmph_uint32 beg_idx_b = index >> 2; + cmph_uint32 end_idx_b = index >> 2; while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; while (beg_idx_v < vertex) { - if (g_[beg_idx_v) != kUnassigned) ++base_rank; + if (g_[beg_idx_v] != kUnassigned) ++base_rank; ++beg_idx_v; } return base_rank; } -template -cmph_uint32 MPHTable::index(const key_type& key) const { +cmph_uint32 MPHTable::index(const key_type& key) const { return Search(key); } diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index eccff61..c0ef402 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -3,21 +3,29 @@ // Minimal perfect hash abstraction implementing the BDZ algorithm +#include #include +#include + +using std::cerr; 
+using std::endl; + #include "randomly_seeded_hash.h" #include "stringpiece.h" #include "trigraph.h" namespace cxxmph { -template class MPHTable { public: - typedef Key key_type; - typedef NewRandomlySeededHashFcn hasher; + // This class could be a template for both key type and hash function, but we + // chose to go with simplicity. + typedef StringPiece key_type; + typedef RandomlySeededHashFunction hasher_type; + MPHTable(double c = 1.23, cmph_uint8 b = 7) : c_(c), b_(b) { } - ~MPHTable(); + ~MPHTable() {} template bool Reset(ForwardIterator begin, ForwardIterator end); @@ -26,21 +34,23 @@ class MPHTable { private: template bool Mapping(ForwardIterator begin, ForwardIterator end, - vector* edges, vector queue); - bool GenerateQueue(TriGraph* graph, vector* queue); - void Assigning(TriGraph* graph_builder, Queue* queue); - void Ranking(TriGraph* graph_builder, Queue* queue); - cmph_uint32 Search(const StringPiece& key); - cmph_uint32 Rank(const StringPiece& key); + std::vector* edges, + std::vector* queue); + bool GenerateQueue(TriGraph* graph, std::vector* queue); + void Assigning(const std::vector& edges, + const std::vector& queue); + void Ranking(); + cmph_uint32 Search(const key_type& key) const; + cmph_uint32 Rank(cmph_uint32 vertex) const; // Algorithm parameters - cmph_uint8 b_; // Number of bits of the kth index in the ranktable double c_; // Number of bits per key (? is it right) + cmph_uint8 b_; // Number of bits of the kth index in the ranktable // Values used during generation cmph_uint32 m_; // edges count cmph_uint32 n_; // vertex count - cmph_uint32 k_ // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ + cmph_uint32 k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ // Values used during search @@ -52,10 +62,59 @@ class MPHTable { std::vector ranktable_; // The selected hash function triplet for finding the edges in the minimal // perfect hash function graph. 
- hasher hash_function_[3]; + hasher_type hash_function_[3]; }; +// Template method needs to go in the header file. +template +bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { + m_ = end - begin; + r_ = static_cast(ceil((c_*m_)/3)); + if ((r_ % 2) == 0) r_ += 1; + n_ = 3*r_; + k_ = 1U << b_; + + cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; + + int iterations = 1000; + std::vector edges; + std::vector queue; + while (1) { + cerr << "Iterations missing: " << iterations << endl; + for (int i = 0; i < 3; ++i) hash_function_[i] = hasher_type(); + if (Mapping(begin, end, &edges, &queue)) break; + else --iterations; + if (iterations == 0) break; + } + if (iterations == 0) return false; + Assigning(edges, queue); + std::vector().swap(edges); + Ranking(); + return true; +} + +template +bool MPHTable::Mapping( + ForwardIterator begin, ForwardIterator end, + std::vector* edges, std::vector* queue) { + TriGraph graph(n_, m_); + for (ForwardIterator it = begin; it != end; ++it) { + cmph_uint32 h[3]; + for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); + cmph_uint32 v0 = h[0] % r_; + cmph_uint32 v1 = h[1] % r_ + r_; + cmph_uint32 v2 = h[2] % r_ + (r_ << 1); + cerr << "Key: " << *it << " vertex " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; + graph.AddEdge(TriGraph::Edge(v0, v1, v2)); + } + if (GenerateQueue(&graph, queue)) { + graph.ExtractEdgesAndClear(edges); + return true; + } + return false; +} + } // namespace cxxmph -#define // __CXXMPH_MPHTABLE_H__ +#endif // __CXXMPH_MPHTABLE_H__ diff --git a/cxxmph/mphtable_test.cc b/cxxmph/mphtable_test.cc index e18b34d..b08ffc5 100644 --- a/cxxmph/mphtable_test.cc +++ b/cxxmph/mphtable_test.cc @@ -1,22 +1,30 @@ #include +#include #include #include "mphtable.h" +using std::string; using std::vector; using cxxmph::MPHTable; int main(int argc, char** argv) { - vector keys; - keys.push_back(10); - keys.push_back(4); - keys.push_back(3); + vector keys; + 
keys.push_back("davi"); + keys.push_back("paulo"); + keys.push_back("joao"); + keys.push_back("maria"); + keys.push_back("bruno"); - MPHTable mphtable; + MPHTable mphtable; assert(mphtable.Reset(keys.begin(), keys.end())); vector ids; - for (int i = 0; i < keys.size(); ++i) ids.push_back(mphtable.index(keys[i])); + for (vector::size_type i = 0; i < keys.size(); ++i) { + ids.push_back(mphtable.index(keys[i])); + cerr << " " << *(ids.end() - 1); + } + cerr << endl; sort(ids.begin(), ids.end()); - for (int i = 0; i < ids.size(); ++i) assert(ids[i] == i); + for (vector::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast::value_type>(i)); } diff --git a/cxxmph/randomly_seeded_hash.h b/cxxmph/randomly_seeded_hash.h index 69db56a..fa382dd 100644 --- a/cxxmph/randomly_seeded_hash.h +++ b/cxxmph/randomly_seeded_hash.h @@ -8,17 +8,35 @@ #include "../src/cmph_types.h" #include "MurmurHash2.h" +#include "stringpiece.h" namespace cxxmph { -struct RandomlySeededMurmur2 { +template +struct RandomlySeededHashFunction { }; + +class Murmur2StringPiece { }; +class Murmur2Pod { }; + +template <> +struct RandomlySeededHashFunction { RandomlySeededHashFunction() : seed(random()) { } - cmph_uint32 operator()(const StringPiece& key) { + cmph_uint32 operator()(const StringPiece& key) const { return MurmurHash2(key.data(), key.length(), seed); } cmph_uint32 seed; }; +template<> +struct RandomlySeededHashFunction { + RandomlySeededHashFunction() : seed(random()) { } + template + cmph_uint32 operator()(const Key& key) const { + return MurmurHash2(&key, sizeof(key), seed); + } + cmph_uint32 seed; +}; + } // namespace cxxmph #endif // __CXXMPH_RANDOMLY_SEEDED_HASH__ diff --git a/cxxmph/trigraph.cc b/cxxmph/trigraph.cc index 63c36e1..ff738a6 100644 --- a/cxxmph/trigraph.cc +++ b/cxxmph/trigraph.cc @@ -1,8 +1,11 @@ #include #include +#include #include "trigraph.h" +using std::cerr; +using std::endl; using std::vector; namespace { @@ -11,9 +14,10 @@ static const cmph_uint8 
kInvalidEdge = std::numeric_limits::max(); namespace cxxmph { -TriGraph::TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices) +TriGraph::TriGraph(cmph_uint32 nvertices, cmph_uint32 nedges) : nedges_(0), edges_(nedges), + next_edge_(nedges), first_edge_(nvertices, kInvalidEdge), vertex_degree_(nvertices, 0) { } @@ -25,7 +29,13 @@ void TriGraph::ExtractEdgesAndClear(vector* edges) { edges->swap(edges_); } void TriGraph::AddEdge(const Edge& edge) { - edges_[nedges_] = edge; + edges_[nedges_] = edge; + assert(first_edge_.size() > edge[0]); + assert(first_edge_.size() > edge[1]); + assert(first_edge_.size() > edge[0]); + assert(first_edge_.size() > edge[1]); + assert(first_edge_.size() > edge[2]); + assert(next_edge_.size() > nedges_); next_edge_[nedges_] = Edge( first_edge_[edge[0]], first_edge_[edge[1]], first_edge_[edge[2]]); first_edge_[edge[0]] = first_edge_[edge[1]] = first_edge_[edge[2]] = nedges_; @@ -36,7 +46,7 @@ void TriGraph::AddEdge(const Edge& edge) { } void TriGraph::RemoveEdge(cmph_uint32 current_edge) { - cmph_uint32 vertex, edge1, edge2; + cerr << "Removing edge " << current_edge << " from " << nedges_ << " existing edges " << endl; for (int i = 0; i < 3; ++i) { cmph_uint32 vertex = edges_[current_edge][i]; cmph_uint32 edge1 = first_edge_[vertex]; diff --git a/cxxmph/trigraph.h b/cxxmph/trigraph.h index 9d60151..18d8d98 100644 --- a/cxxmph/trigraph.h +++ b/cxxmph/trigraph.h @@ -14,9 +14,14 @@ namespace cxxmph { class TriGraph { + public: struct Edge { Edge() { } - Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2); + Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2) { + vertices[0] = v0; + vertices[1] = v1; + vertices[2] = v2; + } cmph_uint32& operator[](cmph_uint8 v) { return vertices[v]; } const cmph_uint32& operator[](cmph_uint8 v) const { return vertices[v]; } cmph_uint32 vertices[3]; From 22d149d3a8d362391df3c3e7f1a6e81c65203861 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Wed, 27 Oct 2010 19:45:43 -0700 Subject: [PATCH 05/89] It 
works. --- cxxmph/mphtable.cc | 54 ++++++++++++++++++++++++++++------- cxxmph/mphtable.h | 11 ++++--- cxxmph/mphtable_test.cc | 2 ++ cxxmph/randomly_seeded_hash.h | 17 +++++++++++ src/bdz.c | 10 +++++-- src/jenkins_hash.c | 4 +-- 6 files changed, 80 insertions(+), 18 deletions(-) diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index 88ab6ed..6f6a788 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -36,7 +36,7 @@ static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; void set_2bit_value(vector *d, cmph_uint8 i, cmph_uint8 v) { (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; } -cmph_uint8 get_2bit_value(const vector& d, cmph_uint8 i) { +cmph_uint32 get_2bit_value(const vector& d, cmph_uint8 i) { return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; } @@ -50,7 +50,7 @@ bool MPHTable::GenerateQueue( cmph_uint32 nedges = m_; cmph_uint32 nvertices = n_; // Relies on vector using 1 bit per element - vector marked_edge((nedges >> 3) + 1, false); + vector marked_edge(nedges + 1, false); vector queue(nvertices, 0); for (cmph_uint32 i = 0; i < nedges; ++i) { const TriGraph::Edge& e = graph->edges()[i]; @@ -63,6 +63,15 @@ bool MPHTable::GenerateQueue( } } } + for (unsigned int i = 0; i < marked_edge.size(); ++i) { + cerr << "vertex with degree " << static_cast(graph->vertex_degree()[i]) << " marked " << marked_edge[i] << endl; + } + for (unsigned int i = 0; i < queue.size(); ++i) { + cerr << "vertex " << i << " queued at " << queue[i] << endl; + } + // At this point queue head is the number of edges touching at least one + // vertex of degree 1. 
+ cerr << "Queue head " << queue_head << " Queue tail " << queue_tail << endl; while (queue_tail != queue_head) { cmph_uint32 current_edge = queue[queue_tail++]; graph->RemoveEdge(current_edge); @@ -78,6 +87,9 @@ bool MPHTable::GenerateQueue( } } } + for (unsigned int i = 0; i < queue.size(); ++i) { + cerr << "vertex " << i << " queued at " << queue[i] << endl; + } int cycles = queue_head - nedges; if (cycles == 0) queue.swap(*queue_output); return cycles == 0; @@ -85,14 +97,21 @@ bool MPHTable::GenerateQueue( void MPHTable::Assigning( const vector& edges, const vector& queue) { - cmph_uint32 nedges = n_; + cmph_uint32 nedges = m_; cmph_uint32 current_edge = 0; vector marked_vertices(nedges + 1); // Initialize vector of half nibbles with all bits set. - vector(nedges, std::numeric_limits::max()).swap(g_); + cmph_uint32 sizeg = static_cast(ceil(n_/4.0)); + vector(sizeg, std::numeric_limits::max()).swap(g_); + for (int i = nedges - 1; i + 1 >= 1; --i) { current_edge = queue[i]; + cerr << "Current edge " << current_edge << " at queue pos " << i << endl; const TriGraph::Edge& e = edges[current_edge]; + cerr << "B: " << e[0] << " " << e[1] << " " << e[2] << " -> " + << get_2bit_value(g_, e[0]) << " " + << get_2bit_value(g_, e[1]) << " " + << get_2bit_value(g_, e[2]) << " " << endl; if (!marked_vertices[e[0]]) { if (!marked_vertices[e[1]]) { set_2bit_value(&g_, e[1], kUnassigned); @@ -115,6 +134,10 @@ void MPHTable::Assigning( set_2bit_value(&g_, e[2], (8 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[1]))) % 3); marked_vertices[e[2]] = true; } + cerr << "A: " << e[0] << " " << e[1] << " " << e[2] << " -> " + << get_2bit_value(g_, e[0]) << " " + << get_2bit_value(g_, e[1]) << " " + << get_2bit_value(g_, e[2]) << " " << endl; } } @@ -128,7 +151,7 @@ void MPHTable::Ranking() { vector (ranktablesize).swap(ranktable_);; cmph_uint32 offset = 0; cmph_uint32 count = 0; - cmph_uint32 i = 0; + cmph_uint32 i = 1; while (1) { if (i == ranktable_.size()) break; cmph_uint32 
nbytes = size < nbytes_total ? size : nbytes_total; @@ -142,11 +165,13 @@ void MPHTable::Ranking() { cmph_uint32 MPHTable::Search(const key_type& key) const { cmph_uint32 h[3]; - for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key); + // for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key); + hash_function_[0](key, h); h[0] = h[0] % r_; h[1] = h[1] % r_ + r_; h[2] = h[2] % r_ + (r_ << 1); - cmph_uint32 vertex = h[(g_[h[0]] + g_[h[1]] + g_[h[2]]) % 3]; + cmph_uint32 vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3]; + cerr << "Search found vertex " << vertex << endl; return Rank(vertex); } @@ -154,14 +179,23 @@ cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { cmph_uint32 index = vertex >> b_; cmph_uint32 base_rank = ranktable_[index]; cmph_uint32 beg_idx_v = index << b_; - cmph_uint32 beg_idx_b = index >> 2; - cmph_uint32 end_idx_b = index >> 2; + cmph_uint32 beg_idx_b = beg_idx_v >> 2; + cmph_uint32 end_idx_b = vertex >> 2; while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; + cerr << "beg_idx_v: " << beg_idx_v << endl; + cerr << "base rank: " << base_rank << endl; + + cerr << "G: "; + for (unsigned int i = 0; i < n_; ++i) { + cerr << get_2bit_value(g_, i) << " "; + } while (beg_idx_v < vertex) { - if (g_[beg_idx_v] != kUnassigned) ++base_rank; + cerr << get_2bit_value(g_, beg_idx_v) << " "; + if (get_2bit_value(g_, beg_idx_v) != kUnassigned) ++base_rank; ++beg_idx_v; } + cerr << "Base rank: " << base_rank << endl; return base_rank; } diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index c0ef402..84d56df 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -22,7 +22,7 @@ class MPHTable { // This class could be a template for both key type and hash function, but we // chose to go with simplicity. 
typedef StringPiece key_type; - typedef RandomlySeededHashFunction hasher_type; + typedef RandomlySeededHashFunction hasher_type; MPHTable(double c = 1.23, cmph_uint8 b = 7) : c_(c), b_(b) { } ~MPHTable() {} @@ -82,7 +82,9 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { std::vector queue; while (1) { cerr << "Iterations missing: " << iterations << endl; - for (int i = 0; i < 3; ++i) hash_function_[i] = hasher_type(); + // for (int i = 0; i < 3; ++i) hash_function_[i] = hasher_type(); + hash_function_[0] = hasher_type(); + cerr << "Seed: " << hash_function_[0].seed << endl; if (Mapping(begin, end, &edges, &queue)) break; else --iterations; if (iterations == 0) break; @@ -101,11 +103,12 @@ bool MPHTable::Mapping( TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { cmph_uint32 h[3]; - for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); + // for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); + hash_function_[0](*it, h); cmph_uint32 v0 = h[0] % r_; cmph_uint32 v1 = h[1] % r_ + r_; cmph_uint32 v2 = h[2] % r_ + (r_ << 1); - cerr << "Key: " << *it << " vertex " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; + cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; graph.AddEdge(TriGraph::Edge(v0, v1, v2)); } if (GenerateQueue(&graph, queue)) { diff --git a/cxxmph/mphtable_test.cc b/cxxmph/mphtable_test.cc index b08ffc5..8986ee0 100644 --- a/cxxmph/mphtable_test.cc +++ b/cxxmph/mphtable_test.cc @@ -9,6 +9,8 @@ using std::vector; using cxxmph::MPHTable; int main(int argc, char** argv) { + + srand(1); vector keys; keys.push_back("davi"); keys.push_back("paulo"); diff --git a/cxxmph/randomly_seeded_hash.h b/cxxmph/randomly_seeded_hash.h index fa382dd..60ab32d 100644 --- a/cxxmph/randomly_seeded_hash.h +++ b/cxxmph/randomly_seeded_hash.h @@ -8,6 +8,7 @@ #include "../src/cmph_types.h" #include "MurmurHash2.h" +#include "jenkins_hash.h" #include 
"stringpiece.h" namespace cxxmph { @@ -15,9 +16,25 @@ namespace cxxmph { template struct RandomlySeededHashFunction { }; +class JenkinsStringPiece { }; class Murmur2StringPiece { }; class Murmur2Pod { }; +template <> +struct RandomlySeededHashFunction { + RandomlySeededHashFunction() { + srand(1); + seed = 4; + } + cmph_uint32 operator()(const StringPiece& key) const { + return jenkins_hash(key.data(), key.length(), seed); + } + void operator()(const StringPiece& key, cmph_uint32* hashes) const { + __jenkins_hash_vector(seed, key.data(), key.length(), hashes); + } + cmph_uint32 seed; +}; + template <> struct RandomlySeededHashFunction { RandomlySeededHashFunction() : seed(random()) { } diff --git a/src/bdz.c b/src/bdz.c index f422c8f..5dce597 100755 --- a/src/bdz.c +++ b/src/bdz.c @@ -9,7 +9,7 @@ #include #include #include -//#define DEBUG +#define DEBUG #include "debug.h" #define UNASSIGNED 3U #define NULL_EDGE 0xffffffff @@ -177,9 +177,11 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que } }; }; + DEBUGP("Queue head %d Queue tail %d\n", queue_head, queue_tail); while(queue_tail!=queue_head){ curr_edge=queue[queue_tail++]; bdz_remove_edge(graph3,curr_edge); + DEBUGP("Removing edge %d\n", curr_edge); v0=graph3->edges[curr_edge].vertices[0]; v1=graph3->edges[curr_edge].vertices[1]; v2=graph3->edges[curr_edge].vertices[2]; @@ -403,6 +405,7 @@ static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t que h0 = hl[0] % bdz->r; h1 = hl[1] % bdz->r + bdz->r; h2 = hl[2] % bdz->r + (bdz->r << 1); + DEBUGP("Key: %s (%u %u %u)\n", key, h0, h1, h2); mph->key_source->dispose(mph->key_source->data, key, keylen); bdz_add_edge(graph3,h0,h1,h2); } @@ -427,7 +430,7 @@ static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t v0=graph3->edges[curr_edge].vertices[0]; v1=graph3->edges[curr_edge].vertices[1]; v2=graph3->edges[curr_edge].vertices[2]; - DEBUGP("B:%u %u %u -- %u %u %u\n", v0, v1, v2, 
GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2)); + DEBUGP("B:%u %u %u -- %u %u %u edge %u\n", v0, v1, v2, GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2), curr_edge); if(!GETBIT(marked_vertices, v0)){ if(!GETBIT(marked_vertices,v1)) { @@ -585,7 +588,9 @@ static inline cmph_uint32 rank(cmph_uint32 b, cmph_uint32 * ranktable, cmph_uint base_rank += bdz_lookup_table[*(g + beg_idx_b++)]; } + DEBUGP("base rank %u\n", base_rank); beg_idx_v = beg_idx_b << 2; + DEBUGP("beg_idx_v %u\n", beg_idx_v); while(beg_idx_v < vertex) { if(GETVALUE(g, beg_idx_v) != UNASSIGNED) base_rank++; @@ -605,6 +610,7 @@ cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) hl[1] = hl[1] % bdz->r + bdz->r; hl[2] = hl[2] % bdz->r + (bdz->r << 1); vertex = hl[(GETVALUE(bdz->g, hl[0]) + GETVALUE(bdz->g, hl[1]) + GETVALUE(bdz->g, hl[2])) % 3]; + DEBUGP("Search found vertex %u\n", vertex); return rank(bdz->b, bdz->ranktable, bdz->g, vertex); } diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c index f5233a5..4697f74 100644 --- a/src/jenkins_hash.c +++ b/src/jenkins_hash.c @@ -7,7 +7,7 @@ #include #include -//#define DEBUG +#define DEBUG #include "debug.h" #define hashsize(n) ((cmph_uint32)1<<(n)) @@ -87,8 +87,8 @@ acceptable. Do NOT use for cryptographic purposes. jenkins_state_t *jenkins_state_new(cmph_uint32 size) //size of hash table { jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); - DEBUGP("Initializing jenkins hash\n"); state->seed = ((cmph_uint32)rand() % size); + DEBUGP("Initializied jenkins hash with seed %d\n", state->seed); return state; } void jenkins_state_destroy(jenkins_state_t *state) From 5fab72278109bfed4a77990ca52ea826ec732e8c Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Thu, 28 Oct 2010 17:53:40 -0700 Subject: [PATCH 06/89] Now going to adapt hash_map. 
--- cxxmph/mphtable.cc | 4 ++-- cxxmph/mphtable.h | 10 +++++----- cxxmph/mphtable_test.cc | 4 ++++ 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index 6f6a788..8d461c2 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -165,8 +165,8 @@ void MPHTable::Ranking() { cmph_uint32 MPHTable::Search(const key_type& key) const { cmph_uint32 h[3]; - // for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key); - hash_function_[0](key, h); + for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key); + // hash_function_[0](key, h); h[0] = h[0] % r_; h[1] = h[1] % r_ + r_; h[2] = h[2] % r_ + (r_ << 1); diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index 84d56df..d130ace 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -22,7 +22,7 @@ class MPHTable { // This class could be a template for both key type and hash function, but we // chose to go with simplicity. typedef StringPiece key_type; - typedef RandomlySeededHashFunction hasher_type; + typedef RandomlySeededHashFunction hasher_type; MPHTable(double c = 1.23, cmph_uint8 b = 7) : c_(c), b_(b) { } ~MPHTable() {} @@ -82,8 +82,8 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { std::vector queue; while (1) { cerr << "Iterations missing: " << iterations << endl; - // for (int i = 0; i < 3; ++i) hash_function_[i] = hasher_type(); - hash_function_[0] = hasher_type(); + for (int i = 0; i < 3; ++i) hash_function_[i] = hasher_type(); + // hash_function_[0] = hasher_type(); cerr << "Seed: " << hash_function_[0].seed << endl; if (Mapping(begin, end, &edges, &queue)) break; else --iterations; @@ -103,8 +103,8 @@ bool MPHTable::Mapping( TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { cmph_uint32 h[3]; - // for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); - hash_function_[0](*it, h); + for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); + // hash_function_[0](*it, h); cmph_uint32 v0 = h[0] % r_; 
cmph_uint32 v1 = h[1] % r_ + r_; cmph_uint32 v2 = h[2] % r_ + (r_ << 1); diff --git a/cxxmph/mphtable_test.cc b/cxxmph/mphtable_test.cc index 8986ee0..c444b49 100644 --- a/cxxmph/mphtable_test.cc +++ b/cxxmph/mphtable_test.cc @@ -17,6 +17,10 @@ int main(int argc, char** argv) { keys.push_back("joao"); keys.push_back("maria"); keys.push_back("bruno"); + keys.push_back("paula"); + keys.push_back("diego"); + keys.push_back("diogo"); + keys.push_back("algume"); MPHTable mphtable; assert(mphtable.Reset(keys.begin(), keys.end())); From 7ead7bff2fe424091c4ed179ebc7a483a3bf173e Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Thu, 28 Oct 2010 23:26:37 -0700 Subject: [PATCH 07/89] Better. --- cxxmph/cmph_hash_map.h | 56 ++++++++++++++++++++---------------- cxxmph/cmph_hash_map_test.cc | 26 ++++++++++++----- cxxmph/mphtable.cc | 3 ++ cxxmph/mphtable.h | 2 ++ 4 files changed, 55 insertions(+), 32 deletions(-) diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index ac061ea..12d39f1 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -2,6 +2,20 @@ #include #include // for std::pair +#include "MurmurHash2.h" +#include "mphtable.h" +#include "iterator_first.h" + +namespace __gnu_cxx { +template <> struct hash { + std::size_t operator()(std::string const& s) const { + return MurmurHash2(s.c_str(), s.length(), 1 /* seed */); + } +}; +} + +namespace cxxmph { + // Save on repetitive typing. 
#define CMPH_TMPL_SPEC template #define CMPH_CLASS_SPEC cmph_hash_map @@ -51,7 +65,7 @@ class cmph_hash_map { private: void rehash(); std::vector values_; - cmph_t* cmph_; + MPHTable table_; typedef typename __gnu_cxx::hash_map slack_type; slack_type slack_; }; @@ -61,12 +75,11 @@ bool operator==(const CMPH_CLASS_SPEC& lhs, const CMPH_CLASS_SPEC& rhs) { return lhs.values_ == rhs.values_; } -CMPH_TMPL_SPEC CMPH_CLASS_SPEC::cmph_hash_map() : cmph_(NULL) { +CMPH_TMPL_SPEC CMPH_CLASS_SPEC::cmph_hash_map() { rehash(); } CMPH_TMPL_SPEC CMPH_CLASS_SPEC::~cmph_hash_map() { - if(cmph_) cmph_destroy(cmph_); } CMPH_METHOD_DECL(insert_return_type, insert)(const value_type& x) { @@ -74,28 +87,22 @@ CMPH_METHOD_DECL(insert_return_type, insert)(const value_type& x) { if (it != end()) return std::make_pair(it, false); values_.push_back(x); slack_.insert(std::make_pair(x.first, values_.size() - 1)); - if ((slack_.size() > 10 && !cmph_) || - (cmph_ && slack_.size() > cmph_size(cmph_) * 2)) rehash(); + if ((slack_.size() > 10 && table_.size() == 0) || + (table_.size() && slack_.size() > table_.size() * 2)) { + rehash(); + } it = find(x.first); - // std::cerr << "inserted " << x.first.i_ << " at " << values_.begin() - it; return std::make_pair(it, true); } CMPH_METHOD_DECL(void_type, rehash)() { if (values_.empty()) return; slack_type().swap(slack_); - cmph_io_adapter_t* source = cmph_io_struct_vector_adapter( - &(values_[0]), sizeof(value_type), 0, sizeof(key_type), values_.size()); - cmph_config_t* cmph_config = cmph_config_new(source); - cmph_config_set_algo(cmph_config, CMPH_CHD); - // cmph_config_set_verbosity(cmph_config, 1); - if (cmph_) cmph_destroy(cmph_); - cmph_ = cmph_new(cmph_config); - cmph_config_destroy(cmph_config); - cmph_io_struct_vector_adapter_destroy(source); + table_.Reset(make_iterator_first(values_.begin()), + make_iterator_first(values_.end())); std::vector new_values(values_.size()); - for (int i = 0; i < values_.size(); ++i) { - size_type id = 
cmph_search(cmph_, reinterpret_cast(&(values_[i].first)), sizeof(key_type)); + for (unsigned int i = 0; i < values_.size(); ++i) { + size_type id = table_.index(values_[i].first); new_values[id] = values_[i]; } values_.swap(new_values); @@ -110,8 +117,7 @@ CMPH_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } CMPH_METHOD_DECL(void_type, clear)() { values_.clear(); slack_.clear(); - cmph_destroy(cmph_); - cmph_ = NULL; + table_.clear(); } CMPH_METHOD_DECL(void_type, erase)(iterator pos) { @@ -129,9 +135,8 @@ CMPH_METHOD_DECL(const_iterator, find)(const key_type& k) const { typename slack_type::const_iterator it = slack_.find(k); if (it != slack_.end()) return values_.begin() + it->second; } - if (!cmph_) return end(); - size_type id = cmph_search(cmph_, reinterpret_cast(&k), - sizeof(key_type)); + if (table_.size() == 0) return end(); + size_type id = table_.index(k); if (key_equal()(values_[id].first, k)) { return values_.begin() + id; } @@ -142,9 +147,8 @@ CMPH_METHOD_DECL(iterator, find)(const key_type& k) { typename slack_type::const_iterator it = slack_.find(k); if (it != slack_.end()) return values_.begin() + it->second; } - if (!cmph_) return end(); - size_type id = cmph_search(cmph_, reinterpret_cast(&k), - sizeof(key_type)); + if (table_.size() == 0) return end(); + size_type id = table_.index(k); if (key_equal()(values_[id].first, k)) { return values_.begin() + id; } @@ -155,3 +159,5 @@ CMPH_METHOD_DECL(iterator, find)(const key_type& k) { CMPH_METHOD_DECL(data_type&, operator[])(const key_type& k) { return insert(std::make_pair(k, data_type())).first->second; } + +} // namespace cxxmph diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/cmph_hash_map_test.cc index ad6961d..c70af58 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/cmph_hash_map_test.cc @@ -1,19 +1,31 @@ #include "cmph_hash_map.h" +#include #include +#include + +using std::string; +using cxxmph::cmph_hash_map; int main(int argc, char** argv) { - cmph_hash_map h; - 
h.insert(std::make_pair(-1,-1)); - for (cmph_hash_map::const_iterator it = h.begin(); it != h.end(); ++it) { + cmph_hash_map h; + h.insert(std::make_pair("-1",-1)); + cmph_hash_map::const_iterator it; + for (it = h.begin(); it != h.end(); ++it) { std::cout << it->first << " -> " << it->second << std::endl; } - std::cout << "Search -1 gives " << h.find(-1)->second << std::endl; - for (int i = 0; i < 1000; ++i) h.insert(std::make_pair(i, i)); + std::cout << "Search -1 gives " << h.find("-1")->second << std::endl; + for (int i = 0; i < 1000; ++i) { + char buf[10]; + snprintf(buf, 10, "%d", i); + h.insert(std::make_pair(buf, i)); + } for (int j = 0; j < 1000; ++j) { for (int i = 1000; i > 0; --i) { - h.find(i - 1); - // std::cout << "Search " << i - 1 << " gives " << h.find(i - 1)->second << std::endl; + char buf[10]; + snprintf(buf, 10, "%d", i - 1); + h.find(buf); + // std::cout << "Search " << i - 1 << " gives " << h.find(i - 1)->second << std::endl; } } } diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index 8d461c2..0b899da 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -44,6 +44,9 @@ cmph_uint32 get_2bit_value(const vector& d, cmph_uint8 i) { namespace cxxmph { +void MPHTable::clear() { + // TODO(davi) impolement me +} bool MPHTable::GenerateQueue( TriGraph* graph, vector* queue_output) { cmph_uint32 queue_head = 0, queue_tail = 0; diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index d130ace..ad8cc13 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -30,6 +30,8 @@ class MPHTable { template bool Reset(ForwardIterator begin, ForwardIterator end); cmph_uint32 index(const key_type& x) const; + cmph_uint32 size() const { return m_; } + void clear(); private: template From 76a88922acfa0342af8c8b08435b6cefa6abdd50 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Thu, 4 Nov 2010 22:57:41 -0200 Subject: [PATCH 08/89] Added iterator first. 
--- cxxmph/iterator_first.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 cxxmph/iterator_first.h diff --git a/cxxmph/iterator_first.h b/cxxmph/iterator_first.h new file mode 100644 index 0000000..d8350af --- /dev/null +++ b/cxxmph/iterator_first.h @@ -0,0 +1,12 @@ +template +struct iterator_first : public iterator { + iterator_first(iterator it) : iterator(it) { } + const typename iterator::value_type::first_type& operator*() const { + return this->iterator::operator*().first; + } +}; + +template +iterator_first make_iterator_first(iterator it) { + return iterator_first(it); +} From 84f7da426cb897bcb49cabc0a258de340ac7c381 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Thu, 4 Nov 2010 22:59:42 -0200 Subject: [PATCH 09/89] Added trigraph test. --- cxxmph/trigraph_test.cc | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 cxxmph/trigraph_test.cc diff --git a/cxxmph/trigraph_test.cc b/cxxmph/trigraph_test.cc new file mode 100644 index 0000000..6220138 --- /dev/null +++ b/cxxmph/trigraph_test.cc @@ -0,0 +1,22 @@ +#include + +#include "trigraph.h" + +using cxxmph::TriGraph; + +int main(int argc, char** argv) { + TriGraph g(4, 2); + g.AddEdge(TriGraph::Edge(0, 1, 2)); + g.AddEdge(TriGraph::Edge(1, 3, 2)); + assert(g.vertex_degree()[0] == 1); + assert(g.vertex_degree()[1] == 2); + assert(g.vertex_degree()[2] == 2); + assert(g.vertex_degree()[3] == 1); + g.RemoveEdge(0); + assert(g.vertex_degree()[0] == 0); + assert(g.vertex_degree()[1] == 1); + assert(g.vertex_degree()[2] == 1); + assert(g.vertex_degree()[3] == 1); + std::vector edges; + g.ExtractEdgesAndClear(&edges); +} From 6c69aa0a8fe2d03837fdb6279d8e83bf0bf8df91 Mon Sep 17 00:00:00 2001 From: davi Date: Fri, 5 Nov 2010 00:17:08 -0200 Subject: [PATCH 10/89] Fixed small bugs. 
--- INSTALL | 234 ---------------------------------- cxxmph/cmph_hash_map_test.cc | 10 +- cxxmph/mphtable.h | 3 +- cxxmph/mphtable_test.cc | 1 + cxxmph/randomly_seeded_hash.h | 17 --- 5 files changed, 8 insertions(+), 257 deletions(-) delete mode 100644 INSTALL diff --git a/INSTALL b/INSTALL deleted file mode 100644 index 5458714..0000000 --- a/INSTALL +++ /dev/null @@ -1,234 +0,0 @@ -Installation Instructions -************************* - -Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005, -2006 Free Software Foundation, Inc. - -This file is free documentation; the Free Software Foundation gives -unlimited permission to copy, distribute and modify it. - -Basic Installation -================== - -Briefly, the shell commands `./configure; make; make install' should -configure, build, and install this package. The following -more-detailed instructions are generic; see the `README' file for -instructions specific to this package. - - The `configure' shell script attempts to guess correct values for -various system-dependent variables used during compilation. It uses -those values to create a `Makefile' in each directory of the package. -It may also create one or more `.h' files containing system-dependent -definitions. Finally, it creates a shell script `config.status' that -you can run in the future to recreate the current configuration, and a -file `config.log' containing compiler output (useful mainly for -debugging `configure'). - - It can also use an optional file (typically called `config.cache' -and enabled with `--cache-file=config.cache' or simply `-C') that saves -the results of its tests to speed up reconfiguring. Caching is -disabled by default to prevent problems with accidental use of stale -cache files. 
- - If you need to do unusual things to compile the package, please try -to figure out how `configure' could check whether to do them, and mail -diffs or instructions to the address given in the `README' so they can -be considered for the next release. If you are using the cache, and at -some point `config.cache' contains results you don't want to keep, you -may remove or edit it. - - The file `configure.ac' (or `configure.in') is used to create -`configure' by a program called `autoconf'. You need `configure.ac' if -you want to change it or regenerate `configure' using a newer version -of `autoconf'. - -The simplest way to compile this package is: - - 1. `cd' to the directory containing the package's source code and type - `./configure' to configure the package for your system. - - Running `configure' might take a while. While running, it prints - some messages telling which features it is checking for. - - 2. Type `make' to compile the package. - - 3. Optionally, type `make check' to run any self-tests that come with - the package. - - 4. Type `make install' to install the programs and any data files and - documentation. - - 5. You can remove the program binaries and object files from the - source code directory by typing `make clean'. To also remove the - files that `configure' created (so you can compile the package for - a different kind of computer), type `make distclean'. There is - also a `make maintainer-clean' target, but that is intended mainly - for the package's developers. If you use it, you may have to get - all sorts of other programs in order to regenerate files that came - with the distribution. - -Compilers and Options -===================== - -Some systems require unusual options for compilation or linking that the -`configure' script does not know about. Run `./configure --help' for -details on some of the pertinent environment variables. 
- - You can give `configure' initial values for configuration parameters -by setting variables in the command line or in the environment. Here -is an example: - - ./configure CC=c99 CFLAGS=-g LIBS=-lposix - - *Note Defining Variables::, for more details. - -Compiling For Multiple Architectures -==================================== - -You can compile the package for more than one kind of computer at the -same time, by placing the object files for each architecture in their -own directory. To do this, you can use GNU `make'. `cd' to the -directory where you want the object files and executables to go and run -the `configure' script. `configure' automatically checks for the -source code in the directory that `configure' is in and in `..'. - - With a non-GNU `make', it is safer to compile the package for one -architecture at a time in the source code directory. After you have -installed the package for one architecture, use `make distclean' before -reconfiguring for another architecture. - -Installation Names -================== - -By default, `make install' installs the package's commands under -`/usr/local/bin', include files under `/usr/local/include', etc. You -can specify an installation prefix other than `/usr/local' by giving -`configure' the option `--prefix=PREFIX'. - - You can specify separate installation prefixes for -architecture-specific files and architecture-independent files. If you -pass the option `--exec-prefix=PREFIX' to `configure', the package uses -PREFIX as the prefix for installing programs and libraries. -Documentation and other data files still use the regular prefix. - - In addition, if you use an unusual directory layout you can give -options like `--bindir=DIR' to specify different values for particular -kinds of files. Run `configure --help' for a list of the directories -you can set and what kinds of files go in them. 
- - If the package supports it, you can cause programs to be installed -with an extra prefix or suffix on their names by giving `configure' the -option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. - -Optional Features -================= - -Some packages pay attention to `--enable-FEATURE' options to -`configure', where FEATURE indicates an optional part of the package. -They may also pay attention to `--with-PACKAGE' options, where PACKAGE -is something like `gnu-as' or `x' (for the X Window System). The -`README' should mention any `--enable-' and `--with-' options that the -package recognizes. - - For packages that use the X Window System, `configure' can usually -find the X include and library files automatically, but if it doesn't, -you can use the `configure' options `--x-includes=DIR' and -`--x-libraries=DIR' to specify their locations. - -Specifying the System Type -========================== - -There may be some features `configure' cannot figure out automatically, -but needs to determine by the type of machine the package will run on. -Usually, assuming the package is built to be run on the _same_ -architectures, `configure' can figure that out, but if it prints a -message saying it cannot guess the machine type, give it the -`--build=TYPE' option. TYPE can either be a short name for the system -type, such as `sun4', or a canonical name which has the form: - - CPU-COMPANY-SYSTEM - -where SYSTEM can have one of these forms: - - OS KERNEL-OS - - See the file `config.sub' for the possible values of each field. If -`config.sub' isn't included in this package, then this package doesn't -need to know the machine type. - - If you are _building_ compiler tools for cross-compiling, you should -use the option `--target=TYPE' to select the type of system they will -produce code for. 
- - If you want to _use_ a cross compiler, that generates code for a -platform different from the build platform, you should specify the -"host" platform (i.e., that on which the generated programs will -eventually be run) with `--host=TYPE'. - -Sharing Defaults -================ - -If you want to set default values for `configure' scripts to share, you -can create a site shell script called `config.site' that gives default -values for variables like `CC', `cache_file', and `prefix'. -`configure' looks for `PREFIX/share/config.site' if it exists, then -`PREFIX/etc/config.site' if it exists. Or, you can set the -`CONFIG_SITE' environment variable to the location of the site script. -A warning: not all `configure' scripts look for a site script. - -Defining Variables -================== - -Variables not defined in a site shell script can be set in the -environment passed to `configure'. However, some packages may run -configure again during the build, and the customized values of these -variables may be lost. In order to avoid this problem, you should set -them in the `configure' command line, using `VAR=value'. For example: - - ./configure CC=/usr/local2/bin/gcc - -causes the specified `gcc' to be used as the C compiler (unless it is -overridden in the site shell script). - -Unfortunately, this technique does not work for `CONFIG_SHELL' due to -an Autoconf bug. Until the bug is fixed you can use this workaround: - - CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash - -`configure' Invocation -====================== - -`configure' recognizes the following options to control how it operates. - -`--help' -`-h' - Print a summary of the options to `configure', and exit. - -`--version' -`-V' - Print the version of Autoconf used to generate the `configure' - script, and exit. - -`--cache-file=FILE' - Enable the cache: use and save the results of the tests in FILE, - traditionally `config.cache'. FILE defaults to `/dev/null' to - disable caching. 
- -`--config-cache' -`-C' - Alias for `--cache-file=config.cache'. - -`--quiet' -`--silent' -`-q' - Do not print messages saying which checks are being made. To - suppress all normal output, redirect it to `/dev/null' (any error - messages will still be shown). - -`--srcdir=DIR' - Look for the package's source code in directory DIR. Usually - `configure' can determine that directory automatically. - -`configure' also accepts some other, not widely useful, options. Run -`configure --help' for more details. - diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/cmph_hash_map_test.cc index c70af58..a3e02f9 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/cmph_hash_map_test.cc @@ -12,20 +12,20 @@ int main(int argc, char** argv) { h.insert(std::make_pair("-1",-1)); cmph_hash_map::const_iterator it; for (it = h.begin(); it != h.end(); ++it) { - std::cout << it->first << " -> " << it->second << std::endl; + std::cerr << it->first << " -> " << it->second << std::endl; } - std::cout << "Search -1 gives " << h.find("-1")->second << std::endl; - for (int i = 0; i < 1000; ++i) { + std::cerr << "Search -1 gives " << h.find("-1")->second << std::endl; + for (int i = 0; i < 100; ++i) { char buf[10]; snprintf(buf, 10, "%d", i); h.insert(std::make_pair(buf, i)); } - for (int j = 0; j < 1000; ++j) { + for (int j = 0; j < 100; ++j) { for (int i = 1000; i > 0; --i) { char buf[10]; snprintf(buf, 10, "%d", i - 1); h.find(buf); - // std::cout << "Search " << i - 1 << " gives " << h.find(i - 1)->second << std::endl; + std::cerr << "Search " << i - 1 << " gives " << h.find(buf)->second << std::endl; } } } diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index ad8cc13..0a37799 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -24,7 +24,8 @@ class MPHTable { typedef StringPiece key_type; typedef RandomlySeededHashFunction hasher_type; - MPHTable(double c = 1.23, cmph_uint8 b = 7) : c_(c), b_(b) { } + MPHTable(double c = 1.23, cmph_uint8 b = 7) : + c_(c), b_(b), m_(0), n_(0), 
k_(0), r_(0) { } ~MPHTable() {} template diff --git a/cxxmph/mphtable_test.cc b/cxxmph/mphtable_test.cc index c444b49..a745718 100644 --- a/cxxmph/mphtable_test.cc +++ b/cxxmph/mphtable_test.cc @@ -1,3 +1,4 @@ +#include #include #include #include diff --git a/cxxmph/randomly_seeded_hash.h b/cxxmph/randomly_seeded_hash.h index 60ab32d..fa382dd 100644 --- a/cxxmph/randomly_seeded_hash.h +++ b/cxxmph/randomly_seeded_hash.h @@ -8,7 +8,6 @@ #include "../src/cmph_types.h" #include "MurmurHash2.h" -#include "jenkins_hash.h" #include "stringpiece.h" namespace cxxmph { @@ -16,25 +15,9 @@ namespace cxxmph { template struct RandomlySeededHashFunction { }; -class JenkinsStringPiece { }; class Murmur2StringPiece { }; class Murmur2Pod { }; -template <> -struct RandomlySeededHashFunction { - RandomlySeededHashFunction() { - srand(1); - seed = 4; - } - cmph_uint32 operator()(const StringPiece& key) const { - return jenkins_hash(key.data(), key.length(), seed); - } - void operator()(const StringPiece& key, cmph_uint32* hashes) const { - __jenkins_hash_vector(seed, key.data(), key.length(), hashes); - } - cmph_uint32 seed; -}; - template <> struct RandomlySeededHashFunction { RandomlySeededHashFunction() : seed(random()) { } From 8663285897d319e0bbc5ce7df04140093ddc04c4 Mon Sep 17 00:00:00 2001 From: davi Date: Fri, 5 Nov 2010 04:40:15 -0200 Subject: [PATCH 11/89] Better design for hash templates. 
--- cxxmph/Makefile.am | 3 +- cxxmph/cmph_hash_function.h | 77 +++++++++++++++++++++++++++++++++++ cxxmph/cmph_hash_map.h | 31 +++++++++++--- cxxmph/iterator_first.h | 31 ++++++++++++++ cxxmph/mphtable.cc | 26 +----------- cxxmph/mphtable.h | 64 +++++++++++++++++++---------- cxxmph/mphtable_test.cc | 4 +- cxxmph/randomly_seeded_hash.h | 2 +- cxxmph/stringpiece.h | 4 +- cxxmph/trigraph.h | 2 +- 10 files changed, 187 insertions(+), 57 deletions(-) create mode 100644 cxxmph/cmph_hash_function.h diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 10bd278..7566f00 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,7 +1,8 @@ bin_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test lib_LTLIBRARIES = libcxxmph.la +include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cmph_hash_function.h -libcxxmph_la_SOURCES = stringpiece.h MurmurHash2.h randomly_seeded_hash.h trigragh.h trigraph.cc mphtable.h mphtable.cc +libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cmph_hash_function.h libcxxmph_la_LDFLAGS = -version-info 0:0:0 cmph_hash_map_test_LDADD = libcxxmph.la diff --git a/cxxmph/cmph_hash_function.h b/cxxmph/cmph_hash_function.h new file mode 100644 index 0000000..900491d --- /dev/null +++ b/cxxmph/cmph_hash_function.h @@ -0,0 +1,77 @@ +#include +#include // for __gnu_cxx::hash + +#include "MurmurHash2.h" +#include "stringpiece.h" +#include "cmph_types.h" + +namespace cxxmph { + +template +struct seeded_hash_function { + template + cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { + return HashFcn()(k) ^ seed; + } +}; + +struct Murmur2 { + template + cmph_uint32 operator()(const Key& k) const { + return MurmurHash2(k, sizeof(Key), 1 /* seed */); + } +}; +struct Murmur2StringPiece { + template + cmph_uint32 operator()(const Key& k) const { + StringPiece s(k); + return MurmurHash2(k.data(), k.length(), 1 /* seed */); + } +}; + +template <> +struct seeded_hash_function { + template + 
cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { + return MurmurHash2(k, sizeof(Key), seed); + } +}; + +template <> +struct seeded_hash_function { + template + cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { + StringPiece s(k); + return MurmurHash2(k.data(), k.length(), seed); + } +}; + +template struct OptimizedSeededHashFunction +{ typedef seeded_hash_function hash_function; }; +// Use Murmur2 instead for all types defined in __gnu_cxx::hash, plus +// std::string which is commonly extended. +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; + +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +{ typedef seeded_hash_function hash_function; }; + +} // namespace cxxmph diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 12d39f1..a606c32 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -12,7 +12,12 @@ 
template <> struct hash { return MurmurHash2(s.c_str(), s.length(), 1 /* seed */); } }; -} +template <> struct hash { + std::size_t operator()(const long long int& s) const { + return MurmurHash2(reinterpret_cast(&s), sizeof(long long int), 1 /* seed */); + } +}; +} // namespace __gnu_cxx namespace cxxmph { @@ -63,11 +68,25 @@ class cmph_hash_map { void pack() { rehash(); } private: - void rehash(); - std::vector values_; - MPHTable table_; - typedef typename __gnu_cxx::hash_map slack_type; - slack_type slack_; + template + struct iterator_first : public iterator { + iterator_first(iterator it) : iterator(it) { } + const typename iterator::value_type::first_type& operator*() const { + return this->iterator::operator*().first; + } + }; + + template + iterator_first make_iterator_first(iterator it) { + return iterator_first(it); + } + + + void rehash(); + std::vector values_; + SimpleMPHTable::hash_function> table_; + typedef typename __gnu_cxx::hash_map slack_type; + slack_type slack_; }; CMPH_TMPL_SPEC diff --git a/cxxmph/iterator_first.h b/cxxmph/iterator_first.h index d8350af..1babb77 100644 --- a/cxxmph/iterator_first.h +++ b/cxxmph/iterator_first.h @@ -1,3 +1,7 @@ +#include "stringpiece.h" + +namespace cxxmph { + template struct iterator_first : public iterator { iterator_first(iterator it) : iterator(it) { } @@ -10,3 +14,30 @@ template iterator_first make_iterator_first(iterator it) { return iterator_first(it); } + +template class MakeStringPiece { + public: + StringPiece operator()(const value& v) { return StringPiece(reinterpret_cast(&v), sizeof(value)); } +}; +template <> class MakeStringPiece { + public: + StringPiece operator()(const std::string& v) { return StringPiece(v); } +}; +template <> class MakeStringPiece { + public: + StringPiece operator()(const char* v) { return StringPiece(v); } +}; + +template +struct iterator_stringpiece : public iterator { + iterator_stringpiece(iterator it) : iterator(it) { } + StringPiece operator*() const { + return 
MakeStringPiece()(this->iterator::operator*()); + } +}; +template +iterator_stringpiece make_iterator_stringpiece(iterator it) { + return iterator_stringpiece(it); +} + +} // namespace cxxmph diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index 0b899da..d3537a9 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -32,18 +32,12 @@ static cmph_uint8 kBdzLookupTable[] = 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 }; -static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; -void set_2bit_value(vector *d, cmph_uint8 i, cmph_uint8 v) { - (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; -} -cmph_uint32 get_2bit_value(const vector& d, cmph_uint8 i) { - return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; -} - } // anonymous namespace namespace cxxmph { +const cmph_uint8 MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; + void MPHTable::clear() { // TODO(davi) impolement me } @@ -166,18 +160,6 @@ void MPHTable::Ranking() { } } -cmph_uint32 MPHTable::Search(const key_type& key) const { - cmph_uint32 h[3]; - for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](key); - // hash_function_[0](key, h); - h[0] = h[0] % r_; - h[1] = h[1] % r_ + r_; - h[2] = h[2] % r_ + (r_ << 1); - cmph_uint32 vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3]; - cerr << "Search found vertex " << vertex << endl; - return Rank(vertex); -} - cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { cmph_uint32 index = vertex >> b_; cmph_uint32 base_rank = ranktable_[index]; @@ -202,8 +184,4 @@ cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { return base_rank; } -cmph_uint32 MPHTable::index(const key_type& key) const { - return Search(key); -} - } // namespace cxxmph diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index 0a37799..2a3786a 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -11,31 +11,26 @@ using std::cerr; using std::endl; -#include "randomly_seeded_hash.h" -#include "stringpiece.h" +#include 
"cmph_hash_function.h" #include "trigraph.h" namespace cxxmph { class MPHTable { public: - // This class could be a template for both key type and hash function, but we - // chose to go with simplicity. - typedef StringPiece key_type; - typedef RandomlySeededHashFunction hasher_type; - MPHTable(double c = 1.23, cmph_uint8 b = 7) : c_(c), b_(b), m_(0), n_(0), k_(0), r_(0) { } ~MPHTable() {} - template + template bool Reset(ForwardIterator begin, ForwardIterator end); - cmph_uint32 index(const key_type& x) const; + template // must agree with Reset + cmph_uint32 index(const Key& x) const; cmph_uint32 size() const { return m_; } void clear(); private: - template + template bool Mapping(ForwardIterator begin, ForwardIterator end, std::vector* edges, std::vector* queue); @@ -43,7 +38,6 @@ class MPHTable { void Assigning(const std::vector& edges, const std::vector& queue); void Ranking(); - cmph_uint32 Search(const key_type& key) const; cmph_uint32 Rank(cmph_uint32 vertex) const; // Algorithm parameters @@ -63,14 +57,23 @@ class MPHTable { std::vector g_; // The table used for the rank step of the minimal perfect hash function std::vector ranktable_; - // The selected hash function triplet for finding the edges in the minimal + // The selected hash seed triplet for finding the edges in the minimal // perfect hash function graph. - hasher_type hash_function_[3]; + cmph_uint32 hash_seed_[3]; + + static const cmph_uint8 valuemask[]; + static void set_2bit_value(std::vector *d, cmph_uint8 i, cmph_uint8 v) { + (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; + } + static cmph_uint32 get_2bit_value(const std::vector& d, cmph_uint8 i) { + return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; + } + }; // Template method needs to go in the header file. 
-template +template bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { m_ = end - begin; r_ = static_cast(ceil((c_*m_)/3)); @@ -85,10 +88,8 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { std::vector queue; while (1) { cerr << "Iterations missing: " << iterations << endl; - for (int i = 0; i < 3; ++i) hash_function_[i] = hasher_type(); - // hash_function_[0] = hasher_type(); - cerr << "Seed: " << hash_function_[0].seed << endl; - if (Mapping(begin, end, &edges, &queue)) break; + for (int i = 0; i < 3; ++i) hash_seed_[i] = random(); + if (Mapping(begin, end, &edges, &queue)) break; else --iterations; if (iterations == 0) break; } @@ -99,15 +100,14 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { return true; } -template +template bool MPHTable::Mapping( ForwardIterator begin, ForwardIterator end, std::vector* edges, std::vector* queue) { TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { cmph_uint32 h[3]; - for (int i = 0; i < 3; ++i) h[i] = hash_function_[i](*it); - // hash_function_[0](*it, h); + for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); cmph_uint32 v0 = h[0] % r_; cmph_uint32 v1 = h[1] % r_ + r_; cmph_uint32 v2 = h[2] % r_ + (r_ << 1); @@ -121,6 +121,28 @@ bool MPHTable::Mapping( return false; } +template +cmph_uint32 MPHTable::index(const Key& key) const { + cmph_uint32 h[3]; + for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); + h[0] = h[0] % r_; + h[1] = h[1] % r_ + r_; + h[2] = h[2] % r_ + (r_ << 1); + cmph_uint32 vertex = h[(get_2bit_value(g_, h[0]) + get_2bit_value(g_, h[1]) + get_2bit_value(g_, h[2])) % 3]; + cerr << "Search found vertex " << vertex << endl; + return Rank(vertex); +} + +template >::hash_function> +class SimpleMPHTable : public MPHTable { + public: + template + bool Reset(ForwardIterator begin, ForwardIterator end) { + return MPHTable::Reset(begin, end); + } + cmph_uint32 index(const Key& key) { return 
MPHTable::index(key); } +}; + } // namespace cxxmph #endif // __CXXMPH_MPHTABLE_H__ diff --git a/cxxmph/mphtable_test.cc b/cxxmph/mphtable_test.cc index a745718..eb6ed3f 100644 --- a/cxxmph/mphtable_test.cc +++ b/cxxmph/mphtable_test.cc @@ -7,7 +7,7 @@ using std::string; using std::vector; -using cxxmph::MPHTable; +using cxxmph::SimpleMPHTable; int main(int argc, char** argv) { @@ -23,7 +23,7 @@ int main(int argc, char** argv) { keys.push_back("diogo"); keys.push_back("algume"); - MPHTable mphtable; + SimpleMPHTable mphtable; assert(mphtable.Reset(keys.begin(), keys.end())); vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { diff --git a/cxxmph/randomly_seeded_hash.h b/cxxmph/randomly_seeded_hash.h index fa382dd..747bbf3 100644 --- a/cxxmph/randomly_seeded_hash.h +++ b/cxxmph/randomly_seeded_hash.h @@ -6,7 +6,7 @@ #include -#include "../src/cmph_types.h" +#include "cmph_types.h" #include "MurmurHash2.h" #include "stringpiece.h" diff --git a/cxxmph/stringpiece.h b/cxxmph/stringpiece.h index fdd8f75..4595dc7 100644 --- a/cxxmph/stringpiece.h +++ b/cxxmph/stringpiece.h @@ -172,6 +172,8 @@ inline bool operator>=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& } // allow StringPiece to be logged -extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece); +inline std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece) { + return operator<<(o, std::string(piece.data(), piece.size())); +} #endif // CXXMPH_STRINGPIECE_H__ diff --git a/cxxmph/trigraph.h b/cxxmph/trigraph.h index 18d8d98..7321d5a 100644 --- a/cxxmph/trigraph.h +++ b/cxxmph/trigraph.h @@ -9,7 +9,7 @@ #include -#include "../src/cmph_types.h" +#include "cmph_types.h" namespace cxxmph { From c09df518dc70f2b6d7350524f7174891ad888850 Mon Sep 17 00:00:00 2001 From: davi Date: Fri, 5 Nov 2010 04:48:53 -0200 Subject: [PATCH 12/89] make install works. 
--- cxxmph/Makefile.am | 4 ++-- cxxmph/cmph_hash_function.h | 4 ++-- cxxmph/cmph_hash_map.h | 1 - cxxmph/iterator_first.h | 43 ----------------------------------- cxxmph/randomly_seeded_hash.h | 42 ---------------------------------- cxxmph/stringpiece.h | 4 +--- 6 files changed, 5 insertions(+), 93 deletions(-) delete mode 100644 cxxmph/iterator_first.h delete mode 100644 cxxmph/randomly_seeded_hash.h diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 7566f00..4c8989a 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,8 +1,8 @@ bin_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test lib_LTLIBRARIES = libcxxmph.la -include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cmph_hash_function.h +include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cmph_hash_function.h stringpiece.h -libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cmph_hash_function.h +libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cmph_hash_function.h stringpiece.h libcxxmph_la_LDFLAGS = -version-info 0:0:0 cmph_hash_map_test_LDADD = libcxxmph.la diff --git a/cxxmph/cmph_hash_function.h b/cxxmph/cmph_hash_function.h index 900491d..e607e48 100644 --- a/cxxmph/cmph_hash_function.h +++ b/cxxmph/cmph_hash_function.h @@ -25,7 +25,7 @@ struct Murmur2StringPiece { template cmph_uint32 operator()(const Key& k) const { StringPiece s(k); - return MurmurHash2(k.data(), k.length(), 1 /* seed */); + return MurmurHash2(s.data(), s.length(), 1 /* seed */); } }; @@ -42,7 +42,7 @@ struct seeded_hash_function { template cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { StringPiece s(k); - return MurmurHash2(k.data(), k.length(), seed); + return MurmurHash2(s.data(), s.length(), seed); } }; diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index a606c32..51ddcef 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -4,7 +4,6 @@ #include "MurmurHash2.h" 
#include "mphtable.h" -#include "iterator_first.h" namespace __gnu_cxx { template <> struct hash { diff --git a/cxxmph/iterator_first.h b/cxxmph/iterator_first.h deleted file mode 100644 index 1babb77..0000000 --- a/cxxmph/iterator_first.h +++ /dev/null @@ -1,43 +0,0 @@ -#include "stringpiece.h" - -namespace cxxmph { - -template -struct iterator_first : public iterator { - iterator_first(iterator it) : iterator(it) { } - const typename iterator::value_type::first_type& operator*() const { - return this->iterator::operator*().first; - } -}; - -template -iterator_first make_iterator_first(iterator it) { - return iterator_first(it); -} - -template class MakeStringPiece { - public: - StringPiece operator()(const value& v) { return StringPiece(reinterpret_cast(&v), sizeof(value)); } -}; -template <> class MakeStringPiece { - public: - StringPiece operator()(const std::string& v) { return StringPiece(v); } -}; -template <> class MakeStringPiece { - public: - StringPiece operator()(const char* v) { return StringPiece(v); } -}; - -template -struct iterator_stringpiece : public iterator { - iterator_stringpiece(iterator it) : iterator(it) { } - StringPiece operator*() const { - return MakeStringPiece()(this->iterator::operator*()); - } -}; -template -iterator_stringpiece make_iterator_stringpiece(iterator it) { - return iterator_stringpiece(it); -} - -} // namespace cxxmph diff --git a/cxxmph/randomly_seeded_hash.h b/cxxmph/randomly_seeded_hash.h deleted file mode 100644 index 747bbf3..0000000 --- a/cxxmph/randomly_seeded_hash.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef __CXXMPH_RANDOMLY_SEEDED_HASH__ -#define __CXXMPH_RANDOMLY_SEEDED_HASH__ - -// Helper to create randomly seeded hash functions out of existing hash -// functions that take a seed as a parameter. 
- -#include - -#include "cmph_types.h" -#include "MurmurHash2.h" -#include "stringpiece.h" - -namespace cxxmph { - -template -struct RandomlySeededHashFunction { }; - -class Murmur2StringPiece { }; -class Murmur2Pod { }; - -template <> -struct RandomlySeededHashFunction { - RandomlySeededHashFunction() : seed(random()) { } - cmph_uint32 operator()(const StringPiece& key) const { - return MurmurHash2(key.data(), key.length(), seed); - } - cmph_uint32 seed; -}; - -template<> -struct RandomlySeededHashFunction { - RandomlySeededHashFunction() : seed(random()) { } - template - cmph_uint32 operator()(const Key& key) const { - return MurmurHash2(&key, sizeof(key), seed); - } - cmph_uint32 seed; -}; - -} // namespace cxxmph - -#endif // __CXXMPH_RANDOMLY_SEEDED_HASH__ diff --git a/cxxmph/stringpiece.h b/cxxmph/stringpiece.h index 4595dc7..fdd8f75 100644 --- a/cxxmph/stringpiece.h +++ b/cxxmph/stringpiece.h @@ -172,8 +172,6 @@ inline bool operator>=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& } // allow StringPiece to be logged -inline std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece) { - return operator<<(o, std::string(piece.data(), piece.size())); -} +extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece); #endif // CXXMPH_STRINGPIECE_H__ From 0c5f2301df5d1ee25b95e9da6f3c754a4d09e207 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Fri, 5 Nov 2010 21:46:53 -0200 Subject: [PATCH 13/89] Fixed compilation error and detected iterator problem. 
--- cxxmph/Makefile.am | 6 +++- cxxmph/cmph_hash_map.h | 24 ++++--------- cxxmph/cmph_hash_map_test.cc | 10 ++++++ cxxmph/cxxmph.cc | 68 ++++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 19 deletions(-) create mode 100644 cxxmph/cxxmph.cc diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 4c8989a..c3a0a2b 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,4 +1,5 @@ -bin_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test +noinst_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test +bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cmph_hash_function.h stringpiece.h @@ -13,3 +14,6 @@ mphtable_test_SOURCES = mphtable_test.cc trigraph_test_LDADD = libcxxmph.la trigraph_test_SOURCES = trigraph_test.cc + +cxxmph_LDADD = libcxxmph.la +cxxmph_SOURCES = cxxmph.cc diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 51ddcef..bebb6cd 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -5,27 +5,14 @@ #include "MurmurHash2.h" #include "mphtable.h" -namespace __gnu_cxx { -template <> struct hash { - std::size_t operator()(std::string const& s) const { - return MurmurHash2(s.c_str(), s.length(), 1 /* seed */); - } -}; -template <> struct hash { - std::size_t operator()(const long long int& s) const { - return MurmurHash2(reinterpret_cast(&s), sizeof(long long int), 1 /* seed */); - } -}; -} // namespace __gnu_cxx - namespace cxxmph { // Save on repetitive typing. 
-#define CMPH_TMPL_SPEC template +#define CMPH_TMPL_SPEC template #define CMPH_CLASS_SPEC cmph_hash_map #define CMPH_METHOD_DECL(r, m) CMPH_TMPL_SPEC typename CMPH_CLASS_SPEC::r CMPH_CLASS_SPEC::m -template , class EqualKey = std::equal_to, class Alloc = std::allocator > +template , class EqualKey = std::equal_to, class Alloc = std::allocator > class cmph_hash_map { public: typedef Key key_type; @@ -132,10 +119,10 @@ CMPH_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); } CMPH_METHOD_DECL(const_iterator, end)() const { return values_.end(); } CMPH_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } -CMPH_METHOD_DECL(void_type, clear)() { +CMPH_METHOD_DECL(void_type, clear)() { values_.clear(); slack_.clear(); - table_.clear(); + table_.clear(); } CMPH_METHOD_DECL(void_type, erase)(iterator pos) { @@ -163,6 +150,8 @@ CMPH_METHOD_DECL(const_iterator, find)(const key_type& k) const { CMPH_METHOD_DECL(iterator, find)(const key_type& k) { if (!slack_.empty()) { typename slack_type::const_iterator it = slack_.find(k); + // TODO(davi) this is broken, it->second should be an integer + // otherwise I cannot access values_ iterators. 
if (it != slack_.end()) return values_.begin() + it->second; } if (table_.size() == 0) return end(); @@ -172,7 +161,6 @@ CMPH_METHOD_DECL(iterator, find)(const key_type& k) { } return end(); } - CMPH_METHOD_DECL(data_type&, operator[])(const key_type& k) { return insert(std::make_pair(k, data_type())).first->second; diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/cmph_hash_map_test.cc index a3e02f9..6f9eeab 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/cmph_hash_map_test.cc @@ -1,5 +1,6 @@ #include "cmph_hash_map.h" +#include #include #include #include @@ -28,4 +29,13 @@ int main(int argc, char** argv) { std::cerr << "Search " << i - 1 << " gives " << h.find(buf)->second << std::endl; } } + for (int j = 0; j < 100; ++j) { + for (int i = 1000; i > 0; --i) { + char buf[10]; + snprintf(buf, 10, "%d", i*100 - 1); + h.find(buf); + std::cerr << "Search " << i*100 - 1 << " gives " << h.find(buf)->second << std::endl; + } + } + } diff --git a/cxxmph/cxxmph.cc b/cxxmph/cxxmph.cc new file mode 100644 index 0000000..9b93450 --- /dev/null +++ b/cxxmph/cxxmph.cc @@ -0,0 +1,68 @@ +// Copyright 2010 Google Inc. All Rights Reserved. 
+// Author: davi@google.com (Davi Reis) + +#include + +#include +#include +#include +#include + +#include "cmph_hash_map.h" +#include "config.h" + +using std::cerr; +using std::cout; +using std::endl; +using std::getline; +using std::ifstream; +using std::string; +using std::vector; + +using cxxmph::cmph_hash_map; + +void usage(const char* prg) { + cerr << "usage: " << prg << "[-v] [-h] [-V]" << endl; +} +void usage_long(const char* prg) { + usage(prg); + cerr << " -h\t print this help message" << endl; + cerr << " -V\t print version number and exit" << endl; + cerr << " -v\t increase verbosity (may be used multiple times)" << endl; +} + +int main(int argc, char** argv) { + + int verbosity = 0; + while (1) { + char ch = (char)getopt(argc, argv, "hv"); + if (ch == -1) break; + switch (ch) { + case 'h': + usage_long(argv[0]); + return 0; + case 'V': + std::cout << VERSION << std::endl; + return 0; + case 'v': + ++verbosity; + break; + } + } + if (optind != argc - 1) { + usage(argv[0]); + return 1; + } + vector keys; + ifstream f(argv[optind]); + string buffer; + while (!getline(f, buffer).eof()) keys.push_back(buffer); + cmph_hash_map table; + for (int i = 0; i < keys.size(); ++i) table[keys[i].c_str()] = keys[i]; + cmph_hash_map::const_iterator it = table.begin(); + cmph_hash_map::const_iterator end = table.end(); + for (; it != end; ++it) { + cout << (it - table.begin()) << ": " << it->first + <<" -> " << it->second << endl; + } +} From 7c9c6c518d46fb409b9aab30e9af273bc1a4ff73 Mon Sep 17 00:00:00 2001 From: davi Date: Sat, 6 Nov 2010 09:14:07 -0200 Subject: [PATCH 14/89] Thinking slack. 
--- cxxmph/cmph_hash_map.h | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index bebb6cd..2aba8ce 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -67,11 +67,20 @@ class cmph_hash_map { return iterator_first(it); } + struct slack_hashfnc { + size_t operator()(const const_iterator& it) const { return HashFcn()(it->first); } + }; + struct slack_equalkey { + bool operator()(const const_iterator& lhs, const const_iterator& rhs) { + return EqualKey()(lhs->first, rhs->first); + } + }; + void rehash(); std::vector values_; SimpleMPHTable::hash_function> table_; - typedef typename __gnu_cxx::hash_map slack_type; + typedef typename __gnu_cxx::hash_set slack_type; slack_type slack_; }; @@ -91,7 +100,7 @@ CMPH_METHOD_DECL(insert_return_type, insert)(const value_type& x) { iterator it = find(x.first); if (it != end()) return std::make_pair(it, false); values_.push_back(x); - slack_.insert(std::make_pair(x.first, values_.size() - 1)); + slack_.insert(values_.end() - 1); if ((slack_.size() > 10 && table_.size() == 0) || (table_.size() && slack_.size() > table_.size() * 2)) { rehash(); @@ -106,9 +115,10 @@ CMPH_METHOD_DECL(void_type, rehash)() { table_.Reset(make_iterator_first(values_.begin()), make_iterator_first(values_.end())); std::vector new_values(values_.size()); - for (unsigned int i = 0; i < values_.size(); ++i) { - size_type id = table_.index(values_[i].first); - new_values[id] = values_[i]; + for (const_iterator it = values_.begin(), end = values_.end(); + it != end; ++it) { + size_type id = table_.index(it->first); + new_values[id] = *it; } values_.swap(new_values); } @@ -137,8 +147,10 @@ CMPH_METHOD_DECL(void_type, erase)(const key_type& k) { CMPH_METHOD_DECL(const_iterator, find)(const key_type& k) const { if (!slack_.empty()) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; + 
iterator slack_key; + slack_key.first = k; + typename slack_type::const_iterator it = slack_.find(slack_key); + if (it != slack_.end()) return *it; } if (table_.size() == 0) return end(); size_type id = table_.index(k); From cde9f72c9e25416ac148d44a1a91f822c56b436e Mon Sep 17 00:00:00 2001 From: davi Date: Mon, 8 Nov 2010 18:19:44 -0200 Subject: [PATCH 15/89] Repro failure. --- cxxmph/URLS1k | 256 +++++++++++++++++++++++++++++++++++ cxxmph/cmph_hash_function.h | 32 +++-- cxxmph/cmph_hash_map.h | 38 +++--- cxxmph/cmph_hash_map_test.cc | 7 + cxxmph/cxxmph.cc | 10 +- cxxmph/mphtable.h | 3 +- 6 files changed, 305 insertions(+), 41 deletions(-) create mode 100644 cxxmph/URLS1k diff --git a/cxxmph/URLS1k b/cxxmph/URLS1k new file mode 100644 index 0000000..a7fa160 --- /dev/null +++ b/cxxmph/URLS1k @@ -0,0 +1,256 @@ +http://100_fundos.zip.net/arch2004-03-28_2004-04-03.html +http://2d-galois.flogbrasil.terra.com.br/robots.txt +http://2littledoves.fotolog.terra.com.br/ +http://3336.fotoblog.uol.com.br/photo20040305160808.html +http://3reis.weblogger.terra.com.br/robots.txt +http://4track.blogger.com.br/robots.txt +http://abelrezende.flogbrasil.terra.com.br/tops.php +http://abelsidney.vilabol.uol.com.br/cro91.html +http://abelsidney.vilabol.uol.com.br/rede8.html +http://abobaninha.weblogger.terra.com.br/200302_abobaninha_arquivo.htm +http://abpblh.org.br/ +http://abvo.org.br/up.htm +http://acervocorrs.vilabol.uol.com.br/main071.html +http://acid.weblogger.terra.com.br/2883612 +http://actionblog.zip.net/robots.txt +http://ademir.alfa.ind.br/entrada.htm +http://adrianorp.zip.net/listArchive.html +http://adriele.flogbrasil.terra.com.br/tops.php +http://aflordapele.blogger.com.br/2004_04_01_archive.html +http://afoganao.flogbrasil.terra.com.br/estados.php +http://aftertonight.blogger.com.br/robots.txt +http://agoraevidareal.blogger.com.br/2004_05_01_archive.html +http://ahafotolog.fotoblog.uol.com.br/photo20040327172209.html 
+http://ahmuleke.fotoblog.uol.com.br/photo20040420143013.html +http://aicomomexe.weblogger.terra.com.br/200401_aicomomexe_arquivo.htm +http://aiehdose.blig.ig.com.br/ +http://aimeudedo.zip.net/arch2004-05-09_2004-05-15.html +http://airtonfilho.pcc.usp.br/curriculum.htm +http://akane_hoshi.blogger.com.br/ +http://akinyele.letras.terra.com.br/ +http://al-green.letras.terra.com.br/ +http://albaligia.fotoblog.uol.com.br/photo20040501225132.html +http://alemaosalsicha.flogbrasil.terra.com.br/robots.txt +http://alessandrafleury.blogger.com.br/ +http://aliatras.weblogger.terra.com.br/www.marisurf.weblogger.com.br +http://alissondantas.flogbrasil.terra.com.br/ +http://allcolix.flogbrasil.terra.com.br/estados.php +http://allmylove.fotolog.terra.com.br/ +http://allyouneedisnats.flogbrasil.terra.com.br/gold_comprar.php +http://almostnaked.weblogger.terra.com.br/4674105 +http://alwaystogether.weblogger.terra.com.br/200310_alwaystogether_arquivo.htm +http://amanda-sc.fotoblog.uol.com.br/photo20040413082243.html +http://amiga-kelly.flogbrasil.terra.com.br/robots.txt +http://amigasbh.fotolog.terra.com.br/gold_comprar.php +http://amigooo.weblogger.terra.com.br/14534470 +http://amiruhama.vilabol.uol.com.br/celina_tuesmeudeus.html +http://amotheoc.fotolog.terra.com.br/tops_meninos.php +http://ana.fla.fotoblog.uol.com.br/photo20040512080652.html +http://anala.fotolog.terra.com.br/tops.php +http://anasps-sp.sites.uol.com.br/birth5.htm +http://anastm.fotolog.terra.com.br/tops_meninos.php +http://andandonasnuvens.weblogger.terra.com.br/3233568 +http://andercb.fotolog.terra.com.br/tops_meninas.php +http://andre.bac.sites.uol.com.br/equipamentos.htm +http://andrezadepp.blig.ig.com.br/ +http://andrezavega.vilabol.uol.com.br/amigo.html +http://aneeee.flogbrasil.terra.com.br/tops.php +http://angelsusy.flogbrasil.terra.com.br/robots.txt +http://ani2.weblogger.terra.com.br/ +http://aninha_space_for_me.zip.net/robots.txt +http://aninhacerqueira.flogbrasil.terra.com.br/gold.php 
+http://aninhafuracao.weblogger.terra.com.br/200310_aninhafuracao_arquivo.htm +http://aninhaumverdadeiroamor.weblogger.com.br/ +http://anjosdanados.flogbrasil.terra.com.br/tops.php +http://anjosklb.blogger.com.br/ +http://annafernandes.flogbrasil.terra.com.br/gold.php +http://annarafaella.fotolog.terra.com.br/ +http://anninha25.weblogger.terra.com.br/200307_anninha25_arquivo.htm +http://anninhaturbo2ponto4.flogbrasil.terra.com.br/ +http://anonimoincognito.flogbrasil.terra.com.br/tops_estados.php +http://answerbook.ime.usp.br:8888/ab2 +http://antonio-bz.flogbrasil.terra.com.br/tops_estados.php +http://apae.weblogger.terra.com.br/9718679 +http://aparelho-de-dvd-pioneer-dvr-57h.ofertas-comprar-vender.com.br/robots.txt +http://apoio.weblogger.terra.com.br/200404_apoio_arquivo.htm +http://arabella.bella.blog.uol.com.br/arch2004-04-11_2004-04-17.html +http://art-popular.cifras.art.br/cifra_16277.html +http://artcanal.com.br/q.html +http://artemiro.fotolog.terra.com.br/ +http://aruba.com.br/ +http://aslilis.flogbrasil.terra.com.br/tops_meninas.php +http://asmaluketxxx.blig.ig.com.br/robots.txt +http://asprincesinhas.weblogger.terra.com.br/robots.txt +http://assespro.org.br/fotosrs.asp +http://asveredas.com.br/gal_amigos005.html +http://atrevidamqn.fotolog.terra.com.br/tops.php +http://avitchas.blogger.com.br/ +http://ayegui-estella.spain.ehotelfinder.net/ +http://b0caum.flogbrasil.terra.com.br/tops_estados.php +http://babasidera.fotolog.terra.com.br/tops_meninos.php +http://babebibobu.blogger.com.br/robots.txt +http://babifiles.zip.net/arch2004-04-04_2004-04-10.html +http://babisenena.blogger.com.br/2004_03_14_archive.html +http://babixmaisa.flogbrasil.terra.com.br/tops_meninas.php +http://bacardigirls.flogbrasil.terra.com.br/tops.php +http://backstreet-boys.weblogger.terra.com.br/5478915 +http://backstreet.hpg.com.br/midis.html +http://bagagera.flogbrasil.terra.com.br/gold.php +http://balila.flogbrasil.terra.com.br/gold.php 
+http://balletclassico.fotolog.terra.com.br/tops.php +http://bambambm.fotolog.terra.com.br/gold.php +http://bandamusicalbox.vilabol.uol.com.br/MUSICALBOX-Marcelo.htm +http://barbaridades.weblogger.terra.com.br/2353538 +http://barbyssa.fotolog.terra.com.br/ +http://bartsimpson-sp.fotoblog.uol.com.br/links.html +http://batatadoce.com.br/recomendamos +http://bazinha21.fotolog.terra.com.br/ +http://bazinhamartins.fotolog.terra.com.br/tops_meninas.php +http://beavera.fotolog.terra.com.br/tops.php +http://bebecrazy.flogbrasil.terra.com.br/tops.php +http://beberzao.flogbrasil.terra.com.br/tops.php +http://becaelilo.flogbrasil.terra.com.br/tops_adulto.php +http://beck.cifras.art.br/cifra_516.html +http://bego-90.flogbrasil.terra.com.br/gold.php +http://belacapelinha.cantaminas.com.br/ +http://belluchesi.weblogger.terra.com.br/200403_belluchesi_arquivo.htm +http://belo.letras.terra.com.br/ +http://belthatha.flogbrasil.terra.com.br/tops.php +http://bemestar.weblogger.terra.com.br/200302_bemestar_arquivo.htm +http://bessa.flogbrasil.terra.com.br/tops_meninos.php +http://bestoes.flogbrasil.terra.com.br/gold.php +http://betooow.flogbrasil.terra.com.br/tops.php +http://betoosurf.flogbrasil.terra.com.br/contato.php +http://bfr.fotolog.terra.com.br/robots.txt +http://bialoka.blig.ig.com.br/robots.txt +http://biazinha-labruna.flogbrasil.terra.com.br/tops_estados.php +http://bibizinha_lindinha.zip.net/arch2004-03-07_2004-03-13.html +http://billiefabruz.blogger.com.br/robots.txt +http://bjokera.flogbrasil.terra.com.br/ +http://bjork.com.br/indexw.htm +http://bl3.com.br/windreg03_gzero.htm +http://blaze.cifras.art.br/cifra_14186.html +http://bleach.ofertas-comprar-vender.com.br/ +http://blog-da-pri.weblogger.terra.com.br/200402_blog-da-pri_arquivo.htm +http://blogdababi.zip.net/arch2004-04-04_2004-04-10.html +http://blogdadee.weblogger.terra.com.br/200311_blogdadee_arquivo.htm +http://blogdaeve.weblogger.terra.com.br/200405_blogdaeve_arquivo.htm 
+http://blogdalidi.weblogger.terra.com.br/200310_blogdalidi_arquivo.htm +http://blogdocaralho.weblogger.terra.com.br/12647435 +http://blogdosfalcatruas.blig.com.br/robots.txt +http://bloglife.blogger.com.br/robots.txt +http://blogsdanoite.blogger.com.br/2004_04_01_archive.html +http://bolados.weblogger.com.br/robots.txt +http://bomarley.vilabol.uol.com.br/XO.html +http://botlily.fotolog.terra.com.br/tops.php +http://boys_lie.weblogger.terra.com.br/200405_boys_lie_arquivo.htm +http://brabuletatas.turmadobar.com.br/ +http://brainwave.weblogger.terra.com.br/200403_brainwave_arquivo.htm +http://brasileiromestizo.blogger.com.br/2003_07_01_archive.html +http://brazzie.zip.net/arch2002-11-01_2002-11-15.html +http://bribokinha.fotolog.terra.com.br/ +http://bricinhu.fotolog.terra.com.br/tops_meninos.php +http://brink.fotolog.terra.com.br/tops.php +http://brox.flogbrasil.terra.com.br/tops_meninas.php +http://brozmania.blogger.com.br/robots.txt +http://bruninhabebe.fotolog.terra.com.br/tops_adulto.php +http://brustamato.flogbrasil.terra.com.br/robots.txt +http://bruuuhh.fotolog.terra.com.br/tops_meninos.php +http://bruxinhafofinha.fotolog.terra.com.br/ +http://bruxinhawiccana.flogbrasil.terra.com.br/tops_meninos.php +http://bubuzinhu.flogbrasil.terra.com.br/tops_meninos.php +http://bud.weblogger.terra.com.br/20021124_bud_arquivo.htm +http://bully.sites.uol.com.br/"+Arrws[a]+" +http://bulmarta.fotolog.terra.com.br/tops.php +http://bunitu.fotolog.terra.com.br/robots.txt +http://ca-brevi.flogbrasil.terra.com.br/tops_meninos.php +http://cabanasarua.com.br/foto_visitante_44.htm +http://cacabolhao.fotoblog.uol.com.br/photo20040423114747.html +http://cacahzinhaaa.flogbrasil.terra.com.br/estados.php +http://cadastro.brfree.com.br/ +http://cadelasemacao.vila.bol.com.br/ +http://cadenzza.sites.uol.com.br/george.htm +http://caen.france.qwhotels.com/fullindex.phtml +http://cahieursdupositif.weblogger.terra.com.br/8772980 +http://caio.munhoz.fotoblog.uol.com.br/photo20040321172822.html 
+http://caleo.flogbrasil.terra.com.br/tops_meninas.php +http://camera-c-730.ofertas-comprar-vender.com.br/robots.txt +http://camera-digital-aiptek-dvii.ofertas-comprar-vender.com.br/robots.txt +http://camera-digital-c-700-ultra-zoom.ofertas-comprar-vender.com.br/robots.txt +http://camera-digital-pocket-cam-classic.ofertas-comprar-vender.com.br/robots.txt +http://camera-hp-photosmart-620.ofertas-comprar-vender.com.br/robots.txt +http://camera-pentax-ist-d-slr.ofertas-comprar-vender.com.br/robots.txt +http://cameras-d510.ofertas-comprar-vender.com.br/robots.txt +http://cameras-digitais-sony-fd-91.ofertas-comprar-vender.com.br/robots.txt +http://cameras-kodak-dx-3600.ofertas-comprar-vender.com.br/robots.txt +http://camilacba.fotolog.terra.com.br/ +http://camilafelicia.flogbrasil.terra.com.br/gold.php +http://camilajuliana.fotoblog.uol.com.br/photo20040504113702.html +http://camisa-do-barbarense.ofertas-comprar-vender.com.br/ +http://campospalomino.fotoblog.uol.com.br/photo20040507120841.html +http://canal-pira.vila.bol.com.br/flagrantes.htm +http://canga.fotolog.terra.com.br/robots.txt +http://cantinhodoale.flogbrasil.terra.com.br/tops_meninas.php +http://caosfilosoficos.blogger.com.br/2004_02_01_archive.html +http://carlinhus.flogbrasil.terra.com.br/tops_adulto.php +http://carol-ferrari.fotoblog.uol.com.br/photo20040429144714.html +http://carol-ju.fotolog.terra.com.br/tops.php +http://carol-meiguinha.fotolog.terra.com.br/ +http://carolapezzatto.flogbrasil.terra.com.br/tops_meninos.php +http://carolzinhaas.flogbrasil.terra.com.br/robots.txt +http://carolzinhabacaninha.flogbrasil.terra.com.br/ +http://carolzita.logme.ig.com.br/robots.txt +http://caronte.fapergs.tche.br/res1001.htm +http://casadananda.weblogger.terra.com.br/200202_casadananda_arquivo.htm +http://cassia-eller.cifras.art.br/cifra_4838.html +http://catarinavianna.fotolog.terra.com.br/robots.txt +http://catguy.fotoblog.uol.com.br/photo20040322115139.html +http://catite.fotolog.terra.com.br/tops.php 
+http://catolicos.weblogger.com.br/robots.txt +http://caxias.weblogger.com.br/ +http://ccramalho.flogbrasil.terra.com.br/ +http://ccs.pontagrossa.pr.gov.br/ +http://cd-de-garbage-collection.ofertas-comprar-vender.com.br/ +http://cd-de-oficial-u2.ofertas-comprar-vender.com.br/ +http://cd-do-iron-maiden-and-and-fear-of-the-dark.ofertas-comprar-vender.com.br/robots.txt +http://cd-do-u2-one.ofertas-comprar-vender.com.br/robots.txt +http://cd-player-deh-p7500mp.ofertas-comprar-vender.com.br/ +http://cd-player-para-carro-fh-p4400.ofertas-comprar-vender.com.br/ +http://cd-player-sony-cdxc710.ofertas-comprar-vender.com.br/ +http://cedaspy.com.br/index.php +http://celina.fotolog.terra.com.br/tops_meninas.php +http://celulares-ciemems-cl50.ofertas-comprar-vender.com.br/ +http://celulares-fisio-820.ofertas-comprar-vender.com.br/ +http://celulares-sendo-j530.ofertas-comprar-vender.com.br/ +http://celulares-tel-me--2.ofertas-comprar-vender.com.br/robots.txt +http://cha_angel.weblogger.terra.com.br/ +http://chacota.flogbrasil.terra.com.br/tops_meninos.php +http://chakilla.zip.net/arch2004-04-11_2004-04-17.html +http://chalupadegilliatt.weblogger.terra.com.br/robots.txt +http://charlie-brown-jr.cifras.art.br/cifra_738.html +http://chegadesapo.blogger.com.br/ +http://chgiacomini.fotoblog.uol.com.br/links.html +http://chicolustosa.zip.net/arch2004-05-01_2004-05-15.html +http://chinfra.blogger.com.br/2003_08_17_archive.html +http://cholke.blig.ig.com.br/robots.txt +http://cicinhaa.flogbrasil.terra.com.br/contato.php +http://cidoloko.flogbrasil.terra.com.br/tops.php +http://cifranet.org/robots.txt +http://cigabruca.fotoblog.uol.com.br/photo20040514173549.html +http://cirilobresolin.weblogger.terra.com.br/16237735 +http://cissadantas.flogbrasil.terra.com.br/contato.php +http://cla_loka.zip.net/arch2004-03-01_2004-03-15.html +http://clanogueira.fotoblog.uol.com.br/photo20040420181756.html +http://clari.viana.fotoblog.uol.com.br/photo20040413170749.html 
+http://clarinhakk.flogbrasil.terra.com.br/gold_comprar.php +http://claudiajabour8.fotoblog.uol.com.br/photo20040411144511.html +http://clicker.flogbrasil.terra.com.br/gold.php +http://clodoaldopb.vila.bol.com.br/historiasovasorachado.html +http://clubedopairico.com.br/diario_05_11-0404.html +http://cmurca.fotoblog.uol.com.br/photo20040413184113.html +http://cobralillo.weblogger.terra.com.br/200404_cobralillo_arquivo.htm +http://coisasdoleo.weblogger.terra.com.br/20030330_coisasdoleo_arquivo.htm +http://coise-saint-jean-pied-gauthier.france.ehotelfinder.net/ +http://colombia.flogbrasil.terra.com.br/tops.php +http://comcharisma.weblogger.com.br/ +http://comediante.flogbrasil.terra.com.br/tops_meninas.php diff --git a/cxxmph/cmph_hash_function.h b/cxxmph/cmph_hash_function.h index e607e48..933d729 100644 --- a/cxxmph/cmph_hash_function.h +++ b/cxxmph/cmph_hash_function.h @@ -1,5 +1,5 @@ #include -#include // for __gnu_cxx::hash +#include // for std::hash #include "MurmurHash2.h" #include "stringpiece.h" @@ -33,7 +33,7 @@ template <> struct seeded_hash_function { template cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { - return MurmurHash2(k, sizeof(Key), seed); + return MurmurHash2(reinterpret_cast(&k), sizeof(Key), seed); } }; @@ -48,30 +48,34 @@ struct seeded_hash_function { template struct OptimizedSeededHashFunction { typedef seeded_hash_function hash_function; }; -// Use Murmur2 instead for all types defined in __gnu_cxx::hash, plus +// Use Murmur2 instead for all types defined in std::hash, plus // std::string which is commonly extended. 
-template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction<__gnu_cxx::hash > +template <> struct OptimizedSeededHashFunction > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction > +{ typedef seeded_hash_function hash_function; }; +template <> struct OptimizedSeededHashFunction > { typedef seeded_hash_function 
hash_function; }; } // namespace cxxmph diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 2aba8ce..871d4b1 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -1,4 +1,5 @@ -#include +#include +#include #include #include // for std::pair @@ -12,7 +13,7 @@ namespace cxxmph { #define CMPH_CLASS_SPEC cmph_hash_map #define CMPH_METHOD_DECL(r, m) CMPH_TMPL_SPEC typename CMPH_CLASS_SPEC::r CMPH_CLASS_SPEC::m -template , class EqualKey = std::equal_to, class Alloc = std::allocator > +template , class EqualKey = std::equal_to, class Alloc = std::allocator > class cmph_hash_map { public: typedef Key key_type; @@ -67,20 +68,11 @@ class cmph_hash_map { return iterator_first(it); } - struct slack_hashfnc { - size_t operator()(const const_iterator& it) const { return HashFcn()(it->first); } - }; - struct slack_equalkey { - bool operator()(const const_iterator& lhs, const const_iterator& rhs) { - return EqualKey()(lhs->first, rhs->first); - } - }; - - void rehash(); std::vector values_; SimpleMPHTable::hash_function> table_; - typedef typename __gnu_cxx::hash_set slack_type; + // TODO(davi) optimize slack to no hold a copy of the key + typedef typename std::unordered_map slack_type; slack_type slack_; }; @@ -100,9 +92,11 @@ CMPH_METHOD_DECL(insert_return_type, insert)(const value_type& x) { iterator it = find(x.first); if (it != end()) return std::make_pair(it, false); values_.push_back(x); - slack_.insert(values_.end() - 1); - if ((slack_.size() > 10 && table_.size() == 0) || - (table_.size() && slack_.size() > table_.size() * 2)) { + slack_.insert(std::make_pair(x.first, values_.size() - 1)); + if (slack_.size() == table_.size() || + (slack_.size() >= 256 && table_.size() == 0)) { + // TODO(davi) debug only, remove afterwards + std::sort(values_.begin(), values_.end()); rehash(); } it = find(x.first); @@ -111,6 +105,10 @@ CMPH_METHOD_DECL(insert_return_type, insert)(const value_type& x) { CMPH_METHOD_DECL(void_type, rehash)() { if 
(values_.empty()) return; + std::cerr << "Calling Reset with " + << table_.size() << " keys in table " + << slack_.size() << " keys in slack " + << values_.size() << " key in total" << std::endl; slack_type().swap(slack_); table_.Reset(make_iterator_first(values_.begin()), make_iterator_first(values_.end())); @@ -147,10 +145,8 @@ CMPH_METHOD_DECL(void_type, erase)(const key_type& k) { CMPH_METHOD_DECL(const_iterator, find)(const key_type& k) const { if (!slack_.empty()) { - iterator slack_key; - slack_key.first = k; - typename slack_type::const_iterator it = slack_.find(slack_key); - if (it != slack_.end()) return *it; + typename slack_type::const_iterator it = slack_.find(k); + if (it != slack_.end()) return values_.begin() + it->second; } if (table_.size() == 0) return end(); size_type id = table_.index(k); @@ -162,8 +158,6 @@ CMPH_METHOD_DECL(const_iterator, find)(const key_type& k) const { CMPH_METHOD_DECL(iterator, find)(const key_type& k) { if (!slack_.empty()) { typename slack_type::const_iterator it = slack_.find(k); - // TODO(davi) this is broken, it->second should be an integer - // otherwise I cannot access values_ iterators. 
if (it != slack_.end()) return values_.begin() + it->second; } if (table_.size() == 0) return end(); diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/cmph_hash_map_test.cc index 6f9eeab..a75e0cd 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/cmph_hash_map_test.cc @@ -5,10 +5,16 @@ #include #include +using std::make_pair; using std::string; using cxxmph::cmph_hash_map; int main(int argc, char** argv) { + cmph_hash_map b; + for (int i = 0; i < 2*500; ++i) { + b.insert(make_pair(i, i)); + } + /* cmph_hash_map h; h.insert(std::make_pair("-1",-1)); cmph_hash_map::const_iterator it; @@ -37,5 +43,6 @@ int main(int argc, char** argv) { std::cerr << "Search " << i*100 - 1 << " gives " << h.find(buf)->second << std::endl; } } + */ } diff --git a/cxxmph/cxxmph.cc b/cxxmph/cxxmph.cc index 9b93450..623ecc4 100644 --- a/cxxmph/cxxmph.cc +++ b/cxxmph/cxxmph.cc @@ -57,10 +57,12 @@ int main(int argc, char** argv) { ifstream f(argv[optind]); string buffer; while (!getline(f, buffer).eof()) keys.push_back(buffer); - cmph_hash_map table; - for (int i = 0; i < keys.size(); ++i) table[keys[i].c_str()] = keys[i]; - cmph_hash_map::const_iterator it = table.begin(); - cmph_hash_map::const_iterator end = table.end(); + for (int i = 0; i < keys.size(); ++i) string s = keys[i]; + cmph_hash_map table; + + for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i]; + cmph_hash_map::const_iterator it = table.begin(); + cmph_hash_map::const_iterator end = table.end(); for (; it != end; ++it) { cout << (it - table.begin()) << ": " << it->first <<" -> " << it->second << endl; diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index 2a3786a..ce2517e 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -4,6 +4,7 @@ // Minimal perfect hash abstraction implementing the BDZ algorithm #include +#include // for std::hash #include #include @@ -133,7 +134,7 @@ cmph_uint32 MPHTable::index(const Key& key) const { return Rank(vertex); } -template >::hash_function> +template >::hash_function> 
class SimpleMPHTable : public MPHTable { public: template From 676d34073c01617f1ffb1d2c0094b7ea9baddee1 Mon Sep 17 00:00:00 2001 From: davi Date: Mon, 8 Nov 2010 22:02:18 -0200 Subject: [PATCH 16/89] Fixed first_edge initialization bug. --- cxxmph/cmph_hash_map.h | 6 ++- cxxmph/mphtable.cc | 1 + cxxmph/mphtable.h | 6 +++ cxxmph/trigraph.cc | 14 +++++- cxxmph/trigraph.h | 1 + src/Makefile.am | 2 +- src/bdz.c | 1 + src/jenkins_hash.c | 110 +++++++++++++++++++++-------------------- 8 files changed, 84 insertions(+), 57 deletions(-) diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 871d4b1..12e98a0 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -110,8 +110,10 @@ CMPH_METHOD_DECL(void_type, rehash)() { << slack_.size() << " keys in slack " << values_.size() << " key in total" << std::endl; slack_type().swap(slack_); - table_.Reset(make_iterator_first(values_.begin()), - make_iterator_first(values_.end())); + bool success = table_.Reset( + make_iterator_first(values_.begin()), + make_iterator_first(values_.end())); + assert(success); std::vector new_values(values_.size()); for (const_iterator it = values_.begin(), end = values_.end(); it != end; ++it) { diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index d3537a9..669df06 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -69,6 +69,7 @@ bool MPHTable::GenerateQueue( // At this point queue head is the number of edges touching at least one // vertex of degree 1. 
cerr << "Queue head " << queue_head << " Queue tail " << queue_tail << endl; + graph->DebugGraph(); while (queue_tail != queue_head) { cmph_uint32 current_edge = queue[queue_tail++]; graph->RemoveEdge(current_edge); diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index ce2517e..46726b6 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -3,6 +3,7 @@ // Minimal perfect hash abstraction implementing the BDZ algorithm +#include #include #include // for std::hash #include @@ -129,6 +130,11 @@ cmph_uint32 MPHTable::index(const Key& key) const { h[0] = h[0] % r_; h[1] = h[1] % r_ + r_; h[2] = h[2] % r_ + (r_ << 1); + assert(g_.size()); + cerr << "g_.size() " << g_.size() << " h0 >> 2 " << (h[0] >> 2) << endl; + assert((h[0] >> 2) > 2) > 2) ::max(); +static const cmph_uint32 kInvalidEdge = std::numeric_limits::max(); } namespace cxxmph { @@ -65,5 +65,17 @@ void TriGraph::RemoveEdge(cmph_uint32 current_edge) { --vertex_degree_[vertex]; } } + +void TriGraph::DebugGraph() const { + int i; + for(i = 0; i < edges_.size(); i++){ + cerr << i << " " << edges_[i][0] << " " << edges_[i][1] << " " << edges_[i][2] + << " nexts " << next_edge_[i][0] << " " << next_edge_[i][1] << " " << next_edge_[i][2] << endl; + } + for(i = 0; i < first_edge_.size();i++){ + cerr << "first for vertice " <* edges); + void DebugGraph() const; const std::vector& edges() const { return edges_; } const std::vector& vertex_degree() const { return vertex_degree_; } diff --git a/src/Makefile.am b/src/Makefile.am index f3896dc..40734e4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,7 +2,7 @@ bin_PROGRAMS = cmph lib_LTLIBRARIES = libcmph.la include_HEADERS = cmph.h cmph_types.h cmph_time.h chd_ph.h libcmph_la_SOURCES = hash.h hash.c \ - jenkins_hash.h jenkins_hash.c\ + jenkins_hash.h jenkins_hash.c MurmurHash2.h\ hash_state.h debug.h \ vstack.h vstack.c vqueue.h vqueue.c\ graph.h graph.c bitbool.h \ diff --git a/src/bdz.c b/src/bdz.c index 5dce597..059c281 100755 --- a/src/bdz.c +++ 
b/src/bdz.c @@ -178,6 +178,7 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que }; }; DEBUGP("Queue head %d Queue tail %d\n", queue_head, queue_tail); + bdz_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4); while(queue_tail!=queue_head){ curr_edge=queue[queue_tail++]; bdz_remove_edge(graph3,curr_edge); diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c index 4697f74..5d4e807 100644 --- a/src/jenkins_hash.c +++ b/src/jenkins_hash.c @@ -9,6 +9,7 @@ #define DEBUG #include "debug.h" +#include "MurmurHash2.h" #define hashsize(n) ((cmph_uint32)1<<(n)) #define hashmask(n) (hashsize(n)-1) @@ -99,63 +100,67 @@ void jenkins_state_destroy(jenkins_state_t *state) inline void __jenkins_hash_vector(cmph_uint32 seed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) { - register cmph_uint32 len, length; - - /* Set up the internal state */ - length = keylen; - len = length; - hashes[0] = hashes[1] = 0x9e3779b9; /* the golden ratio; an arbitrary value */ - hashes[2] = seed; /* the previous hash value - seed in our case */ - - /*---------------------------------------- handle most of the key */ - while (len >= 12) - { - hashes[0] += ((cmph_uint32)k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); - hashes[1] += ((cmph_uint32)k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24)); - hashes[2] += ((cmph_uint32)k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24)); - mix(hashes[0],hashes[1],hashes[2]); - k += 12; len -= 12; + int i; + for (i = 0; i < 3; ++i) { + hashes[i] = MurmurHash2(k, keylen, seed + i); } - - /*------------------------------------- handle the last 11 bytes */ - hashes[2] += length; - switch(len) /* all the case statements fall through */ - { - case 11: - hashes[2] +=((cmph_uint32)k[10]<<24); - case 10: - hashes[2] +=((cmph_uint32)k[9]<<16); - case 9 : - hashes[2] +=((cmph_uint32)k[8]<<8); - /* the first byte of hashes[2] is 
reserved for the length */ - case 8 : - hashes[1] +=((cmph_uint32)k[7]<<24); - case 7 : - hashes[1] +=((cmph_uint32)k[6]<<16); - case 6 : - hashes[1] +=((cmph_uint32)k[5]<<8); - case 5 : - hashes[1] +=(cmph_uint8) k[4]; - case 4 : - hashes[0] +=((cmph_uint32)k[3]<<24); - case 3 : - hashes[0] +=((cmph_uint32)k[2]<<16); - case 2 : - hashes[0] +=((cmph_uint32)k[1]<<8); - case 1 : - hashes[0] +=(cmph_uint8)k[0]; - /* case 0: nothing left to add */ - } - - mix(hashes[0],hashes[1],hashes[2]); +// register cmph_uint32 len, length; +// +// /* Set up the internal state */ +// length = keylen; +// len = length; +// hashes[0] = hashes[1] = 0x9e3779b9; /* the golden ratio; an arbitrary value */ +// hashes[2] = seed; /* the previous hash value - seed in our case */ +// +// /*---------------------------------------- handle most of the key */ +// while (len >= 12) +// { +// hashes[0] += ((cmph_uint32)k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); +// hashes[1] += ((cmph_uint32)k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24)); +// hashes[2] += ((cmph_uint32)k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24)); +// mix(hashes[0],hashes[1],hashes[2]); +// k += 12; len -= 12; +// } +// +// /*------------------------------------- handle the last 11 bytes */ +// hashes[2] += length; +// switch(len) /* all the case statements fall through */ +// { +// case 11: +// hashes[2] +=((cmph_uint32)k[10]<<24); +// case 10: +// hashes[2] +=((cmph_uint32)k[9]<<16); +// case 9 : +// hashes[2] +=((cmph_uint32)k[8]<<8); +// /* the first byte of hashes[2] is reserved for the length */ +// case 8 : +// hashes[1] +=((cmph_uint32)k[7]<<24); +// case 7 : +// hashes[1] +=((cmph_uint32)k[6]<<16); +// case 6 : +// hashes[1] +=((cmph_uint32)k[5]<<8); +// case 5 : +// hashes[1] +=(cmph_uint8) k[4]; +// case 4 : +// hashes[0] +=((cmph_uint32)k[3]<<24); +// case 3 : +// hashes[0] +=((cmph_uint32)k[2]<<16); +// case 2 : 
+// hashes[0] +=((cmph_uint32)k[1]<<8); +// case 1 : +// hashes[0] +=(cmph_uint8)k[0]; +// /* case 0: nothing left to add */ +// } +// +// mix(hashes[0],hashes[1],hashes[2]); } cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen) { - cmph_uint32 hashes[3]; - __jenkins_hash_vector(state->seed, k, keylen, hashes); - return hashes[2]; -/* cmph_uint32 a, b, c; +// cmph_uint32 hashes[3]; +// __jenkins_hash_vector(state->seed, k, keylen, hashes); +// return hashes[2]; + cmph_uint32 a, b, c; cmph_uint32 len, length; // Set up the internal state @@ -209,7 +214,6 @@ cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keyl /// report the result return c; - */ } void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) From b0255a8269b0302c118bfb6a1f080125aea2242f Mon Sep 17 00:00:00 2001 From: davi Date: Tue, 9 Nov 2010 02:29:39 -0200 Subject: [PATCH 17/89] Valgrind pass. --- cxxmph/cmph_hash_map_test.cc | 2 +- cxxmph/mphtable.cc | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/cmph_hash_map_test.cc index a75e0cd..f610f15 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/cmph_hash_map_test.cc @@ -11,7 +11,7 @@ using cxxmph::cmph_hash_map; int main(int argc, char** argv) { cmph_hash_map b; - for (int i = 0; i < 2*500; ++i) { + for (int i = 0; i < 257; ++i) { b.insert(make_pair(i, i)); } /* diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index 669df06..ac898c1 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -95,13 +95,13 @@ bool MPHTable::GenerateQueue( void MPHTable::Assigning( const vector& edges, const vector& queue) { - cmph_uint32 nedges = m_; cmph_uint32 current_edge = 0; - vector marked_vertices(nedges + 1); + vector marked_vertices(n_ + 1); // Initialize vector of half nibbles with all bits set. 
cmph_uint32 sizeg = static_cast(ceil(n_/4.0)); vector(sizeg, std::numeric_limits::max()).swap(g_); + cmph_uint32 nedges = m_; // for legibility for (int i = nedges - 1; i + 1 >= 1; --i) { current_edge = queue[i]; cerr << "Current edge " << current_edge << " at queue pos " << i << endl; @@ -117,6 +117,7 @@ void MPHTable::Assigning( } if (!marked_vertices[e[2]]) { set_2bit_value(&g_, e[2], kUnassigned); + assert(marked_vertices.size() > e[2]); marked_vertices[e[2]] = true; } set_2bit_value(&g_, e[0], (6 - (get_2bit_value(g_, e[1]) + get_2bit_value(g_, e[2]))) % 3); From bb2e9e28a8a392d9bd589857ed73943cdc442e3f Mon Sep 17 00:00:00 2001 From: davi Date: Tue, 9 Nov 2010 03:38:46 -0200 Subject: [PATCH 18/89] All looks fine, commenting debug. --- cxxmph/cmph_hash_map.h | 2 -- cxxmph/cmph_hash_map_test.cc | 2 +- cxxmph/mphtable.cc | 10 +++++++--- cxxmph/mphtable.h | 9 +++++---- src/bdz.c | 3 +++ 5 files changed, 16 insertions(+), 10 deletions(-) diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 12e98a0..931c073 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -95,8 +95,6 @@ CMPH_METHOD_DECL(insert_return_type, insert)(const value_type& x) { slack_.insert(std::make_pair(x.first, values_.size() - 1)); if (slack_.size() == table_.size() || (slack_.size() >= 256 && table_.size() == 0)) { - // TODO(davi) debug only, remove afterwards - std::sort(values_.begin(), values_.end()); rehash(); } it = find(x.first); diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/cmph_hash_map_test.cc index f610f15..a75e0cd 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/cmph_hash_map_test.cc @@ -11,7 +11,7 @@ using cxxmph::cmph_hash_map; int main(int argc, char** argv) { cmph_hash_map b; - for (int i = 0; i < 257; ++i) { + for (int i = 0; i < 2*500; ++i) { b.insert(make_pair(i, i)); } /* diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index ac898c1..b46818c 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -100,16 +100,17 @@ void 
MPHTable::Assigning( // Initialize vector of half nibbles with all bits set. cmph_uint32 sizeg = static_cast(ceil(n_/4.0)); vector(sizeg, std::numeric_limits::max()).swap(g_); + assert(get_2bit_value(g_, 291) == kUnassigned); cmph_uint32 nedges = m_; // for legibility for (int i = nedges - 1; i + 1 >= 1; --i) { current_edge = queue[i]; - cerr << "Current edge " << current_edge << " at queue pos " << i << endl; + if (current_edge == 157) cerr << "Edge 157" << endl; const TriGraph::Edge& e = edges[current_edge]; cerr << "B: " << e[0] << " " << e[1] << " " << e[2] << " -> " << get_2bit_value(g_, e[0]) << " " << get_2bit_value(g_, e[1]) << " " - << get_2bit_value(g_, e[2]) << " " << endl; + << get_2bit_value(g_, e[2]) << " edge " << current_edge << endl; if (!marked_vertices[e[0]]) { if (!marked_vertices[e[1]]) { set_2bit_value(&g_, e[1], kUnassigned); @@ -121,6 +122,7 @@ void MPHTable::Assigning( marked_vertices[e[2]] = true; } set_2bit_value(&g_, e[0], (6 - (get_2bit_value(g_, e[1]) + get_2bit_value(g_, e[2]))) % 3); + if (e[0] == 291) cerr << "Vertex 291 " << get_2bit_value(g_, 291) << " updated at case 1" << endl; marked_vertices[e[0]] = true; } else if (!marked_vertices[e[1]]) { if (!marked_vertices[e[2]]) { @@ -128,9 +130,11 @@ void MPHTable::Assigning( marked_vertices[e[2]] = true; } set_2bit_value(&g_, e[1], (7 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[2]))) % 3); + if (e[1] == 291) cerr << "Vertex 291 " << get_2bit_value(g_, 291) << " updated at case 2" << endl; marked_vertices[e[1]] = true; } else { set_2bit_value(&g_, e[2], (8 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[1]))) % 3); + if (e[2] == 291) cerr << "Vertex 291 " << get_2bit_value(g_, 291) << " updated at case 3" << endl; marked_vertices[e[2]] = true; } cerr << "A: " << e[0] << " " << e[1] << " " << e[2] << " -> " @@ -177,8 +181,8 @@ cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { for (unsigned int i = 0; i < n_; ++i) { cerr << get_2bit_value(g_, i) << " "; } + cerr << endl; 
while (beg_idx_v < vertex) { - cerr << get_2bit_value(g_, beg_idx_v) << " "; if (get_2bit_value(g_, beg_idx_v) != kUnassigned) ++base_rank; ++beg_idx_v; } diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index 46726b6..c1394c1 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -64,10 +64,10 @@ class MPHTable { cmph_uint32 hash_seed_[3]; static const cmph_uint8 valuemask[]; - static void set_2bit_value(std::vector *d, cmph_uint8 i, cmph_uint8 v) { + static void set_2bit_value(std::vector *d, cmph_uint32 i, cmph_uint8 v) { (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; } - static cmph_uint32 get_2bit_value(const std::vector& d, cmph_uint8 i) { + static cmph_uint32 get_2bit_value(const std::vector& d, cmph_uint32 i) { return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; } @@ -85,12 +85,13 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; - int iterations = 1000; + int iterations = 10; std::vector edges; std::vector queue; while (1) { cerr << "Iterations missing: " << iterations << endl; - for (int i = 0; i < 3; ++i) hash_seed_[i] = random(); + for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_; + // for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i; if (Mapping(begin, end, &edges, &queue)) break; else --iterations; if (iterations == 0) break; diff --git a/src/bdz.c b/src/bdz.c index 059c281..1c49c9d 100755 --- a/src/bdz.c +++ b/src/bdz.c @@ -444,6 +444,7 @@ static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t SETBIT(marked_vertices, v2); } SETVALUE1(bdz->g, v0, (6-(GETVALUE(bdz->g, v1) + GETVALUE(bdz->g,v2)))%3); + if (v0 == 291) fprintf(stderr, "Vertex 291 updated at case 1\n"); SETBIT(marked_vertices, v0); } else if(!GETBIT(marked_vertices, v1)) { if(!GETBIT(marked_vertices, v2)) @@ -452,9 +453,11 @@ static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t SETBIT(marked_vertices, v2); } SETVALUE1(bdz->g, v1, 
(7-(GETVALUE(bdz->g, v0)+GETVALUE(bdz->g, v2)))%3); + if (v1 == 291) fprintf(stderr, "Vertex 291 updated at case 1\n"); SETBIT(marked_vertices, v1); }else { SETVALUE1(bdz->g, v2, (8-(GETVALUE(bdz->g,v0)+GETVALUE(bdz->g, v1)))%3); + if (v2 == 291) fprintf(stderr, "Vertex 291 updated at case 1\n"); SETBIT(marked_vertices, v2); } DEBUGP("A:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2)); From 62ac3f4bded4443d7561fd640133a85554948dbd Mon Sep 17 00:00:00 2001 From: davi Date: Tue, 9 Nov 2010 03:51:33 -0200 Subject: [PATCH 19/89] All fine, time to optimize. --- cxxmph/cmph_hash_map_test.cc | 2 +- cxxmph/mphtable.cc | 45 ++++++++++++++++++------------------ cxxmph/mphtable.h | 6 ++--- cxxmph/trigraph.cc | 2 +- 4 files changed, 27 insertions(+), 28 deletions(-) diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/cmph_hash_map_test.cc index a75e0cd..73c3290 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/cmph_hash_map_test.cc @@ -11,7 +11,7 @@ using cxxmph::cmph_hash_map; int main(int argc, char** argv) { cmph_hash_map b; - for (int i = 0; i < 2*500; ++i) { + for (int i = 0; i < 2*1000*1000; ++i) { b.insert(make_pair(i, i)); } /* diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index b46818c..ba90ae2 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -60,16 +60,18 @@ bool MPHTable::GenerateQueue( } } } + /* for (unsigned int i = 0; i < marked_edge.size(); ++i) { cerr << "vertex with degree " << static_cast(graph->vertex_degree()[i]) << " marked " << marked_edge[i] << endl; } for (unsigned int i = 0; i < queue.size(); ++i) { cerr << "vertex " << i << " queued at " << queue[i] << endl; } + */ // At this point queue head is the number of edges touching at least one // vertex of degree 1. 
- cerr << "Queue head " << queue_head << " Queue tail " << queue_tail << endl; - graph->DebugGraph(); + // cerr << "Queue head " << queue_head << " Queue tail " << queue_tail << endl; + // graph->DebugGraph(); while (queue_tail != queue_head) { cmph_uint32 current_edge = queue[queue_tail++]; graph->RemoveEdge(current_edge); @@ -85,9 +87,11 @@ bool MPHTable::GenerateQueue( } } } + /* for (unsigned int i = 0; i < queue.size(); ++i) { cerr << "vertex " << i << " queued at " << queue[i] << endl; } + */ int cycles = queue_head - nedges; if (cycles == 0) queue.swap(*queue_output); return cycles == 0; @@ -100,17 +104,15 @@ void MPHTable::Assigning( // Initialize vector of half nibbles with all bits set. cmph_uint32 sizeg = static_cast(ceil(n_/4.0)); vector(sizeg, std::numeric_limits::max()).swap(g_); - assert(get_2bit_value(g_, 291) == kUnassigned); cmph_uint32 nedges = m_; // for legibility for (int i = nedges - 1; i + 1 >= 1; --i) { current_edge = queue[i]; - if (current_edge == 157) cerr << "Edge 157" << endl; const TriGraph::Edge& e = edges[current_edge]; - cerr << "B: " << e[0] << " " << e[1] << " " << e[2] << " -> " - << get_2bit_value(g_, e[0]) << " " - << get_2bit_value(g_, e[1]) << " " - << get_2bit_value(g_, e[2]) << " edge " << current_edge << endl; + // cerr << "B: " << e[0] << " " << e[1] << " " << e[2] << " -> " + // << get_2bit_value(g_, e[0]) << " " + // << get_2bit_value(g_, e[1]) << " " + // << get_2bit_value(g_, e[2]) << " edge " << current_edge << endl; if (!marked_vertices[e[0]]) { if (!marked_vertices[e[1]]) { set_2bit_value(&g_, e[1], kUnassigned); @@ -122,7 +124,6 @@ void MPHTable::Assigning( marked_vertices[e[2]] = true; } set_2bit_value(&g_, e[0], (6 - (get_2bit_value(g_, e[1]) + get_2bit_value(g_, e[2]))) % 3); - if (e[0] == 291) cerr << "Vertex 291 " << get_2bit_value(g_, 291) << " updated at case 1" << endl; marked_vertices[e[0]] = true; } else if (!marked_vertices[e[1]]) { if (!marked_vertices[e[2]]) { @@ -130,17 +131,15 @@ void 
MPHTable::Assigning( marked_vertices[e[2]] = true; } set_2bit_value(&g_, e[1], (7 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[2]))) % 3); - if (e[1] == 291) cerr << "Vertex 291 " << get_2bit_value(g_, 291) << " updated at case 2" << endl; marked_vertices[e[1]] = true; } else { set_2bit_value(&g_, e[2], (8 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[1]))) % 3); - if (e[2] == 291) cerr << "Vertex 291 " << get_2bit_value(g_, 291) << " updated at case 3" << endl; marked_vertices[e[2]] = true; } - cerr << "A: " << e[0] << " " << e[1] << " " << e[2] << " -> " - << get_2bit_value(g_, e[0]) << " " - << get_2bit_value(g_, e[1]) << " " - << get_2bit_value(g_, e[2]) << " " << endl; + // cerr << "A: " << e[0] << " " << e[1] << " " << e[2] << " -> " + // << get_2bit_value(g_, e[0]) << " " + // << get_2bit_value(g_, e[1]) << " " + // << get_2bit_value(g_, e[2]) << " " << endl; } } @@ -174,19 +173,19 @@ cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { cmph_uint32 end_idx_b = vertex >> 2; while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; - cerr << "beg_idx_v: " << beg_idx_v << endl; - cerr << "base rank: " << base_rank << endl; + // cerr << "beg_idx_v: " << beg_idx_v << endl; + // cerr << "base rank: " << base_rank << endl; - cerr << "G: "; - for (unsigned int i = 0; i < n_; ++i) { - cerr << get_2bit_value(g_, i) << " "; - } - cerr << endl; + //cerr << "G: "; + // for (unsigned int i = 0; i < n_; ++i) { + // cerr << get_2bit_value(g_, i) << " "; + //} + // cerr << endl; while (beg_idx_v < vertex) { if (get_2bit_value(g_, beg_idx_v) != kUnassigned) ++base_rank; ++beg_idx_v; } - cerr << "Base rank: " << base_rank << endl; + // cerr << "Base rank: " << base_rank << endl; return base_rank; } diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index c1394c1..02250a9 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -114,7 +114,7 @@ bool MPHTable::Mapping( cmph_uint32 v0 = h[0] % r_; cmph_uint32 v1 = h[1] 
% r_ + r_; cmph_uint32 v2 = h[2] % r_ + (r_ << 1); - cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; + // cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; graph.AddEdge(TriGraph::Edge(v0, v1, v2)); } if (GenerateQueue(&graph, queue)) { @@ -132,12 +132,12 @@ cmph_uint32 MPHTable::index(const Key& key) const { h[1] = h[1] % r_ + r_; h[2] = h[2] % r_ + (r_ << 1); assert(g_.size()); - cerr << "g_.size() " << g_.size() << " h0 >> 2 " << (h[0] >> 2) << endl; + //cerr << "g_.size() " << g_.size() << " h0 >> 2 " << (h[0] >> 2) << endl; assert((h[0] >> 2) > 2) > 2) Date: Thu, 20 Jan 2011 23:07:46 -0200 Subject: [PATCH 20/89] Forgot. --- cxxmph/cmph_hash_map_test.cc | 3 ++ src/MurmurHash2.h | 69 ++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 src/MurmurHash2.h diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/cmph_hash_map_test.cc index 73c3290..2ebaa94 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/cmph_hash_map_test.cc @@ -14,6 +14,9 @@ int main(int argc, char** argv) { for (int i = 0; i < 2*1000*1000; ++i) { b.insert(make_pair(i, i)); } + for (int i = 0; i < 100*1000*1000; ++i) { + b.find(i); + } /* cmph_hash_map h; h.insert(std::make_pair("-1",-1)); diff --git a/src/MurmurHash2.h b/src/MurmurHash2.h new file mode 100644 index 0000000..52d015a --- /dev/null +++ b/src/MurmurHash2.h @@ -0,0 +1,69 @@ +#ifndef __CXXMPH_MURMUR_HASH2__ +#define __CXXMPH_MURMUR_HASH2__ + +//----------------------------------------------------------------------------- +// MurmurHash2, by Austin Appleby + +// Note - This code makes a few assumptions about how your machine behaves - + +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 + +// And it has a few limitations - + +// 1. It will not work incrementally. +// 2. 
It will not produce the same results on little-endian and big-endian +// machines. + +unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const unsigned int m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + unsigned int h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + unsigned int k = *(unsigned int *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. + + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +#endif // __CXXMPH_MURMUR_HASH2__ From 2a35666bfa88e8260add9b48b71762308a3710d2 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Mon, 24 Jan 2011 10:29:22 -0200 Subject: [PATCH 21/89] Add benchmarking code. 
--- src/Makefile.am | 1 + src/cmph_benchmark.c | 103 +++++++++++++++++++++++++++++++++++++++++++ src/cmph_benchmark.h | 20 +++++++++ 3 files changed, 124 insertions(+) create mode 100644 src/cmph_benchmark.c create mode 100644 src/cmph_benchmark.h diff --git a/src/Makefile.am b/src/Makefile.am index 40734e4..c44d832 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -23,6 +23,7 @@ libcmph_la_SOURCES = hash.h hash.c \ select.h select.c select_lookup_tables.h \ compressed_seq.h compressed_seq.c \ compressed_rank.h compressed_rank.c \ + cmph_benchmark.h cmph_benchmark.cc \ cmph_time.h libcmph_la_LDFLAGS = -version-info 0:0:0 diff --git a/src/cmph_benchmark.c b/src/cmph_benchmark.c new file mode 100644 index 0000000..9d7c9da --- /dev/null +++ b/src/cmph_benchmark.c @@ -0,0 +1,103 @@ +#include +#include +#include + +#include "cmph_benchmark.h" + +typedef struct { + const char* name; + void (*func)(int); + struct rusage begin; + struct rusage end; +} benchmark_t; + +static benchmark_t* global_benchmarks = NULL; + +/* Subtract the `struct timeval' values X and Y, + storing the result in RESULT. + Return 1 if the difference is negative, otherwise 0. */ + +int timeval_subtract ( + struct timeval *result, struct timeval *x, struct timeval* y) { + /* Perform the carry for the later subtraction by updating y. */ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + tv_usec is certainly positive. */ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. 
*/ + return x->tv_sec < y->tv_sec; +} + +benchmark_t* find_benchmark(const char* name) { + benchmark_t* benchmark = global_benchmarks; + while (benchmark->name != NULL) if (strcmp(benchmark->name, name) != 0) break; + if (!benchmark->name) return NULL; + return benchmark; +} + +int global_benchmarks_length() { + benchmark_t* benchmark; + int length = 0; + if (global_benchmarks == 0) return 0; + benchmark = global_benchmarks; + while (benchmark->name != NULL) ++length; + return length; +} + +void bm_register(const char* name, void (*func)(int), int iters) { + benchmark_t benchmark; + int length = global_benchmarks_length(); + benchmark.name = name; + benchmark.func = func; + assert(!find_benchmark(name)); + global_benchmarks = realloc(global_benchmarks, length + 1); + global_benchmarks[length] = benchmark; +} + +void bm_start(const char* name) { + benchmark_t* benchmark; + struct rusage rs; + + benchmark = find_benchmark(name); + int ret = getrusage(RUSAGE_SELF, &rs); + if (ret != 0) { + perror("rusage failed"); + exit(-1); + } + benchmark->begin = rs; + (*benchmark->func)(1); +} + +void bm_end(const char* name) { + benchmark_t* benchmark; + struct rusage rs; + + int ret = getrusage(RUSAGE_SELF, &rs); + if (ret != 0) { + perror("rusage failed"); + exit(-1); + } + + benchmark = find_benchmark(name); + benchmark->end = rs; + + struct timeval utime; + timeval_subtract(&utime, &benchmark->end.ru_utime, &benchmark->begin.ru_utime); + struct timeval stime; + timeval_subtract(&stime, &benchmark->end.ru_stime, &benchmark->begin.ru_stime); + + printf("User cpu time used: %ld.%6ld\n", utime.tv_sec, utime.tv_usec); + printf("System cpu time used: %ld.%6ld\n", stime.tv_sec, stime.tv_usec); +} diff --git a/src/cmph_benchmark.h b/src/cmph_benchmark.h new file mode 100644 index 0000000..f987ce1 --- /dev/null +++ b/src/cmph_benchmark.h @@ -0,0 +1,20 @@ +#ifndef __CMPH_BENCHMARK_H__ +#define __CMPH_BENCHMARK_H__ + +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + 
+#define BM_REGISTER(func, iters) bm_register(##func, func, iters); +void bm_register(const char* name, void (*func)(int), int iters); +void run_benchmarks(int argc, char** argv); + +#ifdef __cplusplus +} +#endif + +#endif // __CMPH_BENCHMARK_H__ From 5b78c02da09414f89d1de1191a03c11ac2c22c35 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 13 Feb 2011 20:40:26 -0200 Subject: [PATCH 22/89] Dumping cmph_uint32. --- cxxmph/Makefile.am | 10 ++- cxxmph/MurmurHash2.h | 4 +- cxxmph/bm_urls.cc | 21 ++++++ cxxmph/cmph_hash_map.h | 4 +- .../{cmph_hash_function.h => cxxmph_hash.h} | 41 ++++++------ cxxmph/mphtable.cc | 66 +++++++++---------- cxxmph/mphtable.h | 64 +++++++++--------- cxxmph/trigraph.cc | 18 ++--- cxxmph/trigraph.h | 25 ++++--- cxxmph/trigraph_test.cc | 22 ------- src/Makefile.am | 4 ++ src/cmph_benchmark.c | 39 ++++++++--- src/cmph_benchmark.h | 2 +- src/cmph_benchmark_test.cc | 22 +++++++ 14 files changed, 197 insertions(+), 145 deletions(-) create mode 100644 cxxmph/bm_urls.cc rename cxxmph/{cmph_hash_function.h => cxxmph_hash.h} (58%) delete mode 100644 cxxmph/trigraph_test.cc create mode 100644 src/cmph_benchmark_test.cc diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index c3a0a2b..f18fd6e 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,9 +1,10 @@ -noinst_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test +check_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test bm_urls +noinst_PROGRAMS = bm_urls bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cmph_hash_function.h stringpiece.h +include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cxxmph_hash.h stringpiece.h -libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cmph_hash_function.h stringpiece.h +libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cxxmph_hash.h stringpiece.h libcxxmph_la_LDFLAGS = 
-version-info 0:0:0 cmph_hash_map_test_LDADD = libcxxmph.la @@ -15,5 +16,8 @@ mphtable_test_SOURCES = mphtable_test.cc trigraph_test_LDADD = libcxxmph.la trigraph_test_SOURCES = trigraph_test.cc +bm_urls_LDADD = libcxxmph.la +bm_urls_SOURCES = bm_urls.cc + cxxmph_LDADD = libcxxmph.la cxxmph_SOURCES = cxxmph.cc diff --git a/cxxmph/MurmurHash2.h b/cxxmph/MurmurHash2.h index aa9338f..d817c7b 100644 --- a/cxxmph/MurmurHash2.h +++ b/cxxmph/MurmurHash2.h @@ -15,7 +15,7 @@ // 2. It will not produce the same results on little-endian and big-endian // machines. -namespace { +namespace cxxmph { unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) { @@ -68,6 +68,6 @@ unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) return h; } -} +} // namespace cxxmph #endif // __CXXMPH_MURMUR_HASH2__ diff --git a/cxxmph/bm_urls.cc b/cxxmph/bm_urls.cc new file mode 100644 index 0000000..7d43e2f --- /dev/null +++ b/cxxmph/bm_urls.cc @@ -0,0 +1,21 @@ +#include +#include +#include +#include + +#include "mphtable.h" + +using std::ifstream; +using std::string; +using std::vector; +using cxxmph::SimpleMPHTable; + +int main(int argc, char** argv) { + vector urls; + std::ifstream f("URLS1k"); + string buffer; + while(std::getline(f, buffer)) urls.push_back(buffer); + + SimpleMPHTable table; + table.Reset(urls.begin(), urls.end()); +} diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 931c073..629667f 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -70,9 +70,9 @@ class cmph_hash_map { void rehash(); std::vector values_; - SimpleMPHTable::hash_function> table_; + SimpleMPHTable::hash_function> table_; // TODO(davi) optimize slack to no hold a copy of the key - typedef typename std::unordered_map slack_type; + typedef typename std::unordered_map slack_type; slack_type slack_; }; diff --git a/cxxmph/cmph_hash_function.h b/cxxmph/cxxmph_hash.h similarity index 58% rename from cxxmph/cmph_hash_function.h rename to 
cxxmph/cxxmph_hash.h index 933d729..98748a0 100644 --- a/cxxmph/cmph_hash_function.h +++ b/cxxmph/cxxmph_hash.h @@ -1,29 +1,30 @@ +#include // for uint32_t and friends + #include #include // for std::hash #include "MurmurHash2.h" #include "stringpiece.h" -#include "cmph_types.h" namespace cxxmph { template struct seeded_hash_function { template - cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { + uint32_t operator()(const Key& k, uint32_t seed) const { return HashFcn()(k) ^ seed; } }; struct Murmur2 { template - cmph_uint32 operator()(const Key& k) const { + uint32_t operator()(const Key& k) const { return MurmurHash2(k, sizeof(Key), 1 /* seed */); } }; struct Murmur2StringPiece { template - cmph_uint32 operator()(const Key& k) const { + uint32_t operator()(const Key& k) const { StringPiece s(k); return MurmurHash2(s.data(), s.length(), 1 /* seed */); } @@ -32,7 +33,7 @@ struct Murmur2StringPiece { template <> struct seeded_hash_function { template - cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { + uint32_t operator()(const Key& k, uint32_t seed) const { return MurmurHash2(reinterpret_cast(&k), sizeof(Key), seed); } }; @@ -40,42 +41,42 @@ struct seeded_hash_function { template <> struct seeded_hash_function { template - cmph_uint32 operator()(const Key& k, cmph_uint32 seed) const { + uint32_t operator()(const Key& k, uint32_t seed) const { StringPiece s(k); return MurmurHash2(s.data(), s.length(), seed); } }; -template struct OptimizedSeededHashFunction +template struct cxxmph_hash { typedef seeded_hash_function hash_function; }; // Use Murmur2 instead for all types defined in std::hash, plus // std::string which is commonly extended. 
-template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; -template <> struct OptimizedSeededHashFunction > +template <> struct cxxmph_hash > { typedef seeded_hash_function hash_function; }; } // namespace cxxmph diff --git a/cxxmph/mphtable.cc b/cxxmph/mphtable.cc index ba90ae2..bbc0c31 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mphtable.cc @@ -10,9 +10,9 @@ using std::vector; namespace { -static const cmph_uint8 kUnassigned = 3; +static const uint8_t 
kUnassigned = 3; // table used for looking up the number of assigned vertices to a 8-bit integer -static cmph_uint8 kBdzLookupTable[] = +static uint8_t kBdzLookupTable[] = { 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, @@ -36,20 +36,20 @@ static cmph_uint8 kBdzLookupTable[] = namespace cxxmph { -const cmph_uint8 MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; +const uint8_t MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; void MPHTable::clear() { // TODO(davi) impolement me } bool MPHTable::GenerateQueue( - TriGraph* graph, vector* queue_output) { - cmph_uint32 queue_head = 0, queue_tail = 0; - cmph_uint32 nedges = m_; - cmph_uint32 nvertices = n_; + TriGraph* graph, vector* queue_output) { + uint32_t queue_head = 0, queue_tail = 0; + uint32_t nedges = m_; + uint32_t nvertices = n_; // Relies on vector using 1 bit per element vector marked_edge(nedges + 1, false); - vector queue(nvertices, 0); - for (cmph_uint32 i = 0; i < nedges; ++i) { + vector queue(nvertices, 0); + for (uint32_t i = 0; i < nedges; ++i) { const TriGraph::Edge& e = graph->edges()[i]; if (graph->vertex_degree()[e[0]] == 1 || graph->vertex_degree()[e[1]] == 1 || @@ -62,7 +62,7 @@ bool MPHTable::GenerateQueue( } /* for (unsigned int i = 0; i < marked_edge.size(); ++i) { - cerr << "vertex with degree " << static_cast(graph->vertex_degree()[i]) << " marked " << marked_edge[i] << endl; + cerr << "vertex with degree " << static_cast(graph->vertex_degree()[i]) << " marked " << marked_edge[i] << endl; } for (unsigned int i = 0; i < queue.size(); ++i) { cerr << "vertex " << i << " queued at " << queue[i] << endl; @@ -73,13 +73,13 @@ bool MPHTable::GenerateQueue( // cerr << "Queue head " << queue_head << " Queue tail " << queue_tail << endl; // graph->DebugGraph(); while (queue_tail != queue_head) { - cmph_uint32 current_edge = queue[queue_tail++]; + uint32_t current_edge = queue[queue_tail++]; graph->RemoveEdge(current_edge); const TriGraph::Edge& e = 
graph->edges()[current_edge]; for (int i = 0; i < 3; ++i) { - cmph_uint32 v = e[i]; + uint32_t v = e[i]; if (graph->vertex_degree()[v] == 1) { - cmph_uint32 first_edge = graph->first_edge()[v]; + uint32_t first_edge = graph->first_edge()[v]; if (!marked_edge[first_edge]) { queue[queue_head++] = first_edge; marked_edge[first_edge] = true; @@ -98,14 +98,14 @@ bool MPHTable::GenerateQueue( } void MPHTable::Assigning( - const vector& edges, const vector& queue) { - cmph_uint32 current_edge = 0; + const vector& edges, const vector& queue) { + uint32_t current_edge = 0; vector marked_vertices(n_ + 1); // Initialize vector of half nibbles with all bits set. - cmph_uint32 sizeg = static_cast(ceil(n_/4.0)); - vector(sizeg, std::numeric_limits::max()).swap(g_); + uint32_t sizeg = static_cast(ceil(n_/4.0)); + vector(sizeg, std::numeric_limits::max()).swap(g_); - cmph_uint32 nedges = m_; // for legibility + uint32_t nedges = m_; // for legibility for (int i = nedges - 1; i + 1 >= 1; --i) { current_edge = queue[i]; const TriGraph::Edge& e = edges[current_edge]; @@ -144,20 +144,20 @@ void MPHTable::Assigning( } void MPHTable::Ranking() { - cmph_uint32 nbytes_total = static_cast(ceil(n_ / 4.0)); - cmph_uint32 size = k_ >> 2U; - cmph_uint32 ranktablesize = static_cast( + uint32_t nbytes_total = static_cast(ceil(n_ / 4.0)); + uint32_t size = k_ >> 2U; + uint32_t ranktablesize = static_cast( ceil(n_ / static_cast(k_))); // TODO(davi) Change swap of member classes for resize + memset to avoid // fragmentation - vector (ranktablesize).swap(ranktable_);; - cmph_uint32 offset = 0; - cmph_uint32 count = 0; - cmph_uint32 i = 1; + vector (ranktablesize).swap(ranktable_);; + uint32_t offset = 0; + uint32_t count = 0; + uint32_t i = 1; while (1) { if (i == ranktable_.size()) break; - cmph_uint32 nbytes = size < nbytes_total ? size : nbytes_total; - for (cmph_uint32 j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; + uint32_t nbytes = size < nbytes_total ? 
size : nbytes_total; + for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; ranktable_[i] = count; offset += nbytes; nbytes_total -= size; @@ -165,12 +165,12 @@ void MPHTable::Ranking() { } } -cmph_uint32 MPHTable::Rank(cmph_uint32 vertex) const { - cmph_uint32 index = vertex >> b_; - cmph_uint32 base_rank = ranktable_[index]; - cmph_uint32 beg_idx_v = index << b_; - cmph_uint32 beg_idx_b = beg_idx_v >> 2; - cmph_uint32 end_idx_b = vertex >> 2; +uint32_t MPHTable::Rank(uint32_t vertex) const { + uint32_t index = vertex >> b_; + uint32_t base_rank = ranktable_[index]; + uint32_t beg_idx_v = index << b_; + uint32_t beg_idx_b = beg_idx_v >> 2; + uint32_t end_idx_b = vertex >> 2; while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; // cerr << "beg_idx_v: " << beg_idx_v << endl; diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index 02250a9..a899a89 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -3,6 +3,8 @@ // Minimal perfect hash abstraction implementing the BDZ algorithm +#include + #include #include #include // for std::hash @@ -13,61 +15,61 @@ using std::cerr; using std::endl; -#include "cmph_hash_function.h" +#include "cxxmph_hash.h" #include "trigraph.h" namespace cxxmph { class MPHTable { public: - MPHTable(double c = 1.23, cmph_uint8 b = 7) : + MPHTable(double c = 1.23, uint8_t b = 7) : c_(c), b_(b), m_(0), n_(0), k_(0), r_(0) { } ~MPHTable() {} template bool Reset(ForwardIterator begin, ForwardIterator end); template // must agree with Reset - cmph_uint32 index(const Key& x) const; - cmph_uint32 size() const { return m_; } + uint32_t index(const Key& x) const; + uint32_t size() const { return m_; } void clear(); private: template bool Mapping(ForwardIterator begin, ForwardIterator end, std::vector* edges, - std::vector* queue); - bool GenerateQueue(TriGraph* graph, std::vector* queue); + std::vector* queue); + bool GenerateQueue(TriGraph* graph, std::vector* queue); 
void Assigning(const std::vector& edges, - const std::vector& queue); + const std::vector& queue); void Ranking(); - cmph_uint32 Rank(cmph_uint32 vertex) const; + uint32_t Rank(uint32_t vertex) const; // Algorithm parameters double c_; // Number of bits per key (? is it right) - cmph_uint8 b_; // Number of bits of the kth index in the ranktable + uint8_t b_; // Number of bits of the kth index in the ranktable // Values used during generation - cmph_uint32 m_; // edges count - cmph_uint32 n_; // vertex count - cmph_uint32 k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ + uint32_t m_; // edges count + uint32_t n_; // vertex count + uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ // Values used during search // Partition vertex count, derived from c parameter. - cmph_uint32 r_; + uint32_t r_; // The array containing the minimal perfect hash function graph. - std::vector g_; + std::vector g_; // The table used for the rank step of the minimal perfect hash function - std::vector ranktable_; + std::vector ranktable_; // The selected hash seed triplet for finding the edges in the minimal // perfect hash function graph. 
- cmph_uint32 hash_seed_[3]; + uint32_t hash_seed_[3]; - static const cmph_uint8 valuemask[]; - static void set_2bit_value(std::vector *d, cmph_uint32 i, cmph_uint8 v) { + static const uint8_t valuemask[]; + static void set_2bit_value(std::vector *d, uint32_t i, uint8_t v) { (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; } - static cmph_uint32 get_2bit_value(const std::vector& d, cmph_uint32 i) { + static uint32_t get_2bit_value(const std::vector& d, uint32_t i) { return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; } @@ -78,7 +80,7 @@ class MPHTable { template bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { m_ = end - begin; - r_ = static_cast(ceil((c_*m_)/3)); + r_ = static_cast(ceil((c_*m_)/3)); if ((r_ % 2) == 0) r_ += 1; n_ = 3*r_; k_ = 1U << b_; @@ -87,7 +89,7 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { int iterations = 10; std::vector edges; - std::vector queue; + std::vector queue; while (1) { cerr << "Iterations missing: " << iterations << endl; for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_; @@ -106,14 +108,14 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { template bool MPHTable::Mapping( ForwardIterator begin, ForwardIterator end, - std::vector* edges, std::vector* queue) { + std::vector* edges, std::vector* queue) { TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { - cmph_uint32 h[3]; + uint32_t h[3]; for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); - cmph_uint32 v0 = h[0] % r_; - cmph_uint32 v1 = h[1] % r_ + r_; - cmph_uint32 v2 = h[2] % r_ + (r_ << 1); + uint32_t v0 = h[0] % r_; + uint32_t v1 = h[1] % r_ + r_; + uint32_t v2 = h[2] % r_ + (r_ << 1); // cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; graph.AddEdge(TriGraph::Edge(v0, v1, v2)); } @@ -125,8 +127,8 @@ bool MPHTable::Mapping( } template -cmph_uint32 MPHTable::index(const Key& key) const { - cmph_uint32 
h[3]; +uint32_t MPHTable::index(const Key& key) const { + uint32_t h[3]; for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); h[0] = h[0] % r_; h[1] = h[1] % r_ + r_; @@ -136,19 +138,19 @@ cmph_uint32 MPHTable::index(const Key& key) const { assert((h[0] >> 2) > 2) > 2) >::hash_function> +template >::hash_function> class SimpleMPHTable : public MPHTable { public: template bool Reset(ForwardIterator begin, ForwardIterator end) { return MPHTable::Reset(begin, end); } - cmph_uint32 index(const Key& key) { return MPHTable::index(key); } + uint32_t index(const Key& key) { return MPHTable::index(key); } }; } // namespace cxxmph diff --git a/cxxmph/trigraph.cc b/cxxmph/trigraph.cc index 872f5b3..5e9fd66 100644 --- a/cxxmph/trigraph.cc +++ b/cxxmph/trigraph.cc @@ -9,12 +9,12 @@ using std::endl; using std::vector; namespace { -static const cmph_uint32 kInvalidEdge = std::numeric_limits::max(); +static const uint32_t kInvalidEdge = std::numeric_limits::max(); } namespace cxxmph { -TriGraph::TriGraph(cmph_uint32 nvertices, cmph_uint32 nedges) +TriGraph::TriGraph(uint32_t nvertices, uint32_t nedges) : nedges_(0), edges_(nedges), next_edge_(nedges), @@ -23,8 +23,8 @@ TriGraph::TriGraph(cmph_uint32 nvertices, cmph_uint32 nedges) void TriGraph::ExtractEdgesAndClear(vector* edges) { vector().swap(next_edge_); - vector().swap(first_edge_); - vector().swap(vertex_degree_); + vector().swap(first_edge_); + vector().swap(vertex_degree_); nedges_ = 0; edges->swap(edges_); } @@ -45,13 +45,13 @@ void TriGraph::AddEdge(const Edge& edge) { ++nedges_; } -void TriGraph::RemoveEdge(cmph_uint32 current_edge) { +void TriGraph::RemoveEdge(uint32_t current_edge) { // cerr << "Removing edge " << current_edge << " from " << nedges_ << " existing edges " << endl; for (int i = 0; i < 3; ++i) { - cmph_uint32 vertex = edges_[current_edge][i]; - cmph_uint32 edge1 = first_edge_[vertex]; - cmph_uint32 edge2 = kInvalidEdge; - cmph_uint32 j = 0; + uint32_t vertex = edges_[current_edge][i]; 
+ uint32_t edge1 = first_edge_[vertex]; + uint32_t edge2 = kInvalidEdge; + uint32_t j = 0; while (edge1 != current_edge && edge1 != kInvalidEdge) { edge2 = edge1; if (edges_[edge1][0] == vertex) j = 0; diff --git a/cxxmph/trigraph.h b/cxxmph/trigraph.h index 9cbae1b..22adaeb 100644 --- a/cxxmph/trigraph.h +++ b/cxxmph/trigraph.h @@ -6,42 +6,41 @@ // required. For each vertex, we store how many edges touch it (degree) and the // index of the first edge in the vector of triples representing the edges. +#include // for uint32_t and friends #include -#include "cmph_types.h" - namespace cxxmph { class TriGraph { public: struct Edge { Edge() { } - Edge(cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2) { + Edge(uint32_t v0, uint32_t v1, uint32_t v2) { vertices[0] = v0; vertices[1] = v1; vertices[2] = v2; } - cmph_uint32& operator[](cmph_uint8 v) { return vertices[v]; } - const cmph_uint32& operator[](cmph_uint8 v) const { return vertices[v]; } - cmph_uint32 vertices[3]; + uint32_t& operator[](uint8_t v) { return vertices[v]; } + const uint32_t& operator[](uint8_t v) const { return vertices[v]; } + uint32_t vertices[3]; }; - TriGraph(cmph_uint32 nedges, cmph_uint32 nvertices); + TriGraph(uint32_t nedges, uint32_t nvertices); void AddEdge(const Edge& edge); - void RemoveEdge(cmph_uint32 edge_id); + void RemoveEdge(uint32_t edge_id); void ExtractEdgesAndClear(std::vector* edges); void DebugGraph() const; const std::vector& edges() const { return edges_; } - const std::vector& vertex_degree() const { return vertex_degree_; } - const std::vector& first_edge() const { return first_edge_; } + const std::vector& vertex_degree() const { return vertex_degree_; } + const std::vector& first_edge() const { return first_edge_; } private: - cmph_uint32 nedges_; // total number of edges + uint32_t nedges_; // total number of edges std::vector edges_; std::vector next_edge_; // for implementing removal - std::vector first_edge_; // the first edge for this vertex - std::vector 
vertex_degree_; // number of edges for this vertex + std::vector first_edge_; // the first edge for this vertex + std::vector vertex_degree_; // number of edges for this vertex }; } // namespace cxxmph diff --git a/cxxmph/trigraph_test.cc b/cxxmph/trigraph_test.cc deleted file mode 100644 index 6220138..0000000 --- a/cxxmph/trigraph_test.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include - -#include "trigraph.h" - -using cxxmph::TriGraph; - -int main(int argc, char** argv) { - TriGraph g(4, 2); - g.AddEdge(TriGraph::Edge(0, 1, 2)); - g.AddEdge(TriGraph::Edge(1, 3, 2)); - assert(g.vertex_degree()[0] == 1); - assert(g.vertex_degree()[1] == 2); - assert(g.vertex_degree()[2] == 2); - assert(g.vertex_degree()[3] == 1); - g.RemoveEdge(0); - assert(g.vertex_degree()[0] == 0); - assert(g.vertex_degree()[1] == 1); - assert(g.vertex_degree()[2] == 1); - assert(g.vertex_degree()[3] == 1); - std::vector edges; - g.ExtractEdgesAndClear(&edges); -} diff --git a/src/Makefile.am b/src/Makefile.am index c44d832..7593321 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,4 +1,5 @@ bin_PROGRAMS = cmph +check_PROGRAMS = cmph_benchmark_test lib_LTLIBRARIES = libcmph.la include_HEADERS = cmph.h cmph_types.h cmph_time.h chd_ph.h libcmph_la_SOURCES = hash.h hash.c \ @@ -30,3 +31,6 @@ libcmph_la_LDFLAGS = -version-info 0:0:0 cmph_SOURCES = main.c wingetopt.h wingetopt.c cmph_LDADD = libcmph.la + +cmph_benchmark_test_SOURCES = cmph_benchmark_test.cc +cmph_benchmark_test_LDADD = libcmph.la diff --git a/src/cmph_benchmark.c b/src/cmph_benchmark.c index 9d7c9da..b63bb84 100644 --- a/src/cmph_benchmark.c +++ b/src/cmph_benchmark.c @@ -1,3 +1,5 @@ +// A simple benchmark tool around getrusage + #include #include #include @@ -42,17 +44,19 @@ int timeval_subtract ( benchmark_t* find_benchmark(const char* name) { benchmark_t* benchmark = global_benchmarks; - while (benchmark->name != NULL) if (strcmp(benchmark->name, name) != 0) break; - if (!benchmark->name) return NULL; + while (benchmark && 
benchmark->name != NULL) { + if (strcmp(benchmark->name, name) == 0) break; + ++benchmark; + } + if (!benchmark || !benchmark->name) return NULL; return benchmark; } int global_benchmarks_length() { - benchmark_t* benchmark; + benchmark_t* benchmark = global_benchmarks; int length = 0; - if (global_benchmarks == 0) return 0; - benchmark = global_benchmarks; - while (benchmark->name != NULL) ++length; + if (benchmark == NULL) return 0; + while (benchmark->name != NULL) ++length, ++benchmark; return length; } @@ -62,8 +66,11 @@ void bm_register(const char* name, void (*func)(int), int iters) { benchmark.name = name; benchmark.func = func; assert(!find_benchmark(name)); - global_benchmarks = realloc(global_benchmarks, length + 1); + global_benchmarks = realloc( + global_benchmarks, (length + 2)*sizeof(benchmark_t)); global_benchmarks[length] = benchmark; + memset(&benchmark, 0, sizeof(benchmark_t)); // pivot + global_benchmarks[length + 1] = benchmark; } void bm_start(const char* name) { @@ -71,6 +78,7 @@ void bm_start(const char* name) { struct rusage rs; benchmark = find_benchmark(name); + assert(benchmark); int ret = getrusage(RUSAGE_SELF, &rs); if (ret != 0) { perror("rusage failed"); @@ -98,6 +106,19 @@ void bm_end(const char* name) { struct timeval stime; timeval_subtract(&stime, &benchmark->end.ru_stime, &benchmark->begin.ru_stime); - printf("User cpu time used: %ld.%6ld\n", utime.tv_sec, utime.tv_usec); - printf("System cpu time used: %ld.%6ld\n", stime.tv_sec, stime.tv_usec); + printf("Benchmark: %s\n", benchmark->name); + printf("User time used : %ld.%6ld\n", utime.tv_sec, utime.tv_usec); + printf("System time used: %ld.%6ld\n", stime.tv_sec, stime.tv_usec); + printf("Wall time used : %ld.%6ld\n", stime.tv_sec, stime.tv_usec); + printf("\n"); } + +void run_benchmarks(int argc, char** argv) { + benchmark_t* benchmark = global_benchmarks; + while (benchmark && benchmark->name != NULL) { + bm_start(benchmark->name); + bm_end(benchmark->name); + ++benchmark; + } 
+} + diff --git a/src/cmph_benchmark.h b/src/cmph_benchmark.h index f987ce1..bd0eb78 100644 --- a/src/cmph_benchmark.h +++ b/src/cmph_benchmark.h @@ -9,7 +9,7 @@ extern "C" { #endif -#define BM_REGISTER(func, iters) bm_register(##func, func, iters); +#define BM_REGISTER(func, iters) bm_register(#func, func, iters) void bm_register(const char* name, void (*func)(int), int iters); void run_benchmarks(int argc, char** argv); diff --git a/src/cmph_benchmark_test.cc b/src/cmph_benchmark_test.cc new file mode 100644 index 0000000..9ea3193 --- /dev/null +++ b/src/cmph_benchmark_test.cc @@ -0,0 +1,22 @@ +#include // for sleep +#include + +#include "cmph_benchmark.h" + +void bm_sleep(int iters) { + sleep(1); +} + +void bm_increment(int iters) { + int i, v = 0; + for (i = 0; i < INT_MAX; ++i) { + v += i; + } +} + +int main(int argc, char** argv) { + BM_REGISTER(bm_sleep, 1); + BM_REGISTER(bm_increment, 1); + run_benchmarks(argc, argv); +} + From b2da526497644062b2ba76f451cc0224579cb1f7 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 13 Feb 2011 23:32:50 -0200 Subject: [PATCH 23/89] Dumping cmph_uint32. 
--- src/bm_numbers.c | 12 ++++++++++++ ...{cmph_benchmark_test.cc => cmph_benchmark_test.c} | 0 2 files changed, 12 insertions(+) create mode 100644 src/bm_numbers.c rename src/{cmph_benchmark_test.cc => cmph_benchmark_test.c} (100%) diff --git a/src/bm_numbers.c b/src/bm_numbers.c new file mode 100644 index 0000000..4fa23d9 --- /dev/null +++ b/src/bm_numbers.c @@ -0,0 +1,12 @@ +#include "cmph.h" +#include "cmph_benchmark.h" + +void bm_bdz_numbers(int iters) { + cmph_config_t config; + config.algo = CMPH_BMZ; + + +int main(int argc, char** argv) { + run_benchmarks(argc, argv); +} + diff --git a/src/cmph_benchmark_test.cc b/src/cmph_benchmark_test.c similarity index 100% rename from src/cmph_benchmark_test.cc rename to src/cmph_benchmark_test.c From d0eb54d0301137ce70f8f8dcf5f29dea5eee482f Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 15 Feb 2011 14:49:08 -0500 Subject: [PATCH 24/89] Finishing benchmarks. --- src/Makefile.am | 9 +++- src/bdz.c | 5 +-- src/bm_numbers.c | 98 ++++++++++++++++++++++++++++++++++++++--- src/cmph_benchmark.c | 11 +++-- src/jenkins_hash.c | 2 +- src/linear_string_map.c | 68 ++++++++++++++++++++++++++++ src/linear_string_map.h | 13 ++++++ 7 files changed, 191 insertions(+), 15 deletions(-) create mode 100644 src/linear_string_map.c create mode 100644 src/linear_string_map.h diff --git a/src/Makefile.am b/src/Makefile.am index 7593321..4b6e5b4 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,5 +1,6 @@ bin_PROGRAMS = cmph check_PROGRAMS = cmph_benchmark_test +noinst_PROGRAMS = bm_numbers lib_LTLIBRARIES = libcmph.la include_HEADERS = cmph.h cmph_types.h cmph_time.h chd_ph.h libcmph_la_SOURCES = hash.h hash.c \ @@ -24,7 +25,8 @@ libcmph_la_SOURCES = hash.h hash.c \ select.h select.c select_lookup_tables.h \ compressed_seq.h compressed_seq.c \ compressed_rank.h compressed_rank.c \ - cmph_benchmark.h cmph_benchmark.cc \ + linear_string_map.h linear_string_map.c \ + cmph_benchmark.h cmph_benchmark.c \ cmph_time.h 
libcmph_la_LDFLAGS = -version-info 0:0:0 @@ -32,5 +34,8 @@ libcmph_la_LDFLAGS = -version-info 0:0:0 cmph_SOURCES = main.c wingetopt.h wingetopt.c cmph_LDADD = libcmph.la -cmph_benchmark_test_SOURCES = cmph_benchmark_test.cc +cmph_benchmark_test_SOURCES = cmph_benchmark_test.c cmph_benchmark_test_LDADD = libcmph.la + +bm_numbers_SOURCES = bm_numbers.c +bm_numbers_LDADD = libcmph.la diff --git a/src/bdz.c b/src/bdz.c index 1c49c9d..e129c51 100755 --- a/src/bdz.c +++ b/src/bdz.c @@ -178,7 +178,9 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que }; }; DEBUGP("Queue head %d Queue tail %d\n", queue_head, queue_tail); + #ifdef DEBUG bdz_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4); + #endif while(queue_tail!=queue_head){ curr_edge=queue[queue_tail++]; bdz_remove_edge(graph3,curr_edge); @@ -444,7 +446,6 @@ static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t SETBIT(marked_vertices, v2); } SETVALUE1(bdz->g, v0, (6-(GETVALUE(bdz->g, v1) + GETVALUE(bdz->g,v2)))%3); - if (v0 == 291) fprintf(stderr, "Vertex 291 updated at case 1\n"); SETBIT(marked_vertices, v0); } else if(!GETBIT(marked_vertices, v1)) { if(!GETBIT(marked_vertices, v2)) @@ -453,11 +454,9 @@ static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t SETBIT(marked_vertices, v2); } SETVALUE1(bdz->g, v1, (7-(GETVALUE(bdz->g, v0)+GETVALUE(bdz->g, v2)))%3); - if (v1 == 291) fprintf(stderr, "Vertex 291 updated at case 1\n"); SETBIT(marked_vertices, v1); }else { SETVALUE1(bdz->g, v2, (8-(GETVALUE(bdz->g,v0)+GETVALUE(bdz->g, v1)))%3); - if (v2 == 291) fprintf(stderr, "Vertex 291 updated at case 1\n"); SETBIT(marked_vertices, v2); } DEBUGP("A:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2)); diff --git a/src/bm_numbers.c b/src/bm_numbers.c index 4fa23d9..4d464c0 100644 --- a/src/bm_numbers.c +++ b/src/bm_numbers.c @@ -1,12 +1,100 @@ +#include +#include + +#include 
"bitbool.h" #include "cmph.h" #include "cmph_benchmark.h" +#include "linear_string_map.h" -void bm_bdz_numbers(int iters) { - cmph_config_t config; - config.algo = CMPH_BMZ; +// Generates a vector with random unique 32 bits integers +cmph_uint32* random_numbers_vector_new(cmph_uint32 size) { + cmph_uint32 i = 0; + cmph_uint32 dup_bits = sizeof(cmph_uint32)*size*8; + char* dup = (char*)malloc(dup_bits/8); + cmph_uint32* vec = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*size); + memset(dup, 0, dup_bits/8); + for (i = 0; i < size; ++i) { + cmph_uint32 v = random(); + while (GETBIT(dup, v % dup_bits)) { v = random(); } + SETBIT(dup, v % dup_bits); + vec[i] = v; + fprintf(stderr, "v[%u] = %u\n", i, vec[i]); + } + free(dup); + return vec; +} +static cmph_uint32 g_numbers_len = 0; +static cmph_uint32 *g_numbers = NULL; +static lsmap_t *g_created_mphf = NULL; + +void bm_create(CMPH_ALGO algo, int iters) { + cmph_uint32 i = 0; + cmph_io_adapter_t* source = NULL; + cmph_config_t* config = NULL; + cmph_t* mphf = NULL; + + if (iters > g_numbers_len) { + fprintf(stderr, "No input with proper size."); + exit(-1); + } + + source = cmph_io_struct_vector_adapter( + (void*)g_numbers, sizeof(cmph_uint32), + 0, sizeof(cmph_uint32), iters); + config = cmph_config_new(source); + cmph_config_set_algo(config, algo); + mphf = cmph_new(config); + if (!mphf) { + fprintf(stderr, "Failed to create mphf for algorithm %s with %u keys", + cmph_names[algo], iters); + exit(-1); + } + cmph_config_destroy(config); + cmph_io_struct_vector_adapter_destroy(source); -int main(int argc, char** argv) { - run_benchmarks(argc, argv); + char mphf_name[128]; + snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters); + lsmap_append(g_created_mphf, strdup(mphf_name), mphf); +} + +void bm_search(CMPH_ALGO algo, int iters) { + int i = 0; + char mphf_name[128]; + cmph_t* mphf = NULL; + + snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters); + mphf = lsmap_search(g_created_mphf, mphf_name); + for (i = 0; i < 
iters * 100; ++i) { + cmph_uint32 pos = random() % iters; + fprintf(stderr, "Looking for key %u at pos %u\n", g_numbers[pos], pos); + const char* buf = (const char*)(g_numbers + pos); + cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32)); + fprintf(stderr, "Found h %u value %u\n", h, g_numbers[h]); + if (h != pos) { + fprintf(stderr, "Buggy mphf\n"); + } + } +} + +#define DECLARE_ALGO(algo) \ + void bm_create_ ## algo(int iters) { bm_create(algo, iters); } \ + void bm_search_ ## algo(int iters) { bm_search(algo, iters); } + +DECLARE_ALGO(CMPH_BDZ); + +int main(int argc, char** argv) { + g_numbers_len = 20; + g_numbers = random_numbers_vector_new(g_numbers_len); + g_created_mphf = lsmap_new(); + + BM_REGISTER(bm_create_CMPH_BDZ, 20); + BM_REGISTER(bm_search_CMPH_BDZ, 20); + run_benchmarks(argc, argv); + + free(g_numbers); + lsmap_foreach_key(g_created_mphf, free); + lsmap_foreach_value(g_created_mphf, cmph_destroy); + lsmap_destroy(g_created_mphf); } diff --git a/src/cmph_benchmark.c b/src/cmph_benchmark.c index b63bb84..073f937 100644 --- a/src/cmph_benchmark.c +++ b/src/cmph_benchmark.c @@ -3,12 +3,14 @@ #include #include #include +#include #include "cmph_benchmark.h" typedef struct { const char* name; void (*func)(int); + int iters; struct rusage begin; struct rusage end; } benchmark_t; @@ -65,6 +67,7 @@ void bm_register(const char* name, void (*func)(int), int iters) { int length = global_benchmarks_length(); benchmark.name = name; benchmark.func = func; + benchmark.iters = iters; assert(!find_benchmark(name)); global_benchmarks = realloc( global_benchmarks, (length + 2)*sizeof(benchmark_t)); @@ -85,7 +88,7 @@ void bm_start(const char* name) { exit(-1); } benchmark->begin = rs; - (*benchmark->func)(1); + (*benchmark->func)(benchmark->iters); } void bm_end(const char* name) { @@ -107,9 +110,9 @@ void bm_end(const char* name) { timeval_subtract(&stime, &benchmark->end.ru_stime, &benchmark->begin.ru_stime); printf("Benchmark: %s\n", benchmark->name); - 
printf("User time used : %ld.%6ld\n", utime.tv_sec, utime.tv_usec); - printf("System time used: %ld.%6ld\n", stime.tv_sec, stime.tv_usec); - printf("Wall time used : %ld.%6ld\n", stime.tv_sec, stime.tv_usec); + printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); + printf("System time used: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); + printf("Wall time used : %ld.%06ld\n", stime.tv_sec, stime.tv_usec); printf("\n"); } diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c index 5d4e807..8fd3836 100644 --- a/src/jenkins_hash.c +++ b/src/jenkins_hash.c @@ -7,7 +7,7 @@ #include #include -#define DEBUG +// #define DEBUG #include "debug.h" #include "MurmurHash2.h" diff --git a/src/linear_string_map.c b/src/linear_string_map.c new file mode 100644 index 0000000..4390c5b --- /dev/null +++ b/src/linear_string_map.c @@ -0,0 +1,68 @@ +#include +#include +#include + +#include "linear_string_map.h" + +struct __linear_string_map_t { + const char *key; + void *value; + struct __linear_string_map_t* next; +}; + +lsmap_t *lsmap_new() { + lsmap_t* lsmap = (lsmap_t*)malloc(sizeof(lsmap_t)); + lsmap->key = "dummy node"; + lsmap->next = NULL; + return lsmap; +} + +int lsmap_size(lsmap_t *lsmap) { + int size = 0; + while (lsmap->next != NULL) ++size; + return size; +} + +void lsmap_append(lsmap_t *lsmap, const char *key, void *value) { + while (lsmap->next != NULL) lsmap = lsmap->next; + lsmap->next = (lsmap_t*)malloc(sizeof(lsmap_t)); + lsmap->key = key; + lsmap->value = value; + lsmap = lsmap->next; + lsmap->key = "dummy node"; + lsmap->next = NULL; +} + +void* lsmap_search(lsmap_t *lsmap, const char *key) { + while (lsmap->next != NULL) { + if (strcmp(lsmap->key, key) == 0) { + return lsmap->value; + } + lsmap = lsmap->next; + } + return NULL; +} + +void lsmap_foreach_key(lsmap_t *lsmap, void (*f)(const char*)) { + while (lsmap->next != NULL) { + f(lsmap->key); + lsmap = lsmap->next; + } +} + +void lsmap_foreach_value(lsmap_t *lsmap, void (*f)(void*)) { + while 
(lsmap->next != NULL) { + f(lsmap->value); + lsmap = lsmap->next; + } +} + +void lsmap_destroy(lsmap_t *lsmap) { + while (lsmap->next != NULL) { + lsmap_t* freeme = lsmap; + lsmap = lsmap->next; + free(freeme); + } + free(lsmap); +} + diff --git a/src/linear_string_map.h b/src/linear_string_map.h new file mode 100644 index 0000000..2e2287e --- /dev/null +++ b/src/linear_string_map.h @@ -0,0 +1,13 @@ +// A simple linked list based dynamic sized associative map from const char* to +// void*. Designed to maximize ease of use instead of performance. Should be +// used in benchmarks and tests only, not to be distributed with the cmph +// runtime headers. + +typedef struct __linear_string_map_t lsmap_t; + +lsmap_t *lsmap_new(); +void lsmap_append(lsmap_t *lsmap, const char *key, void *value); +void* lsmap_search(lsmap_t *lsmap, const char *key); +void lsmap_foreach_key(lsmap_t* lsmap, void (*f)(const char*)); +void lsmap_foreach_value(lsmap_t* lsmap, void (*f)(void*)); +void lsmap_destroy(lsmap_t* lsmap); From 4fc0c52c568ed0bc4b1f75f96a9c227a71780485 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 15 Feb 2011 20:46:05 -0800 Subject: [PATCH 25/89] Benchmark works. 
--- src/bdz.c | 6 +++-- src/bm_numbers.c | 53 ++++++++++++++++++++++++++++++++------------ src/bmz.c | 19 ++++++++++------ src/cmph.c | 2 +- src/cmph_benchmark.c | 1 - src/graph.c | 4 ++-- 6 files changed, 58 insertions(+), 27 deletions(-) diff --git a/src/bdz.c b/src/bdz.c index e129c51..6b3f80e 100755 --- a/src/bdz.c +++ b/src/bdz.c @@ -9,7 +9,7 @@ #include #include #include -#define DEBUG +// #define DEBUG #include "debug.h" #define UNASSIGNED 3U #define NULL_EDGE 0xffffffff @@ -115,10 +115,12 @@ static void bdz_dump_graph(bdz_graph3_t* graph3, cmph_uint32 nedges, cmph_uint32 graph3->edges[i].next_edges[1],graph3->edges[i].next_edges[2]); }; + #ifdef DEBUG for(i=0;ifirst_edge[i]); }; + #endif }; static void bdz_remove_edge(bdz_graph3_t * graph3, cmph_uint32 curr_edge) @@ -408,7 +410,7 @@ static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t que h0 = hl[0] % bdz->r; h1 = hl[1] % bdz->r + bdz->r; h2 = hl[2] % bdz->r + (bdz->r << 1); - DEBUGP("Key: %s (%u %u %u)\n", key, h0, h1, h2); + DEBUGP("Key: %.*s (%u %u %u)\n", keylen, key, h0, h1, h2); mph->key_source->dispose(mph->key_source->data, key, keylen); bdz_add_edge(graph3,h0,h1,h2); } diff --git a/src/bm_numbers.c b/src/bm_numbers.c index 4d464c0..2e86de4 100644 --- a/src/bm_numbers.c +++ b/src/bm_numbers.c @@ -18,14 +18,26 @@ cmph_uint32* random_numbers_vector_new(cmph_uint32 size) { while (GETBIT(dup, v % dup_bits)) { v = random(); } SETBIT(dup, v % dup_bits); vec[i] = v; - fprintf(stderr, "v[%u] = %u\n", i, vec[i]); } free(dup); return vec; } + +int cmph_uint32_cmp(const void *a, const void *b) { + return *(const cmph_uint32*)a - *(const cmph_uint32*)b; +} + +char* create_lsmap_key(CMPH_ALGO algo, int iters) { + char mphf_name[128]; + snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters); + return strdup(mphf_name); +} + static cmph_uint32 g_numbers_len = 0; static cmph_uint32 *g_numbers = NULL; static lsmap_t *g_created_mphf = NULL; +static lsmap_t *g_expected_probes = NULL; 
+static lsmap_t *g_mphf_probes = NULL; void bm_create(CMPH_ALGO algo, int iters) { cmph_uint32 i = 0; @@ -51,11 +63,7 @@ void bm_create(CMPH_ALGO algo, int iters) { } cmph_config_destroy(config); cmph_io_struct_vector_adapter_destroy(source); - - - char mphf_name[128]; - snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters); - lsmap_append(g_created_mphf, strdup(mphf_name), mphf); + lsmap_append(g_created_mphf, create_lsmap_key(algo, iters), mphf); } void bm_search(CMPH_ALGO algo, int iters) { @@ -63,35 +71,52 @@ void bm_search(CMPH_ALGO algo, int iters) { char mphf_name[128]; cmph_t* mphf = NULL; + snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters); mphf = lsmap_search(g_created_mphf, mphf_name); + + cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); + cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); + for (i = 0; i < iters * 100; ++i) { cmph_uint32 pos = random() % iters; - fprintf(stderr, "Looking for key %u at pos %u\n", g_numbers[pos], pos); const char* buf = (const char*)(g_numbers + pos); cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32)); - fprintf(stderr, "Found h %u value %u\n", h, g_numbers[h]); - if (h != pos) { - fprintf(stderr, "Buggy mphf\n"); - } + ++count[pos]; + ++hash_count[h]; } + + // Verify correctness later. 
+ lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count); + lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count); } +void verify() { } + #define DECLARE_ALGO(algo) \ void bm_create_ ## algo(int iters) { bm_create(algo, iters); } \ void bm_search_ ## algo(int iters) { bm_search(algo, iters); } +DECLARE_ALGO(CMPH_CHM); +DECLARE_ALGO(CMPH_BMZ); DECLARE_ALGO(CMPH_BDZ); int main(int argc, char** argv) { - g_numbers_len = 20; + g_numbers_len = 1000 * 1000; g_numbers = random_numbers_vector_new(g_numbers_len); g_created_mphf = lsmap_new(); + g_expected_probes = lsmap_new(); + g_mphf_probes = lsmap_new(); - BM_REGISTER(bm_create_CMPH_BDZ, 20); - BM_REGISTER(bm_search_CMPH_BDZ, 20); + BM_REGISTER(bm_create_CMPH_CHM, 1000 * 1000); + BM_REGISTER(bm_search_CMPH_CHM, 1000 * 1000); + BM_REGISTER(bm_create_CMPH_BMZ, 1000 * 1000); + BM_REGISTER(bm_search_CMPH_BMZ, 1000 * 1000); + BM_REGISTER(bm_create_CMPH_BDZ, 1000 * 1000); + BM_REGISTER(bm_search_CMPH_BDZ, 1000 * 1000); run_benchmarks(argc, argv); + verify(); free(g_numbers); lsmap_foreach_key(g_created_mphf, free); lsmap_foreach_value(g_created_mphf, cmph_destroy); diff --git a/src/bmz.c b/src/bmz.c index 51798a1..002f8dc 100644 --- a/src/bmz.c +++ b/src/bmz.c @@ -12,7 +12,7 @@ #include #include -//#define DEBUG +// #define DEBUG #include "debug.h" static int bmz_gen_edges(cmph_config_t *mph); @@ -162,13 +162,19 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) } free(used_edges); free(visited); - }while(restart_mapping && iterations_map > 0); + } while(restart_mapping && iterations_map > 0); graph_destroy(bmz->graph); bmz->graph = NULL; if (iterations_map == 0) { return NULL; } + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < bmz->n; ++i) fprintf(stderr, "%u ", bmz->g[i]); + fprintf(stderr, "\n"); + #endif + mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = mph->algo; bmzf = (bmz_data_t *)malloc(sizeof(bmz_data_t)); @@ -421,19 +427,18 @@ static int bmz_gen_edges(cmph_config_t *mph) 
char *key = NULL; mph->key_source->read(mph->key_source->data, &key, &keylen); -// if (key == NULL)fprintf(stderr, "key = %s -- read BMZ\n", key); h1 = hash(bmz->hashes[0], key, keylen) % bmz->n; h2 = hash(bmz->hashes[1], key, keylen) % bmz->n; if (h1 == h2) if (++h2 >= bmz->n) h2 = 0; + DEBUGP("key: %.*s h1: %u h2: %u\n", keylen, key, h1, h2); if (h1 == h2) { if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); mph->key_source->dispose(mph->key_source->data, key, keylen); return 0; } - //DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key); + DEBUGP("Adding edge: %u -> %u for key %.*s\n", h1, h2, keylen, key); mph->key_source->dispose(mph->key_source->data, key, keylen); -// fprintf(stderr, "key = %s -- dispose BMZ\n", key); multiple_edges = graph_contains_edge(bmz->graph, h1, h2); if (mph->verbosity && multiple_edges) fprintf(stderr, "A non simple graph was generated\n"); if (multiple_edges) return 0; // checking multiple edge restriction. @@ -524,9 +529,9 @@ cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) bmz_data_t *bmz = mphf->data; cmph_uint32 h1 = hash(bmz->hashes[0], key, keylen) % bmz->n; cmph_uint32 h2 = hash(bmz->hashes[1], key, keylen) % bmz->n; - DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + DEBUGP("key: %.*s h1: %u h2: %u\n", keylen, key, h1, h2); if (h1 == h2 && ++h2 > bmz->n) h2 = 0; - DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, bmz->g[h1], bmz->g[h2], bmz->m); + DEBUGP("key: %.*s g[h1]: %u g[h2]: %u edges: %u\n", keylen, key, bmz->g[h1], bmz->g[h2], bmz->m); return bmz->g[h1] + bmz->g[h2]; } void bmz_destroy(cmph_t *mphf) diff --git a/src/cmph.c b/src/cmph.c index cba735f..b0c33bf 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -13,7 +13,7 @@ #include #include #include -//#define DEBUG +// #define DEBUG #include "debug.h" const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", "chd", NULL }; diff --git a/src/cmph_benchmark.c b/src/cmph_benchmark.c index 
073f937..f7177a3 100644 --- a/src/cmph_benchmark.c +++ b/src/cmph_benchmark.c @@ -112,7 +112,6 @@ void bm_end(const char* name) { printf("Benchmark: %s\n", benchmark->name); printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); printf("System time used: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); - printf("Wall time used : %ld.%06ld\n", stime.tv_sec, stime.tv_usec); printf("\n"); } diff --git a/src/graph.c b/src/graph.c index c29fd8b..2e9ddb7 100644 --- a/src/graph.c +++ b/src/graph.c @@ -8,7 +8,7 @@ #include "vstack.h" #include "bitbool.h" -//#define DEBUG +// #define DEBUG #include "debug.h" /* static const cmph_uint8 bitmask[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; */ @@ -176,7 +176,7 @@ static cmph_uint8 find_degree1_edge(graph_t *g, cmph_uint32 v, cmph_uint8 *delet { cmph_uint32 edge = g->first[v]; cmph_uint8 found = 0; - DEBUGP("Checking degree of vertex %u\n", v); + DEBUGP("Checking degree of vertex %u connected to edge %u\n", v, edge); if (edge == EMPTY) return 0; else if (!(GETBIT(deleted, abs_edge(edge, 0)))) { From 8355e2e1b8694406fbd9f96808dc2bfbfd0d1a17 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Fri, 18 Feb 2011 14:15:10 -0800 Subject: [PATCH 26/89] Added a benchmark to the C code. 
--- src/bm_numbers.c | 12 +++++++++--- src/cmph_benchmark.c | 1 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/src/bm_numbers.c b/src/bm_numbers.c index 2e86de4..7ff8b9a 100644 --- a/src/bm_numbers.c +++ b/src/bm_numbers.c @@ -97,8 +97,10 @@ void verify() { } void bm_create_ ## algo(int iters) { bm_create(algo, iters); } \ void bm_search_ ## algo(int iters) { bm_search(algo, iters); } -DECLARE_ALGO(CMPH_CHM); DECLARE_ALGO(CMPH_BMZ); +DECLARE_ALGO(CMPH_CHM); +DECLARE_ALGO(CMPH_BRZ); +DECLARE_ALGO(CMPH_FCH); DECLARE_ALGO(CMPH_BDZ); int main(int argc, char** argv) { @@ -108,10 +110,14 @@ int main(int argc, char** argv) { g_expected_probes = lsmap_new(); g_mphf_probes = lsmap_new(); - BM_REGISTER(bm_create_CMPH_CHM, 1000 * 1000); - BM_REGISTER(bm_search_CMPH_CHM, 1000 * 1000); BM_REGISTER(bm_create_CMPH_BMZ, 1000 * 1000); BM_REGISTER(bm_search_CMPH_BMZ, 1000 * 1000); + BM_REGISTER(bm_create_CMPH_CHM, 1000 * 1000); + BM_REGISTER(bm_search_CMPH_CHM, 1000 * 1000); +// BM_REGISTER(bm_create_CMPH_BRZ, 1000 * 1000); +// BM_REGISTER(bm_search_CMPH_BRZ, 1000 * 1000); + BM_REGISTER(bm_create_CMPH_FCH, 1000 * 1000); + BM_REGISTER(bm_search_CMPH_FCH, 1000 * 1000); BM_REGISTER(bm_create_CMPH_BDZ, 1000 * 1000); BM_REGISTER(bm_search_CMPH_BDZ, 1000 * 1000); run_benchmarks(argc, argv); diff --git a/src/cmph_benchmark.c b/src/cmph_benchmark.c index f7177a3..a67f78b 100644 --- a/src/cmph_benchmark.c +++ b/src/cmph_benchmark.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "cmph_benchmark.h" From 05eaf15d5396983d30ad337ac590ad68746183e7 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Fri, 18 Feb 2011 14:15:24 -0800 Subject: [PATCH 27/89] Added a benchmark to the C++ code. 
--- cxxmph/Makefile.am | 9 ++-- cxxmph/MurmurHash2.h | 1 + cxxmph/benchmark.cc | 104 +++++++++++++++++++++++++++++++++++++++++++ cxxmph/benchmark.h | 31 +++++++++++++ cxxmph/bm_numbers.cc | 52 ++++++++++++++++++++++ cxxmph/bm_urls.cc | 71 ++++++++++++++++++++++++----- cxxmph/mphtable.h | 4 +- 7 files changed, 256 insertions(+), 16 deletions(-) create mode 100644 cxxmph/benchmark.cc create mode 100644 cxxmph/benchmark.h create mode 100644 cxxmph/bm_numbers.cc diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index f18fd6e..6149a53 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,10 +1,10 @@ -check_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test bm_urls -noinst_PROGRAMS = bm_urls +check_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test +noinst_PROGRAMS = bm_numbers bm_urls bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cxxmph_hash.h stringpiece.h -libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cxxmph_hash.h stringpiece.h +libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cxxmph_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cmph_hash_map_test_LDADD = libcxxmph.la @@ -16,6 +16,9 @@ mphtable_test_SOURCES = mphtable_test.cc trigraph_test_LDADD = libcxxmph.la trigraph_test_SOURCES = trigraph_test.cc +bm_numbers_LDADD = libcxxmph.la +bm_numbers_SOURCES = bm_numbers.cc + bm_urls_LDADD = libcxxmph.la bm_urls_SOURCES = bm_urls.cc diff --git a/cxxmph/MurmurHash2.h b/cxxmph/MurmurHash2.h index d817c7b..0d318a3 100644 --- a/cxxmph/MurmurHash2.h +++ b/cxxmph/MurmurHash2.h @@ -17,6 +17,7 @@ namespace cxxmph { +inline // not measured, for making compilation easier only unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) { // 'm' and 'r' are mixing constants generated offline. 
diff --git a/cxxmph/benchmark.cc b/cxxmph/benchmark.cc new file mode 100644 index 0000000..9a45491 --- /dev/null +++ b/cxxmph/benchmark.cc @@ -0,0 +1,104 @@ +#include "benchmark.h" + +#include +#include +#include + +#include +#include + +using std::cerr; +using std::endl; +using std::string; +using std::vector; + +namespace { + +/* Subtract the `struct timeval' values X and Y, + storing the result in RESULT. + Return 1 if the difference is negative, otherwise 0. */ +int timeval_subtract ( + struct timeval *result, struct timeval *x, struct timeval* y) { + /* Perform the carry for the later subtraction by updating y. */ + if (x->tv_usec < y->tv_usec) { + int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; + y->tv_usec -= 1000000 * nsec; + y->tv_sec += nsec; + } + if (x->tv_usec - y->tv_usec > 1000000) { + int nsec = (x->tv_usec - y->tv_usec) / 1000000; + y->tv_usec += 1000000 * nsec; + y->tv_sec -= nsec; + } + + /* Compute the time remaining to wait. + tv_usec is certainly positive. */ + result->tv_sec = x->tv_sec - y->tv_sec; + result->tv_usec = x->tv_usec - y->tv_usec; + + /* Return 1 if result is negative. 
*/ + return x->tv_sec < y->tv_sec; +} + +struct rusage getrusage_or_die() { + struct rusage rs; + int ret = getrusage(RUSAGE_SELF, &rs); + if (ret != 0) { + cerr << "rusage failed: " << strerror(errno) << endl; + exit(-1); + } + return rs; +} + +#ifdef HAVE_CXA_DEMANGLE +string demangle(const string& name) { + char buf[1024]; + unsigned int size = 1024; + int status; + char* res = abi::__cxa_demangle( + name.c_str(), buf, &size, &status); + return res; +} +#else +string demangle(const string& name) { return name; } +#endif + + +static vector g_benchmarks; + +} // anonymous namespace + +namespace cxxmph { + +/* static */ void Benchmark::Register(Benchmark* bm) { + if (bm->name().empty()) { + string name = demangle(typeid(*bm).name()); + bm->set_name(name); + } + g_benchmarks.push_back(bm); +} + +/* static */ void Benchmark::RunAll() { + for (auto it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { + (*it)->MeasureRun(); + delete *it; + } +} + +void Benchmark::MeasureRun() { + struct rusage begin = getrusage_or_die(); + Run(iters_); + struct rusage end = getrusage_or_die(); + + struct timeval utime; + timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime); + struct timeval stime; + timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime); + + printf("Benchmark: %s\n", name().c_str()); + printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); + printf("System time used: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); + printf("\n"); +} + +} // namespace cxxmph diff --git a/cxxmph/benchmark.h b/cxxmph/benchmark.h new file mode 100644 index 0000000..f0629e4 --- /dev/null +++ b/cxxmph/benchmark.h @@ -0,0 +1,31 @@ +#ifndef __CXXMPH_BENCHMARK_H__ +#define __CXXMPH_BENCHMARK_H__ + +#include +#include + +namespace cxxmph { + +class Benchmark { + public: + Benchmark(int iters = 1) : iters_(iters) { } + virtual void Run(int iters) = 0; + virtual ~Benchmark() { } + const std::string& name() { return name_; } + void set_name(const std::string& name) { name_ = 
name; } + + static void Register(Benchmark* bm); + static void RunAll(); + + protected: + int iters() { return iters_; } + + private: + int iters_; + std::string name_; + void MeasureRun(); +}; + +} // namespace cxxmph + +#endif diff --git a/cxxmph/bm_numbers.cc b/cxxmph/bm_numbers.cc new file mode 100644 index 0000000..40bef70 --- /dev/null +++ b/cxxmph/bm_numbers.cc @@ -0,0 +1,52 @@ +#include +#include + +#include "benchmark.h" +#include "mphtable.h" + +using std::set; +using std::vector; + +namespace cxxmph { +class BM_NumbersCreate : public Benchmark { + public: + BM_NumbersCreate(int iters = 1) : Benchmark(iters) { + set unique; + while (unique.size() < 1000 * 1000) { + int v = random(); + if (unique.find(v) == unique.end()) { + unique.insert(v); + random_unique_.push_back(v); + } + } + } + protected: + virtual void Run(int iters) { + SimpleMPHTable table; + table.Reset(random_unique_.begin(), random_unique_.end()); + } + std::vector random_unique_; +}; + +class BM_NumbersFind : public BM_NumbersCreate { + public: + BM_NumbersFind(int iters) : BM_NumbersCreate(iters) { table_.Reset(random_unique_.begin(), random_unique_.end()); } + virtual void Run(int iters) { + for (int i = 0; i < iters * 100; ++i) { + int pos = random() % random_unique_.size();; + int h = table_.index(pos); + } + } + private: + SimpleMPHTable table_; +}; + +} // namespace cxxmph + +using namespace cxxmph; + +int main(int argc, char** argv) { + Benchmark::Register(new BM_NumbersCreate()); + Benchmark::Register(new BM_NumbersFind(1000 * 1000)); + Benchmark::RunAll(); +} diff --git a/cxxmph/bm_urls.cc b/cxxmph/bm_urls.cc index 7d43e2f..916c725 100644 --- a/cxxmph/bm_urls.cc +++ b/cxxmph/bm_urls.cc @@ -1,21 +1,70 @@ -#include -#include #include +#include +#include +#include #include +#include -#include "mphtable.h" +#include "benchmark.h" +#include "cmph_hash_map.h" using std::ifstream; +using std::set; using std::string; using std::vector; -using cxxmph::SimpleMPHTable; + +namespace cxxmph { + 
+class BM_UrlsCreate : public Benchmark { + public: + BM_UrlsCreate(int iters = 1) : Benchmark(iters) { + ReadUrls(); + } + protected: + virtual void Run(int iters) { + BuildTable(); + } + void BuildTable() { + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + table_[*it] = it - urls_.begin(); + } + table_.pack(); + } + void ReadUrls() { + vector urls; + std::ifstream f("URLS100k"); + string buffer; + while(std::getline(f, buffer)) urls.push_back(buffer); + set unique(urls.begin(), urls.end()); + if (unique.size() != urls.size()) { + cerr << "Input file has repeated keys." << endl; + exit(-1); + } + urls_.swap(urls); + } + vector urls_; + cxxmph::cmph_hash_map table_; +}; + +class BM_UrlsFind : public BM_UrlsCreate { + public: + BM_UrlsFind(int iters = 1) : BM_UrlsCreate(iters) { ReadUrls(); BuildTable(); } + protected: + virtual void Run(int iters) { + for (int i = 0; i < iters * 100; ++i) { + int pos = random() % urls_.size();; + int h = table_[urls_[pos]]; + assert(h == pos); + } + } +}; + +} // namespace cxxmph + +using namespace cxxmph; int main(int argc, char** argv) { - vector urls; - std::ifstream f("URLS1k"); - string buffer; - while(std::getline(f, buffer)) urls.push_back(buffer); - - SimpleMPHTable table; - table.Reset(urls.begin(), urls.end()); + Benchmark::Register(new BM_UrlsCreate()); + Benchmark::Register(new BM_UrlsFind(1000 * 1000)); + Benchmark::RunAll(); } diff --git a/cxxmph/mphtable.h b/cxxmph/mphtable.h index a899a89..340b3db 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mphtable.h @@ -85,13 +85,13 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { n_ = 3*r_; k_ = 1U << b_; - cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; + // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; int iterations = 10; std::vector edges; std::vector queue; while (1) { - cerr << "Iterations missing: " << iterations << endl; + // cerr << "Iterations missing: " << iterations << endl; for (int i = 0; i < 3; ++i) 
hash_seed_[i] = random() % m_; // for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i; if (Mapping(begin, end, &edges, &queue)) break; From b0f3aaa3517b29beb5752a7b95ba63b97a13c04b Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sat, 14 May 2011 17:44:58 -0300 Subject: [PATCH 28/89] Reorganized tests. --- tests/Makefile.am | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/Makefile.am b/tests/Makefile.am index a0fe694..28ad0b5 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,4 +1,6 @@ -noinst_PROGRAMS = graph_tests packed_mphf_tests mphf_tests select_tests compressed_seq_tests compressed_rank_tests +TESTS = $(check_PROGRAMS) +check_PROGRAMS = graph_tests select_tests compressed_seq_tests compressed_rank_tests +noinst_PROGRAMS = packed_mphf_tests mphf_tests INCLUDES = -I../src/ From 0e6849792eee56edddfb542b8aac26a7bd3d7f1e Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 15 May 2011 12:29:24 -0300 Subject: [PATCH 29/89] Ready to roll. --- NEWSLOG.t2t | 4 ++++ configure.ac | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/NEWSLOG.t2t b/NEWSLOG.t2t index ec8e7b6..b74bf2a 100644 --- a/NEWSLOG.t2t +++ b/NEWSLOG.t2t @@ -5,6 +5,10 @@ News Log ---------------------------------------- +==News for version 1.1== + +Fixed a bug in the chd_pc algorithm and reorganized tests. + ==News for version 1.0== This is a bugfix only version, after which a revamp of the cmph code and diff --git a/configure.ac b/configure.ac index e2cbde0..c2c88c8 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ dnl Process this file with autoconf to produce a configure script. AC_INIT(Makefile.am) -AM_INIT_AUTOMAKE(cmph, 1.0) +AM_INIT_AUTOMAKE(cmph, 1.1) AM_CONFIG_HEADER(config.h) AC_CONFIG_MACRO_DIR([m4]) From 6adc2a4816cd15e34d7c6327de624929cb1778e6 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 15 May 2011 12:58:13 -0300 Subject: [PATCH 30/89] Added changes to README.t2t as well. 
--- README.t2t | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.t2t b/README.t2t index d94e70e..21d851f 100644 --- a/README.t2t +++ b/README.t2t @@ -88,6 +88,10 @@ The CMPH Library encapsulates the newest and more efficient algorithms in an eas ---------------------------------------- +==News for version 1.1== + +Fixed a bug in the chd_pc algorithm and reorganized tests. + ==News for version 1.0== This is a bugfix only version, after which a revamp of the cmph code and From 532ee999b98d63476987a939419c2f234d9e595b Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 15 May 2011 17:19:08 -0300 Subject: [PATCH 31/89] Moved benchmark code into tests directory. --- src/Makefile.am | 4 ---- tests/Makefile.am | 5 ++++- {src => tests}/cmph_benchmark_test.c | 0 3 files changed, 4 insertions(+), 5 deletions(-) rename {src => tests}/cmph_benchmark_test.c (100%) diff --git a/src/Makefile.am b/src/Makefile.am index 4b6e5b4..0ab079a 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,5 +1,4 @@ bin_PROGRAMS = cmph -check_PROGRAMS = cmph_benchmark_test noinst_PROGRAMS = bm_numbers lib_LTLIBRARIES = libcmph.la include_HEADERS = cmph.h cmph_types.h cmph_time.h chd_ph.h @@ -34,8 +33,5 @@ libcmph_la_LDFLAGS = -version-info 0:0:0 cmph_SOURCES = main.c wingetopt.h wingetopt.c cmph_LDADD = libcmph.la -cmph_benchmark_test_SOURCES = cmph_benchmark_test.c -cmph_benchmark_test_LDADD = libcmph.la - bm_numbers_SOURCES = bm_numbers.c bm_numbers_LDADD = libcmph.la diff --git a/tests/Makefile.am b/tests/Makefile.am index 28ad0b5..361c67b 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,5 +1,5 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = graph_tests select_tests compressed_seq_tests compressed_rank_tests +check_PROGRAMS = graph_tests select_tests compressed_seq_tests compressed_rank_tests cmph_benchmark_test noinst_PROGRAMS = packed_mphf_tests mphf_tests INCLUDES = -I../src/ @@ -21,3 +21,6 @@ compressed_seq_tests_LDADD = ../src/libcmph.la 
compressed_rank_tests_SOURCES = compressed_rank_tests.c compressed_rank_tests_LDADD = ../src/libcmph.la + +cmph_benchmark_test_SOURCES = cmph_benchmark_test.c +cmph_benchmark_test_LDADD = ../src/libcmph.la diff --git a/src/cmph_benchmark_test.c b/tests/cmph_benchmark_test.c similarity index 100% rename from src/cmph_benchmark_test.c rename to tests/cmph_benchmark_test.c From 4482c8f39b366d63794a4fd1c9b000c94640229b Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Sun, 15 May 2011 19:38:31 -0300 Subject: [PATCH 32/89] Conditional compilation of the cxxmph directory. --- Makefile.am | 4 ++-- acinclude.m4 | 8 ++++++++ configure.ac | 24 +++++++++++++++--------- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/Makefile.am b/Makefile.am index 0569dc0..cdf1447 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,5 +1,5 @@ -SUBDIRS = src tests examples cxxmph man -EXTRA_DIST = cmph.spec configure.ac cmph.pc.in +SUBDIRS = src tests examples man $(CXXMPH) +EXTRA_DIST = cmph.spec configure.ac cmph.pc.in LGPL-2 MPL-1.1 pkgconfigdir = $(libdir)/pkgconfig pkgconfig_DATA = cmph.pc diff --git a/acinclude.m4 b/acinclude.m4 index f216360..b49a92b 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -1,3 +1,11 @@ +AC_DEFUN([AC_ENABLE_CXXMPH], [AC_ARG_ENABLE([cxxmph], + [ --enable-cxxmph enable the c++ cxxmph library ], + [case "${enableval}" in + yes) cxxmph=true ;; + no) cxxmph=false ;; + *) AC_MSG_ERROR([bad value ${enableval} for --enable-cxxmph]) ;; + esac],[cxxmph=false])]) + AC_DEFUN([AC_CHECK_SPOON], [ AC_ARG_WITH(spoon, [ --with-spoon=SPOON this is inocuous, since the truth is that there is no spoon ]) AC_MSG_CHECKING(if there is spoon) diff --git a/configure.ac b/configure.ac index 7f0e2a2..ae62410 100644 --- a/configure.ac +++ b/configure.ac @@ -1,16 +1,16 @@ dnl Process this file with autoconf to produce a configure script. 
-AC_INIT(Makefile.am) +AC_INIT +AC_CONFIG_SRCDIR([Makefile.am]) AM_INIT_AUTOMAKE(cmph, 1.0) -AM_CONFIG_HEADER(config.h) +AC_CONFIG_HEADERS([config.h]) AC_CONFIG_MACRO_DIR([m4]) dnl Checks for programs. AC_PROG_AWK AC_PROG_CC -AC_PROG_CXX AC_PROG_INSTALL AC_PROG_LN_S -AC_PROG_LIBTOOL +LT_INIT AC_SYS_EXTRA_LARGEFILE if test "x$ac_cv_sys_largefile_CFLAGS" = "xno" ; then ac_cv_sys_largefile_CFLAGS="" @@ -25,16 +25,22 @@ CFLAGS="$CFLAGS $ac_cv_sys_largefile_CFLAGS" LDFLAGS="$LDFLAGS $ac_cv_sys_largefile_LDFLAGS" LIBS="$LIBS $ac_cv_sys_largefile_LIBS" - dnl Checks for headers AC_CHECK_HEADERS([getopt.h math.h]) dnl Checks for libraries. -AC_CHECK_LIBM +LT_LIB_M LDFLAGS="$LIBM $LDFLAGS" -dnl Checks for library functions. +AC_PROG_CXX +AC_ENABLE_CXXMPH +if test x$cxxmph = xtrue; then + CXXFLAGS="$CXXFLAGS -std=c++0x" + AC_SUBST([CXXMPH], "cxxmph") +fi AC_CHECK_SPOON -dnl AC_OUTPUT(Makefile tests/Makefile samples/Makefile) -AC_OUTPUT(Makefile src/Makefile cxxmph/Makefile tests/Makefile examples/Makefile man/Makefile cmph.pc) +dnl AC_CONFIG_FILES([Makefile tests/Makefile samples/Makefile]) +AC_OUTPUT +AC_CONFIG_FILES([Makefile src/Makefile cxxmph/Makefile tests/Makefile examples/Makefile man/Makefile cmph.pc]) +AC_OUTPUT From 7d9253fd98149d19354b8d1b0d6e3ca0e123aeae Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 15 May 2011 19:39:55 -0300 Subject: [PATCH 33/89] Fixed include dir. 
--- cxxmph/Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 6149a53..a7fafa0 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -2,10 +2,10 @@ check_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test noinst_PROGRAMS = bm_numbers bm_urls bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cxxmph_hash.h stringpiece.h - libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cxxmph_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 +cxxmph_includedir = $(includedir)/cxxmph/ +cxxmph_include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cxxmph_hash.h stringpiece.h cmph_hash_map_test_LDADD = libcxxmph.la cmph_hash_map_test_SOURCES = cmph_hash_map_test.cc From 0761f2418247c875832e31b6d1cfd484ef6bd059 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 15 May 2011 19:48:01 -0300 Subject: [PATCH 34/89] Improved cxxmph test organization. 
--- cxxmph/Makefile.am | 1 + cxxmph/cmph_hash_map.h | 4 ---- cxxmph/cmph_hash_map_test.cc | 4 ++-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index a7fafa0..087e487 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,3 +1,4 @@ +TESTS = $(check_PROGRAMS) check_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test noinst_PROGRAMS = bm_numbers bm_urls bin_PROGRAMS = cxxmph diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/cmph_hash_map.h index 629667f..3f1b1b5 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/cmph_hash_map.h @@ -103,10 +103,6 @@ CMPH_METHOD_DECL(insert_return_type, insert)(const value_type& x) { CMPH_METHOD_DECL(void_type, rehash)() { if (values_.empty()) return; - std::cerr << "Calling Reset with " - << table_.size() << " keys in table " - << slack_.size() << " keys in slack " - << values_.size() << " key in total" << std::endl; slack_type().swap(slack_); bool success = table_.Reset( make_iterator_first(values_.begin()), diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/cmph_hash_map_test.cc index 2ebaa94..50a7bc5 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/cmph_hash_map_test.cc @@ -11,10 +11,10 @@ using cxxmph::cmph_hash_map; int main(int argc, char** argv) { cmph_hash_map b; - for (int i = 0; i < 2*1000*1000; ++i) { + for (int i = 0; i < 100*1000; ++i) { b.insert(make_pair(i, i)); } - for (int i = 0; i < 100*1000*1000; ++i) { + for (int i = 0; i < 1000*1000; ++i) { b.find(i); } /* From 5a46ad95bec2ded79f03f1a0f739673ea72e540e Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 15 May 2011 20:47:42 -0300 Subject: [PATCH 35/89] Better header organization. 
--- cxxmph/Makefile.am | 14 +++--- cxxmph/benchmark.cc | 2 +- cxxmph/bm_numbers.cc | 2 +- cxxmph/bm_urls.cc | 4 +- cxxmph/cxxmph.cc | 12 ++--- cxxmph/{cmph_hash_map.h => mph_map.h} | 50 +++++++++---------- ...{cmph_hash_map_test.cc => mph_map_test.cc} | 12 ++--- cxxmph/{mphtable.cc => mph_table.cc} | 3 +- cxxmph/{mphtable.h => mph_table.h} | 10 ++-- .../{mphtable_test.cc => mph_table_test.cc} | 8 +-- cxxmph/{cxxmph_hash.h => seeded_hash.h} | 33 ++++++------ 11 files changed, 78 insertions(+), 72 deletions(-) rename cxxmph/{cmph_hash_map.h => mph_map.h} (73%) rename cxxmph/{cmph_hash_map_test.cc => mph_map_test.cc} (87%) rename cxxmph/{mphtable.cc => mph_table.cc} (99%) rename cxxmph/{mphtable.h => mph_table.h} (96%) rename cxxmph/{mphtable_test.cc => mph_table_test.cc} (82%) rename cxxmph/{cxxmph_hash.h => seeded_hash.h} (71%) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 087e487..662f488 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,18 +1,18 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = cmph_hash_map_test mphtable_test trigraph_test +check_PROGRAMS = mph_map_test mph_table_test trigraph_test noinst_PROGRAMS = bm_numbers bm_urls bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mphtable.h mphtable.cc cxxmph_hash.h stringpiece.h benchmark.h benchmark.cc +libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_table.h mph_table.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ -cxxmph_include_HEADERS = cmph_hash_map.h mphtable.h MurmurHash2.h trigraph.h cxxmph_hash.h stringpiece.h +cxxmph_include_HEADERS = mph_map.h mph_table.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h -cmph_hash_map_test_LDADD = libcxxmph.la -cmph_hash_map_test_SOURCES = cmph_hash_map_test.cc +mph_map_test_LDADD = libcxxmph.la +mph_map_test_SOURCES = mph_map_test.cc -mphtable_test_LDADD = 
libcxxmph.la -mphtable_test_SOURCES = mphtable_test.cc +mph_table_test_LDADD = libcxxmph.la +mph_table_test_SOURCES = mph_table_test.cc trigraph_test_LDADD = libcxxmph.la trigraph_test_SOURCES = trigraph_test.cc diff --git a/cxxmph/benchmark.cc b/cxxmph/benchmark.cc index 9a45491..04e5086 100644 --- a/cxxmph/benchmark.cc +++ b/cxxmph/benchmark.cc @@ -73,7 +73,7 @@ namespace cxxmph { /* static */ void Benchmark::Register(Benchmark* bm) { if (bm->name().empty()) { string name = demangle(typeid(*bm).name()); - bm->set_name(name); + bm->set_name(name); } g_benchmarks.push_back(bm); } diff --git a/cxxmph/bm_numbers.cc b/cxxmph/bm_numbers.cc index 40bef70..85653f5 100644 --- a/cxxmph/bm_numbers.cc +++ b/cxxmph/bm_numbers.cc @@ -2,7 +2,7 @@ #include #include "benchmark.h" -#include "mphtable.h" +#include "mph_table.h" using std::set; using std::vector; diff --git a/cxxmph/bm_urls.cc b/cxxmph/bm_urls.cc index 916c725..6424755 100644 --- a/cxxmph/bm_urls.cc +++ b/cxxmph/bm_urls.cc @@ -6,7 +6,7 @@ #include #include "benchmark.h" -#include "cmph_hash_map.h" +#include "mph_map.h" using std::ifstream; using std::set; @@ -43,7 +43,7 @@ class BM_UrlsCreate : public Benchmark { urls_.swap(urls); } vector urls_; - cxxmph::cmph_hash_map table_; + cxxmph::mph_map table_; }; class BM_UrlsFind : public BM_UrlsCreate { diff --git a/cxxmph/cxxmph.cc b/cxxmph/cxxmph.cc index 623ecc4..68bb23e 100644 --- a/cxxmph/cxxmph.cc +++ b/cxxmph/cxxmph.cc @@ -8,7 +8,7 @@ #include #include -#include "cmph_hash_map.h" +#include "mph_map.h" #include "config.h" using std::cerr; @@ -19,10 +19,10 @@ using std::ifstream; using std::string; using std::vector; -using cxxmph::cmph_hash_map; +using cxxmph::mph_map; void usage(const char* prg) { - cerr << "usage: " << prg << "[-v] [-h] [-V]" << endl; + cerr << "usage: " << prg << " [-v] [-h] [-V] " << endl; } void usage_long(const char* prg) { usage(prg); @@ -58,11 +58,11 @@ int main(int argc, char** argv) { string buffer; while (!getline(f, buffer).eof()) 
keys.push_back(buffer); for (int i = 0; i < keys.size(); ++i) string s = keys[i]; - cmph_hash_map table; + mph_map table; for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i]; - cmph_hash_map::const_iterator it = table.begin(); - cmph_hash_map::const_iterator end = table.end(); + mph_map::const_iterator it = table.begin(); + mph_map::const_iterator end = table.end(); for (; it != end; ++it) { cout << (it - table.begin()) << ": " << it->first <<" -> " << it->second << endl; diff --git a/cxxmph/cmph_hash_map.h b/cxxmph/mph_map.h similarity index 73% rename from cxxmph/cmph_hash_map.h rename to cxxmph/mph_map.h index 3f1b1b5..538f708 100644 --- a/cxxmph/cmph_hash_map.h +++ b/cxxmph/mph_map.h @@ -4,17 +4,17 @@ #include // for std::pair #include "MurmurHash2.h" -#include "mphtable.h" +#include "mph_table.h" namespace cxxmph { // Save on repetitive typing. -#define CMPH_TMPL_SPEC template -#define CMPH_CLASS_SPEC cmph_hash_map -#define CMPH_METHOD_DECL(r, m) CMPH_TMPL_SPEC typename CMPH_CLASS_SPEC::r CMPH_CLASS_SPEC::m +#define MPH_MAP_TMPL_SPEC template +#define MPH_MAP_CLASS_SPEC mph_map +#define MPH_MAP_METHOD_DECL(r, m) MPH_MAP_TMPL_SPEC typename MPH_MAP_CLASS_SPEC::r MPH_MAP_CLASS_SPEC::m template , class EqualKey = std::equal_to, class Alloc = std::allocator > -class cmph_hash_map { +class mph_map { public: typedef Key key_type; typedef Data data_type; @@ -35,8 +35,8 @@ class cmph_hash_map { typedef bool bool_type; typedef std::pair insert_return_type; - cmph_hash_map(); - ~cmph_hash_map(); + mph_map(); + ~mph_map(); iterator begin(); iterator end(); @@ -70,25 +70,25 @@ class cmph_hash_map { void rehash(); std::vector values_; - SimpleMPHTable::hash_function> table_; + SimpleMPHTable::hash_function> table_; // TODO(davi) optimize slack to no hold a copy of the key typedef typename std::unordered_map slack_type; slack_type slack_; }; -CMPH_TMPL_SPEC -bool operator==(const CMPH_CLASS_SPEC& lhs, const CMPH_CLASS_SPEC& rhs) { +MPH_MAP_TMPL_SPEC +bool 
operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { return lhs.values_ == rhs.values_; } -CMPH_TMPL_SPEC CMPH_CLASS_SPEC::cmph_hash_map() { +MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() { rehash(); } -CMPH_TMPL_SPEC CMPH_CLASS_SPEC::~cmph_hash_map() { +MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { } -CMPH_METHOD_DECL(insert_return_type, insert)(const value_type& x) { +MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { iterator it = find(x.first); if (it != end()) return std::make_pair(it, false); values_.push_back(x); @@ -101,7 +101,7 @@ CMPH_METHOD_DECL(insert_return_type, insert)(const value_type& x) { return std::make_pair(it, true); } -CMPH_METHOD_DECL(void_type, rehash)() { +MPH_MAP_METHOD_DECL(void_type, rehash)() { if (values_.empty()) return; slack_type().swap(slack_); bool success = table_.Reset( @@ -117,29 +117,29 @@ CMPH_METHOD_DECL(void_type, rehash)() { values_.swap(new_values); } -CMPH_METHOD_DECL(iterator, begin)() { return values_.begin(); } -CMPH_METHOD_DECL(iterator, end)() { return values_.end(); } -CMPH_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); } -CMPH_METHOD_DECL(const_iterator, end)() const { return values_.end(); } -CMPH_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } +MPH_MAP_METHOD_DECL(iterator, begin)() { return values_.begin(); } +MPH_MAP_METHOD_DECL(iterator, end)() { return values_.end(); } +MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); } +MPH_MAP_METHOD_DECL(const_iterator, end)() const { return values_.end(); } +MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } -CMPH_METHOD_DECL(void_type, clear)() { +MPH_MAP_METHOD_DECL(void_type, clear)() { values_.clear(); slack_.clear(); table_.clear(); } -CMPH_METHOD_DECL(void_type, erase)(iterator pos) { +MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { values_.erase(pos); rehash(); } -CMPH_METHOD_DECL(void_type, erase)(const key_type& k) { 
+MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { iterator it = find(k); if (it == end()) return; erase(it); } -CMPH_METHOD_DECL(const_iterator, find)(const key_type& k) const { +MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { if (!slack_.empty()) { typename slack_type::const_iterator it = slack_.find(k); if (it != slack_.end()) return values_.begin() + it->second; @@ -151,7 +151,7 @@ CMPH_METHOD_DECL(const_iterator, find)(const key_type& k) const { } return end(); } -CMPH_METHOD_DECL(iterator, find)(const key_type& k) { +MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { if (!slack_.empty()) { typename slack_type::const_iterator it = slack_.find(k); if (it != slack_.end()) return values_.begin() + it->second; @@ -164,7 +164,7 @@ CMPH_METHOD_DECL(iterator, find)(const key_type& k) { return end(); } -CMPH_METHOD_DECL(data_type&, operator[])(const key_type& k) { +MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { return insert(std::make_pair(k, data_type())).first->second; } diff --git a/cxxmph/cmph_hash_map_test.cc b/cxxmph/mph_map_test.cc similarity index 87% rename from cxxmph/cmph_hash_map_test.cc rename to cxxmph/mph_map_test.cc index 50a7bc5..579e0ca 100644 --- a/cxxmph/cmph_hash_map_test.cc +++ b/cxxmph/mph_map_test.cc @@ -1,16 +1,16 @@ -#include "cmph_hash_map.h" - #include #include #include #include +#include "mph_map.h" + using std::make_pair; using std::string; -using cxxmph::cmph_hash_map; +using cxxmph::mph_map; int main(int argc, char** argv) { - cmph_hash_map b; + mph_map b; for (int i = 0; i < 100*1000; ++i) { b.insert(make_pair(i, i)); } @@ -18,9 +18,9 @@ int main(int argc, char** argv) { b.find(i); } /* - cmph_hash_map h; + mph_map h; h.insert(std::make_pair("-1",-1)); - cmph_hash_map::const_iterator it; + mph_map::const_iterator it; for (it = h.begin(); it != h.end(); ++it) { std::cerr << it->first << " -> " << it->second << std::endl; } diff --git a/cxxmph/mphtable.cc b/cxxmph/mph_table.cc 
similarity index 99% rename from cxxmph/mphtable.cc rename to cxxmph/mph_table.cc index bbc0c31..03dcfa3 100644 --- a/cxxmph/mphtable.cc +++ b/cxxmph/mph_table.cc @@ -1,10 +1,11 @@ #include #include +#include using std::cerr; using std::endl; -#include "mphtable.h" +#include "mph_table.h" using std::vector; diff --git a/cxxmph/mphtable.h b/cxxmph/mph_table.h similarity index 96% rename from cxxmph/mphtable.h rename to cxxmph/mph_table.h index 340b3db..a2fa236 100644 --- a/cxxmph/mphtable.h +++ b/cxxmph/mph_table.h @@ -1,5 +1,5 @@ -#ifndef __CXXMPH_MPHTABLE_H__ -#define __CXXMPH_MPHTABLE_H__ +#ifndef __CXXMPH_MPH_TABLE_H__ +#define __CXXMPH_MPH_TABLE_H__ // Minimal perfect hash abstraction implementing the BDZ algorithm @@ -15,7 +15,7 @@ using std::cerr; using std::endl; -#include "cxxmph_hash.h" +#include "seeded_hash.h" #include "trigraph.h" namespace cxxmph { @@ -143,7 +143,7 @@ uint32_t MPHTable::index(const Key& key) const { return Rank(vertex); } -template >::hash_function> +template >::hash_function> class SimpleMPHTable : public MPHTable { public: template @@ -155,4 +155,4 @@ class SimpleMPHTable : public MPHTable { } // namespace cxxmph -#endif // __CXXMPH_MPHTABLE_H__ +#endif // __CXXMPH_MPH_TABLE_H__ diff --git a/cxxmph/mphtable_test.cc b/cxxmph/mph_table_test.cc similarity index 82% rename from cxxmph/mphtable_test.cc rename to cxxmph/mph_table_test.cc index eb6ed3f..d12f901 100644 --- a/cxxmph/mphtable_test.cc +++ b/cxxmph/mph_table_test.cc @@ -3,7 +3,7 @@ #include #include -#include "mphtable.h" +#include "mph_table.h" using std::string; using std::vector; @@ -23,11 +23,11 @@ int main(int argc, char** argv) { keys.push_back("diogo"); keys.push_back("algume"); - SimpleMPHTable mphtable; - assert(mphtable.Reset(keys.begin(), keys.end())); + SimpleMPHTable mph_table; + assert(mph_table.Reset(keys.begin(), keys.end())); vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { - ids.push_back(mphtable.index(keys[i])); + 
ids.push_back(mph_table.index(keys[i])); cerr << " " << *(ids.end() - 1); } cerr << endl; diff --git a/cxxmph/cxxmph_hash.h b/cxxmph/seeded_hash.h similarity index 71% rename from cxxmph/cxxmph_hash.h rename to cxxmph/seeded_hash.h index 98748a0..7446813 100644 --- a/cxxmph/cxxmph_hash.h +++ b/cxxmph/seeded_hash.h @@ -1,3 +1,6 @@ +#ifndef __CXXMPH_SEEDED_HASH_H__ +#define __CXXMPH_SEEDED_HASH_H__ + #include // for uint32_t and friends #include @@ -47,36 +50,38 @@ struct seeded_hash_function { } }; -template struct cxxmph_hash +template struct seeded_hash { typedef seeded_hash_function hash_function; }; // Use Murmur2 instead for all types defined in std::hash, plus // std::string which is commonly extended. -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function 
hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct cxxmph_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; } // namespace cxxmph + +#endif // __CXXMPH_SEEDED_HASH_H__ From cb50e06bc2375d25bd06d4242ea134936b40fdca Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 15 May 2011 21:56:28 -0300 Subject: [PATCH 36/89] Fixed cxxflags. --- configure.ac | 2 -- cxxmph/Makefile.am | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index aa947ef..b749ad3 100644 --- a/configure.ac +++ b/configure.ac @@ -22,7 +22,6 @@ if test "x$ac_cv_sys_largefile_LIBS" = "xno" ; then ac_cv_sys_largefile_LIBS="" fi CFLAGS="$CFLAGS $ac_cv_sys_largefile_CFLAGS" -CXXFLAGS='-std=c++0x' LDFLAGS="$LDFLAGS $ac_cv_sys_largefile_LDFLAGS" LIBS="$LIBS $ac_cv_sys_largefile_LIBS" @@ -37,7 +36,6 @@ CFLAGS="-Wall -Werror" AC_PROG_CXX AC_ENABLE_CXXMPH if test x$cxxmph = xtrue; then - CXXFLAGS="$CXXFLAGS -std=c++0x" AC_SUBST([CXXMPH], "cxxmph") fi diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 662f488..2d44345 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,3 +1,4 @@ +AM_CXXFLAGS='-std=c++0x' TESTS = $(check_PROGRAMS) check_PROGRAMS = mph_map_test mph_table_test trigraph_test noinst_PROGRAMS = bm_numbers bm_urls From a61882d72205081a73297467ef005ee84e49166d Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sun, 15 May 2011 22:02:34 -0300 Subject: [PATCH 37/89] Enabled debug. 
--- cxxmph/mph_table.cc | 34 +++++++++++++++------------------- cxxmph/mph_table.h | 10 +++++----- 2 files changed, 20 insertions(+), 24 deletions(-) diff --git a/cxxmph/mph_table.cc b/cxxmph/mph_table.cc index 03dcfa3..2f92498 100644 --- a/cxxmph/mph_table.cc +++ b/cxxmph/mph_table.cc @@ -61,14 +61,12 @@ bool MPHTable::GenerateQueue( } } } - /* for (unsigned int i = 0; i < marked_edge.size(); ++i) { cerr << "vertex with degree " << static_cast(graph->vertex_degree()[i]) << " marked " << marked_edge[i] << endl; } for (unsigned int i = 0; i < queue.size(); ++i) { cerr << "vertex " << i << " queued at " << queue[i] << endl; } - */ // At this point queue head is the number of edges touching at least one // vertex of degree 1. // cerr << "Queue head " << queue_head << " Queue tail " << queue_tail << endl; @@ -88,11 +86,9 @@ bool MPHTable::GenerateQueue( } } } - /* for (unsigned int i = 0; i < queue.size(); ++i) { cerr << "vertex " << i << " queued at " << queue[i] << endl; } - */ int cycles = queue_head - nedges; if (cycles == 0) queue.swap(*queue_output); return cycles == 0; @@ -110,10 +106,10 @@ void MPHTable::Assigning( for (int i = nedges - 1; i + 1 >= 1; --i) { current_edge = queue[i]; const TriGraph::Edge& e = edges[current_edge]; - // cerr << "B: " << e[0] << " " << e[1] << " " << e[2] << " -> " - // << get_2bit_value(g_, e[0]) << " " - // << get_2bit_value(g_, e[1]) << " " - // << get_2bit_value(g_, e[2]) << " edge " << current_edge << endl; + cerr << "B: " << e[0] << " " << e[1] << " " << e[2] << " -> " + << get_2bit_value(g_, e[0]) << " " + << get_2bit_value(g_, e[1]) << " " + << get_2bit_value(g_, e[2]) << " edge " << current_edge << endl; if (!marked_vertices[e[0]]) { if (!marked_vertices[e[1]]) { set_2bit_value(&g_, e[1], kUnassigned); @@ -137,10 +133,10 @@ void MPHTable::Assigning( set_2bit_value(&g_, e[2], (8 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[1]))) % 3); marked_vertices[e[2]] = true; } - // cerr << "A: " << e[0] << " " << e[1] << " " << 
e[2] << " -> " - // << get_2bit_value(g_, e[0]) << " " - // << get_2bit_value(g_, e[1]) << " " - // << get_2bit_value(g_, e[2]) << " " << endl; + cerr << "A: " << e[0] << " " << e[1] << " " << e[2] << " -> " + << get_2bit_value(g_, e[0]) << " " + << get_2bit_value(g_, e[1]) << " " + << get_2bit_value(g_, e[2]) << " " << endl; } } @@ -174,14 +170,14 @@ uint32_t MPHTable::Rank(uint32_t vertex) const { uint32_t end_idx_b = vertex >> 2; while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; - // cerr << "beg_idx_v: " << beg_idx_v << endl; - // cerr << "base rank: " << base_rank << endl; + cerr << "beg_idx_v: " << beg_idx_v << endl; + cerr << "base rank: " << base_rank << endl; - //cerr << "G: "; - // for (unsigned int i = 0; i < n_; ++i) { - // cerr << get_2bit_value(g_, i) << " "; - //} - // cerr << endl; + cerr << "G: "; + for (unsigned int i = 0; i < n_; ++i) { + cerr << get_2bit_value(g_, i) << " "; + } + cerr << endl; while (beg_idx_v < vertex) { if (get_2bit_value(g_, beg_idx_v) != kUnassigned) ++base_rank; ++beg_idx_v; diff --git a/cxxmph/mph_table.h b/cxxmph/mph_table.h index a2fa236..99baaa3 100644 --- a/cxxmph/mph_table.h +++ b/cxxmph/mph_table.h @@ -85,13 +85,13 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { n_ = 3*r_; k_ = 1U << b_; - // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; + cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; int iterations = 10; std::vector edges; std::vector queue; while (1) { - // cerr << "Iterations missing: " << iterations << endl; + cerr << "Iterations missing: " << iterations << endl; for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_; // for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i; if (Mapping(begin, end, &edges, &queue)) break; @@ -116,7 +116,7 @@ bool MPHTable::Mapping( uint32_t v0 = h[0] % r_; uint32_t v1 = h[1] % r_ + r_; uint32_t v2 = h[2] % r_ + (r_ << 1); - // cerr << "Key: " << *it << " edge " << it - 
begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; + cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; graph.AddEdge(TriGraph::Edge(v0, v1, v2)); } if (GenerateQueue(&graph, queue)) { @@ -134,12 +134,12 @@ uint32_t MPHTable::index(const Key& key) const { h[1] = h[1] % r_ + r_; h[2] = h[2] % r_ + (r_ << 1); assert(g_.size()); - //cerr << "g_.size() " << g_.size() << " h0 >> 2 " << (h[0] >> 2) << endl; + cerr << "g_.size() " << g_.size() << " h0 >> 2 " << (h[0] >> 2) << endl; assert((h[0] >> 2) > 2) > 2) Date: Sun, 15 May 2011 23:04:30 -0300 Subject: [PATCH 38/89] Moved to c arrays to allow mmap'ing. --- cxxmph/mph_map.h | 1 + cxxmph/mph_table.cc | 59 +++++++++++++++++++++++++++++++-------------- cxxmph/mph_table.h | 40 ++++++++++++++++-------------- 3 files changed, 64 insertions(+), 36 deletions(-) diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 538f708..398db3e 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -112,6 +112,7 @@ MPH_MAP_METHOD_DECL(void_type, rehash)() { for (const_iterator it = values_.begin(), end = values_.end(); it != end; ++it) { size_type id = table_.index(it->first); + assert(id < new_values.size()); new_values[id] = *it; } values_.swap(new_values); diff --git a/cxxmph/mph_table.cc b/cxxmph/mph_table.cc index 2f92498..0fa0393 100644 --- a/cxxmph/mph_table.cc +++ b/cxxmph/mph_table.cc @@ -39,9 +39,20 @@ namespace cxxmph { const uint8_t MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; -void MPHTable::clear() { - // TODO(davi) impolement me +MPHTable::~MPHTable() { + clear(); } + +void MPHTable::clear() { + delete [] g_; + g_ = NULL; + g_size_ = 0; + delete [] ranktable_; + ranktable_ = NULL; + ranktable_size_ = 0; + // TODO(davi) implement me +} + bool MPHTable::GenerateQueue( TriGraph* graph, vector* queue_output) { uint32_t queue_head = 0, queue_tail = 0; @@ -61,12 +72,14 @@ bool MPHTable::GenerateQueue( } } } + /* for (unsigned int i = 0; i < marked_edge.size(); 
++i) { cerr << "vertex with degree " << static_cast(graph->vertex_degree()[i]) << " marked " << marked_edge[i] << endl; } for (unsigned int i = 0; i < queue.size(); ++i) { cerr << "vertex " << i << " queued at " << queue[i] << endl; } + */ // At this point queue head is the number of edges touching at least one // vertex of degree 1. // cerr << "Queue head " << queue_head << " Queue tail " << queue_tail << endl; @@ -86,9 +99,11 @@ bool MPHTable::GenerateQueue( } } } + /* for (unsigned int i = 0; i < queue.size(); ++i) { cerr << "vertex " << i << " queued at " << queue[i] << endl; } + */ int cycles = queue_head - nedges; if (cycles == 0) queue.swap(*queue_output); return cycles == 0; @@ -99,60 +114,67 @@ void MPHTable::Assigning( uint32_t current_edge = 0; vector marked_vertices(n_ + 1); // Initialize vector of half nibbles with all bits set. - uint32_t sizeg = static_cast(ceil(n_/4.0)); - vector(sizeg, std::numeric_limits::max()).swap(g_); + g_size_ = static_cast(ceil(n_/4.0)); + delete [] g_; + g_ = new uint8_t[g_size_]; + memset(g_, std::numeric_limits::max(), g_size_); + assert(g_[g_size_ - 1] == 255); uint32_t nedges = m_; // for legibility for (int i = nedges - 1; i + 1 >= 1; --i) { current_edge = queue[i]; const TriGraph::Edge& e = edges[current_edge]; + /* cerr << "B: " << e[0] << " " << e[1] << " " << e[2] << " -> " << get_2bit_value(g_, e[0]) << " " << get_2bit_value(g_, e[1]) << " " << get_2bit_value(g_, e[2]) << " edge " << current_edge << endl; + */ if (!marked_vertices[e[0]]) { if (!marked_vertices[e[1]]) { - set_2bit_value(&g_, e[1], kUnassigned); + set_2bit_value(g_, e[1], kUnassigned); marked_vertices[e[1]] = true; } if (!marked_vertices[e[2]]) { - set_2bit_value(&g_, e[2], kUnassigned); + set_2bit_value(g_, e[2], kUnassigned); assert(marked_vertices.size() > e[2]); marked_vertices[e[2]] = true; } - set_2bit_value(&g_, e[0], (6 - (get_2bit_value(g_, e[1]) + get_2bit_value(g_, e[2]))) % 3); + set_2bit_value(g_, e[0], (6 - (get_2bit_value(g_, e[1]) + 
get_2bit_value(g_, e[2]))) % 3); marked_vertices[e[0]] = true; } else if (!marked_vertices[e[1]]) { if (!marked_vertices[e[2]]) { - set_2bit_value(&g_, e[2], kUnassigned); + set_2bit_value(g_, e[2], kUnassigned); marked_vertices[e[2]] = true; } - set_2bit_value(&g_, e[1], (7 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[2]))) % 3); + set_2bit_value(g_, e[1], (7 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[2]))) % 3); marked_vertices[e[1]] = true; } else { - set_2bit_value(&g_, e[2], (8 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[1]))) % 3); + set_2bit_value(g_, e[2], (8 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[1]))) % 3); marked_vertices[e[2]] = true; } + /* cerr << "A: " << e[0] << " " << e[1] << " " << e[2] << " -> " << get_2bit_value(g_, e[0]) << " " << get_2bit_value(g_, e[1]) << " " << get_2bit_value(g_, e[2]) << " " << endl; + */ } } void MPHTable::Ranking() { uint32_t nbytes_total = static_cast(ceil(n_ / 4.0)); uint32_t size = k_ >> 2U; - uint32_t ranktablesize = static_cast( + ranktable_size_ = static_cast( ceil(n_ / static_cast(k_))); - // TODO(davi) Change swap of member classes for resize + memset to avoid - // fragmentation - vector (ranktablesize).swap(ranktable_);; + delete [] ranktable_; + ranktable_ = new uint32_t[ranktable_size_]; + memset(ranktable_, 0, ranktable_size_*sizeof(uint32_t)); uint32_t offset = 0; uint32_t count = 0; uint32_t i = 1; while (1) { - if (i == ranktable_.size()) break; + if (i == ranktable_size_) break; uint32_t nbytes = size < nbytes_total ? 
size : nbytes_total; for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; ranktable_[i] = count; @@ -170,14 +192,15 @@ uint32_t MPHTable::Rank(uint32_t vertex) const { uint32_t end_idx_b = vertex >> 2; while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; - cerr << "beg_idx_v: " << beg_idx_v << endl; - cerr << "base rank: " << base_rank << endl; - + // cerr << "beg_idx_v: " << beg_idx_v << endl; + // cerr << "base rank: " << base_rank << endl; + /* cerr << "G: "; for (unsigned int i = 0; i < n_; ++i) { cerr << get_2bit_value(g_, i) << " "; } cerr << endl; + */ while (beg_idx_v < vertex) { if (get_2bit_value(g_, beg_idx_v) != kUnassigned) ++base_rank; ++beg_idx_v; diff --git a/cxxmph/mph_table.h b/cxxmph/mph_table.h index 99baaa3..34544fd 100644 --- a/cxxmph/mph_table.h +++ b/cxxmph/mph_table.h @@ -23,8 +23,9 @@ namespace cxxmph { class MPHTable { public: MPHTable(double c = 1.23, uint8_t b = 7) : - c_(c), b_(b), m_(0), n_(0), k_(0), r_(0) { } - ~MPHTable() {} + c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), + g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0) { } + ~MPHTable(); template bool Reset(ForwardIterator begin, ForwardIterator end); @@ -57,20 +58,23 @@ class MPHTable { // Partition vertex count, derived from c parameter. uint32_t r_; - // The array containing the minimal perfect hash function graph. - std::vector g_; + // The array containing the minimal perfect hash function graph. Do not use + // c++ vector to make mmap based backing easier. + uint8_t* g_; + uint32_t g_size_; // The table used for the rank step of the minimal perfect hash function - std::vector ranktable_; + uint32_t* ranktable_; + uint32_t ranktable_size_; // The selected hash seed triplet for finding the edges in the minimal // perfect hash function graph. 
uint32_t hash_seed_[3]; static const uint8_t valuemask[]; - static void set_2bit_value(std::vector *d, uint32_t i, uint8_t v) { - (*d)[(i >> 2)] &= (v << ((i & 3) << 1)) | valuemask[i & 3]; + static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { + d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); } - static uint32_t get_2bit_value(const std::vector& d, uint32_t i) { - return (d[(i >> 2)] >> ((i & 3) << 1)) & 3; + static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { + return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); } @@ -85,13 +89,13 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { n_ = 3*r_; k_ = 1U << b_; - cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; + // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; int iterations = 10; std::vector edges; std::vector queue; while (1) { - cerr << "Iterations missing: " << iterations << endl; + // cerr << "Iterations missing: " << iterations << endl; for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_; // for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i; if (Mapping(begin, end, &edges, &queue)) break; @@ -116,7 +120,7 @@ bool MPHTable::Mapping( uint32_t v0 = h[0] % r_; uint32_t v1 = h[1] % r_ + r_; uint32_t v2 = h[2] % r_ + (r_ << 1); - cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; + // cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; graph.AddEdge(TriGraph::Edge(v0, v1, v2)); } if (GenerateQueue(&graph, queue)) { @@ -133,13 +137,13 @@ uint32_t MPHTable::index(const Key& key) const { h[0] = h[0] % r_; h[1] = h[1] % r_ + r_; h[2] = h[2] % r_ + (r_ << 1); - assert(g_.size()); - cerr << "g_.size() " << g_.size() << " h0 >> 2 " << (h[0] >> 2) << endl; - assert((h[0] >> 2) > 2) > 2) > 2 " << (h[0] >> 2) << endl; + assert((h[0] >> 2) > 2) > 2) Date: Sun, 15 May 2011 23:24:12 -0300 Subject: [PATCH 39/89] Improved const-correctness. 
--- cxxmph/mph_table.cc | 52 ++++++++++++++++++++++++++++++++------------- cxxmph/mph_table.h | 11 ++++++++-- 2 files changed, 46 insertions(+), 17 deletions(-) diff --git a/cxxmph/mph_table.cc b/cxxmph/mph_table.cc index 0fa0393..3c74c2a 100644 --- a/cxxmph/mph_table.cc +++ b/cxxmph/mph_table.cc @@ -116,9 +116,10 @@ void MPHTable::Assigning( // Initialize vector of half nibbles with all bits set. g_size_ = static_cast(ceil(n_/4.0)); delete [] g_; - g_ = new uint8_t[g_size_]; - memset(g_, std::numeric_limits::max(), g_size_); - assert(g_[g_size_ - 1] == 255); + g_ = NULL; + uint8_t* g = new uint8_t[g_size_]; + memset(g, std::numeric_limits::max(), g_size_); + assert(g[g_size_ - 1] == 255); uint32_t nedges = m_; // for legibility for (int i = nedges - 1; i + 1 >= 1; --i) { @@ -132,34 +133,35 @@ void MPHTable::Assigning( */ if (!marked_vertices[e[0]]) { if (!marked_vertices[e[1]]) { - set_2bit_value(g_, e[1], kUnassigned); + set_2bit_value(g, e[1], kUnassigned); marked_vertices[e[1]] = true; } if (!marked_vertices[e[2]]) { - set_2bit_value(g_, e[2], kUnassigned); + set_2bit_value(g, e[2], kUnassigned); assert(marked_vertices.size() > e[2]); marked_vertices[e[2]] = true; } - set_2bit_value(g_, e[0], (6 - (get_2bit_value(g_, e[1]) + get_2bit_value(g_, e[2]))) % 3); + set_2bit_value(g, e[0], (6 - (get_2bit_value(g, e[1]) + get_2bit_value(g, e[2]))) % 3); marked_vertices[e[0]] = true; } else if (!marked_vertices[e[1]]) { if (!marked_vertices[e[2]]) { - set_2bit_value(g_, e[2], kUnassigned); + set_2bit_value(g, e[2], kUnassigned); marked_vertices[e[2]] = true; } - set_2bit_value(g_, e[1], (7 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[2]))) % 3); + set_2bit_value(g, e[1], (7 - (get_2bit_value(g, e[0]) + get_2bit_value(g, e[2]))) % 3); marked_vertices[e[1]] = true; } else { - set_2bit_value(g_, e[2], (8 - (get_2bit_value(g_, e[0]) + get_2bit_value(g_, e[1]))) % 3); + set_2bit_value(g, e[2], (8 - (get_2bit_value(g, e[0]) + get_2bit_value(g, e[1]))) % 3); 
marked_vertices[e[2]] = true; } /* cerr << "A: " << e[0] << " " << e[1] << " " << e[2] << " -> " - << get_2bit_value(g_, e[0]) << " " - << get_2bit_value(g_, e[1]) << " " - << get_2bit_value(g_, e[2]) << " " << endl; + << get_2bit_value(g, e[0]) << " " + << get_2bit_value(g, e[1]) << " " + << get_2bit_value(g, e[2]) << " " << endl; */ } + g_ = g; } void MPHTable::Ranking() { @@ -168,8 +170,9 @@ void MPHTable::Ranking() { ranktable_size_ = static_cast( ceil(n_ / static_cast(k_))); delete [] ranktable_; - ranktable_ = new uint32_t[ranktable_size_]; - memset(ranktable_, 0, ranktable_size_*sizeof(uint32_t)); + ranktable_ = NULL; + uint32_t* ranktable = new uint32_t[ranktable_size_]; + memset(ranktable, 0, ranktable_size_*sizeof(uint32_t)); uint32_t offset = 0; uint32_t count = 0; uint32_t i = 1; @@ -177,11 +180,12 @@ void MPHTable::Ranking() { if (i == ranktable_size_) break; uint32_t nbytes = size < nbytes_total ? size : nbytes_total; for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; - ranktable_[i] = count; + ranktable[i] = count; offset += nbytes; nbytes_total -= size; ++i; } + ranktable_ = ranktable; } uint32_t MPHTable::Rank(uint32_t vertex) const { @@ -209,4 +213,22 @@ uint32_t MPHTable::Rank(uint32_t vertex) const { return base_rank; } +uint32_t MPHTable::serialize_bytes_needed() const { + return sizeof(MPHTable) + g_size_ + ranktable_size_*sizeof(uint32_t); +} +void MPHTable::serialize(char* memory) const { + memcpy(memory, this, sizeof(MPHTable)); + memcpy(memory + sizeof(MPHTable), g_, g_size_); + memcpy(memory + sizeof(MPHTable) + g_size_, + ranktable_, ranktable_size_*sizeof(uint32_t)); +} + +bool MPHTable::deserialize(const char* serialized_memory) { + memcpy(this, serialized_memory, sizeof(MPHTable)); + g_ = reinterpret_cast(serialized_memory + sizeof(MPHTable)); + ranktable_ = reinterpret_cast( + serialized_memory + sizeof(MPHTable) + g_size_); + return true; +} + } // namespace cxxmph diff --git a/cxxmph/mph_table.h 
b/cxxmph/mph_table.h index 34544fd..1b713e5 100644 --- a/cxxmph/mph_table.h +++ b/cxxmph/mph_table.h @@ -34,6 +34,13 @@ class MPHTable { uint32_t size() const { return m_; } void clear(); + // Serialization machinery for mmap usage. + // Serialized tables are not guaranteed to work across versions or different + // endianness (although they could easily be made to be). + uint32_t serialize_bytes_needed() const; + void serialize(char *memory) const; + bool deserialize(const char* serialized_memory); + private: template bool Mapping(ForwardIterator begin, ForwardIterator end, @@ -60,10 +67,10 @@ class MPHTable { uint32_t r_; // The array containing the minimal perfect hash function graph. Do not use // c++ vector to make mmap based backing easier. - uint8_t* g_; + const uint8_t* g_; uint32_t g_size_; // The table used for the rank step of the minimal perfect hash function - uint32_t* ranktable_; + const uint32_t* ranktable_; uint32_t ranktable_size_; // The selected hash seed triplet for finding the edges in the minimal // perfect hash function graph. From c630eb2a70b7aaa66dedcfec92f1bba694df206a Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Mon, 16 May 2011 11:26:18 -0300 Subject: [PATCH 40/89] Implemented serialization machinery. --- cxxmph/mph_table.cc | 9 +++++---- cxxmph/mph_table.h | 6 +++++- cxxmph/mph_table_test.cc | 5 +++++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/cxxmph/mph_table.cc b/cxxmph/mph_table.cc index 3c74c2a..fae8e98 100644 --- a/cxxmph/mph_table.cc +++ b/cxxmph/mph_table.cc @@ -44,10 +44,10 @@ MPHTable::~MPHTable() { } void MPHTable::clear() { - delete [] g_; + if (!deserialized_) delete [] g_; g_ = NULL; g_size_ = 0; - delete [] ranktable_; + if (!deserialized_) delete [] ranktable_; ranktable_ = NULL; ranktable_size_ = 0; // TODO(davi) implement me @@ -115,7 +115,7 @@ void MPHTable::Assigning( vector marked_vertices(n_ + 1); // Initialize vector of half nibbles with all bits set. 
g_size_ = static_cast(ceil(n_/4.0)); - delete [] g_; + if (!deserialized_) delete [] g_; g_ = NULL; uint8_t* g = new uint8_t[g_size_]; memset(g, std::numeric_limits::max(), g_size_); @@ -169,7 +169,7 @@ void MPHTable::Ranking() { uint32_t size = k_ >> 2U; ranktable_size_ = static_cast( ceil(n_ / static_cast(k_))); - delete [] ranktable_; + if (!deserialized_) delete [] ranktable_; ranktable_ = NULL; uint32_t* ranktable = new uint32_t[ranktable_size_]; memset(ranktable, 0, ranktable_size_*sizeof(uint32_t)); @@ -228,6 +228,7 @@ bool MPHTable::deserialize(const char* serialized_memory) { g_ = reinterpret_cast(serialized_memory + sizeof(MPHTable)); ranktable_ = reinterpret_cast( serialized_memory + sizeof(MPHTable) + g_size_); + deserialized_ = true; return true; } diff --git a/cxxmph/mph_table.h b/cxxmph/mph_table.h index 1b713e5..51d26ea 100644 --- a/cxxmph/mph_table.h +++ b/cxxmph/mph_table.h @@ -24,7 +24,8 @@ class MPHTable { public: MPHTable(double c = 1.23, uint8_t b = 7) : c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), - g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0) { } + g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), + deserialized_(false) { } ~MPHTable(); template @@ -76,6 +77,8 @@ class MPHTable { // perfect hash function graph. 
uint32_t hash_seed_[3]; + bool deserialized_; + static const uint8_t valuemask[]; static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); @@ -113,6 +116,7 @@ bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { Assigning(edges, queue); std::vector().swap(edges); Ranking(); + deserialized_ = false; return true; } diff --git a/cxxmph/mph_table_test.cc b/cxxmph/mph_table_test.cc index d12f901..c9e91a8 100644 --- a/cxxmph/mph_table_test.cc +++ b/cxxmph/mph_table_test.cc @@ -33,5 +33,10 @@ int main(int argc, char** argv) { cerr << endl; sort(ids.begin(), ids.end()); for (vector::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast::value_type>(i)); + + char* serialized = new char[mph_table.serialize_bytes_needed()]; + mph_table.serialize(serialized); + SimpleMPHTable other_mph_table; + other_mph_table.deserialize(serialized); } From bb40a4bb00f2ad3caccb5964c3b17b93c16c8386 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 23 May 2011 11:01:08 -0700 Subject: [PATCH 41/89] Renamed table to index and reorganized benchmarks. 
--- cxxmph/Makefile.am | 20 +- cxxmph/benchmark.cc | 32 +++- cxxmph/benchmark.h | 11 +- cxxmph/bm_map.cc | 52 +++++ cxxmph/bm_numbers.cc | 52 ----- cxxmph/bm_urls.cc | 70 ------- cxxmph/{mph_table.cc => mph_index.cc} | 42 ++--- cxxmph/mph_index.h | 173 +++++++++++++++++ .../{mph_table_test.cc => mph_index_test.cc} | 18 +- cxxmph/mph_map.h | 22 +-- cxxmph/mph_table.h | 177 +----------------- src/bm_numbers.c | 51 ++++- 12 files changed, 367 insertions(+), 353 deletions(-) create mode 100644 cxxmph/bm_map.cc delete mode 100644 cxxmph/bm_numbers.cc delete mode 100644 cxxmph/bm_urls.cc rename cxxmph/{mph_table.cc => mph_index.cc} (87%) create mode 100644 cxxmph/mph_index.h rename cxxmph/{mph_table_test.cc => mph_index_test.cc} (66%) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 2d44345..801d2d0 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,28 +1,28 @@ AM_CXXFLAGS='-std=c++0x' TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_map_test mph_table_test trigraph_test -noinst_PROGRAMS = bm_numbers bm_urls +check_PROGRAMS = mph_map_test mph_index_test trigraph_test +noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_table.h mph_table.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc +libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ -cxxmph_include_HEADERS = mph_map.h mph_table.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h +cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h mph_map_test_LDADD = libcxxmph.la mph_map_test_SOURCES = mph_map_test.cc -mph_table_test_LDADD = libcxxmph.la -mph_table_test_SOURCES = mph_table_test.cc +mph_index_test_LDADD = libcxxmph.la +mph_index_test_SOURCES = mph_index_test.cc 
trigraph_test_LDADD = libcxxmph.la trigraph_test_SOURCES = trigraph_test.cc -bm_numbers_LDADD = libcxxmph.la -bm_numbers_SOURCES = bm_numbers.cc +bm_index_LDADD = libcxxmph.la +bm_index_SOURCES = bm_common.cc bm_index.cc -bm_urls_LDADD = libcxxmph.la -bm_urls_SOURCES = bm_urls.cc +bm_map_LDADD = libcxxmph.la +bm_map_SOURCES = bm_common.cc bm_map.cc cxxmph_LDADD = libcxxmph.la cxxmph_SOURCES = cxxmph.cc diff --git a/cxxmph/benchmark.cc b/cxxmph/benchmark.cc index 04e5086..644bdc9 100644 --- a/cxxmph/benchmark.cc +++ b/cxxmph/benchmark.cc @@ -1,7 +1,9 @@ #include "benchmark.h" +#include #include #include +#include #include #include @@ -50,6 +52,16 @@ struct rusage getrusage_or_die() { return rs; } +struct timeval gettimeofday_or_die() { + struct timeval tv; + int ret = gettimeofday(&tv, NULL); + if (ret != 0) { + cerr << "gettimeofday failed: " << strerror(errno) << endl; + exit(-1); + } + return tv; +} + #ifdef HAVE_CXA_DEMANGLE string demangle(const string& name) { char buf[1024]; @@ -79,25 +91,33 @@ namespace cxxmph { } /* static */ void Benchmark::RunAll() { - for (auto it = g_benchmarks.begin(); it != g_benchmarks.end(); ++it) { - (*it)->MeasureRun(); - delete *it; + for (int i = 0; i < g_benchmarks.size(); ++i) { + Benchmark* bm = g_benchmarks[i]; + bm->SetUp(); + bm->MeasureRun(); + bm->TearDown(); + delete bm; } } void Benchmark::MeasureRun() { + struct timeval walltime_begin = gettimeofday_or_die(); struct rusage begin = getrusage_or_die(); - Run(iters_); + Run(); struct rusage end = getrusage_or_die(); + struct timeval walltime_end = gettimeofday_or_die(); struct timeval utime; timeval_subtract(&utime, &end.ru_utime, &begin.ru_utime); struct timeval stime; timeval_subtract(&stime, &end.ru_stime, &begin.ru_stime); + struct timeval wtime; + timeval_subtract(&wtime, &walltime_end, &walltime_begin); printf("Benchmark: %s\n", name().c_str()); - printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); - printf("System time used: %ld.%06ld\n", 
stime.tv_sec, stime.tv_usec); + printf("CPU User time : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); + printf("CPU System time: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); + printf("Wall clock time: %ld.%06ld\n", wtime.tv_sec, wtime.tv_usec); printf("\n"); } diff --git a/cxxmph/benchmark.h b/cxxmph/benchmark.h index f0629e4..edd3fb9 100644 --- a/cxxmph/benchmark.h +++ b/cxxmph/benchmark.h @@ -8,9 +8,9 @@ namespace cxxmph { class Benchmark { public: - Benchmark(int iters = 1) : iters_(iters) { } - virtual void Run(int iters) = 0; - virtual ~Benchmark() { } + Benchmark() {} + virtual ~Benchmark() {} + const std::string& name() { return name_; } void set_name(const std::string& name) { name_ = name; } @@ -18,10 +18,11 @@ class Benchmark { static void RunAll(); protected: - int iters() { return iters_; } + virtual bool SetUp() {}; + virtual void Run() = 0; + virtual bool TearDown() {}; private: - int iters_; std::string name_; void MeasureRun(); }; diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc new file mode 100644 index 0000000..1d26847 --- /dev/null +++ b/cxxmph/bm_map.cc @@ -0,0 +1,52 @@ +#include +#include + +#include "bm_common.h" +#include "mph_map.h" + +using cxxmph::mph_map; +using std::string; +using std::unordered_map; + +namespace cxxmph { + +template +class BM_MapCreate : public UrlsBenchmark { + public: + virtual void Run() { + MapType mymap; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + mymap[*it] = *it; + } + } +}; + +template +class BM_MapSearch : public SearchUrlsBenchmark { + public: + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto value = mymap[*it]; + } + } + protected: + virtual void SetUp() { + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + mymap_[*it] = *it; + } + mymap_.resize(mymap.size()); + } + MapType mymap_; +}; + +} // namespace cxxmph + +using namespace cxxmph; + +int main(int argc, char** argv) { + Benchmark::Register(new BM_MapCreate>("URLS100k")); + 
Benchmark::Register(new BM_MapCreate>("URLS100k")); + Benchmark::Register(new BM_MapSearch>("URLS100k", 1000 * 1000)); + Benchmark::Register(new BM_MapSearch>("URLS100k", 1000 * 1000)); + Benchmark::RunAll(); +} diff --git a/cxxmph/bm_numbers.cc b/cxxmph/bm_numbers.cc deleted file mode 100644 index 85653f5..0000000 --- a/cxxmph/bm_numbers.cc +++ /dev/null @@ -1,52 +0,0 @@ -#include -#include - -#include "benchmark.h" -#include "mph_table.h" - -using std::set; -using std::vector; - -namespace cxxmph { -class BM_NumbersCreate : public Benchmark { - public: - BM_NumbersCreate(int iters = 1) : Benchmark(iters) { - set unique; - while (unique.size() < 1000 * 1000) { - int v = random(); - if (unique.find(v) == unique.end()) { - unique.insert(v); - random_unique_.push_back(v); - } - } - } - protected: - virtual void Run(int iters) { - SimpleMPHTable table; - table.Reset(random_unique_.begin(), random_unique_.end()); - } - std::vector random_unique_; -}; - -class BM_NumbersFind : public BM_NumbersCreate { - public: - BM_NumbersFind(int iters) : BM_NumbersCreate(iters) { table_.Reset(random_unique_.begin(), random_unique_.end()); } - virtual void Run(int iters) { - for (int i = 0; i < iters * 100; ++i) { - int pos = random() % random_unique_.size();; - int h = table_.index(pos); - } - } - private: - SimpleMPHTable table_; -}; - -} // namespace cxxmph - -using namespace cxxmph; - -int main(int argc, char** argv) { - Benchmark::Register(new BM_NumbersCreate()); - Benchmark::Register(new BM_NumbersFind(1000 * 1000)); - Benchmark::RunAll(); -} diff --git a/cxxmph/bm_urls.cc b/cxxmph/bm_urls.cc deleted file mode 100644 index 6424755..0000000 --- a/cxxmph/bm_urls.cc +++ /dev/null @@ -1,70 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include "benchmark.h" -#include "mph_map.h" - -using std::ifstream; -using std::set; -using std::string; -using std::vector; - -namespace cxxmph { - -class BM_UrlsCreate : public Benchmark { - public: - BM_UrlsCreate(int 
iters = 1) : Benchmark(iters) { - ReadUrls(); - } - protected: - virtual void Run(int iters) { - BuildTable(); - } - void BuildTable() { - for (auto it = urls_.begin(); it != urls_.end(); ++it) { - table_[*it] = it - urls_.begin(); - } - table_.pack(); - } - void ReadUrls() { - vector urls; - std::ifstream f("URLS100k"); - string buffer; - while(std::getline(f, buffer)) urls.push_back(buffer); - set unique(urls.begin(), urls.end()); - if (unique.size() != urls.size()) { - cerr << "Input file has repeated keys." << endl; - exit(-1); - } - urls_.swap(urls); - } - vector urls_; - cxxmph::mph_map table_; -}; - -class BM_UrlsFind : public BM_UrlsCreate { - public: - BM_UrlsFind(int iters = 1) : BM_UrlsCreate(iters) { ReadUrls(); BuildTable(); } - protected: - virtual void Run(int iters) { - for (int i = 0; i < iters * 100; ++i) { - int pos = random() % urls_.size();; - int h = table_[urls_[pos]]; - assert(h == pos); - } - } -}; - -} // namespace cxxmph - -using namespace cxxmph; - -int main(int argc, char** argv) { - Benchmark::Register(new BM_UrlsCreate()); - Benchmark::Register(new BM_UrlsFind(1000 * 1000)); - Benchmark::RunAll(); -} diff --git a/cxxmph/mph_table.cc b/cxxmph/mph_index.cc similarity index 87% rename from cxxmph/mph_table.cc rename to cxxmph/mph_index.cc index fae8e98..b1c0176 100644 --- a/cxxmph/mph_table.cc +++ b/cxxmph/mph_index.cc @@ -5,7 +5,7 @@ using std::cerr; using std::endl; -#include "mph_table.h" +#include "mph_index.h" using std::vector; @@ -13,7 +13,7 @@ namespace { static const uint8_t kUnassigned = 3; // table used for looking up the number of assigned vertices to a 8-bit integer -static uint8_t kBdzLookupTable[] = +static uint8_t kBdzLookupIndex[] = { 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, 4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, @@ -37,13 +37,13 @@ static uint8_t kBdzLookupTable[] = namespace cxxmph { -const uint8_t MPHTable::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; +const uint8_t MPHIndex::valuemask[] = { 0xfc, 0xf3, 
0xcf, 0x3f}; -MPHTable::~MPHTable() { +MPHIndex::~MPHIndex() { clear(); } -void MPHTable::clear() { +void MPHIndex::clear() { if (!deserialized_) delete [] g_; g_ = NULL; g_size_ = 0; @@ -53,7 +53,7 @@ void MPHTable::clear() { // TODO(davi) implement me } -bool MPHTable::GenerateQueue( +bool MPHIndex::GenerateQueue( TriGraph* graph, vector* queue_output) { uint32_t queue_head = 0, queue_tail = 0; uint32_t nedges = m_; @@ -109,7 +109,7 @@ bool MPHTable::GenerateQueue( return cycles == 0; } -void MPHTable::Assigning( +void MPHIndex::Assigning( const vector& edges, const vector& queue) { uint32_t current_edge = 0; vector marked_vertices(n_ + 1); @@ -164,7 +164,7 @@ void MPHTable::Assigning( g_ = g; } -void MPHTable::Ranking() { +void MPHIndex::Ranking() { uint32_t nbytes_total = static_cast(ceil(n_ / 4.0)); uint32_t size = k_ >> 2U; ranktable_size_ = static_cast( @@ -179,7 +179,7 @@ void MPHTable::Ranking() { while (1) { if (i == ranktable_size_) break; uint32_t nbytes = size < nbytes_total ? 
size : nbytes_total; - for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupTable[g_[offset + j]]; + for (uint32_t j = 0; j < nbytes; ++j) count += kBdzLookupIndex[g_[offset + j]]; ranktable[i] = count; offset += nbytes; nbytes_total -= size; @@ -188,13 +188,13 @@ void MPHTable::Ranking() { ranktable_ = ranktable; } -uint32_t MPHTable::Rank(uint32_t vertex) const { +uint32_t MPHIndex::Rank(uint32_t vertex) const { uint32_t index = vertex >> b_; uint32_t base_rank = ranktable_[index]; uint32_t beg_idx_v = index << b_; uint32_t beg_idx_b = beg_idx_v >> 2; uint32_t end_idx_b = vertex >> 2; - while (beg_idx_b < end_idx_b) base_rank += kBdzLookupTable[g_[beg_idx_b++]]; + while (beg_idx_b < end_idx_b) base_rank += kBdzLookupIndex[g_[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; // cerr << "beg_idx_v: " << beg_idx_v << endl; // cerr << "base rank: " << base_rank << endl; @@ -213,21 +213,21 @@ uint32_t MPHTable::Rank(uint32_t vertex) const { return base_rank; } -uint32_t MPHTable::serialize_bytes_needed() const { - return sizeof(MPHTable) + g_size_ + ranktable_size_*sizeof(uint32_t); +uint32_t MPHIndex::serialize_bytes_needed() const { + return sizeof(MPHIndex) + g_size_ + ranktable_size_*sizeof(uint32_t); } -void MPHTable::serialize(char* memory) const { - memcpy(memory, this, sizeof(MPHTable)); - memcpy(memory + sizeof(MPHTable), g_, g_size_); - memcpy(memory + sizeof(MPHTable) + g_size_, +void MPHIndex::serialize(char* memory) const { + memcpy(memory, this, sizeof(MPHIndex)); + memcpy(memory + sizeof(MPHIndex), g_, g_size_); + memcpy(memory + sizeof(MPHIndex) + g_size_, ranktable_, ranktable_size_*sizeof(uint32_t)); } -bool MPHTable::deserialize(const char* serialized_memory) { - memcpy(this, serialized_memory, sizeof(MPHTable)); - g_ = reinterpret_cast(serialized_memory + sizeof(MPHTable)); +bool MPHIndex::deserialize(const char* serialized_memory) { + memcpy(this, serialized_memory, sizeof(MPHIndex)); + g_ = reinterpret_cast(serialized_memory + sizeof(MPHIndex)); 
ranktable_ = reinterpret_cast( - serialized_memory + sizeof(MPHTable) + g_size_); + serialized_memory + sizeof(MPHIndex) + g_size_); deserialized_ = true; return true; } diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h new file mode 100644 index 0000000..77b6ea4 --- /dev/null +++ b/cxxmph/mph_index.h @@ -0,0 +1,173 @@ +#ifndef __CXXMPH_MPH_INDEX_H__ +#define __CXXMPH_MPH_INDEX_H__ + +// Minimal perfect hash abstraction implementing the BDZ algorithm + +#include + +#include +#include +#include // for std::hash +#include + +#include + +using std::cerr; +using std::endl; + +#include "seeded_hash.h" +#include "trigraph.h" + +namespace cxxmph { + +class MPHIndex { + public: + MPHIndex(double c = 1.23, uint8_t b = 7) : + c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), + g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), + deserialized_(false) { } + ~MPHIndex(); + + template + bool Reset(ForwardIterator begin, ForwardIterator end); + template // must agree with Reset + uint32_t index(const Key& x) const; + uint32_t size() const { return m_; } + void clear(); + + // Serialization machinery for mmap usage. + // Serialized tables are not guaranteed to work across versions or different + // endianness (although they could easily be made to be). + uint32_t serialize_bytes_needed() const; + void serialize(char *memory) const; + bool deserialize(const char* serialized_memory); + + private: + template + bool Mapping(ForwardIterator begin, ForwardIterator end, + std::vector* edges, + std::vector* queue); + bool GenerateQueue(TriGraph* graph, std::vector* queue); + void Assigning(const std::vector& edges, + const std::vector& queue); + void Ranking(); + uint32_t Rank(uint32_t vertex) const; + + // Algorithm parameters + double c_; // Number of bits per key (? 
is it right) + uint8_t b_; // Number of bits of the kth index in the ranktable + + // Values used during generation + uint32_t m_; // edges count + uint32_t n_; // vertex count + uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ + + // Values used during search + + // Partition vertex count, derived from c parameter. + uint32_t r_; + // The array containing the minimal perfect hash function graph. Do not use + // c++ vector to make mmap based backing easier. + const uint8_t* g_; + uint32_t g_size_; + // The table used for the rank step of the minimal perfect hash function + const uint32_t* ranktable_; + uint32_t ranktable_size_; + // The selected hash seed triplet for finding the edges in the minimal + // perfect hash function graph. + uint32_t hash_seed_[3]; + + bool deserialized_; + + static const uint8_t valuemask[]; + static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { + d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); + } + static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { + return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); + } + + +}; + +// Template method needs to go in the header file. 
+template +bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { + m_ = end - begin; + r_ = static_cast(ceil((c_*m_)/3)); + if ((r_ % 2) == 0) r_ += 1; + n_ = 3*r_; + k_ = 1U << b_; + + // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; + + int iterations = 10; + std::vector edges; + std::vector queue; + while (1) { + // cerr << "Iterations missing: " << iterations << endl; + for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_; + // for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i; + if (Mapping(begin, end, &edges, &queue)) break; + else --iterations; + if (iterations == 0) break; + } + if (iterations == 0) return false; + Assigning(edges, queue); + std::vector().swap(edges); + Ranking(); + deserialized_ = false; + return true; +} + +template +bool MPHIndex::Mapping( + ForwardIterator begin, ForwardIterator end, + std::vector* edges, std::vector* queue) { + TriGraph graph(n_, m_); + for (ForwardIterator it = begin; it != end; ++it) { + uint32_t h[3]; + for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); + uint32_t v0 = h[0] % r_; + uint32_t v1 = h[1] % r_ + r_; + uint32_t v2 = h[2] % r_ + (r_ << 1); + // cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; + graph.AddEdge(TriGraph::Edge(v0, v1, v2)); + } + if (GenerateQueue(&graph, queue)) { + graph.ExtractEdgesAndClear(edges); + return true; + } + return false; +} + +template +uint32_t MPHIndex::index(const Key& key) const { + uint32_t h[3]; + for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); + h[0] = h[0] % r_; + h[1] = h[1] % r_ + r_; + h[2] = h[2] % r_ + (r_ << 1); + assert(g_size_); + // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; + assert((h[0] >> 2) > 2) > 2) >::hash_function> +class SimpleMPHIndex : public MPHIndex { + public: + template + bool Reset(ForwardIterator begin, ForwardIterator end) { + return MPHIndex::Reset(begin, end); + } + uint32_t 
index(const Key& key) { return MPHIndex::index(key); } +}; + +} // namespace cxxmph + +#endif // __CXXMPH_MPH_INDEX_H__ diff --git a/cxxmph/mph_table_test.cc b/cxxmph/mph_index_test.cc similarity index 66% rename from cxxmph/mph_table_test.cc rename to cxxmph/mph_index_test.cc index c9e91a8..421369c 100644 --- a/cxxmph/mph_table_test.cc +++ b/cxxmph/mph_index_test.cc @@ -3,11 +3,11 @@ #include #include -#include "mph_table.h" +#include "mph_index.h" using std::string; using std::vector; -using cxxmph::SimpleMPHTable; +using cxxmph::SimpleMPHIndex; int main(int argc, char** argv) { @@ -23,20 +23,20 @@ int main(int argc, char** argv) { keys.push_back("diogo"); keys.push_back("algume"); - SimpleMPHTable mph_table; - assert(mph_table.Reset(keys.begin(), keys.end())); + SimpleMPHIndex mph_index; + assert(mph_index.Reset(keys.begin(), keys.end())); vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { - ids.push_back(mph_table.index(keys[i])); + ids.push_back(mph_index.index(keys[i])); cerr << " " << *(ids.end() - 1); } cerr << endl; sort(ids.begin(), ids.end()); for (vector::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast::value_type>(i)); - char* serialized = new char[mph_table.serialize_bytes_needed()]; - mph_table.serialize(serialized); - SimpleMPHTable other_mph_table; - other_mph_table.deserialize(serialized); + char* serialized = new char[mph_index.serialize_bytes_needed()]; + mph_index.serialize(serialized); + SimpleMPHIndex other_mph_index; + other_mph_index.deserialize(serialized); } diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 398db3e..1c01b64 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -4,7 +4,7 @@ #include // for std::pair #include "MurmurHash2.h" -#include "mph_table.h" +#include "mph_index.h" namespace cxxmph { @@ -70,7 +70,7 @@ class mph_map { void rehash(); std::vector values_; - SimpleMPHTable::hash_function> table_; + SimpleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to no hold a 
copy of the key typedef typename std::unordered_map slack_type; slack_type slack_; @@ -93,8 +93,8 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { if (it != end()) return std::make_pair(it, false); values_.push_back(x); slack_.insert(std::make_pair(x.first, values_.size() - 1)); - if (slack_.size() == table_.size() || - (slack_.size() >= 256 && table_.size() == 0)) { + if (slack_.size() == index_.size() || + (slack_.size() >= 256 && index_.size() == 0)) { rehash(); } it = find(x.first); @@ -104,14 +104,14 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { MPH_MAP_METHOD_DECL(void_type, rehash)() { if (values_.empty()) return; slack_type().swap(slack_); - bool success = table_.Reset( + bool success = index_.Reset( make_iterator_first(values_.begin()), make_iterator_first(values_.end())); assert(success); std::vector new_values(values_.size()); for (const_iterator it = values_.begin(), end = values_.end(); it != end; ++it) { - size_type id = table_.index(it->first); + size_type id = index_.index(it->first); assert(id < new_values.size()); new_values[id] = *it; } @@ -127,7 +127,7 @@ MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } MPH_MAP_METHOD_DECL(void_type, clear)() { values_.clear(); slack_.clear(); - table_.clear(); + index_.clear(); } MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { @@ -145,8 +145,8 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { typename slack_type::const_iterator it = slack_.find(k); if (it != slack_.end()) return values_.begin() + it->second; } - if (table_.size() == 0) return end(); - size_type id = table_.index(k); + if (index_.size() == 0) return end(); + size_type id = index_.index(k); if (key_equal()(values_[id].first, k)) { return values_.begin() + id; } @@ -157,8 +157,8 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { typename slack_type::const_iterator it = slack_.find(k); if (it != slack_.end()) return values_.begin() + 
it->second; } - if (table_.size() == 0) return end(); - size_type id = table_.index(k); + if (index_.size() == 0) return end(); + size_type id = index_.index(k); if (key_equal()(values_[id].first, k)) { return values_.begin() + id; } diff --git a/cxxmph/mph_table.h b/cxxmph/mph_table.h index 51d26ea..234540d 100644 --- a/cxxmph/mph_table.h +++ b/cxxmph/mph_table.h @@ -1,173 +1,16 @@ -#ifndef __CXXMPH_MPH_TABLE_H__ -#define __CXXMPH_MPH_TABLE_H__ +#include "mph_index.h" -// Minimal perfect hash abstraction implementing the BDZ algorithm - -#include - -#include -#include -#include // for std::hash -#include - -#include - -using std::cerr; -using std::endl; - -#include "seeded_hash.h" -#include "trigraph.h" - -namespace cxxmph { +// String to string map working on mmap'ed memory class MPHTable { public: - MPHTable(double c = 1.23, uint8_t b = 7) : - c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), - g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), - deserialized_(false) { } - ~MPHTable(); - - template - bool Reset(ForwardIterator begin, ForwardIterator end); - template // must agree with Reset - uint32_t index(const Key& x) const; - uint32_t size() const { return m_; } - void clear(); - - // Serialization machinery for mmap usage. - // Serialized tables are not guaranteed to work across versions or different - // endianness (although they could easily be made to be). - uint32_t serialize_bytes_needed() const; - void serialize(char *memory) const; - bool deserialize(const char* serialized_memory); - - private: - template - bool Mapping(ForwardIterator begin, ForwardIterator end, - std::vector* edges, - std::vector* queue); - bool GenerateQueue(TriGraph* graph, std::vector* queue); - void Assigning(const std::vector& edges, - const std::vector& queue); - void Ranking(); - uint32_t Rank(uint32_t vertex) const; - - // Algorithm parameters - double c_; // Number of bits per key (? 
is it right) - uint8_t b_; // Number of bits of the kth index in the ranktable - - // Values used during generation - uint32_t m_; // edges count - uint32_t n_; // vertex count - uint32_t k_; // kth index in ranktable, $k = log_2(n=3r)\varepsilon$ - - // Values used during search - - // Partition vertex count, derived from c parameter. - uint32_t r_; - // The array containing the minimal perfect hash function graph. Do not use - // c++ vector to make mmap based backing easier. - const uint8_t* g_; - uint32_t g_size_; - // The table used for the rank step of the minimal perfect hash function - const uint32_t* ranktable_; - uint32_t ranktable_size_; - // The selected hash seed triplet for finding the edges in the minimal - // perfect hash function graph. - uint32_t hash_seed_[3]; - - bool deserialized_; - - static const uint8_t valuemask[]; - static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { - d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); - } - static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { - return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); - } - - -}; - -// Template method needs to go in the header file. 
-template -bool MPHTable::Reset(ForwardIterator begin, ForwardIterator end) { - m_ = end - begin; - r_ = static_cast(ceil((c_*m_)/3)); - if ((r_ % 2) == 0) r_ += 1; - n_ = 3*r_; - k_ = 1U << b_; - - // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; - - int iterations = 10; - std::vector edges; - std::vector queue; - while (1) { - // cerr << "Iterations missing: " << iterations << endl; - for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_; - // for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i; - if (Mapping(begin, end, &edges, &queue)) break; - else --iterations; - if (iterations == 0) break; - } - if (iterations == 0) return false; - Assigning(edges, queue); - std::vector().swap(edges); - Ranking(); - deserialized_ = false; - return true; -} - -template -bool MPHTable::Mapping( - ForwardIterator begin, ForwardIterator end, - std::vector* edges, std::vector* queue) { - TriGraph graph(n_, m_); - for (ForwardIterator it = begin; it != end; ++it) { - uint32_t h[3]; - for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); - uint32_t v0 = h[0] % r_; - uint32_t v1 = h[1] % r_ + r_; - uint32_t v2 = h[2] % r_ + (r_ << 1); - // cerr << "Key: " << *it << " edge " << it - begin << " (" << v0 << "," << v1 << "," << v2 << ")" << endl; - graph.AddEdge(TriGraph::Edge(v0, v1, v2)); - } - if (GenerateQueue(&graph, queue)) { - graph.ExtractEdgesAndClear(edges); - return true; - } - return false; -} - -template -uint32_t MPHTable::index(const Key& key) const { - uint32_t h[3]; - for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); - h[0] = h[0] % r_; - h[1] = h[1] % r_ + r_; - h[2] = h[2] % r_ + (r_ << 1); - assert(g_size_); - // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; - assert((h[0] >> 2) > 2) > 2) >::hash_function> -class SimpleMPHTable : public MPHTable { - public: + typedef StringPiece key_type; + typedef StringPiece data_type; + typedef std::pair value_type; template - bool 
Reset(ForwardIterator begin, ForwardIterator end) { - return MPHTable::Reset(begin, end); - } - uint32_t index(const Key& key) { return MPHTable::index(key); } + bool Reset(ForwardIterator begin, ForwardIterator end); + private: + char* data_; + vector offsets_; + MPHIndex index_; }; - -} // namespace cxxmph - -#endif // __CXXMPH_MPH_TABLE_H__ diff --git a/src/bm_numbers.c b/src/bm_numbers.c index 7428bc5..7c6abb5 100644 --- a/src/bm_numbers.c +++ b/src/bm_numbers.c @@ -1,6 +1,10 @@ #include #include +#include +using __gnu_cxx::hash_set; +static const char cxx_name = "__gnu_cxx::hash_set"; + #include "bitbool.h" #include "cmph.h" #include "cmph_benchmark.h" @@ -71,8 +75,8 @@ void bm_search(CMPH_ALGO algo, int iters) { cmph_t* mphf = NULL; - snprintf(mphf_name, 128, "%s:%u", cmph_names[algo], iters); - mphf = lsmap_search(g_created_mphf, mphf_name); + snprintf(mphf_name, 128, "%s:%u", cxx_name, iters); + mphf = (cmph_t*)lsmap_search(g_created_mphf, mphf_name); cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); @@ -102,6 +106,49 @@ DECLARE_ALGO(CMPH_BRZ); DECLARE_ALGO(CMPH_FCH); DECLARE_ALGO(CMPH_BDZ); +void bm_create_ext_hash_set(int iters) { + cmph_uint32 i = 0; + + if (iters > g_numbers_len) { + fprintf(stderr, "No input with proper size."); + exit(-1); + } + + hash_set* ext_hash_set = new hash_set; + for (i = 0; i < iters; ++i) { + ext_hash_set->insert(g_numbers[i]); + } + lsmap_append(g_created_mphf, cxx_name, ext_hash_set); +} + +void bm_search_ext_hash_set(int iters) { + cmph_uint32 i = 0; + + if (iters > g_numbers_len) { + fprintf(stderr, "No input with proper size."); + exit(-1); + } + + snprintf(mphf_name, 128, "%s:%u", hash_count, iters); + mphf = (__gnu_cxx::hash_set*)lsmap_search(g_created_mphf, mphf_name); + + cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); + cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); + + 
for (i = 0; i < iters * 100; ++i) { + cmph_uint32 pos = random() % iters; + const char* buf = (const char*)(g_numbers + pos); + cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32)); + ++count[pos]; + ++hash_count[h]; + } + + // Verify correctness later. + lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count); + lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count); +} +} + int main(int argc, char** argv) { g_numbers_len = 1000 * 1000; g_numbers = random_numbers_vector_new(g_numbers_len); From bbfcdeb5a6c038330d2924cc1985425ea9d5aa3d Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 23 May 2011 17:18:24 -0700 Subject: [PATCH 42/89] Compiles with clang in mac. --- cxxmph/benchmark.h | 4 ++-- cxxmph/bm_map.cc | 14 +++++++++----- cxxmph/mph_index.h | 4 ++-- cxxmph/mph_map.h | 22 +++++++++++++--------- cxxmph/seeded_hash.h | 30 +++++++++++++++--------------- 5 files changed, 41 insertions(+), 33 deletions(-) diff --git a/cxxmph/benchmark.h b/cxxmph/benchmark.h index edd3fb9..cecbc2f 100644 --- a/cxxmph/benchmark.h +++ b/cxxmph/benchmark.h @@ -18,9 +18,9 @@ class Benchmark { static void RunAll(); protected: - virtual bool SetUp() {}; + virtual bool SetUp() { return true; }; virtual void Run() = 0; - virtual bool TearDown() {}; + virtual bool TearDown() { return true; }; private: std::string name_; diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 1d26847..5e79fbc 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -1,18 +1,19 @@ #include -#include +#include #include "bm_common.h" #include "mph_map.h" using cxxmph::mph_map; using std::string; -using std::unordered_map; +using std::tr1::unordered_map; namespace cxxmph { template class BM_MapCreate : public UrlsBenchmark { public: + BM_MapCreate(const string& urls_file) : UrlsBenchmark(urls_file) { } virtual void Run() { MapType mymap; for (auto it = urls_.begin(); it != urls_.end(); ++it) { @@ -24,17 +25,20 @@ class BM_MapCreate : public UrlsBenchmark { template class 
BM_MapSearch : public SearchUrlsBenchmark { public: + BM_MapSearch(const std::string& urls_file, int nsearches) + : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { - auto value = mymap[*it]; + auto value = mymap_[it->ToString()]; } } protected: - virtual void SetUp() { + virtual bool SetUp() { for (auto it = urls_.begin(); it != urls_.end(); ++it) { mymap_[*it] = *it; } - mymap_.resize(mymap.size()); + mymap_.rehash(mymap_.bucket_count()); + return true; } MapType mymap_; }; diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 77b6ea4..5b0f6c6 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -7,7 +7,7 @@ #include #include -#include // for std::hash +#include // for std::tr1::hash #include #include @@ -158,7 +158,7 @@ uint32_t MPHIndex::index(const Key& key) const { return Rank(vertex); } -template >::hash_function> +template >::hash_function> class SimpleMPHIndex : public MPHIndex { public: template diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 1c01b64..d52f617 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -1,5 +1,5 @@ #include -#include +#include #include #include // for std::pair @@ -8,12 +8,14 @@ namespace cxxmph { +using std::tr1::unordered_map; + // Save on repetitive typing. 
#define MPH_MAP_TMPL_SPEC template #define MPH_MAP_CLASS_SPEC mph_map #define MPH_MAP_METHOD_DECL(r, m) MPH_MAP_TMPL_SPEC typename MPH_MAP_CLASS_SPEC::r MPH_MAP_CLASS_SPEC::m -template , class EqualKey = std::equal_to, class Alloc = std::allocator > +template , class EqualKey = std::equal_to, class Alloc = std::allocator > class mph_map { public: typedef Key key_type; @@ -52,7 +54,8 @@ class mph_map { const_iterator find(const key_type& k) const; data_type& operator[](const key_type &k); - void pack() { rehash(); } + size_type bucket_count() const { return size(); } + void rehash(size_type nbuckets /*ignored*/) { pack(); } private: template @@ -68,11 +71,11 @@ class mph_map { return iterator_first(it); } - void rehash(); + void pack(); std::vector values_; SimpleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to no hold a copy of the key - typedef typename std::unordered_map slack_type; + typedef unordered_map slack_type; slack_type slack_; }; @@ -82,7 +85,7 @@ bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { } MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() { - rehash(); + pack(); } MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { @@ -95,13 +98,13 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { slack_.insert(std::make_pair(x.first, values_.size() - 1)); if (slack_.size() == index_.size() || (slack_.size() >= 256 && index_.size() == 0)) { - rehash(); + pack(); } it = find(x.first); return std::make_pair(it, true); } -MPH_MAP_METHOD_DECL(void_type, rehash)() { +MPH_MAP_METHOD_DECL(void_type, pack)() { if (values_.empty()) return; slack_type().swap(slack_); bool success = index_.Reset( @@ -123,6 +126,7 @@ MPH_MAP_METHOD_DECL(iterator, end)() { return values_.end(); } MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); } MPH_MAP_METHOD_DECL(const_iterator, end)() const { return values_.end(); } MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } 
+MPH_MAP_METHOD_DECL(size_type, size)() const { return values_.size(); } MPH_MAP_METHOD_DECL(void_type, clear)() { values_.clear(); @@ -132,7 +136,7 @@ MPH_MAP_METHOD_DECL(void_type, clear)() { MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { values_.erase(pos); - rehash(); + pack(); } MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { iterator it = find(k); diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index 7446813..d732d62 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -4,7 +4,7 @@ #include // for uint32_t and friends #include -#include // for std::hash +#include // for std::tr1::hash #include "MurmurHash2.h" #include "stringpiece.h" @@ -52,34 +52,34 @@ struct seeded_hash_function { template struct seeded_hash { typedef seeded_hash_function hash_function; }; -// Use Murmur2 instead for all types defined in std::hash, plus +// Use Murmur2 instead for all types defined in std::tr1::hash, plus // std::string which is commonly extended. -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { 
typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; } // namespace cxxmph From b10fe56a4e1b4520ed5f0dbbee7affeeae0f790f Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 13 Jun 2011 02:16:19 -0300 Subject: [PATCH 43/89] All compiles in the mac. --- cxxmph/Makefile.am | 6 ++--- cxxmph/benchmark.cc | 34 +++++++++++++++++++------- cxxmph/bm_map.cc | 11 +++++---- cxxmph/mph_index.h | 4 ++++ cxxmph/seeded_hash.h | 2 ++ cxxmph/stringpiece.h | 18 +++++++------- src/bm_numbers.c | 57 ++++---------------------------------------- src/cmph_benchmark.c | 6 +++-- 8 files changed, 60 insertions(+), 78 deletions(-) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 801d2d0..c02e1c9 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,6 +1,6 @@ AM_CXXFLAGS='-std=c++0x' TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_map_test mph_index_test trigraph_test +check_PROGRAMS = mph_map_test mph_index_test # trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la @@ -15,8 +15,8 @@ mph_map_test_SOURCES = mph_map_test.cc mph_index_test_LDADD = libcxxmph.la mph_index_test_SOURCES = mph_index_test.cc -trigraph_test_LDADD = libcxxmph.la -trigraph_test_SOURCES = trigraph_test.cc +# trigraph_test_LDADD = libcxxmph.la +# trigraph_test_SOURCES = trigraph_test.cc bm_index_LDADD = libcxxmph.la bm_index_SOURCES = bm_common.cc bm_index.cc diff --git a/cxxmph/benchmark.cc b/cxxmph/benchmark.cc index 644bdc9..70175e1 100644 --- a/cxxmph/benchmark.cc +++ b/cxxmph/benchmark.cc @@ -3,15 
+3,22 @@ #include #include #include +#include #include #include +#include #include +#include #include using std::cerr; +using std::cout; using std::endl; +using std::setfill; +using std::setw; using std::string; +using std::ostringstream; using std::vector; namespace { @@ -42,6 +49,14 @@ int timeval_subtract ( return x->tv_sec < y->tv_sec; } +// C++ iostream is terrible for formatting. +string timeval_to_string(timeval tv) { + ostringstream out; + out << setfill(' ') << setw(3) << tv.tv_sec << '.'; + out << setfill('0') << setw(6) << tv.tv_usec; + return out.str(); +} + struct rusage getrusage_or_die() { struct rusage rs; int ret = getrusage(RUSAGE_SELF, &rs); @@ -92,11 +107,14 @@ namespace cxxmph { /* static */ void Benchmark::RunAll() { for (int i = 0; i < g_benchmarks.size(); ++i) { - Benchmark* bm = g_benchmarks[i]; - bm->SetUp(); + std::auto_ptr bm(g_benchmarks[i]); + if (!bm->SetUp()) { + cerr << "Set up phase for benchmark " + << bm->name() << " failed." << endl; + continue; + } bm->MeasureRun(); bm->TearDown(); - delete bm; } } @@ -114,11 +132,11 @@ void Benchmark::MeasureRun() { struct timeval wtime; timeval_subtract(&wtime, &walltime_end, &walltime_begin); - printf("Benchmark: %s\n", name().c_str()); - printf("CPU User time : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); - printf("CPU System time: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); - printf("Wall clock time: %ld.%06ld\n", wtime.tv_sec, wtime.tv_usec); - printf("\n"); + cout << "Benchmark: " << name_ << endl; + cout << "CPU User time : " << timeval_to_string(utime) << endl; + cout << "CPU System time: " << timeval_to_string(stime) << endl; + cout << "Wall clock time: " << timeval_to_string(wtime) << endl; + cout << endl; } } // namespace cxxmph diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 5e79fbc..423e329 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -29,11 +29,12 @@ class BM_MapSearch : public SearchUrlsBenchmark { : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void 
Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { - auto value = mymap_[it->ToString()]; + mymap_.find(*it); } } protected: virtual bool SetUp() { + if (!SearchUrlsBenchmark::SetUp()) return false; for (auto it = urls_.begin(); it != urls_.end(); ++it) { mymap_[*it] = *it; } @@ -48,9 +49,9 @@ class BM_MapSearch : public SearchUrlsBenchmark { using namespace cxxmph; int main(int argc, char** argv) { - Benchmark::Register(new BM_MapCreate>("URLS100k")); - Benchmark::Register(new BM_MapCreate>("URLS100k")); - Benchmark::Register(new BM_MapSearch>("URLS100k", 1000 * 1000)); - Benchmark::Register(new BM_MapSearch>("URLS100k", 1000 * 1000)); + Benchmark::Register(new BM_MapCreate>("URLS100k")); + Benchmark::Register(new BM_MapCreate>("URLS100k")); + Benchmark::Register(new BM_MapSearch>("URLS100k", 1000* 1000)); + Benchmark::Register(new BM_MapSearch>("URLS100k", 1000* 1000)); Benchmark::RunAll(); } diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 5b0f6c6..d03dd92 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -93,6 +93,10 @@ class MPHIndex { // Template method needs to go in the header file. 
template bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { + if (end == begin) { + clear(); + return true; + } m_ = end - begin; r_ = static_cast(ceil((c_*m_)/3)); if ((r_ % 2) == 0) r_ += 1; diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index d732d62..99a3ca6 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -60,6 +60,8 @@ template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; +template <> struct seeded_hash > +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; diff --git a/cxxmph/stringpiece.h b/cxxmph/stringpiece.h index fdd8f75..ee6d125 100644 --- a/cxxmph/stringpiece.h +++ b/cxxmph/stringpiece.h @@ -145,32 +145,34 @@ class StringPiece { StringPiece substr(size_type pos, size_type n = npos) const; }; -} // namespace cxxmph +inline bool operator==(const StringPiece& x, const StringPiece& y) { + return x.length() == y.length() && memcmp(x.data(), y.data(), x.length()) == 0; +} -bool operator==(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y); - -inline bool operator!=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { +inline bool operator!=(const StringPiece& x, const StringPiece& y) { return !(x == y); } -inline bool operator<(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { +inline bool operator<(const StringPiece& x, const StringPiece& y) { const int r = memcmp(x.data(), y.data(), std::min(x.size(), y.size())); return ((r < 0) || ((r == 0) && (x.size() < y.size()))); } -inline bool operator>(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { +inline bool operator>(const StringPiece& x, const StringPiece& y) { return y < x; } -inline bool operator<=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { +inline bool operator<=(const StringPiece& x, const StringPiece& y) { return 
!(x > y); } -inline bool operator>=(const cxxmph::StringPiece& x, const cxxmph::StringPiece& y) { +inline bool operator>=(const StringPiece& x, StringPiece& y) { return !(x < y); } +} // namespace cxxmph + // allow StringPiece to be logged extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece); diff --git a/src/bm_numbers.c b/src/bm_numbers.c index 7c6abb5..cd3aa74 100644 --- a/src/bm_numbers.c +++ b/src/bm_numbers.c @@ -1,10 +1,6 @@ #include #include -#include -using __gnu_cxx::hash_set; -static const char cxx_name = "__gnu_cxx::hash_set"; - #include "bitbool.h" #include "cmph.h" #include "cmph_benchmark.h" @@ -71,12 +67,12 @@ void bm_create(CMPH_ALGO algo, int iters) { void bm_search(CMPH_ALGO algo, int iters) { int i = 0; - char mphf_name[128]; + char *mphf_name; cmph_t* mphf = NULL; - - snprintf(mphf_name, 128, "%s:%u", cxx_name, iters); + mphf_name = create_lsmap_key(algo, iters); mphf = (cmph_t*)lsmap_search(g_created_mphf, mphf_name); + free(mphf_name); cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); cmph_uint32* hash_count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); @@ -106,49 +102,6 @@ DECLARE_ALGO(CMPH_BRZ); DECLARE_ALGO(CMPH_FCH); DECLARE_ALGO(CMPH_BDZ); -void bm_create_ext_hash_set(int iters) { - cmph_uint32 i = 0; - - if (iters > g_numbers_len) { - fprintf(stderr, "No input with proper size."); - exit(-1); - } - - hash_set* ext_hash_set = new hash_set; - for (i = 0; i < iters; ++i) { - ext_hash_set->insert(g_numbers[i]); - } - lsmap_append(g_created_mphf, cxx_name, ext_hash_set); -} - -void bm_search_ext_hash_set(int iters) { - cmph_uint32 i = 0; - - if (iters > g_numbers_len) { - fprintf(stderr, "No input with proper size."); - exit(-1); - } - - snprintf(mphf_name, 128, "%s:%u", hash_count, iters); - mphf = (__gnu_cxx::hash_set*)lsmap_search(g_created_mphf, mphf_name); - - cmph_uint32* count = (cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); - cmph_uint32* hash_count = 
(cmph_uint32*)malloc(sizeof(cmph_uint32)*iters); - - for (i = 0; i < iters * 100; ++i) { - cmph_uint32 pos = random() % iters; - const char* buf = (const char*)(g_numbers + pos); - cmph_uint32 h = cmph_search(mphf, buf, sizeof(cmph_uint32)); - ++count[pos]; - ++hash_count[h]; - } - - // Verify correctness later. - lsmap_append(g_expected_probes, create_lsmap_key(algo, iters), count); - lsmap_append(g_mphf_probes, create_lsmap_key(algo, iters), hash_count); -} -} - int main(int argc, char** argv) { g_numbers_len = 1000 * 1000; g_numbers = random_numbers_vector_new(g_numbers_len); @@ -162,8 +115,8 @@ int main(int argc, char** argv) { BM_REGISTER(bm_search_CMPH_CHM, 1000 * 1000); // BM_REGISTER(bm_create_CMPH_BRZ, 1000 * 1000); // BM_REGISTER(bm_search_CMPH_BRZ, 1000 * 1000); - BM_REGISTER(bm_create_CMPH_FCH, 1000 * 1000); - BM_REGISTER(bm_search_CMPH_FCH, 1000 * 1000); +// BM_REGISTER(bm_create_CMPH_FCH, 1000 * 1000); +// BM_REGISTER(bm_search_CMPH_FCH, 1000 * 1000); BM_REGISTER(bm_create_CMPH_BDZ, 1000 * 1000); BM_REGISTER(bm_search_CMPH_BDZ, 1000 * 1000); run_benchmarks(argc, argv); diff --git a/src/cmph_benchmark.c b/src/cmph_benchmark.c index a67f78b..0023e2f 100644 --- a/src/cmph_benchmark.c +++ b/src/cmph_benchmark.c @@ -111,8 +111,10 @@ void bm_end(const char* name) { timeval_subtract(&stime, &benchmark->end.ru_stime, &benchmark->begin.ru_stime); printf("Benchmark: %s\n", benchmark->name); - printf("User time used : %ld.%06ld\n", utime.tv_sec, utime.tv_usec); - printf("System time used: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); + printf("User time used : %ld.%06ld\n", + utime.tv_sec, (long int)utime.tv_usec); + printf("System time used: %ld.%06ld\n", + stime.tv_sec, (long int)stime.tv_usec); printf("\n"); } From c749ab444b3a85b3228c650cbb64cffc7dbbc929 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 13 Jun 2011 03:14:15 -0300 Subject: [PATCH 44/89] Added bm_common and bm_index missing files. 
--- cxxmph/bm_common.cc | 62 ++++++++++++++++++++++++++++++++++ cxxmph/bm_common.h | 62 ++++++++++++++++++++++++++++++++++ cxxmph/bm_index.cc | 81 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 205 insertions(+) create mode 100644 cxxmph/bm_common.cc create mode 100644 cxxmph/bm_common.h create mode 100644 cxxmph/bm_index.cc diff --git a/cxxmph/bm_common.cc b/cxxmph/bm_common.cc new file mode 100644 index 0000000..f0e0336 --- /dev/null +++ b/cxxmph/bm_common.cc @@ -0,0 +1,62 @@ +#include +#include +#include + +#include "bm_common.h" + +using std::cerr; +using std::endl; +using std::set; +using std::string; +using std::vector; + +namespace cxxmph { + +bool UrlsBenchmark::SetUp() { + vector urls; + std::ifstream f(urls_file_.c_str()); + if (!f.is_open()) { + cerr << "Failed to open urls file " << urls_file_ << endl; + return false; + } + string buffer; + while(std::getline(f, buffer)) urls.push_back(buffer); + set unique(urls.begin(), urls.end()); + if (unique.size() != urls.size()) { + cerr << "Input file has repeated keys." 
<< endl; + return false; + } + urls.swap(urls_); + return true; +} + +bool SearchUrlsBenchmark::SetUp() { + if (!UrlsBenchmark::SetUp()) return false; + random_.resize(nsearches_); + for (int i = 0; i < nsearches_; ++i) { + random_[i] = urls_[random() % urls_.size()]; + } + return true; +} + +bool Uint64Benchmark::SetUp() { + set unique; + for (int i = 0; i < count_; ++i) { + uint64_t v; + do { v = random(); } while (unique.find(v) != unique.end()); + values_.push_back(v); + unique.insert(v); + } + return true; +} + +bool SearchUint64Benchmark::SetUp() { + if (!Uint64Benchmark::SetUp()) return false; + random_.resize(nsearches_); + for (int i = 0; i < nsearches_; ++i) { + random_.push_back(values_[random() % values_.size()]); + } + return true; +} + +} // namespace cxxmph diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h new file mode 100644 index 0000000..70a5f5e --- /dev/null +++ b/cxxmph/bm_common.h @@ -0,0 +1,62 @@ +#include "stringpiece.h" + +#include +#include +#include // for std::tr1::hash +#include "MurmurHash2.h" + +#include "benchmark.h" + +namespace std { +namespace tr1 { +template <> struct hash { + uint32_t operator()(const cxxmph::StringPiece& k) const { + return cxxmph::MurmurHash2(k.data(), k.length(), 1); + } +}; +} // namespace tr1 +} // namespace std + +namespace cxxmph { + +class UrlsBenchmark : public Benchmark { + public: + UrlsBenchmark(const std::string& urls_file) : urls_file_(urls_file) { } + protected: + virtual bool SetUp(); + const std::string urls_file_; + std::vector urls_; +}; + +class SearchUrlsBenchmark : public UrlsBenchmark { + public: + SearchUrlsBenchmark(const std::string& urls_file, uint32_t nsearches) + : UrlsBenchmark(urls_file), nsearches_(nsearches) {} + protected: + virtual bool SetUp(); + const uint32_t nsearches_; + std::vector random_; +}; + +class Uint64Benchmark : public Benchmark { + public: + Uint64Benchmark(uint32_t count) : count_(count) { } + virtual void Run() {} + protected: + virtual bool SetUp(); + 
const uint32_t count_; + std::vector values_; +}; + +class SearchUint64Benchmark : public Uint64Benchmark { + public: + SearchUint64Benchmark(uint32_t count, uint32_t nsearches) + : Uint64Benchmark(count), nsearches_(nsearches) { } + virtual void Run() {}; + protected: + virtual bool SetUp(); + const uint32_t nsearches_; + std::vector random_; +}; + +} // namespace cxxmph diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc new file mode 100644 index 0000000..03cb222 --- /dev/null +++ b/cxxmph/bm_index.cc @@ -0,0 +1,81 @@ +#include +#include +#include + +#include "bm_common.h" +#include "StringPiece.h" +#include "mph_index.h" + +using namespace cxxmph; + +using std::string; +using std::tr1::unordered_set; + +class BM_MPHIndexCreate : public UrlsBenchmark { + public: + BM_MPHIndexCreate(const std::string& urls_file) + : UrlsBenchmark(urls_file) { } + protected: + virtual void Run() { + SimpleMPHIndex index; + index.Reset(urls_.begin(), urls_.end()); + } +}; + +class BM_STLIndexCreate : public UrlsBenchmark { + public: + BM_STLIndexCreate(const std::string& urls_file) + : UrlsBenchmark(urls_file) { } + protected: + virtual void Run() { + unordered_set index; + index.insert(urls_.begin(), urls_.end()); + } +}; + +class BM_MPHIndexSearch : public SearchUrlsBenchmark { + public: + BM_MPHIndexSearch(const std::string& urls_file, int nsearches) + : SearchUrlsBenchmark(urls_file, nsearches) { } + virtual void Run() { + while (true) { + for (auto it = random_.begin(); it != random_.end(); ++it) { + index_.index(*it); + } + } + } + protected: + virtual bool SetUp () { + if (!SearchUrlsBenchmark::SetUp()) return false; + index_.Reset(urls_.begin(), urls_.end()); + return true; + } + SimpleMPHIndex index_; +}; + +class BM_STLIndexSearch : public SearchUrlsBenchmark { + public: + BM_STLIndexSearch(const std::string& urls_file, int nsearches) + : SearchUrlsBenchmark(urls_file, nsearches) { } + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + 
index_.find(*it); // - index_.begin(); + } + } + protected: + virtual bool SetUp () { + if (!SearchUrlsBenchmark::SetUp()) return false; + std::tr1::unordered_set(urls_.begin(), urls_.end()).swap(index_); + return true; + } + std::tr1::unordered_set index_; +}; + +int main(int argc, char** argv) { + Benchmark::Register(new BM_MPHIndexCreate("URLS100k")); + Benchmark::Register(new BM_STLIndexCreate("URLS100k")); + Benchmark::Register(new BM_MPHIndexSearch("URLS100k", 1000*1000)); + Benchmark::Register(new BM_STLIndexSearch("URLS100k", 1000*1000)); + Benchmark::RunAll(); + return 0; +} From 0846177267157765ee466c93b4c8b5710901fd60 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 14 Jun 2011 02:24:40 -0300 Subject: [PATCH 45/89] All tests pass. --- cxxmph/bm_common.h | 2 +- cxxmph/bm_index.cc | 34 ++++++++++++++--------- cxxmph/bm_map.cc | 59 ++++++++++++++++++++++++++++++++++------ cxxmph/mph_index.h | 3 +- cxxmph/mph_index_test.cc | 3 +- cxxmph/mph_map.h | 37 +++++++++++++++---------- cxxmph/seeded_hash.h | 9 +++++- 7 files changed, 106 insertions(+), 41 deletions(-) diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index 70a5f5e..fc95b21 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -56,7 +56,7 @@ class SearchUint64Benchmark : public Uint64Benchmark { protected: virtual bool SetUp(); const uint32_t nsearches_; - std::vector random_; + std::vector random_; }; } // namespace cxxmph diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index 03cb222..f92972b 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -1,15 +1,15 @@ #include #include -#include +#include #include "bm_common.h" -#include "StringPiece.h" +#include "stringpiece.h" #include "mph_index.h" using namespace cxxmph; using std::string; -using std::tr1::unordered_set; +using std::tr1::unordered_map; class BM_MPHIndexCreate : public UrlsBenchmark { public: @@ -28,8 +28,11 @@ class BM_STLIndexCreate : public UrlsBenchmark { : UrlsBenchmark(urls_file) { } protected: 
virtual void Run() { - unordered_set index; - index.insert(urls_.begin(), urls_.end()); + unordered_map index; + int idx = 0; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + index.insert(make_pair(*it, idx++)); + } } }; @@ -38,10 +41,10 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { BM_MPHIndexSearch(const std::string& urls_file, int nsearches) : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void Run() { - while (true) { for (auto it = random_.begin(); it != random_.end(); ++it) { - index_.index(*it); - } + auto idx = index_.index(*it); + // Collision check to be fair with STL + if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; } } protected: @@ -59,23 +62,28 @@ class BM_STLIndexSearch : public SearchUrlsBenchmark { : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { - index_.find(*it); // - index_.begin(); + auto idx = index_.find(*it); } } protected: virtual bool SetUp () { if (!SearchUrlsBenchmark::SetUp()) return false; - std::tr1::unordered_set(urls_.begin(), urls_.end()).swap(index_); + unordered_map index; + int idx = 0; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + index.insert(make_pair(*it, idx++)); + } + index.swap(index_); return true; } - std::tr1::unordered_set index_; + std::tr1::unordered_map index_; }; int main(int argc, char** argv) { Benchmark::Register(new BM_MPHIndexCreate("URLS100k")); Benchmark::Register(new BM_STLIndexCreate("URLS100k")); - Benchmark::Register(new BM_MPHIndexSearch("URLS100k", 1000*1000)); - Benchmark::Register(new BM_STLIndexSearch("URLS100k", 1000*1000)); + Benchmark::Register(new BM_MPHIndexSearch("URLS100k", 100*1000*1000)); + Benchmark::Register(new BM_STLIndexSearch("URLS100k", 100*1000*1000)); Benchmark::RunAll(); return 0; } diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 423e329..12dd2f1 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -10,10 +10,25 @@ using 
std::tr1::unordered_map; namespace cxxmph { +uint64_t myfind(const unordered_map& mymap, const uint64_t& k) { + return mymap.find(k)->second; +} +uint64_t myfind(const mph_map& mymap, const uint64_t& k) { + return mymap.index(k); +} + +const StringPiece& myfind(const unordered_map& mymap, const StringPiece& k) { + return mymap.find(k)->second; +} +StringPiece myfind(const mph_map& mymap, const StringPiece& k) { + auto it = mymap.find(k); + return it->second; +} + template -class BM_MapCreate : public UrlsBenchmark { +class BM_CreateUrls : public UrlsBenchmark { public: - BM_MapCreate(const string& urls_file) : UrlsBenchmark(urls_file) { } + BM_CreateUrls(const string& urls_file) : UrlsBenchmark(urls_file) { } virtual void Run() { MapType mymap; for (auto it = urls_.begin(); it != urls_.end(); ++it) { @@ -23,13 +38,13 @@ class BM_MapCreate : public UrlsBenchmark { }; template -class BM_MapSearch : public SearchUrlsBenchmark { +class BM_SearchUrls : public SearchUrlsBenchmark { public: - BM_MapSearch(const std::string& urls_file, int nsearches) + BM_SearchUrls(const std::string& urls_file, int nsearches) : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { - mymap_.find(*it); + auto idx = myfind(mymap_, *it); } } protected: @@ -44,14 +59,40 @@ class BM_MapSearch : public SearchUrlsBenchmark { MapType mymap_; }; +template +class BM_SearchUint64 : public SearchUint64Benchmark { + public: + BM_SearchUint64() : SearchUint64Benchmark(1000*1000, 1000*1000) { } + virtual bool SetUp() { + if (!SearchUint64Benchmark::SetUp()) return false; + for (int i = 0; i < values_.size(); ++i) { + mymap_[values_[i]] = values_[i]; + } + mymap_.rehash(mymap_.bucket_count()); + return true; + } + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto v = myfind(mymap_, *it); + } + } + MapType mymap_; +}; + } // namespace cxxmph using namespace cxxmph; int main(int argc, char** 
argv) { - Benchmark::Register(new BM_MapCreate>("URLS100k")); - Benchmark::Register(new BM_MapCreate>("URLS100k")); - Benchmark::Register(new BM_MapSearch>("URLS100k", 1000* 1000)); - Benchmark::Register(new BM_MapSearch>("URLS100k", 1000* 1000)); + /* + Benchmark::Register(new BM_CreateUrls>("URLS100k")); + Benchmark::Register(new BM_CreateUrls>("URLS100k")); + */ + Benchmark::Register(new BM_SearchUrls>("URLS100k", 1000* 1000*100)); + /* + Benchmark::Register(new BM_SearchUrls>("URLS100k", 1000* 1000)); + Benchmark::Register(new BM_SearchUint64>); + Benchmark::Register(new BM_SearchUint64>); + */ Benchmark::RunAll(); } diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index d03dd92..02f7368 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -149,6 +149,7 @@ template uint32_t MPHIndex::index(const Key& key) const { uint32_t h[3]; for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); + assert(r_); h[0] = h[0] % r_; h[1] = h[1] % r_ + r_; h[2] = h[2] % r_ + (r_ << 1); @@ -169,7 +170,7 @@ class SimpleMPHIndex : public MPHIndex { bool Reset(ForwardIterator begin, ForwardIterator end) { return MPHIndex::Reset(begin, end); } - uint32_t index(const Key& key) { return MPHIndex::index(key); } + uint32_t index(const Key& key) const { return MPHIndex::index(key); } }; } // namespace cxxmph diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc index 421369c..7a7d036 100644 --- a/cxxmph/mph_index_test.cc +++ b/cxxmph/mph_index_test.cc @@ -24,7 +24,7 @@ int main(int argc, char** argv) { keys.push_back("algume"); SimpleMPHIndex mph_index; - assert(mph_index.Reset(keys.begin(), keys.end())); + if (!mph_index.Reset(keys.begin(), keys.end())) { exit(-1); } vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { ids.push_back(mph_index.index(keys[i])); @@ -33,7 +33,6 @@ int main(int argc, char** argv) { cerr << endl; sort(ids.begin(), ids.end()); for (vector::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == 
static_cast::value_type>(i)); - char* serialized = new char[mph_index.serialize_bytes_needed()]; mph_index.serialize(serialized); SimpleMPHIndex other_mph_index; diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index d52f617..cd8f684 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -52,11 +52,17 @@ class mph_map { std::pair insert(const value_type& x); iterator find(const key_type& k); const_iterator find(const key_type& k) const; + typedef int32_t my_int32_t; + int32_t index(const key_type& k) const; data_type& operator[](const key_type &k); + const data_type& operator[](const key_type &k) const; size_type bucket_count() const { return size(); } void rehash(size_type nbuckets /*ignored*/) { pack(); } + protected: // mimicking STL implementation + EqualKey equal_; + private: template struct iterator_first : public iterator { @@ -145,30 +151,33 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { } MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { - if (!slack_.empty()) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; - } - if (index_.size() == 0) return end(); - size_type id = index_.index(k); - if (key_equal()(values_[id].first, k)) { - return values_.begin() + id; + if (__builtin_expect(!slack_.empty(), 0)) { + typename slack_type::const_iterator it = slack_.find(k); + if (it != slack_.end()) return values_.begin() + it->second; } + if (__builtin_expect(index_.size() == 0, 0)) return end(); + auto it = values_.begin() + index_.index(k); + if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); } + MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { if (!slack_.empty()) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; + typename slack_type::const_iterator it = slack_.find(k); + if (it != slack_.end()) return values_.begin() + it->second; } if (index_.size() == 
0) return end(); - size_type id = index_.index(k); - if (key_equal()(values_[id].first, k)) { - return values_.begin() + id; - } + auto it = values_.begin() + index_.index(k); + if (equal_(it->first, k)) return it; return end(); } +MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const { + assert(slack_.empty()); + if (index_.size() == 0) return -1; + return index_.index(k); +} + MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { return insert(std::make_pair(k, data_type())).first->second; } diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index 99a3ca6..a12d4f8 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -19,10 +19,17 @@ struct seeded_hash_function { } }; +struct seeded_identity_function { + template + uint32_t operator()(const Key& k, uint32_t seed) const { + return k ^ seed; + } +}; + struct Murmur2 { template uint32_t operator()(const Key& k) const { - return MurmurHash2(k, sizeof(Key), 1 /* seed */); + return MurmurHash2(reinterpret_cast(&k), sizeof(Key), 1 /* seed */); } }; struct Murmur2StringPiece { From 1a5eee170c6480ba842d718e7dfd47f2a2c84384 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 14 Jun 2011 03:30:41 -0300 Subject: [PATCH 46/89] Fixed bug in uit64 benchmark. 
--- cxxmph/bm_common.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cxxmph/bm_common.cc b/cxxmph/bm_common.cc index f0e0336..c52b2e5 100644 --- a/cxxmph/bm_common.cc +++ b/cxxmph/bm_common.cc @@ -54,7 +54,8 @@ bool SearchUint64Benchmark::SetUp() { if (!Uint64Benchmark::SetUp()) return false; random_.resize(nsearches_); for (int i = 0; i < nsearches_; ++i) { - random_.push_back(values_[random() % values_.size()]); + uint32_t pos = random() % values_.size(); + random_[i] = values_[pos]; } return true; } From cc80fcfa2b0d472bd6d0cf1021ce8d79546413fe Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 14 Jun 2011 03:32:02 -0300 Subject: [PATCH 47/89] Fixed benchmark --- cxxmph/bm_map.cc | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 12dd2f1..045c10a 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -10,11 +10,11 @@ using std::tr1::unordered_map; namespace cxxmph { -uint64_t myfind(const unordered_map& mymap, const uint64_t& k) { +uint64_t myfind(const unordered_map& mymap, const uint64_t& k) { return mymap.find(k)->second; } uint64_t myfind(const mph_map& mymap, const uint64_t& k) { - return mymap.index(k); + return mymap.find(k)->second; } const StringPiece& myfind(const unordered_map& mymap, const StringPiece& k) { @@ -62,18 +62,26 @@ class BM_SearchUrls : public SearchUrlsBenchmark { template class BM_SearchUint64 : public SearchUint64Benchmark { public: - BM_SearchUint64() : SearchUint64Benchmark(1000*1000, 1000*1000) { } + BM_SearchUint64() : SearchUint64Benchmark(10000, 10*1000*1000) { } virtual bool SetUp() { if (!SearchUint64Benchmark::SetUp()) return false; for (int i = 0; i < values_.size(); ++i) { mymap_[values_[i]] = values_[i]; } mymap_.rehash(mymap_.bucket_count()); + // Double check if everything is all right + for (int i = 0; i < values_.size(); ++i) { + if (mymap_[values_[i]] != values_[i]) return false; + } return true; } virtual 
void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { auto v = myfind(mymap_, *it); + if (v != *it) { + fprintf(stderr, "Looked for %lu got %lu\n", *it, v); + exit(-1); + } } } MapType mymap_; @@ -84,14 +92,15 @@ class BM_SearchUint64 : public SearchUint64Benchmark { using namespace cxxmph; int main(int argc, char** argv) { + srandom(4); /* Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); */ - Benchmark::Register(new BM_SearchUrls>("URLS100k", 1000* 1000*100)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000* 1000)); /* Benchmark::Register(new BM_SearchUrls>("URLS100k", 1000* 1000)); - Benchmark::Register(new BM_SearchUint64>); + Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); */ Benchmark::RunAll(); From 1e1cbfe6069c3c33199b70a9a787c5c50b05dbb6 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 14 Jun 2011 03:38:23 -0300 Subject: [PATCH 48/89] Trying perfect hash. --- cxxmph/mph_index.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 02f7368..3083a60 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -35,6 +35,11 @@ class MPHIndex { uint32_t size() const { return m_; } void clear(); + uint32_t perfect_hash_size() const { return n_; } + template // must agree with Reset + uint32_t perfect_hash(const Key& x) const; + template // must agree with Reset + uint32_t minimal_perfect_hash(const Key& x) const; // Serialization machinery for mmap usage. // Serialized tables are not guaranteed to work across versions or different // endianness (although they could easily be made to be). 
@@ -146,7 +151,7 @@ bool MPHIndex::Mapping( } template -uint32_t MPHIndex::index(const Key& key) const { +uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t h[3]; for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); assert(r_); @@ -159,8 +164,16 @@ uint32_t MPHIndex::index(const Key& key) const { assert((h[1] >> 2) > 2) +uint32_t MPHIndex::minimal_perfect_hash(const Key& key) const { + return Rank(perfect_hash(key)); +} + +template +uint32_t MPHIndex::index(const Key& key) const { + return minimal_perfect_hash(key); } template >::hash_function> From 2c88ab61ec43ef3574849b22338ba1c09933fed5 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 14 Jun 2011 04:58:22 -0300 Subject: [PATCH 49/89] Exposed perfect hash internals. --- cxxmph/mph_index.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 3083a60..3afc518 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -184,6 +184,8 @@ class SimpleMPHIndex : public MPHIndex { return MPHIndex::Reset(begin, end); } uint32_t index(const Key& key) const { return MPHIndex::index(key); } + uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash(key); } + uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash(key); } }; } // namespace cxxmph From 85a0d7453a541b1547678025210e7ddc55575f2e Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Tue, 14 Jun 2011 04:59:54 -0300 Subject: [PATCH 50/89] Playing with benchmarks. 
--- cxxmph/bm_map.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 045c10a..95e76f7 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -13,6 +13,7 @@ namespace cxxmph { uint64_t myfind(const unordered_map& mymap, const uint64_t& k) { return mymap.find(k)->second; } + uint64_t myfind(const mph_map& mymap, const uint64_t& k) { return mymap.find(k)->second; } @@ -44,7 +45,11 @@ class BM_SearchUrls : public SearchUrlsBenchmark { : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { - auto idx = myfind(mymap_, *it); + auto v = myfind(mymap_, *it); + if (v != *it) { + fprintf(stderr, "Looked for %s got %s\n", it->data(), v.data()); + exit(-1); + } } } protected: @@ -98,8 +103,8 @@ int main(int argc, char** argv) { Benchmark::Register(new BM_CreateUrls>("URLS100k")); */ Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000* 1000)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000* 1000)); /* - Benchmark::Register(new BM_SearchUrls>("URLS100k", 1000* 1000)); Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); */ From 245a84c75ea378c8f3447d9f6f87b186a8f8721d Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Wed, 3 Aug 2011 18:48:28 -0300 Subject: [PATCH 51/89] Fixed include camelcase. --- cxxmph/bm_index.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index 03cb222..ecc90bf 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -3,7 +3,7 @@ #include #include "bm_common.h" -#include "StringPiece.h" +#include "stringpiece.h" #include "mph_index.h" using namespace cxxmph; From 96862d3113ddd170e8f7cedf8000c1fcccbf3e84 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Sat, 5 Nov 2011 09:51:45 -0200 Subject: [PATCH 52/89] Fixed license. 
--- COPYING | 2 ++ 1 file changed, 2 insertions(+) diff --git a/COPYING b/COPYING index 4d2513e..e114d20 100644 --- a/COPYING +++ b/COPYING @@ -1,3 +1,5 @@ The code of the cmph library is dual licensed under the LGPL version 2 and MPL 1.1 licenses. Please refer to the LGPL-2 and MPL-1.1 files in the repository for the full description of each of the licenses. + +For cxxmph, the files stringpiece.h and MurmurHash2 are covered by the BSD and MIT licenses, respectively. From 2a67236e292e772898f71331408f1c895cd8b2d8 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Sat, 5 Nov 2011 10:27:24 -0200 Subject: [PATCH 53/89] Improved c++0x test. --- acinclude.m4 | 76 ++++++++++++++++++++++++++++++ configure.ac | 12 ++++- cxxmph/Makefile.am | 1 - src/jenkins_hash.c | 114 ++++++++++++++++++++++----------------------- 4 files changed, 142 insertions(+), 61 deletions(-) diff --git a/acinclude.m4 b/acinclude.m4 index b49a92b..bde7628 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -12,6 +12,82 @@ AC_DEFUN([AC_CHECK_SPOON], [ AC_MSG_RESULT(no) ]) +dnl Check for baseline language coverage in the compiler for the C++0x standard. 
+# AC_COMPILE_STDCXX_OX +AC_DEFUN([AC_COMPILE_STDCXX_0X], [ + AC_CACHE_CHECK(if compiler supports C++0x features without additional flags, + ac_cv_cxx_compile_cxx0x_native, + [AC_LANG_SAVE + AC_LANG_CPLUSPLUS + AC_TRY_COMPILE([ + #include + #include + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + typedef check> right_angle_brackets; + + int a; + decltype(a) b; + ],, + ac_cv_cxx_compile_cxx0x_native=yes, ac_cv_cxx_compile_cxx0x_native=no) + AC_LANG_RESTORE + ]) + + AC_CACHE_CHECK(if compiler supports C++0x features with -std=c++0x, + ac_cv_cxx_compile_cxx0x_cxx, + [AC_LANG_SAVE + AC_LANG_CPLUSPLUS + ac_save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS -std=c++0x" + AC_TRY_COMPILE([ + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + typedef check> right_angle_brackets; + + int a; + decltype(a) b;],, + ac_cv_cxx_compile_cxx0x_cxx=yes, ac_cv_cxx_compile_cxx0x_cxx=no) + CXXFLAGS="$ac_save_CXXFLAGS" + AC_LANG_RESTORE + ]) + + AC_CACHE_CHECK(if compiler supports C++0x features with -std=gnu++0x, + ac_cv_cxx_compile_cxx0x_gxx, + [AC_LANG_SAVE + AC_LANG_CPLUSPLUS + ac_save_CXXFLAGS="$CXXFLAGS" + CXXFLAGS="$CXXFLAGS -std=gnu++0x" + AC_TRY_COMPILE([ + template + struct check + { + static_assert(sizeof(int) <= sizeof(T), "not big enough"); + }; + + typedef check> right_angle_brackets; + + int a; + decltype(a) b;],, + ac_cv_cxx_compile_cxx0x_gxx=yes, ac_cv_cxx_compile_cxx0x_gxx=no) + CXXFLAGS="$ac_save_CXXFLAGS" + AC_LANG_RESTORE + ]) + + if test "$ac_cv_cxx_compile_cxx0x_native" = yes || + test "$ac_cv_cxx_compile_cxx0x_cxx" = yes || + test "$ac_cv_cxx_compile_cxx0x_gxx" = yes; then + AC_DEFINE(HAVE_STDCXX_0X,,[Define if g++ supports C++0x features. ]) + fi +]) + dnl By default, many hosts won't let programs access large files; dnl one must use special compiler options to get large-file access to work. 
dnl For more details about this brain damage please see: diff --git a/configure.ac b/configure.ac index b749ad3..172b02f 100644 --- a/configure.ac +++ b/configure.ac @@ -36,7 +36,17 @@ CFLAGS="-Wall -Werror" AC_PROG_CXX AC_ENABLE_CXXMPH if test x$cxxmph = xtrue; then - AC_SUBST([CXXMPH], "cxxmph") + AC_COMPILE_STDCXX_0X + if test x$ac_cv_cxx_compile_cxx0x_native = "xno"; then + if test x$ac_cv_cxx_compile_cxx0x_cxx = "xyes"; then + CXXFLAGS="$CXXFLAGS -std=c++0x" + elif test x$ac_cv_cxx_compile_cxx0x_gxx = "xyes"; then + CXXFLAGS="$CXXFLAGS -std=gnu++0x" + else + AC_MSG_ERROR("cxxmph demands a working c++0x compiler.") + fi + fi + AC_SUBST([CXXMPH], "cxxmph") fi AC_CHECK_SPOON diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index c02e1c9..0396811 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,4 +1,3 @@ -AM_CXXFLAGS='-std=c++0x' TESTS = $(check_PROGRAMS) check_PROGRAMS = mph_map_test mph_index_test # trigraph_test noinst_PROGRAMS = bm_index bm_map diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c index bd65ff6..65cdff9 100644 --- a/src/jenkins_hash.c +++ b/src/jenkins_hash.c @@ -7,9 +7,8 @@ #include #include -// #define DEBUG +//#define DEBUG #include "debug.h" -#include "MurmurHash2.h" #define hashsize(n) ((cmph_uint32)1<<(n)) #define hashmask(n) (hashsize(n)-1) @@ -88,8 +87,8 @@ acceptable. Do NOT use for cryptographic purposes. 
jenkins_state_t *jenkins_state_new(cmph_uint32 size) //size of hash table { jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); + DEBUGP("Initializing jenkins hash\n"); state->seed = ((cmph_uint32)rand() % size); - DEBUGP("Initializied jenkins hash with seed %d\n", state->seed); return state; } void jenkins_state_destroy(jenkins_state_t *state) @@ -100,67 +99,63 @@ void jenkins_state_destroy(jenkins_state_t *state) static inline void __jenkins_hash_vector(cmph_uint32 seed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) { - int i; - for (i = 0; i < 3; ++i) { - hashes[i] = MurmurHash2(k, keylen, seed + i); + register cmph_uint32 len, length; + + /* Set up the internal state */ + length = keylen; + len = length; + hashes[0] = hashes[1] = 0x9e3779b9; /* the golden ratio; an arbitrary value */ + hashes[2] = seed; /* the previous hash value - seed in our case */ + + /*---------------------------------------- handle most of the key */ + while (len >= 12) + { + hashes[0] += ((cmph_uint32)k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); + hashes[1] += ((cmph_uint32)k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24)); + hashes[2] += ((cmph_uint32)k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24)); + mix(hashes[0],hashes[1],hashes[2]); + k += 12; len -= 12; } -// register cmph_uint32 len, length; -// -// /* Set up the internal state */ -// length = keylen; -// len = length; -// hashes[0] = hashes[1] = 0x9e3779b9; /* the golden ratio; an arbitrary value */ -// hashes[2] = seed; /* the previous hash value - seed in our case */ -// -// /*---------------------------------------- handle most of the key */ -// while (len >= 12) -// { -// hashes[0] += ((cmph_uint32)k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); -// hashes[1] += ((cmph_uint32)k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24)); -// 
hashes[2] += ((cmph_uint32)k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24)); -// mix(hashes[0],hashes[1],hashes[2]); -// k += 12; len -= 12; -// } -// -// /*------------------------------------- handle the last 11 bytes */ -// hashes[2] += length; -// switch(len) /* all the case statements fall through */ -// { -// case 11: -// hashes[2] +=((cmph_uint32)k[10]<<24); -// case 10: -// hashes[2] +=((cmph_uint32)k[9]<<16); -// case 9 : -// hashes[2] +=((cmph_uint32)k[8]<<8); -// /* the first byte of hashes[2] is reserved for the length */ -// case 8 : -// hashes[1] +=((cmph_uint32)k[7]<<24); -// case 7 : -// hashes[1] +=((cmph_uint32)k[6]<<16); -// case 6 : -// hashes[1] +=((cmph_uint32)k[5]<<8); -// case 5 : -// hashes[1] +=(cmph_uint8) k[4]; -// case 4 : -// hashes[0] +=((cmph_uint32)k[3]<<24); -// case 3 : -// hashes[0] +=((cmph_uint32)k[2]<<16); -// case 2 : -// hashes[0] +=((cmph_uint32)k[1]<<8); -// case 1 : -// hashes[0] +=(cmph_uint8)k[0]; -// /* case 0: nothing left to add */ -// } -// -// mix(hashes[0],hashes[1],hashes[2]); + + /*------------------------------------- handle the last 11 bytes */ + hashes[2] += length; + switch(len) /* all the case statements fall through */ + { + case 11: + hashes[2] +=((cmph_uint32)k[10]<<24); + case 10: + hashes[2] +=((cmph_uint32)k[9]<<16); + case 9 : + hashes[2] +=((cmph_uint32)k[8]<<8); + /* the first byte of hashes[2] is reserved for the length */ + case 8 : + hashes[1] +=((cmph_uint32)k[7]<<24); + case 7 : + hashes[1] +=((cmph_uint32)k[6]<<16); + case 6 : + hashes[1] +=((cmph_uint32)k[5]<<8); + case 5 : + hashes[1] +=(cmph_uint8) k[4]; + case 4 : + hashes[0] +=((cmph_uint32)k[3]<<24); + case 3 : + hashes[0] +=((cmph_uint32)k[2]<<16); + case 2 : + hashes[0] +=((cmph_uint32)k[1]<<8); + case 1 : + hashes[0] +=(cmph_uint8)k[0]; + /* case 0: nothing left to add */ + } + + mix(hashes[0],hashes[1],hashes[2]); } cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen) { 
-// cmph_uint32 hashes[3]; -// __jenkins_hash_vector(state->seed, k, keylen, hashes); -// return hashes[2]; - cmph_uint32 a, b, c; + cmph_uint32 hashes[3]; + __jenkins_hash_vector(state->seed, k, keylen, hashes); + return hashes[2]; +/* cmph_uint32 a, b, c; cmph_uint32 len, length; // Set up the internal state @@ -214,6 +209,7 @@ cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keyl /// report the result return c; + */ } void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) From b603173d0118e355a9762a850ff157d5cd9033ea Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Sat, 5 Nov 2011 10:32:47 -0200 Subject: [PATCH 54/89] About to merge. --- cxxmph/Makefile.am | 5 +- cxxmph/URLS1k | 256 --------------------------------------------- src/MurmurHash2.h | 69 ------------ 3 files changed, 1 insertion(+), 329 deletions(-) delete mode 100644 cxxmph/URLS1k delete mode 100644 src/MurmurHash2.h diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 0396811..04a90fe 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,5 +1,5 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_map_test mph_index_test # trigraph_test +check_PROGRAMS = mph_map_test mph_index_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la @@ -14,9 +14,6 @@ mph_map_test_SOURCES = mph_map_test.cc mph_index_test_LDADD = libcxxmph.la mph_index_test_SOURCES = mph_index_test.cc -# trigraph_test_LDADD = libcxxmph.la -# trigraph_test_SOURCES = trigraph_test.cc - bm_index_LDADD = libcxxmph.la bm_index_SOURCES = bm_common.cc bm_index.cc diff --git a/cxxmph/URLS1k b/cxxmph/URLS1k deleted file mode 100644 index a7fa160..0000000 --- a/cxxmph/URLS1k +++ /dev/null @@ -1,256 +0,0 @@ -http://100_fundos.zip.net/arch2004-03-28_2004-04-03.html -http://2d-galois.flogbrasil.terra.com.br/robots.txt -http://2littledoves.fotolog.terra.com.br/ -http://3336.fotoblog.uol.com.br/photo20040305160808.html 
-http://3reis.weblogger.terra.com.br/robots.txt -http://4track.blogger.com.br/robots.txt -http://abelrezende.flogbrasil.terra.com.br/tops.php -http://abelsidney.vilabol.uol.com.br/cro91.html -http://abelsidney.vilabol.uol.com.br/rede8.html -http://abobaninha.weblogger.terra.com.br/200302_abobaninha_arquivo.htm -http://abpblh.org.br/ -http://abvo.org.br/up.htm -http://acervocorrs.vilabol.uol.com.br/main071.html -http://acid.weblogger.terra.com.br/2883612 -http://actionblog.zip.net/robots.txt -http://ademir.alfa.ind.br/entrada.htm -http://adrianorp.zip.net/listArchive.html -http://adriele.flogbrasil.terra.com.br/tops.php -http://aflordapele.blogger.com.br/2004_04_01_archive.html -http://afoganao.flogbrasil.terra.com.br/estados.php -http://aftertonight.blogger.com.br/robots.txt -http://agoraevidareal.blogger.com.br/2004_05_01_archive.html -http://ahafotolog.fotoblog.uol.com.br/photo20040327172209.html -http://ahmuleke.fotoblog.uol.com.br/photo20040420143013.html -http://aicomomexe.weblogger.terra.com.br/200401_aicomomexe_arquivo.htm -http://aiehdose.blig.ig.com.br/ -http://aimeudedo.zip.net/arch2004-05-09_2004-05-15.html -http://airtonfilho.pcc.usp.br/curriculum.htm -http://akane_hoshi.blogger.com.br/ -http://akinyele.letras.terra.com.br/ -http://al-green.letras.terra.com.br/ -http://albaligia.fotoblog.uol.com.br/photo20040501225132.html -http://alemaosalsicha.flogbrasil.terra.com.br/robots.txt -http://alessandrafleury.blogger.com.br/ -http://aliatras.weblogger.terra.com.br/www.marisurf.weblogger.com.br -http://alissondantas.flogbrasil.terra.com.br/ -http://allcolix.flogbrasil.terra.com.br/estados.php -http://allmylove.fotolog.terra.com.br/ -http://allyouneedisnats.flogbrasil.terra.com.br/gold_comprar.php -http://almostnaked.weblogger.terra.com.br/4674105 -http://alwaystogether.weblogger.terra.com.br/200310_alwaystogether_arquivo.htm -http://amanda-sc.fotoblog.uol.com.br/photo20040413082243.html -http://amiga-kelly.flogbrasil.terra.com.br/robots.txt 
-http://amigasbh.fotolog.terra.com.br/gold_comprar.php -http://amigooo.weblogger.terra.com.br/14534470 -http://amiruhama.vilabol.uol.com.br/celina_tuesmeudeus.html -http://amotheoc.fotolog.terra.com.br/tops_meninos.php -http://ana.fla.fotoblog.uol.com.br/photo20040512080652.html -http://anala.fotolog.terra.com.br/tops.php -http://anasps-sp.sites.uol.com.br/birth5.htm -http://anastm.fotolog.terra.com.br/tops_meninos.php -http://andandonasnuvens.weblogger.terra.com.br/3233568 -http://andercb.fotolog.terra.com.br/tops_meninas.php -http://andre.bac.sites.uol.com.br/equipamentos.htm -http://andrezadepp.blig.ig.com.br/ -http://andrezavega.vilabol.uol.com.br/amigo.html -http://aneeee.flogbrasil.terra.com.br/tops.php -http://angelsusy.flogbrasil.terra.com.br/robots.txt -http://ani2.weblogger.terra.com.br/ -http://aninha_space_for_me.zip.net/robots.txt -http://aninhacerqueira.flogbrasil.terra.com.br/gold.php -http://aninhafuracao.weblogger.terra.com.br/200310_aninhafuracao_arquivo.htm -http://aninhaumverdadeiroamor.weblogger.com.br/ -http://anjosdanados.flogbrasil.terra.com.br/tops.php -http://anjosklb.blogger.com.br/ -http://annafernandes.flogbrasil.terra.com.br/gold.php -http://annarafaella.fotolog.terra.com.br/ -http://anninha25.weblogger.terra.com.br/200307_anninha25_arquivo.htm -http://anninhaturbo2ponto4.flogbrasil.terra.com.br/ -http://anonimoincognito.flogbrasil.terra.com.br/tops_estados.php -http://answerbook.ime.usp.br:8888/ab2 -http://antonio-bz.flogbrasil.terra.com.br/tops_estados.php -http://apae.weblogger.terra.com.br/9718679 -http://aparelho-de-dvd-pioneer-dvr-57h.ofertas-comprar-vender.com.br/robots.txt -http://apoio.weblogger.terra.com.br/200404_apoio_arquivo.htm -http://arabella.bella.blog.uol.com.br/arch2004-04-11_2004-04-17.html -http://art-popular.cifras.art.br/cifra_16277.html -http://artcanal.com.br/q.html -http://artemiro.fotolog.terra.com.br/ -http://aruba.com.br/ -http://aslilis.flogbrasil.terra.com.br/tops_meninas.php 
-http://asmaluketxxx.blig.ig.com.br/robots.txt -http://asprincesinhas.weblogger.terra.com.br/robots.txt -http://assespro.org.br/fotosrs.asp -http://asveredas.com.br/gal_amigos005.html -http://atrevidamqn.fotolog.terra.com.br/tops.php -http://avitchas.blogger.com.br/ -http://ayegui-estella.spain.ehotelfinder.net/ -http://b0caum.flogbrasil.terra.com.br/tops_estados.php -http://babasidera.fotolog.terra.com.br/tops_meninos.php -http://babebibobu.blogger.com.br/robots.txt -http://babifiles.zip.net/arch2004-04-04_2004-04-10.html -http://babisenena.blogger.com.br/2004_03_14_archive.html -http://babixmaisa.flogbrasil.terra.com.br/tops_meninas.php -http://bacardigirls.flogbrasil.terra.com.br/tops.php -http://backstreet-boys.weblogger.terra.com.br/5478915 -http://backstreet.hpg.com.br/midis.html -http://bagagera.flogbrasil.terra.com.br/gold.php -http://balila.flogbrasil.terra.com.br/gold.php -http://balletclassico.fotolog.terra.com.br/tops.php -http://bambambm.fotolog.terra.com.br/gold.php -http://bandamusicalbox.vilabol.uol.com.br/MUSICALBOX-Marcelo.htm -http://barbaridades.weblogger.terra.com.br/2353538 -http://barbyssa.fotolog.terra.com.br/ -http://bartsimpson-sp.fotoblog.uol.com.br/links.html -http://batatadoce.com.br/recomendamos -http://bazinha21.fotolog.terra.com.br/ -http://bazinhamartins.fotolog.terra.com.br/tops_meninas.php -http://beavera.fotolog.terra.com.br/tops.php -http://bebecrazy.flogbrasil.terra.com.br/tops.php -http://beberzao.flogbrasil.terra.com.br/tops.php -http://becaelilo.flogbrasil.terra.com.br/tops_adulto.php -http://beck.cifras.art.br/cifra_516.html -http://bego-90.flogbrasil.terra.com.br/gold.php -http://belacapelinha.cantaminas.com.br/ -http://belluchesi.weblogger.terra.com.br/200403_belluchesi_arquivo.htm -http://belo.letras.terra.com.br/ -http://belthatha.flogbrasil.terra.com.br/tops.php -http://bemestar.weblogger.terra.com.br/200302_bemestar_arquivo.htm -http://bessa.flogbrasil.terra.com.br/tops_meninos.php 
-http://bestoes.flogbrasil.terra.com.br/gold.php -http://betooow.flogbrasil.terra.com.br/tops.php -http://betoosurf.flogbrasil.terra.com.br/contato.php -http://bfr.fotolog.terra.com.br/robots.txt -http://bialoka.blig.ig.com.br/robots.txt -http://biazinha-labruna.flogbrasil.terra.com.br/tops_estados.php -http://bibizinha_lindinha.zip.net/arch2004-03-07_2004-03-13.html -http://billiefabruz.blogger.com.br/robots.txt -http://bjokera.flogbrasil.terra.com.br/ -http://bjork.com.br/indexw.htm -http://bl3.com.br/windreg03_gzero.htm -http://blaze.cifras.art.br/cifra_14186.html -http://bleach.ofertas-comprar-vender.com.br/ -http://blog-da-pri.weblogger.terra.com.br/200402_blog-da-pri_arquivo.htm -http://blogdababi.zip.net/arch2004-04-04_2004-04-10.html -http://blogdadee.weblogger.terra.com.br/200311_blogdadee_arquivo.htm -http://blogdaeve.weblogger.terra.com.br/200405_blogdaeve_arquivo.htm -http://blogdalidi.weblogger.terra.com.br/200310_blogdalidi_arquivo.htm -http://blogdocaralho.weblogger.terra.com.br/12647435 -http://blogdosfalcatruas.blig.com.br/robots.txt -http://bloglife.blogger.com.br/robots.txt -http://blogsdanoite.blogger.com.br/2004_04_01_archive.html -http://bolados.weblogger.com.br/robots.txt -http://bomarley.vilabol.uol.com.br/XO.html -http://botlily.fotolog.terra.com.br/tops.php -http://boys_lie.weblogger.terra.com.br/200405_boys_lie_arquivo.htm -http://brabuletatas.turmadobar.com.br/ -http://brainwave.weblogger.terra.com.br/200403_brainwave_arquivo.htm -http://brasileiromestizo.blogger.com.br/2003_07_01_archive.html -http://brazzie.zip.net/arch2002-11-01_2002-11-15.html -http://bribokinha.fotolog.terra.com.br/ -http://bricinhu.fotolog.terra.com.br/tops_meninos.php -http://brink.fotolog.terra.com.br/tops.php -http://brox.flogbrasil.terra.com.br/tops_meninas.php -http://brozmania.blogger.com.br/robots.txt -http://bruninhabebe.fotolog.terra.com.br/tops_adulto.php -http://brustamato.flogbrasil.terra.com.br/robots.txt 
-http://bruuuhh.fotolog.terra.com.br/tops_meninos.php -http://bruxinhafofinha.fotolog.terra.com.br/ -http://bruxinhawiccana.flogbrasil.terra.com.br/tops_meninos.php -http://bubuzinhu.flogbrasil.terra.com.br/tops_meninos.php -http://bud.weblogger.terra.com.br/20021124_bud_arquivo.htm -http://bully.sites.uol.com.br/"+Arrws[a]+" -http://bulmarta.fotolog.terra.com.br/tops.php -http://bunitu.fotolog.terra.com.br/robots.txt -http://ca-brevi.flogbrasil.terra.com.br/tops_meninos.php -http://cabanasarua.com.br/foto_visitante_44.htm -http://cacabolhao.fotoblog.uol.com.br/photo20040423114747.html -http://cacahzinhaaa.flogbrasil.terra.com.br/estados.php -http://cadastro.brfree.com.br/ -http://cadelasemacao.vila.bol.com.br/ -http://cadenzza.sites.uol.com.br/george.htm -http://caen.france.qwhotels.com/fullindex.phtml -http://cahieursdupositif.weblogger.terra.com.br/8772980 -http://caio.munhoz.fotoblog.uol.com.br/photo20040321172822.html -http://caleo.flogbrasil.terra.com.br/tops_meninas.php -http://camera-c-730.ofertas-comprar-vender.com.br/robots.txt -http://camera-digital-aiptek-dvii.ofertas-comprar-vender.com.br/robots.txt -http://camera-digital-c-700-ultra-zoom.ofertas-comprar-vender.com.br/robots.txt -http://camera-digital-pocket-cam-classic.ofertas-comprar-vender.com.br/robots.txt -http://camera-hp-photosmart-620.ofertas-comprar-vender.com.br/robots.txt -http://camera-pentax-ist-d-slr.ofertas-comprar-vender.com.br/robots.txt -http://cameras-d510.ofertas-comprar-vender.com.br/robots.txt -http://cameras-digitais-sony-fd-91.ofertas-comprar-vender.com.br/robots.txt -http://cameras-kodak-dx-3600.ofertas-comprar-vender.com.br/robots.txt -http://camilacba.fotolog.terra.com.br/ -http://camilafelicia.flogbrasil.terra.com.br/gold.php -http://camilajuliana.fotoblog.uol.com.br/photo20040504113702.html -http://camisa-do-barbarense.ofertas-comprar-vender.com.br/ -http://campospalomino.fotoblog.uol.com.br/photo20040507120841.html -http://canal-pira.vila.bol.com.br/flagrantes.htm 
-http://canga.fotolog.terra.com.br/robots.txt -http://cantinhodoale.flogbrasil.terra.com.br/tops_meninas.php -http://caosfilosoficos.blogger.com.br/2004_02_01_archive.html -http://carlinhus.flogbrasil.terra.com.br/tops_adulto.php -http://carol-ferrari.fotoblog.uol.com.br/photo20040429144714.html -http://carol-ju.fotolog.terra.com.br/tops.php -http://carol-meiguinha.fotolog.terra.com.br/ -http://carolapezzatto.flogbrasil.terra.com.br/tops_meninos.php -http://carolzinhaas.flogbrasil.terra.com.br/robots.txt -http://carolzinhabacaninha.flogbrasil.terra.com.br/ -http://carolzita.logme.ig.com.br/robots.txt -http://caronte.fapergs.tche.br/res1001.htm -http://casadananda.weblogger.terra.com.br/200202_casadananda_arquivo.htm -http://cassia-eller.cifras.art.br/cifra_4838.html -http://catarinavianna.fotolog.terra.com.br/robots.txt -http://catguy.fotoblog.uol.com.br/photo20040322115139.html -http://catite.fotolog.terra.com.br/tops.php -http://catolicos.weblogger.com.br/robots.txt -http://caxias.weblogger.com.br/ -http://ccramalho.flogbrasil.terra.com.br/ -http://ccs.pontagrossa.pr.gov.br/ -http://cd-de-garbage-collection.ofertas-comprar-vender.com.br/ -http://cd-de-oficial-u2.ofertas-comprar-vender.com.br/ -http://cd-do-iron-maiden-and-and-fear-of-the-dark.ofertas-comprar-vender.com.br/robots.txt -http://cd-do-u2-one.ofertas-comprar-vender.com.br/robots.txt -http://cd-player-deh-p7500mp.ofertas-comprar-vender.com.br/ -http://cd-player-para-carro-fh-p4400.ofertas-comprar-vender.com.br/ -http://cd-player-sony-cdxc710.ofertas-comprar-vender.com.br/ -http://cedaspy.com.br/index.php -http://celina.fotolog.terra.com.br/tops_meninas.php -http://celulares-ciemems-cl50.ofertas-comprar-vender.com.br/ -http://celulares-fisio-820.ofertas-comprar-vender.com.br/ -http://celulares-sendo-j530.ofertas-comprar-vender.com.br/ -http://celulares-tel-me--2.ofertas-comprar-vender.com.br/robots.txt -http://cha_angel.weblogger.terra.com.br/ -http://chacota.flogbrasil.terra.com.br/tops_meninos.php 
-http://chakilla.zip.net/arch2004-04-11_2004-04-17.html -http://chalupadegilliatt.weblogger.terra.com.br/robots.txt -http://charlie-brown-jr.cifras.art.br/cifra_738.html -http://chegadesapo.blogger.com.br/ -http://chgiacomini.fotoblog.uol.com.br/links.html -http://chicolustosa.zip.net/arch2004-05-01_2004-05-15.html -http://chinfra.blogger.com.br/2003_08_17_archive.html -http://cholke.blig.ig.com.br/robots.txt -http://cicinhaa.flogbrasil.terra.com.br/contato.php -http://cidoloko.flogbrasil.terra.com.br/tops.php -http://cifranet.org/robots.txt -http://cigabruca.fotoblog.uol.com.br/photo20040514173549.html -http://cirilobresolin.weblogger.terra.com.br/16237735 -http://cissadantas.flogbrasil.terra.com.br/contato.php -http://cla_loka.zip.net/arch2004-03-01_2004-03-15.html -http://clanogueira.fotoblog.uol.com.br/photo20040420181756.html -http://clari.viana.fotoblog.uol.com.br/photo20040413170749.html -http://clarinhakk.flogbrasil.terra.com.br/gold_comprar.php -http://claudiajabour8.fotoblog.uol.com.br/photo20040411144511.html -http://clicker.flogbrasil.terra.com.br/gold.php -http://clodoaldopb.vila.bol.com.br/historiasovasorachado.html -http://clubedopairico.com.br/diario_05_11-0404.html -http://cmurca.fotoblog.uol.com.br/photo20040413184113.html -http://cobralillo.weblogger.terra.com.br/200404_cobralillo_arquivo.htm -http://coisasdoleo.weblogger.terra.com.br/20030330_coisasdoleo_arquivo.htm -http://coise-saint-jean-pied-gauthier.france.ehotelfinder.net/ -http://colombia.flogbrasil.terra.com.br/tops.php -http://comcharisma.weblogger.com.br/ -http://comediante.flogbrasil.terra.com.br/tops_meninas.php diff --git a/src/MurmurHash2.h b/src/MurmurHash2.h deleted file mode 100644 index 52d015a..0000000 --- a/src/MurmurHash2.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __CXXMPH_MURMUR_HASH2__ -#define __CXXMPH_MURMUR_HASH2__ - -//----------------------------------------------------------------------------- -// MurmurHash2, by Austin Appleby - -// Note - This code makes a few 
assumptions about how your machine behaves - - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 - -// And it has a few limitations - - -// 1. It will not work incrementally. -// 2. It will not produce the same results on little-endian and big-endian -// machines. - -unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) -{ - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - - const unsigned int m = 0x5bd1e995; - const int r = 24; - - // Initialize the hash to a 'random' value - - unsigned int h = seed ^ len; - - // Mix 4 bytes at a time into the hash - - const unsigned char * data = (const unsigned char *)key; - - while(len >= 4) - { - unsigned int k = *(unsigned int *)data; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} - -#endif // __CXXMPH_MURMUR_HASH2__ From d4ee76b7bfacbde849fa2467e2e9b39e4f959959 Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Sat, 5 Nov 2011 15:15:11 -0200 Subject: [PATCH 55/89] Small fixes, more comments. 
--- cxxmph/Makefile.am | 5 ++++- cxxmph/mph_index.h | 29 +++++++++++++++++++++++++++-- cxxmph/mph_map.h | 12 ++++++++++-- cxxmph/stringpiece.h | 4 +++- cxxmph/trigraph_test.cc | 22 ++++++++++++++++++++++ 5 files changed, 66 insertions(+), 6 deletions(-) create mode 100644 cxxmph/trigraph_test.cc diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 04a90fe..55df057 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,5 +1,5 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_map_test mph_index_test +check_PROGRAMS = mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la @@ -17,6 +17,9 @@ mph_index_test_SOURCES = mph_index_test.cc bm_index_LDADD = libcxxmph.la bm_index_SOURCES = bm_common.cc bm_index.cc +trigraph_test_LDADD = libcxxmph.la +trigraph_test_SOURCES = trigraph_test.cc + bm_map_LDADD = libcxxmph.la bm_map_SOURCES = bm_common.cc bm_map.cc diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 3afc518..70cee68 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -2,6 +2,25 @@ #define __CXXMPH_MPH_INDEX_H__ // Minimal perfect hash abstraction implementing the BDZ algorithm +// +// This is a data structure that given a set of known keys S, will create a +// mapping from S to [0..|S|). The class is informed about S through the Reset +// method and the mapping is queried by calling index(key). +// +// This is a pretty uncommon data structure, and if you application has a real +// use case for it, chances are that it is a real win. If all you are doing is +// a straightforward implementation of an in-memory associative mapping data +// structure (e.g., mph_map.h), then it will probably be slower, since that the +// evaluation of index() is typically slower than the total cost of running a +// traditional hash function over a key and doing 2-3 conflict resolutions on +// 100byte-ish strings. 
+// +// Notes: +// +// Most users can use the SimpleMPHIndex wrapper instead of the MPHIndex which +// have confusing template parameters. +// This class only implements a minimal perfect hash function, it does not +// implement an associative mapping data structure. #include @@ -31,16 +50,20 @@ class MPHIndex { template bool Reset(ForwardIterator begin, ForwardIterator end); template // must agree with Reset + // Get a unique identifier for k, in the range [0;size()). If x wasn't part + // of the input in the last Reset call, returns a random value. uint32_t index(const Key& x) const; uint32_t size() const { return m_; } void clear(); + // Advanced users functions. Please avoid unless you know what you are doing. uint32_t perfect_hash_size() const { return n_; } template // must agree with Reset uint32_t perfect_hash(const Key& x) const; template // must agree with Reset uint32_t minimal_perfect_hash(const Key& x) const; - // Serialization machinery for mmap usage. + + // Serialization for mmap usage - not tested well, ping me if you care. // Serialized tables are not guaranteed to work across versions or different // endianness (although they could easily be made to be). uint32_t serialize_bytes_needed() const; @@ -110,7 +133,7 @@ bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { // cerr << "m " << m_ << " n " << n_ << " r " << r_ << endl; - int iterations = 10; + int iterations = 1000; std::vector edges; std::vector queue; while (1) { @@ -176,6 +199,8 @@ uint32_t MPHIndex::index(const Key& key) const { return minimal_perfect_hash(key); } +// Simple wrapper around MPHIndex to simplify calling code. Please refer to the +// MPHIndex class for documentation. 
template >::hash_function> class SimpleMPHIndex : public MPHIndex { public: diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index cd8f684..bcfebb6 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -1,3 +1,10 @@ +// Implementation of the unordered associative mapping interface using a +// minimal perfect hash function. +// +// This class is about 20% to 100% slower than unordered_map (or ext/hash_map) +// and should not be used if performance is a concern. In fact, you should only +// use it for educational purposes. + #include #include #include @@ -58,6 +65,7 @@ class mph_map { const data_type& operator[](const key_type &k) const; size_type bucket_count() const { return size(); } + // FIXME: not sure if this has the semantics I want void rehash(size_type nbuckets /*ignored*/) { pack(); } protected: // mimicking STL implementation @@ -156,7 +164,7 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { if (it != slack_.end()) return values_.begin() + it->second; } if (__builtin_expect(index_.size() == 0, 0)) return end(); - auto it = values_.begin() + index_.index(k); + const_iterator it = values_.begin() + index_.index(k); if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); } @@ -167,7 +175,7 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { if (it != slack_.end()) return values_.begin() + it->second; } if (index_.size() == 0) return end(); - auto it = values_.begin() + index_.index(k); + iterator it = values_.begin() + index_.index(k); if (equal_(it->first, k)) return it; return end(); } diff --git a/cxxmph/stringpiece.h b/cxxmph/stringpiece.h index ee6d125..f1327ea 100644 --- a/cxxmph/stringpiece.h +++ b/cxxmph/stringpiece.h @@ -174,6 +174,8 @@ inline bool operator>=(const StringPiece& x, StringPiece& y) { } // namespace cxxmph // allow StringPiece to be logged -extern std::ostream& operator<<(std::ostream& o, const cxxmph::StringPiece& piece); +inline std::ostream& operator<<(std::ostream& o, const 
cxxmph::StringPiece& piece) { + o << piece.as_string(); return o; +} #endif // CXXMPH_STRINGPIECE_H__ diff --git a/cxxmph/trigraph_test.cc b/cxxmph/trigraph_test.cc new file mode 100644 index 0000000..6220138 --- /dev/null +++ b/cxxmph/trigraph_test.cc @@ -0,0 +1,22 @@ +#include + +#include "trigraph.h" + +using cxxmph::TriGraph; + +int main(int argc, char** argv) { + TriGraph g(4, 2); + g.AddEdge(TriGraph::Edge(0, 1, 2)); + g.AddEdge(TriGraph::Edge(1, 3, 2)); + assert(g.vertex_degree()[0] == 1); + assert(g.vertex_degree()[1] == 2); + assert(g.vertex_degree()[2] == 2); + assert(g.vertex_degree()[3] == 1); + g.RemoveEdge(0); + assert(g.vertex_degree()[0] == 0); + assert(g.vertex_degree()[1] == 1); + assert(g.vertex_degree()[2] == 1); + assert(g.vertex_degree()[3] == 1); + std::vector edges; + g.ExtractEdgesAndClear(&edges); +} From beb77d0e2da4ce0f203d63f4687e2c21369cc77e Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Thu, 10 Nov 2011 16:44:37 -0200 Subject: [PATCH 56/89] Removed tr1 stuff. 
--- cxxmph/bm_common.h | 4 +-- cxxmph/bm_index.cc | 8 +++--- cxxmph/bm_map.cc | 58 ++++++++++++++++++++++---------------------- cxxmph/mph_index.h | 4 +-- cxxmph/mph_map.h | 6 ++--- cxxmph/mph_table.h | 16 ------------ cxxmph/seeded_hash.h | 32 ++++++++++++------------ cxxmph/stringpiece.h | 1 + 8 files changed, 56 insertions(+), 73 deletions(-) delete mode 100644 cxxmph/mph_table.h diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index fc95b21..ad8466e 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -2,19 +2,17 @@ #include #include -#include // for std::tr1::hash +#include // std::hash #include "MurmurHash2.h" #include "benchmark.h" namespace std { -namespace tr1 { template <> struct hash { uint32_t operator()(const cxxmph::StringPiece& k) const { return cxxmph::MurmurHash2(k.data(), k.length(), 1); } }; -} // namespace tr1 } // namespace std namespace cxxmph { diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index f92972b..84bf7d2 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -1,6 +1,6 @@ #include #include -#include +#include #include "bm_common.h" #include "stringpiece.h" @@ -9,7 +9,7 @@ using namespace cxxmph; using std::string; -using std::tr1::unordered_map; +using std::unordered_map; class BM_MPHIndexCreate : public UrlsBenchmark { public: @@ -35,7 +35,7 @@ class BM_STLIndexCreate : public UrlsBenchmark { } } }; - + class BM_MPHIndexSearch : public SearchUrlsBenchmark { public: BM_MPHIndexSearch(const std::string& urls_file, int nsearches) @@ -76,7 +76,7 @@ class BM_STLIndexSearch : public SearchUrlsBenchmark { index.swap(index_); return true; } - std::tr1::unordered_map index_; + std::unordered_map index_; }; int main(int argc, char** argv) { diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 95e76f7..607edc6 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -4,38 +4,38 @@ #include "bm_common.h" #include "mph_map.h" -using cxxmph::mph_map; -using std::string; -using std::tr1::unordered_map; + using 
cxxmph::mph_map; + using std::string; + using std::unordered_map; -namespace cxxmph { + namespace cxxmph { -uint64_t myfind(const unordered_map& mymap, const uint64_t& k) { - return mymap.find(k)->second; -} - -uint64_t myfind(const mph_map& mymap, const uint64_t& k) { - return mymap.find(k)->second; -} - -const StringPiece& myfind(const unordered_map& mymap, const StringPiece& k) { - return mymap.find(k)->second; -} -StringPiece myfind(const mph_map& mymap, const StringPiece& k) { - auto it = mymap.find(k); - return it->second; -} - -template -class BM_CreateUrls : public UrlsBenchmark { - public: - BM_CreateUrls(const string& urls_file) : UrlsBenchmark(urls_file) { } - virtual void Run() { - MapType mymap; - for (auto it = urls_.begin(); it != urls_.end(); ++it) { - mymap[*it] = *it; - } + uint64_t myfind(const unordered_map& mymap, const uint64_t& k) { + return mymap.find(k)->second; } + + uint64_t myfind(const mph_map& mymap, const uint64_t& k) { + return mymap.find(k)->second; + } + + const StringPiece& myfind(const unordered_map& mymap, const StringPiece& k) { + return mymap.find(k)->second; + } + StringPiece myfind(const mph_map& mymap, const StringPiece& k) { + auto it = mymap.find(k); + return it->second; + } + + template + class BM_CreateUrls : public UrlsBenchmark { + public: + BM_CreateUrls(const string& urls_file) : UrlsBenchmark(urls_file) { } + virtual void Run() { + MapType mymap; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + mymap[*it] = *it; + } + } }; template diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 70cee68..3ee9090 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -26,7 +26,7 @@ #include #include -#include // for std::tr1::hash +#include // for std::hash #include #include @@ -201,7 +201,7 @@ uint32_t MPHIndex::index(const Key& key) const { // Simple wrapper around MPHIndex to simplify calling code. Please refer to the // MPHIndex class for documentation. 
-template >::hash_function> +template >::hash_function> class SimpleMPHIndex : public MPHIndex { public: template diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index bcfebb6..23407d4 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -6,7 +6,7 @@ // use it for educational purposes. #include -#include +#include #include #include // for std::pair @@ -15,14 +15,14 @@ namespace cxxmph { -using std::tr1::unordered_map; +using std::unordered_map; // Save on repetitive typing. #define MPH_MAP_TMPL_SPEC template #define MPH_MAP_CLASS_SPEC mph_map #define MPH_MAP_METHOD_DECL(r, m) MPH_MAP_TMPL_SPEC typename MPH_MAP_CLASS_SPEC::r MPH_MAP_CLASS_SPEC::m -template , class EqualKey = std::equal_to, class Alloc = std::allocator > +template , class EqualKey = std::equal_to, class Alloc = std::allocator > class mph_map { public: typedef Key key_type; diff --git a/cxxmph/mph_table.h b/cxxmph/mph_table.h deleted file mode 100644 index 234540d..0000000 --- a/cxxmph/mph_table.h +++ /dev/null @@ -1,16 +0,0 @@ -#include "mph_index.h" - -// String to string map working on mmap'ed memory - -class MPHTable { - public: - typedef StringPiece key_type; - typedef StringPiece data_type; - typedef std::pair value_type; - template - bool Reset(ForwardIterator begin, ForwardIterator end); - private: - char* data_; - vector offsets_; - MPHIndex index_; -}; diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index a12d4f8..64cb74d 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -4,7 +4,7 @@ #include // for uint32_t and friends #include -#include // for std::tr1::hash +#include // for std::hash #include "MurmurHash2.h" #include "stringpiece.h" @@ -59,36 +59,36 @@ struct seeded_hash_function { template struct seeded_hash { typedef seeded_hash_function hash_function; }; -// Use Murmur2 instead for all types defined in std::tr1::hash, plus +// Use Murmur2 instead for all types defined in std::hash, plus // std::string which is commonly extended. 
-template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -template <> struct seeded_hash > +template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; } // namespace cxxmph diff --git a/cxxmph/stringpiece.h b/cxxmph/stringpiece.h index f1327ea..06cea3a 100644 --- a/cxxmph/stringpiece.h +++ b/cxxmph/stringpiece.h @@ -19,6 +19,7 @@ #ifndef CXXMPH_STRINGPIECE_H__ #define CXXMPH_STRINGPIECE_H__ +#include #include #include #include From 91dc5d95d535e7b858c7aa5fc76d1299a0f7625e Mon Sep 17 00:00:00 2001 From: Davi 
de Castro Reis Date: Mon, 5 Dec 2011 16:03:10 -0200 Subject: [PATCH 57/89] Fixed headers. --- cxxmph/bm_common.h | 5 +++++ cxxmph/mph_map.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index ad8466e..4fea687 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -1,3 +1,6 @@ +#ifndef __CXXMPH_BM_COMMON_H__ +#define __CXXMPH_BM_COMMON_H__ + #include "stringpiece.h" #include @@ -58,3 +61,5 @@ class SearchUint64Benchmark : public Uint64Benchmark { }; } // namespace cxxmph + +#endif // __CXXMPH_BM_COMMON_H__ diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 23407d4..ddb8268 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -1,3 +1,5 @@ +#ifndef __CXXMPH_MPH_MAP_H__ +#define __CXXMPH_MPH_MAP_H__ // Implementation of the unordered associative mapping interface using a // minimal perfect hash function. // @@ -191,3 +193,5 @@ MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { } } // namespace cxxmph + +#endif // __CXXMPH_MPH_MAP_H__ From 3ba778f671c98049f814e39247f441f3e2ae74cb Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Fri, 9 Dec 2011 23:57:37 -0200 Subject: [PATCH 58/89] Aesthetics, compile on mac with gcc44. 
--- acinclude.m4 | 2 ++ cxxmph/mph_map.h | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/acinclude.m4 b/acinclude.m4 index bde7628..e926f46 100644 --- a/acinclude.m4 +++ b/acinclude.m4 @@ -44,6 +44,7 @@ AC_DEFUN([AC_COMPILE_STDCXX_0X], [ ac_save_CXXFLAGS="$CXXFLAGS" CXXFLAGS="$CXXFLAGS -std=c++0x" AC_TRY_COMPILE([ + #include template struct check { @@ -66,6 +67,7 @@ AC_DEFUN([AC_COMPILE_STDCXX_0X], [ ac_save_CXXFLAGS="$CXXFLAGS" CXXFLAGS="$CXXFLAGS -std=gnu++0x" AC_TRY_COMPILE([ + #include template struct check { diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index ddb8268..e574c7c 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -17,7 +17,10 @@ namespace cxxmph { +using std::pair; +using std::make_pair; using std::unordered_map; +using std::vector; // Save on repetitive typing. #define MPH_MAP_TMPL_SPEC template @@ -29,7 +32,7 @@ class mph_map { public: typedef Key key_type; typedef Data data_type; - typedef std::pair value_type; + typedef pair value_type; typedef HashFcn hasher; typedef EqualKey key_equal; @@ -44,7 +47,7 @@ class mph_map { // For making macros simpler. 
typedef void void_type; typedef bool bool_type; - typedef std::pair insert_return_type; + typedef pair insert_return_type; mph_map(); ~mph_map(); @@ -58,10 +61,10 @@ class mph_map { void clear(); void erase(iterator pos); void erase(const key_type& k); - std::pair insert(const value_type& x); + pair insert(const value_type& x); iterator find(const key_type& k); const_iterator find(const key_type& k) const; - typedef int32_t my_int32_t; + typedef int32_t my_int32_t; // help macros int32_t index(const key_type& k) const; data_type& operator[](const key_type &k); const data_type& operator[](const key_type &k) const; @@ -109,15 +112,15 @@ MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { iterator it = find(x.first); - if (it != end()) return std::make_pair(it, false); + if (it != end()) return make_pair(it, false); values_.push_back(x); - slack_.insert(std::make_pair(x.first, values_.size() - 1)); + slack_.insert(make_pair(x.first, values_.size() - 1)); if (slack_.size() == index_.size() || (slack_.size() >= 256 && index_.size() == 0)) { pack(); } it = find(x.first); - return std::make_pair(it, true); + return make_pair(it, true); } MPH_MAP_METHOD_DECL(void_type, pack)() { @@ -189,7 +192,7 @@ MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const { } MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { - return insert(std::make_pair(k, data_type())).first->second; + return insert(make_pair(k, data_type())).first->second; } } // namespace cxxmph From 4e4d36d833da0e1c90eeaab10f95246ebb346d8f Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Mon, 26 Dec 2011 19:12:24 -0200 Subject: [PATCH 59/89] Fixed fread test. 
--- src/cmph.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/cmph.c b/src/cmph.c index b0c33bf..ae76727 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -18,18 +18,18 @@ const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", "chd", NULL }; -typedef struct +typedef struct { void *vector; - cmph_uint32 position; // access position when data is a vector + cmph_uint32 position; // access position when data is a vector } cmph_vector_t; -/** +/** * Support a vector of struct as the source of keys. * - * E.g. The keys could be the fieldB's in a vector of struct rec where + * E.g. The keys could be the fieldB's in a vector of struct rec where * struct rec is defined as: * struct rec { * fieldA; @@ -37,7 +37,7 @@ typedef struct * fieldC; * } */ -typedef struct +typedef struct { void *vector; /* Pointer to the vector of struct */ cmph_uint32 position; /* current position */ @@ -61,7 +61,7 @@ static int key_nlfile_read(void *data, char **key, cmph_uint32 *keylen) while(1) { char buf[BUFSIZ]; - char *c = fgets(buf, BUFSIZ, fd); + char *c = fgets(buf, BUFSIZ, fd); if (c == NULL) return -1; if (feof(fd)) return -1; *key = (char *)realloc(*key, *keylen + strlen(buf) + 1); @@ -156,8 +156,12 @@ static cmph_uint32 count_nlfile_keys(FILE *fd) while(1) { char buf[BUFSIZ]; - ptr = fgets(buf, BUFSIZ, fd); + ptr = fgets(buf, BUFSIZ, fd); if (feof(fd)) break; + if (ferror(fd) || ptr == NULL) { + perror("Error reading input file"); + return 0; + } if (buf[strlen(buf) - 1] != '\n') continue; ++count; } From 24e645febebb7716df7c92445c84a7ded548d7af Mon Sep 17 00:00:00 2001 From: Davi de Castro Reis Date: Mon, 26 Dec 2011 19:35:30 -0200 Subject: [PATCH 60/89] Aesthetics in C code and replaced some asserts with NULL returns. 
--- src/bdz.c | 80 ++++++++--------- src/bdz_ph.c | 106 +++++++++++----------- src/bm_numbers.c | 1 - src/bmz.c | 116 ++++++++++++------------ src/bmz8.c | 128 +++++++++++++-------------- src/brz.c | 190 ++++++++++++++++++++-------------------- src/buffer_entry.c | 10 +-- src/buffer_manage.c | 6 +- src/buffer_manager.c | 6 +- src/chd.c | 60 ++++++------- src/chd_ph.c | 157 ++++++++++++++++----------------- src/chm.c | 64 +++++++------- src/cmph.c | 66 +++++++------- src/cmph_structs.c | 8 +- src/djb2_hash.c | 3 +- src/fch.c | 89 ++++++++++--------- src/fch_buckets.c | 24 ++--- src/fnv_hash.c | 12 +-- src/graph.c | 30 +++---- src/hash.c | 10 +-- src/hashtree.c | 32 +++---- src/jenkins_hash.c | 71 +++++++-------- src/linear_string_map.c | 4 +- src/main.c | 26 +++--- src/sdbm_hash.c | 1 + src/vqueue.c | 4 +- src/vstack.c | 1 - 27 files changed, 649 insertions(+), 656 deletions(-) diff --git a/src/bdz.c b/src/bdz.c index 7629a6c..2c0de90 100755 --- a/src/bdz.c +++ b/src/bdz.c @@ -35,9 +35,9 @@ const cmph_uint8 bdz_lookup_table[] = 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 -}; +}; -typedef struct +typedef struct { cmph_uint32 vertices[3]; cmph_uint32 next_edges[3]; @@ -54,12 +54,12 @@ static void bdz_free_queue(bdz_queue_t * queue) free(*queue); }; -typedef struct +typedef struct { cmph_uint32 nedges; bdz_edge_t * edges; cmph_uint32 * first_edge; - cmph_uint8 * vert_degree; + cmph_uint8 * vert_degree; }bdz_graph3_t; @@ -67,7 +67,7 @@ static void bdz_alloc_graph3(bdz_graph3_t * graph3, cmph_uint32 nedges, cmph_uin { graph3->edges=malloc(nedges*sizeof(bdz_edge_t)); graph3->first_edge=malloc(nvertices*sizeof(cmph_uint32)); - graph3->vert_degree=malloc((size_t)nvertices); + graph3->vert_degree=malloc((size_t)nvertices); }; static void bdz_init_graph3(bdz_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices) { @@ -136,7 +136,7 @@ static void 
bdz_remove_edge(bdz_graph3_t * graph3, cmph_uint32 curr_edge) j=0; } else if(graph3->edges[edge1].vertices[1]==vert){ j=1; - } else + } else j=2; edge1=graph3->edges[edge1].next_edges[j]; }; @@ -145,16 +145,16 @@ static void bdz_remove_edge(bdz_graph3_t * graph3, cmph_uint32 curr_edge) bdz_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4); exit(-1); }; - + if(edge2!=NULL_EDGE){ - graph3->edges[edge2].next_edges[j] = + graph3->edges[edge2].next_edges[j] = graph3->edges[edge1].next_edges[i]; - } else + } else graph3->first_edge[vert]= graph3->edges[edge1].next_edges[i]; graph3->vert_degree[vert]--; }; - + }; static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_queue_t queue, bdz_graph3_t* graph3) @@ -170,7 +170,7 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que v0=graph3->edges[i].vertices[0]; v1=graph3->edges[i].vertices[1]; v2=graph3->edges[i].vertices[2]; - if(graph3->vert_degree[v0]==1 || + if(graph3->vert_degree[v0]==1 || graph3->vert_degree[v1]==1 || graph3->vert_degree[v2]==1){ if(!GETBIT(marked_edge,i)) { @@ -196,7 +196,7 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que queue[queue_head++]=tmp_edge; SETBIT(marked_edge,tmp_edge); }; - + }; if(graph3->vert_degree[v1]==1) { tmp_edge=graph3->first_edge[v1]; @@ -204,7 +204,7 @@ static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_que queue[queue_head++]=tmp_edge; SETBIT(marked_edge,tmp_edge); }; - + }; if(graph3->vert_degree[v2]==1){ tmp_edge=graph3->first_edge[v2]; @@ -227,7 +227,7 @@ bdz_config_data_t *bdz_config_new(void) { bdz_config_data_t *bdz; bdz = (bdz_config_data_t *)malloc(sizeof(bdz_config_data_t)); - assert(bdz); + if (!bdz) return NULL; memset(bdz, 0, sizeof(bdz_config_data_t)); bdz->hashfunc = CMPH_HASH_JENKINS; bdz->g = NULL; @@ -328,10 +328,10 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) fprintf(stderr, "acyclic graph creation failure - %u iterations 
remaining\n", iterations); } if (iterations == 0) break; - } + } else break; } - + if (iterations == 0) { bdz_free_queue(&edges); @@ -353,7 +353,7 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) fprintf(stderr, "Entering ranking step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n); } ranking(bdz); - #ifdef CMPH_TIMING + #ifdef CMPH_TIMING ELAPSED_TIME_IN_SECONDS(&construction_time); #endif mphf = (cmph_t *)malloc(sizeof(cmph_t)); @@ -381,17 +381,17 @@ cmph_t *bdz_new(cmph_config_t *mph, double c) } - #ifdef CMPH_TIMING + #ifdef CMPH_TIMING register cmph_uint32 space_usage = bdz_packed_size(mphf)*8; register cmph_uint32 keys_per_bucket = 1; construction_time = construction_time - construction_time_begin; fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz->m, bdz->m/(double)bdz->n, keys_per_bucket, construction_time, space_usage/(double)bdz->m); - #endif + #endif return mphf; } - + static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t queue) { cmph_uint32 e; @@ -405,7 +405,7 @@ static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t que cmph_uint32 h0, h1, h2; cmph_uint32 keylen; char *key = NULL; - mph->key_source->read(mph->key_source->data, &key, &keylen); + mph->key_source->read(mph->key_source->data, &key, &keylen); hash_vector(bdz->hl, key, keylen,hl); h0 = hl[0] % bdz->r; h1 = hl[1] % bdz->r + bdz->r; @@ -414,7 +414,7 @@ static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t que mph->key_source->dispose(mph->key_source->data, key, keylen); bdz_add_edge(graph3,h0,h1,h2); } - cycles = bdz_generate_queue(bdz->m, bdz->n, queue, graph3); + cycles = bdz_generate_queue(bdz->m, bdz->n, queue, graph3); return (cycles == 0); } @@ -426,7 +426,7 @@ static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t cmph_uint32 v0,v1,v2; cmph_uint8 * marked_vertices =malloc((size_t)(bdz->n >> 3) + 1); cmph_uint32 sizeg = (cmph_uint32)ceil(bdz->n/4.0); - bdz->g = (cmph_uint8 
*)calloc((size_t)(sizeg), sizeof(cmph_uint8)); + bdz->g = (cmph_uint8 *)calloc((size_t)(sizeg), sizeof(cmph_uint8)); memset(marked_vertices, 0, (size_t)(bdz->n >> 3) + 1); memset(bdz->g, 0xff, (size_t)(sizeg)); @@ -439,12 +439,12 @@ static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t if(!GETBIT(marked_vertices, v0)){ if(!GETBIT(marked_vertices,v1)) { - SETVALUE1(bdz->g, v1, UNASSIGNED); + SETVALUE1(bdz->g, v1, UNASSIGNED); SETBIT(marked_vertices, v1); } if(!GETBIT(marked_vertices,v2)) { - SETVALUE1(bdz->g, v2, UNASSIGNED); + SETVALUE1(bdz->g, v2, UNASSIGNED); SETBIT(marked_vertices, v2); } SETVALUE1(bdz->g, v0, (6-(GETVALUE(bdz->g, v1) + GETVALUE(bdz->g,v2)))%3); @@ -507,7 +507,7 @@ int bdz_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->r), sizeof(cmph_uint32), (size_t)1, fd); - + cmph_uint32 sizeg = (cmph_uint32)ceil(data->n/4.0); nbytes = fwrite(data->g, sizeof(cmph_uint8)*sizeg, (size_t)1, fd); @@ -541,12 +541,12 @@ void bdz_load(FILE *f, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, f); bdz->hl = hash_state_load(buf, buflen); free(buf); - + DEBUGP("Reading m and n\n"); - nbytes = fread(&(bdz->n), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bdz->m), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bdz->r), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz->r), sizeof(cmph_uint32), (size_t)1, f); sizeg = (cmph_uint32)ceil(bdz->n/4.0); bdz->g = (cmph_uint8 *)calloc((size_t)(sizeg), sizeof(cmph_uint8)); nbytes = fread(bdz->g, sizeg*sizeof(cmph_uint8), (size_t)1, f); @@ -566,7 +566,7 @@ void bdz_load(FILE *f, cmph_t *mphf) #endif return; } - + static inline cmph_uint32 rank(cmph_uint32 b, cmph_uint32 * ranktable, cmph_uint8 * g, 
cmph_uint32 vertex) { @@ -578,17 +578,17 @@ static inline cmph_uint32 rank(cmph_uint32 b, cmph_uint32 * ranktable, cmph_uint while(beg_idx_b < end_idx_b) { base_rank += bdz_lookup_table[*(g + beg_idx_b++)]; - + } DEBUGP("base rank %u\n", base_rank); beg_idx_v = beg_idx_b << 2; DEBUGP("beg_idx_v %u\n", beg_idx_v); - while(beg_idx_v < vertex) + while(beg_idx_v < vertex) { if(GETVALUE(g, beg_idx_v) != UNASSIGNED) base_rank++; beg_idx_v++; } - + return base_rank; } @@ -610,7 +610,7 @@ cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) void bdz_destroy(cmph_t *mphf) { bdz_data_t *data = (bdz_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hl); free(data->ranktable); free(data); @@ -660,18 +660,18 @@ void bdz_pack(cmph_t *mphf, void *packed_mphf) * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 bdz_packed_size(cmph_t *mphf) { bdz_data_t *data = (bdz_data_t *)mphf->data; - CMPH_HASH hl_type = hash_get_type(data->hl); + CMPH_HASH hl_type = hash_get_type(data->hl); return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(hl_type) + 3*sizeof(cmph_uint32) + sizeof(cmph_uint32)*(data->ranktablesize) + sizeof(cmph_uint8) + sizeof(cmph_uint8)* (cmph_uint32)(ceil(data->n/4.0))); } /** cmph_uint32 bdz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -679,13 +679,13 @@ cmph_uint32 bdz_packed_size(cmph_t *mphf) */ cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) { - + register cmph_uint32 vertex; register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; register cmph_uint32 *ranktable = (cmph_uint32*)(hl_ptr + hash_state_packed_size(hl_type)); - + register cmph_uint32 r = *ranktable++; register cmph_uint32 ranktablesize = *ranktable++; register cmph_uint8 * g = (cmph_uint8 *)(ranktable + ranktablesize); diff --git a/src/bdz_ph.c b/src/bdz_ph.c index 16257c0..ad52d78 100755 --- a/src/bdz_ph.c +++ b/src/bdz_ph.c @@ -24,7 +24,7 @@ static cmph_uint8 lookup_table[5][256] = { {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, }; -typedef struct +typedef struct { cmph_uint32 vertices[3]; cmph_uint32 next_edges[3]; @@ -41,12 +41,12 @@ static void bdz_ph_free_queue(bdz_ph_queue_t * queue) free(*queue); }; -typedef struct +typedef struct { cmph_uint32 nedges; bdz_ph_edge_t * edges; cmph_uint32 * first_edge; - cmph_uint8 * vert_degree; + cmph_uint8 * vert_degree; }bdz_ph_graph3_t; @@ -54,7 +54,7 @@ static 
void bdz_ph_alloc_graph3(bdz_ph_graph3_t * graph3, cmph_uint32 nedges, cm { graph3->edges=malloc(nedges*sizeof(bdz_ph_edge_t)); graph3->first_edge=malloc(nvertices*sizeof(cmph_uint32)); - graph3->vert_degree=malloc((size_t)nvertices); + graph3->vert_degree=malloc((size_t)nvertices); }; static void bdz_ph_init_graph3(bdz_ph_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices) { @@ -101,10 +101,10 @@ static void bdz_ph_dump_graph(bdz_ph_graph3_t* graph3, cmph_uint32 nedges, cmph_ printf(" nexts %d %d %d",graph3->edges[i].next_edges[0], graph3->edges[i].next_edges[1],graph3->edges[i].next_edges[2]); }; - + for(i=0;ifirst_edge[i]); - + }; }; @@ -121,7 +121,7 @@ static void bdz_ph_remove_edge(bdz_ph_graph3_t * graph3, cmph_uint32 curr_edge) j=0; } else if(graph3->edges[edge1].vertices[1]==vert){ j=1; - } else + } else j=2; edge1=graph3->edges[edge1].next_edges[j]; }; @@ -130,16 +130,16 @@ static void bdz_ph_remove_edge(bdz_ph_graph3_t * graph3, cmph_uint32 curr_edge) bdz_ph_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4); exit(-1); }; - + if(edge2!=NULL_EDGE){ - graph3->edges[edge2].next_edges[j] = + graph3->edges[edge2].next_edges[j] = graph3->edges[edge1].next_edges[i]; - } else + } else graph3->first_edge[vert]= graph3->edges[edge1].next_edges[i]; graph3->vert_degree[vert]--; }; - + }; static int bdz_ph_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_ph_queue_t queue, bdz_ph_graph3_t* graph3) @@ -176,7 +176,7 @@ static int bdz_ph_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_ queue[queue_head++]=tmp_edge; SETBIT(marked_edge,tmp_edge); }; - + }; if(graph3->vert_degree[v1]==1) { tmp_edge=graph3->first_edge[v1]; @@ -184,7 +184,7 @@ static int bdz_ph_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_ queue[queue_head++]=tmp_edge; SETBIT(marked_edge,tmp_edge); }; - + }; if(graph3->vert_degree[v2]==1){ tmp_edge=graph3->first_edge[v2]; @@ -229,7 +229,7 @@ void bdz_ph_config_set_hashfuncs(cmph_config_t 
*mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 1) break; //bdz_ph only uses one linear hash function - bdz_ph->hashfunc = *hashptr; + bdz_ph->hashfunc = *hashptr; ++i, ++hashptr; } } @@ -251,16 +251,16 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) if (c == 0) c = 1.23; // validating restrictions over parameter c. DEBUGP("c: %f\n", c); - bdz_ph->m = mph->key_source->nkeys; - bdz_ph->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3); + bdz_ph->m = mph->key_source->nkeys; + bdz_ph->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3); if ((bdz_ph->r % 2) == 0) bdz_ph->r += 1; bdz_ph->n = 3*bdz_ph->r; - + bdz_ph_alloc_graph3(&graph3, bdz_ph->m, bdz_ph->n); bdz_ph_alloc_queue(&edges,bdz_ph->m); DEBUGP("Created hypergraph\n"); - + DEBUGP("m (edges): %u n (vertices): %u r: %u c: %f \n", bdz_ph->m, bdz_ph->n, bdz_ph->r, c); // Mapping step @@ -287,10 +287,10 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) fprintf(stderr, "acyclic graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } + } else break; } - + if (iterations == 0) { // free(bdz_ph->g); @@ -308,7 +308,7 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) bdz_ph_free_queue(&edges); bdz_ph_free_graph3(&graph3); - + if (mph->verbosity) { fprintf(stderr, "Starting optimization step\n"); @@ -338,23 +338,23 @@ cmph_t *bdz_ph_new(cmph_config_t *mph, double c) fprintf(stderr, "Successfully generated minimal perfect hash function\n"); } - #ifdef CMPH_TIMING + #ifdef CMPH_TIMING register cmph_uint32 space_usage = bdz_ph_packed_size(mphf)*8; register cmph_uint32 keys_per_bucket = 1; construction_time = construction_time - construction_time_begin; fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz_ph->m, bdz_ph->m/(double)bdz_ph->n, keys_per_bucket, construction_time, space_usage/(double)bdz_ph->m); - #endif + #endif return mphf; } - + static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue) { 
cmph_uint32 e; int cycles = 0; cmph_uint32 hl[3]; - + bdz_ph_config_data_t *bdz_ph = (bdz_ph_config_data_t *)mph->data; bdz_ph_init_graph3(graph3, bdz_ph->m, bdz_ph->n); mph->key_source->rewind(mph->key_source->data); @@ -363,7 +363,7 @@ static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_qu cmph_uint32 h0, h1, h2; cmph_uint32 keylen; char *key = NULL; - mph->key_source->read(mph->key_source->data, &key, &keylen); + mph->key_source->read(mph->key_source->data, &key, &keylen); hash_vector(bdz_ph->hl, key, keylen, hl); h0 = hl[0] % bdz_ph->r; h1 = hl[1] % bdz_ph->r + bdz_ph->r; @@ -371,7 +371,7 @@ static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_qu mph->key_source->dispose(mph->key_source->data, key, keylen); bdz_ph_add_edge(graph3,h0,h1,h2); } - cycles = bdz_ph_generate_queue(bdz_ph->m, bdz_ph->n, queue, graph3); + cycles = bdz_ph_generate_queue(bdz_ph->m, bdz_ph->n, queue, graph3); return (cycles == 0); } @@ -383,7 +383,7 @@ static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz cmph_uint32 v0,v1,v2; cmph_uint8 * marked_vertices =malloc((size_t)(bdz_ph->n >> 3) + 1); cmph_uint32 sizeg = (cmph_uint32)ceil(bdz_ph->n/4.0); - bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); + bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); memset(marked_vertices, 0, (size_t)(bdz_ph->n >> 3) + 1); //memset(bdz_ph->g, 0xff, sizeg); @@ -396,14 +396,14 @@ static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz if(!GETBIT(marked_vertices, v0)){ if(!GETBIT(marked_vertices,v1)) { - //SETVALUE(bdz_ph->g, v1, UNASSIGNED); + //SETVALUE(bdz_ph->g, v1, UNASSIGNED); SETBIT(marked_vertices, v1); } if(!GETBIT(marked_vertices,v2)) { - //SETVALUE(bdz_ph->g, v2, UNASSIGNED); + //SETVALUE(bdz_ph->g, v2, UNASSIGNED); SETBIT(marked_vertices, v2); - } + } SETVALUE0(bdz_ph->g, v0, (6-(GETVALUE(bdz_ph->g, v1) + GETVALUE(bdz_ph->g,v2)))%3); SETBIT(marked_vertices, 
v0); } else if(!GETBIT(marked_vertices, v1)) { @@ -417,7 +417,7 @@ static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz }else { SETVALUE0(bdz_ph->g, v2, (8-(GETVALUE(bdz_ph->g,v0)+GETVALUE(bdz_ph->g, v1)))%3); SETBIT(marked_vertices, v2); - } + } DEBUGP("A:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz_ph->g, v0), GETVALUE(bdz_ph->g, v1), GETVALUE(bdz_ph->g, v2)); }; free(marked_vertices); @@ -428,11 +428,11 @@ static void bdz_ph_optimization(bdz_ph_config_data_t *bdz_ph) cmph_uint32 i; cmph_uint8 byte = 0; cmph_uint32 sizeg = (cmph_uint32)ceil(bdz_ph->n/5.0); - cmph_uint8 * new_g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); + cmph_uint8 * new_g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); cmph_uint8 value; cmph_uint32 idx; - for(i = 0; i < bdz_ph->n; i++) - { + for(i = 0; i < bdz_ph->n; i++) + { idx = i/5; byte = new_g[idx]; value = GETVALUE(bdz_ph->g, i); @@ -462,7 +462,7 @@ int bdz_ph_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->r), sizeof(cmph_uint32), (size_t)1, fd); - sizeg = (cmph_uint32)ceil(data->n/5.0); + sizeg = (cmph_uint32)ceil(data->n/5.0); nbytes = fwrite(data->g, sizeof(cmph_uint8)*sizeg, (size_t)1, fd); #ifdef DEBUG @@ -491,19 +491,19 @@ void bdz_ph_load(FILE *f, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, f); bdz_ph->hl = hash_state_load(buf, buflen); free(buf); - + DEBUGP("Reading m and n\n"); - nbytes = fread(&(bdz_ph->n), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bdz_ph->m), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bdz_ph->r), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz_ph->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz_ph->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz_ph->r), sizeof(cmph_uint32), (size_t)1, f); sizeg = (cmph_uint32)ceil(bdz_ph->n/5.0); 
bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); nbytes = fread(bdz_ph->g, sizeg*sizeof(cmph_uint8), (size_t)1, f); return; } - + cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { @@ -520,12 +520,12 @@ cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) byte0 = bdz_ph->g[hl[0]/5]; byte1 = bdz_ph->g[hl[1]/5]; byte2 = bdz_ph->g[hl[2]/5]; - + byte0 = lookup_table[hl[0]%5U][byte0]; byte1 = lookup_table[hl[1]%5U][byte1]; byte2 = lookup_table[hl[2]%5U][byte2]; vertex = hl[(byte0 + byte1 + byte2)%3]; - + return vertex; } @@ -533,7 +533,7 @@ cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) void bdz_ph_destroy(cmph_t *mphf) { bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hl); free(data); free(mphf); @@ -571,17 +571,17 @@ void bdz_ph_pack(cmph_t *mphf, void *packed_mphf) * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 bdz_ph_packed_size(cmph_t *mphf) { bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; - CMPH_HASH hl_type = hash_get_type(data->hl); + CMPH_HASH hl_type = hash_get_type(data->hl); cmph_uint32 sizeg = (cmph_uint32)ceil(data->n/5.0); return (cmph_uint32) (sizeof(CMPH_ALGO) + hash_state_packed_size(hl_type) + 2*sizeof(cmph_uint32) + sizeof(cmph_uint8)*sizeg); } /** cmph_uint32 bdz_ph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -589,21 +589,21 @@ cmph_uint32 bdz_ph_packed_size(cmph_t *mphf) */ cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) { - + register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; - + register cmph_uint8 * ptr = hl_ptr + hash_state_packed_size(hl_type); register cmph_uint32 r = *((cmph_uint32*) ptr); register cmph_uint8 * g = ptr + 4; - + cmph_uint32 hl[3]; register cmph_uint8 byte0, byte1, byte2; register cmph_uint32 vertex; hash_vector_packed(hl_ptr, hl_type, key, keylen, hl); - + hl[0] = hl[0] % r; hl[1] = hl[1] % r + r; hl[2] = hl[2] % r + (r << 1); @@ -611,11 +611,11 @@ cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 byte0 = g[hl[0]/5]; byte1 = g[hl[1]/5]; byte2 = g[hl[2]/5]; - + byte0 = lookup_table[hl[0]%5][byte0]; byte1 = lookup_table[hl[1]%5][byte1]; byte2 = lookup_table[hl[2]%5][byte2]; vertex = hl[(byte0 + byte1 + byte2)%3]; - + return vertex; } diff --git a/src/bm_numbers.c b/src/bm_numbers.c index cd3aa74..4ede2d7 100644 --- a/src/bm_numbers.c +++ b/src/bm_numbers.c @@ -128,4 +128,3 @@ int main(int argc, char** argv) { lsmap_destroy(g_created_mphf); return 0; } - diff --git a/src/bmz.c b/src/bmz.c index 51c7785..eb3190e 100644 --- a/src/bmz.c +++ b/src/bmz.c @@ -24,7 +24,7 @@ bmz_config_data_t *bmz_config_new(void) { bmz_config_data_t *bmz = NULL; bmz = (bmz_config_data_t *)malloc(sizeof(bmz_config_data_t)); - assert(bmz); + if (!bmz) return NULL; memset(bmz, 0, sizeof(bmz_config_data_t)); bmz->hashfuncs[0] = CMPH_HASH_JENKINS; bmz->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -49,7 +49,7 @@ void bmz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 2) break; //bmz only uses two hash functions - bmz->hashfuncs[i] = *hashptr; + bmz->hashfuncs[i] = 
*hashptr; ++i, ++hashptr; } } @@ -68,8 +68,8 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data; if (c == 0) c = 1.15; // validating restrictions over parameter c. DEBUGP("c: %f\n", c); - bmz->m = mph->key_source->nkeys; - bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); + bmz->m = mph->key_source->nkeys; + bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz->m, bmz->n, c); bmz->graph = graph_new(bmz->n, bmz->m); DEBUGP("Created graph\n"); @@ -81,7 +81,7 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) { // Mapping step cmph_uint32 biggest_g_value = 0; - cmph_uint32 biggest_edge_value = 1; + cmph_uint32 biggest_edge_value = 1; iterations = 100; if (mph->verbosity) { @@ -109,12 +109,12 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } + } else break; } if (iterations == 0) { - graph_destroy(bmz->graph); + graph_destroy(bmz->graph); return NULL; } // Ordering step @@ -155,17 +155,17 @@ cmph_t *bmz_new(cmph_config_t *mph, double c) } bmz_traverse_non_critical_nodes(bmz, used_edges, visited); // non_critical_nodes } - else + else { iterations_map--; if (mph->verbosity) fprintf(stderr, "Restarting mapping step. 
%u iterations remaining.\n", iterations_map); - } + } free(used_edges); free(visited); } while(restart_mapping && iterations_map > 0); graph_destroy(bmz->graph); bmz->graph = NULL; - if (iterations_map == 0) + if (iterations_map == 0) { return NULL; } @@ -212,15 +212,15 @@ static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint3 while(!vqueue_is_empty(q)) { v = vqueue_remove(q); - it = graph_neighbors_it(bmz->graph, v); + it = graph_neighbors_it(bmz->graph, v); while ((u = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR) - { + { if (graph_node_is_critical(bmz->graph, u) && (!GETBIT(visited,u))) { collision = 1; while(collision) // lookahead to resolve collisions { - next_g = *biggest_g_value + 1; + next_g = *biggest_g_value + 1; it1 = graph_neighbors_it(bmz->graph, u); collision = 0; while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR) @@ -232,7 +232,7 @@ static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint3 vqueue_destroy(q); return 1; // restart mapping step. } - if (GETBIT(used_edges, (next_g + bmz->g[lav]))) + if (GETBIT(used_edges, (next_g + bmz->g[lav]))) { collision = 1; break; @@ -240,7 +240,7 @@ static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint3 } } if (next_g > *biggest_g_value) *biggest_g_value = next_g; - } + } // Marking used edges... it1 = graph_neighbors_it(bmz->graph, u); while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR) @@ -254,9 +254,9 @@ static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint3 bmz->g[u] = next_g; // Labelling vertex u. 
SETBIT(visited,u); vqueue_insert(q, u); - } + } } - + } vqueue_destroy(q); return 0; @@ -282,22 +282,22 @@ static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, while(!vqueue_is_empty(q)) { v = vqueue_remove(q); - it = graph_neighbors_it(bmz->graph, v); + it = graph_neighbors_it(bmz->graph, v); while ((u = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR) - { + { if (graph_node_is_critical(bmz->graph, u) && (!GETBIT(visited,u))) { cmph_uint32 next_g_index = 0; collision = 1; while(collision) // lookahead to resolve collisions { - if (next_g_index < nunused_g_values) + if (next_g_index < nunused_g_values) { - next_g = unused_g_values[next_g_index++]; + next_g = unused_g_values[next_g_index++]; } - else + else { - next_g = *biggest_g_value + 1; + next_g = *biggest_g_value + 1; next_g_index = UINT_MAX; } it1 = graph_neighbors_it(bmz->graph, u); @@ -312,7 +312,7 @@ static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, free(unused_g_values); return 1; // restart mapping step. } - if (GETBIT(used_edges, (next_g + bmz->g[lav]))) + if (GETBIT(used_edges, (next_g + bmz->g[lav]))) { collision = 1; break; @@ -324,13 +324,13 @@ static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, if(nunused_g_values == unused_g_values_capacity) { unused_g_values = (cmph_uint32 *)realloc(unused_g_values, (unused_g_values_capacity + BUFSIZ)*sizeof(cmph_uint32)); - unused_g_values_capacity += BUFSIZ; - } - unused_g_values[nunused_g_values++] = next_g; + unused_g_values_capacity += BUFSIZ; + } + unused_g_values[nunused_g_values++] = next_g; } if (next_g > *biggest_g_value) *biggest_g_value = next_g; - } + } next_g_index--; if (next_g_index < nunused_g_values) unused_g_values[next_g_index] = unused_g_values[--nunused_g_values]; @@ -347,13 +347,13 @@ static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, bmz->g[u] = next_g; // Labelling vertex u. 
SETBIT(visited, u); vqueue_insert(q, u); - } + } } - + } vqueue_destroy(q); free(unused_g_values); - return 0; + return 0; } static cmph_uint32 next_unused_edge(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint32 unused_edge_index) @@ -381,8 +381,8 @@ static void bmz_traverse(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_u SETBIT(visited, neighbor); (*unused_edge_index)++; bmz_traverse(bmz, used_edges, neighbor, unused_edge_index, visited); - - } + + } } static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint8 * visited) @@ -394,7 +394,7 @@ static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * { v1 = graph_vertex_id(bmz->graph, i, 0); v2 = graph_vertex_id(bmz->graph, i, 1); - if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; + if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; if(GETBIT(visited,v1)) bmz_traverse(bmz, used_edges, v1, &unused_edge_index, visited); else bmz_traverse(bmz, used_edges, v2, &unused_edge_index, visited); @@ -403,7 +403,7 @@ static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * for(i = 0; i < bmz->n; i++) { if(!GETBIT(visited,i)) - { + { bmz->g[i] = 0; SETBIT(visited, i); bmz_traverse(bmz, used_edges, i, &unused_edge_index, visited); @@ -411,14 +411,14 @@ static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * } } - + static int bmz_gen_edges(cmph_config_t *mph) { cmph_uint32 e; bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data; cmph_uint8 multiple_edges = 0; DEBUGP("Generating edges for %u vertices\n", bmz->n); - graph_clear_edges(bmz->graph); + graph_clear_edges(bmz->graph); mph->key_source->rewind(mph->key_source->data); for (e = 0; e < mph->key_source->nkeys; ++e) { @@ -426,12 +426,12 @@ static int bmz_gen_edges(cmph_config_t *mph) cmph_uint32 keylen; char *key = NULL; 
mph->key_source->read(mph->key_source->data, &key, &keylen); - + h1 = hash(bmz->hashes[0], key, keylen) % bmz->n; h2 = hash(bmz->hashes[1], key, keylen) % bmz->n; if (h1 == h2) if (++h2 >= bmz->n) h2 = 0; DEBUGP("key: %.*s h1: %u h2: %u\n", keylen, key, h1, h2); - if (h1 == h2) + if (h1 == h2) { if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); mph->key_source->dispose(mph->key_source->data, key, keylen); @@ -472,7 +472,7 @@ int bmz_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); - + nbytes = fwrite(data->g, sizeof(cmph_uint32)*(data->n), (size_t)1, fd); #ifdef DEBUG cmph_uint32 i; @@ -510,8 +510,8 @@ void bmz_load(FILE *f, cmph_t *mphf) } DEBUGP("Reading m and n\n"); - nbytes = fread(&(bmz->n), sizeof(cmph_uint32), (size_t)1, f); - nbytes = fread(&(bmz->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bmz->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bmz->m), sizeof(cmph_uint32), (size_t)1, f); bmz->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*bmz->n); nbytes = fread(bmz->g, bmz->n*sizeof(cmph_uint32), (size_t)1, f); @@ -522,7 +522,7 @@ void bmz_load(FILE *f, cmph_t *mphf) #endif return; } - + cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { @@ -537,7 +537,7 @@ cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) void bmz_destroy(cmph_t *mphf) { bmz_data_t *data = (bmz_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hashes[0]); hash_state_destroy(data->hashes[1]); free(data->hashes); @@ -548,7 +548,7 @@ void bmz_destroy(cmph_t *mphf) /** \fn void bmz_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. 
* \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() */ void bmz_pack(cmph_t *mphf, void *packed_mphf) { @@ -579,26 +579,26 @@ void bmz_pack(cmph_t *mphf, void *packed_mphf) ptr += sizeof(data->n); // packing g - memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); + memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); } /** \fn cmph_uint32 bmz_packed_size(cmph_t *mphf); * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 bmz_packed_size(cmph_t *mphf) { bmz_data_t *data = (bmz_data_t *)mphf->data; - CMPH_HASH h1_type = hash_get_type(data->hashes[0]); - CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); - return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + 3*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n); } /** cmph_uint32 bmz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -613,13 +613,13 @@ cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); h2_ptr += 4; - + register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); - - register cmph_uint32 n = *g_ptr++; - - register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; - register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; + + register cmph_uint32 n = *g_ptr++; + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; if (h1 == h2 && ++h2 > n) h2 = 0; - return (g_ptr[h1] + g_ptr[h2]); + return (g_ptr[h1] + g_ptr[h2]); } diff --git a/src/bmz8.c b/src/bmz8.c index 4db4dfc..54ba606 100644 --- a/src/bmz8.c +++ b/src/bmz8.c @@ -23,7 +23,7 @@ bmz8_config_data_t *bmz8_config_new(void) { bmz8_config_data_t *bmz8; bmz8 = (bmz8_config_data_t *)malloc(sizeof(bmz8_config_data_t)); - assert(bmz8); + if (!bmz8) return NULL; memset(bmz8, 0, sizeof(bmz8_config_data_t)); bmz8->hashfuncs[0] = CMPH_HASH_JENKINS; bmz8->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -48,7 +48,7 @@ void bmz8_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 2) break; //bmz8 only uses two hash functions - bmz8->hashfuncs[i] = *hashptr; + bmz8->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -64,7 +64,7 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c) cmph_uint8 restart_mapping = 0; cmph_uint8 * visited = NULL; bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data; - + if (mph->key_source->nkeys >= 256) { if (mph->verbosity) fprintf(stderr, "The number of keys in BMZ8 must be lower than 256.\n"); @@ -72,8 +72,8 @@ cmph_t *bmz8_new(cmph_config_t *mph, 
double c) } if (c == 0) c = 1.15; // validating restrictions over parameter c. DEBUGP("c: %f\n", c); - bmz8->m = (cmph_uint8) mph->key_source->nkeys; - bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys); + bmz8->m = (cmph_uint8) mph->key_source->nkeys; + bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz8->m, bmz8->n, c); bmz8->graph = graph_new(bmz8->n, bmz8->m); DEBUGP("Created graph\n"); @@ -113,8 +113,8 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c) fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } - else break; + } + else break; } if (iterations == 0) { @@ -161,19 +161,19 @@ cmph_t *bmz8_new(cmph_config_t *mph, double c) } bmz8_traverse_non_critical_nodes(bmz8, used_edges, visited); // non_critical_nodes } - else + else { iterations_map--; if (mph->verbosity) fprintf(stderr, "Restarting mapping step. %u iterations remaining.\n", iterations_map); - } + } free(used_edges); free(visited); }while(restart_mapping && iterations_map > 0); - graph_destroy(bmz8->graph); + graph_destroy(bmz8->graph); bmz8->graph = NULL; - if (iterations_map == 0) + if (iterations_map == 0) { return NULL; } @@ -213,15 +213,15 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui while(!vqueue_is_empty(q)) { v = vqueue_remove(q); - it = graph_neighbors_it(bmz8->graph, v); + it = graph_neighbors_it(bmz8->graph, v); while ((u = graph_next_neighbor(bmz8->graph, &it)) != GRAPH_NO_NEIGHBOR) - { + { if (graph_node_is_critical(bmz8->graph, u) && (!GETBIT(visited,u))) { collision = 1; while(collision) // lookahead to resolve collisions { - next_g = (cmph_uint8)(*biggest_g_value + 1); + next_g = (cmph_uint8)(*biggest_g_value + 1); it1 = graph_neighbors_it(bmz8->graph, u); collision = 0; while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR) @@ -233,7 +233,7 @@ static cmph_uint8 
bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui vqueue_destroy(q); return 1; // restart mapping step. } - if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) + if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) { collision = 1; break; @@ -241,7 +241,7 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui } } if (next_g > *biggest_g_value) *biggest_g_value = next_g; - } + } // Marking used edges... it1 = graph_neighbors_it(bmz8->graph, u); while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR) @@ -250,16 +250,16 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui { SETBIT(used_edges,(next_g + bmz8->g[lav])); - if(next_g + bmz8->g[lav] > *biggest_edge_value) + if(next_g + bmz8->g[lav] > *biggest_edge_value) *biggest_edge_value = (cmph_uint8)(next_g + bmz8->g[lav]); } } bmz8->g[u] = next_g; // Labelling vertex u. SETBIT(visited,u); vqueue_insert(q, u); - } + } } - + } vqueue_destroy(q); return 0; @@ -268,8 +268,8 @@ static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_ui static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited) { cmph_uint8 next_g; - cmph_uint32 u; - cmph_uint32 lav; + cmph_uint32 u; + cmph_uint32 lav; cmph_uint8 collision; cmph_uint8 * unused_g_values = NULL; cmph_uint8 unused_g_values_capacity = 0; @@ -280,27 +280,27 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz DEBUGP("Labelling critical vertices\n"); bmz8->g[v] = (cmph_uint8)(ceil ((double)(*biggest_edge_value)/2) - 1); SETBIT(visited, v); - next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2)); + next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2)); vqueue_insert(q, v); while(!vqueue_is_empty(q)) { v = vqueue_remove(q); - it = graph_neighbors_it(bmz8->graph, v); + it = 
graph_neighbors_it(bmz8->graph, v); while ((u = graph_next_neighbor(bmz8->graph, &it)) != GRAPH_NO_NEIGHBOR) - { + { if (graph_node_is_critical(bmz8->graph, u) && (!GETBIT(visited,u))) { cmph_uint8 next_g_index = 0; collision = 1; while(collision) // lookahead to resolve collisions { - if (next_g_index < nunused_g_values) + if (next_g_index < nunused_g_values) { next_g = unused_g_values[next_g_index++]; } - else + else { - next_g = (cmph_uint8)(*biggest_g_value + 1); + next_g = (cmph_uint8)(*biggest_g_value + 1); next_g_index = 255;//UINT_MAX; } it1 = graph_neighbors_it(bmz8->graph, u); @@ -315,7 +315,7 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz free(unused_g_values); return 1; // restart mapping step. } - if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) + if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) { collision = 1; break; @@ -327,14 +327,14 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz if(nunused_g_values == unused_g_values_capacity) { unused_g_values = (cmph_uint8*)realloc(unused_g_values, ((size_t)(unused_g_values_capacity + BUFSIZ))*sizeof(cmph_uint8)); - unused_g_values_capacity += (cmph_uint8)BUFSIZ; - } - unused_g_values[nunused_g_values++] = next_g; + unused_g_values_capacity += (cmph_uint8)BUFSIZ; + } + unused_g_values[nunused_g_values++] = next_g; } if (next_g > *biggest_g_value) *biggest_g_value = next_g; } - + next_g_index--; if (next_g_index < nunused_g_values) unused_g_values[next_g_index] = unused_g_values[--nunused_g_values]; @@ -345,22 +345,22 @@ static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited, lav)) { SETBIT(used_edges,(next_g + bmz8->g[lav])); - if(next_g + bmz8->g[lav] > *biggest_edge_value) + if(next_g + bmz8->g[lav] > *biggest_edge_value) *biggest_edge_value = (cmph_uint8)(next_g + bmz8->g[lav]); } } - + bmz8->g[u] = next_g; // Labelling vertex u. 
SETBIT(visited, u); vqueue_insert(q, u); - - } + + } } - + } vqueue_destroy(q); free(unused_g_values); - return 0; + return 0; } static cmph_uint8 next_unused_edge(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint32 unused_edge_index) @@ -388,8 +388,8 @@ static void bmz8_traverse(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmp SETBIT(visited, neighbor); (*unused_edge_index)++; bmz8_traverse(bmz8, used_edges, neighbor, unused_edge_index, visited); - - } + + } } static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint8 * visited) @@ -401,7 +401,7 @@ static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint { v1 = (cmph_uint8)graph_vertex_id(bmz8->graph, i, 0); v2 = (cmph_uint8)graph_vertex_id(bmz8->graph, i, 1); - if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; + if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; if(GETBIT(visited,v1)) bmz8_traverse(bmz8, used_edges, v1, &unused_edge_index, visited); else bmz8_traverse(bmz8, used_edges, v2, &unused_edge_index, visited); @@ -410,7 +410,7 @@ static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint for(i = 0; i < bmz8->n; i++) { if(!GETBIT(visited,i)) - { + { bmz8->g[i] = 0; SETBIT(visited, i); bmz8_traverse(bmz8, used_edges, i, &unused_edge_index, visited); @@ -418,14 +418,14 @@ static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint } } - + static int bmz8_gen_edges(cmph_config_t *mph) { cmph_uint8 e; bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data; cmph_uint8 multiple_edges = 0; DEBUGP("Generating edges for %u vertices\n", bmz8->n); - graph_clear_edges(bmz8->graph); + graph_clear_edges(bmz8->graph); mph->key_source->rewind(mph->key_source->data); for (e = 0; e < mph->key_source->nkeys; ++e) { @@ -433,12 +433,12 @@ static int bmz8_gen_edges(cmph_config_t *mph) 
cmph_uint32 keylen; char *key = NULL; mph->key_source->read(mph->key_source->data, &key, &keylen); - + // if (key == NULL)fprintf(stderr, "key = %s -- read BMZ\n", key); h1 = (cmph_uint8)(hash(bmz8->hashes[0], key, keylen) % bmz8->n); h2 = (cmph_uint8)(hash(bmz8->hashes[1], key, keylen) % bmz8->n); if (h1 == h2) if (++h2 >= bmz8->n) h2 = 0; - if (h1 == h2) + if (h1 == h2) { if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); mph->key_source->dispose(mph->key_source->data, key, keylen); @@ -480,7 +480,7 @@ int bmz8_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint8), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint8), (size_t)1, fd); - + nbytes = fwrite(data->g, sizeof(cmph_uint8)*(data->n), (size_t)1, fd); /* #ifdef DEBUG fprintf(stderr, "G: "); @@ -518,8 +518,8 @@ void bmz8_load(FILE *f, cmph_t *mphf) } DEBUGP("Reading m and n\n"); - nbytes = fread(&(bmz8->n), sizeof(cmph_uint8), (size_t)1, f); - nbytes = fread(&(bmz8->m), sizeof(cmph_uint8), (size_t)1, f); + nbytes = fread(&(bmz8->n), sizeof(cmph_uint8), (size_t)1, f); + nbytes = fread(&(bmz8->m), sizeof(cmph_uint8), (size_t)1, f); bmz8->g = (cmph_uint8 *)malloc(sizeof(cmph_uint8)*bmz8->n); nbytes = fread(bmz8->g, bmz8->n*sizeof(cmph_uint8), (size_t)1, f); @@ -530,7 +530,7 @@ void bmz8_load(FILE *f, cmph_t *mphf) #endif return; } - + cmph_uint8 bmz8_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { @@ -556,7 +556,7 @@ void bmz8_destroy(cmph_t *mphf) /** \fn void bmz8_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. 
The size of packed_mphf must be at least cmph_packed_size() */ void bmz8_pack(cmph_t *mphf, void *packed_mphf) { @@ -585,26 +585,26 @@ void bmz8_pack(cmph_t *mphf, void *packed_mphf) *ptr++ = data->n; // packing g - memcpy(ptr, data->g, sizeof(cmph_uint8)*data->n); + memcpy(ptr, data->g, sizeof(cmph_uint8)*data->n); } /** \fn cmph_uint32 bmz8_packed_size(cmph_t *mphf); * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 bmz8_packed_size(cmph_t *mphf) { bmz8_data_t *data = (bmz8_data_t *)mphf->data; - CMPH_HASH h1_type = hash_get_type(data->hashes[0]); - CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); - return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + 2*sizeof(cmph_uint32) + sizeof(cmph_uint8) + sizeof(cmph_uint8)*data->n); } /** cmph_uint8 bmz8_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -619,14 +619,14 @@ cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); h2_ptr += 4; - + register cmph_uint8 *g_ptr = h2_ptr + hash_state_packed_size(h2_type); - - register cmph_uint8 n = *g_ptr++; - - register cmph_uint8 h1 = (cmph_uint8)(hash_packed(h1_ptr, h1_type, key, keylen) % n); - register cmph_uint8 h2 = (cmph_uint8)(hash_packed(h2_ptr, h2_type, key, keylen) % n); + + register cmph_uint8 n = *g_ptr++; + + register cmph_uint8 h1 = (cmph_uint8)(hash_packed(h1_ptr, h1_type, key, keylen) % n); + register cmph_uint8 h2 = (cmph_uint8)(hash_packed(h2_ptr, h2_type, key, keylen) % n); DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); if (h1 == h2 && ++h2 > n) h2 = 0; - return (cmph_uint8)(g_ptr[h1] + g_ptr[h2]); + return (cmph_uint8)(g_ptr[h1] + g_ptr[h2]); } diff --git a/src/brz.c b/src/brz.c index f9c48ef..bac5bc5 100755 --- a/src/brz.c +++ b/src/brz.c @@ -26,8 +26,9 @@ static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fch static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen); brz_config_data_t *brz_config_new(void) { - brz_config_data_t *brz = NULL; + brz_config_data_t *brz = NULL; brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); + if (!brz) return NULL; brz->algo = CMPH_FCH; brz->b = 128; brz->hashfuncs[0] = CMPH_HASH_JENKINS; @@ -42,7 +43,7 @@ brz_config_data_t *brz_config_new(void) brz->memory_availability = 1024*1024; brz->tmp_dir = (cmph_uint8 *)calloc((size_t)10, sizeof(cmph_uint8)); brz->mphf_fd = NULL; - strcpy((char *)(brz->tmp_dir), "/var/tmp/"); + strcpy((char *)(brz->tmp_dir), "/var/tmp/"); assert(brz); return brz; } @@ -63,7 +64,7 @@ void brz_config_set_hashfuncs(cmph_config_t 
*mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 3) break; //brz only uses three hash functions - brz->hashfuncs[i] = *hashptr; + brz->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -84,14 +85,14 @@ void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) if(tmp_dir[len-1] != '/') { brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+2, sizeof(cmph_uint8)); - sprintf((char *)(brz->tmp_dir), "%s/", (char *)tmp_dir); + sprintf((char *)(brz->tmp_dir), "%s/", (char *)tmp_dir); } else { brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+1, sizeof(cmph_uint8)); - sprintf((char *)(brz->tmp_dir), "%s", (char *)tmp_dir); + sprintf((char *)(brz->tmp_dir), "%s", (char *)tmp_dir); } - + } } @@ -105,14 +106,14 @@ void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b) { brz_config_data_t *brz = (brz_config_data_t *)mph->data; - if(b <= 64 || b >= 175) + if(b <= 64 || b >= 175) { b = 128; } brz->b = (cmph_uint8)b; } -void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) +void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) { if (algo == CMPH_BMZ8 || algo == CMPH_FCH) // supported algorithms { @@ -147,13 +148,13 @@ cmph_t *brz_new(cmph_config_t *mph, double c) brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b)); DEBUGP("k: %u\n", brz->k); brz->size = (cmph_uint8 *) calloc((size_t)brz->k, sizeof(cmph_uint8)); - + // Clustering the keys by graph id. 
if (mph->verbosity) { - fprintf(stderr, "Partioning the set of keys.\n"); + fprintf(stderr, "Partioning the set of keys.\n"); } - + while(1) { int ok; @@ -172,17 +173,17 @@ cmph_t *brz_new(cmph_config_t *mph, double c) fprintf(stderr, "Failure: A graph with more than 255 keys was created - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } - else break; + } + else break; } - if (iterations == 0) + if (iterations == 0) { DEBUGP("Graphs with more than 255 keys were created in all 20 iterations\n"); free(brz->size); return NULL; } DEBUGP("Graphs generated\n"); - + brz->offset = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32)); for (i = 1; i < brz->k; ++i) { @@ -209,7 +210,7 @@ cmph_t *brz_new(cmph_config_t *mph, double c) brzf->m = brz->m; brzf->algo = brz->algo; mphf->data = brzf; - mphf->size = brz->m; + mphf->size = brz->m; DEBUGP("Successfully generated minimal perfect hash\n"); if (mph->verbosity) { @@ -240,7 +241,7 @@ static int brz_gen_mphf(cmph_config_t *mph) cmph_uint32 cur_bucket = 0; cmph_uint8 nkeys_vd = 0; cmph_uint8 ** keys_vd = NULL; - + mph->key_source->rewind(mph->key_source->data); DEBUGP("Generating graphs from %u keys\n", brz->m); // Partitioning @@ -249,7 +250,7 @@ static int brz_gen_mphf(cmph_config_t *mph) mph->key_source->read(mph->key_source->data, &key, &keylen); /* Buffers management */ - if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers + if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers { if(mph->verbosity) { @@ -265,8 +266,8 @@ static int brz_gen_mphf(cmph_config_t *mph) sum += value; value = buckets_size[i]; buckets_size[i] = sum; - - } + + } memory_usage = 0; keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32)); for(i = 0; i < nkeys_in_buffer; i++) @@ -298,8 +299,8 @@ static int brz_gen_mphf(cmph_config_t *mph) memcpy(buffer + memory_usage + sizeof(keylen), key, (size_t)keylen); memory_usage += keylen 
+ (cmph_uint32)sizeof(keylen); h0 = hash(brz->h0, key, keylen) % brz->k; - - if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))) + + if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))) { free(buffer); free(buckets_size); @@ -310,8 +311,8 @@ static int brz_gen_mphf(cmph_config_t *mph) nkeys_in_buffer++; mph->key_source->dispose(mph->key_source->data, key, keylen); } - if (memory_usage != 0) // flush buffers - { + if (memory_usage != 0) // flush buffers + { if(mph->verbosity) { fprintf(stderr, "Flushing %u\n", nkeys_in_buffer); @@ -370,12 +371,12 @@ static int brz_gen_mphf(cmph_config_t *mph) nbytes = fwrite(&(brz->algo), sizeof(brz->algo), (size_t)1, brz->mphf_fd); nbytes = fwrite(&(brz->k), sizeof(cmph_uint32), (size_t)1, brz->mphf_fd); // number of MPHFs nbytes = fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, brz->mphf_fd); - + //tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *)); buff_manager = buffer_manager_new(brz->memory_availability, nflushes); buffer_merge = (cmph_uint8 **)calloc((size_t)nflushes, sizeof(cmph_uint8 *)); buffer_h0 = (cmph_uint32 *)calloc((size_t)nflushes, sizeof(cmph_uint32)); - + memory_usage = 0; for(i = 0; i < nflushes; i++) { @@ -388,7 +389,7 @@ static int brz_gen_mphf(cmph_config_t *mph) h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k; buffer_h0[i] = h0; buffer_merge[i] = (cmph_uint8 *)key; - key = NULL; //transfer memory ownership + key = NULL; //transfer memory ownership } e = 0; keys_vd = (cmph_uint8 **)calloc((size_t)MAX_BUCKET_SIZE, sizeof(cmph_uint8 *)); @@ -429,7 +430,7 @@ static int brz_gen_mphf(cmph_config_t *mph) e++; buffer_h0[i] = UINT_MAX; } - + if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket. 
{ cmph_io_adapter_t *source = NULL; @@ -444,7 +445,7 @@ static int brz_gen_mphf(cmph_config_t *mph) //cmph_config_set_algo(config, CMPH_BMZ8); cmph_config_set_graphsize(config, brz->c); mphf_tmp = cmph_new(config); - if (mphf_tmp == NULL) + if (mphf_tmp == NULL) { if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k); error = 1; @@ -453,9 +454,9 @@ static int brz_gen_mphf(cmph_config_t *mph) cmph_io_byte_vector_adapter_destroy(source); break; } - if(mph->verbosity) + if(mph->verbosity) { - if (cur_bucket % 1000 == 0) + if (cur_bucket % 1000 == 0) { fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k); } @@ -465,7 +466,7 @@ static int brz_gen_mphf(cmph_config_t *mph) case CMPH_FCH: { fch_data_t * fchf = NULL; - fchf = (fch_data_t *)mphf_tmp->data; + fchf = (fch_data_t *)mphf_tmp->data; bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf); } break; @@ -516,7 +517,7 @@ static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fch { cmph_uint32 i = 0; cmph_uint32 buflenh1 = 0; - cmph_uint32 buflenh2 = 0; + cmph_uint32 buflenh2 = 0; char * bufh1 = NULL; char * bufh2 = NULL; char * buf = NULL; @@ -528,7 +529,7 @@ static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fch memcpy(buf, &buflenh1, sizeof(cmph_uint32)); memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1); memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32)); - memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2); for (i = 0; i < n; i++) memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2+i,(fchf->g + i), (size_t)1); free(bufh1); free(bufh2); @@ -537,7 +538,7 @@ static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fch static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, 
cmph_uint32 *buflen) { cmph_uint32 buflenh1 = 0; - cmph_uint32 buflenh2 = 0; + cmph_uint32 buflenh2 = 0; char * bufh1 = NULL; char * bufh2 = NULL; char * buf = NULL; @@ -572,7 +573,7 @@ int brz_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); free(buf); // Dumping m and the vector offset. - nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(data->offset, sizeof(cmph_uint32)*(data->k), (size_t)1, fd); return 1; } @@ -591,7 +592,7 @@ void brz_load(FILE *f, cmph_t *mphf) nbytes = fread(&(brz->algo), sizeof(brz->algo), (size_t)1, f); // Reading algo. nbytes = fread(&(brz->k), sizeof(cmph_uint32), (size_t)1, f); brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k); - nbytes = fread(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, f); + nbytes = fread(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, f); brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); brz->g = (cmph_uint8 **) calloc((size_t)brz->k, sizeof(cmph_uint8 *)); @@ -635,7 +636,7 @@ void brz_load(FILE *f, cmph_t *mphf) brz->h0 = hash_state_load(buf, buflen); free(buf); - //loading c, m, and the vector offset. + //loading c, m, and the vector offset. 
nbytes = fread(&(brz->m), sizeof(cmph_uint32), (size_t)1, f); brz->offset = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*brz->k); nbytes = fread(brz->offset, sizeof(cmph_uint32)*(brz->k), (size_t)1, f); @@ -654,9 +655,9 @@ static cmph_uint32 brz_bmz8_search(brz_data_t *brz, const char *key, cmph_uint32 register cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % n; register cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % n; register cmph_uint8 mphf_bucket; - + if (h1 == h2 && ++h2 >= n) h2 = 0; - mphf_bucket = (cmph_uint8)(brz->g[h0][h1] + brz->g[h0][h2]); + mphf_bucket = (cmph_uint8)(brz->g[h0][h1] + brz->g[h0][h2]); DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, brz->g[h0][h1], brz->g[h0][h2], brz->offset[h0], brz->m); DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h0]); @@ -722,61 +723,61 @@ void brz_destroy(cmph_t *mphf) /** \fn void brz_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. 
The size of packed_mphf must be at least cmph_packed_size() */ void brz_pack(cmph_t *mphf, void *packed_mphf) { brz_data_t *data = (brz_data_t *)mphf->data; cmph_uint8 * ptr = packed_mphf; cmph_uint32 i,n; - + // packing internal algo type memcpy(ptr, &(data->algo), sizeof(data->algo)); ptr += sizeof(data->algo); // packing h0 type - CMPH_HASH h0_type = hash_get_type(data->h0); + CMPH_HASH h0_type = hash_get_type(data->h0); memcpy(ptr, &h0_type, sizeof(h0_type)); ptr += sizeof(h0_type); // packing h0 hash_state_pack(data->h0, ptr); ptr += hash_state_packed_size(h0_type); - + // packing k memcpy(ptr, &(data->k), sizeof(data->k)); ptr += sizeof(data->k); // packing c - *((cmph_uint64 *)ptr) = (cmph_uint64)data->c; + *((cmph_uint64 *)ptr) = (cmph_uint64)data->c; ptr += sizeof(data->c); // packing h1 type - CMPH_HASH h1_type = hash_get_type(data->h1[0]); + CMPH_HASH h1_type = hash_get_type(data->h1[0]); memcpy(ptr, &h1_type, sizeof(h1_type)); ptr += sizeof(h1_type); // packing h2 type - CMPH_HASH h2_type = hash_get_type(data->h2[0]); + CMPH_HASH h2_type = hash_get_type(data->h2[0]); memcpy(ptr, &h2_type, sizeof(h2_type)); ptr += sizeof(h2_type); // packing size - memcpy(ptr, data->size, sizeof(cmph_uint8)*data->k); + memcpy(ptr, data->size, sizeof(cmph_uint8)*data->k); ptr += data->k; - + // packing offset - memcpy(ptr, data->offset, sizeof(cmph_uint32)*data->k); + memcpy(ptr, data->offset, sizeof(cmph_uint32)*data->k); ptr += sizeof(cmph_uint32)*data->k; - + #if defined (__ia64) || defined (__x86_64__) cmph_uint64 * g_is_ptr = (cmph_uint64 *)ptr; #else cmph_uint32 * g_is_ptr = (cmph_uint32 *)ptr; #endif - + cmph_uint8 * g_i = (cmph_uint8 *) (g_is_ptr + data->k); - + for(i = 0; i < data->k; i++) { #if defined (__ia64) || defined (__x86_64__) @@ -787,7 +788,7 @@ void brz_pack(cmph_t *mphf, void *packed_mphf) // packing h1[i] hash_state_pack(data->h1[i], g_i); g_i += hash_state_packed_size(h1_type); - + // packing h2[i] hash_state_pack(data->h2[i], g_i); g_i += 
hash_state_packed_size(h2_type); @@ -803,9 +804,9 @@ void brz_pack(cmph_t *mphf, void *packed_mphf) break; default: assert(0); } - memcpy(g_i, data->g[i], sizeof(cmph_uint8)*n); + memcpy(g_i, data->g[i], sizeof(cmph_uint8)*n); g_i += n; - + } } @@ -814,16 +815,16 @@ void brz_pack(cmph_t *mphf, void *packed_mphf) * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 brz_packed_size(cmph_t *mphf) { cmph_uint32 i; cmph_uint32 size = 0; brz_data_t *data = (brz_data_t *)mphf->data; - CMPH_HASH h0_type = hash_get_type(data->h0); - CMPH_HASH h1_type = hash_get_type(data->h1[0]); + CMPH_HASH h0_type = hash_get_type(data->h0); + CMPH_HASH h1_type = hash_get_type(data->h1[0]); CMPH_HASH h2_type = hash_get_type(data->h2[0]); - size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) + + size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) + sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k); // pointers to g_is #if defined (__ia64) || defined (__x86_64__) @@ -831,10 +832,10 @@ cmph_uint32 brz_packed_size(cmph_t *mphf) #else size += (cmph_uint32) sizeof(cmph_uint32)*data->k; #endif - + size += hash_state_packed_size(h1_type) * data->k; size += hash_state_packed_size(h2_type) * data->k; - + cmph_uint32 n = 0; for(i = 0; i < data->k; i++) { @@ -848,7 +849,7 @@ cmph_uint32 brz_packed_size(cmph_t *mphf) break; default: assert(0); } - size += n; + size += n; } return size; } @@ -859,28 +860,28 @@ static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char * { register CMPH_HASH h0_type = *packed_mphf++; register cmph_uint32 *h0_ptr = packed_mphf; - packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); - + packed_mphf = (cmph_uint32 *)(((cmph_uint8 
*)packed_mphf) + hash_state_packed_size(h0_type)); + register cmph_uint32 k = *packed_mphf++; register double c = (double)(*((cmph_uint64*)packed_mphf)); packed_mphf += 2; - register CMPH_HASH h1_type = *packed_mphf++; - - register CMPH_HASH h2_type = *packed_mphf++; + register CMPH_HASH h1_type = *packed_mphf++; + + register CMPH_HASH h2_type = *packed_mphf++; register cmph_uint8 * size = (cmph_uint8 *) packed_mphf; - packed_mphf = (cmph_uint32 *)(size + k); - + packed_mphf = (cmph_uint32 *)(size + k); + register cmph_uint32 * offset = packed_mphf; packed_mphf += k; register cmph_uint32 h0; - + hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint); h0 = fingerprint[2] % k; - + register cmph_uint32 m = size[h0]; register cmph_uint32 n = (cmph_uint32)ceil(c * m); @@ -889,69 +890,69 @@ static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char * #else register cmph_uint32 * g_is_ptr = packed_mphf; #endif - + register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0]; - + register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type); - + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; register cmph_uint8 mphf_bucket; - + if (h1 == h2 && ++h2 >= n) h2 = 0; - mphf_bucket = (cmph_uint8)(g[h1] + g[h2]); + mphf_bucket = (cmph_uint8)(g[h1] + g[h2]); DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); DEBUGP("Address: %u\n", mphf_bucket + offset[h0]); - return (mphf_bucket + offset[h0]); + return (mphf_bucket + offset[h0]); } static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) { register CMPH_HASH h0_type = *packed_mphf++; - + register cmph_uint32 *h0_ptr = packed_mphf; - packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); - + packed_mphf = 
(cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); + register cmph_uint32 k = *packed_mphf++; register double c = (double)(*((cmph_uint64*)packed_mphf)); packed_mphf += 2; - register CMPH_HASH h1_type = *packed_mphf++; + register CMPH_HASH h1_type = *packed_mphf++; - register CMPH_HASH h2_type = *packed_mphf++; + register CMPH_HASH h2_type = *packed_mphf++; register cmph_uint8 * size = (cmph_uint8 *) packed_mphf; - packed_mphf = (cmph_uint32 *)(size + k); - + packed_mphf = (cmph_uint32 *)(size + k); + register cmph_uint32 * offset = packed_mphf; packed_mphf += k; - + register cmph_uint32 h0; - + hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint); h0 = fingerprint[2] % k; - + register cmph_uint32 m = size[h0]; register cmph_uint32 b = fch_calc_b(c, m); register double p1 = fch_calc_p1(m); register double p2 = fch_calc_p2(b); - + #if defined (__ia64) || defined (__x86_64__) register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf; #else register cmph_uint32 * g_is_ptr = packed_mphf; #endif - + register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0]; - + register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type); - + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m; register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m; @@ -962,7 +963,7 @@ static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *k } /** cmph_uint32 brz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -970,7 +971,7 @@ static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *k */ cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) { - register cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf; + register cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf; register CMPH_ALGO algo = *ptr++; cmph_uint32 fingerprint[3]; switch(algo) @@ -982,4 +983,3 @@ cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke default: assert(0); } } - diff --git a/src/buffer_entry.c b/src/buffer_entry.c index 5dcc4d5..65ebfda 100644 --- a/src/buffer_entry.c +++ b/src/buffer_entry.c @@ -17,7 +17,7 @@ struct __buffer_entry_t buffer_entry_t * buffer_entry_new(cmph_uint32 capacity) { buffer_entry_t *buff_entry = (buffer_entry_t *)malloc(sizeof(buffer_entry_t)); - assert(buff_entry); + if (!buff_entry) return NULL; buff_entry->fd = NULL; buff_entry->buff = NULL; buff_entry->capacity = capacity; @@ -62,7 +62,7 @@ cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * free(buf); return NULL; } - if((buffer_entry->pos + lacked_bytes) > buffer_entry->nbytes) + if((buffer_entry->pos + lacked_bytes) > buffer_entry->nbytes) { copied_bytes = buffer_entry->nbytes - buffer_entry->pos; lacked_bytes = (buffer_entry->pos + lacked_bytes) - buffer_entry->nbytes; @@ -71,7 +71,7 @@ cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * } memcpy(keylen + copied_bytes, buffer_entry->buff + buffer_entry->pos, (size_t)lacked_bytes); buffer_entry->pos += lacked_bytes; - + lacked_bytes = *keylen; copied_bytes = 0; buf = (cmph_uint8 *)malloc(*keylen + sizeof(*keylen)); @@ -83,7 +83,7 @@ cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * memcpy(buf + sizeof(*keylen), buffer_entry->buff + buffer_entry->pos, (size_t)copied_bytes); } 
buffer_entry_load(buffer_entry); - } + } memcpy(buf+sizeof(*keylen)+copied_bytes, buffer_entry->buff + buffer_entry->pos, (size_t)lacked_bytes); buffer_entry->pos += lacked_bytes; return buf; @@ -97,7 +97,7 @@ void buffer_entry_destroy(buffer_entry_t * buffer_entry) buffer_entry->buff = NULL; buffer_entry->capacity = 0; buffer_entry->nbytes = 0; - buffer_entry->pos = 0; + buffer_entry->pos = 0; buffer_entry->eof = 0; free(buffer_entry); } diff --git a/src/buffer_manage.c b/src/buffer_manage.c index fdefc62..93ec327 100644 --- a/src/buffer_manage.c +++ b/src/buffer_manage.c @@ -16,7 +16,7 @@ buffer_manage_t * buffer_manage_new(cmph_uint32 memory_avail, cmph_uint32 nentri { cmph_uint32 memory_avail_entry, i; buffer_manage_t *buff_manage = (buffer_manage_t *)malloc(sizeof(buffer_manage_t)); - assert(buff_manage); + if (!buff_manage) return NULL; buff_manage->memory_avail = memory_avail; buff_manage->buffer_entries = (buffer_entry_t **)calloc((size_t)nentries, sizeof(buffer_entry_t *)); buff_manage->memory_avail_list = (cmph_uint32 *)calloc((size_t)nentries, sizeof(cmph_uint32)); @@ -26,7 +26,7 @@ buffer_manage_t * buffer_manage_new(cmph_uint32 memory_avail, cmph_uint32 nentri for(i = 0; i < buff_manage->nentries; i++) { buff_manage->buffer_entries[i] = buffer_entry_new(memory_avail_entry); - } + } return buff_manage; } @@ -54,7 +54,7 @@ cmph_uint8 * buffer_manage_read_key(buffer_manage_t * buffer_manage, cmph_uint32 } void buffer_manage_destroy(buffer_manage_t * buffer_manage) -{ +{ cmph_uint32 i; for(i = 0; i < buffer_manage->nentries; i++) { diff --git a/src/buffer_manager.c b/src/buffer_manager.c index 5a051e2..243d4d9 100644 --- a/src/buffer_manager.c +++ b/src/buffer_manager.c @@ -16,7 +16,7 @@ buffer_manager_t * buffer_manager_new(cmph_uint32 memory_avail, cmph_uint32 nent { cmph_uint32 memory_avail_entry, i; buffer_manager_t *buff_manager = (buffer_manager_t *)malloc(sizeof(buffer_manager_t)); - assert(buff_manager); + if (!buff_manager) return NULL; 
buff_manager->memory_avail = memory_avail; buff_manager->buffer_entries = (buffer_entry_t **)calloc((size_t)nentries, sizeof(buffer_entry_t *)); buff_manager->memory_avail_list = (cmph_uint32 *)calloc((size_t)nentries, sizeof(cmph_uint32)); @@ -26,7 +26,7 @@ buffer_manager_t * buffer_manager_new(cmph_uint32 memory_avail, cmph_uint32 nent for(i = 0; i < buff_manager->nentries; i++) { buff_manager->buffer_entries[i] = buffer_entry_new(memory_avail_entry); - } + } return buff_manager; } @@ -52,7 +52,7 @@ cmph_uint8 * buffer_manager_read_key(buffer_manager_t * buffer_manager, cmph_uin } void buffer_manager_destroy(buffer_manager_t * buffer_manager) -{ +{ cmph_uint32 i; for(i = 0; i < buffer_manager->nentries; i++) { diff --git a/src/chd.c b/src/chd.c index 3eec2b3..6aafdbc 100644 --- a/src/chd.c +++ b/src/chd.c @@ -18,7 +18,7 @@ chd_config_data_t *chd_config_new(cmph_config_t *mph) cmph_io_adapter_t *key_source = mph->key_source; chd_config_data_t *chd; chd = (chd_config_data_t *)malloc(sizeof(chd_config_data_t)); - assert(chd); + if (!chd) return NULL; memset(chd, 0, sizeof(chd_config_data_t)); chd->chd_ph = cmph_config_new(key_source); @@ -69,12 +69,12 @@ cmph_t *chd_new(cmph_config_t *mph, double c) chd_config_data_t *chd = (chd_config_data_t *)mph->data; chd_ph_config_data_t * chd_ph = (chd_ph_config_data_t *)chd->chd_ph->data; compressed_rank_t cr; - + register cmph_t * chd_phf = NULL; - register cmph_uint32 packed_chd_phf_size = 0; + register cmph_uint32 packed_chd_phf_size = 0; cmph_uint8 * packed_chd_phf = NULL; - - register cmph_uint32 packed_cr_size = 0; + + register cmph_uint32 packed_cr_size = 0; cmph_uint8 * packed_cr = NULL; register cmph_uint32 i, idx, nkeys, nvals, nbins; @@ -86,24 +86,24 @@ cmph_t *chd_new(cmph_config_t *mph, double c) ELAPSED_TIME_IN_SECONDS(&construction_time_begin); #endif - cmph_config_set_verbosity(chd->chd_ph, mph->verbosity); + cmph_config_set_verbosity(chd->chd_ph, mph->verbosity); cmph_config_set_graphsize(chd->chd_ph, c); - + 
if (mph->verbosity) { fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c); } - + chd_phf = cmph_new(chd->chd_ph); - - if(chd_phf == NULL) + + if(chd_phf == NULL) { return NULL; } - - packed_chd_phf_size = cmph_packed_size(chd_phf); + + packed_chd_phf_size = cmph_packed_size(chd_phf); DEBUGP("packed_chd_phf_size = %u\n", packed_chd_phf_size); - + /* Make sure that we have enough space to pack the mphf. */ packed_chd_phf = calloc((size_t)packed_chd_phf_size,(size_t)1); @@ -111,8 +111,8 @@ cmph_t *chd_new(cmph_config_t *mph, double c) cmph_pack(chd_phf, packed_chd_phf); cmph_destroy(chd_phf); - - + + if (mph->verbosity) { fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n"); @@ -121,11 +121,11 @@ cmph_t *chd_new(cmph_config_t *mph, double c) compressed_rank_init(&cr); nbins = chd_ph->n; nkeys = chd_ph->m; - nvals = nbins - nkeys; - + nvals = nbins - nkeys; + vals_table = (cmph_uint32 *)calloc(nvals, sizeof(cmph_uint32)); occup_table = (cmph_uint32 *)chd_ph->occup_table; - + for(i = 0, idx = 0; i < nbins; i++) { if(!GETBIT32(occup_table, i)) @@ -133,10 +133,10 @@ cmph_t *chd_new(cmph_config_t *mph, double c) vals_table[idx++] = i; } } - + compressed_rank_generate(&cr, vals_table, nvals); free(vals_table); - + packed_cr_size = compressed_rank_packed_size(&cr); packed_cr = (cmph_uint8 *) calloc(packed_cr_size, sizeof(cmph_uint8)); compressed_rank_pack(&cr, packed_cr); @@ -145,16 +145,16 @@ cmph_t *chd_new(cmph_config_t *mph, double c) mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = mph->algo; chdf = (chd_data_t *)malloc(sizeof(chd_data_t)); - + chdf->packed_cr = packed_cr; packed_cr = NULL; //transfer memory ownership chdf->packed_chd_phf = packed_chd_phf; packed_chd_phf = NULL; //transfer memory ownership - + chdf->packed_chd_phf_size = packed_chd_phf_size; chdf->packed_cr_size = packed_cr_size; - + mphf->data = chdf; mphf->size = nkeys; @@ -163,12 +163,12 @@ cmph_t 
*chd_new(cmph_config_t *mph, double c) { fprintf(stderr, "Successfully generated minimal perfect hash function\n"); } - #ifdef CMPH_TIMING + #ifdef CMPH_TIMING ELAPSED_TIME_IN_SECONDS(&construction_time); register cmph_uint32 space_usage = chd_packed_size(mphf)*8; construction_time = construction_time - construction_time_begin; fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", nkeys, c, chd_ph->keys_per_bucket, construction_time, space_usage/(double)nkeys); - #endif + #endif return mphf; } @@ -196,7 +196,7 @@ int chd_dump(cmph_t *mphf, FILE *fd) { register size_t nbytes; chd_data_t *data = (chd_data_t *)mphf->data; - + __cmph_dump(mphf, fd); // Dumping CHD_PH perfect hash function @@ -207,7 +207,7 @@ int chd_dump(cmph_t *mphf, FILE *fd) DEBUGP("Dumping compressed rank structure with %u bytes to disk\n", 1); nbytes = fwrite(&data->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(data->packed_cr, data->packed_cr_size, (size_t)1, fd); - + return 1; } @@ -242,10 +242,10 @@ void chd_pack(cmph_t *mphf, void *packed_mphf) // packing packed_cr_size and packed_cr *ptr = data->packed_cr_size; ptr8 = (cmph_uint8 *) (ptr + 1); - + memcpy(ptr8, data->packed_cr, data->packed_cr_size); ptr8 += data->packed_cr_size; - + ptr = (cmph_uint32 *) ptr8; *ptr = data->packed_chd_phf_size; @@ -268,5 +268,3 @@ cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 * packed_chd_phf = ((cmph_uint8 *) ptr) + packed_cr_size + sizeof(cmph_uint32); return _chd_search(packed_chd_phf, ptr, key, keylen); } - - diff --git a/src/chd_ph.c b/src/chd_ph.c index 71f83fb..d225156 100644 --- a/src/chd_ph.c +++ b/src/chd_ph.c @@ -29,7 +29,7 @@ struct _chd_ph_item_t }; typedef struct _chd_ph_item_t chd_ph_item_t; -// struct to represent the items at mapping phase only. +// struct to represent the items at mapping phase only. 
struct _chd_ph_map_item_t { cmph_uint32 f; @@ -85,7 +85,7 @@ static cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item register chd_ph_map_item_t * tmp_map_item = map_items + item_idx; register chd_ph_bucket_t * bucket = buckets + tmp_map_item->bucket_num; tmp_item = items + bucket->items_list; - + for(i = 0; i < bucket->size; i++) { if(tmp_item->f == tmp_map_item->f && tmp_item->h == tmp_map_item->h) @@ -105,7 +105,7 @@ void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets) free(buckets); } -static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, +static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size); static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items, @@ -131,7 +131,7 @@ static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, { probe_counts[disp_table[i]]++; }; - + for(i = 0; i < max_probes; i++) { if(probe_counts[i] > 0) @@ -145,9 +145,9 @@ chd_ph_config_data_t *chd_ph_config_new(void) { chd_ph_config_data_t *chd_ph; chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t)); - assert(chd_ph); + if (!chd_ph) return NULL; memset(chd_ph, 0, sizeof(chd_ph_config_data_t)); - + chd_ph->hashfunc = CMPH_HASH_JENKINS; chd_ph->cs = NULL; chd_ph->nbuckets = 0; @@ -159,7 +159,7 @@ chd_ph_config_data_t *chd_ph_config_new(void) chd_ph->keys_per_bin = 1; chd_ph->keys_per_bucket = 4; chd_ph->occup_table = 0; - + return chd_ph; } @@ -184,7 +184,7 @@ void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 1) break; //chd_ph only uses one linear hash function - chd_ph->hashfunc = *hashptr; + chd_ph->hashfunc = *hashptr; ++i, ++hashptr; } } @@ -228,24 +228,24 @@ cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_ { mapping_iterations--; if 
(chd_ph->hl) hash_state_destroy(chd_ph->hl); - chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m); + chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m); chd_ph_bucket_clean(buckets, chd_ph->nbuckets); - mph->key_source->rewind(mph->key_source->data); + mph->key_source->rewind(mph->key_source->data); for(i = 0; i < chd_ph->m; i++) { - mph->key_source->read(mph->key_source->data, &key, &keylen); + mph->key_source->read(mph->key_source->data, &key, &keylen); hash_vector(chd_ph->hl, key, keylen, hl); - + map_item = (map_items + i); g = hl[0] % chd_ph->nbuckets; map_item->f = hl[1] % chd_ph->n; map_item->h = hl[2] % (chd_ph->n - 1) + 1; map_item->bucket_num=g; - mph->key_source->dispose(mph->key_source->data, key, keylen); + mph->key_source->dispose(mph->key_source->data, key, keylen); // if(buckets[g].size == (chd_ph->keys_per_bucket << 2)) // { // DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2)); @@ -275,7 +275,7 @@ cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_ free(map_items); return 1; // SUCCESS } - + if(mapping_iterations == 0) { goto error; @@ -292,7 +292,7 @@ chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_ cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size) { chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc(max_bucket_size + 1, sizeof(chd_ph_sorted_list_t)); - + chd_ph_bucket_t * input_buckets = (*_buckets); chd_ph_bucket_t * output_buckets; chd_ph_item_t * input_items = (*_items); @@ -319,7 +319,7 @@ chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_ // Store the buckets in a new array which is sorted by bucket sizes output_buckets = calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero // non_empty_buckets = nbuckets; - + for(i = 0; i < nbuckets; i++) { bucket_size = input_buckets[i].size; @@ -338,8 +338,8 @@ 
chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_ // Return the buckets sorted in new order and free the old buckets sorted in old order free(input_buckets); (*_buckets) = output_buckets; - - + + // Store the items according to the new order of buckets. output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t)); position = 0; @@ -426,26 +426,26 @@ static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph } position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n); UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position); - + // ([position/32]^=(1<<(position%32)); item++; i--; }; }; return 0; - } + } return 1; }; -static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes, +static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes, cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size) - + { register cmph_uint32 probe0_num, probe1_num, probe_num; probe0_num = 0; probe1_num = 0; probe_num = 0; - + while(1) { if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, bucket_num,size)) @@ -469,7 +469,7 @@ static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucke }; static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items, - cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table) { register cmph_uint32 i = 0; @@ -490,8 +490,8 @@ static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_buc return 1; }; -static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, - cmph_uint32 
max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, +static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table) { register cmph_uint32 i,j, non_placed_bucket; @@ -516,10 +516,10 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc { // if bucket is successfully placed remove it from list if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i)) - { + { disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n; // DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]); - } + } else { // DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket); @@ -529,7 +529,7 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc #endif buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list; buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id; -#ifdef DEBUG +#ifdef DEBUG buckets[curr_bucket].items_list=items_list; buckets[curr_bucket].bucket_id=bucket_id; #endif @@ -557,7 +557,7 @@ static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_buc }; cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items , - cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table) { if(chd_ph->use_h) @@ -582,7 +582,7 @@ static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, memset(chd_ph->occup_table, 0, chd_ph->n); else memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32)); - + for(bucket_size = 1; bucket_size <= 
max_bucket_size; bucket_size++) for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++) @@ -602,7 +602,7 @@ static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, return 0; } (chd_ph->occup_table[position])++; - } + } else { if(GETBIT32(((cmph_uint32*)chd_ph->occup_table), position)) @@ -624,7 +624,7 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) cmph_t *mphf = NULL; chd_ph_data_t *chd_phf = NULL; chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; - + register double load_factor = c; register cmph_uint8 searching_success = 0; register cmph_uint32 max_probes = 1 << 20; // default value for max_probes @@ -645,24 +645,24 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) chd_ph->m = mph->key_source->nkeys; DEBUGP("m = %u\n", chd_ph->m); - + chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1; DEBUGP("nbuckets = %u\n", chd_ph->nbuckets); - + if(load_factor < 0.5 ) { load_factor = 0.5; } - + if(load_factor >= 0.99) { load_factor = 0.99; } - + DEBUGP("load_factor = %.3f\n", load_factor); - + chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1; - + //Round the number of bins to the prime immediately above if(chd_ph->n % 2 == 0) chd_ph->n++; for(;;) @@ -670,35 +670,35 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) if(check_primality(chd_ph->n) == 1) break; chd_ph->n += 2; // just odd numbers can be primes for n > 2 - + }; - + DEBUGP("n = %u \n", chd_ph->n); if(chd_ph->keys_per_bin == 1) { space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n); } - + if(mph->verbosity) { fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound); } // We allocate the working tables - buckets = chd_ph_bucket_new(chd_ph->nbuckets); + buckets = chd_ph_bucket_new(chd_ph->nbuckets); items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t)); max_probes = 
(cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes); - + if(chd_ph->keys_per_bin == 1) chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32)); else chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8)); - + disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32)); -// +// // init_genrand(time(0)); - + while(1) { iterations --; @@ -706,12 +706,12 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) { fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n); } - + if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size)) { if (mph->verbosity) { - fprintf(stderr, "Failure in mapping step\n"); + fprintf(stderr, "Failure in mapping step\n"); } failure = 1; goto cleanup; @@ -727,15 +727,15 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) } sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size); - + if (mph->verbosity) { fprintf(stderr, "Starting searching step\n"); } - + searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table); if(searching_success) break; - + // reset occup_table if(chd_ph->keys_per_bin > 1) memset(chd_ph->occup_table, 0, chd_ph->n); @@ -757,19 +757,19 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) { if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size)) { - + DEBUGP("Error for bin packing generation"); failure = 1; goto cleanup; } } #endif - + if (mph->verbosity) { fprintf(stderr, "Starting compressing step\n"); } - + if(chd_ph->cs) { free(chd_ph->cs); @@ -777,7 +777,7 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); compressed_seq_init(chd_ph->cs); compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets); - + #ifdef CMPH_TIMING ELAPSED_TIME_IN_SECONDS(&construction_time); register double entropy = 
chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes); @@ -785,11 +785,11 @@ cmph_t *chd_ph_new(cmph_config_t *mph, double c) #endif cleanup: - chd_ph_bucket_destroy(buckets); + chd_ph_bucket_destroy(buckets); free(items); free(sorted_lists); free(disp_table); - if(failure) + if(failure) { if(chd_ph->hl) { @@ -802,14 +802,14 @@ cleanup: mphf = (cmph_t *)malloc(sizeof(cmph_t)); mphf->algo = mph->algo; chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t)); - + chd_phf->cs = chd_ph->cs; chd_ph->cs = NULL; //transfer memory ownership chd_phf->hl = chd_ph->hl; chd_ph->hl = NULL; //transfer memory ownership chd_phf->n = chd_ph->n; chd_phf->nbuckets = chd_ph->nbuckets; - + mphf->data = chd_phf; mphf->size = chd_ph->n; @@ -818,12 +818,12 @@ cleanup: { fprintf(stderr, "Successfully generated minimal perfect hash function\n"); } - - #ifdef CMPH_TIMING + + #ifdef CMPH_TIMING register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8; construction_time = construction_time - construction_time_begin; fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m); - #endif + #endif return mphf; } @@ -846,19 +846,19 @@ void chd_ph_load(FILE *fd, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); chd_ph->hl = hash_state_load(buf, buflen); free(buf); - + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd); DEBUGP("Compressed sequence structure has %u bytes\n", buflen); buf = (char *)malloc((size_t)buflen); nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); - chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); + chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); compressed_seq_load(chd_ph->cs, buf, buflen); free(buf); - + // loading n and nbuckets DEBUGP("Reading n and nbuckets\n"); - nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd); - nbytes = 
fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); } int chd_ph_dump(cmph_t *mphf, FILE *fd) @@ -867,7 +867,7 @@ int chd_ph_dump(cmph_t *mphf, FILE *fd) cmph_uint32 buflen; register size_t nbytes; chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; - + __cmph_dump(mphf, fd); hash_state_dump(data->hl, &buf, &buflen); @@ -906,11 +906,11 @@ cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) register cmph_uint32 disp,position; register cmph_uint32 probe0_num,probe1_num; register cmph_uint32 f,g,h; - hash_vector(chd_ph->hl, key, keylen, hl); + hash_vector(chd_ph->hl, key, keylen, hl); g = hl[0] % chd_ph->nbuckets; f = hl[1] % chd_ph->n; h = hl[2] % (chd_ph->n-1) + 1; - + disp = compressed_seq_query(chd_ph->cs, g); probe0_num = disp % chd_ph->n; probe1_num = disp/chd_ph->n; @@ -949,10 +949,10 @@ void chd_ph_pack(cmph_t *mphf, void *packed_mphf) cmph_uint32 chd_ph_packed_size(cmph_t *mphf) { register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; - register CMPH_HASH hl_type = hash_get_type(data->hl); + register CMPH_HASH hl_type = hash_get_type(data->hl); register cmph_uint32 hash_state_pack_size = hash_state_packed_size(hl_type); register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs); - + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32)); } @@ -961,28 +961,25 @@ cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 { register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; - + register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type)); register cmph_uint32 n = *ptr++; register cmph_uint32 nbuckets = *ptr++; cmph_uint32 hl[3]; - + register cmph_uint32 disp,position; register cmph_uint32 
probe0_num,probe1_num; register cmph_uint32 f,g,h; - + hash_vector_packed(hl_ptr, hl_type, key, keylen, hl); g = hl[0] % nbuckets; f = hl[1] % n; h = hl[2] % (n-1) + 1; - + disp = compressed_seq_query_packed(ptr, g); probe0_num = disp % n; probe1_num = disp/n; position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n); return position; } - - - diff --git a/src/chm.c b/src/chm.c index 9cdbf41..5c416b1 100644 --- a/src/chm.c +++ b/src/chm.c @@ -21,7 +21,7 @@ chm_config_data_t *chm_config_new(void) { chm_config_data_t *chm = NULL; chm = (chm_config_data_t *)malloc(sizeof(chm_config_data_t)); - assert(chm); + if (!chm) return NULL; memset(chm, 0, sizeof(chm_config_data_t)); chm->hashfuncs[0] = CMPH_HASH_JENKINS; chm->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -45,7 +45,7 @@ void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 2) break; //chm only uses two hash functions - chm->hashfuncs[i] = *hashptr; + chm->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -61,7 +61,7 @@ cmph_t *chm_new(cmph_config_t *mph, double c) chm_config_data_t *chm = (chm_config_data_t *)mph->data; chm->m = mph->key_source->nkeys; if (c == 0) c = 2.09; - chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); + chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c); chm->graph = graph_new(chm->n, chm->m); DEBUGP("Created graph\n"); @@ -92,12 +92,12 @@ cmph_t *chm_new(cmph_config_t *mph, double c) fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } - else break; + } + else break; } if (iterations == 0) { - graph_destroy(chm->graph); + graph_destroy(chm->graph); return NULL; } @@ -120,7 +120,7 @@ cmph_t *chm_new(cmph_config_t *mph, double c) chm_traverse(chm, visited, i); } } - graph_destroy(chm->graph); + graph_destroy(chm->graph); free(visited); chm->graph = NULL; @@ -149,7 +149,7 
@@ static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint3 graph_iterator_t it = graph_neighbors_it(chm->graph, v); cmph_uint32 neighbor = 0; SETBIT(visited,v); - + DEBUGP("Visiting vertex %u\n", v); while((neighbor = graph_next_neighbor(chm->graph, &it)) != GRAPH_NO_NEIGHBOR) { @@ -162,7 +162,7 @@ static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint3 chm_traverse(chm, visited, neighbor); } } - + static int chm_gen_edges(cmph_config_t *mph) { cmph_uint32 e; @@ -170,7 +170,7 @@ static int chm_gen_edges(cmph_config_t *mph) int cycles = 0; DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", chm->n, cmph_hash_names[chm->hashfuncs[0]], cmph_hash_names[chm->hashfuncs[1]]); - graph_clear_edges(chm->graph); + graph_clear_edges(chm->graph); mph->key_source->rewind(mph->key_source->data); for (e = 0; e < mph->key_source->nkeys; ++e) { @@ -181,7 +181,7 @@ static int chm_gen_edges(cmph_config_t *mph) h1 = hash(chm->hashes[0], key, keylen) % chm->n; h2 = hash(chm->hashes[1], key, keylen) % chm->n; if (h1 == h2) if (++h2 >= chm->n) h2 = 0; - if (h1 == h2) + if (h1 == h2) { if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); mph->key_source->dispose(mph->key_source->data, key, keylen); @@ -205,7 +205,7 @@ int chm_dump(cmph_t *mphf, FILE *fd) cmph_uint32 two = 2; //number of hash functions chm_data_t *data = (chm_data_t *)mphf->data; register size_t nbytes; - + __cmph_dump(mphf, fd); nbytes = fwrite(&two, sizeof(cmph_uint32), (size_t)1, fd); @@ -223,7 +223,7 @@ int chm_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); - + nbytes = fwrite(data->g, sizeof(cmph_uint32)*data->n, (size_t)1, fd); /* #ifdef DEBUG fprintf(stderr, "G: "); @@ -260,8 +260,8 @@ void chm_load(FILE *f, cmph_t *mphf) } DEBUGP("Reading m and n\n"); - nbytes = fread(&(chm->n), sizeof(cmph_uint32), (size_t)1, f); - 
nbytes = fread(&(chm->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(chm->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(chm->m), sizeof(cmph_uint32), (size_t)1, f); chm->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*chm->n); nbytes = fread(chm->g, chm->n*sizeof(cmph_uint32), (size_t)1, f); @@ -272,7 +272,7 @@ void chm_load(FILE *f, cmph_t *mphf) #endif return; } - + cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { @@ -287,7 +287,7 @@ cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) void chm_destroy(cmph_t *mphf) { chm_data_t *data = (chm_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hashes[0]); hash_state_destroy(data->hashes[1]); free(data->hashes); @@ -298,7 +298,7 @@ void chm_destroy(cmph_t *mphf) /** \fn void chm_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() */ void chm_pack(cmph_t *mphf, void *packed_mphf) { @@ -332,26 +332,26 @@ void chm_pack(cmph_t *mphf, void *packed_mphf) ptr += sizeof(data->m); // packing g - memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); + memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); } /** \fn cmph_uint32 chm_packed_size(cmph_t *mphf); * \brief Return the amount of space needed to pack mphf. 
* \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 chm_packed_size(cmph_t *mphf) { chm_data_t *data = (chm_data_t *)mphf->data; - CMPH_HASH h1_type = hash_get_type(data->hashes[0]); - CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); - return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + 4*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n); } /** cmph_uint32 chm_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. * \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -366,16 +366,16 @@ cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); h2_ptr += 4; - + register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); - - register cmph_uint32 n = *g_ptr++; - register cmph_uint32 m = *g_ptr++; - - register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; - register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; + + register cmph_uint32 n = *g_ptr++; + register cmph_uint32 m = *g_ptr++; + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); if (h1 == h2 && ++h2 >= n) h2 = 0; DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m); - return (g_ptr[h1] + g_ptr[h2]) % m; + return (g_ptr[h1] + 
g_ptr[h2]) % m; } diff --git a/src/cmph.c b/src/cmph.c index ae76727..f460dd0 100644 --- a/src/cmph.c +++ b/src/cmph.c @@ -1,10 +1,10 @@ #include "cmph.h" #include "cmph_structs.h" #include "chm.h" -#include "bmz.h" -#include "bmz8.h" -#include "brz.h" -#include "fch.h" +#include "bmz.h" +#include "bmz8.h" +#include "brz.h" +#include "fch.h" #include "bdz.h" #include "bdz_ph.h" #include "chd_ph.h" @@ -268,12 +268,12 @@ cmph_io_adapter_t *cmph_io_struct_vector_adapter(void * vector, cmph_uint32 stru key_source->read = key_struct_vector_read; key_source->dispose = key_vector_dispose; key_source->rewind = key_struct_vector_rewind; - return key_source; + return key_source; } void cmph_io_struct_vector_adapter_destroy(cmph_io_adapter_t * key_source) { - cmph_io_struct_vector_destroy(key_source); + cmph_io_struct_vector_destroy(key_source); } cmph_io_adapter_t *cmph_io_vector_adapter(char ** vector, cmph_uint32 nkeys) @@ -374,7 +374,7 @@ void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) { - if (mph->algo == CMPH_BRZ) + if (mph->algo == CMPH_BRZ) { brz_config_set_tmp_dir(mph, tmp_dir); } @@ -383,7 +383,7 @@ void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) { - if (mph->algo == CMPH_BRZ) + if (mph->algo == CMPH_BRZ) { brz_config_set_mphf_fd(mph, mphf_fd); } @@ -391,19 +391,19 @@ void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b) { - if (mph->algo == CMPH_BRZ) + if (mph->algo == CMPH_BRZ) { brz_config_set_b(mph, b); } - else if (mph->algo == CMPH_BDZ) + else if (mph->algo == CMPH_BDZ) { bdz_config_set_b(mph, b); } - else if (mph->algo == CMPH_CHD_PH) + else if (mph->algo == CMPH_CHD_PH) { chd_ph_config_set_b(mph, b); } - else if (mph->algo == CMPH_CHD) + else if (mph->algo == CMPH_CHD) { chd_config_set_b(mph, b); } @@ -411,11 +411,11 @@ void 
cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b) void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) { - if (mph->algo == CMPH_CHD_PH) + if (mph->algo == CMPH_CHD_PH) { chd_ph_config_set_keys_per_bin(mph, keys_per_bin); } - else if (mph->algo == CMPH_CHD) + else if (mph->algo == CMPH_CHD) { chd_config_set_keys_per_bin(mph, keys_per_bin); } @@ -423,7 +423,7 @@ void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability) { - if (mph->algo == CMPH_BRZ) + if (mph->algo == CMPH_BRZ) { brz_config_set_memory_availability(mph, memory_availability); } @@ -523,7 +523,7 @@ cmph_t *cmph_new(cmph_config_t *mph) double c = mph->c; DEBUGP("Creating mph with algorithm %s\n", cmph_names[mph->algo]); - switch (mph->algo) + switch (mph->algo) { case CMPH_CHM: DEBUGP("Creating chm hash\n"); @@ -658,28 +658,28 @@ cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) case CMPH_CHM: return chm_search(mphf, key, keylen); case CMPH_BMZ: /* included -- Fabiano */ - DEBUGP("bmz algorithm search\n"); + DEBUGP("bmz algorithm search\n"); return bmz_search(mphf, key, keylen); case CMPH_BMZ8: /* included -- Fabiano */ - DEBUGP("bmz8 algorithm search\n"); + DEBUGP("bmz8 algorithm search\n"); return bmz8_search(mphf, key, keylen); case CMPH_BRZ: /* included -- Fabiano */ - DEBUGP("brz algorithm search\n"); + DEBUGP("brz algorithm search\n"); return brz_search(mphf, key, keylen); case CMPH_FCH: /* included -- Fabiano */ - DEBUGP("fch algorithm search\n"); + DEBUGP("fch algorithm search\n"); return fch_search(mphf, key, keylen); case CMPH_BDZ: /* included -- Fabiano */ - DEBUGP("bdz algorithm search\n"); + DEBUGP("bdz algorithm search\n"); return bdz_search(mphf, key, keylen); case CMPH_BDZ_PH: /* included -- Fabiano */ - DEBUGP("bdz_ph algorithm search\n"); + DEBUGP("bdz_ph algorithm search\n"); return bdz_ph_search(mphf, key, keylen); 
case CMPH_CHD_PH: /* included -- Fabiano */ - DEBUGP("chd_ph algorithm search\n"); + DEBUGP("chd_ph algorithm search\n"); return chd_ph_search(mphf, key, keylen); case CMPH_CHD: /* included -- Fabiano */ - DEBUGP("chd algorithm search\n"); + DEBUGP("chd algorithm search\n"); return chd_search(mphf, key, keylen); default: assert(0); @@ -692,7 +692,7 @@ cmph_uint32 cmph_size(cmph_t *mphf) { return mphf->size; } - + void cmph_destroy(cmph_t *mphf) { switch(mphf->algo) @@ -724,7 +724,7 @@ void cmph_destroy(cmph_t *mphf) case CMPH_CHD: /* included -- Fabiano */ chd_destroy(mphf); return; - default: + default: assert(0); } assert(0); @@ -735,12 +735,12 @@ void cmph_destroy(cmph_t *mphf) /** \fn void cmph_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() */ void cmph_pack(cmph_t *mphf, void *packed_mphf) { // packing algorithm type to be used in cmph.c - cmph_uint32 * ptr = (cmph_uint32 *) packed_mphf; + cmph_uint32 * ptr = (cmph_uint32 *) packed_mphf; *ptr++ = mphf->algo; DEBUGP("mphf->algo = %u\n", mphf->algo); switch(mphf->algo) @@ -772,7 +772,7 @@ void cmph_pack(cmph_t *mphf, void *packed_mphf) case CMPH_CHD: /* included -- Fabiano */ chd_pack(mphf, ptr); break; - default: + default: assert(0); } return; @@ -782,7 +782,7 @@ void cmph_pack(cmph_t *mphf, void *packed_mphf) * \brief Return the amount of space needed to pack mphf. 
* \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 cmph_packed_size(cmph_t *mphf) { switch(mphf->algo) @@ -805,14 +805,14 @@ cmph_uint32 cmph_packed_size(cmph_t *mphf) return chd_ph_packed_size(mphf); case CMPH_CHD: /* included -- Fabiano */ return chd_packed_size(mphf); - default: + default: assert(0); } return 0; // FAILURE } /** cmph_uint32 cmph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. * \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -842,7 +842,7 @@ cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 k return chd_ph_search_packed(++ptr, key, keylen); case CMPH_CHD: /* included -- Fabiano */ return chd_search_packed(++ptr, key, keylen); - default: + default: assert(0); } return 0; // FAILURE diff --git a/src/cmph_structs.c b/src/cmph_structs.c index bcd3da3..2c28bc3 100644 --- a/src/cmph_structs.c +++ b/src/cmph_structs.c @@ -28,7 +28,7 @@ void __cmph_dump(cmph_t *mphf, FILE *fd) nbytes = fwrite(cmph_names[mphf->algo], (size_t)(strlen(cmph_names[mphf->algo]) + 1), (size_t)1, fd); nbytes = fwrite(&(mphf->size), sizeof(mphf->size), (size_t)1, fd); } -cmph_t *__cmph_load(FILE *f) +cmph_t *__cmph_load(FILE *f) { cmph_t *mphf = NULL; cmph_uint32 i; @@ -36,7 +36,7 @@ cmph_t *__cmph_load(FILE *f) char *ptr = algo_name; CMPH_ALGO algo = CMPH_COUNT; register size_t nbytes; - + DEBUGP("Loading mphf\n"); while(1) { @@ -52,7 +52,7 @@ cmph_t *__cmph_load(FILE *f) algo = i; } } - if (algo == CMPH_COUNT) + if (algo == CMPH_COUNT) { DEBUGP("Algorithm %s not found\n", algo_name); return NULL; @@ -65,5 +65,3 @@ cmph_t *__cmph_load(FILE *f) return mphf; } - - diff --git a/src/djb2_hash.c b/src/djb2_hash.c index d3b4330..25f8220 100644 --- a/src/djb2_hash.c +++ b/src/djb2_hash.c @@ -4,6 +4,7 @@ djb2_state_t 
*djb2_state_new() { djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t)); + if (!state) return NULL; state->hashfunc = CMPH_HASH_DJB2; return state; } @@ -18,7 +19,7 @@ cmph_uint32 djb2_hash(djb2_state_t *state, const char *k, cmph_uint32 keylen) register cmph_uint32 hash = 5381; const unsigned char *ptr = (unsigned char *)k; cmph_uint32 i = 0; - while (i < keylen) + while (i < keylen) { hash = hash*33 ^ *ptr; ++ptr, ++i; diff --git a/src/fch.c b/src/fch.c index 67b68fb..9ca4e03 100644 --- a/src/fch.c +++ b/src/fch.c @@ -23,7 +23,7 @@ fch_config_data_t *fch_config_new() { fch_config_data_t *fch; fch = (fch_config_data_t *)malloc(sizeof(fch_config_data_t)); - assert(fch); + if (!fch) return NULL; memset(fch, 0, sizeof(fch_config_data_t)); fch->hashfuncs[0] = CMPH_HASH_JENKINS; fch->hashfuncs[1] = CMPH_HASH_JENKINS; @@ -50,7 +50,7 @@ void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 2) break; //fch only uses two hash functions - fch->hashfuncs[i] = *hashptr; + fch->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -88,36 +88,36 @@ static fch_buckets_t * mapping(cmph_config_t *mph) fch_buckets_t *buckets = NULL; fch_config_data_t *fch = (fch_config_data_t *)mph->data; if (fch->h1) hash_state_destroy(fch->h1); - fch->h1 = hash_state_new(fch->hashfuncs[0], fch->m); + fch->h1 = hash_state_new(fch->hashfuncs[0], fch->m); fch->b = fch_calc_b(fch->c, fch->m); fch->p1 = fch_calc_p1(fch->m); fch->p2 = fch_calc_p2(fch->b); //DEBUGP("b:%u p1:%f p2:%f\n", fch->b, fch->p1, fch->p2); buckets = fch_buckets_new(fch->b); - mph->key_source->rewind(mph->key_source->data); + mph->key_source->rewind(mph->key_source->data); for(i = 0; i < fch->m; i++) { cmph_uint32 h1, keylen; char *key = NULL; - mph->key_source->read(mph->key_source->data, &key, &keylen); + mph->key_source->read(mph->key_source->data, &key, &keylen); h1 = hash(fch->h1, key, keylen) % fch->m; h1 = mixh10h11h12 (fch->b, fch->p1, fch->p2, 
h1); fch_buckets_insert(buckets, h1, key, keylen); key = NULL; // transger memory ownership - + } - return buckets; + return buckets; } -// returns the buckets indexes sorted by their sizes. +// returns the buckets indexes sorted by their sizes. static cmph_uint32 * ordering(fch_buckets_t * buckets) { return fch_buckets_get_indexes_sorted_by_size(buckets); } -/* Check whether function h2 causes collisions among the keys of each bucket */ +/* Check whether function h2 causes collisions among the keys of each bucket */ static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes) { //cmph_uint32 max_size = fch_buckets_get_max_size(buckets); @@ -146,7 +146,7 @@ static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t } static void permut(cmph_uint32 * vector, cmph_uint32 n) -{ +{ cmph_uint32 i, j, b; for (i = 0; i < n; i++) { j = (cmph_uint32) rand() % n; @@ -179,12 +179,12 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph { map_table[random_table[i]] = i; } - do { + do { if (fch->h2) hash_state_destroy(fch->h2); - fch->h2 = hash_state_new(fch->hashfuncs[1], fch->m); + fch->h2 = hash_state_new(fch->hashfuncs[1], fch->m); restart = check_for_collisions_h2(fch, buckets, sorted_indexes); filled_count = 0; - if (!restart) + if (!restart) { searching_iterations++; iteration_to_generate_h2 = 0; //DEBUGP("searching_iterations: %u\n", searching_iterations); @@ -192,7 +192,7 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph else { iteration_to_generate_h2++; //DEBUGP("iteration_to_generate_h2: %u\n", iteration_to_generate_h2); - } + } for(i = 0; (i < nbuckets) && !restart; i++) { cmph_uint32 bucketsize = fch_buckets_get_size(buckets, sorted_indexes[i]); if (bucketsize == 0) @@ -204,8 +204,8 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph for(z = 0; (z < (fch->m - filled_count)) && restart; z++) 
{ char * key = fch_buckets_get_key(buckets, sorted_indexes[i], INDEX); cmph_uint32 keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], INDEX); - cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m; - counter = 0; + cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m; + counter = 0; restart = 0; // false fch->g[sorted_indexes[i]] = (fch->m + random_table[filled_count + z] - h2) % fch->m; //DEBUGP("g[%u]: %u\n", sorted_indexes[i], fch->g[sorted_indexes[i]]); @@ -217,7 +217,7 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph h2 = hash(fch->h2, key, keylen) % fch->m; index = (h2 + fch->g[sorted_indexes[i]]) % fch->m; //DEBUGP("key:%s keylen:%u index: %u h2:%u bucketsize:%u\n", key, keylen, index, h2, bucketsize); - if (map_table[index] >= filled_count) { + if (map_table[index] >= filled_count) { cmph_uint32 y = map_table[index]; cmph_uint32 ry = random_table[y]; random_table[y] = random_table[filled_count]; @@ -225,19 +225,19 @@ static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph map_table[random_table[y]] = y; map_table[random_table[filled_count]] = filled_count; filled_count++; - counter ++; + counter ++; } - else { + else { restart = 1; // true filled_count = filled_count - counter; - counter = 0; + counter = 0; break; } j = (j + 1) % bucketsize; - } while(j % bucketsize != INDEX); + } while(j % bucketsize != INDEX); } //getchar(); - } + } } while(restart && (searching_iterations < 10) && (iteration_to_generate_h2 < 1000)); free(map_table); free(random_table); @@ -264,7 +264,7 @@ cmph_t *fch_new(cmph_config_t *mph, double c) fch->h2 = NULL; fch->g = NULL; do - { + { if (mph->verbosity) { fprintf(stderr, "Entering mapping step for mph creation of %u keys\n", fch->m); @@ -283,7 +283,7 @@ cmph_t *fch_new(cmph_config_t *mph, double c) } restart_mapping = searching(fch, buckets, sorted_indexes); iterations--; - + } while(restart_mapping && iterations > 0); if (buckets) 
fch_buckets_destroy(buckets); if (sorted_indexes) free (sorted_indexes); @@ -317,7 +317,7 @@ int fch_dump(cmph_t *mphf, FILE *fd) char *buf = NULL; cmph_uint32 buflen; register size_t nbytes; - + fch_data_t *data = (fch_data_t *)mphf->data; __cmph_dump(mphf, fd); @@ -365,7 +365,7 @@ void fch_load(FILE *f, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, f); fch->h1 = hash_state_load(buf, buflen); free(buf); - + //DEBUGP("Loading fch mphf\n"); mphf->data = fch; //DEBUGP("Reading h2\n"); @@ -376,8 +376,8 @@ void fch_load(FILE *f, cmph_t *mphf) nbytes = fread(buf, (size_t)buflen, (size_t)1, f); fch->h2 = hash_state_load(buf, buflen); free(buf); - - + + //DEBUGP("Reading m and n\n"); nbytes = fread(&(fch->m), sizeof(cmph_uint32), (size_t)1, f); nbytes = fread(&(fch->c), sizeof(double), (size_t)1, f); @@ -418,7 +418,7 @@ void fch_destroy(cmph_t *mphf) /** \fn void fch_pack(cmph_t *mphf, void *packed_mphf); * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. * \param mphf pointer to the resulting mphf - * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. 
The size of packed_mphf must be at least cmph_packed_size() */ void fch_pack(cmph_t *mphf, void *packed_mphf) { @@ -450,37 +450,37 @@ void fch_pack(cmph_t *mphf, void *packed_mphf) // packing b *((cmph_uint32 *) ptr) = data->b; ptr += sizeof(data->b); - + // packing p1 - *((cmph_uint64 *)ptr) = (cmph_uint64)data->p1; + *((cmph_uint64 *)ptr) = (cmph_uint64)data->p1; ptr += sizeof(data->p1); // packing p2 - *((cmph_uint64 *)ptr) = (cmph_uint64)data->p2; + *((cmph_uint64 *)ptr) = (cmph_uint64)data->p2; ptr += sizeof(data->p2); // packing g - memcpy(ptr, data->g, sizeof(cmph_uint32)*(data->b)); + memcpy(ptr, data->g, sizeof(cmph_uint32)*(data->b)); } /** \fn cmph_uint32 fch_packed_size(cmph_t *mphf); * \brief Return the amount of space needed to pack mphf. * \param mphf pointer to a mphf * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 fch_packed_size(cmph_t *mphf) { fch_data_t *data = (fch_data_t *)mphf->data; - CMPH_HASH h1_type = hash_get_type(data->h1); - CMPH_HASH h2_type = hash_get_type(data->h2); + CMPH_HASH h1_type = hash_get_type(data->h1); + CMPH_HASH h2_type = hash_get_type(data->h2); - return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + 4*sizeof(cmph_uint32) + 2*sizeof(double) + sizeof(cmph_uint32)*(data->b)); } /** cmph_uint32 fch_search(void *packed_mphf, const char *key, cmph_uint32 keylen); - * \brief Use the packed mphf to do a search. + * \brief Use the packed mphf to do a search. 
* \param packed_mphf pointer to the packed mphf * \param key key to be hashed * \param keylen key legth in bytes @@ -495,12 +495,12 @@ cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); h2_ptr += 4; - - register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); - - register cmph_uint32 m = *g_ptr++; - register cmph_uint32 b = *g_ptr++; + register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); + + register cmph_uint32 m = *g_ptr++; + + register cmph_uint32 b = *g_ptr++; register double p1 = (double)(*((cmph_uint64 *)g_ptr)); g_ptr += 2; @@ -508,10 +508,9 @@ cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 ke register double p2 = (double)(*((cmph_uint64 *)g_ptr)); g_ptr += 2; - register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m; + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m; register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m; h1 = mixh10h11h12 (b, p1, p2, h1); return (h2 + g_ptr[h1]) % m; } - diff --git a/src/fch_buckets.c b/src/fch_buckets.c index a588f14..0c11051 100644 --- a/src/fch_buckets.c +++ b/src/fch_buckets.c @@ -20,7 +20,7 @@ typedef struct __fch_bucket_t -static void fch_bucket_new(fch_bucket_t *bucket) +static void fch_bucket_new(fch_bucket_t *bucket) { assert(bucket); bucket->size = 0; @@ -109,16 +109,16 @@ struct __fch_buckets_t { fch_bucket_t * values; cmph_uint32 nbuckets, max_size; - + }; fch_buckets_t * fch_buckets_new(cmph_uint32 nbuckets) { cmph_uint32 i; fch_buckets_t *buckets = (fch_buckets_t *)malloc(sizeof(fch_buckets_t)); - assert(buckets); + if (!buckets) return NULL; buckets->values = (fch_bucket_t *)calloc((size_t)nbuckets, sizeof(fch_bucket_t)); - for (i = 0; i < nbuckets; i++) fch_bucket_new(buckets->values + i); + for (i = 
0; i < nbuckets; i++) fch_bucket_new(buckets->values + i); assert(buckets->values); buckets->nbuckets = nbuckets; buckets->max_size = 0; @@ -135,7 +135,7 @@ void fch_buckets_insert(fch_buckets_t * buckets, cmph_uint32 index, char * key, { assert(index < buckets->nbuckets); fch_bucket_insert(buckets->values + index, key, length); - if (fch_bucket_size(buckets->values + index) > buckets->max_size) + if (fch_bucket_size(buckets->values + index) > buckets->max_size) { buckets->max_size = fch_bucket_size(buckets->values + index); } @@ -170,16 +170,16 @@ cmph_uint32 fch_buckets_get_nbuckets(fch_buckets_t * buckets) return buckets->nbuckets; } -cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets) +cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets) { cmph_uint32 i = 0; cmph_uint32 sum = 0, value; cmph_uint32 *nbuckets_size = (cmph_uint32 *) calloc((size_t)buckets->max_size + 1, sizeof(cmph_uint32)); cmph_uint32 * sorted_indexes = (cmph_uint32 *) calloc((size_t)buckets->nbuckets, sizeof(cmph_uint32)); - + // collect how many buckets for each size. for(i = 0; i < buckets->nbuckets; i++) nbuckets_size[fch_bucket_size(buckets->values + i)] ++; - + // calculating offset considering a decreasing order of buckets size. 
value = nbuckets_size[buckets->max_size]; nbuckets_size[buckets->max_size] = sum; @@ -188,13 +188,13 @@ cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets) sum += value; value = nbuckets_size[i]; nbuckets_size[i] = sum; - + } - for(i = 0; i < buckets->nbuckets; i++) + for(i = 0; i < buckets->nbuckets; i++) { sorted_indexes[nbuckets_size[fch_bucket_size(buckets->values + i)]] = (cmph_uint32)i; nbuckets_size[fch_bucket_size(buckets->values + i)] ++; - } + } free(nbuckets_size); return sorted_indexes; } @@ -208,7 +208,7 @@ void fch_buckets_print(fch_buckets_t * buckets) void fch_buckets_destroy(fch_buckets_t * buckets) { cmph_uint32 i; - for (i = 0; i < buckets->nbuckets; i++) fch_bucket_destroy(buckets->values + i); + for (i = 0; i < buckets->nbuckets; i++) fch_bucket_destroy(buckets->values + i); free(buckets->values); free(buckets); } diff --git a/src/fnv_hash.c b/src/fnv_hash.c index aeaca8f..0ef1f48 100644 --- a/src/fnv_hash.c +++ b/src/fnv_hash.c @@ -4,6 +4,7 @@ fnv_state_t *fnv_state_new() { fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t)); + if (!state) return NULL; state->hashfunc = CMPH_HASH_FNV; return state; } @@ -15,13 +16,13 @@ void fnv_state_destroy(fnv_state_t *state) cmph_uint32 fnv_hash(fnv_state_t *state, const char *k, cmph_uint32 keylen) { - const unsigned char *bp = (const unsigned char *)k; - const unsigned char *be = bp + keylen; - static unsigned int hval = 0; + const unsigned char *bp = (const unsigned char *)k; + const unsigned char *be = bp + keylen; + static unsigned int hval = 0; - while (bp < be) + while (bp < be) { - + //hval *= 0x01000193; good for non-gcc compiler hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); //good for gcc @@ -41,6 +42,7 @@ void fnv_state_dump(fnv_state_t *state, char **buf, cmph_uint32 *buflen) fnv_state_t * fnv_state_copy(fnv_state_t *src_state) { fnv_state_t *dest_state = (fnv_state_t *)malloc(sizeof(fnv_state_t)); + if (!dest_state) return 
NULL; dest_state->hashfunc = src_state->hashfunc; return dest_state; } diff --git a/src/graph.c b/src/graph.c index 2e9ddb7..97737ad 100644 --- a/src/graph.c +++ b/src/graph.c @@ -77,7 +77,7 @@ void graph_print(graph_t *g) printf("%u -> %u\n", g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); } } - + } return; } @@ -130,7 +130,7 @@ static void del_edge_point(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) DEBUGP("Deleting edge point %u %u\n", v1, v2); e = g->first[v1]; - if (check_edge(g, e, v1, v2)) + if (check_edge(g, e, v1, v2)) { g->first[v1] = g->next[e]; //g->edges[e] = EMPTY; @@ -151,7 +151,7 @@ static void del_edge_point(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) DEBUGP("Deleted\n"); } - + void graph_del_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) { g->shrinking = 1; @@ -163,7 +163,7 @@ void graph_clear_edges(graph_t *g) { cmph_uint32 i; for (i = 0; i < g->nnodes; ++i) g->first[i] = EMPTY; - for (i = 0; i < g->nedges*2; ++i) + for (i = 0; i < g->nedges*2; ++i) { g->edges[i] = EMPTY; g->next[i] = EMPTY; @@ -178,7 +178,7 @@ static cmph_uint8 find_degree1_edge(graph_t *g, cmph_uint32 v, cmph_uint8 *delet cmph_uint8 found = 0; DEBUGP("Checking degree of vertex %u connected to edge %u\n", v, edge); if (edge == EMPTY) return 0; - else if (!(GETBIT(deleted, abs_edge(edge, 0)))) + else if (!(GETBIT(deleted, abs_edge(edge, 0)))) { found = 1; *e = edge; @@ -206,17 +206,17 @@ static void cyclic_del_edge(graph_t *g, cmph_uint32 v, cmph_uint8 *deleted) degree1 = find_degree1_edge(g, v1, deleted, &e); if (!degree1) return; - while(1) + while(1) { DEBUGP("Deleting edge %u (%u->%u)\n", e, g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); SETBIT(deleted, abs_edge(e, 0)); - + v2 = g->edges[abs_edge(e, 0)]; if (v2 == v1) v2 = g->edges[abs_edge(e, 1)]; - DEBUGP("Checking if second endpoint %u has degree 1\n", v2); + DEBUGP("Checking if second endpoint %u has degree 1\n", v2); degree1 = find_degree1_edge(g, v2, deleted, &e); - if (degree1) + if (degree1) { 
DEBUGP("Inspecting vertex %u\n", v2); v1 = v2; @@ -240,7 +240,7 @@ int graph_is_cyclic(graph_t *g) } for (i = 0; i < g->nedges; ++i) { - if (!(GETBIT(deleted, i))) + if (!(GETBIT(deleted, i))) { DEBUGP("Edge %u %u->%u was not deleted\n", i, g->edges[i], g->edges[i + g->nedges]); free(deleted); @@ -275,15 +275,15 @@ void graph_obtain_critical_nodes(graph_t *g) /* included -- Fabiano*/ for (i = 0; i < g->nedges; ++i) { - if (!(GETBIT(deleted,i))) + if (!(GETBIT(deleted,i))) { DEBUGP("Edge %u %u->%u belongs to the 2-core\n", i, g->edges[i], g->edges[i + g->nedges]); - if(!(GETBIT(g->critical_nodes,g->edges[i]))) + if(!(GETBIT(g->critical_nodes,g->edges[i]))) { g->ncritical_nodes ++; SETBIT(g->critical_nodes,g->edges[i]); } - if(!(GETBIT(g->critical_nodes,g->edges[i + g->nedges]))) + if(!(GETBIT(g->critical_nodes,g->edges[i + g->nedges]))) { g->ncritical_nodes ++; SETBIT(g->critical_nodes,g->edges[i + g->nedges]); @@ -328,11 +328,9 @@ graph_iterator_t graph_neighbors_it(graph_t *g, cmph_uint32 v) cmph_uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it) { cmph_uint32 ret; - if(it->edge == EMPTY) return GRAPH_NO_NEIGHBOR; + if(it->edge == EMPTY) return GRAPH_NO_NEIGHBOR; if (g->edges[it->edge] == it->vertex) ret = g->edges[it->edge + g->nedges]; else ret = g->edges[it->edge]; it->edge = g->next[it->edge]; return ret; } - - diff --git a/src/hash.c b/src/hash.c index 7ab0b04..aa8c95f 100644 --- a/src/hash.c +++ b/src/hash.c @@ -133,7 +133,7 @@ void hash_state_destroy(hash_state_t *state) * \brief Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. * \param state points to the hash function * \param hash_packed pointer to the contiguous memory area used to store the hash function. The size of hash_packed must be at least hash_state_packed_size() - * + * * Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. 
* However, the hash function type must be packed outside. */ @@ -142,20 +142,20 @@ void hash_state_pack(hash_state_t *state, void *hash_packed) switch (state->hashfunc) { case CMPH_HASH_JENKINS: - // pack the jenkins hash function + // pack the jenkins hash function jenkins_state_pack((jenkins_state_t *)state, hash_packed); break; default: assert(0); } - return; + return; } /** \fn cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc) * \brief Return the amount of space needed to pack a hash function. * \param hashfunc function type * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc) { cmph_uint32 size = 0; @@ -197,7 +197,7 @@ cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cm * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. */ void hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) -{ +{ switch (hashfunc) { case CMPH_HASH_JENKINS: diff --git a/src/hashtree.c b/src/hashtree.c index 2f3567e..1bfd852 100644 --- a/src/hashtree.c +++ b/src/hashtree.c @@ -41,7 +41,7 @@ void hashtree_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) while(*hashptr != CMPH_HASH_COUNT) { if (i >= 3) break; //hashtree only uses three hash functions - hashtree->hashfuncs[i] = *hashptr; + hashtree->hashfuncs[i] = *hashptr; ++i, ++hashptr; } } @@ -55,8 +55,8 @@ cmph_t *hashtree_new(cmph_config_t *mph, double c) cmph_uint32 iterations = 20; cmph_uint8 *visited = NULL; hashtree_config_data_t *hashtree = (hashtree_config_data_t *)mph->data; - hashtree->m = mph->key_source->nkeys; - hashtree->n = ceil(c * mph->key_source->nkeys); + hashtree->m = mph->key_source->nkeys; + hashtree->n = ceil(c * mph->key_source->nkeys); DEBUGP("m (edges): %u n (vertices): %u c: %f\n", hashtree->m, hashtree->n, c); hashtree->graph = graph_new(hashtree->n, hashtree->m); DEBUGP("Created graph\n"); @@ 
-87,12 +87,12 @@ cmph_t *hashtree_new(cmph_config_t *mph, double c) fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations); } if (iterations == 0) break; - } - else break; + } + else break; } if (iterations == 0) { - graph_destroy(hashtree->graph); + graph_destroy(hashtree->graph); return NULL; } @@ -115,7 +115,7 @@ cmph_t *hashtree_new(cmph_config_t *mph, double c) hashtree_traverse(hashtree, visited, i); } } - graph_destroy(hashtree->graph); + graph_destroy(hashtree->graph); free(visited); hashtree->graph = NULL; @@ -144,7 +144,7 @@ static void hashtree_traverse(hashtree_config_data_t *hashtree, cmph_uint8 *visi graph_iterator_t it = graph_neighbors_it(hashtree->graph, v); cmph_uint32 neighbor = 0; SETBIT(visited,v); - + DEBUGP("Visiting vertex %u\n", v); while((neighbor = graph_next_neighbor(hashtree->graph, &it)) != GRAPH_NO_NEIGHBOR) { @@ -157,7 +157,7 @@ static void hashtree_traverse(hashtree_config_data_t *hashtree, cmph_uint8 *visi hashtree_traverse(hashtree, visited, neighbor); } } - + static int hashtree_gen_edges(cmph_config_t *mph) { cmph_uint32 e; @@ -165,7 +165,7 @@ static int hashtree_gen_edges(cmph_config_t *mph) int cycles = 0; DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", hashtree->n, cmph_hash_names[hashtree->hashfuncs[0]], cmph_hash_names[hashtree->hashfuncs[1]]); - graph_clear_edges(hashtree->graph); + graph_clear_edges(hashtree->graph); mph->key_source->rewind(mph->key_source->data); for (e = 0; e < mph->key_source->nkeys; ++e) { @@ -176,7 +176,7 @@ static int hashtree_gen_edges(cmph_config_t *mph) h1 = hash(hashtree->hashes[0], key, keylen) % hashtree->n; h2 = hash(hashtree->hashes[1], key, keylen) % hashtree->n; if (h1 == h2) if (++h2 >= hashtree->n) h2 = 0; - if (h1 == h2) + if (h1 == h2) { if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); mph->key_source->dispose(mph->key_source->data, key, keylen); @@ -216,7 +216,7 @@ int hashtree_dump(cmph_t *mphf, FILE *fd) 
fwrite(&(data->n), sizeof(cmph_uint32), 1, fd); fwrite(&(data->m), sizeof(cmph_uint32), 1, fd); - + fwrite(data->g, sizeof(cmph_uint32)*data->n, 1, fd); #ifdef DEBUG fprintf(stderr, "G: "); @@ -253,8 +253,8 @@ void hashtree_load(FILE *f, cmph_t *mphf) } DEBUGP("Reading m and n\n"); - fread(&(hashtree->n), sizeof(cmph_uint32), 1, f); - fread(&(hashtree->m), sizeof(cmph_uint32), 1, f); + fread(&(hashtree->n), sizeof(cmph_uint32), 1, f); + fread(&(hashtree->m), sizeof(cmph_uint32), 1, f); hashtree->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*hashtree->n); fread(hashtree->g, hashtree->n*sizeof(cmph_uint32), 1, f); @@ -265,7 +265,7 @@ void hashtree_load(FILE *f, cmph_t *mphf) #endif return; } - + cmph_uint32 hashtree_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) { @@ -280,7 +280,7 @@ cmph_uint32 hashtree_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) void hashtree_destroy(cmph_t *mphf) { hashtree_data_t *data = (hashtree_data_t *)mphf->data; - free(data->g); + free(data->g); hash_state_destroy(data->hashes[0]); hash_state_destroy(data->hashes[1]); free(data->hashes); diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c index 65cdff9..d540216 100644 --- a/src/jenkins_hash.c +++ b/src/jenkins_hash.c @@ -28,16 +28,16 @@ have at least 1/4 probability of changing. * If mix() is run forward, every bit of c will change between 1/3 and 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.) - mix() was built out of 36 single-cycle latency instructions in a + mix() was built out of 36 single-cycle latency instructions in a structure that could supported 2x parallelism, like so: - a -= b; + a -= b; a -= c; x = (c>>13); b -= c; a ^= x; b -= a; x = (a<<8); c -= a; b ^= x; c -= b; x = (b>>13); ... - Unfortunately, superscalar Pentiums and Sparcs can't take advantage + Unfortunately, superscalar Pentiums and Sparcs can't take advantage of that parallelism. 
They've also turned some of those single-cycle latency instructions into multi-cycle latency instructions. Still, this is the fastest good hash I could find. There were about 2^^68 @@ -87,6 +87,7 @@ acceptable. Do NOT use for cryptographic purposes. jenkins_state_t *jenkins_state_new(cmph_uint32 size) //size of hash table { jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); + if (!state) return NULL; DEBUGP("Initializing jenkins hash\n"); state->seed = ((cmph_uint32)rand() % size); return state; @@ -121,28 +122,28 @@ static inline void __jenkins_hash_vector(cmph_uint32 seed, const char *k, cmph_u hashes[2] += length; switch(len) /* all the case statements fall through */ { - case 11: + case 11: hashes[2] +=((cmph_uint32)k[10]<<24); - case 10: + case 10: hashes[2] +=((cmph_uint32)k[9]<<16); - case 9 : + case 9 : hashes[2] +=((cmph_uint32)k[8]<<8); /* the first byte of hashes[2] is reserved for the length */ - case 8 : + case 8 : hashes[1] +=((cmph_uint32)k[7]<<24); - case 7 : + case 7 : hashes[1] +=((cmph_uint32)k[6]<<16); - case 6 : + case 6 : hashes[1] +=((cmph_uint32)k[5]<<8); case 5 : hashes[1] +=(cmph_uint8) k[4]; - case 4 : + case 4 : hashes[0] +=((cmph_uint32)k[3]<<24); - case 3 : + case 3 : hashes[0] +=((cmph_uint32)k[2]<<16); - case 2 : + case 2 : hashes[0] +=((cmph_uint32)k[1]<<8); - case 1 : + case 1 : hashes[0] +=(cmph_uint8)k[0]; /* case 0: nothing left to add */ } @@ -158,13 +159,13 @@ cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keyl /* cmph_uint32 a, b, c; cmph_uint32 len, length; - // Set up the internal state + // Set up the internal state length = keylen; len = length; - a = b = 0x9e3779b9; // the golden ratio; an arbitrary value - c = state->seed; // the previous hash value - seed in our case + a = b = 0x9e3779b9; // the golden ratio; an arbitrary value + c = state->seed; // the previous hash value - seed in our case - // handle most of the key + // handle most of the key while (len >= 12) 
{ a += (k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); @@ -176,37 +177,37 @@ cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keyl // handle the last 11 bytes c += length; - switch(len) /// all the case statements fall through + switch(len) /// all the case statements fall through { - case 11: + case 11: c +=((cmph_uint32)k[10]<<24); - case 10: + case 10: c +=((cmph_uint32)k[9]<<16); - case 9 : + case 9 : c +=((cmph_uint32)k[8]<<8); - // the first byte of c is reserved for the length - case 8 : + // the first byte of c is reserved for the length + case 8 : b +=((cmph_uint32)k[7]<<24); - case 7 : + case 7 : b +=((cmph_uint32)k[6]<<16); - case 6 : + case 6 : b +=((cmph_uint32)k[5]<<8); - case 5 : + case 5 : b +=k[4]; - case 4 : + case 4 : a +=((cmph_uint32)k[3]<<24); - case 3 : + case 3 : a +=((cmph_uint32)k[2]<<16); - case 2 : + case 2 : a +=((cmph_uint32)k[1]<<8); - case 1 : + case 1 : a +=k[0]; - // case 0: nothing left to add + // case 0: nothing left to add } mix(a,b,c); - /// report the result + /// report the result return c; */ @@ -221,7 +222,7 @@ void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen) { *buflen = sizeof(cmph_uint32); *buf = (char *)malloc(sizeof(cmph_uint32)); - if (!*buf) + if (!*buf) { *buflen = UINT_MAX; return; @@ -252,7 +253,7 @@ jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen) /** \fn void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed); * \brief Support the ability to pack a jenkins function into a preallocated contiguous memory space pointed by jenkins_packed. * \param state points to the jenkins function - * \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. The size of jenkins_packed must be at least jenkins_state_packed_size() + * \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. 
The size of jenkins_packed must be at least jenkins_state_packed_size() */ void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed) { @@ -265,7 +266,7 @@ void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed) /** \fn cmph_uint32 jenkins_state_packed_size(jenkins_state_t *state); * \brief Return the amount of space needed to pack a jenkins function. * \return the size of the packed function or zero for failures - */ + */ cmph_uint32 jenkins_state_packed_size(void) { return sizeof(cmph_uint32); diff --git a/src/linear_string_map.c b/src/linear_string_map.c index 4390c5b..85f8d21 100644 --- a/src/linear_string_map.c +++ b/src/linear_string_map.c @@ -12,6 +12,7 @@ struct __linear_string_map_t { lsmap_t *lsmap_new() { lsmap_t* lsmap = (lsmap_t*)malloc(sizeof(lsmap_t)); + if (!lsmap) return NULL; lsmap->key = "dummy node"; lsmap->next = NULL; return lsmap; @@ -42,7 +43,7 @@ void* lsmap_search(lsmap_t *lsmap, const char *key) { } return NULL; } - + void lsmap_foreach_key(lsmap_t *lsmap, void (*f)(const char*)) { while (lsmap->next != NULL) { f(lsmap->key); @@ -65,4 +66,3 @@ void lsmap_destroy(lsmap_t *lsmap) { } free(lsmap); } - diff --git a/src/main.c b/src/main.c index f739b32..95a75c5 100644 --- a/src/main.c +++ b/src/main.c @@ -22,13 +22,13 @@ void usage(const char *prg) { - fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); } void usage_long(const char *prg) { cmph_uint32 i; - fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a 
algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); - fprintf(stderr, "Minimum perfect hashing tool\n\n"); + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "Minimum perfect hashing tool\n\n"); fprintf(stderr, " -h\t print this help message\n"); fprintf(stderr, " -c\t c value determines:\n"); fprintf(stderr, " \t * the number of vertices in the graph for the algorithms BMZ and CHM\n"); @@ -57,7 +57,7 @@ void usage_long(const char *prg) fprintf(stderr, " \t and its value should be an integer in the range [1,32]. Default is 4. The\n"); fprintf(stderr, " \t larger is this value, the slower is the construction of the functions.\n"); fprintf(stderr, " \t This parameter has no effect for other algorithms.\n\n"); - fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n"); + fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n"); fprintf(stderr, " \t hash function allows at most t collisions in a given bin. This parameter applies\n"); fprintf(stderr, " \t only to the CHD and CHD_PH algorithms. Its value should be an integer in the\n"); fprintf(stderr, " \t range [1,128]. Defaul is 1\n"); @@ -182,7 +182,7 @@ int main(int argc, char **argv) break; } } - if (!valid) + if (!valid) { fprintf(stderr, "Invalid mph algorithm: %s. 
It is not available in version %s\n", optarg, VERSION); return -1; @@ -204,7 +204,7 @@ int main(int argc, char **argv) break; } } - if (!valid) + if (!valid) { fprintf(stderr, "Invalid hash function: %s\n", optarg); return -1; @@ -223,7 +223,7 @@ int main(int argc, char **argv) return 1; } keys_file = argv[optind]; - + if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); srand(seed); int ret = 0; @@ -232,7 +232,7 @@ int main(int argc, char **argv) mphf_file = (char *)malloc(strlen(keys_file) + 5); memcpy(mphf_file, keys_file, strlen(keys_file)); memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5); - } + } keys_fd = fopen(keys_file, "r"); @@ -258,7 +258,7 @@ int main(int argc, char **argv) cmph_config_set_memory_availability(config, memory_availability); cmph_config_set_b(config, b); cmph_config_set_keys_per_bin(config, keys_per_bin); - + //if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15; if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15; if (c != 0) cmph_config_set_graphsize(config, c); @@ -279,8 +279,8 @@ int main(int argc, char **argv) free(mphf_file); return -1; } - cmph_dump(mphf, mphf_fd); - cmph_destroy(mphf); + cmph_dump(mphf, mphf_fd); + cmph_destroy(mphf); fclose(mphf_fd); } else @@ -329,7 +329,7 @@ int main(int argc, char **argv) } source->dispose(source->data, buf, buflen); } - + cmph_destroy(mphf); free(hashtable); } @@ -338,5 +338,5 @@ int main(int argc, char **argv) free(tmp_dir); cmph_io_nlfile_adapter_destroy(source); return ret; - + } diff --git a/src/sdbm_hash.c b/src/sdbm_hash.c index 2f706c9..3a052fd 100644 --- a/src/sdbm_hash.c +++ b/src/sdbm_hash.c @@ -4,6 +4,7 @@ sdbm_state_t *sdbm_state_new() { sdbm_state_t *state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t)); + if (!state) return NULL; state->hashfunc = CMPH_HASH_SDBM; return state; } diff --git a/src/vqueue.c b/src/vqueue.c index 0619dd7..5c90ee0 100644 --- a/src/vqueue.c +++ b/src/vqueue.c @@ -12,7 +12,7 @@ vqueue_t * vqueue_new(cmph_uint32 capacity) { size_t 
capacity_plus_one = capacity + 1; vqueue_t *q = (vqueue_t *)malloc(sizeof(vqueue_t)); - assert(q); + if (!q) return NULL; q->values = (cmph_uint32 *)calloc(capacity_plus_one, sizeof(cmph_uint32)); q->beg = q->end = 0; q->capacity = (cmph_uint32) capacity_plus_one; @@ -43,7 +43,7 @@ void vqueue_print(vqueue_t * q) cmph_uint32 i; for (i = q->beg; i != q->end; i = (i + 1)%q->capacity) fprintf(stderr, "%u\n", q->values[(i + 1)%q->capacity]); -} +} void vqueue_destroy(vqueue_t *q) { diff --git a/src/vstack.c b/src/vstack.c index 96f5380..8791550 100644 --- a/src/vstack.c +++ b/src/vstack.c @@ -76,4 +76,3 @@ void vstack_reserve(vstack_t *stack, cmph_uint32 size) DEBUGP("Increased\n"); } } - From 7b6c163075c50ab3ccc4ead9d3482e8998172997 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Tue, 6 Mar 2012 18:25:05 -0800 Subject: [PATCH 61/89] Adding support for miss benchmarks. Need to fix myfind methods. --- cxxmph/Makefile.am | 2 +- cxxmph/bm_common.cc | 8 +++++++ cxxmph/bm_common.h | 4 +++- cxxmph/bm_index.cc | 56 +++++++++++++++++++++++++++++++++++++++++++++ cxxmph/bm_map.cc | 23 +++++++++++++++---- 5 files changed, 86 insertions(+), 7 deletions(-) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 55df057..2e57a18 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -14,7 +14,7 @@ mph_map_test_SOURCES = mph_map_test.cc mph_index_test_LDADD = libcxxmph.la mph_index_test_SOURCES = mph_index_test.cc -bm_index_LDADD = libcxxmph.la +bm_index_LDADD = libcxxmph.la -lcmph bm_index_SOURCES = bm_common.cc bm_index.cc trigraph_test_LDADD = libcxxmph.la diff --git a/cxxmph/bm_common.cc b/cxxmph/bm_common.cc index c52b2e5..7e94dcf 100644 --- a/cxxmph/bm_common.cc +++ b/cxxmph/bm_common.cc @@ -1,4 +1,6 @@ +#include #include +#include #include #include @@ -32,9 +34,15 @@ bool UrlsBenchmark::SetUp() { bool SearchUrlsBenchmark::SetUp() { if (!UrlsBenchmark::SetUp()) return false; + int32_t miss_ratio_int32 = std::numeric_limits::max() * miss_ratio_; + 
forced_miss_urls_.resize(nsearches_); random_.resize(nsearches_); for (int i = 0; i < nsearches_; ++i) { random_[i] = urls_[random() % urls_.size()]; + if (random() < miss_ratio_int32) { + forced_miss_urls_[i] = random_[i].as_string() + ".force_miss"; + random_[i] = forced_miss_urls_[i]; + } } return true; } diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index 4fea687..c67b5cd 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -32,10 +32,12 @@ class UrlsBenchmark : public Benchmark { class SearchUrlsBenchmark : public UrlsBenchmark { public: SearchUrlsBenchmark(const std::string& urls_file, uint32_t nsearches) - : UrlsBenchmark(urls_file), nsearches_(nsearches) {} + : UrlsBenchmark(urls_file), nsearches_(nsearches), miss_ratio_(0.2) {} protected: virtual bool SetUp(); const uint32_t nsearches_; + float miss_ratio_; + std::vector forced_miss_urls_; std::vector random_; }; diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index 84bf7d2..a7c0a14 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -1,3 +1,6 @@ +#include + +#include #include #include #include @@ -56,6 +59,56 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { SimpleMPHIndex index_; }; +class BM_CmphIndexSearch : public SearchUrlsBenchmark { + public: + BM_CmphIndexSearch(const std::string& urls_file, int nsearches) + : SearchUrlsBenchmark(urls_file, nsearches) { } + ~BM_CmphIndexSearch() { if (index_) cmph_destroy(index_); } + virtual void Run() { + for (auto it = random_.begin(); it != random_.end(); ++it) { + auto idx = cmph_search(index_, it->data(), it->length()); + // Collision check to be fair with STL + if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; + } + } + protected: + virtual bool SetUp() { + if (!SearchUrlsBenchmark::SetUp()) { + cerr << "Parent class setup failed." 
<< endl; + return false; + } + FILE* f = fopen(urls_file_.c_str(), "r"); + if (!f) { + cerr << "Faied to open " << urls_file_ << endl; + return false; + } + cmph_io_adapter_t* source = cmph_io_nlfile_adapter(f); + if (!source) { + cerr << "Faied to create io adapter for " << urls_file_ << endl; + return false; + } + cmph_config_t* config = cmph_config_new(source); + if (!config) { + cerr << "Failed to create config" << endl; + return false; + } + cmph_config_set_algo(config, CMPH_BDZ); + cmph_t* mphf = cmph_new(config); + if (!mphf) { + cerr << "Failed to create mphf." << endl; + return false; + } + + cmph_config_destroy(config); + cmph_io_nlfile_adapter_destroy(source); + fclose(f); + index_ = mphf; + return true; + } + cmph_t* index_; +}; + + class BM_STLIndexSearch : public SearchUrlsBenchmark { public: BM_STLIndexSearch(const std::string& urls_file, int nsearches) @@ -80,10 +133,13 @@ class BM_STLIndexSearch : public SearchUrlsBenchmark { }; int main(int argc, char** argv) { +/* Benchmark::Register(new BM_MPHIndexCreate("URLS100k")); Benchmark::Register(new BM_STLIndexCreate("URLS100k")); Benchmark::Register(new BM_MPHIndexSearch("URLS100k", 100*1000*1000)); Benchmark::Register(new BM_STLIndexSearch("URLS100k", 100*1000*1000)); +*/ + Benchmark::Register(new BM_CmphIndexSearch("URLS100k", 100*1000*1000)); Benchmark::RunAll(); return 0; } diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 607edc6..8d2aef1 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -11,7 +11,9 @@ namespace cxxmph { uint64_t myfind(const unordered_map& mymap, const uint64_t& k) { - return mymap.find(k)->second; + auto it = mymap.find(k); + if (it == mymap.end()) return -1; + return it->second; } uint64_t myfind(const mph_map& mymap, const uint64_t& k) { @@ -19,7 +21,9 @@ } const StringPiece& myfind(const unordered_map& mymap, const StringPiece& k) { - return mymap.find(k)->second; + auto it = mymap.find(k); + if (it == mymap.end()) return ".force_miss"; + return it->second; } 
StringPiece myfind(const mph_map& mymap, const StringPiece& k) { auto it = mymap.find(k); @@ -44,13 +48,22 @@ class BM_SearchUrls : public SearchUrlsBenchmark { BM_SearchUrls(const std::string& urls_file, int nsearches) : SearchUrlsBenchmark(urls_file, nsearches) { } virtual void Run() { + fprintf(stderr, "Running benchmark\n"); for (auto it = random_.begin(); it != random_.end(); ++it) { + if (it->ends_with(".force_miss")) { + fprintf(stderr, "About to miss\n"); + } else { + fprintf(stderr, "No miss\n"); + } + fprintf(stderr, "it: *%s\n", it->as_string().c_str()); auto v = myfind(mymap_, *it); - if (v != *it) { + fprintf(stderr, "v: %s, it: *%s\n", v.as_string().c_str(), it->as_string().c_str()); + if (v != *it && !it->ends_with(".force_miss")) { fprintf(stderr, "Looked for %s got %s\n", it->data(), v.data()); exit(-1); } } + fprintf(stderr, "Done running benchmark\n"); } protected: virtual bool SetUp() { @@ -102,8 +115,8 @@ int main(int argc, char** argv) { Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); */ - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000* 1000)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000* 1000)); + // Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10)); /* Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); From b8b0cde5c722d59613a564c8cfc77a2ddc9814da Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 7 Mar 2012 01:00:17 -0500 Subject: [PATCH 62/89] Added miss ratio to benchmark tools. 
--- cxxmph/bm_common.h | 4 ++-- cxxmph/bm_index.cc | 6 +++--- cxxmph/bm_map.cc | 50 +++++++++++++--------------------------------- 3 files changed, 19 insertions(+), 41 deletions(-) diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index c67b5cd..aaf12b9 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -31,8 +31,8 @@ class UrlsBenchmark : public Benchmark { class SearchUrlsBenchmark : public UrlsBenchmark { public: - SearchUrlsBenchmark(const std::string& urls_file, uint32_t nsearches) - : UrlsBenchmark(urls_file), nsearches_(nsearches), miss_ratio_(0.2) {} + SearchUrlsBenchmark(const std::string& urls_file, uint32_t nsearches, float miss_ratio) + : UrlsBenchmark(urls_file), nsearches_(nsearches), miss_ratio_(miss_ratio) {} protected: virtual bool SetUp(); const uint32_t nsearches_; diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index a7c0a14..e700840 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -42,7 +42,7 @@ class BM_STLIndexCreate : public UrlsBenchmark { class BM_MPHIndexSearch : public SearchUrlsBenchmark { public: BM_MPHIndexSearch(const std::string& urls_file, int nsearches) - : SearchUrlsBenchmark(urls_file, nsearches) { } + : SearchUrlsBenchmark(urls_file, nsearches, 0) { } virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { auto idx = index_.index(*it); @@ -62,7 +62,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { class BM_CmphIndexSearch : public SearchUrlsBenchmark { public: BM_CmphIndexSearch(const std::string& urls_file, int nsearches) - : SearchUrlsBenchmark(urls_file, nsearches) { } + : SearchUrlsBenchmark(urls_file, nsearches, 0) { } ~BM_CmphIndexSearch() { if (index_) cmph_destroy(index_); } virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { @@ -112,7 +112,7 @@ class BM_CmphIndexSearch : public SearchUrlsBenchmark { class BM_STLIndexSearch : public SearchUrlsBenchmark { public: BM_STLIndexSearch(const std::string& urls_file, int nsearches) - 
: SearchUrlsBenchmark(urls_file, nsearches) { } + : SearchUrlsBenchmark(urls_file, nsearches, 0) { } virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { auto idx = index_.find(*it); diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 8d2aef1..42516b6 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -10,24 +10,11 @@ namespace cxxmph { - uint64_t myfind(const unordered_map& mymap, const uint64_t& k) { + template + const T* myfind(const Container& mymap, const T& k) { auto it = mymap.find(k); - if (it == mymap.end()) return -1; - return it->second; - } - - uint64_t myfind(const mph_map& mymap, const uint64_t& k) { - return mymap.find(k)->second; - } - - const StringPiece& myfind(const unordered_map& mymap, const StringPiece& k) { - auto it = mymap.find(k); - if (it == mymap.end()) return ".force_miss"; - return it->second; - } - StringPiece myfind(const mph_map& mymap, const StringPiece& k) { - auto it = mymap.find(k); - return it->second; + if (it == mymap.end()) return NULL; + return &it->second; } template @@ -45,25 +32,14 @@ template class BM_SearchUrls : public SearchUrlsBenchmark { public: - BM_SearchUrls(const std::string& urls_file, int nsearches) - : SearchUrlsBenchmark(urls_file, nsearches) { } + BM_SearchUrls(const std::string& urls_file, int nsearches, float miss_ratio) + : SearchUrlsBenchmark(urls_file, nsearches, miss_ratio) { } virtual void Run() { - fprintf(stderr, "Running benchmark\n"); for (auto it = random_.begin(); it != random_.end(); ++it) { - if (it->ends_with(".force_miss")) { - fprintf(stderr, "About to miss\n"); - } else { - fprintf(stderr, "No miss\n"); - } - fprintf(stderr, "it: *%s\n", it->as_string().c_str()); auto v = myfind(mymap_, *it); - fprintf(stderr, "v: %s, it: *%s\n", v.as_string().c_str(), it->as_string().c_str()); - if (v != *it && !it->ends_with(".force_miss")) { - fprintf(stderr, "Looked for %s got %s\n", it->data(), v.data()); - exit(-1); - } + assert(it->ends_with(".force_miss") ^ v 
!= NULL); + assert(!v || *v == *it); } - fprintf(stderr, "Done running benchmark\n"); } protected: virtual bool SetUp() { @@ -96,8 +72,8 @@ class BM_SearchUint64 : public SearchUint64Benchmark { virtual void Run() { for (auto it = random_.begin(); it != random_.end(); ++it) { auto v = myfind(mymap_, *it); - if (v != *it) { - fprintf(stderr, "Looked for %lu got %lu\n", *it, v); + if (*v != *it) { + fprintf(stderr, "Looked for %lu got %lu\n", *it, *v); exit(-1); } } @@ -115,8 +91,10 @@ int main(int argc, char** argv) { Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); */ - // Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); /* Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); From dbd4856faee98cb325053ca6fb8a2aca923ebe4c Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 7 Mar 2012 01:48:20 -0500 Subject: [PATCH 63/89] Removed unnecessary seed mod which was breaking on presence of poor hash functions. 
--- cxxmph/bm_index.cc | 10 ++++------ cxxmph/bm_map.cc | 42 ++++++++++++++++++++---------------------- cxxmph/mph_index.h | 3 +-- cxxmph/mph_map.h | 1 - cxxmph/seeded_hash.h | 9 ++------- 5 files changed, 27 insertions(+), 38 deletions(-) diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index e700840..924231c 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -129,17 +129,15 @@ class BM_STLIndexSearch : public SearchUrlsBenchmark { index.swap(index_); return true; } - std::unordered_map index_; + unordered_map index_; }; int main(int argc, char** argv) { -/* Benchmark::Register(new BM_MPHIndexCreate("URLS100k")); Benchmark::Register(new BM_STLIndexCreate("URLS100k")); - Benchmark::Register(new BM_MPHIndexSearch("URLS100k", 100*1000*1000)); - Benchmark::Register(new BM_STLIndexSearch("URLS100k", 100*1000*1000)); -*/ - Benchmark::Register(new BM_CmphIndexSearch("URLS100k", 100*1000*1000)); + Benchmark::Register(new BM_MPHIndexSearch("URLS100k", 10*1000*1000)); + Benchmark::Register(new BM_STLIndexSearch("URLS100k", 10*1000*1000)); + Benchmark::Register(new BM_CmphIndexSearch("URLS100k", 10*1000*1000)); Benchmark::RunAll(); return 0; } diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 42516b6..8334604 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -4,29 +4,29 @@ #include "bm_common.h" #include "mph_map.h" - using cxxmph::mph_map; - using std::string; - using std::unordered_map; +using cxxmph::mph_map; +using std::string; +using std::unordered_map; - namespace cxxmph { +namespace cxxmph { - template - const T* myfind(const Container& mymap, const T& k) { - auto it = mymap.find(k); - if (it == mymap.end()) return NULL; - return &it->second; - } +template +const T* myfind(const MapType& mymap, const T& k) { + auto it = mymap.find(k); + if (it == mymap.end()) return NULL; + return &it->second; +} - template - class BM_CreateUrls : public UrlsBenchmark { - public: - BM_CreateUrls(const string& urls_file) : UrlsBenchmark(urls_file) { } - virtual 
void Run() { - MapType mymap; - for (auto it = urls_.begin(); it != urls_.end(); ++it) { - mymap[*it] = *it; - } +template +class BM_CreateUrls : public UrlsBenchmark { + public: + BM_CreateUrls(const string& urls_file) : UrlsBenchmark(urls_file) { } + virtual void Run() { + MapType mymap; + for (auto it = urls_.begin(); it != urls_.end(); ++it) { + mymap[*it] = *it; } + } }; template @@ -90,14 +90,12 @@ int main(int argc, char** argv) { /* Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); - */ Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); - /* +*/ Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); - */ Benchmark::RunAll(); } diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 3ee9090..d2e4a01 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -138,8 +138,7 @@ bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { std::vector queue; while (1) { // cerr << "Iterations missing: " << iterations << endl; - for (int i = 0; i < 3; ++i) hash_seed_[i] = random() % m_; - // for (int i = 0; i < 3; ++i) hash_seed_[i] = random() + i; + for (int i = 0; i < 3; ++i) hash_seed_[i] = random(); if (Mapping(begin, end, &edges, &queue)) break; else --iterations; if (iterations == 0) break; diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index e574c7c..7541c45 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -186,7 +186,6 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { } MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const { - assert(slack_.empty()); if (index_.size() == 0) return -1; return index_.index(k); } diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index 64cb74d..0f9adfa 100644 
--- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -19,13 +19,6 @@ struct seeded_hash_function { } }; -struct seeded_identity_function { - template - uint32_t operator()(const Key& k, uint32_t seed) const { - return k ^ seed; - } -}; - struct Murmur2 { template uint32_t operator()(const Key& k) const { @@ -78,6 +71,7 @@ template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; +/* template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > @@ -90,6 +84,7 @@ template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; +*/ } // namespace cxxmph From 20aeaf8ee11533510f1d42bcfff81f829d27fa78 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 7 Mar 2012 01:53:19 -0500 Subject: [PATCH 64/89] Poor hash functions break tests because of small set sizes. --- cxxmph/seeded_hash.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index 0f9adfa..d079a57 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -71,7 +71,6 @@ template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -/* template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > @@ -84,7 +83,6 @@ template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; -*/ } // namespace cxxmph From c057fb882bcacb494955ac8e7be6fe2c42d5fa9e Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 7 Mar 2012 03:10:29 -0500 Subject: [PATCH 65/89] Iterator game. 
--- cxxmph/bm_map.cc | 2 -- cxxmph/mph_map.h | 61 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 8334604..5c0f7a4 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -87,14 +87,12 @@ using namespace cxxmph; int main(int argc, char** argv) { srandom(4); - /* Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); -*/ Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 7541c45..6a09d21 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -41,8 +41,18 @@ class mph_map { typedef typename std::vector::const_reference const_reference; typedef typename std::vector::size_type size_type; typedef typename std::vector::difference_type difference_type; - typedef typename std::vector::iterator iterator; - typedef typename std::vector::const_iterator const_iterator; + + template + struct indirect_iterator : public typename slack_type::iterator { + indirect_iterator(T* v, iterator it) : iterator(it), v_(v) { } + const typename iterator::value_type::first_type& operator*() const { + return v->begin() + (this->iterator::operator*())->second; + } + }; + + + typedef indirect_iterator, slack_type>::iterator iterator; + typedef indirect_iterator, slack_type>::const_iterator const_iterator; // For making macros simpler. 
typedef void void_type; @@ -69,7 +79,7 @@ class mph_map { data_type& operator[](const key_type &k); const data_type& operator[](const key_type &k) const; - size_type bucket_count() const { return size(); } + size_type bucket_count() const { return index_.perfect_hash_size() + slack_.bucket_count(); } // FIXME: not sure if this has the semantics I want void rehash(size_type nbuckets /*ignored*/) { pack(); } @@ -80,7 +90,7 @@ class mph_map { template struct iterator_first : public iterator { iterator_first(iterator it) : iterator(it) { } - const typename iterator::value_type::first_type& operator*() const { + const typename iterator::value_type::first_type& operator*() const { return this->iterator::operator*().first; } }; @@ -90,6 +100,11 @@ class mph_map { return iterator_first(it); } + template + indirect_iterator make_indirect_iterator(T* v, iterator it) { + return indirect_iterator(v, it); + } + void pack(); std::vector values_; SimpleMPHIndex::hash_function> index_; @@ -113,31 +128,39 @@ MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { iterator it = find(x.first); if (it != end()) return make_pair(it, false); + should_pack = false; + if (values_.capacity() == values_.size() && values_.size() > 256) { + should_pack = true; + } values_.push_back(x); slack_.insert(make_pair(x.first, values_.size() - 1)); - if (slack_.size() == index_.size() || - (slack_.size() >= 256 && index_.size() == 0)) { - pack(); - } + if (should_pack) pack(); it = find(x.first); return make_pair(it, true); } MPH_MAP_METHOD_DECL(void_type, pack)() { if (values_.empty()) return; - slack_type().swap(slack_); bool success = index_.Reset( - make_iterator_first(values_.begin()), - make_iterator_first(values_.end())); + make_iterator_first(slack_.begin())), + make_iterator_first(slack_.end()))); assert(success); - std::vector new_values(values_.size()); + std::vector new_values(index_.size()); for (const_iterator it = 
values_.begin(), end = values_.end(); it != end; ++it) { - size_type id = index_.index(it->first); + size_type id = index_.index((*it)->first); assert(id < new_values.size()); new_values[id] = *it; } values_.swap(new_values); + std::vector new_values_pointer( + index_.perfect_hash_size());; + for (size_type i = 0; i < values_.size(); ++i) { + size_type id = index_.perfect_hash(values_[i].first); + assert(id < new_values_pointer.size()); + new_values_pointer[id] = i; + } + values_pointer_.swap(new_values_pointer); } MPH_MAP_METHOD_DECL(iterator, begin)() { return values_.begin(); } @@ -169,25 +192,25 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { if (it != slack_.end()) return values_.begin() + it->second; } if (__builtin_expect(index_.size() == 0, 0)) return end(); - const_iterator it = values_.begin() + index_.index(k); + const_iterator it = values_.begin() + values_pointer_[index_.perfect_hash(k)]; if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); } MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { - if (!slack_.empty()) { + if (__builtin_expect(!slack_.empty(), 0)) { typename slack_type::const_iterator it = slack_.find(k); if (it != slack_.end()) return values_.begin() + it->second; } - if (index_.size() == 0) return end(); - iterator it = values_.begin() + index_.index(k); - if (equal_(it->first, k)) return it; + if (__builtin_expect(index_.size() == 0, 0)) return end(); + iterator it = values_.begin() + values_pointer_[index_.perfect_hash(k)]; + if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); } MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const { if (index_.size() == 0) return -1; - return index_.index(k); + return index_.perfect_hash(k); } MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { From 238e384367e635d2fdaf61904446348086351d75 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Sun, 11 Mar 2012 23:21:18 -0300 Subject: [PATCH 66/89] Compiles, still need 
to fix size tracking. --- cxxmph/Makefile.am | 7 ++- cxxmph/bm_index.cc | 4 +- cxxmph/bm_map.cc | 3 +- cxxmph/cxxmph.cc | 4 +- cxxmph/hollow_iterator.h | 69 +++++++++++++++++++++++++ cxxmph/hollow_iterator_test.cc | 35 +++++++++++++ cxxmph/mph_index.h | 11 ++-- cxxmph/mph_index_test.cc | 2 +- cxxmph/mph_map.h | 92 +++++++++++++++++----------------- 9 files changed, 167 insertions(+), 60 deletions(-) create mode 100644 cxxmph/hollow_iterator.h create mode 100644 cxxmph/hollow_iterator_test.cc diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 2e57a18..cec2073 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,12 +1,12 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = mph_map_test mph_index_test trigraph_test +check_PROGRAMS = hollow_iterator_test mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ -cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h +cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h mph_map_test_LDADD = libcxxmph.la mph_map_test_SOURCES = mph_map_test.cc @@ -25,3 +25,6 @@ bm_map_SOURCES = bm_common.cc bm_map.cc cxxmph_LDADD = libcxxmph.la cxxmph_SOURCES = cxxmph.cc + +hollow_iterator_test_SOURCES = hollow_iterator_test.cc + diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index 924231c..d1cbc00 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -21,7 +21,7 @@ class BM_MPHIndexCreate : public UrlsBenchmark { protected: virtual void Run() { SimpleMPHIndex index; - index.Reset(urls_.begin(), urls_.end()); + index.Reset(urls_.begin(), urls_.end(), urls_.size()); } }; @@ -53,7 +53,7 @@ class BM_MPHIndexSearch : 
public SearchUrlsBenchmark { protected: virtual bool SetUp () { if (!SearchUrlsBenchmark::SetUp()) return false; - index_.Reset(urls_.begin(), urls_.end()); + index_.Reset(urls_.begin(), urls_.end(), urls_.size()); return true; } SimpleMPHIndex index_; diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 5c0f7a4..e381976 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -13,7 +13,8 @@ namespace cxxmph { template const T* myfind(const MapType& mymap, const T& k) { auto it = mymap.find(k); - if (it == mymap.end()) return NULL; + auto end = mymap.end(); + if (it == end) return NULL; return &it->second; } diff --git a/cxxmph/cxxmph.cc b/cxxmph/cxxmph.cc index 68bb23e..e9bffd0 100644 --- a/cxxmph/cxxmph.cc +++ b/cxxmph/cxxmph.cc @@ -63,8 +63,8 @@ int main(int argc, char** argv) { for (int i = 0; i < keys.size(); ++i) table[keys[i]] = keys[i]; mph_map::const_iterator it = table.begin(); mph_map::const_iterator end = table.end(); - for (; it != end; ++it) { - cout << (it - table.begin()) << ": " << it->first + for (int i = 0; it != end; ++it, ++i) { + cout << i << ": " << it->first <<" -> " << it->second << endl; } } diff --git a/cxxmph/hollow_iterator.h b/cxxmph/hollow_iterator.h new file mode 100644 index 0000000..bbb34bf --- /dev/null +++ b/cxxmph/hollow_iterator.h @@ -0,0 +1,69 @@ +#ifndef __CXXMPH_HOLLOW_ITERATOR_H__ +#define __CXXMPH_HOLLOW_ITERATOR_H__ + +#include + +namespace cxxmph { + +template +struct hollow_iterator_base + : public std::iterator { + typedef presence_type presence; + typedef container_type container; + typedef iterator_type iterator; + typedef hollow_iterator_base& self_reference; + typedef typename iterator::reference reference; + typedef typename iterator::pointer pointer; + + hollow_iterator_base(container* c, presence* p, iterator it) + : c_(c), p_(p), it_(it) { find_present(); } + self_reference operator++() { + ++it_; find_present(); + } + reference operator*() { return *it_; } + pointer operator->() { return &(*it_); } + + // 
TODO find syntax to make this less permissible at compile time + template + bool operator==(const T& rhs) { return rhs.it_ == this->it_; } + template + bool operator!=(const T& rhs) { return rhs.it_ != this->it_; } + + public: // TODO find syntax to make this friend of const iterator + void find_present() { + while (it_ != c_->end() && !((*p_)[it_-c_->begin()])) ++it_; + } + container* c_; + presence* p_; + iterator it_; +}; + +template +struct hollow_iterator : public hollow_iterator_base< + container_type, std::vector, typename container_type::iterator> { + typedef hollow_iterator_base< + container_type, std::vector, typename container_type::iterator> parent_class; + hollow_iterator(typename parent_class::container* c, + typename parent_class::presence* p, + typename parent_class::iterator it) + : parent_class(c, p, it) { } +}; + +template +struct hollow_const_iterator : public hollow_iterator_base< + const container_type, const std::vector, typename container_type::const_iterator> { + typedef hollow_iterator_base< + const container_type, const std::vector, typename container_type::const_iterator> parent_class; + typedef hollow_const_iterator self_type; + typedef hollow_iterator non_const_type; + hollow_const_iterator(non_const_type rhs) : parent_class(rhs.c_, rhs.p_, typename container_type::const_iterator(rhs.it_)) { } + hollow_const_iterator(const typename parent_class::container* c, + const typename parent_class::presence* p, + typename parent_class::iterator it) + : parent_class(c, p, it) { } +}; + +} // namespace cxxmph + +#endif // __CXXMPH_HOLLOW_ITERATOR_H__ diff --git a/cxxmph/hollow_iterator_test.cc b/cxxmph/hollow_iterator_test.cc new file mode 100644 index 0000000..201b748 --- /dev/null +++ b/cxxmph/hollow_iterator_test.cc @@ -0,0 +1,35 @@ +#include +#include +#include + +#include "hollow_iterator.h" + +using std::vector; +using cxxmph::hollow_iterator; +using cxxmph::hollow_const_iterator; + +int main(int argc, char** argv) { + vector v; + vector p; 
+ for (int i = 0; i < 100; ++i) { + v.push_back(i); + p.push_back(i % 2 == 0); + } + auto begin = hollow_iterator>(&v, &p, v.begin()); + auto end = hollow_iterator>(&v, &p, v.end()); + for (auto it = begin; it != end; ++it) { + if (((*it) % 2) != 0) exit(-1); + } + hollow_const_iterator> const_begin(begin); + hollow_const_iterator> const_end(end); + for (auto it = const_begin; it != const_end; ++it) { + if (((*it) % 2) != 0) exit(-1); + } + vector::iterator vit1 = v.begin(); + vector::const_iterator vit2 = v.begin(); + if (vit1 != vit2) exit(-1); + auto it1 = hollow_iterator>(&v, &p, v.begin()); + auto it2 = hollow_const_iterator>(&v, &p, v.begin()); + if (it1 != it2) exit(-1); +} + diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index d2e4a01..ad5bc6e 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -48,7 +48,7 @@ class MPHIndex { ~MPHIndex(); template - bool Reset(ForwardIterator begin, ForwardIterator end); + bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size); template // must agree with Reset // Get a unique identifier for k, in the range [0;size()). If x wasn't part // of the input in the last Reset call, returns a random value. @@ -120,12 +120,13 @@ class MPHIndex { // Template method needs to go in the header file. 
template -bool MPHIndex::Reset(ForwardIterator begin, ForwardIterator end) { +bool MPHIndex::Reset( + ForwardIterator begin, ForwardIterator end, uint32_t size) { if (end == begin) { clear(); return true; } - m_ = end - begin; + m_ = size; r_ = static_cast(ceil((c_*m_)/3)); if ((r_ % 2) == 0) r_ += 1; n_ = 3*r_; @@ -204,8 +205,8 @@ template >::hash class SimpleMPHIndex : public MPHIndex { public: template - bool Reset(ForwardIterator begin, ForwardIterator end) { - return MPHIndex::Reset(begin, end); + bool Reset(ForwardIterator begin, ForwardIterator end, uint32_t size) { + return MPHIndex::Reset(begin, end, size); } uint32_t index(const Key& key) const { return MPHIndex::index(key); } uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash(key); } diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc index 7a7d036..70e01bc 100644 --- a/cxxmph/mph_index_test.cc +++ b/cxxmph/mph_index_test.cc @@ -24,7 +24,7 @@ int main(int argc, char** argv) { keys.push_back("algume"); SimpleMPHIndex mph_index; - if (!mph_index.Reset(keys.begin(), keys.end())) { exit(-1); } + if (!mph_index.Reset(keys.begin(), keys.end(), keys.size())) { exit(-1); } vector ids; for (vector::size_type i = 0; i < keys.size(); ++i) { ids.push_back(mph_index.index(keys[i])); diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 6a09d21..7687ba5 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -14,6 +14,7 @@ #include "MurmurHash2.h" #include "mph_index.h" +#include "hollow_iterator.h" namespace cxxmph { @@ -42,17 +43,8 @@ class mph_map { typedef typename std::vector::size_type size_type; typedef typename std::vector::difference_type difference_type; - template - struct indirect_iterator : public typename slack_type::iterator { - indirect_iterator(T* v, iterator it) : iterator(it), v_(v) { } - const typename iterator::value_type::first_type& operator*() const { - return v->begin() + (this->iterator::operator*())->second; - } - }; - - - typedef indirect_iterator, 
slack_type>::iterator iterator; - typedef indirect_iterator, slack_type>::const_iterator const_iterator; + typedef hollow_iterator> iterator; + typedef hollow_const_iterator> const_iterator; // For making macros simpler. typedef void void_type; @@ -90,7 +82,7 @@ class mph_map { template struct iterator_first : public iterator { iterator_first(iterator it) : iterator(it) { } - const typename iterator::value_type::first_type& operator*() const { + const typename iterator::value_type::first_type& operator*() { return this->iterator::operator*().first; } }; @@ -100,25 +92,29 @@ class mph_map { return iterator_first(it); } - template - indirect_iterator make_indirect_iterator(T* v, iterator it) { - return indirect_iterator(v, it); + iterator make_iterator(typename std::vector::iterator it) { + return hollow_iterator>(&values_, &present_, it); + } + const_iterator make_iterator(typename std::vector::const_iterator it) const { + return hollow_const_iterator>(&values_, &present_, it); } void pack(); std::vector values_; + std::vector present_; SimpleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to no hold a copy of the key typedef unordered_map slack_type; slack_type slack_; + size_type size_; }; MPH_MAP_TMPL_SPEC bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { - return lhs.values_ == rhs.values_; + return lhs.size() == rhs.size() && std::equal(lhs.begin(), lhs.end(), rhs.begin()); } -MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() { +MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) { pack(); } @@ -126,13 +122,15 @@ MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { } MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { - iterator it = find(x.first); - if (it != end()) return make_pair(it, false); - should_pack = false; + auto it = find(x.first); + auto it_end = end(); + if (it != it_end) return make_pair(it, false); + bool should_pack = false; if (values_.capacity() == values_.size() && 
values_.size() > 256) { should_pack = true; } values_.push_back(x); + present_.push_back(true); slack_.insert(make_pair(x.first, values_.size() - 1)); if (should_pack) pack(); it = find(x.first); @@ -142,43 +140,39 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { MPH_MAP_METHOD_DECL(void_type, pack)() { if (values_.empty()) return; bool success = index_.Reset( - make_iterator_first(slack_.begin())), - make_iterator_first(slack_.end()))); + make_iterator_first(begin()), + make_iterator_first(end()), size_); assert(success); std::vector new_values(index_.size()); - for (const_iterator it = values_.begin(), end = values_.end(); - it != end; ++it) { - size_type id = index_.index((*it)->first); + std::vector new_present(index_.size(), false); + for (iterator it(begin()), it_end(end()); it != it_end; ++it) { + size_type id = index_.index(it->first); assert(id < new_values.size()); new_values[id] = *it; + new_present[id] = true; } values_.swap(new_values); - std::vector new_values_pointer( - index_.perfect_hash_size());; - for (size_type i = 0; i < values_.size(); ++i) { - size_type id = index_.perfect_hash(values_[i].first); - assert(id < new_values_pointer.size()); - new_values_pointer[id] = i; - } - values_pointer_.swap(new_values_pointer); + present_.swap(new_present); + slack_type().swap(slack_); } -MPH_MAP_METHOD_DECL(iterator, begin)() { return values_.begin(); } -MPH_MAP_METHOD_DECL(iterator, end)() { return values_.end(); } -MPH_MAP_METHOD_DECL(const_iterator, begin)() const { return values_.begin(); } -MPH_MAP_METHOD_DECL(const_iterator, end)() const { return values_.end(); } -MPH_MAP_METHOD_DECL(bool_type, empty)() const { return values_.empty(); } -MPH_MAP_METHOD_DECL(size_type, size)() const { return values_.size(); } +MPH_MAP_METHOD_DECL(iterator, begin)() { return make_iterator(values_.begin()); } +MPH_MAP_METHOD_DECL(iterator, end)() { return make_iterator(values_.end()); } +MPH_MAP_METHOD_DECL(const_iterator, begin)() const { 
return make_iterator(values_.begin()); } +MPH_MAP_METHOD_DECL(const_iterator, end)() const { return make_iterator(values_.end()); } +MPH_MAP_METHOD_DECL(bool_type, empty)() const { return size_ == 0; } +MPH_MAP_METHOD_DECL(size_type, size)() const { return size_; } MPH_MAP_METHOD_DECL(void_type, clear)() { values_.clear(); + present_.clear(); slack_.clear(); index_.clear(); } MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { - values_.erase(pos); - pack(); + present_[pos - begin] = false; + *pos = value_type(); } MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { iterator it = find(k); @@ -188,22 +182,26 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { if (__builtin_expect(!slack_.empty(), 0)) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; + auto it = slack_.find(k); + if (it != slack_.end()) return make_iterator(values_.begin() + it->second); } if (__builtin_expect(index_.size() == 0, 0)) return end(); - const_iterator it = values_.begin() + values_pointer_[index_.perfect_hash(k)]; + auto id = index_.perfect_hash(k); + if (!present_[id]) return end(); + auto it = make_iterator(values_.begin() + id); if (__builtin_expect(equal_(k, it->first), 1)) return it; return end(); } MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { if (__builtin_expect(!slack_.empty(), 0)) { - typename slack_type::const_iterator it = slack_.find(k); - if (it != slack_.end()) return values_.begin() + it->second; + auto it = slack_.find(k); + if (it != slack_.end()) return make_iterator(values_.begin() + it->second); } if (__builtin_expect(index_.size() == 0, 0)) return end(); - iterator it = values_.begin() + values_pointer_[index_.perfect_hash(k)]; + auto id = index_.perfect_hash(k); + if (!present_[id]) return end(); + auto it = make_iterator(values_.begin() + id); if (__builtin_expect(equal_(k, it->first), 1)) 
return it; return end(); } From 09c1af7771811dd17f5c50f979b099d4c96eb1de Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 12 Mar 2012 00:17:08 -0300 Subject: [PATCH 67/89] Perfect hash working, but it is slower. --- cxxmph/hollow_iterator.h | 4 +++- cxxmph/hollow_iterator_test.cc | 3 +++ cxxmph/mph_map.h | 26 +++++++++++++++++++------- cxxmph/mph_map_test.cc | 31 +++++++++++++++++-------------- 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/cxxmph/hollow_iterator.h b/cxxmph/hollow_iterator.h index bbb34bf..c650d21 100644 --- a/cxxmph/hollow_iterator.h +++ b/cxxmph/hollow_iterator.h @@ -17,7 +17,7 @@ struct hollow_iterator_base typedef typename iterator::pointer pointer; hollow_iterator_base(container* c, presence* p, iterator it) - : c_(c), p_(p), it_(it) { find_present(); } + : c_(c), p_(p), it_(it) { if (c_) find_present(); } self_reference operator++() { ++it_; find_present(); } @@ -44,6 +44,7 @@ struct hollow_iterator : public hollow_iterator_base< container_type, std::vector, typename container_type::iterator> { typedef hollow_iterator_base< container_type, std::vector, typename container_type::iterator> parent_class; + hollow_iterator() : parent_class(NULL, NULL, typename container_type::iterator()) { } hollow_iterator(typename parent_class::container* c, typename parent_class::presence* p, typename parent_class::iterator it) @@ -58,6 +59,7 @@ struct hollow_const_iterator : public hollow_iterator_base< typedef hollow_const_iterator self_type; typedef hollow_iterator non_const_type; hollow_const_iterator(non_const_type rhs) : parent_class(rhs.c_, rhs.p_, typename container_type::const_iterator(rhs.it_)) { } + hollow_const_iterator() : parent_class(NULL, NULL, typename container_type::iterator()) { } hollow_const_iterator(const typename parent_class::container* c, const typename parent_class::presence* p, typename parent_class::iterator it) diff --git a/cxxmph/hollow_iterator_test.cc b/cxxmph/hollow_iterator_test.cc index 201b748..07963ae 
100644 --- a/cxxmph/hollow_iterator_test.cc +++ b/cxxmph/hollow_iterator_test.cc @@ -31,5 +31,8 @@ int main(int argc, char** argv) { auto it1 = hollow_iterator>(&v, &p, v.begin()); auto it2 = hollow_const_iterator>(&v, &p, v.begin()); if (it1 != it2) exit(-1); + + hollow_iterator> default_constructed; + default_constructed = hollow_iterator>(&v, &p, v.begin()); } diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 7687ba5..ac77a06 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -7,6 +7,7 @@ // and should not be used if performance is a concern. In fact, you should only // use it for educational purposes. +#include #include #include #include @@ -71,9 +72,8 @@ class mph_map { data_type& operator[](const key_type &k); const data_type& operator[](const key_type &k) const; - size_type bucket_count() const { return index_.perfect_hash_size() + slack_.bucket_count(); } - // FIXME: not sure if this has the semantics I want - void rehash(size_type nbuckets /*ignored*/) { pack(); } + size_type bucket_count() const { return index_.size() + slack_.bucket_count(); } + void rehash(size_type nbuckets /*ignored*/); protected: // mimicking STL implementation EqualKey equal_; @@ -131,6 +131,7 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { } values_.push_back(x); present_.push_back(true); + ++size_; slack_.insert(make_pair(x.first, values_.size() - 1)); if (should_pack) pack(); it = find(x.first); @@ -143,10 +144,12 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { make_iterator_first(begin()), make_iterator_first(end()), size_); assert(success); - std::vector new_values(index_.size()); - std::vector new_present(index_.size(), false); - for (iterator it(begin()), it_end(end()); it != it_end; ++it) { - size_type id = index_.index(it->first); + std::vector new_values(index_.perfect_hash_size()); + new_values.reserve(new_values.size() * 2); + std::vector new_present(index_.perfect_hash_size(), false); + new_present.reserve(new_present.size() * 2); + for 
(iterator it = begin(), it_end = end(); it != it_end; ++it) { + size_type id = index_.perfect_hash(it->first); assert(id < new_values.size()); new_values[id] = *it; new_present[id] = true; @@ -168,11 +171,13 @@ MPH_MAP_METHOD_DECL(void_type, clear)() { present_.clear(); slack_.clear(); index_.clear(); + size_ = 0; } MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { present_[pos - begin] = false; *pos = value_type(); + --size_; } MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { iterator it = find(k); @@ -214,6 +219,13 @@ MPH_MAP_METHOD_DECL(my_int32_t, index)(const key_type& k) const { MPH_MAP_METHOD_DECL(data_type&, operator[])(const key_type& k) { return insert(make_pair(k, data_type())).first->second; } +MPH_MAP_METHOD_DECL(void_type, rehash)(size_type nbuckets) { + pack(); + vector(values_.begin(), values_.end()).swap(values_); + vector(present_.begin(), present_.end()).swap(present_); + slack_type().swap(slack_); +} + } // namespace cxxmph diff --git a/cxxmph/mph_map_test.cc b/cxxmph/mph_map_test.cc index 579e0ca..11bfbc9 100644 --- a/cxxmph/mph_map_test.cc +++ b/cxxmph/mph_map_test.cc @@ -11,21 +11,25 @@ using cxxmph::mph_map; int main(int argc, char** argv) { mph_map b; - for (int i = 0; i < 100*1000; ++i) { + int32_t num_keys = 1000*10; + for (int i = 0; i < num_keys; ++i) { b.insert(make_pair(i, i)); } - for (int i = 0; i < 1000*1000; ++i) { - b.find(i); + for (int i = 0; i < num_keys; ++i) { + auto it = b.find(i); + if (it->first != it->second || it->first != i) { + std::cerr << "Found " << it->first << " looking for " << i << std::endl; + exit(-1); + } } - /* mph_map h; h.insert(std::make_pair("-1",-1)); mph_map::const_iterator it; for (it = h.begin(); it != h.end(); ++it) { - std::cerr << it->first << " -> " << it->second << std::endl; + if (it->second != -1) exit(-1); } - std::cerr << "Search -1 gives " << h.find("-1")->second << std::endl; - for (int i = 0; i < 100; ++i) { + int32_t num_valid = 100; + for (int i = 0; i < num_valid; ++i) { 
char buf[10]; snprintf(buf, 10, "%d", i); h.insert(std::make_pair(buf, i)); @@ -34,18 +38,17 @@ int main(int argc, char** argv) { for (int i = 1000; i > 0; --i) { char buf[10]; snprintf(buf, 10, "%d", i - 1); - h.find(buf); - std::cerr << "Search " << i - 1 << " gives " << h.find(buf)->second << std::endl; + auto it = h.find(buf); + if (i < num_valid && it->second != i - 1) exit(-1); } } for (int j = 0; j < 100; ++j) { for (int i = 1000; i > 0; --i) { char buf[10]; - snprintf(buf, 10, "%d", i*100 - 1); - h.find(buf); - std::cerr << "Search " << i*100 - 1 << " gives " << h.find(buf)->second << std::endl; + int key = i*100 - 1; + snprintf(buf, 10, "%d", key); + auto it = h.find(buf); + if (key < num_valid && it->second != key) exit(-1); } } - */ - } From 9dcf0450f00fd5bbf12ab39f38565babb15546de Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 12 Mar 2012 01:43:06 -0300 Subject: [PATCH 68/89] Added Murmur3 support. Not necessarily faster. --- cxxmph/Makefile.am | 4 +- cxxmph/MurmurHash2.h | 74 ---------------------------------- cxxmph/bm_common.h | 6 ++- cxxmph/bm_index.cc | 2 +- cxxmph/bm_map.cc | 4 +- cxxmph/mph_index.h | 5 ++- cxxmph/mph_index_test.cc | 3 +- cxxmph/mph_map.h | 3 +- cxxmph/seeded_hash.h | 85 ++++++++++++++++++++++++++++------------ 9 files changed, 76 insertions(+), 110 deletions(-) delete mode 100644 cxxmph/MurmurHash2.h diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index cec2073..0de662f 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -3,10 +3,10 @@ check_PROGRAMS = hollow_iterator_test mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc +libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = 
-version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ -cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h +cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h mph_map_test_LDADD = libcxxmph.la mph_map_test_SOURCES = mph_map_test.cc diff --git a/cxxmph/MurmurHash2.h b/cxxmph/MurmurHash2.h deleted file mode 100644 index 0d318a3..0000000 --- a/cxxmph/MurmurHash2.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef __CXXMPH_MURMUR_HASH2__ -#define __CXXMPH_MURMUR_HASH2__ - -//----------------------------------------------------------------------------- -// MurmurHash2, by Austin Appleby - -// Note - This code makes a few assumptions about how your machine behaves - - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 - -// And it has a few limitations - - -// 1. It will not work incrementally. -// 2. It will not produce the same results on little-endian and big-endian -// machines. - -namespace cxxmph { - -inline // not measured, for making compilation easier only -unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) -{ - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. 
- - const unsigned int m = 0x5bd1e995; - const int r = 24; - - // Initialize the hash to a 'random' value - - unsigned int h = seed ^ len; - - // Mix 4 bytes at a time into the hash - - const unsigned char * data = (const unsigned char *)key; - - while(len >= 4) - { - unsigned int k = *(unsigned int *)data; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} - -} // namespace cxxmph - -#endif // __CXXMPH_MURMUR_HASH2__ diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index aaf12b9..eed12df 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -6,14 +6,16 @@ #include #include #include // std::hash -#include "MurmurHash2.h" +#include "MurmurHash3.h" #include "benchmark.h" namespace std { template <> struct hash { uint32_t operator()(const cxxmph::StringPiece& k) const { - return cxxmph::MurmurHash2(k.data(), k.length(), 1); + uint32_t out; + MurmurHash3_x86_32(k.data(), k.length(), 1, &out); + return out; } }; } // namespace std diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index d1cbc00..9345a11 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -47,7 +47,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { for (auto it = random_.begin(); it != random_.end(); ++it) { auto idx = index_.index(*it); // Collision check to be fair with STL - if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; + // if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; } } protected: diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index e381976..a90b7b2 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -91,9 +91,9 @@ int main(int argc, char** argv) { Benchmark::Register(new 
BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index ad5bc6e..7b54250 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -158,7 +158,8 @@ bool MPHIndex::Mapping( std::vector* edges, std::vector* queue) { TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { - uint32_t h[3]; + uint32_t h[4]; + // SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast(&h)); for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); uint32_t v0 = h[0] % r_; uint32_t v1 = h[1] % r_ + r_; @@ -201,7 +202,7 @@ uint32_t MPHIndex::index(const Key& key) const { // Simple wrapper around MPHIndex to simplify calling code. Please refer to the // MPHIndex class for documentation. 
-template >::hash_function> +template >::hash_function> class SimpleMPHIndex : public MPHIndex { public: template diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc index 70e01bc..b4101df 100644 --- a/cxxmph/mph_index_test.cc +++ b/cxxmph/mph_index_test.cc @@ -7,7 +7,7 @@ using std::string; using std::vector; -using cxxmph::SimpleMPHIndex; +using namespace cxxmph; int main(int argc, char** argv) { @@ -38,4 +38,3 @@ int main(int argc, char** argv) { SimpleMPHIndex other_mph_index; other_mph_index.deserialize(serialized); } - diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index ac77a06..fa264c8 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -6,6 +6,8 @@ // This class is about 20% to 100% slower than unordered_map (or ext/hash_map) // and should not be used if performance is a concern. In fact, you should only // use it for educational purposes. +// +// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl #include #include @@ -13,7 +15,6 @@ #include #include // for std::pair -#include "MurmurHash2.h" #include "mph_index.h" #include "hollow_iterator.h" diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index d079a57..e204d36 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -6,9 +6,12 @@ #include #include // for std::hash -#include "MurmurHash2.h" +#include "MurmurHash3.h" #include "stringpiece.h" +// From murmur, only used naively to extend 32 bits functions to 64 bits. 
+uint32_t fmix ( uint32_t h ); + namespace cxxmph { template @@ -17,72 +20,106 @@ struct seeded_hash_function { uint32_t operator()(const Key& k, uint32_t seed) const { return HashFcn()(k) ^ seed; } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + for (int i = 0; i < 4; ++i) { + out[i] = HashFcn()(k) ^ seed; + seed = fmix(seed); + } + } }; -struct Murmur2 { +struct Murmur3 { template uint32_t operator()(const Key& k) const { - return MurmurHash2(reinterpret_cast(&k), sizeof(Key), 1 /* seed */); + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, out); } }; -struct Murmur2StringPiece { + +struct Murmur3StringPiece { template uint32_t operator()(const Key& k) const { StringPiece s(k); - return MurmurHash2(s.data(), s.length(), 1 /* seed */); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), 1 /* seed */, out); } }; template <> -struct seeded_hash_function { +struct seeded_hash_function { template uint32_t operator()(const Key& k, uint32_t seed) const { - return MurmurHash2(reinterpret_cast(&k), sizeof(Key), seed); + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), seed, &out); + return out; + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), seed, out); } }; template <> -struct seeded_hash_function { +struct seeded_hash_function { template uint32_t operator()(const Key& k, uint32_t seed) const { StringPiece s(k); - return MurmurHash2(s.data(), s.length(), seed); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), seed, &out); + return out; + } + template + void 
hash64(const Key& k, uint32_t seed, uint32_t* out) const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), seed, out); } }; template struct seeded_hash { typedef seeded_hash_function hash_function; }; -// Use Murmur2 instead for all types defined in std::hash, plus +// Use Murmur3 instead for all types defined in std::hash, plus // std::string which is commonly extended. template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> 
struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; } // namespace cxxmph From ee75d9a62082aae9ddf011969ec69a24afc60b6b Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 12 Mar 2012 01:44:56 -0300 Subject: [PATCH 69/89] Reenabled benchmarks. --- cxxmph/bm_map.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 8334604..5c0f7a4 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -87,14 +87,12 @@ using namespace cxxmph; int main(int argc, char** argv) { srandom(4); - /* Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); -*/ Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); From 7b8b3e583476abbd6d33eed0f2f547f2c60225bb Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Tue, 13 Mar 2012 19:31:35 -0300 Subject: [PATCH 70/89] Use hash64. 
--- cxxmph/mph_index.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 7b54250..f2741ea 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -159,8 +159,8 @@ bool MPHIndex::Mapping( TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { uint32_t h[4]; - // SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast(&h)); - for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); + SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast(&h)); + // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); uint32_t v0 = h[0] % r_; uint32_t v1 = h[1] % r_ + r_; uint32_t v2 = h[2] % r_ + (r_ << 1); @@ -176,8 +176,9 @@ bool MPHIndex::Mapping( template uint32_t MPHIndex::perfect_hash(const Key& key) const { - uint32_t h[3]; - for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); + uint32_t h[4]; + SeededHashFcn().hash64(key, hash_seed_[0], reinterpret_cast(&h)); + // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); assert(r_); h[0] = h[0] % r_; h[1] = h[1] % r_ + r_; From bd9efab766c7346bae44f0b6acff5128faadfe2e Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Tue, 13 Mar 2012 19:34:03 -0300 Subject: [PATCH 71/89] Added Murmur3 support. Not necessarily faster. 
Conflicts: cxxmph/Makefile.am --- cxxmph/Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 2e57a18..f1129d4 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -3,10 +3,10 @@ check_PROGRAMS = mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -libcxxmph_la_SOURCES = MurmurHash2.h trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc +libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ -cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash2.h trigraph.h seeded_hash.h stringpiece.h +cxxmph_include_HEADERS = mph_map.h mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h mph_map_test_LDADD = libcxxmph.la mph_map_test_SOURCES = mph_map_test.cc From fd0bc2ae439fd1381317343a80c3a8e7e9a86fd9 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Tue, 13 Mar 2012 19:34:24 -0300 Subject: [PATCH 72/89] Added Murmur3 support. 
--- cxxmph/MurmurHash2.h | 74 ---------------------------------- cxxmph/bm_common.h | 6 ++- cxxmph/bm_index.cc | 2 +- cxxmph/bm_map.cc | 4 +- cxxmph/mph_index.h | 5 ++- cxxmph/mph_index_test.cc | 3 +- cxxmph/mph_map.h | 3 +- cxxmph/seeded_hash.h | 85 ++++++++++++++++++++++++++++------------ 8 files changed, 74 insertions(+), 108 deletions(-) delete mode 100644 cxxmph/MurmurHash2.h diff --git a/cxxmph/MurmurHash2.h b/cxxmph/MurmurHash2.h deleted file mode 100644 index 0d318a3..0000000 --- a/cxxmph/MurmurHash2.h +++ /dev/null @@ -1,74 +0,0 @@ -#ifndef __CXXMPH_MURMUR_HASH2__ -#define __CXXMPH_MURMUR_HASH2__ - -//----------------------------------------------------------------------------- -// MurmurHash2, by Austin Appleby - -// Note - This code makes a few assumptions about how your machine behaves - - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 - -// And it has a few limitations - - -// 1. It will not work incrementally. -// 2. It will not produce the same results on little-endian and big-endian -// machines. - -namespace cxxmph { - -inline // not measured, for making compilation easier only -unsigned int MurmurHash2 ( const void * key, int len, unsigned int seed ) -{ - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. 
- - const unsigned int m = 0x5bd1e995; - const int r = 24; - - // Initialize the hash to a 'random' value - - unsigned int h = seed ^ len; - - // Mix 4 bytes at a time into the hash - - const unsigned char * data = (const unsigned char *)key; - - while(len >= 4) - { - unsigned int k = *(unsigned int *)data; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. - - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} - -} // namespace cxxmph - -#endif // __CXXMPH_MURMUR_HASH2__ diff --git a/cxxmph/bm_common.h b/cxxmph/bm_common.h index aaf12b9..eed12df 100644 --- a/cxxmph/bm_common.h +++ b/cxxmph/bm_common.h @@ -6,14 +6,16 @@ #include #include #include // std::hash -#include "MurmurHash2.h" +#include "MurmurHash3.h" #include "benchmark.h" namespace std { template <> struct hash { uint32_t operator()(const cxxmph::StringPiece& k) const { - return cxxmph::MurmurHash2(k.data(), k.length(), 1); + uint32_t out; + MurmurHash3_x86_32(k.data(), k.length(), 1, &out); + return out; } }; } // namespace std diff --git a/cxxmph/bm_index.cc b/cxxmph/bm_index.cc index 924231c..443178f 100644 --- a/cxxmph/bm_index.cc +++ b/cxxmph/bm_index.cc @@ -47,7 +47,7 @@ class BM_MPHIndexSearch : public SearchUrlsBenchmark { for (auto it = random_.begin(); it != random_.end(); ++it) { auto idx = index_.index(*it); // Collision check to be fair with STL - if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; + // if (strcmp(urls_[idx].c_str(), it->data()) != 0) idx = -1; } } protected: diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 5c0f7a4..25ba463 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -90,9 +90,9 @@ int main(int argc, char** argv) { Benchmark::Register(new 
BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index d2e4a01..46d8ebe 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -157,7 +157,8 @@ bool MPHIndex::Mapping( std::vector* edges, std::vector* queue) { TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { - uint32_t h[3]; + uint32_t h[4]; + // SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast(&h)); for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); uint32_t v0 = h[0] % r_; uint32_t v1 = h[1] % r_ + r_; @@ -200,7 +201,7 @@ uint32_t MPHIndex::index(const Key& key) const { // Simple wrapper around MPHIndex to simplify calling code. Please refer to the // MPHIndex class for documentation. 
-template >::hash_function> +template >::hash_function> class SimpleMPHIndex : public MPHIndex { public: template diff --git a/cxxmph/mph_index_test.cc b/cxxmph/mph_index_test.cc index 7a7d036..f2482b7 100644 --- a/cxxmph/mph_index_test.cc +++ b/cxxmph/mph_index_test.cc @@ -7,7 +7,7 @@ using std::string; using std::vector; -using cxxmph::SimpleMPHIndex; +using namespace cxxmph; int main(int argc, char** argv) { @@ -38,4 +38,3 @@ int main(int argc, char** argv) { SimpleMPHIndex other_mph_index; other_mph_index.deserialize(serialized); } - diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 7541c45..405a7f9 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -6,13 +6,14 @@ // This class is about 20% to 100% slower than unordered_map (or ext/hash_map) // and should not be used if performance is a concern. In fact, you should only // use it for educational purposes. +// +// See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl #include #include #include #include // for std::pair -#include "MurmurHash2.h" #include "mph_index.h" namespace cxxmph { diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index d079a57..e204d36 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -6,9 +6,12 @@ #include #include // for std::hash -#include "MurmurHash2.h" +#include "MurmurHash3.h" #include "stringpiece.h" +// From murmur, only used naively to extend 32 bits functions to 64 bits. 
+uint32_t fmix ( uint32_t h ); + namespace cxxmph { template @@ -17,72 +20,106 @@ struct seeded_hash_function { uint32_t operator()(const Key& k, uint32_t seed) const { return HashFcn()(k) ^ seed; } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + for (int i = 0; i < 4; ++i) { + out[i] = HashFcn()(k) ^ seed; + seed = fmix(seed); + } + } }; -struct Murmur2 { +struct Murmur3 { template uint32_t operator()(const Key& k) const { - return MurmurHash2(reinterpret_cast(&k), sizeof(Key), 1 /* seed */); + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), 1 /* seed */, out); } }; -struct Murmur2StringPiece { + +struct Murmur3StringPiece { template uint32_t operator()(const Key& k) const { StringPiece s(k); - return MurmurHash2(s.data(), s.length(), 1 /* seed */); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), 1 /* seed */, &out); + return out; + } + template + void hash64(const Key& k, uint32_t* out) const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), 1 /* seed */, out); } }; template <> -struct seeded_hash_function { +struct seeded_hash_function { template uint32_t operator()(const Key& k, uint32_t seed) const { - return MurmurHash2(reinterpret_cast(&k), sizeof(Key), seed); + uint32_t out; + MurmurHash3_x86_32(reinterpret_cast(&k), sizeof(Key), seed, &out); + return out; + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + MurmurHash3_x64_128(reinterpret_cast(&k), sizeof(Key), seed, out); } }; template <> -struct seeded_hash_function { +struct seeded_hash_function { template uint32_t operator()(const Key& k, uint32_t seed) const { StringPiece s(k); - return MurmurHash2(s.data(), s.length(), seed); + uint32_t out; + MurmurHash3_x86_32(s.data(), s.length(), seed, &out); + return out; + } + template + void 
hash64(const Key& k, uint32_t seed, uint32_t* out) const { + StringPiece s(k); + MurmurHash3_x64_128(s.data(), s.length(), seed, out); } }; template struct seeded_hash { typedef seeded_hash_function hash_function; }; -// Use Murmur2 instead for all types defined in std::hash, plus +// Use Murmur3 instead for all types defined in std::hash, plus // std::string which is commonly extended. template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> 
struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; } // namespace cxxmph From 498884327a0ca80db86953038f5af1f0d5d4eefb Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Tue, 13 Mar 2012 19:31:35 -0300 Subject: [PATCH 73/89] Use hash64. --- cxxmph/mph_index.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 46d8ebe..9970943 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -158,8 +158,8 @@ bool MPHIndex::Mapping( TriGraph graph(n_, m_); for (ForwardIterator it = begin; it != end; ++it) { uint32_t h[4]; - // SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast(&h)); - for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); + SeededHashFcn().hash64(*it, hash_seed_[0], reinterpret_cast(&h)); + // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(*it, hash_seed_[i]); uint32_t v0 = h[0] % r_; uint32_t v1 = h[1] % r_ + r_; uint32_t v2 = h[2] % r_ + (r_ << 1); @@ -175,8 +175,9 @@ bool MPHIndex::Mapping( template uint32_t MPHIndex::perfect_hash(const Key& key) const { - uint32_t h[3]; - for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); + uint32_t h[4]; + SeededHashFcn().hash64(key, hash_seed_[0], reinterpret_cast(&h)); + // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); assert(r_); h[0] = h[0] % r_; h[1] = h[1] % r_ + r_; From aa5fa26b49d324b8dfe4c6e46362a7e8640eabcc Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Tue, 13 Mar 2012 20:25:06 -0300 Subject: [PATCH 74/89] Strange optimizations for 64 bit integers. 
--- cxxmph/bm_map.cc | 2 ++ cxxmph/seeded_hash.h | 34 +++++++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index a90b7b2..8195217 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -88,12 +88,14 @@ using namespace cxxmph; int main(int argc, char** argv) { srandom(4); + /* Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + */ Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index e204d36..f0bab05 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -9,8 +9,9 @@ #include "MurmurHash3.h" #include "stringpiece.h" -// From murmur, only used naively to extend 32 bits functions to 64 bits. +// From murmur, only used naively to extend 32 bits functions to 128 bits. 
uint32_t fmix ( uint32_t h ); +uint64_t fmix ( uint64_t h ); namespace cxxmph { @@ -57,6 +58,19 @@ struct Murmur3StringPiece { } }; +struct Murmur3Fmix64bitsType { + template + uint32_t operator()(const Key& k) const { + return fmix(*reinterpret_cast(&k)); + } + template + void hash64(const Key& k, uint32_t* out) const { + uint64_t h = fmix(*reinterpret_cast(&k)); + *reinterpret_cast(out) = h; + *reinterpret_cast(out + 2) = h; + } +}; + template <> struct seeded_hash_function { template @@ -87,6 +101,20 @@ struct seeded_hash_function { } }; +template <> +struct seeded_hash_function { + template + uint32_t operator()(const Key& k, uint32_t seed) const { + return fmix(k + seed); + } + template + void hash64(const Key& k, uint32_t seed, uint32_t* out) const { + *reinterpret_cast(out) = fmix(k ^ seed); + *(out + 2) = fmix(*out); + } +}; + + template struct seeded_hash { typedef seeded_hash_function hash_function; }; // Use Murmur3 instead for all types defined in std::hash, plus @@ -117,9 +145,9 @@ template <> struct seeded_hash > template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; } // namespace cxxmph From 86797b6402c37c9285cdcc4f588320bd9a9a90f5 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 01:29:13 -0300 Subject: [PATCH 75/89] Finally beat STL. Trying improvement around cuckoo hashing idea. 
--- cxxmph/bm_map.cc | 7 +++--- cxxmph/mph_index.h | 43 +++++++++++++++++++++++++++++++---- cxxmph/mph_map.h | 54 +++++++++++++++++++++++++++++++++----------- cxxmph/seeded_hash.h | 9 ++++---- 4 files changed, 87 insertions(+), 26 deletions(-) diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 8195217..0a0b225 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -49,6 +49,7 @@ class BM_SearchUrls : public SearchUrlsBenchmark { mymap_[*it] = *it; } mymap_.rehash(mymap_.bucket_count()); + fprintf(stderr, "Occupation: %f\n", static_cast(mymap_.size())/mymap_.bucket_count()); return true; } MapType mymap_; @@ -57,7 +58,7 @@ class BM_SearchUrls : public SearchUrlsBenchmark { template class BM_SearchUint64 : public SearchUint64Benchmark { public: - BM_SearchUint64() : SearchUint64Benchmark(10000, 10*1000*1000) { } + BM_SearchUint64() : SearchUint64Benchmark(100000, 10*1000*1000) { } virtual bool SetUp() { if (!SearchUint64Benchmark::SetUp()) return false; for (int i = 0; i < values_.size(); ++i) { @@ -88,15 +89,13 @@ using namespace cxxmph; int main(int argc, char** argv) { srandom(4); - /* Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); - */ - Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); + Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); } diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index f2741ea..45390a4 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -25,6 +25,7 @@ #include #include +#include #include #include // for std::hash #include @@ -63,6 +64,12 @@ class MPHIndex { template // must agree with Reset uint32_t minimal_perfect_hash(const 
Key& x) const; + // Crazy functions. Ignore. + template // must agree with Reset + uint32_t cuckoo_hash(const Key& x, const uint32_t* h, uint8_t nest) const; + template // must agree with Reset + void hash_vector(const Key& x, uint32_t* h) const; + // Serialization for mmap usage - not tested well, ping me if you care. // Serialized tables are not guaranteed to work across versions or different // endianness (although they could easily be made to be). @@ -94,6 +101,8 @@ class MPHIndex { // Partition vertex count, derived from c parameter. uint32_t r_; + uint32_t nest_displacement_[3]; // derived from r_ + // The array containing the minimal perfect hash function graph. Do not use // c++ vector to make mmap based backing easier. const uint8_t* g_; @@ -118,6 +127,16 @@ class MPHIndex { }; +template +T nexthigher(T k) { + if (k == 0) + return 1; + k--; + for (int i=1; i> i; + return k+1; +} + // Template method needs to go in the header file. template bool MPHIndex::Reset( @@ -129,6 +148,13 @@ bool MPHIndex::Reset( m_ = size; r_ = static_cast(ceil((c_*m_)/3)); if ((r_ % 2) == 0) r_ += 1; + nest_displacement_[0] = 0; + nest_displacement_[1] = r_; + nest_displacement_[2] = (r_ << 1); + // This can be used to speed mods, but increases occupation too much. 
+ // Needs to try http://gmplib.org/manual/Integer-Exponentiation.html instead + // r_ = nexthigher(r_); + n_ = 3*r_; k_ = 1U << b_; @@ -174,15 +200,24 @@ bool MPHIndex::Mapping( return false; } +template +uint32_t MPHIndex::cuckoo_hash(const Key& key, const uint32_t* h, uint8_t nest) const { + return (h[nest] % r_) + nest_displacement_[nest]; +} + +template +void MPHIndex::hash_vector(const Key& key, uint32_t* h) const { + SeededHashFcn().hash64(key, hash_seed_[0], reinterpret_cast(&h)); +} + template uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t h[4]; SeededHashFcn().hash64(key, hash_seed_[0], reinterpret_cast(&h)); // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); - assert(r_); - h[0] = h[0] % r_; - h[1] = h[1] % r_ + r_; - h[2] = h[2] % r_ + (r_ << 1); + h[0] = (h[0] % r_) + nest_displacement_[0]; + h[1] = (h[1] % r_) + nest_displacement_[1]; + h[2] = (h[2] % r_) + nest_displacement_[2]; assert(g_size_); // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; assert((h[0] >> 2) >(&values_, &present_, it); } + static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { + d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); + } + static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { + return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); + } + void pack(); std::vector values_; std::vector present_; + const uint8_t* nests_; SimpleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to no hold a copy of the key typedef unordered_map slack_type; @@ -187,28 +195,48 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { } MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { - if (__builtin_expect(!slack_.empty(), 0)) { - auto it = slack_.find(k); - if (it != slack_.end()) return make_iterator(values_.begin() + it->second); + uint32_t h[4]; + auto nest = nests_[index_.hash_vector(k, reinterpret_cast(&h))]; + if (nest != kNestCollision) { + auto vit = values_.begin() 
+ h[nest]; + if (equal_(k, vit->first)) return make_iterator(vit); } - if (__builtin_expect(index_.size() == 0, 0)) return end(); + return slow_find(k); +} + +MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k) const { auto id = index_.perfect_hash(k); if (!present_[id]) return end(); - auto it = make_iterator(values_.begin() + id); - if (__builtin_expect(equal_(k, it->first), 1)) return it; + auto vit = values_.begin() + id; + if (equal_(k, vit->first)) return make_iterator(vit); + + if (__builtin_expect(!slack_.empty(), 0)) { + auto sit = slack_.find(k); + if (it != slack_.end()) return make_iterator(values_.begin() + sit->second); + } return end(); } MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { - if (__builtin_expect(!slack_.empty(), 0)) { - auto it = slack_.find(k); - if (it != slack_.end()) return make_iterator(values_.begin() + it->second); + uint32_t h[4]; + auto nest = nests_[index_.hash_vector(k, reinterpret_cast(&h))]; + if (nest != kNestCollision) { + auto vit = values_.begin() + h[nest]; + if (equal_(k, vit->first)) return make_iterator(vit); } - if (__builtin_expect(index_.size() == 0, 0)) return end(); + return slow_find(k); +} + +MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k) { auto id = index_.perfect_hash(k); if (!present_[id]) return end(); - auto it = make_iterator(values_.begin() + id); - if (__builtin_expect(equal_(k, it->first), 1)) return it; + auto vit = values_.begin() + id; + if (equal_(k, vit->first)) return make_iterator(vit); + + if (__builtin_expect(!slack_.empty(), 0)) { + auto sit = slack_.find(k); + if (it != slack_.end()) return make_iterator(values_.begin() + sit->second); + } return end(); } diff --git a/cxxmph/seeded_hash.h b/cxxmph/seeded_hash.h index f0bab05..69cb0ac 100644 --- a/cxxmph/seeded_hash.h +++ b/cxxmph/seeded_hash.h @@ -65,9 +65,8 @@ struct Murmur3Fmix64bitsType { } template void hash64(const Key& k, uint32_t* out) const { - uint64_t h = fmix(*reinterpret_cast(&k)); - 
*reinterpret_cast(out) = h; - *reinterpret_cast(out + 2) = h; + *reinterpret_cast(out) = fmix(k); + *(out + 2) = fmix(*out); } }; @@ -145,9 +144,9 @@ template <> struct seeded_hash > template <> struct seeded_hash > { typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; template <> struct seeded_hash > -{ typedef seeded_hash_function hash_function; }; +{ typedef seeded_hash_function hash_function; }; } // namespace cxxmph From a4d96e6cb26fead353d2b8afc110c154291b10c7 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 04:51:55 -0300 Subject: [PATCH 76/89] Tests pass, but it segfaults at the benchmark. Need further investigation, but the core for the cuckoo stuff is already there. --- cxxmph/Makefile.am | 2 +- cxxmph/bm_map.cc | 2 ++ cxxmph/mph_bits.cc | 4 +++ cxxmph/mph_bits.h | 18 ++++++++++ cxxmph/mph_index.h | 54 ++++++++++++++--------------- cxxmph/mph_map.h | 77 ++++++++++++++++++++++++++++++------------ cxxmph/mph_map_test.cc | 4 +++ 7 files changed, 111 insertions(+), 50 deletions(-) create mode 100644 cxxmph/mph_bits.cc create mode 100644 cxxmph/mph_bits.h diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 0de662f..22c0bb2 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -3,7 +3,7 @@ check_PROGRAMS = hollow_iterator_test mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la -libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc +libcxxmph_la_SOURCES = MurmurHash3.h MurmurHash3.cpp trigragh.h trigraph.cc mph_index.h mph_index.cc seeded_hash.h stringpiece.h benchmark.h benchmark.cc mph_bits.h mph_bits.cc libcxxmph_la_LDFLAGS = -version-info 0:0:0 cxxmph_includedir = $(includedir)/cxxmph/ cxxmph_include_HEADERS = mph_map.h 
mph_index.h MurmurHash3.h trigraph.h seeded_hash.h stringpiece.h hollow_iterator.h diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 0a0b225..44e3fe7 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -89,10 +89,12 @@ using namespace cxxmph; int main(int argc, char** argv) { srandom(4); + /* Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); + */ Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUint64>); diff --git a/cxxmph/mph_bits.cc b/cxxmph/mph_bits.cc new file mode 100644 index 0000000..9fb97bd --- /dev/null +++ b/cxxmph/mph_bits.cc @@ -0,0 +1,4 @@ +#include "mph_bits.h" + +namespace cxxmph { +} diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h new file mode 100644 index 0000000..03f7c08 --- /dev/null +++ b/cxxmph/mph_bits.h @@ -0,0 +1,18 @@ +#ifndef __CXXMPH_MPH_BITS_H__ +#define __CXXMPH_MPH_BITS_H__ + +#include // for uint32_t and friends + +namespace cxxmph { + +static const uint8_t valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; +static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { + d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); +} +static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { + return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); +} + +} // namespace cxxmph + +#endif diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 45390a4..ad1d7f6 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -36,6 +36,7 @@ using std::cerr; using std::endl; #include "seeded_hash.h" +#include "mph_bits.h" #include "trigraph.h" namespace cxxmph { @@ -43,7 +44,7 @@ namespace cxxmph { class MPHIndex { public: MPHIndex(double c = 1.23, uint8_t b = 7) : - c_(c), b_(b), m_(0), n_(0), k_(0), r_(0), + c_(c), 
b_(b), m_(0), n_(0), k_(0), r_(1), g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), deserialized_(false) { } ~MPHIndex(); @@ -65,8 +66,12 @@ class MPHIndex { uint32_t minimal_perfect_hash(const Key& x) const; // Crazy functions. Ignore. + template // must agree with Reset + uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const; template // must agree with Reset - uint32_t cuckoo_hash(const Key& x, const uint32_t* h, uint8_t nest) const; + uint8_t cuckoo_nest(const Key& x, const uint32_t* h) const; + template // must agree with Reset + uint32_t cuckoo_nest_index(const Key& x, uint32_t* h) const; template // must agree with Reset void hash_vector(const Key& x, uint32_t* h) const; @@ -117,26 +122,8 @@ class MPHIndex { bool deserialized_; static const uint8_t valuemask[]; - static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { - d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); - } - static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { - return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); - } - - }; -template -T nexthigher(T k) { - if (k == 0) - return 1; - k--; - for (int i=1; i> i; - return k+1; -} - // Template method needs to go in the header file. template bool MPHIndex::Reset( @@ -153,7 +140,7 @@ bool MPHIndex::Reset( nest_displacement_[2] = (r_ << 1); // This can be used to speed mods, but increases occupation too much. 
// Needs to try http://gmplib.org/manual/Integer-Exponentiation.html instead - // r_ = nexthigher(r_); + // r_ = nextpoweroftwo(r_); n_ = 3*r_; k_ = 1U << b_; @@ -200,30 +187,40 @@ bool MPHIndex::Mapping( return false; } -template -uint32_t MPHIndex::cuckoo_hash(const Key& key, const uint32_t* h, uint8_t nest) const { +template +uint32_t MPHIndex::cuckoo_hash(const uint32_t* h, uint8_t nest) const { return (h[nest] % r_) + nest_displacement_[nest]; } template void MPHIndex::hash_vector(const Key& key, uint32_t* h) const { - SeededHashFcn().hash64(key, hash_seed_[0], reinterpret_cast(&h)); + SeededHashFcn().hash64(key, hash_seed_[0], h); +} + +template +uint8_t MPHIndex::cuckoo_nest(const Key& key, const uint32_t* h) const { + uint32_t x[4]; + x[0] = (h[0] % r_) + nest_displacement_[0]; + x[1] = (h[1] % r_) + nest_displacement_[1]; + x[2] = (h[2] % r_) + nest_displacement_[2]; + return (get_2bit_value(g_, x[0]) + get_2bit_value(g_, x[1]) + get_2bit_value(g_, x[2])) % 3; } template uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t h[4]; - SeededHashFcn().hash64(key, hash_seed_[0], reinterpret_cast(&h)); + SeededHashFcn().hash64(key, hash_seed_[0], h); // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); h[0] = (h[0] % r_) + nest_displacement_[0]; h[1] = (h[1] % r_) + nest_displacement_[1]; h[2] = (h[2] % r_) + nest_displacement_[2]; - assert(g_size_); + if (!g_size_) return 0; // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; assert((h[0] >> 2) > 2) > 2) @@ -248,6 +245,9 @@ class SimpleMPHIndex : public MPHIndex { uint32_t index(const Key& key) const { return MPHIndex::index(key); } uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash(key); } uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash(key); } + uint8_t cuckoo_nest(const Key& key, const uint32_t* h) const { return MPHIndex::cuckoo_nest(key, h); } + uint32_t cuckoo_hash(const uint32_t* h, 
uint8_t nest) const { return MPHIndex::cuckoo_hash(h, nest); } + void hash_vector(const Key& key, uint32_t* h) const { MPHIndex::hash_vector(key, h); } }; } // namespace cxxmph diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 66822ad..25fecab 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -9,8 +9,9 @@ // // See http://www.strchr.com/crc32_popcnt and new Murmur3 function to try to beat stl -#include #include +#include +#include #include #include #include // for std::pair @@ -100,17 +101,19 @@ class mph_map { return hollow_const_iterator>(&values_, &present_, it); } - static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { - d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); - } - static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { - return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); + iterator slow_find(const key_type& k); + const_iterator slow_find(const key_type& k) const; + static const uint8_t kNestCollision = 3; // biggest 2 bit value + uint32_t nest_index(const key_type& k, uint32_t* h) const { + index_.hash_vector(k, h); + // Use a pivot to prevent branch in the fast path + return h[3] % (index_.perfect_hash_size() + 1); } void pack(); std::vector values_; std::vector present_; - const uint8_t* nests_; + std::vector nests_; SimpleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to no hold a copy of the key typedef unordered_map slack_type; @@ -124,6 +127,7 @@ bool operator==(const MPH_MAP_CLASS_SPEC& lhs, const MPH_MAP_CLASS_SPEC& rhs) { } MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) { + clear(); pack(); } @@ -140,6 +144,10 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { } values_.push_back(x); present_.push_back(true); + nests_.resize(ceil(values_.size() / 2.0), std::numeric_limits::max()); + uint32_t h[4]; + auto index = nest_index(x.first, h); + set_2bit_value(&(nests_[0]), index, kNestCollision); ++size_; slack_.insert(make_pair(x.first, values_.size() - 1)); if 
(should_pack) pack(); @@ -157,14 +165,28 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { new_values.reserve(new_values.size() * 2); std::vector new_present(index_.perfect_hash_size(), false); new_present.reserve(new_present.size() * 2); + std::vector new_nests(ceil(index_.perfect_hash_size() / 2.0), std::numeric_limits::max()); + new_nests.reserve(new_nests.size() * 2); + vector used_nests(new_nests.size() * 2); for (iterator it = begin(), it_end = end(); it != it_end; ++it) { size_type id = index_.perfect_hash(it->first); assert(id < new_values.size()); new_values[id] = *it; new_present[id] = true; + uint32_t h[4]; + uint32_t index = nest_index(it->first, h); + if (used_nests[index]) { + set_2bit_value(&(new_nests[0]), index, kNestCollision); + } + else { + set_2bit_value(&(new_nests[0]), index, index_.cuckoo_nest(it->first, h)); + assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, index_.cuckoo_nest(it->first, h))); + used_nests[index] = true; + } } values_.swap(new_values); present_.swap(new_present); + nests_.swap(new_nests); slack_type().swap(slack_); } @@ -180,11 +202,15 @@ MPH_MAP_METHOD_DECL(void_type, clear)() { present_.clear(); slack_.clear(); index_.clear(); + nests_.clear(); + nests_.push_back(std::numeric_limits::max()); size_ = 0; } MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { present_[pos - begin] = false; + uint32_t h[4]; + nests_[nest_index(pos->first, h)] = kNestCollision; *pos = value_type(); --size_; } @@ -196,7 +222,7 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { uint32_t h[4]; - auto nest = nests_[index_.hash_vector(k, reinterpret_cast(&h))]; + auto nest = get_2bit_value(&(nests_[0]), nest_index(k, h)); if (nest != kNestCollision) { auto vit = values_.begin() + h[nest]; if (equal_(k, vit->first)) return make_iterator(vit); @@ -205,37 +231,44 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { } 
MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k) const { - auto id = index_.perfect_hash(k); - if (!present_[id]) return end(); - auto vit = values_.begin() + id; - if (equal_(k, vit->first)) return make_iterator(vit); - + if (index_.perfect_hash_size()) { + auto id = index_.perfect_hash(k); + if (present_[id]) { + auto vit = values_.begin() + id; + if (equal_(k, vit->first)) return make_iterator(vit); + } + } if (__builtin_expect(!slack_.empty(), 0)) { auto sit = slack_.find(k); - if (it != slack_.end()) return make_iterator(values_.begin() + sit->second); + if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); } return end(); } MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { uint32_t h[4]; - auto nest = nests_[index_.hash_vector(k, reinterpret_cast(&h))]; + auto index = nest_index(k, h); + assert(nests_.size()); + assert(nests_.size() > index / 2); + auto nest = get_2bit_value(&(nests_[0]), index); if (nest != kNestCollision) { - auto vit = values_.begin() + h[nest]; + auto vit = values_.begin() + index_.cuckoo_hash(h, nest); if (equal_(k, vit->first)) return make_iterator(vit); } return slow_find(k); } MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k) { - auto id = index_.perfect_hash(k); - if (!present_[id]) return end(); - auto vit = values_.begin() + id; - if (equal_(k, vit->first)) return make_iterator(vit); - + if (index_.perfect_hash_size()) { + auto id = index_.perfect_hash(k); + if (present_[id]) { + auto vit = values_.begin() + id; + if (equal_(k, vit->first)) return make_iterator(vit); + } + } if (__builtin_expect(!slack_.empty(), 0)) { auto sit = slack_.find(k); - if (it != slack_.end()) return make_iterator(values_.begin() + sit->second); + if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); } return end(); } diff --git a/cxxmph/mph_map_test.cc b/cxxmph/mph_map_test.cc index 11bfbc9..ada71b3 100644 --- a/cxxmph/mph_map_test.cc +++ b/cxxmph/mph_map_test.cc @@ -17,6 
+17,10 @@ int main(int argc, char** argv) { } for (int i = 0; i < num_keys; ++i) { auto it = b.find(i); + if (it == b.end()) { + std::cerr << "Failed to find " << i << std::endl; + exit(-1); + } if (it->first != it->second || it->first != i) { std::cerr << "Found " << it->first << " looking for " << i << std::endl; exit(-1); From 687cc1b194742ad7f8fcfedab4a07a5f3bdae8a7 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 11:58:37 -0300 Subject: [PATCH 77/89] Added cuckoo stuff, uint64 became slower again. --- cxxmph/bm_map.cc | 2 - cxxmph/mph_bits.h | 7 ++++ cxxmph/mph_index.h | 16 +++++--- cxxmph/mph_map.h | 98 +++++++++++++++++++++++++++------------------- 4 files changed, 74 insertions(+), 49 deletions(-) diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 44e3fe7..0a0b225 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -89,12 +89,10 @@ using namespace cxxmph; int main(int argc, char** argv) { srandom(4); - /* Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); - */ Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUint64>); diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h index 03f7c08..6de8168 100644 --- a/cxxmph/mph_bits.h +++ b/cxxmph/mph_bits.h @@ -2,6 +2,7 @@ #define __CXXMPH_MPH_BITS_H__ #include // for uint32_t and friends +#include namespace cxxmph { @@ -12,6 +13,12 @@ static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); } +static uint32_t nextpoweroftwo(uint32_t k) { + if (k == 0) return 1; + k--; + for (int i=1; i> i; + return k+1; +} } // namespace cxxmph diff --git 
a/cxxmph/mph_index.h b/cxxmph/mph_index.h index ad1d7f6..deccf22 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -68,8 +68,8 @@ class MPHIndex { // Crazy functions. Ignore. template // must agree with Reset uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const; - template // must agree with Reset - uint8_t cuckoo_nest(const Key& x, const uint32_t* h) const; + template // must agree with Reset + uint8_t cuckoo_nest(const uint32_t* h) const; template // must agree with Reset uint32_t cuckoo_nest_index(const Key& x, uint32_t* h) const; template // must agree with Reset @@ -197,24 +197,28 @@ void MPHIndex::hash_vector(const Key& key, uint32_t* h) const { SeededHashFcn().hash64(key, hash_seed_[0], h); } -template -uint8_t MPHIndex::cuckoo_nest(const Key& key, const uint32_t* h) const { +template // must agree with Reset +uint8_t MPHIndex::cuckoo_nest(const uint32_t* h) const { uint32_t x[4]; + if (!g_size_) return 0; x[0] = (h[0] % r_) + nest_displacement_[0]; x[1] = (h[1] % r_) + nest_displacement_[1]; x[2] = (h[2] % r_) + nest_displacement_[2]; + assert((x[0] >> 2) > 2) > 2) uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t h[4]; + if (!g_size_) return 0; SeededHashFcn().hash64(key, hash_seed_[0], h); // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); h[0] = (h[0] % r_) + nest_displacement_[0]; h[1] = (h[1] % r_) + nest_displacement_[1]; h[2] = (h[2] % r_) + nest_displacement_[2]; - if (!g_size_) return 0; // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; assert((h[0] >> 2) > 2) (key); } uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash(key); } uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash(key); } - uint8_t cuckoo_nest(const Key& key, const uint32_t* h) const { return MPHIndex::cuckoo_nest(key, h); } + uint8_t cuckoo_nest(const uint32_t* h) const { return MPHIndex::cuckoo_nest(h); } uint32_t cuckoo_hash(const 
uint32_t* h, uint8_t nest) const { return MPHIndex::cuckoo_hash(h, nest); } void hash_vector(const Key& key, uint32_t* h) const { MPHIndex::hash_vector(key, h); } }; diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 25fecab..687315e 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include // for std::pair @@ -101,13 +102,19 @@ class mph_map { return hollow_const_iterator>(&values_, &present_, it); } - iterator slow_find(const key_type& k); - const_iterator slow_find(const key_type& k) const; + iterator slow_find(const key_type& k, uint32_t perfect_hash); + const_iterator slow_find(const key_type& k, uint32_t perfect_hash) const; static const uint8_t kNestCollision = 3; // biggest 2 bit value - uint32_t nest_index(const key_type& k, uint32_t* h) const { - index_.hash_vector(k, h); - // Use a pivot to prevent branch in the fast path - return h[3] % (index_.perfect_hash_size() + 1); + void set_nest_value(const uint32_t* h, uint8_t value) { + assert(get_nest_index(h) < nests_.size() * 4); + set_2bit_value(&(nests_[0]), get_nest_index(h), value); + } + uint32_t get_nest_value(const uint32_t* h) const { + assert(get_nest_index(h) < nests_.size() * 4); + return get_2bit_value(&(nests_[0]), get_nest_index(h)); + } + uint32_t get_nest_index(const uint32_t* h) const { + return h[3] & ((nests_.size() << 2) - 1); } void pack(); @@ -144,10 +151,11 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { } values_.push_back(x); present_.push_back(true); - nests_.resize(ceil(values_.size() / 2.0), std::numeric_limits::max()); + auto nests_size = nextpoweroftwo(ceil(values_.size() / 4.0) + 1)*10; + nests_.resize(nests_size, std::numeric_limits::max()); uint32_t h[4]; - auto index = nest_index(x.first, h); - set_2bit_value(&(nests_[0]), index, kNestCollision); + index_.hash_vector(x.first, h); + set_nest_value(h, kNestCollision); ++size_; slack_.insert(make_pair(x.first, values_.size() - 1)); 
if (should_pack) pack(); @@ -157,6 +165,7 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { MPH_MAP_METHOD_DECL(void_type, pack)() { if (values_.empty()) return; + assert(std::unordered_set(make_iterator_first(begin()), make_iterator_first(end())).size() == size()); bool success = index_.Reset( make_iterator_first(begin()), make_iterator_first(end()), size_); @@ -165,28 +174,33 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { new_values.reserve(new_values.size() * 2); std::vector new_present(index_.perfect_hash_size(), false); new_present.reserve(new_present.size() * 2); - std::vector new_nests(ceil(index_.perfect_hash_size() / 2.0), std::numeric_limits::max()); + auto new_nests_size = nextpoweroftwo(ceil(new_values.size() / 4.0) + 1)*10; + std::vector new_nests(new_nests_size, std::numeric_limits::max()); new_nests.reserve(new_nests.size() * 2); - vector used_nests(new_nests.size() * 2); + nests_.swap(new_nests); + vector used_nests(nests_.size() * 4); + uint32_t collisions = 0; for (iterator it = begin(), it_end = end(); it != it_end; ++it) { size_type id = index_.perfect_hash(it->first); assert(id < new_values.size()); new_values[id] = *it; new_present[id] = true; uint32_t h[4]; - uint32_t index = nest_index(it->first, h); - if (used_nests[index]) { - set_2bit_value(&(new_nests[0]), index, kNestCollision); - } - else { - set_2bit_value(&(new_nests[0]), index, index_.cuckoo_nest(it->first, h)); - assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, index_.cuckoo_nest(it->first, h))); - used_nests[index] = true; + index_.hash_vector(it->first, h); + // fprintf(stderr, "Nest index: %d\n", get_nest_index(h)); + assert(used_nests.size() > get_nest_index(h)); + if (used_nests[get_nest_index(h)]) { + set_nest_value(h, kNestCollision); + ++collisions; + } else { + set_nest_value(h, index_.cuckoo_nest(h)); + assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, index_.cuckoo_nest(h))); + used_nests[get_nest_index(h)] = true; } } + 
fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size()); values_.swap(new_values); present_.swap(new_present); - nests_.swap(new_nests); slack_type().swap(slack_); } @@ -210,7 +224,8 @@ MPH_MAP_METHOD_DECL(void_type, clear)() { MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { present_[pos - begin] = false; uint32_t h[4]; - nests_[nest_index(pos->first, h)] = kNestCollision; + index_.hash_vector(pos->first, &h); + nests_[get_nest_index(h)] = kNestCollision; *pos = value_type(); --size_; } @@ -222,19 +237,21 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { uint32_t h[4]; - auto nest = get_2bit_value(&(nests_[0]), nest_index(k, h)); - if (nest != kNestCollision) { - auto vit = values_.begin() + h[nest]; + index_.hash_vector(k, h); + auto nest = get_nest_value(h); + if (__builtin_expect(nest != kNestCollision, 1)) { + auto vit = values_.begin() + index_.cuckoo_hash(h, nest); if (equal_(k, vit->first)) return make_iterator(vit); } - return slow_find(k); + nest = index_.cuckoo_nest(h); + assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest)); + return slow_find(k, index_.cuckoo_hash(h, nest)); } -MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k) const { - if (index_.perfect_hash_size()) { - auto id = index_.perfect_hash(k); - if (present_[id]) { - auto vit = values_.begin() + id; +MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfect_hash) const { + if (__builtin_expect(index_.perfect_hash_size(), 0)) { + if (__builtin_expect(present_[perfect_hash], true)) { + auto vit = values_.begin() + perfect_hash; if (equal_(k, vit->first)) return make_iterator(vit); } } @@ -247,22 +264,21 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k) const { MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { uint32_t h[4]; - auto index = nest_index(k, h); - assert(nests_.size()); - assert(nests_.size() > index / 2); - auto 
nest = get_2bit_value(&(nests_[0]), index); - if (nest != kNestCollision) { + index_.hash_vector(k, h); + auto nest = get_nest_value(h); + if (__builtin_expect(nest != kNestCollision, 1)) { auto vit = values_.begin() + index_.cuckoo_hash(h, nest); if (equal_(k, vit->first)) return make_iterator(vit); } - return slow_find(k); + nest = index_.cuckoo_nest(h); + // assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest)); + return slow_find(k, index_.cuckoo_hash(h, nest)); } -MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k) { - if (index_.perfect_hash_size()) { - auto id = index_.perfect_hash(k); - if (present_[id]) { - auto vit = values_.begin() + id; +MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) { + if (__builtin_expect(index_.perfect_hash_size(), 0)) { + if (__builtin_expect(present_[perfect_hash], true)) { + auto vit = values_.begin() + perfect_hash; if (equal_(k, vit->first)) return make_iterator(vit); } } From 9c4bb27dc426e95e7e2923dc7fe132ddbf3895f0 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 12:07:08 -0300 Subject: [PATCH 78/89] Disabled cuckoo stuff to beat STL again. 
--- cxxmph/mph_map.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 687315e..37cd2d1 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -236,6 +236,8 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { } MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { + return slow_find(k, index_.perfect_hash(k)); + /* uint32_t h[4]; index_.hash_vector(k, h); auto nest = get_nest_value(h); @@ -246,6 +248,7 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { nest = index_.cuckoo_nest(h); assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest)); return slow_find(k, index_.cuckoo_hash(h, nest)); + */ } MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfect_hash) const { @@ -263,6 +266,8 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfe } MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { + return slow_find(k, index_.perfect_hash(k)); + /* uint32_t h[4]; index_.hash_vector(k, h); auto nest = get_nest_value(h); @@ -273,6 +278,7 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { nest = index_.cuckoo_nest(h); // assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest)); return slow_find(k, index_.cuckoo_hash(h, nest)); + */ } MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) { From b63f6182045af009a150a471c295cbe6e4ad2ae4 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 12:40:50 -0300 Subject: [PATCH 79/89] bit methods need tests. 
--- cxxmph/mph_map.h | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 37cd2d1..6886cb3 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -106,8 +106,12 @@ class mph_map { const_iterator slow_find(const key_type& k, uint32_t perfect_hash) const; static const uint8_t kNestCollision = 3; // biggest 2 bit value void set_nest_value(const uint32_t* h, uint8_t value) { + auto index = get_nest_index(h); assert(get_nest_index(h) < nests_.size() * 4); - set_2bit_value(&(nests_[0]), get_nest_index(h), value); + assert(get_nest_index(h) >> 2 < nests_.size()); + assert(value < 4); + set_2bit_value(&nests_[0], index, value); + assert(get_2bit_value(&nests_[0], index) == value); } uint32_t get_nest_value(const uint32_t* h) const { assert(get_nest_index(h) < nests_.size() * 4); @@ -151,8 +155,6 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { } values_.push_back(x); present_.push_back(true); - auto nests_size = nextpoweroftwo(ceil(values_.size() / 4.0) + 1)*10; - nests_.resize(nests_size, std::numeric_limits::max()); uint32_t h[4]; index_.hash_vector(x.first, h); set_nest_value(h, kNestCollision); @@ -191,13 +193,20 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { assert(used_nests.size() > get_nest_index(h)); if (used_nests[get_nest_index(h)]) { set_nest_value(h, kNestCollision); + assert(get_nest_value(h) == kNestCollision); ++collisions; } else { set_nest_value(h, index_.cuckoo_nest(h)); - assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, index_.cuckoo_nest(h))); + assert(get_nest_value(h) == index_.cuckoo_nest(h)); + assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, get_nest_value(h))); used_nests[get_nest_index(h)] = true; } } + for (iterator it = begin(), it_end = end(); it != it_end; ++it) { + uint32_t h[4]; + index_.hash_vector(it->first, h); + assert(get_nest_value(h) == kNestCollision || index_.perfect_hash(it->first) == 
index_.cuckoo_hash(h, get_nest_value(h))); + } fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size()); values_.swap(new_values); present_.swap(new_present); @@ -266,19 +275,22 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfe } MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { - return slow_find(k, index_.perfect_hash(k)); - /* + // return slow_find(k, index_.perfect_hash(k)); uint32_t h[4]; index_.hash_vector(k, h); auto nest = get_nest_value(h); if (__builtin_expect(nest != kNestCollision, 1)) { auto vit = values_.begin() + index_.cuckoo_hash(h, nest); - if (equal_(k, vit->first)) return make_iterator(vit); + assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest)); + if (equal_(k, vit->first)) { + fprintf(stderr, "fast\n"); + return make_iterator(vit); + } } nest = index_.cuckoo_nest(h); + fprintf(stderr, "slow\n"); // assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest)); return slow_find(k, index_.cuckoo_hash(h, nest)); - */ } MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) { From 0335cbe6793d37b83c41bba44d1ce860090af21f Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 16:43:38 -0300 Subject: [PATCH 80/89] struggle --- cxxmph/Makefile.am | 4 +++- cxxmph/mph_bits.cc | 3 +++ cxxmph/mph_bits.h | 54 ++++++++++++++++++++++++++++++++++++++++++++-- cxxmph/mph_map.h | 2 +- 4 files changed, 59 insertions(+), 4 deletions(-) diff --git a/cxxmph/Makefile.am b/cxxmph/Makefile.am index 22c0bb2..db8ffa1 100644 --- a/cxxmph/Makefile.am +++ b/cxxmph/Makefile.am @@ -1,5 +1,5 @@ TESTS = $(check_PROGRAMS) -check_PROGRAMS = hollow_iterator_test mph_map_test mph_index_test trigraph_test +check_PROGRAMS = mph_bits_test hollow_iterator_test mph_map_test mph_index_test trigraph_test noinst_PROGRAMS = bm_index bm_map bin_PROGRAMS = cxxmph lib_LTLIBRARIES = libcxxmph.la @@ -27,4 +27,6 @@ cxxmph_LDADD = libcxxmph.la cxxmph_SOURCES = cxxmph.cc hollow_iterator_test_SOURCES = 
hollow_iterator_test.cc +mph_bits_test_SOURCES = mph_bits_test.cc +mph_bits_test_LDADD = libcxxmph.la diff --git a/cxxmph/mph_bits.cc b/cxxmph/mph_bits.cc index 9fb97bd..510572c 100644 --- a/cxxmph/mph_bits.cc +++ b/cxxmph/mph_bits.cc @@ -1,4 +1,7 @@ #include "mph_bits.h" namespace cxxmph { + +const uint8_t dynamic_2bitset::vmask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; + } diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h index 6de8168..7dcf0be 100644 --- a/cxxmph/mph_bits.h +++ b/cxxmph/mph_bits.h @@ -2,13 +2,63 @@ #define __CXXMPH_MPH_BITS_H__ #include // for uint32_t and friends +#include #include +#include +#include +#include +#include namespace cxxmph { -static const uint8_t valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; +class dynamic_2bitset { + public: + dynamic_2bitset() : data_(NULL), size_(0), one_initialized_(false) {} + dynamic_2bitset(uint32_t size, bool one_initialized = false) + : data_(NULL), size_(0), one_initialized_(one_initialized) { + resize(size); + } + ~dynamic_2bitset() { delete [] data_; } + + const uint8_t operator[](uint32_t i) const { return get(i); } + uint8_t get(uint32_t i) const { + return (data_[(i >> 2)] >> (((i & 3) << 1)) & 3); + } + uint8_t set(uint32_t i, uint8_t v) { + uint8_t sf = ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); + fprintf(stderr, "v %d sf %d\n", v, sf); + data_[(i >> 2)] &= ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); + assert(get(i) == v); + } + void resize(uint32_t size) { + uint8_t* new_data = new uint8_t[size << 2]; + assert(one_initialized_); + assert(one_initialized_ * ones() == ones()); + memset(new_data, one_initialized_*ones(), size << 2); + assert(new_data[0] == ones()); + uint8_t* old_data_ = data_; + for (int i = 0; i < size_; ++i) { + data_ = old_data_; + auto v = get(i); + data_ = new_data; + set(i, v); + } + size_ = size; + delete [] old_data_; + data_ = new_data; + assert(data_[0] == ones()); + assert(get(0) == 3); + } + static const uint8_t vmask[]; + private: + uint8_t* data_; + 
uint32_t size_; + bool one_initialized_; + uint8_t ones() { return std::numeric_limits::max(); } +}; + static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { - d[(i >> 2)] &= ((v << ((i & 3) << 1)) | valuemask[i & 3]); + d[(i >> 2)] &= ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); } static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 6886cb3..caddf12 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -126,7 +126,7 @@ class mph_map { std::vector present_; std::vector nests_; SimpleMPHIndex::hash_function> index_; - // TODO(davi) optimize slack to no hold a copy of the key + // TODO(davi) optimize slack to hold 128 unique bits from hash64 as key typedef unordered_map slack_type; slack_type slack_; size_type size_; From b96b71961d483a796bcbc77edb2e99fd25213e77 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 16:44:16 -0300 Subject: [PATCH 81/89] struggle --- cxxmph/MurmurHash3.cpp | 335 ++++++++++++++++++++++++++++++++++++++++ cxxmph/MurmurHash3.h | 37 +++++ cxxmph/mph_bits_test.cc | 32 ++++ 3 files changed, 404 insertions(+) create mode 100644 cxxmph/MurmurHash3.cpp create mode 100644 cxxmph/MurmurHash3.h create mode 100644 cxxmph/mph_bits_test.cc diff --git a/cxxmph/MurmurHash3.cpp b/cxxmph/MurmurHash3.cpp new file mode 100644 index 0000000..09ffb26 --- /dev/null +++ b/cxxmph/MurmurHash3.cpp @@ -0,0 +1,335 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. + +// Note - The x86 and x64 versions do _not_ produce the same results, as the +// algorithms are optimized for their respective platforms. You can still +// compile and run any of them on any platform, but your performance with the +// non-native version will be less than optimal. 
+ +#include "MurmurHash3.h" + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +#define FORCE_INLINE __forceinline + +#include + +#define ROTL32(x,y) _rotl(x,y) +#define ROTL64(x,y) _rotl64(x,y) + +#define BIG_CONSTANT(x) (x) + +// Other compilers + +#else // defined(_MSC_VER) + +#define FORCE_INLINE __attribute__((always_inline)) + +inline uint32_t rotl32 ( uint32_t x, int8_t r ) +{ + return (x << r) | (x >> (32 - r)); +} + +inline uint64_t rotl64 ( uint64_t x, int8_t r ) +{ + return (x << r) | (x >> (64 - r)); +} + +#define ROTL32(x,y) rotl32(x,y) +#define ROTL64(x,y) rotl64(x,y) + +#define BIG_CONSTANT(x) (x##LLU) + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- +// Block read - if your platform needs to do endian-swapping or can only +// handle aligned reads, do the conversion here + +FORCE_INLINE uint32_t getblock ( const uint32_t * p, int i ) +{ + return p[i]; +} + +FORCE_INLINE uint64_t getblock ( const uint64_t * p, int i ) +{ + return p[i]; +} + +//----------------------------------------------------------------------------- +// Finalization mix - force all bits of a hash block to avalanche + +FORCE_INLINE uint32_t fmix ( uint32_t h ) +{ + h ^= h >> 16; + h *= 0x85ebca6b; + h ^= h >> 13; + h *= 0xc2b2ae35; + h ^= h >> 16; + + return h; +} + +//---------- + +FORCE_INLINE uint64_t fmix ( uint64_t k ) +{ + k ^= k >> 33; + k *= BIG_CONSTANT(0xff51afd7ed558ccd); + k ^= k >> 33; + k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); + k ^= k >> 33; + + return k; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 4; + + uint32_t h1 = seed; + + uint32_t c1 = 0xcc9e2d51; + uint32_t c2 = 
0x1b873593; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i); + + k1 *= c1; + k1 = ROTL32(k1,15); + k1 *= c2; + + h1 ^= k1; + h1 = ROTL32(h1,13); + h1 = h1*5+0xe6546b64; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*4); + + uint32_t k1 = 0; + + switch(len & 3) + { + case 3: k1 ^= tail[2] << 16; + case 2: k1 ^= tail[1] << 8; + case 1: k1 ^= tail[0]; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; + + h1 = fmix(h1); + + *(uint32_t*)out = h1; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_128 ( const void * key, const int len, + uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint32_t h1 = seed; + uint32_t h2 = seed; + uint32_t h3 = seed; + uint32_t h4 = seed; + + uint32_t c1 = 0x239b961b; + uint32_t c2 = 0xab0e9789; + uint32_t c3 = 0x38b34ae5; + uint32_t c4 = 0xa1e38b93; + + //---------- + // body + + const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); + + for(int i = -nblocks; i; i++) + { + uint32_t k1 = getblock(blocks,i*4+0); + uint32_t k2 = getblock(blocks,i*4+1); + uint32_t k3 = getblock(blocks,i*4+2); + uint32_t k4 = getblock(blocks,i*4+3); + + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + + h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; + + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; + + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; + + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; + } + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint32_t k1 = 0; + uint32_t k2 = 0; 
+ uint32_t k3 = 0; + uint32_t k4 = 0; + + switch(len & 15) + { + case 15: k4 ^= tail[14] << 16; + case 14: k4 ^= tail[13] << 8; + case 13: k4 ^= tail[12] << 0; + k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; + + case 12: k3 ^= tail[11] << 24; + case 11: k3 ^= tail[10] << 16; + case 10: k3 ^= tail[ 9] << 8; + case 9: k3 ^= tail[ 8] << 0; + k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; + + case 8: k2 ^= tail[ 7] << 24; + case 7: k2 ^= tail[ 6] << 16; + case 6: k2 ^= tail[ 5] << 8; + case 5: k2 ^= tail[ 4] << 0; + k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; + + case 4: k1 ^= tail[ 3] << 24; + case 3: k1 ^= tail[ 2] << 16; + case 2: k1 ^= tail[ 1] << 8; + case 1: k1 ^= tail[ 0] << 0; + k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + h3 = fmix(h3); + h4 = fmix(h4); + + h1 += h2; h1 += h3; h1 += h4; + h2 += h1; h3 += h1; h4 += h1; + + ((uint32_t*)out)[0] = h1; + ((uint32_t*)out)[1] = h2; + ((uint32_t*)out)[2] = h3; + ((uint32_t*)out)[3] = h4; +} + +//----------------------------------------------------------------------------- + +void MurmurHash3_x64_128 ( const void * key, const int len, + const uint32_t seed, void * out ) +{ + const uint8_t * data = (const uint8_t*)key; + const int nblocks = len / 16; + + uint64_t h1 = seed; + uint64_t h2 = seed; + + uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); + uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); + + //---------- + // body + + const uint64_t * blocks = (const uint64_t *)(data); + + for(int i = 0; i < nblocks; i++) + { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); + + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + + h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; + + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; + 
} + + //---------- + // tail + + const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + + uint64_t k1 = 0; + uint64_t k2 = 0; + + switch(len & 15) + { + case 15: k2 ^= uint64_t(tail[14]) << 48; + case 14: k2 ^= uint64_t(tail[13]) << 40; + case 13: k2 ^= uint64_t(tail[12]) << 32; + case 12: k2 ^= uint64_t(tail[11]) << 24; + case 11: k2 ^= uint64_t(tail[10]) << 16; + case 10: k2 ^= uint64_t(tail[ 9]) << 8; + case 9: k2 ^= uint64_t(tail[ 8]) << 0; + k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; + + case 8: k1 ^= uint64_t(tail[ 7]) << 56; + case 7: k1 ^= uint64_t(tail[ 6]) << 48; + case 6: k1 ^= uint64_t(tail[ 5]) << 40; + case 5: k1 ^= uint64_t(tail[ 4]) << 32; + case 4: k1 ^= uint64_t(tail[ 3]) << 24; + case 3: k1 ^= uint64_t(tail[ 2]) << 16; + case 2: k1 ^= uint64_t(tail[ 1]) << 8; + case 1: k1 ^= uint64_t(tail[ 0]) << 0; + k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; + }; + + //---------- + // finalization + + h1 ^= len; h2 ^= len; + + h1 += h2; + h2 += h1; + + h1 = fmix(h1); + h2 = fmix(h2); + + h1 += h2; + h2 += h1; + + ((uint64_t*)out)[0] = h1; + ((uint64_t*)out)[1] = h2; +} + +//----------------------------------------------------------------------------- + diff --git a/cxxmph/MurmurHash3.h b/cxxmph/MurmurHash3.h new file mode 100644 index 0000000..54e9d3f --- /dev/null +++ b/cxxmph/MurmurHash3.h @@ -0,0 +1,37 @@ +//----------------------------------------------------------------------------- +// MurmurHash3 was written by Austin Appleby, and is placed in the public +// domain. The author hereby disclaims copyright to this source code. 
+ +#ifndef _MURMURHASH3_H_ +#define _MURMURHASH3_H_ + +//----------------------------------------------------------------------------- +// Platform-specific functions and macros + +// Microsoft Visual Studio + +#if defined(_MSC_VER) + +typedef unsigned char uint8_t; +typedef unsigned long uint32_t; +typedef unsigned __int64 uint64_t; + +// Other compilers + +#else // defined(_MSC_VER) + +#include + +#endif // !defined(_MSC_VER) + +//----------------------------------------------------------------------------- + +void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); + +void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); + +//----------------------------------------------------------------------------- + +#endif // _MURMURHASH3_H_ diff --git a/cxxmph/mph_bits_test.cc b/cxxmph/mph_bits_test.cc new file mode 100644 index 0000000..c828f56 --- /dev/null +++ b/cxxmph/mph_bits_test.cc @@ -0,0 +1,32 @@ +#include +#include + +#include "mph_bits.h" + +using cxxmph::dynamic_2bitset; +int main(int argc, char** argv) { + int size = 256; + dynamic_2bitset bits(size, true /* fill with ones */); + for (int i = 0; i < size; ++i) { + if (bits[i] != 3) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, 3); + exit(-1); + } + } + for (int i = 0; i < size; ++i) bits.set(i, 0); + for (int i = 0; i < size; ++i) { + if (bits[i] != 0) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, 0); + exit(-1); + } + } + for (int i = 0; i < size; ++i) bits.set(i, i % 4); + for (int i = 0; i < size; ++i) { + if (bits[i] != i % 4) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", bits[i], i, i % 4); + exit(-1); + } + } +} + + From e3ccde3ba048c91a12c2bb4d65ad1e31f150c665 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 18:26:26 -0300 Subject: [PATCH 82/89] Working, but it sucks. 
--- cxxmph/mph_bits.h | 42 +++++++++++++------------------- cxxmph/mph_bits_test.cc | 17 +++++++++++++ cxxmph/mph_map.h | 54 +++++++++++++++++++++-------------------- 3 files changed, 62 insertions(+), 51 deletions(-) diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h index 7dcf0be..06b2946 100644 --- a/cxxmph/mph_bits.h +++ b/cxxmph/mph_bits.h @@ -4,6 +4,7 @@ #include // for uint32_t and friends #include #include +#include #include #include #include @@ -13,47 +14,38 @@ namespace cxxmph { class dynamic_2bitset { public: - dynamic_2bitset() : data_(NULL), size_(0), one_initialized_(false) {} - dynamic_2bitset(uint32_t size, bool one_initialized = false) - : data_(NULL), size_(0), one_initialized_(one_initialized) { - resize(size); + dynamic_2bitset() : fill_(false) {} + dynamic_2bitset(uint32_t size, bool fill = false) + : size_(size), fill_(fill), data_(ceil(size / 4.0), ones()*fill) { } - ~dynamic_2bitset() { delete [] data_; } const uint8_t operator[](uint32_t i) const { return get(i); } uint8_t get(uint32_t i) const { return (data_[(i >> 2)] >> (((i & 3) << 1)) & 3); } uint8_t set(uint32_t i, uint8_t v) { - uint8_t sf = ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); - fprintf(stderr, "v %d sf %d\n", v, sf); + data_[(i >> 2)] |= ones() ^ dynamic_2bitset::vmask[i & 3]; data_[(i >> 2)] &= ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); + assert(v <= 3); assert(get(i) == v); } void resize(uint32_t size) { - uint8_t* new_data = new uint8_t[size << 2]; - assert(one_initialized_); - assert(one_initialized_ * ones() == ones()); - memset(new_data, one_initialized_*ones(), size << 2); - assert(new_data[0] == ones()); - uint8_t* old_data_ = data_; - for (int i = 0; i < size_; ++i) { - data_ = old_data_; - auto v = get(i); - data_ = new_data; - set(i, v); - } size_ = size; - delete [] old_data_; - data_ = new_data; - assert(data_[0] == ones()); - assert(get(0) == 3); + data_.resize(size >> 2, fill_*ones()); } + void swap(dynamic_2bitset& other) { + 
std::swap(other.size_, size_); + std::swap(other.fill_, fill_); + std::swap(other.data_, data_); + } + void clear() { data_.clear(); } + + uint32_t size() const { return size_; } static const uint8_t vmask[]; private: - uint8_t* data_; uint32_t size_; - bool one_initialized_; + bool fill_; + std::vector data_; uint8_t ones() { return std::numeric_limits::max(); } }; diff --git a/cxxmph/mph_bits_test.cc b/cxxmph/mph_bits_test.cc index c828f56..e6a764d 100644 --- a/cxxmph/mph_bits_test.cc +++ b/cxxmph/mph_bits_test.cc @@ -5,6 +5,15 @@ using cxxmph::dynamic_2bitset; int main(int argc, char** argv) { + dynamic_2bitset small(256, true); + for (int i = 0; i < small.size(); ++i) small.set(i, i % 4); + for (int i = 0; i < small.size(); ++i) { + if (small[i] != i % 4) { + fprintf(stderr, "wrong bits %d at %d expected %d\n", small[i], i, i % 4); + exit(-1); + } + } + int size = 256; dynamic_2bitset bits(size, true /* fill with ones */); for (int i = 0; i < size; ++i) { @@ -27,6 +36,14 @@ int main(int argc, char** argv) { exit(-1); } } + dynamic_2bitset size_corner1(1); + if (size_corner1.size() != 1) exit(-1); + dynamic_2bitset size_corner2(2); + if (size_corner2.size() != 2) exit(-1); + (dynamic_2bitset(4)).swap(size_corner2); + if (size_corner2.size() != 4) exit(-1); + + } diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index caddf12..a291986 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -17,6 +17,7 @@ #include #include // for std::pair +#include "mph_bits.h" #include "mph_index.h" #include "hollow_iterator.h" @@ -107,29 +108,34 @@ class mph_map { static const uint8_t kNestCollision = 3; // biggest 2 bit value void set_nest_value(const uint32_t* h, uint8_t value) { auto index = get_nest_index(h); - assert(get_nest_index(h) < nests_.size() * 4); + assert(get_nest_index(h) < nests_.size()); assert(get_nest_index(h) >> 2 < nests_.size()); assert(value < 4); - set_2bit_value(&nests_[0], index, value); - assert(get_2bit_value(&nests_[0], index) == value); + 
nests_.set(index, value); + assert(nests_[index] == value); } uint32_t get_nest_value(const uint32_t* h) const { - assert(get_nest_index(h) < nests_.size() * 4); - return get_2bit_value(&(nests_[0]), get_nest_index(h)); + assert(get_nest_index(h) < nests_.size()); + return nests_[get_nest_index(h)]; } uint32_t get_nest_index(const uint32_t* h) const { - return h[3] & ((nests_.size() << 2) - 1); + assert(nests_.size()); + return h[3] % nests_.size(); // a mod 2^n == a & 2^n - 1 + // return h[3] & (nests_.size() - 1); // a mod 2^n == a & 2^n - 1 } void pack(); std::vector values_; std::vector present_; - std::vector nests_; + dynamic_2bitset nests_; SimpleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to hold 128 unique bits from hash64 as key typedef unordered_map slack_type; slack_type slack_; size_type size_; + + mutable uint64_t fast_; + mutable uint64_t slow_; }; MPH_MAP_TMPL_SPEC @@ -143,6 +149,7 @@ MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) { } MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { + fprintf(stderr, "Fast: %d Slow %d ratio %f\n", fast_, slow_, fast_*1.0/slow_); } MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { @@ -176,11 +183,9 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { new_values.reserve(new_values.size() * 2); std::vector new_present(index_.perfect_hash_size(), false); new_present.reserve(new_present.size() * 2); - auto new_nests_size = nextpoweroftwo(ceil(new_values.size() / 4.0) + 1)*10; - std::vector new_nests(new_nests_size, std::numeric_limits::max()); - new_nests.reserve(new_nests.size() * 2); - nests_.swap(new_nests); - vector used_nests(nests_.size() * 4); + auto new_nests_size = nextpoweroftwo(ceil(new_values.size())*10 + 1); + dynamic_2bitset(new_nests_size, true /* fill with 1s */).swap(nests_); + vector used_nests(nests_.size()); uint32_t collisions = 0; for (iterator it = begin(), it_end = end(); it != it_end; ++it) { size_type id = index_.perfect_hash(it->first); @@ -194,6 
+199,7 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { if (used_nests[get_nest_index(h)]) { set_nest_value(h, kNestCollision); assert(get_nest_value(h) == kNestCollision); + // fprintf(stderr, "Collision at nest index %d among %d positions\n", get_nest_index(h), nests_.size()); ++collisions; } else { set_nest_value(h, index_.cuckoo_nest(h)); @@ -207,7 +213,7 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { index_.hash_vector(it->first, h); assert(get_nest_value(h) == kNestCollision || index_.perfect_hash(it->first) == index_.cuckoo_hash(h, get_nest_value(h))); } - fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size()); + // fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size()); values_.swap(new_values); present_.swap(new_present); slack_type().swap(slack_); @@ -225,8 +231,7 @@ MPH_MAP_METHOD_DECL(void_type, clear)() { present_.clear(); slack_.clear(); index_.clear(); - nests_.clear(); - nests_.push_back(std::numeric_limits::max()); + dynamic_2bitset(1, true /* fill with 1s */).swap(nests_); size_ = 0; } @@ -245,19 +250,19 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { } MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { - return slow_find(k, index_.perfect_hash(k)); - /* uint32_t h[4]; index_.hash_vector(k, h); auto nest = get_nest_value(h); if (__builtin_expect(nest != kNestCollision, 1)) { auto vit = values_.begin() + index_.cuckoo_hash(h, nest); - if (equal_(k, vit->first)) return make_iterator(vit); + if (equal_(k, vit->first)) { + ++fast_; + return make_iterator(vit); + } } nest = index_.cuckoo_nest(h); - assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest)); + ++slow_; return slow_find(k, index_.cuckoo_hash(h, nest)); - */ } MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfect_hash) const { @@ -275,21 +280,18 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfe } MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { - // return slow_find(k, 
index_.perfect_hash(k)); uint32_t h[4]; index_.hash_vector(k, h); auto nest = get_nest_value(h); if (__builtin_expect(nest != kNestCollision, 1)) { auto vit = values_.begin() + index_.cuckoo_hash(h, nest); - assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest)); if (equal_(k, vit->first)) { - fprintf(stderr, "fast\n"); - return make_iterator(vit); + ++fast_; + return make_iterator(vit); } } nest = index_.cuckoo_nest(h); - fprintf(stderr, "slow\n"); - // assert(index_.perfect_hash(k) == index_.cuckoo_hash(h, nest)); + ++slow_; return slow_find(k, index_.cuckoo_hash(h, nest)); } From 7fe9527459792f910e4434b4cf62f3f11bbf0b4e Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 21:22:40 -0300 Subject: [PATCH 83/89] Interesting point, but get_cuckoo_nest is adding a lot and fast path is not that fast for int64. --- cxxmph/bm_map.cc | 4 ++-- cxxmph/mph_map.h | 45 ++++++++++++++++++++++++++++++++---------- cxxmph/mph_map_test.cc | 10 +++++++--- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 0a0b225..51d2ad0 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -93,8 +93,8 @@ int main(int argc, char** argv) { Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); - Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + // Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + // Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index a291986..471dafd 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -120,8 +120,10 @@ class mph_map { } uint32_t 
get_nest_index(const uint32_t* h) const { assert(nests_.size()); - return h[3] % nests_.size(); // a mod 2^n == a & 2^n - 1 - // return h[3] & (nests_.size() - 1); // a mod 2^n == a & 2^n - 1 + assert(nests_.size() % 2 == 0); + assert((nests_.size() & (nests_.size() - 1)) == 0); + assert((h[3] % nests_.size()) == (h[3] & (nests_.size() - 1))); + return (h[3] & (nests_.size() - 1)); // a mod 2^n == a & 2^n - 1 } void pack(); @@ -135,7 +137,9 @@ class mph_map { size_type size_; mutable uint64_t fast_; + mutable uint64_t fast_taken_; mutable uint64_t slow_; + mutable uint64_t very_slow_; }; MPH_MAP_TMPL_SPEC @@ -149,7 +153,7 @@ MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) { } MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { - fprintf(stderr, "Fast: %d Slow %d ratio %f\n", fast_, slow_, fast_*1.0/slow_); + fprintf(stderr, "Fast taken: %d Fast: %d Slow %d very_slow %d ratio %f\n", fast_taken_, fast_, slow_, very_slow_, fast_*1.0/slow_); } MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { @@ -169,10 +173,15 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { slack_.insert(make_pair(x.first, values_.size() - 1)); if (should_pack) pack(); it = find(x.first); + slow_ = 0; + very_slow_ = 0; + fast_ = 0; + fast_taken_ = 0; return make_pair(it, true); } MPH_MAP_METHOD_DECL(void_type, pack)() { + // fprintf(stderr, "Paki %d values\n", values_.size()); if (values_.empty()) return; assert(std::unordered_set(make_iterator_first(begin()), make_iterator_first(end())).size() == size()); bool success = index_.Reset( @@ -183,7 +192,7 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { new_values.reserve(new_values.size() * 2); std::vector new_present(index_.perfect_hash_size(), false); new_present.reserve(new_present.size() * 2); - auto new_nests_size = nextpoweroftwo(ceil(new_values.size())*10 + 1); + auto new_nests_size = nextpoweroftwo(ceil(new_values.size())*100 + 1); dynamic_2bitset(new_nests_size, true /* fill with 1s 
*/).swap(nests_); vector used_nests(nests_.size()); uint32_t collisions = 0; @@ -208,15 +217,24 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { used_nests[get_nest_index(h)] = true; } } - for (iterator it = begin(), it_end = end(); it != it_end; ++it) { - uint32_t h[4]; - index_.hash_vector(it->first, h); - assert(get_nest_value(h) == kNestCollision || index_.perfect_hash(it->first) == index_.cuckoo_hash(h, get_nest_value(h))); - } // fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size()); values_.swap(new_values); present_.swap(new_present); slack_type().swap(slack_); + int32_t fast = 0; + int32_t slow= 0; + for (iterator it = begin(), it_end = end(); it != it_end; ++it) { + uint32_t h[4]; + index_.hash_vector(it->first, h); + if (get_nest_value(h) == kNestCollision) ++slow; + else { + ++fast; + auto cit = values_.begin() + index_.cuckoo_hash(h, get_nest_value(h)); + assert(index_.perfect_hash(it->first) == cit - values_.begin()); + assert(equal_(it->first, cit->first)); + } + } + // fprintf(stderr, "Predicted fast: %d slow %d\n", fast, slow); } MPH_MAP_METHOD_DECL(iterator, begin)() { return make_iterator(values_.begin()); } @@ -231,7 +249,7 @@ MPH_MAP_METHOD_DECL(void_type, clear)() { present_.clear(); slack_.clear(); index_.clear(); - dynamic_2bitset(1, true /* fill with 1s */).swap(nests_); + dynamic_2bitset(8, true /* fill with 1s */).swap(nests_); size_ = 0; } @@ -254,7 +272,10 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { index_.hash_vector(k, h); auto nest = get_nest_value(h); if (__builtin_expect(nest != kNestCollision, 1)) { + ++fast_taken_; auto vit = values_.begin() + index_.cuckoo_hash(h, nest); + // do not hold for unknown keys + assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first)); if (equal_(k, vit->first)) { ++fast_; return make_iterator(vit); @@ -273,6 +294,7 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfe } } if (__builtin_expect(!slack_.empty(), 0)) { + 
++very_slow_; auto sit = slack_.find(k); if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); } @@ -284,7 +306,9 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { index_.hash_vector(k, h); auto nest = get_nest_value(h); if (__builtin_expect(nest != kNestCollision, 1)) { + ++fast_taken_; auto vit = values_.begin() + index_.cuckoo_hash(h, nest); + assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first)); if (equal_(k, vit->first)) { ++fast_; return make_iterator(vit); @@ -303,6 +327,7 @@ MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_has } } if (__builtin_expect(!slack_.empty(), 0)) { + ++very_slow_; auto sit = slack_.find(k); if (sit != slack_.end()) return make_iterator(values_.begin() + sit->second); } diff --git a/cxxmph/mph_map_test.cc b/cxxmph/mph_map_test.cc index ada71b3..1d489c6 100644 --- a/cxxmph/mph_map_test.cc +++ b/cxxmph/mph_map_test.cc @@ -15,17 +15,20 @@ int main(int argc, char** argv) { for (int i = 0; i < num_keys; ++i) { b.insert(make_pair(i, i)); } - for (int i = 0; i < num_keys; ++i) { - auto it = b.find(i); + b.rehash(b.size()); + fprintf(stderr, "Insertion finished\n"); + for (int i = 0; i < 1000000; ++i) { + auto it = b.find(i % num_keys); if (it == b.end()) { std::cerr << "Failed to find " << i << std::endl; exit(-1); } - if (it->first != it->second || it->first != i) { + if (it->first != it->second || it->first != i % num_keys) { std::cerr << "Found " << it->first << " looking for " << i << std::endl; exit(-1); } } + /* mph_map h; h.insert(std::make_pair("-1",-1)); mph_map::const_iterator it; @@ -55,4 +58,5 @@ int main(int argc, char** argv) { if (key < num_valid && it->second != key) exit(-1); } } + */ } From 3c127c76908bc4bd022e2ff7a67b00acc1c261a5 Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Wed, 14 Mar 2012 23:23:48 -0300 Subject: [PATCH 84/89] First tentative on the perfect hash design. 
--- cxxmph/bm_map.cc | 4 ++-- cxxmph/mph_map.h | 19 +++++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cxxmph/bm_map.cc b/cxxmph/bm_map.cc index 51d2ad0..0a0b225 100644 --- a/cxxmph/bm_map.cc +++ b/cxxmph/bm_map.cc @@ -93,8 +93,8 @@ int main(int argc, char** argv) { Benchmark::Register(new BM_CreateUrls>("URLS100k")); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0)); - // Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); - // Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); + Benchmark::Register(new BM_SearchUrls>("URLS100k", 10*1000 * 1000, 0.9)); Benchmark::Register(new BM_SearchUint64>); Benchmark::Register(new BM_SearchUint64>); Benchmark::RunAll(); diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index 471dafd..dd7bb08 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -69,8 +69,8 @@ class mph_map { void erase(iterator pos); void erase(const key_type& k); pair insert(const value_type& x); - iterator find(const key_type& k); - const_iterator find(const key_type& k) const; + iterator find(const key_type& k) { return slow_find(k, index_.perfect_hash(k)); } + const_iterator find(const key_type& k) const { return slow_find(k, index_.perfect_hash(k)); }; typedef int32_t my_int32_t; // help macros int32_t index(const key_type& k) const; data_type& operator[](const key_type &k); @@ -103,6 +103,9 @@ class mph_map { return hollow_const_iterator>(&values_, &present_, it); } + // Experimental functions, not always faster + iterator fast_find(const key_type& k); + const_iterator fast_find(const key_type& k) const; iterator slow_find(const key_type& k, uint32_t perfect_hash); const_iterator slow_find(const key_type& k, uint32_t perfect_hash) const; static const uint8_t kNestCollision = 3; // biggest 2 bit value @@ -153,7 
+156,7 @@ MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::mph_map() : size_(0) { } MPH_MAP_TMPL_SPEC MPH_MAP_CLASS_SPEC::~mph_map() { - fprintf(stderr, "Fast taken: %d Fast: %d Slow %d very_slow %d ratio %f\n", fast_taken_, fast_, slow_, very_slow_, fast_*1.0/slow_); + // fprintf(stderr, "Fast taken: %d Fast: %d Slow %d very_slow %d ratio %f\n", fast_taken_, fast_, slow_, very_slow_, fast_*1.0/slow_); } MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { @@ -192,7 +195,7 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { new_values.reserve(new_values.size() * 2); std::vector new_present(index_.perfect_hash_size(), false); new_present.reserve(new_present.size() * 2); - auto new_nests_size = nextpoweroftwo(ceil(new_values.size())*100 + 1); + auto new_nests_size = nextpoweroftwo(ceil(new_values.size())*10000 + 1); dynamic_2bitset(new_nests_size, true /* fill with 1s */).swap(nests_); vector used_nests(nests_.size()); uint32_t collisions = 0; @@ -267,7 +270,7 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { erase(it); } -MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { +MPH_MAP_METHOD_DECL(const_iterator, fast_find)(const key_type& k) const { uint32_t h[4]; index_.hash_vector(k, h); auto nest = get_nest_value(h); @@ -287,7 +290,7 @@ MPH_MAP_METHOD_DECL(const_iterator, find)(const key_type& k) const { } MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfect_hash) const { - if (__builtin_expect(index_.perfect_hash_size(), 0)) { + if (__builtin_expect(index_.perfect_hash_size(), 1)) { if (__builtin_expect(present_[perfect_hash], true)) { auto vit = values_.begin() + perfect_hash; if (equal_(k, vit->first)) return make_iterator(vit); @@ -301,7 +304,7 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfe return end(); } -MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { +MPH_MAP_METHOD_DECL(iterator, fast_find)(const key_type& k) { uint32_t h[4]; index_.hash_vector(k, h); 
auto nest = get_nest_value(h); @@ -320,7 +323,7 @@ MPH_MAP_METHOD_DECL(iterator, find)(const key_type& k) { } MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) { - if (__builtin_expect(index_.perfect_hash_size(), 0)) { + if (__builtin_expect(index_.perfect_hash_size(), 1)) { if (__builtin_expect(present_[perfect_hash], true)) { auto vit = values_.begin() + perfect_hash; if (equal_(k, vit->first)) return make_iterator(vit); From 11d54ea83787058d6086a7b9fe9b7e258bcc75dc Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Fri, 16 Mar 2012 02:54:16 -0300 Subject: [PATCH 85/89] Added nice optimization to avoid mod 3. --- cxxmph/mph_bits.h | 6 ++++++ cxxmph/mph_index.h | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h index 06b2946..6577b9d 100644 --- a/cxxmph/mph_bits.h +++ b/cxxmph/mph_bits.h @@ -2,6 +2,8 @@ #define __CXXMPH_MPH_BITS_H__ #include // for uint32_t and friends + +#include #include #include #include @@ -9,6 +11,7 @@ #include #include #include +#include namespace cxxmph { @@ -61,6 +64,9 @@ static uint32_t nextpoweroftwo(uint32_t k) { for (int i=1; i> i; return k+1; } + +// Interesting bit tricks that might end up here: +// http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord } // namespace cxxmph diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index deccf22..c397b27 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -45,7 +45,8 @@ class MPHIndex { public: MPHIndex(double c = 1.23, uint8_t b = 7) : c_(c), b_(b), m_(0), n_(0), k_(0), r_(1), - g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), + g_(NULL), g_size_(0), + ranktable_(NULL), ranktable_size_(0), deserialized_(false) { } ~MPHIndex(); @@ -112,6 +113,7 @@ class MPHIndex { // c++ vector to make mmap based backing easier. 
const uint8_t* g_; uint32_t g_size_; + uint8_t threebit_mod3[10]; // speed up mod3 calculation for 3bit ints // The table used for the rank step of the minimal perfect hash function const uint32_t* ranktable_; uint32_t ranktable_size_; @@ -135,12 +137,13 @@ bool MPHIndex::Reset( m_ = size; r_ = static_cast(ceil((c_*m_)/3)); if ((r_ % 2) == 0) r_ += 1; - nest_displacement_[0] = 0; - nest_displacement_[1] = r_; - nest_displacement_[2] = (r_ << 1); // This can be used to speed mods, but increases occupation too much. // Needs to try http://gmplib.org/manual/Integer-Exponentiation.html instead // r_ = nextpoweroftwo(r_); + nest_displacement_[0] = 0; + nest_displacement_[1] = r_; + nest_displacement_[2] = (r_ << 1); + for (int i = 0; i < sizeof(threebit_mod3); ++i) threebit_mod3[i] = i % 3; n_ = 3*r_; k_ = 1U << b_; @@ -215,15 +218,18 @@ uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t h[4]; if (!g_size_) return 0; SeededHashFcn().hash64(key, hash_seed_[0], h); - // for (int i = 0; i < 3; ++i) h[i] = SeededHashFcn()(key, hash_seed_[i]); h[0] = (h[0] % r_) + nest_displacement_[0]; h[1] = (h[1] % r_) + nest_displacement_[1]; h[2] = (h[2] % r_) + nest_displacement_[2]; + // h[0] = (h[0] & (r_-1)) + nest_displacement_[0]; + // h[1] = (h[1] & (r_-1)) + nest_displacement_[1]; + // h[2] = (h[2] & (r_-1)) + nest_displacement_[2]; // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; assert((h[0] >> 2) > 2) > 2) Date: Fri, 16 Mar 2012 03:11:39 -0300 Subject: [PATCH 86/89] Removed cuckoo hash failed attempt. Slower because of extra memory access. --- cxxmph/mph_index.h | 26 ------------- cxxmph/mph_map.h | 97 ---------------------------------------------- 2 files changed, 123 deletions(-) diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index c397b27..72ea3ef 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -67,12 +67,6 @@ class MPHIndex { uint32_t minimal_perfect_hash(const Key& x) const; // Crazy functions. Ignore. 
- template // must agree with Reset - uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const; - template // must agree with Reset - uint8_t cuckoo_nest(const uint32_t* h) const; - template // must agree with Reset - uint32_t cuckoo_nest_index(const Key& x, uint32_t* h) const; template // must agree with Reset void hash_vector(const Key& x, uint32_t* h) const; @@ -190,29 +184,11 @@ bool MPHIndex::Mapping( return false; } -template -uint32_t MPHIndex::cuckoo_hash(const uint32_t* h, uint8_t nest) const { - return (h[nest] % r_) + nest_displacement_[nest]; -} - template void MPHIndex::hash_vector(const Key& key, uint32_t* h) const { SeededHashFcn().hash64(key, hash_seed_[0], h); } -template // must agree with Reset -uint8_t MPHIndex::cuckoo_nest(const uint32_t* h) const { - uint32_t x[4]; - if (!g_size_) return 0; - x[0] = (h[0] % r_) + nest_displacement_[0]; - x[1] = (h[1] % r_) + nest_displacement_[1]; - x[2] = (h[2] % r_) + nest_displacement_[2]; - assert((x[0] >> 2) > 2) > 2) uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t h[4]; @@ -255,8 +231,6 @@ class SimpleMPHIndex : public MPHIndex { uint32_t index(const Key& key) const { return MPHIndex::index(key); } uint32_t perfect_hash(const Key& key) const { return MPHIndex::perfect_hash(key); } uint32_t minimal_perfect_hash(const Key& key) const { return MPHIndex::minimal_perfect_hash(key); } - uint8_t cuckoo_nest(const uint32_t* h) const { return MPHIndex::cuckoo_nest(h); } - uint32_t cuckoo_hash(const uint32_t* h, uint8_t nest) const { return MPHIndex::cuckoo_hash(h, nest); } void hash_vector(const Key& key, uint32_t* h) const { MPHIndex::hash_vector(key, h); } }; diff --git a/cxxmph/mph_map.h b/cxxmph/mph_map.h index dd7bb08..9440fe8 100644 --- a/cxxmph/mph_map.h +++ b/cxxmph/mph_map.h @@ -105,34 +105,12 @@ class mph_map { // Experimental functions, not always faster iterator fast_find(const key_type& k); - const_iterator fast_find(const key_type& k) const; iterator slow_find(const key_type& k, 
uint32_t perfect_hash); const_iterator slow_find(const key_type& k, uint32_t perfect_hash) const; - static const uint8_t kNestCollision = 3; // biggest 2 bit value - void set_nest_value(const uint32_t* h, uint8_t value) { - auto index = get_nest_index(h); - assert(get_nest_index(h) < nests_.size()); - assert(get_nest_index(h) >> 2 < nests_.size()); - assert(value < 4); - nests_.set(index, value); - assert(nests_[index] == value); - } - uint32_t get_nest_value(const uint32_t* h) const { - assert(get_nest_index(h) < nests_.size()); - return nests_[get_nest_index(h)]; - } - uint32_t get_nest_index(const uint32_t* h) const { - assert(nests_.size()); - assert(nests_.size() % 2 == 0); - assert((nests_.size() & (nests_.size() - 1)) == 0); - assert((h[3] % nests_.size()) == (h[3] & (nests_.size() - 1))); - return (h[3] & (nests_.size() - 1)); // a mod 2^n == a & 2^n - 1 - } void pack(); std::vector values_; std::vector present_; - dynamic_2bitset nests_; SimpleMPHIndex::hash_function> index_; // TODO(davi) optimize slack to hold 128 unique bits from hash64 as key typedef unordered_map slack_type; @@ -169,9 +147,6 @@ MPH_MAP_METHOD_DECL(insert_return_type, insert)(const value_type& x) { } values_.push_back(x); present_.push_back(true); - uint32_t h[4]; - index_.hash_vector(x.first, h); - set_nest_value(h, kNestCollision); ++size_; slack_.insert(make_pair(x.first, values_.size() - 1)); if (should_pack) pack(); @@ -195,49 +170,16 @@ MPH_MAP_METHOD_DECL(void_type, pack)() { new_values.reserve(new_values.size() * 2); std::vector new_present(index_.perfect_hash_size(), false); new_present.reserve(new_present.size() * 2); - auto new_nests_size = nextpoweroftwo(ceil(new_values.size())*10000 + 1); - dynamic_2bitset(new_nests_size, true /* fill with 1s */).swap(nests_); - vector used_nests(nests_.size()); - uint32_t collisions = 0; for (iterator it = begin(), it_end = end(); it != it_end; ++it) { size_type id = index_.perfect_hash(it->first); assert(id < new_values.size()); 
new_values[id] = *it; new_present[id] = true; - uint32_t h[4]; - index_.hash_vector(it->first, h); - // fprintf(stderr, "Nest index: %d\n", get_nest_index(h)); - assert(used_nests.size() > get_nest_index(h)); - if (used_nests[get_nest_index(h)]) { - set_nest_value(h, kNestCollision); - assert(get_nest_value(h) == kNestCollision); - // fprintf(stderr, "Collision at nest index %d among %d positions\n", get_nest_index(h), nests_.size()); - ++collisions; - } else { - set_nest_value(h, index_.cuckoo_nest(h)); - assert(get_nest_value(h) == index_.cuckoo_nest(h)); - assert(index_.perfect_hash(it->first) == index_.cuckoo_hash(h, get_nest_value(h))); - used_nests[get_nest_index(h)] = true; - } } // fprintf(stderr, "Collision ratio: %f\n", collisions*1.0/size()); values_.swap(new_values); present_.swap(new_present); slack_type().swap(slack_); - int32_t fast = 0; - int32_t slow= 0; - for (iterator it = begin(), it_end = end(); it != it_end; ++it) { - uint32_t h[4]; - index_.hash_vector(it->first, h); - if (get_nest_value(h) == kNestCollision) ++slow; - else { - ++fast; - auto cit = values_.begin() + index_.cuckoo_hash(h, get_nest_value(h)); - assert(index_.perfect_hash(it->first) == cit - values_.begin()); - assert(equal_(it->first, cit->first)); - } - } - // fprintf(stderr, "Predicted fast: %d slow %d\n", fast, slow); } MPH_MAP_METHOD_DECL(iterator, begin)() { return make_iterator(values_.begin()); } @@ -252,7 +194,6 @@ MPH_MAP_METHOD_DECL(void_type, clear)() { present_.clear(); slack_.clear(); index_.clear(); - dynamic_2bitset(8, true /* fill with 1s */).swap(nests_); size_ = 0; } @@ -260,7 +201,6 @@ MPH_MAP_METHOD_DECL(void_type, erase)(iterator pos) { present_[pos - begin] = false; uint32_t h[4]; index_.hash_vector(pos->first, &h); - nests_[get_nest_index(h)] = kNestCollision; *pos = value_type(); --size_; } @@ -270,25 +210,6 @@ MPH_MAP_METHOD_DECL(void_type, erase)(const key_type& k) { erase(it); } -MPH_MAP_METHOD_DECL(const_iterator, fast_find)(const key_type& k) const 
{ - uint32_t h[4]; - index_.hash_vector(k, h); - auto nest = get_nest_value(h); - if (__builtin_expect(nest != kNestCollision, 1)) { - ++fast_taken_; - auto vit = values_.begin() + index_.cuckoo_hash(h, nest); - // do not hold for unknown keys - assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first)); - if (equal_(k, vit->first)) { - ++fast_; - return make_iterator(vit); - } - } - nest = index_.cuckoo_nest(h); - ++slow_; - return slow_find(k, index_.cuckoo_hash(h, nest)); -} - MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfect_hash) const { if (__builtin_expect(index_.perfect_hash_size(), 1)) { if (__builtin_expect(present_[perfect_hash], true)) { @@ -304,24 +225,6 @@ MPH_MAP_METHOD_DECL(const_iterator, slow_find)(const key_type& k, uint32_t perfe return end(); } -MPH_MAP_METHOD_DECL(iterator, fast_find)(const key_type& k) { - uint32_t h[4]; - index_.hash_vector(k, h); - auto nest = get_nest_value(h); - if (__builtin_expect(nest != kNestCollision, 1)) { - ++fast_taken_; - auto vit = values_.begin() + index_.cuckoo_hash(h, nest); - assert(values_.size() != index_.perfect_hash_size() || equal_(k, vit->first)); - if (equal_(k, vit->first)) { - ++fast_; - return make_iterator(vit); - } - } - nest = index_.cuckoo_nest(h); - ++slow_; - return slow_find(k, index_.cuckoo_hash(h, nest)); -} - MPH_MAP_METHOD_DECL(iterator, slow_find)(const key_type& k, uint32_t perfect_hash) { if (__builtin_expect(index_.perfect_hash_size(), 1)) { if (__builtin_expect(present_[perfect_hash], true)) { From b3842c69e827e915dd888ab8ffcc33e3c7bde46c Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 19 Mar 2012 03:10:42 -0300 Subject: [PATCH 87/89] New bit code works, need to cleanup logging. 
--- cxxmph/mph_bits.h | 22 +++++++++++++----- cxxmph/mph_bits_test.cc | 12 ++++++++-- cxxmph/mph_index.cc | 50 +++++++++++++--------------------------- cxxmph/mph_index.h | 19 +++++++-------- cxxmph/mph_index_test.cc | 2 ++ 5 files changed, 52 insertions(+), 53 deletions(-) diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h index 6577b9d..36782f2 100644 --- a/cxxmph/mph_bits.h +++ b/cxxmph/mph_bits.h @@ -17,16 +17,23 @@ namespace cxxmph { class dynamic_2bitset { public: - dynamic_2bitset() : fill_(false) {} + dynamic_2bitset() : size_(0), fill_(false) {} dynamic_2bitset(uint32_t size, bool fill = false) : size_(size), fill_(fill), data_(ceil(size / 4.0), ones()*fill) { + if (data_.size()) fprintf(stderr, "creating %p size %d\n", &data_[0], data_.size()); + } + ~dynamic_2bitset() { + if (data_.size()) fprintf(stderr, "Deleting %p size %d\n", &data_[0], data_.size()); } const uint8_t operator[](uint32_t i) const { return get(i); } - uint8_t get(uint32_t i) const { + const uint8_t get(uint32_t i) const { + assert(i < size()); + assert((i >> 2) < data_.size()); return (data_[(i >> 2)] >> (((i & 3) << 1)) & 3); } uint8_t set(uint32_t i, uint8_t v) { + assert((i >> 2) < data_.size()); data_[(i >> 2)] |= ones() ^ dynamic_2bitset::vmask[i & 3]; data_[(i >> 2)] &= ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); assert(v <= 3); @@ -39,17 +46,18 @@ class dynamic_2bitset { void swap(dynamic_2bitset& other) { std::swap(other.size_, size_); std::swap(other.fill_, fill_); - std::swap(other.data_, data_); + other.data_.swap(data_); } - void clear() { data_.clear(); } + void clear() { data_.clear(); size_ = 0; } uint32_t size() const { return size_; } static const uint8_t vmask[]; - private: + const std::vector& data() const { return data_; } +// private: uint32_t size_; bool fill_; std::vector data_; - uint8_t ones() { return std::numeric_limits::max(); } + const uint8_t ones() { return std::numeric_limits::max(); } }; static void set_2bit_value(uint8_t *d, uint32_t i, 
uint8_t v) { @@ -67,6 +75,8 @@ static uint32_t nextpoweroftwo(uint32_t k) { // Interesting bit tricks that might end up here: // http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord +// Fast a % (k*2^t) +// http://www.azillionmonkeys.com/qed/adiv.html } // namespace cxxmph diff --git a/cxxmph/mph_bits_test.cc b/cxxmph/mph_bits_test.cc index e6a764d..c1680e3 100644 --- a/cxxmph/mph_bits_test.cc +++ b/cxxmph/mph_bits_test.cc @@ -40,10 +40,18 @@ int main(int argc, char** argv) { if (size_corner1.size() != 1) exit(-1); dynamic_2bitset size_corner2(2); if (size_corner2.size() != 2) exit(-1); - (dynamic_2bitset(4)).swap(size_corner2); + (dynamic_2bitset(4, true)).swap(size_corner2); if (size_corner2.size() != 4) exit(-1); + for (int i = 0; i < size_corner2.size(); ++i) { + if (size_corner2[i] != 3) exit(-1); + } + size_corner2.clear(); + if (size_corner2.size() != 0) exit(-1); - + dynamic_2bitset empty; + empty.clear(); + dynamic_2bitset large(1000, true); + empty.swap(large); } diff --git a/cxxmph/mph_index.cc b/cxxmph/mph_index.cc index b1c0176..e1b24a8 100644 --- a/cxxmph/mph_index.cc +++ b/cxxmph/mph_index.cc @@ -44,9 +44,6 @@ MPHIndex::~MPHIndex() { } void MPHIndex::clear() { - if (!deserialized_) delete [] g_; - g_ = NULL; - g_size_ = 0; if (!deserialized_) delete [] ranktable_; ranktable_ = NULL; ranktable_size_ = 0; @@ -113,13 +110,9 @@ void MPHIndex::Assigning( const vector& edges, const vector& queue) { uint32_t current_edge = 0; vector marked_vertices(n_ + 1); + dynamic_2bitset().swap(g_); // Initialize vector of half nibbles with all bits set. 
- g_size_ = static_cast(ceil(n_/4.0)); - if (!deserialized_) delete [] g_; - g_ = NULL; - uint8_t* g = new uint8_t[g_size_]; - memset(g, std::numeric_limits::max(), g_size_); - assert(g[g_size_ - 1] == 255); + dynamic_2bitset g(n_, true /* set bits to 1 */); uint32_t nedges = m_; // for legibility for (int i = nedges - 1; i + 1 >= 1; --i) { @@ -133,35 +126,35 @@ void MPHIndex::Assigning( */ if (!marked_vertices[e[0]]) { if (!marked_vertices[e[1]]) { - set_2bit_value(g, e[1], kUnassigned); + g.set(e[1], kUnassigned); marked_vertices[e[1]] = true; } if (!marked_vertices[e[2]]) { - set_2bit_value(g, e[2], kUnassigned); + g.set(e[2], kUnassigned); assert(marked_vertices.size() > e[2]); marked_vertices[e[2]] = true; } - set_2bit_value(g, e[0], (6 - (get_2bit_value(g, e[1]) + get_2bit_value(g, e[2]))) % 3); + g.set(e[0], (6 - (g[e[1]] + g[e[2]])) % 3); marked_vertices[e[0]] = true; } else if (!marked_vertices[e[1]]) { if (!marked_vertices[e[2]]) { - set_2bit_value(g, e[2], kUnassigned); + g.set(e[2], kUnassigned); marked_vertices[e[2]] = true; } - set_2bit_value(g, e[1], (7 - (get_2bit_value(g, e[0]) + get_2bit_value(g, e[2]))) % 3); + g.set(e[1], (7 - (g[e[0]] + g[e[2]])) % 3); marked_vertices[e[1]] = true; } else { - set_2bit_value(g, e[2], (8 - (get_2bit_value(g, e[0]) + get_2bit_value(g, e[1]))) % 3); + g.set(e[2], (8 - (g[e[0]] + g[e[1]])) % 3); marked_vertices[e[2]] = true; } /* cerr << "A: " << e[0] << " " << e[1] << " " << e[2] << " -> " - << get_2bit_value(g, e[0]) << " " - << get_2bit_value(g, e[1]) << " " - << get_2bit_value(g, e[2]) << " " << endl; + << static_cast(g[e[0]]) << " " + << static_cast(g[e[1]]) << " " + << static_cast(g[e[2]]) << " " << endl; */ } - g_ = g; + g_.swap(g); } void MPHIndex::Ranking() { @@ -194,19 +187,17 @@ uint32_t MPHIndex::Rank(uint32_t vertex) const { uint32_t beg_idx_v = index << b_; uint32_t beg_idx_b = beg_idx_v >> 2; uint32_t end_idx_b = vertex >> 2; - while (beg_idx_b < end_idx_b) base_rank += 
kBdzLookupIndex[g_[beg_idx_b++]]; + while (beg_idx_b < end_idx_b) base_rank += kBdzLookupIndex[g_.data()[beg_idx_b++]]; beg_idx_v = beg_idx_b << 2; // cerr << "beg_idx_v: " << beg_idx_v << endl; // cerr << "base rank: " << base_rank << endl; - /* cerr << "G: "; for (unsigned int i = 0; i < n_; ++i) { - cerr << get_2bit_value(g_, i) << " "; + cerr << static_cast(g_[i]) << " "; } cerr << endl; - */ while (beg_idx_v < vertex) { - if (get_2bit_value(g_, beg_idx_v) != kUnassigned) ++base_rank; + if (g_[beg_idx_v] != kUnassigned) ++base_rank; ++beg_idx_v; } // cerr << "Base rank: " << base_rank << endl; @@ -214,21 +205,12 @@ uint32_t MPHIndex::Rank(uint32_t vertex) const { } uint32_t MPHIndex::serialize_bytes_needed() const { - return sizeof(MPHIndex) + g_size_ + ranktable_size_*sizeof(uint32_t); + return 0; } void MPHIndex::serialize(char* memory) const { - memcpy(memory, this, sizeof(MPHIndex)); - memcpy(memory + sizeof(MPHIndex), g_, g_size_); - memcpy(memory + sizeof(MPHIndex) + g_size_, - ranktable_, ranktable_size_*sizeof(uint32_t)); } bool MPHIndex::deserialize(const char* serialized_memory) { - memcpy(this, serialized_memory, sizeof(MPHIndex)); - g_ = reinterpret_cast(serialized_memory + sizeof(MPHIndex)); - ranktable_ = reinterpret_cast( - serialized_memory + sizeof(MPHIndex) + g_size_); - deserialized_ = true; return true; } diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 72ea3ef..c872f6b 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -45,7 +45,6 @@ class MPHIndex { public: MPHIndex(double c = 1.23, uint8_t b = 7) : c_(c), b_(b), m_(0), n_(0), k_(0), r_(1), - g_(NULL), g_size_(0), ranktable_(NULL), ranktable_size_(0), deserialized_(false) { } ~MPHIndex(); @@ -103,10 +102,8 @@ class MPHIndex { uint32_t r_; uint32_t nest_displacement_[3]; // derived from r_ - // The array containing the minimal perfect hash function graph. Do not use - // c++ vector to make mmap based backing easier. 
- const uint8_t* g_; - uint32_t g_size_; + // The array containing the minimal perfect hash function graph. + dynamic_2bitset g_; uint8_t threebit_mod3[10]; // speed up mod3 calculation for 3bit ints // The table used for the rank step of the minimal perfect hash function const uint32_t* ranktable_; @@ -156,6 +153,7 @@ bool MPHIndex::Reset( } if (iterations == 0) return false; Assigning(edges, queue); + fprintf(stderr, "Assignment finished\n"); std::vector().swap(edges); Ranking(); deserialized_ = false; @@ -192,7 +190,7 @@ void MPHIndex::hash_vector(const Key& key, uint32_t* h) const { template uint32_t MPHIndex::perfect_hash(const Key& key) const { uint32_t h[4]; - if (!g_size_) return 0; + if (!g_.size()) return 0; SeededHashFcn().hash64(key, hash_seed_[0], h); h[0] = (h[0] % r_) + nest_displacement_[0]; h[1] = (h[1] % r_) + nest_displacement_[1]; @@ -200,12 +198,11 @@ uint32_t MPHIndex::perfect_hash(const Key& key) const { // h[0] = (h[0] & (r_-1)) + nest_displacement_[0]; // h[1] = (h[1] & (r_-1)) + nest_displacement_[1]; // h[2] = (h[2] & (r_-1)) + nest_displacement_[2]; - // cerr << "g_.size() " << g_size_ << " h0 >> 2 " << (h[0] >> 2) << endl; - assert((h[0] >> 2) > 2) > 2) ::size_type i = 0; i < ids.size(); ++i) assert(ids[i] == static_cast::value_type>(i)); + /* char* serialized = new char[mph_index.serialize_bytes_needed()]; mph_index.serialize(serialized); SimpleMPHIndex other_mph_index; other_mph_index.deserialize(serialized); + */ } From b47f367db0fbe27368cdc507f59d0581b317785e Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 19 Mar 2012 03:18:57 -0300 Subject: [PATCH 88/89] Nice and fast. 
--- cxxmph/mph_bits.h | 12 +----------- cxxmph/mph_index.cc | 26 +++++++------------------- cxxmph/mph_index.h | 16 +--------------- cxxmph/mph_map_test.cc | 1 - 4 files changed, 9 insertions(+), 46 deletions(-) diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h index 36782f2..586e42b 100644 --- a/cxxmph/mph_bits.h +++ b/cxxmph/mph_bits.h @@ -20,10 +20,6 @@ class dynamic_2bitset { dynamic_2bitset() : size_(0), fill_(false) {} dynamic_2bitset(uint32_t size, bool fill = false) : size_(size), fill_(fill), data_(ceil(size / 4.0), ones()*fill) { - if (data_.size()) fprintf(stderr, "creating %p size %d\n", &data_[0], data_.size()); - } - ~dynamic_2bitset() { - if (data_.size()) fprintf(stderr, "Deleting %p size %d\n", &data_[0], data_.size()); } const uint8_t operator[](uint32_t i) const { return get(i); } @@ -53,19 +49,13 @@ class dynamic_2bitset { uint32_t size() const { return size_; } static const uint8_t vmask[]; const std::vector& data() const { return data_; } -// private: + private: uint32_t size_; bool fill_; std::vector data_; const uint8_t ones() { return std::numeric_limits::max(); } }; -static void set_2bit_value(uint8_t *d, uint32_t i, uint8_t v) { - d[(i >> 2)] &= ((v << ((i & 3) << 1)) | dynamic_2bitset::vmask[i & 3]); -} -static uint32_t get_2bit_value(const uint8_t* d, uint32_t i) { - return (d[(i >> 2)] >> (((i & 3) << 1)) & 3); -} static uint32_t nextpoweroftwo(uint32_t k) { if (k == 0) return 1; k--; diff --git a/cxxmph/mph_index.cc b/cxxmph/mph_index.cc index e1b24a8..8b6baec 100644 --- a/cxxmph/mph_index.cc +++ b/cxxmph/mph_index.cc @@ -37,14 +37,12 @@ static uint8_t kBdzLookupIndex[] = namespace cxxmph { -const uint8_t MPHIndex::valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; - MPHIndex::~MPHIndex() { clear(); } void MPHIndex::clear() { - if (!deserialized_) delete [] ranktable_; + delete [] ranktable_; ranktable_ = NULL; ranktable_size_ = 0; // TODO(davi) implement me @@ -162,7 +160,7 @@ void MPHIndex::Ranking() { uint32_t size = k_ >> 2U; 
ranktable_size_ = static_cast( ceil(n_ / static_cast(k_))); - if (!deserialized_) delete [] ranktable_; + delete [] ranktable_; ranktable_ = NULL; uint32_t* ranktable = new uint32_t[ranktable_size_]; memset(ranktable, 0, ranktable_size_*sizeof(uint32_t)); @@ -191,11 +189,11 @@ uint32_t MPHIndex::Rank(uint32_t vertex) const { beg_idx_v = beg_idx_b << 2; // cerr << "beg_idx_v: " << beg_idx_v << endl; // cerr << "base rank: " << base_rank << endl; - cerr << "G: "; - for (unsigned int i = 0; i < n_; ++i) { - cerr << static_cast(g_[i]) << " "; - } - cerr << endl; + // cerr << "G: "; + // for (unsigned int i = 0; i < n_; ++i) { + // cerr << static_cast(g_[i]) << " "; + // } + // cerr << endl; while (beg_idx_v < vertex) { if (g_[beg_idx_v] != kUnassigned) ++base_rank; ++beg_idx_v; @@ -204,14 +202,4 @@ uint32_t MPHIndex::Rank(uint32_t vertex) const { return base_rank; } -uint32_t MPHIndex::serialize_bytes_needed() const { - return 0; -} -void MPHIndex::serialize(char* memory) const { -} - -bool MPHIndex::deserialize(const char* serialized_memory) { - return true; -} - } // namespace cxxmph diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index c872f6b..17ad3e5 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -45,8 +45,7 @@ class MPHIndex { public: MPHIndex(double c = 1.23, uint8_t b = 7) : c_(c), b_(b), m_(0), n_(0), k_(0), r_(1), - ranktable_(NULL), ranktable_size_(0), - deserialized_(false) { } + ranktable_(NULL), ranktable_size_(0) { } ~MPHIndex(); template @@ -69,13 +68,6 @@ class MPHIndex { template // must agree with Reset void hash_vector(const Key& x, uint32_t* h) const; - // Serialization for mmap usage - not tested well, ping me if you care. - // Serialized tables are not guaranteed to work across versions or different - // endianness (although they could easily be made to be). 
- uint32_t serialize_bytes_needed() const; - void serialize(char *memory) const; - bool deserialize(const char* serialized_memory); - private: template bool Mapping(ForwardIterator begin, ForwardIterator end, @@ -111,10 +103,6 @@ class MPHIndex { // The selected hash seed triplet for finding the edges in the minimal // perfect hash function graph. uint32_t hash_seed_[3]; - - bool deserialized_; - - static const uint8_t valuemask[]; }; // Template method needs to go in the header file. @@ -153,10 +141,8 @@ bool MPHIndex::Reset( } if (iterations == 0) return false; Assigning(edges, queue); - fprintf(stderr, "Assignment finished\n"); std::vector().swap(edges); Ranking(); - deserialized_ = false; return true; } diff --git a/cxxmph/mph_map_test.cc b/cxxmph/mph_map_test.cc index 1d489c6..dd8eb5a 100644 --- a/cxxmph/mph_map_test.cc +++ b/cxxmph/mph_map_test.cc @@ -16,7 +16,6 @@ int main(int argc, char** argv) { b.insert(make_pair(i, i)); } b.rehash(b.size()); - fprintf(stderr, "Insertion finished\n"); for (int i = 0; i < 1000000; ++i) { auto it = b.find(i % num_keys); if (it == b.end()) { From e760465fca0e494e20815c2e74bd64f6971da13c Mon Sep 17 00:00:00 2001 From: Davi Reis Date: Mon, 19 Mar 2012 22:48:11 -0300 Subject: [PATCH 89/89] Some comments. 
--- cxxmph/mph_bits.h | 2 ++ cxxmph/mph_index.h | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/cxxmph/mph_bits.h b/cxxmph/mph_bits.h index 586e42b..c9eaabb 100644 --- a/cxxmph/mph_bits.h +++ b/cxxmph/mph_bits.h @@ -67,6 +67,8 @@ static uint32_t nextpoweroftwo(uint32_t k) { // http://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord // Fast a % (k*2^t) // http://www.azillionmonkeys.com/qed/adiv.html +// rank and select: +// http://vigna.dsi.unimi.it/ftp/papers/Broadword.pdf } // namespace cxxmph diff --git a/cxxmph/mph_index.h b/cxxmph/mph_index.h index 17ad3e5..2a217bc 100644 --- a/cxxmph/mph_index.h +++ b/cxxmph/mph_index.h @@ -15,6 +15,10 @@ // traditional hash function over a key and doing 2-3 conflict resolutions on // 100byte-ish strings. // +// Thesis presenting this and similar algorithms: +// http://homepages.dcc.ufmg.br/~fbotelho/en/talks/thesis2008/thesis.pdf +// +// // Notes: // // Most users can use the SimpleMPHIndex wrapper instead of the MPHIndex which @@ -80,7 +84,11 @@ class MPHIndex { uint32_t Rank(uint32_t vertex) const; // Algorithm parameters - double c_; // Number of bits per key (? is it right) + // Perfect hash function density. If this was a 2graph, + // then probability of having an acyclic graph would be + // sqrt(1-(2/c)^2). See section 3 for details. + // http://www.it-c.dk/people/pagh/papers/simpleperf.pdf + double c_; uint8_t b_; // Number of bits of the kth index in the ranktable // Values used during generation