From 0935b2bbcbe8850d049c42553beae4f1a1dde5d3 Mon Sep 17 00:00:00 2001 From: fc_botelho Date: Mon, 16 Mar 2009 13:07:10 +0000 Subject: [PATCH] *** empty log message *** --- src/Makefile | 5 +- src/Makefile.am | 2 +- src/Makefile.in | 5 +- src/bitbool.h | 99 +++++++++++- src/compressed_seq.c | 373 +++++++++++++++++++++++++++++++++++++++++++ src/compressed_seq.h | 84 ++++++++++ src/select.c | 80 ++++++---- src/select.h | 12 +- tests/Makefile.am | 5 +- tests/select_tests.c | 7 +- 10 files changed, 623 insertions(+), 49 deletions(-) create mode 100644 src/compressed_seq.c create mode 100644 src/compressed_seq.h diff --git a/src/Makefile b/src/Makefile index 10149dc..3f665fa 100644 --- a/src/Makefile +++ b/src/Makefile @@ -60,7 +60,7 @@ libcmph_la_LIBADD = am_libcmph_la_OBJECTS = hash.lo jenkins_hash.lo vstack.lo vqueue.lo \ graph.lo cmph.lo cmph_structs.lo chm.lo bmz.lo bmz8.lo bdz.lo \ bdz_ph.lo buffer_manager.lo buffer_entry.lo brz.lo fch.lo \ - fch_buckets.lo select.lo + fch_buckets.lo select.lo compressed_seq.lo libcmph_la_OBJECTS = $(am_libcmph_la_OBJECTS) libcmph_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ @@ -207,7 +207,7 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\ chm.c bmz.c bmz8.c bdz.c bdz_ph.c\ buffer_manager.c buffer_entry.c\ brz.c fch.c fch_buckets.c \ - select.c + select.c compressed_seq.c libcmph_la_LDFLAGS = -version-info 0:0:0 cmph_SOURCES = main.c wingetopt.h wingetopt.c @@ -322,6 +322,7 @@ include ./$(DEPDIR)/buffer_manager.Plo include ./$(DEPDIR)/chm.Plo include ./$(DEPDIR)/cmph.Plo include ./$(DEPDIR)/cmph_structs.Plo +include ./$(DEPDIR)/compressed_seq.Plo include ./$(DEPDIR)/fch.Plo include ./$(DEPDIR)/fch_buckets.Plo include ./$(DEPDIR)/graph.Plo diff --git a/src/Makefile.am b/src/Makefile.am index acd2796..cfaf360 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -7,7 +7,7 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\ chm.c bmz.c bmz8.c bdz.c bdz_ph.c\ buffer_manager.c 
buffer_entry.c\ brz.c fch.c fch_buckets.c \ - select.c + select.c compressed_seq.c libcmph_la_LDFLAGS = -version-info 0:0:0 diff --git a/src/Makefile.in b/src/Makefile.in index 08994a6..302b6f2 100644 --- a/src/Makefile.in +++ b/src/Makefile.in @@ -60,7 +60,7 @@ libcmph_la_LIBADD = am_libcmph_la_OBJECTS = hash.lo jenkins_hash.lo vstack.lo vqueue.lo \ graph.lo cmph.lo cmph_structs.lo chm.lo bmz.lo bmz8.lo bdz.lo \ bdz_ph.lo buffer_manager.lo buffer_entry.lo brz.lo fch.lo \ - fch_buckets.lo select.lo + fch_buckets.lo select.lo compressed_seq.lo libcmph_la_OBJECTS = $(am_libcmph_la_OBJECTS) libcmph_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \ $(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \ @@ -207,7 +207,7 @@ libcmph_la_SOURCES = hash.c jenkins_hash.c\ chm.c bmz.c bmz8.c bdz.c bdz_ph.c\ buffer_manager.c buffer_entry.c\ brz.c fch.c fch_buckets.c \ - select.c + select.c compressed_seq.c libcmph_la_LDFLAGS = -version-info 0:0:0 cmph_SOURCES = main.c wingetopt.h wingetopt.c @@ -322,6 +322,7 @@ distclean-compile: @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/chm.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cmph.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/cmph_structs.Plo@am__quote@ +@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/compressed_seq.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fch.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/fch_buckets.Plo@am__quote@ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/graph.Plo@am__quote@ diff --git a/src/bitbool.h b/src/bitbool.h index 6c92b8b..695b2a0 100644 --- a/src/bitbool.h +++ b/src/bitbool.h @@ -2,10 +2,15 @@ #define _CMPH_BITBOOL_H__ #include "cmph_types.h" -static const cmph_uint8 bitmask[] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; -static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; +static const cmph_uint8 bitmask[] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; -// 
extern const cmph_uint8 bitmask[]; +static const cmph_uint32 bitmask32[] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, + 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, + 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22, 1 << 23, + 1 << 24, 1 << 25, 1 << 26, 1 << 27, 1 << 28, 1 << 29, 1 << 30, 1 << 31 + }; + +static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; /** \def GETBIT(array, i) @@ -39,7 +44,6 @@ static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; //#define SETBIT(array, i) (array[(i) / 8] |= bitmask[(i) % 8]) //#define UNSETBIT(array, i) (array[(i) / 8] &= (~(bitmask[(i) % 8]))) -extern const cmph_uint8 valuemask[]; /** \def SETVALUE1(array, i, v) * \brief set a value for a 2-bit integer stored in an array initialized with 1s. @@ -76,4 +80,91 @@ extern const cmph_uint8 valuemask[]; #define GETVALUE(array, i) ((array[i >> 2] >> ((i & 0x00000003) << 1)) & 0x00000003) + +/** \def SETBIT32(array, i) + * \brief set 1 to an 1-bit integer stored in an array of 32-bit words. + * \param array to store 1-bit integer values. The entries are 32-bit words. + * \param i is the index in array to set the bit to 1 + * + * SETBIT32(array, i) is a macro that sets 1 to an 1-bit integer stored in an array of 32-bit words. + */ +#define SETBIT32(array, i) (array[i >> 5] |= bitmask32[i & 0x0000001f]) + +/** \def GETBIT32(array, i) + * \brief get the value of an 1-bit integer stored in an array of 32-bit words. + * \param array to get 1-bit integer values from. The entries are 32-bit words. + * \param i is the index in array to get the 1-bit integer value from + * + * GETBIT32(array, i) is a macro that gets the value of an 1-bit integer stored in an array of 32-bit words. 
+ */ +#define GETBIT32(array, i) (array[i >> 5] & bitmask32[i & 0x0000001f]) + +#define BITS_TABLE_SIZE(n, bits_length) ((n * bits_length + 31) >> 5) + +static inline void set_bits_value(cmph_uint32 * bits_table, cmph_uint32 index, cmph_uint32 bits_string, + cmph_uint32 string_length, cmph_uint32 string_mask) +{ + register cmph_uint32 bit_idx = index * string_length; + register cmph_uint32 word_idx = bit_idx >> 5; + register cmph_uint32 shift1 = bit_idx & 0x0000001f; + register cmph_uint32 shift2 = 32 - shift1; + + bits_table[word_idx] &= ~((string_mask) << shift1); + bits_table[word_idx] |= bits_string << shift1; + + if(shift2 < string_length) + { + bits_table[word_idx+1] &= ~((string_mask) >> shift2); + bits_table[word_idx+1] |= bits_string >> shift2; + }; +}; + +static inline cmph_uint32 get_bits_value(cmph_uint32 * bits_table,cmph_uint32 index, cmph_uint32 string_length, cmph_uint32 string_mask) +{ + register cmph_uint32 bit_idx = index * string_length; + register cmph_uint32 word_idx = bit_idx >> 5; + register cmph_uint32 shift1 = bit_idx & 0x0000001f; + register cmph_uint32 shift2 = 32-shift1; + register cmph_uint32 bits_string; + + bits_string = (bits_table[word_idx] >> shift1) & string_mask; + + if(shift2 < string_length) + bits_string |= (bits_table[word_idx+1] << shift2) & string_mask; + + return bits_string; +}; + +static inline void set_bits_at_pos(cmph_uint32 * bits_table, cmph_uint32 pos, cmph_uint32 bits_string, cmph_uint32 string_length) +{ + register cmph_uint32 word_idx = pos >> 5; + register cmph_uint32 shift1 = pos & 0x0000001f; + register cmph_uint32 shift2 = 32-shift1; + register cmph_uint32 string_mask = (1 << string_length) - 1; + + bits_table[word_idx] &= ~((string_mask) << shift1); + bits_table[word_idx] |= bits_string << shift1; + if(shift2 < string_length) + { + bits_table[word_idx+1] &= ~((string_mask) >> shift2); + bits_table[word_idx+1] |= bits_string >> shift2; + } +}; + +static inline cmph_uint32 get_bits_at_pos(cmph_uint32 * 
bits_table,cmph_uint32 pos,cmph_uint32 string_length) +{ + register cmph_uint32 word_idx = pos >> 5; + register cmph_uint32 shift1 = pos & 0x0000001f; + register cmph_uint32 shift2 = 32 - shift1; + register cmph_uint32 string_mask = (1 << string_length) - 1; + register cmph_uint32 bits_string; + + bits_string = (bits_table[word_idx] >> shift1) & string_mask; + + if(shift2 < string_length) + bits_string |= (bits_table[word_idx+1] << shift2) & string_mask; + return bits_string; +} + + #endif diff --git a/src/compressed_seq.c b/src/compressed_seq.c new file mode 100644 index 0000000..fd5001b --- /dev/null +++ b/src/compressed_seq.c @@ -0,0 +1,373 @@ +#include "compressed_seq.h" +#include <stdlib.h> +#include <stdio.h> +#include <assert.h> +#include <limits.h> +#include <string.h> + +#include "bitbool.h" + +// #define DEBUG +#include "debug.h" + +static inline cmph_uint32 i_log2(cmph_uint32 x) +{ + register cmph_uint32 res = 0; + + while(x > 1) + { + x >>= 1; + res++; + } + return res; +}; + +void compressed_seq_init(compressed_seq_t * cs) +{ + select_init(&cs->sel); + cs->n = 0; + cs->rem_r = 0; + cs->length_rems = 0; + cs->total_length = 0; + cs->store_table = 0; +} + +void compressed_seq_destroy(compressed_seq_t * cs) +{ + free(cs->store_table); + cs->store_table = 0; + free(cs->length_rems); + cs->length_rems = 0; + select_destroy(&cs->sel); +}; + + +void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n) +{ + register cmph_uint32 i; + // lengths: represents lengths of encoded values + register cmph_uint32 * lengths = (cmph_uint32 *)calloc(n, sizeof(cmph_uint32)); + register cmph_uint32 rems_mask; + register cmph_uint32 stored_value; + + cs->n = n; + cs->total_length = 0; + + for(i = 0; i < cs->n; i++) + { + if(vals_table[i] == 0) + { + lengths[i] = 0; + } + else + { + lengths[i] = i_log2(vals_table[i] + 1); + cs->total_length += lengths[i]; + }; + }; + + if(cs->store_table) + { + free(cs->store_table); + } + cs->store_table = (cmph_uint32 *) calloc(((cs->total_length + 31) >> 5), 
sizeof(cmph_uint32)); + cs->total_length = 0; + + for(i = 0; i < cs->n; i++) + { + if(vals_table[i] == 0) + continue; + stored_value = vals_table[i] - ((1 << lengths[i]) - 1); + set_bits_at_pos(cs->store_table, cs->total_length, stored_value, lengths[i]); + cs->total_length += lengths[i]; + }; + + cs->rem_r = i_log2(cs->total_length/cs->n); + + if(cs->length_rems) + { + free(cs->length_rems); + } + + cs->length_rems = (cmph_uint32 *) calloc(BITS_TABLE_SIZE(cs->n, cs->rem_r), sizeof(cmph_uint32)); + + rems_mask = (1 << cs->rem_r) - 1; + cs->total_length = 0; + + for(i = 0; i < cs->n; i++) + { + cs->total_length += lengths[i]; + set_bits_value(cs->length_rems, i, cs->total_length & rems_mask, cs->rem_r, rems_mask); + lengths[i] = cs->total_length >> cs->rem_r; + }; + + select_init(&cs->sel); + + // FABIANO: before it was (cs->total_length >> cs->rem_r) + 1. But I wiped out the + 1 because + // I changed the select structure to work up to m, instead of up to m - 1. + select_generate(&cs->sel, lengths, cs->n, (cs->total_length >> cs->rem_r)); + + free(lengths); +}; + +cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs) +{ + register cmph_uint32 space_usage = select_get_space_usage(&cs->sel); + space_usage += ((cs->total_length + 31) >> 5) * sizeof(cmph_uint32) * 8; + space_usage += BITS_TABLE_SIZE(cs->n, cs->rem_r) * sizeof(cmph_uint32) * 8; + return 4 * sizeof(cmph_uint32) * 8 + space_usage; +} + +cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx) +{ + register cmph_uint32 enc_idx, enc_length; + register cmph_uint32 rems_mask; + register cmph_uint32 stored_value; + register cmph_uint32 sel_res; + + assert(idx < cs->n); // FABIANO ADDED + + rems_mask = (1 << cs->rem_r) - 1; + + if(idx == 0) + { + enc_idx = 0; + sel_res = select_query(&cs->sel, idx); + } + else + { + sel_res = select_query(&cs->sel, idx - 1); + + enc_idx = (sel_res - (idx - 1)) << cs->rem_r; + enc_idx += get_bits_value(cs->length_rems, idx-1, cs->rem_r, rems_mask); + + 
sel_res = select_next_query(&cs->sel, sel_res); + }; + + enc_length = (sel_res - idx) << cs->rem_r; + enc_length += get_bits_value(cs->length_rems, idx, cs->rem_r, rems_mask); + enc_length -= enc_idx; + if(enc_length == 0) + return 0; + + stored_value = get_bits_at_pos(cs->store_table, enc_idx, enc_length); + return stored_value + ((1 << enc_length) - 1); +}; + +void compressed_seq_dump(compressed_seq_t * cs, char ** buf, cmph_uint32 * buflen) +{ + register cmph_uint32 sel_size = select_get_space_usage(&cs->sel) >> 3; + register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r) * 4; + register cmph_uint32 store_table_size = ((cs->total_length + 31) >> 5) * 4; + register cmph_uint32 pos = 0; + char * buf_sel = 0; + cmph_uint32 buflen_sel = 0; + + *buflen = 4*sizeof(cmph_uint32) + sel_size + length_rems_size + store_table_size; + + DEBUGP("sel_size = %u\n", sel_size); + DEBUGP("length_rems_size = %u\n", length_rems_size); + DEBUGP("store_table_size = %u\n", store_table_size); + *buf = (char *)calloc(*buflen, sizeof(char)); + + if (!*buf) + { + *buflen = UINT_MAX; + return; + } + + // dumping n, rem_r and total_length + memcpy(*buf, &(cs->n), sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("n = %u\n", cs->n); + + memcpy(*buf + pos, &(cs->rem_r), sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("rem_r = %u\n", cs->rem_r); + + memcpy(*buf + pos, &(cs->total_length), sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("total_length = %u\n", cs->total_length); + + + // dumping sel + select_dump(&cs->sel, &buf_sel, &buflen_sel); + memcpy(*buf + pos, &buflen_sel, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("buflen_sel = %u\n", buflen_sel); + + memcpy(*buf + pos, buf_sel, buflen_sel); + #ifdef DEBUG + cmph_uint32 i = 0; + for(i = 0; i < buflen_sel; i++) + { + DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(*buf + pos + i)); + } + #endif + pos += buflen_sel; + + free(buf_sel); + + // dumping length_rems + 
memcpy(*buf + pos, cs->length_rems, length_rems_size); + #ifdef DEBUG + for(i = 0; i < length_rems_size; i++) + { + DEBUGP("pos = %u -- length_rems_size = %u -- length_rems[%u] = %u\n", pos, length_rems_size, i, *(*buf + pos + i)); + } + #endif + pos += length_rems_size; + + // dumping store_table + memcpy(*buf + pos, cs->store_table, store_table_size); + + #ifdef DEBUG + for(i = 0; i < store_table_size; i++) + { + DEBUGP("pos = %u -- store_table_size = %u -- store_table[%u] = %u\n", pos, store_table_size, i, *(*buf + pos + i)); + } + #endif + DEBUGP("Dumped compressed sequence structure with size %u bytes\n", *buflen); +} + +void compressed_seq_load(compressed_seq_t * cs, const char * buf, cmph_uint32 buflen) +{ + register cmph_uint32 pos = 0; + cmph_uint32 buflen_sel = 0; + register cmph_uint32 length_rems_size = 0; + register cmph_uint32 store_table_size = 0; + + // loading n, rem_r and total_length + memcpy(&(cs->n), buf, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("n = %u\n", cs->n); + + memcpy(&(cs->rem_r), buf + pos, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("rem_r = %u\n", cs->rem_r); + + memcpy(&(cs->total_length), buf + pos, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("total_length = %u\n", cs->total_length); + + // loading sel + memcpy(&buflen_sel, buf + pos, sizeof(cmph_uint32)); + pos += sizeof(cmph_uint32); + DEBUGP("buflen_sel = %u\n", buflen_sel); + + select_load(&cs->sel, buf + pos, buflen_sel); + #ifdef DEBUG + cmph_uint32 i = 0; + for(i = 0; i < buflen_sel; i++) + { + DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(buf + pos + i)); + } + #endif + pos += buflen_sel; + + // loading length_rems + if(cs->length_rems) + { + free(cs->length_rems); + } + length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r); + cs->length_rems = (cmph_uint32 *) calloc(length_rems_size, sizeof(cmph_uint32)); + length_rems_size *= 4; + memcpy(cs->length_rems, buf + pos, length_rems_size); + + #ifdef DEBUG + for(i = 0; 
i < length_rems_size; i++) + { + DEBUGP("pos = %u -- length_rems_size = %u -- length_rems[%u] = %u\n", pos, length_rems_size, i, *(buf + pos + i)); + } + #endif + pos += length_rems_size; + + // loading store_table + store_table_size = ((cs->total_length + 31) >> 5); + if(cs->store_table) + { + free(cs->store_table); + } + cs->store_table = (cmph_uint32 *) calloc(store_table_size, sizeof(cmph_uint32)); + store_table_size *= 4; + memcpy(cs->store_table, buf + pos, store_table_size); + + #ifdef DEBUG + for(i = 0; i < store_table_size; i++) + { + DEBUGP("pos = %u -- store_table_size = %u -- store_table[%u] = %u\n", pos, store_table_size, i, *(buf + pos + i)); + } + #endif + + DEBUGP("Loaded compressed sequence structure with size %u bytes\n", buflen); +} + +void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed) +{ + if (cs && cs_packed) + { + char *buf = NULL; + cmph_uint32 buflen = 0; + compressed_seq_dump(cs, &buf, &buflen); + memcpy(cs_packed, buf, buflen); + free(buf); + } + +} + +cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs) +{ + register cmph_uint32 sel_size = select_packed_size(&cs->sel); + register cmph_uint32 store_table_size = ((cs->total_length + 31) >> 5) * sizeof(cmph_uint32); + register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r) * sizeof(cmph_uint32); + return 4 * sizeof(cmph_uint32) + sel_size + store_table_size + length_rems_size; +} + + +cmph_int32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx) +{ + // unpacking cs_packed + register cmph_uint32 *ptr = (cmph_uint32 *)cs_packed; + register cmph_uint32 n = *ptr++; + register cmph_uint32 rem_r = *ptr++; + ptr++; // skipping total_length +// register cmph_uint32 total_length = *ptr++; + register cmph_uint32 buflen_sel = *ptr++; + register cmph_uint32 * sel_packed = ptr; + register cmph_uint32 * length_rems = (ptr += (buflen_sel >> 2)); + register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(n, rem_r); + register cmph_uint32 * store_table = 
(ptr += length_rems_size); + + // compressed sequence query computation + register cmph_uint32 enc_idx, enc_length; + register cmph_uint32 rems_mask; + register cmph_uint32 stored_value; + register cmph_uint32 sel_res; + + rems_mask = (1 << rem_r) - 1; + + if(idx == 0) + { + enc_idx = 0; + sel_res = select_query_packed(sel_packed, idx); + } + else + { + sel_res = select_query_packed(sel_packed, idx - 1); + + enc_idx = (sel_res - (idx - 1)) << rem_r; + enc_idx += get_bits_value(length_rems, idx-1, rem_r, rems_mask); + + sel_res = select_next_query_packed(sel_packed, sel_res); + }; + + enc_length = (sel_res - idx) << rem_r; + enc_length += get_bits_value(length_rems, idx, rem_r, rems_mask); + enc_length -= enc_idx; + if(enc_length == 0) + return 0; + + stored_value = get_bits_at_pos(store_table, enc_idx, enc_length); + return stored_value + ((1 << enc_length) - 1); +} diff --git a/src/compressed_seq.h b/src/compressed_seq.h new file mode 100644 index 0000000..8f53665 --- /dev/null +++ b/src/compressed_seq.h @@ -0,0 +1,84 @@ +#ifndef __CMPH_COMPRESSED_SEQ_H__ +#define __CMPH_COMPRESSED_SEQ_H__ + +#include"select.h" + +struct _compressed_seq_t +{ + cmph_uint32 n; // number of values stored in store_table + // The length in bits of each value is decomposed into two components: the lg(n) MSBs are stored in rank_select data structure + // the remaining LSBs are stored in a table of n cells, each one of rem_r bits. + cmph_uint32 rem_r; + cmph_uint32 total_length; // total length in bits of store_table + select_t sel; + cmph_uint32 * length_rems; + cmph_uint32 * store_table; +}; + +typedef struct _compressed_seq_t compressed_seq_t; + +/** \fn void compressed_seq_init(compressed_seq_t * cs); + * \brief Initialize a compressed sequence structure. 
+ * \param cs points to the compressed sequence structure to be initialized + */ +void compressed_seq_init(compressed_seq_t * cs); + +/** \fn void compressed_seq_destroy(compressed_seq_t * cs); + * \brief Destroy a compressed sequence given as input. + * \param cs points to the compressed sequence structure to be destroyed + */ +void compressed_seq_destroy(compressed_seq_t * cs); + +/** \fn void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n); + * \brief Generate a compressed sequence from an input array with n values. + * \param cs points to the compressed sequence structure + * \param vals_table pointer to the array given as input + * \param n number of values in @see vals_table + */ +void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n); + + +/** \fn cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx); + * \brief Returns the value stored at index @see idx of the compressed sequence structure. + * \param cs points to the compressed sequence structure + * \param idx index to retrieve the value from + * \return the value stored at index @see idx of the compressed sequence structure + */ +cmph_int32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx); + + +/** \fn cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs); + * \brief Returns amount of space (in bits) to store the compressed sequence. 
+ * \param cs points to the compressed sequence structure + * \return the amount of space (in bits) to store @see cs + */ +cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs); + +void compressed_seq_dump(compressed_seq_t * cs, char ** buf, cmph_uint32 * buflen); + +void compressed_seq_load(compressed_seq_t * cs, const char * buf, cmph_uint32 buflen); + + +/** \fn void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed); + * \brief Support the ability to pack a compressed sequence structure into a preallocated contiguous memory space pointed by cs_packed. + * \param cs points to the compressed sequence structure + * \param cs_packed pointer to the contiguous memory area used to store the compressed sequence structure. The size of cs_packed must be at least @see compressed_seq_packed_size + */ +void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed); + +/** \fn cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs); + * \brief Return the amount of space needed to pack a compressed sequence structure. + * \return the size of the packed compressed sequence structure or zero for failures + */ +cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs); + + +/** \fn cmph_int32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx); + * \brief Returns the value stored at index @see idx of the packed compressed sequence structure. 
+ * \param cs_packed is a pointer to a contiguous memory area + * \param idx is the index to retrieve the value from + * \return the value stored at index @see idx of the packed compressed sequence structure + */ +cmph_int32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx); + +#endif diff --git a/src/select.c b/src/select.c index b3a7c29..fdf127c 100644 --- a/src/select.c +++ b/src/select.c @@ -32,33 +32,23 @@ static inline void select_insert_1(cmph_uint32 * buffer) (*buffer) |= 0x80000000; }; -void select_init(select_t * sel, cmph_uint32 n, cmph_uint32 m) +void select_init(select_t * sel) { - register cmph_uint32 nbits; - register cmph_uint32 vec_size; - register cmph_uint32 sel_table_size; - sel->n = n; - sel->m = m; // n values in the range [0,m-1] - - nbits = sel->n + sel->m; - vec_size = (nbits + 31) >> 5; // (nbits + 31) >> 5 = (nbits + 31)/32 - - sel_table_size = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1; // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) - - sel->bits_vec = (cmph_uint32 *)calloc(vec_size, sizeof(cmph_uint32)); - - sel->select_table = (cmph_uint32 *)calloc(sel_table_size, sizeof(cmph_uint32)); + sel->n = 0; + sel->m = 0; + sel->bits_vec = 0; + sel->select_table = 0; }; -double select_get_space_usage(select_t * sel) +cmph_uint32 select_get_space_usage(select_t * sel) { register cmph_uint32 nbits; register cmph_uint32 vec_size; register cmph_uint32 sel_table_size; - register double space_usage; + register cmph_uint32 space_usage; nbits = sel->n + sel->m; - vec_size = (nbits + 31) >> 5; // (nbits + 31) >> 5 = (nbits + 31)/32 + vec_size = (nbits + 31) >> 5; sel_table_size = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1; // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) space_usage = 2 * sizeof(cmph_uint32) * 8; // n and m @@ -101,11 +91,36 @@ static inline void select_generate_sel_table(select_t * sel) }; }; -void select_generate(select_t * sel, cmph_uint32 * keys_vec) +void select_generate(select_t * 
sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph_uint32 m) { register cmph_uint32 i, j, idx; cmph_uint32 buffer = 0; + register cmph_uint32 nbits; + register cmph_uint32 vec_size; + register cmph_uint32 sel_table_size; + sel->n = n; + sel->m = m; // n values in the range [0,m-1] + + nbits = sel->n + sel->m; + vec_size = (nbits + 31) >> 5; // (nbits + 31) >> 5 = (nbits + 31)/32 + + sel_table_size = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1; // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) + + if(sel->bits_vec) + { + free(sel->bits_vec); + } + sel->bits_vec = (cmph_uint32 *)calloc(vec_size, sizeof(cmph_uint32)); + + if(sel->select_table) + { + free(sel->select_table); + } + sel->select_table = (cmph_uint32 *)calloc(sel_table_size, sizeof(cmph_uint32)); + + + idx = i = j = 0; for(;;) @@ -204,12 +219,11 @@ cmph_int32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx) return _select_next_query((cmph_uint8 *)sel->bits_vec, vec_bit_idx); }; - void select_dump(select_t *sel, char **buf, cmph_uint32 *buflen) { register cmph_uint32 nbits = sel->n + sel->m; - register cmph_uint32 vec_size = (nbits + 31) >> 5; // (nbits + 31) >> 5 = (nbits + 31)/32 - register cmph_uint32 sel_table_size = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1; // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) + register cmph_uint32 vec_size = ((nbits + 31) >> 5) * sizeof(cmph_uint32); // (nbits + 31) >> 5 = (nbits + 31)/32 + register cmph_uint32 sel_table_size = ((sel->n >> NBITS_STEP_SELECT_TABLE) + 1) * sizeof(cmph_uint32); // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) register cmph_uint32 pos = 0; *buflen = 2*sizeof(cmph_uint32) + vec_size + sel_table_size; @@ -246,14 +260,20 @@ void select_load(select_t * sel, const char *buf, cmph_uint32 buflen) pos += sizeof(cmph_uint32); nbits = sel->n + sel->m; - vec_size = (nbits + 31) >> 5; // (nbits + 31) >> 5 = (nbits + 31)/32 - sel_table_size = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1; // (sel->n >> 
NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) + vec_size = ((nbits + 31) >> 5) * sizeof(cmph_uint32); // (nbits + 31) >> 5 = (nbits + 31)/32 + sel_table_size = ((sel->n >> NBITS_STEP_SELECT_TABLE) + 1) * sizeof(cmph_uint32); // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) - if(sel->bits_vec) free(sel->bits_vec); - sel->bits_vec = (cmph_uint32 *)calloc(vec_size, sizeof(cmph_uint32)); + if(sel->bits_vec) + { + free(sel->bits_vec); + } + sel->bits_vec = (cmph_uint32 *)calloc(vec_size/sizeof(cmph_uint32), sizeof(cmph_uint32)); - if(sel->select_table) free(sel->select_table); - sel->select_table = (cmph_uint32 *)calloc(sel_table_size, sizeof(cmph_uint32)); + if(sel->select_table) + { + free(sel->select_table); + } + sel->select_table = (cmph_uint32 *)calloc(sel_table_size/sizeof(cmph_uint32), sizeof(cmph_uint32)); memcpy(sel->bits_vec, buf + pos, vec_size); pos += vec_size; @@ -288,8 +308,8 @@ void select_pack(select_t *sel, void *sel_packed) cmph_uint32 select_packed_size(select_t *sel) { register cmph_uint32 nbits = sel->n + sel->m; - register cmph_uint32 vec_size = (nbits + 31) >> 5; // (nbits + 31) >> 5 = (nbits + 31)/32 - register cmph_uint32 sel_table_size = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1; // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) + register cmph_uint32 vec_size = ((nbits + 31) >> 5) * sizeof(cmph_uint32); // (nbits + 31) >> 5 = (nbits + 31)/32 + register cmph_uint32 sel_table_size = ((sel->n >> NBITS_STEP_SELECT_TABLE) + 1) * sizeof(cmph_uint32); // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) return 2*sizeof(cmph_uint32) + vec_size + sel_table_size; } diff --git a/src/select.h b/src/select.h index bf2380b..a193ac2 100644 --- a/src/select.h +++ b/src/select.h @@ -1,5 +1,5 @@ -#ifndef SELECT_h -#define SELECT_h +#ifndef __CMPH_SELECT_H__ +#define __CMPH_SELECT_H__ #include "cmph_types.h" @@ -12,17 +12,17 @@ struct _select_t typedef struct _select_t select_t; -void 
select_init(select_t * sel, cmph_uint32 n, cmph_uint32 m); +void select_init(select_t * sel); void select_destroy(select_t * sel); -void select_generate(select_t * sel, cmph_uint32 * keys_vec); +void select_generate(select_t * sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph_uint32 m); cmph_int32 select_query(select_t * sel, cmph_uint32 one_idx); cmph_int32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx); -double select_get_space_usage(select_t * sel); +cmph_uint32 select_get_space_usage(select_t * sel); void select_dump(select_t *sel, char **buf, cmph_uint32 *buflen); @@ -30,7 +30,7 @@ void select_load(select_t * sel, const char *buf, cmph_uint32 buflen); /** \fn void select_pack(select_t *sel, void *sel_packed); - * \brief Support the ability to pack a select structure function into a preallocated contiguous memory space pointed by sel_packed. + * \brief Support the ability to pack a select structure into a preallocated contiguous memory space pointed by sel_packed. * \param sel points to the select structure * \param sel_packed pointer to the contiguous memory area used to store the select structure. 
The size of sel_packed must be at least @see select_packed_size */ diff --git a/tests/Makefile.am b/tests/Makefile.am index f0688a9..54f0b75 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,4 +1,4 @@ -noinst_PROGRAMS = graph_tests packed_mphf_tests mphf_tests select_tests +noinst_PROGRAMS = graph_tests packed_mphf_tests mphf_tests select_tests compressed_seq_tests INCLUDES = -I../src/ @@ -13,3 +13,6 @@ mphf_tests_LDADD = ../src/libcmph.la select_tests_SOURCES = select_tests.c select_tests_LDADD = ../src/libcmph.la + +compressed_seq_tests_SOURCES = compressed_seq_tests.c +compressed_seq_tests_LDADD = ../src/libcmph.la diff --git a/tests/select_tests.c b/tests/select_tests.c index 8edc30f..c7118a8 100644 --- a/tests/select_tests.c +++ b/tests/select_tests.c @@ -68,9 +68,9 @@ int main(int argc, char **argv) char * select_packed = NULL; cmph_uint32 select_pack_size = 0; - select_init(&sel, n, m); - select_generate(&sel, keys_vec); - fprintf(stderr, "Space usage = %f\n", select_get_space_usage(&sel)); + select_init(&sel); + select_generate(&sel, keys_vec, n, m); + fprintf(stderr, "Space usage = %u\n", select_get_space_usage(&sel)); print_values(&sel); fprintf(stderr, "Dumping select structure\n"); @@ -87,6 +87,7 @@ int main(int argc, char **argv) select_packed = (char *) calloc(select_pack_size, sizeof(char)); select_pack(&sel, select_packed); + select_destroy(&sel); fprintf(stderr, "Querying the packed select structure\n"); print_values_packed(select_packed);