commit 03519cc9c8f44a551c7714a365bc5b46f26190f5 Author: davi Date: Thu Dec 23 13:16:30 2004 +0000 Initial revision diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..427eedb --- /dev/null +++ b/AUTHORS @@ -0,0 +1,2 @@ +Davi de Castro Reis +Fabiano Cupertino Botelho diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d60c31a --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. 
+ + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) 
Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. 
+ + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. 
diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..b7626b7 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,159 @@ +2004-11-24 12:42 davi + + * src/czech.c: Fixed some leaks. + +2004-11-24 11:34 davi + + * src/: Makefile.am, cmph.h, cmph_types.h, czech.h, graph.h, + hash.h, jenkins_hash.h, types.h, vstack.h: Better header layout. + +2004-11-23 15:15 davi + + * src/jenkins_hash.c: Fixed trivial bug. + +2004-11-04 00:56 davi + + * src/: czech.c, graph.c: Forgot. + +2004-11-04 00:09 davi + + * src/: cmph.c, cmph.h, czech.c, czech.h, graph.c, main.c: Fixed + small bug due to fgets trick. + +2004-11-03 23:15 davi + + * src/: cmph.c, cmph.h, cmph_structs.c, cmph_structs.h, czech.c, + main.c: Added verbosity option + +2004-11-03 22:42 davi + + * src/czech.c: [no log message] + +2004-11-03 20:57 davi + + * src/: Makefile.am, fnv_hash.c, fnv_hash.h, hash.c, hash_state.h: + Added fnv hash function. + +2004-11-03 18:10 davi + + * src/: Makefile.am, cmph.c, cmph.h, czech.c, czech.h, djb2_hash.c, + hash.c, hash_state.h, jenkins_hash.c, main.c, sdbm_hash.c, + sdbm_hash.h: Added sdbm hash function. + +2004-11-03 15:53 davi + + * src/cmph.c, src/cmph.h, src/cmph_structs.c, src/cmph_structs.h, + src/czech.c, src/czech.h, src/czech_structs.h, src/main.c, + tests/Makefile.am, tests/czech_tests.c: Added callback structure to + retrieve keys in disk. + +2004-10-31 22:57 davi + + * src/: Makefile.am, djb2_hash.c, djb2_hash.h, hash.c, hash.h, + hash_funcs.h, jenkins_hash.h: Added new hash function. + +2004-10-31 20:53 davi + + * src/main.c: Added random seed. + +2004-10-31 20:48 davi + + * src/main.c, tests/czech_tests.c: Better getopt code. + +2004-10-31 19:27 davi + + * src/: cmph.c, cmph.h, cmph_structs.c, cmph_structs.h, czech.c, + czech.h, czech_structs.h, hash.c, hash.h, jenkins_hash.c, + jenkins_hash.h, main.c: Everything working flawlessly. + +2004-10-30 20:12 davi + + * src/cmph_structs.c: Added missing file. + +2004-10-30 20:09 davi + + * src/main.c: forgot. 
+ +2004-10-29 19:02 davi + + * src/: Makefile.am, cmph.c, cmph.h, cmph_structs.h, czech.c, + czech.h, czech_structs.h, graph.c, hash.c, hash.h, jenkins_hash.c, + jenkins_hash.h, main.c: Cleaned some warnings. + +2004-10-29 16:14 davi + + * src/: hash.c, hash_state.h: Missing file. + +2004-10-29 14:18 davi + + * src/: Makefile.am, cmph.c, cmph.h, czech.c, graph.c, hash.h, + jenkins_hash.c, jenkins_hash.h, jenkinshash.c, jenkinshash.h: + Working nice. Serialization almost there. + +2004-10-28 11:21 davi + + * Makefile.am, src/cmph.c, src/cmph_structs.h, src/czech.c, + src/czech_structs.h, src/jenkinshash.c: Hash generation seems to be + working fine. + +2004-10-27 15:19 davi + + * src/hash.h: Added hash header. + +2004-10-27 15:06 davi + + * COPYING, INSTALL, Makefile.am, src/czech.c, src/czech_structs.h, + src/graph.c, src/graph.h, src/vstack.c, tests/Makefile.am, + tests/czech_tests.c, tests/graph_tests.c: Cool. Now it is + serialization time. + +2004-10-27 13:06 davi + + * tests/: czech_tests.c, keys: Added missing files. + +2004-10-26 21:23 davi + + * src/graph.c, tests/Makefile.am: Still working on f*cking graph + implementation. + +2004-10-25 20:05 davi + + * src/: cmph.c, cmph.h, cmph_structs.h, czech.c, czech.h, + czech_structs.h, graph.c, graph.h, jenkinshash.c, jenkinshash.h, + vstack.c: Added some new files. + +2004-10-25 00:27 davi + + * src/: czech.c, czech.h, graph.c, graph.h, jenkinshash.c: Some + random code. + +2004-10-24 23:07 davi + + * src/: Makefile.am, debug.h, graph.c, main.c, vstack.c, vstack.h: + Added stack implementation. + +2004-10-24 21:57 davi + + * src/list.h: Added missing file. + +2004-10-24 21:50 davi + + * src/types.h: Added missing file. + +2004-10-22 20:30 davi + + * src/: Makefile.am, cmph.h, czech.c, czech.h, graph.c, graph.h, + jenkinshash.c, jenkinshash.h: Added a lot of files. 
+ +2004-10-19 17:08 davi + + * AUTHORS, COPYING, ChangeLog, INSTALL, Makefile.am, NEWS, README, + cmph.spec, configure.ac, src/Makefile.am, src/cmph.c, src/cmph.h, + src/main.c, tests/Makefile.am: Initial revision + +2004-10-19 17:08 davi + + * AUTHORS, COPYING, ChangeLog, INSTALL, Makefile.am, NEWS, README, + cmph.spec, configure.ac, src/Makefile.am, src/cmph.c, src/cmph.h, + src/main.c, tests/Makefile.am: Minimum perfect hashing library + diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..c735106 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,2 @@ +SUBDIRS = src tests +EXTRA_DIST = cmph.spec configure.ac diff --git a/cmph.spec b/cmph.spec new file mode 100644 index 0000000..19e796b --- /dev/null +++ b/cmph.spec @@ -0,0 +1,39 @@ +%define name cmph +%define version 0.2 +%define release 1 + +Name: %{name} +Version: %{version} +Release: %{release} +Summary: C Minimal perfect hash library +Source: %{name}-%{version}.tar.gz +License: Proprietary +URL: http://www.akwan.com.br +BuildArch: i386 +Group: Sitesearch +BuildRoot: %{_tmppath}/%{name}-root + +%description +C Minimal perfect hash library + +%prep +rm -Rf $RPM_BUILD_ROOT +rm -rf $RPM_BUILD_ROOT +%setup +mkdir $RPM_BUILD_ROOT +mkdir $RPM_BUILD_ROOT/usr +CXXFLAGS="-O2" ./configure --prefix=/usr/ + +%build +make + +%install +DESTDIR=$RPM_BUILD_ROOT make install + +%files +%defattr(755,root,root) +/ + +%changelog +* Tue Jun 1 2004 Davi de Castro Reis ++ Initial build diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..dcea170 --- /dev/null +++ b/configure.ac @@ -0,0 +1,23 @@ +dnl Process this file with autoconf to produce a configure script. +AC_INIT(Makefile.am) +AM_INIT_AUTOMAKE(cmph, 0.2) +AM_CONFIG_HEADER(config.h) + +dnl Checks for programs. +AC_PROG_AWK +AC_PROG_CC +AC_PROG_INSTALL +AC_PROG_LN_S +AC_PROG_LIBTOOL + +dnl Checks for headers +AC_CHECK_HEADERS([getopt.h math.h]) + +dnl Checks for libraries. 
+AC_CHECK_LIBM
+LDFLAGS="$LIBM $LDFLAGS"
+
+dnl Checks for library functions.
+
+dnl AC_OUTPUT(Makefile tests/Makefile samples/Makefile)
+AC_OUTPUT(Makefile src/Makefile tests/Makefile)
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
index 0000000..8a56ad1
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,22 @@
+bin_PROGRAMS = cmph
+lib_LTLIBRARIES = libcmph.la
+include_HEADERS = cmph.h cmph_types.h
+libcmph_la_SOURCES = debug.h\
+	cmph_types.h\
+	hash.h hash_state.h hash.c\
+	jenkins_hash.h jenkins_hash.c\
+	djb2_hash.h djb2_hash.c\
+	sdbm_hash.h sdbm_hash.c\
+	fnv_hash.h fnv_hash.c\
+	vstack.h vstack.c\
+	vqueue.h vqueue.c\
+	graph.h graph.c\
+	cmph.h cmph.c\
+	cmph_structs.h cmph_structs.c\
+	czech.h czech_structs.h czech.c\
+	bmz.h bmz_structs.h bmz.c
+
+libcmph_la_LDFLAGS = -version-info 0:0:0
+
+cmph_SOURCES = main.c
+cmph_LDADD = libcmph.la
diff --git a/src/bmz.c b/src/bmz.c
new file mode 100644
index 0000000..61a1b5b
--- /dev/null
+++ b/src/bmz.c
@@ -0,0 +1,439 @@
+#include "bmz.h"
+#include "cmph_structs.h"
+#include "bmz_structs.h"
+#include "hash.h"
+#include "vqueue.h"
+
+/* NOTE(review): the targets of the system includes were missing in this copy
+ * of the patch; they are restored from the library names used below. */
+#include <math.h>       /* ceil, floor */
+#include <stdlib.h>     /* malloc, calloc, free */
+#include <stdio.h>      /* FILE, fprintf, fwrite */
+#include <assert.h>     /* assert */
+#include <string.h>
+#include <limits.h>     /* UINT_MAX */
+#include <netinet/in.h> /* htonl */
+
+//#define DEBUG
+#include "debug.h"
+
+/* Sentinel stored in g[] for a vertex that has not been labelled yet. */
+static uint32 UNDEFINED = UINT_MAX;
+
+static int bmz_gen_edges(mph_t *mph);
+static void bmz_traverse_critical_nodes(bmz_mph_data_t *bmz, uint32 v, uint32 * biggest_g_value, uint32 * biggest_edge_value, uint8 * used_edges);
+static void bmz_traverse_non_critical_nodes(bmz_mph_data_t *bmz, uint8 * used_edges);
+
+/*
+ * Create a generic mph_t tagged MPH_BMZ and attach a fresh bmz_mph_data_t.
+ * Both hash slots default to HASH_JENKINS; g, graph and hashes start as NULL.
+ * Returns NULL when either allocation fails (the mph_t is destroyed again if
+ * the second one fails).
+ */
+mph_t *bmz_mph_new(key_source_t *key_source)
+{
+	mph_t *mph = NULL;
+	bmz_mph_data_t *bmz = NULL;
+	mph = __mph_new(MPH_BMZ, key_source);
+	if (mph == NULL) return NULL;
+	bmz = (bmz_mph_data_t *)malloc(sizeof(bmz_mph_data_t));
+	if (bmz == NULL)
+	{
+		__mph_destroy(mph);
+		return NULL;
+	}
+	bmz->hashfuncs[0] = HASH_JENKINS;
+	bmz->hashfuncs[1] = HASH_JENKINS;
+	bmz->g = NULL;
+	bmz->graph = NULL;
+	bmz->hashes = NULL;
+	mph->data = bmz;
+	assert(mph->data);
+	return mph;
+}
+
+/*
+ * Free the bmz_mph_data_t attached to mph, then the mph_t itself.
+ * Only the struct is freed here; its g, graph and hashes members are not
+ * touched (bmz_mph_create() resets them to NULL before it returns).
+ */
+void bmz_mph_destroy(mph_t *mph)
+{
+	bmz_mph_data_t *data = (bmz_mph_data_t *)mph->data;
+	DEBUGP("Destroying algorithm dependent data\n");
+	free(data);
+	__mph_destroy(mph);
+}
+
+/*
+ * Copy hash function ids from hashfuncs into the configuration.
+ * hashfuncs is read until a HASH_COUNT entry is found; at most the first two
+ * entries are used.
+ */
+void bmz_mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs)
+{
+	bmz_mph_data_t *bmz = (bmz_mph_data_t *)mph->data;
+	CMPH_HASH *hashptr = hashfuncs;
+	uint32 i = 0;
+	while(*hashptr != HASH_COUNT)
+	{
+		if (i >= 2) break; //bmz only uses two hash functions
+		bmz->hashfuncs[i] = *hashptr;
+		++i, ++hashptr;
+	}
+}
+
+/*
+ * Release everything bmz_mph_create() has built so far (graph, the two hash
+ * states, the hashes array and g) and reset the pointers to NULL, so that a
+ * failed build leaves no dangling pointers behind.
+ */
+static void bmz_release_build_state(bmz_mph_data_t *bmz)
+{
+	uint32 i;
+	if (bmz->graph != NULL) graph_destroy(bmz->graph);
+	bmz->graph = NULL;
+	if (bmz->hashes != NULL)
+	{
+		for (i = 0; i < 2; ++i)
+		{
+			if (bmz->hashes[i] != NULL) hash_state_destroy(bmz->hashes[i]);
+		}
+		free(bmz->hashes);
+	}
+	bmz->hashes = NULL;
+	free(bmz->g);
+	bmz->g = NULL;
+}
+
+/*
+ * Build the minimal perfect hash function for the keys of mph->key_source.
+ *
+ * bmz_c scales the graph: m = nkeys edges, n = ceil(bmz_c * nkeys) vertices.
+ * The mapping step is attempted up to 10 times with freshly created hash
+ * states until bmz_gen_edges() succeeds; then critical vertices are labelled,
+ * followed by the non critical ones.
+ *
+ * On success the g array and the hashes array are handed over to the returned
+ * mphf_t (the fields in the configuration are reset to NULL).
+ * Returns NULL when every mapping attempt fails or when an allocation fails;
+ * in both cases everything built so far is released.
+ */
+mphf_t *bmz_mph_create(mph_t *mph, float bmz_c)
+{
+	mphf_t *mphf = NULL;
+	bmz_mphf_data_t *bmzf = NULL;
+
+	uint32 i;
+	uint32 iterations = 10;
+	uint8 *used_edges = NULL;
+	uint32 biggest_g_value = 0;
+	uint32 biggest_edge_value = 1;
+	bmz_mph_data_t *bmz = (bmz_mph_data_t *)mph->data;
+
+	DEBUGP("bmz_c: %f\n", bmz_c);
+	bmz->m = mph->key_source->nkeys;
+	bmz->n = ceil(bmz_c * mph->key_source->nkeys);
+	DEBUGP("m (edges): %u n (vertices): %u bmz_c: %f\n", bmz->m, bmz->n, bmz_c);
+	bmz->graph = graph_new(bmz->n, bmz->m);
+	if (bmz->graph == NULL) return NULL;
+	DEBUGP("Created graph\n");
+
+	/* Three slots are allocated and cleared; only the first two are filled. */
+	bmz->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3);
+	if (bmz->hashes == NULL)
+	{
+		bmz_release_build_state(bmz);
+		return NULL;
+	}
+	for(i = 0; i < 3; ++i) bmz->hashes[i] = NULL;
+
+	// Mapping step
+	if (mph->verbosity)
+	{
+		fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bmz->m, bmz->n);
+	}
+	while(1)
+	{
+		int ok;
+		DEBUGP("hash function 1\n");
+		bmz->hashes[0] = hash_state_new(bmz->hashfuncs[0], bmz->n);
+		DEBUGP("hash function 2\n");
+		bmz->hashes[1] = hash_state_new(bmz->hashfuncs[1], bmz->n);
+		DEBUGP("Generating edges\n");
+		ok = bmz_gen_edges(mph);
+		if (!ok)
+		{
+			--iterations;
+			hash_state_destroy(bmz->hashes[0]);
+			bmz->hashes[0] = NULL;
+			hash_state_destroy(bmz->hashes[1]);
+			bmz->hashes[1] = NULL;
+			DEBUGP("%u iterations remaining\n", iterations);
+			if (mph->verbosity)
+			{
+				fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations);
+			}
+			if (iterations == 0) break;
+		}
+		else break;
+	}
+	if (iterations == 0)
+	{
+		/* Also frees the hashes array and clears bmz->graph, which the
+		 * previous code leaked / left dangling on this path. */
+		bmz_release_build_state(bmz);
+		return NULL;
+	}
+
+	// Ordering step
+	if (mph->verbosity)
+	{
+		fprintf(stderr, "Starting ordering step\n");
+	}
+
+	graph_obtain_critical_nodes(bmz->graph);
+
+	// Searching step
+	if (mph->verbosity)
+	{
+		fprintf(stderr, "Starting Searching step\n");
+		fprintf(stderr, "\tTraversing critical vertices.\n");
+	}
+	DEBUGP("Searching step\n");
+	used_edges = (uint8 *)calloc(bmz->m, sizeof(uint8));
+	free(bmz->g); /* drop a g left over from an earlier call */
+	bmz->g = (uint32 *)malloc(bmz->n * sizeof(uint32));
+	if (used_edges == NULL || bmz->g == NULL)
+	{
+		free(used_edges);
+		bmz_release_build_state(bmz);
+		return NULL;
+	}
+	for (i = 0; i < bmz->n; ++i) bmz->g[i] = UNDEFINED;
+
+	for (i = 0; i < bmz->n; ++i) // critical nodes
+	{
+		if (graph_node_is_critical(bmz->graph, i) && (bmz->g[i] == UNDEFINED))
+		{
+			bmz_traverse_critical_nodes(bmz, i, &biggest_g_value, &biggest_edge_value, used_edges);
+		}
+	}
+	if (mph->verbosity)
+	{
+		fprintf(stderr, "\tTraversing non critical vertices.\n");
+	}
+
+	bmz_traverse_non_critical_nodes(bmz, used_edges); // non_critical_nodes
+	graph_destroy(bmz->graph);
+	free(used_edges);
+	bmz->graph = NULL;
+
+	mphf = (mphf_t *)malloc(sizeof(mphf_t));
+	/* Sized for bmz_mphf_data_t, the type actually stored here (the previous
+	 * code used sizeof(bmz_mph_data_t), a different struct). */
+	bmzf = (bmz_mphf_data_t *)malloc(sizeof(bmz_mphf_data_t));
+	if (mphf == NULL || bmzf == NULL)
+	{
+		free(mphf);
+		free(bmzf);
+		bmz_release_build_state(bmz);
+		return NULL;
+	}
+	mphf->algo = mph->algo;
+	bmzf->g = bmz->g;
+	bmz->g = NULL; //transfer memory ownership
+	bmzf->hashes = bmz->hashes;
+	bmz->hashes = NULL; //transfer memory ownership
+	bmzf->n = bmz->n;
+	bmzf->m = bmz->m;
+	mphf->data = bmzf;
+	mphf->size = bmz->m;
+	DEBUGP("Successfully generated minimal perfect hash\n");
+	if (mph->verbosity)
+	{
+		fprintf(stderr, "Successfully generated minimal perfect hash function\n");
+	}
+	return mphf;
+}
+
+/*
+ * Label the critical vertices reachable from v, visiting them through a queue.
+ *
+ * v receives ceil(*biggest_edge_value / 2) - 1. Every unlabelled critical
+ * neighbour u then gets the first candidate above *biggest_g_value for which
+ * no sum g[u] + g[lav], over its already labelled critical neighbours lav,
+ * hits a slot already set in used_edges. The sums of the chosen label are
+ * marked in used_edges, and *biggest_g_value / *biggest_edge_value are updated
+ * for the caller.
+ */
+static void bmz_traverse_critical_nodes(bmz_mph_data_t *bmz, uint32 v, uint32 * biggest_g_value, uint32 * biggest_edge_value, uint8 * used_edges)
+{
+	uint32 next_g;
+	uint32 u;   /* Auxiliary vertex */
+	uint32 lav; /* lookahead vertex */
+	uint8 collision;
+	vqueue_t * q = vqueue_new(graph_ncritical_nodes(bmz->graph));
+	graph_iterator_t it, it1;
+
+	DEBUGP("Labelling critical vertices\n");
+	bmz->g[v] = (uint32)ceil ((double)(*biggest_edge_value)/2) - 1;
+	next_g = (uint32)floor((double)(*biggest_edge_value/2)); /* next_g is advanced inside the collision loop below */
+	*biggest_g_value = next_g;
+	vqueue_insert(q, v);
+	while(!vqueue_is_empty(q))
+	{
+		v = vqueue_remove(q);
+		it = graph_neighbors_it(bmz->graph, v);
+		while ((u = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR)
+		{
+			if (graph_node_is_critical(bmz->graph, u) && (bmz->g[u] == UNDEFINED))
+			{
+				collision = 1;
+				while(collision) // lookahead to resolve collisions
+				{
+					next_g = *biggest_g_value + 1;
+					it1 = graph_neighbors_it(bmz->graph, u);
+					collision = 0;
+					while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR)
+					{
+						if (graph_node_is_critical(bmz->graph, lav) && (bmz->g[lav] != UNDEFINED))
+						{
+							assert(next_g + bmz->g[lav] < bmz->m);
+							if (used_edges[next_g + bmz->g[lav]])
+							{
+								collision = 1;
+								break;
+							}
+						}
+					}
+					/* Each pass, colliding or not, raises the largest candidate seen. */
+					if (next_g > *biggest_g_value) *biggest_g_value = next_g;
+				}
+				// Marking used edges...
+				it1 = graph_neighbors_it(bmz->graph, u);
+				while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR)
+				{
+					if (graph_node_is_critical(bmz->graph, lav) && (bmz->g[lav] != UNDEFINED))
+					{
+						used_edges[next_g + bmz->g[lav]] = 1;
+						if(next_g + bmz->g[lav] > *biggest_edge_value) *biggest_edge_value = next_g + bmz->g[lav];
+					}
+				}
+				bmz->g[u] = next_g; // Labelling vertex u.
+				vqueue_insert(q, u);
+			}
+		}
+
+	}
+	vqueue_destroy(q);
+
+}
+
+/*
+ * Return the first index >= unused_edge_index whose used_edges entry is 0.
+ * Asserts that the scan stays below bmz->m.
+ */
+static uint32 next_unused_edge(bmz_mph_data_t *bmz, uint8 * used_edges, uint32 unused_edge_index)
+{
+	while(1)
+	{
+		assert(unused_edge_index < bmz->m);
+		if(used_edges[unused_edge_index]) unused_edge_index ++;
+		else break;
+	}
+	return unused_edge_index;
+}
+
+/*
+ * Recursively label the unlabelled neighbours of the labelled vertex v.
+ * Each one takes the next free index after *unused_edge_index and is given
+ * g[neighbor] = index - g[v], so the edge (v, neighbor) sums to that index.
+ */
+static void bmz_traverse(bmz_mph_data_t *bmz, uint8 * used_edges, uint32 v, uint32 * unused_edge_index)
+{
+	graph_iterator_t it = graph_neighbors_it(bmz->graph, v);
+	uint32 neighbor = 0;
+	while((neighbor = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR)
+	{
+		DEBUGP("Visiting neighbor %u\n", neighbor);
+		if(bmz->g[neighbor] != UNDEFINED) continue;
+		*unused_edge_index = next_unused_edge(bmz, used_edges, *unused_edge_index + 1);
+		bmz->g[neighbor] = *unused_edge_index - bmz->g[v];
+		bmz_traverse(bmz, used_edges, neighbor, unused_edge_index);
+	}
+}
+
+/*
+ * Label every vertex that is still UNDEFINED after the critical pass.
+ * First, each edge with exactly one labelled endpoint is expanded from that
+ * endpoint; then any vertex still unlabelled gets g = 0 and is expanded
+ * from there.
+ */
+static void bmz_traverse_non_critical_nodes(bmz_mph_data_t *bmz, uint8 * used_edges)
+{
+
+	/* unused_edge_index starts at (uint32)-1 so that the "+ 1" in
+	 * bmz_traverse() wraps to index 0 on first use. */
+	uint32 i, v1, v2, unused_edge_index = -1;
+	DEBUGP("Labelling non critical vertices\n");
+	for(i = 0; i < bmz->m; i++)
+	{
+		v1 = graph_vertex_id(bmz->graph, i, 0);
+		v2 = graph_vertex_id(bmz->graph, i, 1);
+		if((bmz->g[v1] != UNDEFINED && bmz->g[v2] != UNDEFINED) || (bmz->g[v1] == UNDEFINED && bmz->g[v2] == UNDEFINED)) continue;
+		if(bmz->g[v1] != UNDEFINED) bmz_traverse(bmz, used_edges, v1, &unused_edge_index);
+		else bmz_traverse(bmz, used_edges, v2, &unused_edge_index);
+	}
+
+	for(i = 0; i < bmz->n; i++)
+	{
+		if(bmz->g[i] == UNDEFINED)
+		{
+			bmz->g[i] = 0;
+			bmz_traverse(bmz, used_edges, i, &unused_edge_index);
+		}
+	}
+
+}
+
+/*
+ * Mapping step: clear the graph, rewind the key source and add one edge
+ * (h1, h2) per key, both hashes being reduced modulo n. When h1 == h2, h2 is
+ * moved to the next vertex (wrapping to 0).
+ * Returns 0 as soon as a self loop or an edge already in the graph is found,
+ * 1 when every key was added.
+ */
+static int bmz_gen_edges(mph_t *mph)
+{
+	uint32 e;
+	bmz_mph_data_t *bmz = (bmz_mph_data_t *)mph->data;
+	uint8 multiple_edges = 0;
+
+	DEBUGP("Generating edges for %u vertices\n", bmz->n);
+	graph_clear_edges(bmz->graph);
+	mph->key_source->rewind(mph->key_source->data);
+	for (e = 0; e < mph->key_source->nkeys; ++e)
+	{
+		uint32 h1, h2;
+		uint32 keylen;
+		char *key;
+		mph->key_source->read(mph->key_source->data, &key, &keylen);
+		h1 = hash(bmz->hashes[0], key, keylen) % bmz->n;
+		h2 = hash(bmz->hashes[1], key, keylen) % bmz->n;
+		if (h1 == h2) if (++h2 >= bmz->n) h2 = 0;
+		if (h1 == h2)
+		{
+			/* e is an unsigned integer: print it with %u (was %e). */
+			if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e);
+			mph->key_source->dispose(mph->key_source->data, key, keylen);
+			return 0;
+		}
+		DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key);
+		mph->key_source->dispose(mph->key_source->data, key, keylen);
+		multiple_edges = graph_contains_edge(bmz->graph, h1, h2);
+		if (mph->verbosity && multiple_edges) fprintf(stderr, "A non simple graph was generated\n");
+		if (multiple_edges) return 0; // checking multiple edge restriction.
+		graph_add_edge(bmz->graph, h1, h2);
+	}
+	return !multiple_edges;
+}
+
+/*
+ * Write the function to fd after the generic header from __mphf_dump():
+ * the number of hash functions (2), each hash state as <length><bytes>,
+ * then n, m and the n entries of g. All integers go through htonl().
+ * Always returns 1; the fwrite() results are not checked.
+ */
+int bmz_mphf_dump(mphf_t *mphf, FILE *fd)
+{
+	char *buf = NULL;
+	uint32 buflen;
+	uint32 nbuflen;
+	uint32 i;
+	uint32 two = htonl(2); //number of hash functions
+	bmz_mphf_data_t *data = (bmz_mphf_data_t *)mphf->data;
+	uint32 nn, nm;
+	__mphf_dump(mphf, fd);
+
+	fwrite(&two, sizeof(uint32), 1, fd);
+
+	hash_state_dump(data->hashes[0], &buf, &buflen);
+	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
+	nbuflen = htonl(buflen);
+	fwrite(&nbuflen, sizeof(uint32), 1, fd);
+	fwrite(buf, buflen, 1, fd);
+	free(buf);
+
+	hash_state_dump(data->hashes[1], &buf, &buflen);
+	DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
+	nbuflen = htonl(buflen);
+	fwrite(&nbuflen, sizeof(uint32), 1, fd);
+	fwrite(buf, buflen, 1, fd);
+	free(buf);
+
+	nn = htonl(data->n);
+	fwrite(&nn, sizeof(uint32), 1, fd);
+	nm = htonl(data->m);
+	fwrite(&nm, sizeof(uint32), 1, fd);
+
+	for (i = 0; i < data->n; ++i)
+	{
+		uint32 ng = htonl(data->g[i]);
+		fwrite(&ng, sizeof(uint32), 1, fd);
+	}
+	#ifdef DEBUG
+	fprintf(stderr, "G: ");
+	for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]);
+	fprintf(stderr, "\n");
+	#endif
+	return 1;
+}
+
+void bmz_mphf_load(FILE
*f, mphf_t *mphf) +{ + uint32 nhashes; + char fbuf[BUFSIZ]; + char *buf = NULL; + uint32 buflen; + uint32 i; + hash_state_t *state; + bmz_mphf_data_t *bmz = (bmz_mphf_data_t *)malloc(sizeof(bmz_mphf_data_t)); + + DEBUGP("Loading bmz mphf\n"); + mphf->data = bmz; + fread(&nhashes, sizeof(uint32), 1, f); + nhashes = ntohl(nhashes); + bmz->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1)); + bmz->hashes[nhashes] = NULL; + DEBUGP("Reading %u hashes\n", nhashes); + for (i = 0; i < nhashes; ++i) + { + hash_state_t *state = NULL; + fread(&buflen, sizeof(uint32), 1, f); + buflen = ntohl(buflen); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc(buflen); + fread(buf, buflen, 1, f); + state = hash_state_load(buf, buflen); + bmz->hashes[i] = state; + free(buf); + } + + DEBUGP("Reading m and n\n"); + fread(&(bmz->n), sizeof(uint32), 1, f); + bmz->n = ntohl(bmz->n); + fread(&(bmz->m), sizeof(uint32), 1, f); + bmz->m = ntohl(bmz->m); + + bmz->g = (uint32 *)malloc(sizeof(uint32)*bmz->n); + fread(bmz->g, bmz->n*sizeof(uint32), 1, f); + for (i = 0; i < bmz->n; ++i) bmz->g[i] = ntohl(bmz->g[i]); + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < bmz->n; ++i) fprintf(stderr, "%u ", bmz->g[i]); + fprintf(stderr, "\n"); + #endif + return; +} + + +uint32 bmz_mphf_search(mphf_t *mphf, const char *key, uint32 keylen) +{ + bmz_mphf_data_t *bmz = mphf->data; + uint32 h1 = hash(bmz->hashes[0], key, keylen) % bmz->n; + uint32 h2 = hash(bmz->hashes[1], key, keylen) % bmz->n; + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 > bmz->n) h2 = 0; + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, bmz->g[h1], bmz->g[h2], bmz->m); + return bmz->g[h1] + bmz->g[h2]; +} +void bmz_mphf_destroy(mphf_t *mphf) +{ + bmz_mphf_data_t *data = (bmz_mphf_data_t *)mphf->data; + free(data->g); + hash_state_destroy(data->hashes[0]); + hash_state_destroy(data->hashes[1]); + free(data->hashes); + free(data); + free(mphf); +} diff --git 
a/src/bmz.h b/src/bmz.h new file mode 100644 index 0000000..6dc3ca2 --- /dev/null +++ b/src/bmz.h @@ -0,0 +1,18 @@ +#ifndef __BMZ_H__ +#define __BMZ_H__ + +#include "graph.h" +#include "cmph.h" + +typedef struct __bmz_mphf_data_t bmz_mphf_data_t; +typedef struct __bmz_mph_data_t bmz_mph_data_t; + +mph_t *bmz_mph_new(key_source_t *key_source); +void bmz_mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs); +void bmz_mph_destroy(mph_t *mph); +mphf_t *bmz_mph_create(mph_t *mph, float bmz_c); + +void bmz_mphf_load(FILE *f, mphf_t *mphf); +int bmz_mphf_dump(mphf_t *mphf, FILE *f); +uint32 bmz_mphf_search(mphf_t *mphf, const char *key, uint32 keylen); +#endif diff --git a/src/bmz_structs.h b/src/bmz_structs.h new file mode 100644 index 0000000..4f6d131 --- /dev/null +++ b/src/bmz_structs.h @@ -0,0 +1,24 @@ +#ifndef __BMZ_STRUCTS_H__ +#define __BMZ_STRUCTS_H__ + +#include "hash_state.h" + +struct __bmz_mphf_data_t +{ + uint32 m; //edges (words) count + uint32 n; //vertex count + uint32 *g; + hash_state_t **hashes; +}; + +struct __bmz_mph_data_t +{ + CMPH_HASH hashfuncs[2]; + uint32 m; //edges (words) count + uint32 n; //vertex count + graph_t *graph; + uint32 *g; + hash_state_t **hashes; +}; + +#endif diff --git a/src/cmph.c b/src/cmph.c new file mode 100644 index 0000000..5432924 --- /dev/null +++ b/src/cmph.c @@ -0,0 +1,169 @@ +#include "cmph.h" +#include "cmph_structs.h" +#include "czech.h" +#include "bmz.h" +//#include "bmz.h" /* included -- Fabiano */ + +#include +#include + +//#define DEBUG +#include "debug.h" + +const char *mph_names[] = { "czech", "bmz", NULL }; /* included -- Fabiano */ + +mph_t *mph_new(MPH_ALGO algo, key_source_t *key_source) +{ + mph_t *mph = NULL; + DEBUGP("Creating mph with algorithm %s\n", mph_names[algo]); + switch (algo) + { + case MPH_CZECH: + mph = czech_mph_new(key_source); + break; + case MPH_BMZ: /* included -- Fabiano */ + DEBUGP("new bmz algorithm \n"); + mph = bmz_mph_new(key_source); + break; + default: + assert(0); + } + 
assert(mph); + return mph; +} + +void mph_destroy(mph_t *mph) +{ + DEBUGP("Destroying mph with algo %s\n", mph_names[mph->algo]); + switch (mph->algo) + { + case MPH_CZECH: + czech_mph_destroy(mph); + break; + case MPH_BMZ: /* included -- Fabiano */ + bmz_mph_destroy(mph); + break; + default: + assert(0); + } +} + +void mph_set_verbosity(mph_t *mph, uint32 verbosity) +{ + mph->verbosity = verbosity; +} + +void mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs) +{ + switch (mph->algo) + { + case MPH_CZECH: + czech_mph_set_hashfuncs(mph, hashfuncs); + break; + case MPH_BMZ: /* included -- Fabiano */ + bmz_mph_set_hashfuncs(mph, hashfuncs); + break; + default: + break; + } + return; +} + +mphf_t *mph_create(mph_t *mph) +{ + mphf_t *mphf = NULL; + switch (mph->algo) + { + case MPH_CZECH: + DEBUGP("Creating czech hash\n"); + mphf = czech_mph_create(mph, 2.09); + break; + case MPH_BMZ: /* included -- Fabiano */ + DEBUGP("Creating bmz hash\n"); + mphf = bmz_mph_create(mph, 1.10); + break; + default: + assert(0); + } + return mphf; +} + +int mphf_dump(mphf_t *mphf, FILE *f) +{ + switch (mphf->algo) + { + case MPH_CZECH: + return czech_mphf_dump(mphf, f); + break; + case MPH_BMZ: /* included -- Fabiano */ + return bmz_mphf_dump(mphf, f); + break; + default: + assert(0); + } + assert(0); + return 0; +} +mphf_t *mphf_load(FILE *f) +{ + mphf_t *mphf = NULL; + DEBUGP("Loading mphf generic parts\n"); + mphf = __mphf_load(f); + if (mphf == NULL) return NULL; + DEBUGP("Loading mphf algorithm dependent parts\n"); + + switch (mphf->algo) + { + case MPH_CZECH: + czech_mphf_load(f, mphf); + break; + case MPH_BMZ: /* included -- Fabiano */ + DEBUGP("Loading bmz algorithm dependent parts\n"); + bmz_mphf_load(f, mphf); + break; + default: + assert(0); + } + DEBUGP("Loaded mphf\n"); + return mphf; +} + + +uint32 mphf_search(mphf_t *mphf, const char *key, uint32 keylen) +{ + DEBUGP("mphf algorithm: %u \n", mphf->algo); + switch(mphf->algo) + { + case MPH_CZECH: + return 
czech_mphf_search(mphf, key, keylen); + case MPH_BMZ: /* included -- Fabiano */ + DEBUGP("bmz algorithm search\n"); + return bmz_mphf_search(mphf, key, keylen); + default: + assert(0); + } + assert(0); + return; +} + +uint32 mphf_size(mphf_t *mphf) +{ + return mphf->size; +} + +void mphf_destroy(mphf_t *mphf) +{ + switch(mphf->algo) + { + case MPH_CZECH: + czech_mphf_destroy(mphf); + return; + case MPH_BMZ: /* included -- Fabiano */ + bmz_mphf_destroy(mphf); + return; + default: + assert(0); + } + assert(0); + return; +} diff --git a/src/cmph.h b/src/cmph.h new file mode 100644 index 0000000..2224a42 --- /dev/null +++ b/src/cmph.h @@ -0,0 +1,44 @@ +#ifndef __CMPH_H__ +#define __CMPH_H__ + +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "cmph_types.h" + +typedef struct __mph_t mph_t; +typedef struct __mphf_t mphf_t; + +typedef struct +{ + void *data; + uint32 nkeys; + int (*read)(void *, char **, uint32 *); + void (*dispose)(void *, char *, uint32); + void (*rewind)(void *); +} key_source_t; + +/** Hash generation API **/ +mph_t *mph_new(MPH_ALGO algo, key_source_t *key_source); +void mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs); +void mph_set_verbosity(mph_t *mph, uint32 verbosity); +void mph_destroy(mph_t *mph); +mphf_t *mph_create(mph_t *mph); + +/** Hash querying API **/ +mphf_t *mphf_load(FILE *f); +int mphf_dump(mphf_t *mphf, FILE *f); +uint32 mphf_search(mphf_t *mphf, const char *key, uint32 keylen); +uint32 mphf_size(mphf_t *mphf); +void mphf_destroy(mphf_t *mphf); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/cmph_structs.c b/src/cmph_structs.c new file mode 100644 index 0000000..6c2f6dc --- /dev/null +++ b/src/cmph_structs.c @@ -0,0 +1,68 @@ +#include "cmph_structs.h" + +#include + +#define DEBUG +#include "debug.h" + +mph_t *__mph_new(MPH_ALGO algo, key_source_t *key_source) +{ + mph_t *mph = (mph_t *)malloc(sizeof(mph_t)); + DEBUGP("Creating mph with algorithm %s\n", mph_names[algo]); + if (mph == NULL) 
return NULL; + mph->algo = algo; + mph->key_source = key_source; + mph->verbosity = 0; + return mph; +} + +void __mph_destroy(mph_t *mph) +{ + free(mph); +} + +void __mphf_dump(mphf_t *mphf, FILE *fd) +{ + uint32 nsize = htonl(mphf->size); + fwrite(mph_names[mphf->algo], strlen(mph_names[mphf->algo]) + 1, 1, fd); + fwrite(&nsize, sizeof(mphf->size), 1, fd); +} +mphf_t *__mphf_load(FILE *f) +{ + mphf_t *mphf = NULL; + uint32 i; + char algo_name[BUFSIZ]; + char *ptr = algo_name; + MPH_ALGO algo = MPH_COUNT; + + DEBUGP("Loading mphf\n"); + while(1) + { + uint32 c = fread(ptr, 1, 1, f); + if (c != 1) return NULL; + if (*ptr == 0) break; + ++ptr; + } + for(i = 0; i < MPH_COUNT; ++i) + { + if (strcmp(algo_name, mph_names[i]) == 0) + { + algo = i; + } + } + if (algo == MPH_COUNT) + { + DEBUGP("Algorithm %s not found\n", algo_name); + return NULL; + } + mphf = (mphf_t *)malloc(sizeof(mphf_t)); + mphf->algo = algo; + fread(&(mphf->size), sizeof(mphf->size), 1, f); + mphf->size = ntohl(mphf->size); + mphf->data = NULL; + DEBUGP("Algorithm is %s and mphf is sized %u\n", mph_names[algo], mphf->size); + + return mphf; +} + + diff --git a/src/cmph_structs.h b/src/cmph_structs.h new file mode 100644 index 0000000..9b1e634 --- /dev/null +++ b/src/cmph_structs.h @@ -0,0 +1,32 @@ +#ifndef __CMPH_STRUCTS_H__ +#define __CMPH_STRUCTS_H__ + +#include "cmph.h" + +/** Hash generation algorithm data + */ +struct __mph_t +{ + MPH_ALGO algo; + key_source_t *key_source; + uint32 verbosity; + void *data; //algorithm dependent data +}; + +/** Hash querying algorithm data + */ +struct __mphf_t +{ + MPH_ALGO algo; + uint32 size; + key_source_t *key_source; + void *data; //algorithm dependent data +}; + +mph_t *__mph_new(MPH_ALGO algo, key_source_t *key_source); +void __mph_destroy(); +void __mphf_dump(mphf_t *mphf, FILE *); +mphf_t *__mphf_load(FILE *f); + + +#endif diff --git a/src/cmph_types.h b/src/cmph_types.h new file mode 100644 index 0000000..84b6a22 --- /dev/null +++ b/src/cmph_types.h @@ 
-0,0 +1,13 @@ +#ifndef __CMPH_TYPES_H__ +#define __CMPH_TYPES_H__ + +typedef unsigned char uint8; +typedef unsigned short uint16; +typedef unsigned int uint32; + +typedef enum { HASH_JENKINS, HASH_DJB2, HASH_SDBM, HASH_FNV, HASH_GLIB, HASH_PJW, HASH_COUNT } CMPH_HASH; +extern const char *hash_names[]; +typedef enum { MPH_CZECH, MPH_BMZ, MPH_COUNT } MPH_ALGO; /* included -- Fabiano */ +extern const char *mph_names[]; + +#endif diff --git a/src/czech.c b/src/czech.c new file mode 100644 index 0000000..d699e84 --- /dev/null +++ b/src/czech.c @@ -0,0 +1,320 @@ +#include "czech.h" +#include "cmph_structs.h" +#include "czech_structs.h" +#include "hash.h" + +#include +#include +#include +#include +#include +#include + +//#define DEBUG +#include "debug.h" + +static int czech_gen_edges(mph_t *mph); +static void czech_traverse(czech_mph_data_t *czech, char *visited, uint32 v); + +mph_t *czech_mph_new(key_source_t *key_source) +{ + mph_t *mph = NULL; + czech_mph_data_t *czech = NULL; + mph = __mph_new(MPH_CZECH, key_source); + if (mph == NULL) return NULL; + czech = (czech_mph_data_t *)malloc(sizeof(czech_mph_data_t)); + if (czech == NULL) + { + __mph_destroy(mph); + return NULL; + } + czech->hashfuncs[0] = HASH_JENKINS; + czech->hashfuncs[1] = HASH_JENKINS; + czech->g = NULL; + czech->graph = NULL; + czech->hashes = NULL; + mph->data = czech; + assert(mph->data); + return mph; +} +void czech_mph_destroy(mph_t *mph) +{ + czech_mph_data_t *data = (czech_mph_data_t *)mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + free(data); + __mph_destroy(mph); +} + +void czech_mph_set_hashfuncs(mph_t *mph, CMPH_HASH *hashfuncs) +{ + czech_mph_data_t *czech = (czech_mph_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + uint32 i = 0; + while(*hashptr != HASH_COUNT) + { + if (i >= 2) break; //czech only uses two hash functions + czech->hashfuncs[i] = *hashptr; + ++i, ++hashptr; + } +} + +mphf_t *czech_mph_create(mph_t *mph, float c) +{ + mphf_t *mphf = NULL; + 
czech_mphf_data_t *czechf = NULL; + + uint32 i; + uint32 iterations = 10; + char *visited = NULL; + czech_mph_data_t *czech = (czech_mph_data_t *)mph->data; + czech->m = mph->key_source->nkeys; + czech->n = ceil(c * mph->key_source->nkeys); + DEBUGP("m (edges): %u n (vertices): %u c: %f\n", czech->m, czech->n, c); + czech->graph = graph_new(czech->n, czech->m); + DEBUGP("Created graph\n"); + + czech->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3); + for(i = 0; i < 3; ++i) czech->hashes[i] = NULL; + //Mapping step + if (mph->verbosity) + { + fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", czech->m, czech->n); + } + while(1) + { + int ok; + czech->hashes[0] = hash_state_new(czech->hashfuncs[0], czech->n); + czech->hashes[1] = hash_state_new(czech->hashfuncs[1], czech->n); + ok = czech_gen_edges(mph); + if (!ok) + { + --iterations; + hash_state_destroy(czech->hashes[0]); + czech->hashes[0] = NULL; + hash_state_destroy(czech->hashes[1]); + czech->hashes[1] = NULL; + DEBUGP("%u iterations remaining\n", iterations); + if (mph->verbosity) + { + fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations); + } + if (iterations == 0) break; + } + else break; + } + if (iterations == 0) + { + graph_destroy(czech->graph); + return NULL; + } + + //Assignment step + if (mph->verbosity) + { + fprintf(stderr, "Starting assignment step\n"); + } + DEBUGP("Assignment step\n"); + visited = (char *)malloc(czech->n); + memset(visited, 0, czech->n); + free(czech->g); + czech->g = malloc(czech->n * sizeof(uint32)); + assert(czech->g); + for (i = 0; i < czech->n; ++i) + { + if (!visited[i]) + { + czech->g[i] = 0; + czech_traverse(czech, visited, i); + } + } + graph_destroy(czech->graph); + free(visited); + czech->graph = NULL; + + mphf = (mphf_t *)malloc(sizeof(mphf_t)); + mphf->algo = mph->algo; + czechf = (czech_mphf_data_t *)malloc(sizeof(czech_mph_data_t)); + czechf->g = czech->g; + czech->g = NULL; 
//transfer memory ownership + czechf->hashes = czech->hashes; + czech->hashes = NULL; //transfer memory ownership + czechf->n = czech->n; + czechf->m = czech->m; + mphf->data = czechf; + mphf->size = czech->m; + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + return mphf; +} + +static void czech_traverse(czech_mph_data_t *czech, char *visited, uint32 v) +{ + + graph_iterator_t it = graph_neighbors_it(czech->graph, v); + uint32 neighbor = 0; + visited[v] = 1; + + DEBUGP("Visiting vertex %u\n", v); + while((neighbor = graph_next_neighbor(czech->graph, &it)) != GRAPH_NO_NEIGHBOR) + { + DEBUGP("Visiting neighbor %u\n", neighbor); + if(visited[neighbor]) continue; + DEBUGP("Visiting neighbor %u\n", neighbor); + DEBUGP("Visiting edge %u->%u with id %u\n", v, neighbor, graph_edge_id(czech->graph, v, neighbor)); + czech->g[neighbor] = graph_edge_id(czech->graph, v, neighbor) - czech->g[v]; + DEBUGP("g is %u (%u - %u mod %u)\n", czech->g[neighbor], graph_edge_id(czech->graph, v, neighbor), czech->g[v], czech->m); + czech_traverse(czech, visited, neighbor); + } +} + +static int czech_gen_edges(mph_t *mph) +{ + uint32 e; + czech_mph_data_t *czech = (czech_mph_data_t *)mph->data; + int cycles = 0; + + DEBUGP("Generating edges for %u vertices\n", czech->n); + graph_clear_edges(czech->graph); + mph->key_source->rewind(mph->key_source->data); + for (e = 0; e < mph->key_source->nkeys; ++e) + { + uint32 h1, h2; + uint32 keylen; + char *key; + mph->key_source->read(mph->key_source->data, &key, &keylen); + h1 = hash(czech->hashes[0], key, keylen) % czech->n; + h2 = hash(czech->hashes[1], key, keylen) % czech->n; + if (h1 == h2) if (++h2 >= czech->n) h2 = 0; + if (h1 == h2) + { + if (mph->verbosity) fprintf(stderr, "Self loop for key %e\n", e); + mph->key_source->dispose(mph->key_source->data, key, keylen); + return 0; + } + DEBUGP("Adding edge: %u -> %u for key %s\n", 
h1, h2, key); + mph->key_source->dispose(mph->key_source->data, key, keylen); + graph_add_edge(czech->graph, h1, h2); + } + cycles = graph_is_cyclic(czech->graph); + if (mph->verbosity && cycles) fprintf(stderr, "Cyclic graph generated\n"); + DEBUGP("Looking for cycles: %u\n", cycles); + + return ! cycles; +} + +int czech_mphf_dump(mphf_t *mphf, FILE *fd) +{ + char *buf = NULL; + uint32 buflen; + uint32 nbuflen; + uint32 i; + uint32 two = htonl(2); //number of hash functions + czech_mphf_data_t *data = (czech_mphf_data_t *)mphf->data; + uint32 nn, nm; + __mphf_dump(mphf, fd); + + fwrite(&two, sizeof(uint32), 1, fd); + + hash_state_dump(data->hashes[0], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbuflen = htonl(buflen); + fwrite(&nbuflen, sizeof(uint32), 1, fd); + fwrite(buf, buflen, 1, fd); + free(buf); + + hash_state_dump(data->hashes[1], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbuflen = htonl(buflen); + fwrite(&nbuflen, sizeof(uint32), 1, fd); + fwrite(buf, buflen, 1, fd); + free(buf); + + nn = htonl(data->n); + fwrite(&nn, sizeof(uint32), 1, fd); + nm = htonl(data->m); + fwrite(&nm, sizeof(uint32), 1, fd); + + for (i = 0; i < data->n; ++i) + { + uint32 ng = htonl(data->g[i]); + fwrite(&ng, sizeof(uint32), 1, fd); + } + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]); + fprintf(stderr, "\n"); + #endif + return 1; +} + +void czech_mphf_load(FILE *f, mphf_t *mphf) +{ + uint32 nhashes; + char fbuf[BUFSIZ]; + char *buf = NULL; + uint32 buflen; + uint32 i; + hash_state_t *state; + czech_mphf_data_t *czech = (czech_mphf_data_t *)malloc(sizeof(czech_mphf_data_t)); + + DEBUGP("Loading czech mphf\n"); + mphf->data = czech; + fread(&nhashes, sizeof(uint32), 1, f); + nhashes = ntohl(nhashes); + czech->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1)); + czech->hashes[nhashes] = NULL; + DEBUGP("Reading %u hashes\n", 
nhashes); + for (i = 0; i < nhashes; ++i) + { + hash_state_t *state = NULL; + fread(&buflen, sizeof(uint32), 1, f); + buflen = ntohl(buflen); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc(buflen); + fread(buf, buflen, 1, f); + state = hash_state_load(buf, buflen); + czech->hashes[i] = state; + free(buf); + } + + DEBUGP("Reading m and n\n"); + fread(&(czech->n), sizeof(uint32), 1, f); + czech->n = ntohl(czech->n); + fread(&(czech->m), sizeof(uint32), 1, f); + czech->m = ntohl(czech->m); + + czech->g = (uint32 *)malloc(sizeof(uint32)*czech->n); + fread(czech->g, czech->n*sizeof(uint32), 1, f); + for (i = 0; i < czech->n; ++i) czech->g[i] = ntohl(czech->g[i]); + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < czech->n; ++i) fprintf(stderr, "%u ", czech->g[i]); + fprintf(stderr, "\n"); + #endif + return; +} + + +uint32 czech_mphf_search(mphf_t *mphf, const char *key, uint32 keylen) +{ + czech_mphf_data_t *czech = mphf->data; + uint32 h1 = hash(czech->hashes[0], key, keylen) % czech->n; + uint32 h2 = hash(czech->hashes[1], key, keylen) % czech->n; + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 > czech->n) h2 = 0; + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, czech->g[h1], czech->g[h2], czech->m); + return (czech->g[h1] + czech->g[h2]) % czech->m; +} +void czech_mphf_destroy(mphf_t *mphf) +{ + czech_mphf_data_t *data = (czech_mphf_data_t *)mphf->data; + free(data->g); + hash_state_destroy(data->hashes[0]); + hash_state_destroy(data->hashes[1]); + free(data->hashes); + free(data); + free(mphf); +} diff --git a/src/czech.h b/src/czech.h new file mode 100644 index 0000000..9bb758f --- /dev/null +++ b/src/czech.h @@ -0,0 +1,18 @@ +#ifndef __CZECH_H__ +#define __CZECH_H__ + +#include "graph.h" +#include "cmph.h" + +typedef struct __czech_mphf_data_t czech_mphf_data_t; +typedef struct __czech_mph_data_t czech_mph_data_t; + +mph_t *czech_mph_new(key_source_t *key_source); +void czech_mph_set_hashfuncs(mph_t 
*mph, CMPH_HASH *hashfuncs); +void czech_mph_destroy(mph_t *mph); +mphf_t *czech_mph_create(mph_t *mph, float c); + +void czech_mphf_load(FILE *f, mphf_t *mphf); +int czech_mphf_dump(mphf_t *mphf, FILE *f); +uint32 czech_mphf_search(mphf_t *mphf, const char *key, uint32 keylen); +#endif diff --git a/src/czech_structs.h b/src/czech_structs.h new file mode 100644 index 0000000..3db81c8 --- /dev/null +++ b/src/czech_structs.h @@ -0,0 +1,24 @@ +#ifndef __CZECH_STRUCTS_H__ +#define __CZECH_STRUCTS_H__ + +#include "hash_state.h" + +struct __czech_mphf_data_t +{ + uint32 m; //edges (words) count + uint32 n; //vertex count + uint32 *g; + hash_state_t **hashes; +}; + +struct __czech_mph_data_t +{ + CMPH_HASH hashfuncs[2]; + uint32 m; //edges (words) count + uint32 n; //vertex count + graph_t *graph; + uint32 *g; + hash_state_t **hashes; +}; + +#endif diff --git a/src/debug.h b/src/debug.h new file mode 100644 index 0000000..49b564e --- /dev/null +++ b/src/debug.h @@ -0,0 +1,15 @@ +#ifndef __MY_DEBUGC__ +#define __MY_DEBUGC__ + +#ifdef __cplusplus +#include +#else +#include +#endif +#ifdef DEBUG +#define DEBUGP(args...) do { fprintf(stderr, "%s:%d ", __FILE__, __LINE__); fprintf(stderr, ## args); } while(0) +#else +#define DEBUGP(args...) 
+#endif + +#endif diff --git a/src/djb2_hash.c b/src/djb2_hash.c new file mode 100644 index 0000000..433452b --- /dev/null +++ b/src/djb2_hash.c @@ -0,0 +1,42 @@ +#include "djb2_hash.h" +#include + +djb2_state_t *djb2_state_new() +{ + djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t)); + state->hashfunc = HASH_DJB2; + return state; +} + +void djb2_state_destroy(djb2_state_t *state) +{ + free(state); +} + +uint32 djb2_hash(djb2_state_t *state, const char *k, uint32 keylen) +{ + register unsigned int hash = 5381; + const unsigned char *ptr = k; + int i = 0; + while (i < keylen) + { + hash = hash*33 ^ *ptr; + ++ptr, ++i; + } + return hash; +} + + +void djb2_state_dump(djb2_state_t *state, char **buf, uint32 *buflen) +{ + *buf = NULL; + *buflen = 0; + return; +} + +djb2_state_t *djb2_state_load(const char *buf, uint32 buflen) +{ + djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t)); + state->hashfunc = HASH_DJB2; + return state; +} diff --git a/src/djb2_hash.h b/src/djb2_hash.h new file mode 100644 index 0000000..0ea3998 --- /dev/null +++ b/src/djb2_hash.h @@ -0,0 +1,17 @@ +#ifndef __DJB2_HASH_H__ +#define __DJB2_HASH_H__ + +#include "hash.h" + +typedef struct __djb2_state_t +{ + CMPH_HASH hashfunc; +} djb2_state_t; + +djb2_state_t *djb2_state_new(); +uint32 djb2_hash(djb2_state_t *state, const char *k, uint32 keylen); +void djb2_state_dump(djb2_state_t *state, char **buf, uint32 *buflen); +djb2_state_t *djb2_state_load(const char *buf, uint32 buflen); +void djb2_state_destroy(djb2_state_t *state); + +#endif diff --git a/src/fnv_hash.c b/src/fnv_hash.c new file mode 100644 index 0000000..513fc56 --- /dev/null +++ b/src/fnv_hash.c @@ -0,0 +1,46 @@ +#include "fnv_hash.h" +#include + +fnv_state_t *fnv_state_new() +{ + fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t)); + state->hashfunc = HASH_FNV; + return state; +} + +void fnv_state_destroy(fnv_state_t *state) +{ + free(state); +} + +uint32 fnv_hash(fnv_state_t *state, const 
char *k, uint32 keylen) +{ + const unsigned char *bp = (const unsigned char *)k; + const unsigned char *be = bp + keylen; + static unsigned int hval = 0; + + while (bp < be) + { + + //hval *= 0x01000193; good for non-gcc compiler + hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); //good for gcc + + hval ^= *bp++; + } + return hval; +} + + +void fnv_state_dump(fnv_state_t *state, char **buf, uint32 *buflen) +{ + *buf = NULL; + *buflen = 0; + return; +} + +fnv_state_t *fnv_state_load(const char *buf, uint32 buflen) +{ + fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t)); + state->hashfunc = HASH_FNV; + return state; +} diff --git a/src/fnv_hash.h b/src/fnv_hash.h new file mode 100644 index 0000000..a17717e --- /dev/null +++ b/src/fnv_hash.h @@ -0,0 +1,17 @@ +#ifndef __FNV_HASH_H__ +#define __FNV_HASH_H__ + +#include "hash.h" + +typedef struct __fnv_state_t +{ + CMPH_HASH hashfunc; +} fnv_state_t; + +fnv_state_t *fnv_state_new(); +uint32 fnv_hash(fnv_state_t *state, const char *k, uint32 keylen); +void fnv_state_dump(fnv_state_t *state, char **buf, uint32 *buflen); +fnv_state_t *fnv_state_load(const char *buf, uint32 buflen); +void fnv_state_destroy(fnv_state_t *state); + +#endif diff --git a/src/graph.c b/src/graph.c new file mode 100644 index 0000000..a2a927a --- /dev/null +++ b/src/graph.c @@ -0,0 +1,329 @@ +#include "graph.h" + +#include +#include +#include +#include +#include +#include "vstack.h" + +//#define DEBUG +#include "debug.h" + +#define abs_edge(e, i) (e % g->nedges + i * g->nedges) + +struct __graph_t +{ + uint32 nnodes; + uint32 nedges; + uint32 *edges; + uint32 *first; + uint32 *next; + uint8 *critical_nodes; /* included -- Fabiano*/ + uint32 ncritical_nodes; /* included -- Fabiano*/ + uint32 cedges; + int shrinking; +}; + +static uint32 EMPTY = UINT_MAX; + +graph_t *graph_new(uint32 nnodes, uint32 nedges) +{ + graph_t *graph = (graph_t *)malloc(sizeof(graph_t)); + if (!graph) return NULL; + + graph->edges = 
(uint32 *)malloc(sizeof(uint32) * 2 * nedges); + graph->next = (uint32 *)malloc(sizeof(uint32) * 2 * nedges); + graph->first = (uint32 *)malloc(sizeof(uint32) * nnodes); + graph->critical_nodes = NULL; /* included -- Fabiano*/ + graph->ncritical_nodes = 0; /* included -- Fabiano*/ + graph->nnodes = nnodes; + graph->nedges = nedges; + + graph_clear_edges(graph); + return graph; +} + + +void graph_destroy(graph_t *graph) +{ + DEBUGP("Destroying graph\n"); + free(graph->edges); + free(graph->first); + free(graph->next); + free(graph->critical_nodes); /* included -- Fabiano*/ + free(graph); + return; +} + +void graph_print(graph_t *g) +{ + uint32 i, e; + for (i = 0; i < g->nnodes; ++i) + { + DEBUGP("Printing edges connected to %u\n", i); + e = g->first[i]; + if (e != EMPTY) + { + printf("%u -> %u\n", g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); + while ((e = g->next[e]) != EMPTY) + { + printf("%u -> %u\n", g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); + } + } + + } + return; +} + +void graph_add_edge(graph_t *g, uint32 v1, uint32 v2) +{ + uint32 e = g->cedges; + + assert(v1 < g->nnodes); + assert(v2 < g->nnodes); + assert(e < g->nedges); + assert(!g->shrinking); + + g->next[e] = g->first[v1]; + g->first[v1] = e; + g->edges[e] = v2; + + g->next[e + g->nedges] = g->first[v2]; + g->first[v2] = e + g->nedges; + g->edges[e + g->nedges] = v1; + + ++(g->cedges); +} + +static int check_edge(graph_t *g, uint32 e, uint32 v1, uint32 v2) +{ + DEBUGP("Checking edge %u %u looking for %u %u\n", g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)], v1, v2); + if (g->edges[abs_edge(e, 0)] == v1 && g->edges[abs_edge(e, 1)] == v2) return 1; + if (g->edges[abs_edge(e, 0)] == v2 && g->edges[abs_edge(e, 1)] == v1) return 1; + return 0; +} + +uint32 graph_edge_id(graph_t *g, uint32 v1, uint32 v2) +{ + uint32 e; + e = g->first[v1]; + assert(e != EMPTY); + if (check_edge(g, e, v1, v2)) return abs_edge(e, 0); + do + { + e = g->next[e]; + assert(e != EMPTY); + } + while 
(!check_edge(g, e, v1, v2)); + return abs_edge(e, 0); +} +static void del_edge_point(graph_t *g, uint32 v1, uint32 v2) +{ + uint32 e, prev; + + DEBUGP("Deleting edge point %u %u\n", v1, v2); + e = g->first[v1]; + if (check_edge(g, e, v1, v2)) + { + g->first[v1] = g->next[e]; + //g->edges[e] = EMPTY; + DEBUGP("Deleted\n"); + return; + } + DEBUGP("Checking linked list\n"); + do + { + prev = e; + e = g->next[e]; + assert(e != EMPTY); + } + while (!check_edge(g, e, v1, v2)); + + g->next[prev] = g->next[e]; + //g->edges[e] = EMPTY; + DEBUGP("Deleted\n"); +} + + +void graph_del_edge(graph_t *g, uint32 v1, uint32 v2) +{ + g->shrinking = 1; + del_edge_point(g, v1, v2); + del_edge_point(g, v2, v1); +} + +void graph_clear_edges(graph_t *g) +{ + uint32 i; + for (i = 0; i < g->nnodes; ++i) g->first[i] = EMPTY; + for (i = 0; i < g->nedges*2; ++i) + { + g->edges[i] = EMPTY; + g->next[i] = EMPTY; + } + g->cedges = 0; + g->shrinking = 0; +} + +static int find_degree1_edge(graph_t *g, uint32 v, char *deleted, uint32 *e) +{ + uint32 edge = g->first[v]; + char found = 0; + DEBUGP("Checking degree of vertex %u\n", v); + if (edge == EMPTY) return 0; + else if (!deleted[abs_edge(edge, 0)]) + { + found = 1; + *e = edge; + } + while(1) + { + edge = g->next[edge]; + if (edge == EMPTY) break; + if (deleted[abs_edge(edge, 0)]) continue; + if (found) return 0; + DEBUGP("Found first edge\n"); + *e = edge; + found = 1; + } + return found; +} + +static void cyclic_del_edge(graph_t *g, uint32 v, char *deleted) +{ + + uint32 e; + char degree1; + uint32 v1 = v; + uint32 v2 = 0; + + degree1 = find_degree1_edge(g, v1, deleted, &e); + if (!degree1) return; + while(1) + { + DEBUGP("Deleting edge %u (%u->%u)\n", e, g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); + deleted[abs_edge(e, 0)] = 1; + + v2 = g->edges[abs_edge(e, 0)]; + if (v2 == v1) v2 = g->edges[abs_edge(e, 1)]; + + DEBUGP("Checking if second endpoint %u has degree 1\n", v2); + degree1 = find_degree1_edge(g, v2, deleted, &e); + if 
(degree1) + { + DEBUGP("Inspecting vertex %u\n", v2); + v1 = v2; + } + else break; + } +} + +int graph_is_cyclic(graph_t *g) +{ + uint32 i; + uint32 v; + char *deleted = (char *)malloc(g->nedges*sizeof(char)); + memset(deleted, 0, g->nedges); + + DEBUGP("Looking for cycles in graph with %u vertices and %u edges\n", g->nnodes, g->nedges); + for (v = 0; v < g->nnodes; ++v) + { + cyclic_del_edge(g, v, deleted); + } + for (i = 0; i < g->nedges; ++i) + { + if (!(deleted[i])) + { + DEBUGP("Edge %u %u->%u was not deleted\n", i, g->edges[i], g->edges[i + g->nedges]); + free(deleted); + return 1; + } + } + free(deleted); + return 0; +} + +uint8 graph_node_is_critical(graph_t * g, uint32 v) /* included -- Fabiano */ +{ + return g->critical_nodes[v]; +} + +void graph_obtain_critical_nodes(graph_t *g) /* included -- Fabiano*/ +{ + uint32 i; + uint32 v; + char *deleted = (char *)malloc(g->nedges*sizeof(char)); + memset(deleted, 0, g->nedges); +/* g->critical_nodes = (uint8 *)malloc((size_t)(ceil(g->nnodes*sizeof(uint8)/8.))); */ + g->critical_nodes = (uint8 *)malloc(g->nnodes*sizeof(uint8)); + g->ncritical_nodes = 0; + DEBUGP("Looking for the 2-core in graph with %u vertices and %u edges\n", g->nnodes, g->nedges); + for (v = 0; v < g->nnodes; ++v) + { + cyclic_del_edge(g, v, deleted); + } + + for (i = 0; i < g->nedges; ++i) + { + if (!(deleted[i])) + { + DEBUGP("Edge %u %u->%u belongs to the 2-core\n", i, g->edges[i], g->edges[i + g->nedges]); + if(!(g->critical_nodes[g->edges[i]])) + { + g->ncritical_nodes ++; + g->critical_nodes[g->edges[i]] = 1; + } + if(!(g->critical_nodes[g->edges[i + g->nedges]])) + { + g->ncritical_nodes ++; + g->critical_nodes[g->edges[i + g->nedges]] = 1; + } + } + } + free(deleted); +} + +uint8 graph_contains_edge(graph_t *g, uint32 v1, uint32 v2) /* included -- Fabiano*/ +{ + uint32 e; + e = g->first[v1]; + if(e == EMPTY) return 0; + if (check_edge(g, e, v1, v2)) return 1; + do + { + e = g->next[e]; + if(e == EMPTY) return 0; + } + while 
(!check_edge(g, e, v1, v2)); + return 1; +} + +uint32 graph_vertex_id(graph_t *g, uint32 e, uint32 id) /* included -- Fabiano*/ +{ + return (g->edges[e + id*g->nedges]); +} + +uint32 graph_ncritical_nodes(graph_t *g) /* included -- Fabiano*/ +{ + return g->ncritical_nodes; +} + +graph_iterator_t graph_neighbors_it(graph_t *g, uint32 v) +{ + graph_iterator_t it; + it.vertex = v; + it.edge = g->first[v]; + return it; +} +uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it) +{ + uint32 ret; + if(it->edge == EMPTY) return GRAPH_NO_NEIGHBOR; + if (g->edges[it->edge] == it->vertex) ret = g->edges[it->edge + g->nedges]; + else ret = g->edges[it->edge]; + it->edge = g->next[it->edge]; + return ret; +} + + diff --git a/src/graph.h b/src/graph.h new file mode 100644 index 0000000..c512746 --- /dev/null +++ b/src/graph.h @@ -0,0 +1,39 @@ +#ifndef _CMPH_GRAPH_H__ +#define _CMPH_GRAPH_H__ + +#include +#include "cmph_types.h" + +#define GRAPH_NO_NEIGHBOR UINT_MAX + +typedef struct __graph_t graph_t; +typedef struct __graph_iterator_t graph_iterator_t; +struct __graph_iterator_t +{ + uint32 vertex; + uint32 edge; +}; + + + +graph_t *graph_new(uint32 nnodes, uint32 nedges); +void graph_destroy(graph_t *graph); + +void graph_add_edge(graph_t *g, uint32 v1, uint32 v2); +//void graph_del_edge(graph_t *g, uint32 v1, uint32 v2); +void graph_clear_edges(graph_t *g); +uint32 graph_edge_id(graph_t *g, uint32 v1, uint32 v2); + +graph_iterator_t graph_neighbors_it(graph_t *g, uint32 v); +uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it); + +void graph_obtain_critical_nodes(graph_t *g); /* included -- Fabiano*/ +uint8 graph_node_is_critical(graph_t * g, uint32 v); /* included -- Fabiano */ +uint32 graph_ncritical_nodes(graph_t *g); /* included -- Fabiano*/ +uint32 graph_vertex_id(graph_t *g, uint32 e, uint32 id); /* included -- Fabiano*/ + +int graph_is_cyclic(graph_t *g); + +void graph_print(graph_t *); + +#endif diff --git a/src/hash.c b/src/hash.c new file mode 100644 
index 0000000..896c9fb --- /dev/null +++ b/src/hash.c @@ -0,0 +1,139 @@ +#include "hash_state.h" +#include +#include +#include +#include + +//#define DEBUG +#include "debug.h" + +const char *hash_names[] = { "jenkins", "djb2", "sdbm", "fnv", "glib", "pjw", NULL }; + +hash_state_t *hash_state_new(CMPH_HASH hashfunc, uint32 hashsize) +{ + hash_state_t *state = NULL; + switch (hashfunc) + { + case HASH_JENKINS: + DEBUGP("Jenkins function - %u\n", hashsize); + state = (hash_state_t *)jenkins_state_new(hashsize); + DEBUGP("Jenkins function created\n"); + break; + case HASH_DJB2: + state = (hash_state_t *)djb2_state_new(); + break; + case HASH_SDBM: + state = (hash_state_t *)sdbm_state_new(); + break; + case HASH_FNV: + state = (hash_state_t *)fnv_state_new(); + break; + default: + assert(0); + } + state->hashfunc = hashfunc; + return state; +} +uint32 hash(hash_state_t *state, const char *key, uint32 keylen) +{ + switch (state->hashfunc) + { + case HASH_JENKINS: + return jenkins_hash((jenkins_state_t *)state, key, keylen); + case HASH_DJB2: + return djb2_hash((djb2_state_t *)state, key, keylen); + case HASH_SDBM: + return sdbm_hash((sdbm_state_t *)state, key, keylen); + case HASH_FNV: + return fnv_hash((fnv_state_t *)state, key, keylen); + default: + assert(0); + } + assert(0); + return 0; +} + +void hash_state_dump(hash_state_t *state, char **buf, uint32 *buflen) +{ + char *algobuf; + switch (state->hashfunc) + { + case HASH_JENKINS: + jenkins_state_dump((jenkins_state_t *)state, &algobuf, buflen); + if (*buflen == UINT_MAX) return; + break; + case HASH_DJB2: + djb2_state_dump((djb2_state_t *)state, &algobuf, buflen); + if (*buflen == UINT_MAX) return; + break; + case HASH_SDBM: + sdbm_state_dump((sdbm_state_t *)state, &algobuf, buflen); + if (*buflen == UINT_MAX) return; + break; + case HASH_FNV: + fnv_state_dump((fnv_state_t *)state, &algobuf, buflen); + if (*buflen == UINT_MAX) return; + break; + default: + assert(0); + } + *buf = 
malloc(strlen(hash_names[state->hashfunc]) + 1 + *buflen); + memcpy(*buf, hash_names[state->hashfunc], strlen(hash_names[state->hashfunc]) + 1); + DEBUGP("Algobuf is %u\n", *(uint32 *)algobuf); + memcpy(*buf + strlen(hash_names[state->hashfunc]) + 1, algobuf, *buflen); + *buflen = strlen(hash_names[state->hashfunc]) + 1 + *buflen; + free(algobuf); + return; +} + +hash_state_t *hash_state_load(const char *buf, uint32 buflen) +{ + uint32 i; + uint32 offset; + CMPH_HASH hashfunc = HASH_COUNT; + for (i = 0; i < HASH_COUNT; ++i) + { + if (strcmp(buf, hash_names[i]) == 0) + { + hashfunc = i; + break; + } + } + if (hashfunc == HASH_COUNT) return NULL; + offset = strlen(hash_names[hashfunc]) + 1; + switch (hashfunc) + { + case HASH_JENKINS: + return (hash_state_t *)jenkins_state_load(buf + offset, buflen - offset); + case HASH_DJB2: + return (hash_state_t *)djb2_state_load(buf + offset, buflen - offset); + case HASH_SDBM: + return (hash_state_t *)sdbm_state_load(buf + offset, buflen - offset); + case HASH_FNV: + return (hash_state_t *)fnv_state_load(buf + offset, buflen - offset); + default: + return NULL; + } + return NULL; +} +void hash_state_destroy(hash_state_t *state) +{ + switch (state->hashfunc) + { + case HASH_JENKINS: + jenkins_state_destroy((jenkins_state_t *)state); + break; + case HASH_DJB2: + djb2_state_destroy((djb2_state_t *)state); + break; + case HASH_SDBM: + sdbm_state_destroy((sdbm_state_t *)state); + break; + case HASH_FNV: + fnv_state_destroy((fnv_state_t *)state); + break; + default: + assert(0); + } + return; +} diff --git a/src/hash.h b/src/hash.h new file mode 100644 index 0000000..90b8e37 --- /dev/null +++ b/src/hash.h @@ -0,0 +1,14 @@ +#ifndef __CMPH_HASH_H__ +#define __CMPH_HASH_H__ + +#include "cmph_types.h" + +typedef union __hash_state_t hash_state_t; + +hash_state_t *hash_state_new(CMPH_HASH, uint32 hashsize); +uint32 hash(hash_state_t *state, const char *key, uint32 keylen); +void hash_state_dump(hash_state_t *state, char **buf, uint32 
*buflen); +hash_state_t *hash_state_load(const char *buf, uint32 buflen); +void hash_state_destroy(hash_state_t *state); + +#endif diff --git a/src/hash_state.h b/src/hash_state.h new file mode 100644 index 0000000..67dcd77 --- /dev/null +++ b/src/hash_state.h @@ -0,0 +1,18 @@ +#ifndef __HASH_STATE_H__ +#define __HASH_STATE_H__ + +#include "hash.h" +#include "jenkins_hash.h" +#include "djb2_hash.h" +#include "sdbm_hash.h" +#include "fnv_hash.h" +union __hash_state_t +{ + CMPH_HASH hashfunc; + jenkins_state_t jenkins; + djb2_state_t djb2; + sdbm_state_t sdbm; + fnv_state_t fnv; +}; + +#endif diff --git a/src/jenkins_hash.c b/src/jenkins_hash.c new file mode 100644 index 0000000..65ab7f3 --- /dev/null +++ b/src/jenkins_hash.c @@ -0,0 +1,191 @@ +#include "jenkins_hash.h" +#include +#include +#include +#include +#include + +//#define DEBUG +#include "debug.h" + +#define hashsize(n) ((uint32)1<<(n)) +#define hashmask(n) (hashsize(n)-1) + + + +//#define NM2 /* Define this if you do not want power of 2 table sizes*/ + + +/* + -------------------------------------------------------------------- + mix -- mix 3 32-bit values reversibly. + For every delta with one or two bits set, and the deltas of all three + high bits or all three low bits, whether the original value of a,b,c + is almost all zero or is uniformly distributed, + * If mix() is run forward or backward, at least 32 bits in a,b,c + have at least 1/4 probability of changing. + * If mix() is run forward, every bit of c will change between 1/3 and + 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.) + mix() was built out of 36 single-cycle latency instructions in a + structure that could supported 2x parallelism, like so: + a -= b; + a -= c; x = (c>>13); + b -= c; a ^= x; + b -= a; x = (a<<8); + c -= a; b ^= x; + c -= b; x = (b>>13); + ... + Unfortunately, superscalar Pentiums and Sparcs can't take advantage + of that parallelism. 
They've also turned some of those single-cycle + latency instructions into multi-cycle latency instructions. Still, + this is the fastest good hash I could find. There were about 2^^68 + to choose from. I only looked at a billion or so. + -------------------------------------------------------------------- + */ +#define mix(a,b,c) \ +{ \ + a -= b; a -= c; a ^= (c>>13); \ + b -= c; b -= a; b ^= (a<<8); \ + c -= a; c -= b; c ^= (b>>13); \ + a -= b; a -= c; a ^= (c>>12); \ + b -= c; b -= a; b ^= (a<<16); \ + c -= a; c -= b; c ^= (b>>5); \ + a -= b; a -= c; a ^= (c>>3); \ + b -= c; b -= a; b ^= (a<<10); \ + c -= a; c -= b; c ^= (b>>15); \ +} + +/* + -------------------------------------------------------------------- + hash() -- hash a variable-length key into a 32-bit value +k : the key (the unaligned variable-length array of bytes) +len : the length of the key, counting by bytes +initval : can be any 4-byte value +Returns a 32-bit value. Every bit of the key affects every bit of +the return value. Every 1-bit and 2-bit delta achieves avalanche. +About 6*len+35 instructions. + +The best hash table sizes are powers of 2. There is no need to do +mod a prime (mod is sooo slow!). If you need less than 32 bits, +use a bitmask. For example, if you need only 10 bits, do +h = (h & hashmask(10)); +In which case, the hash table should have hashsize(10) elements. 
+ +If you are hashing n strings (uint8 **)k, do it like this: +for (i=0, h=0; iseed = rand() % size; + state->nbits = ceil(log(size)/M_LOG2E); + state->size = size; + DEBUGP("Initialized jenkins with size %u, nbits %u and seed %u\n", size, state->nbits, state->seed); + return state; +} +void jenkins_state_destroy(jenkins_state_t *state) +{ + free(state); +} + +uint32 jenkins_hash(jenkins_state_t *state, const char *k, uint32 keylen) +{ + uint32 a, b, c; + uint32 len, length; + + /* Set up the internal state */ + length = keylen; + len = length; + a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ + c = state->seed; /* the previous hash value - seed in our case */ + + /*---------------------------------------- handle most of the key */ + while (len >= 12) + { + a += (k[0] +((uint32)k[1]<<8) +((uint32)k[2]<<16) +((uint32)k[3]<<24)); + b += (k[4] +((uint32)k[5]<<8) +((uint32)k[6]<<16) +((uint32)k[7]<<24)); + c += (k[8] +((uint32)k[9]<<8) +((uint32)k[10]<<16)+((uint32)k[11]<<24)); + mix(a,b,c); + k += 12; len -= 12; + } + + /*------------------------------------- handle the last 11 bytes */ + c += length; + switch(len) /* all the case statements fall through */ + { + case 11: + c +=((uint32)k[10]<<24); + case 10: + c +=((uint32)k[9]<<16); + case 9 : + c +=((uint32)k[8]<<8); + /* the first byte of c is reserved for the length */ + case 8 : + b +=((uint32)k[7]<<24); + case 7 : + b +=((uint32)k[6]<<16); + case 6 : + b +=((uint32)k[5]<<8); + case 5 : + b +=k[4]; + case 4 : + a +=((uint32)k[3]<<24); + case 3 : + a +=((uint32)k[2]<<16); + case 2 : + a +=((uint32)k[1]<<8); + case 1 : + a +=k[0]; + /* case 0: nothing left to add */ + } + + mix(a,b,c); + + /*-------------------------------------------- report the result */ + + //c = (c & hashmask(state->size)); + //c = (c >= state->size) ? 
c ^ state->size: c; + + //state->last_hash = c; Do not update last_hash because we use a fixed + //seed + return c; +} + +void jenkins_state_dump(jenkins_state_t *state, char **buf, uint32 *buflen) +{ + uint32 nseed = htonl(state->seed); + uint32 nnbits = htonl(state->nbits); + uint32 nsize = htonl(state->size); + *buflen = sizeof(uint32)*3; + *buf = malloc(*buflen); + if (!*buf) + { + *buflen = UINT_MAX; + return; + } + memcpy(*buf, &nseed, sizeof(uint32)); + memcpy(*buf + sizeof(uint32), &nnbits, sizeof(uint32)); + memcpy(*buf + sizeof(uint32)*2, &nsize, sizeof(uint32)); + DEBUGP("Dumped jenkins state with seed %u\n", state->seed); + + return; +} +jenkins_state_t *jenkins_state_load(const char *buf, uint32 buflen) +{ + jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); + state->seed = ntohl(*(uint32 *)buf); + state->nbits = ntohl(*(((uint32 *)buf) + 1)); + state->size = ntohl(*(((uint32 *)buf) + 2)); + state->hashfunc = HASH_JENKINS; + DEBUGP("Loaded jenkins state with seed %u\n", state->seed); + return state; +} diff --git a/src/jenkins_hash.h b/src/jenkins_hash.h new file mode 100644 index 0000000..1a84080 --- /dev/null +++ b/src/jenkins_hash.h @@ -0,0 +1,20 @@ +#ifndef __JEKINS_HASH_H__ +#define __JEKINS_HASH_H__ + +#include "hash.h" + +typedef struct __jenkins_state_t +{ + CMPH_HASH hashfunc; + uint32 seed; + uint32 nbits; + uint32 size; +} jenkins_state_t; + +jenkins_state_t *jenkins_state_new(uint32 size); //size of hash table +uint32 jenkins_hash(jenkins_state_t *state, const char *k, uint32 keylen); +void jenkins_state_dump(jenkins_state_t *state, char **buf, uint32 *buflen); +jenkins_state_t *jenkins_state_load(const char *buf, uint32 buflen); +void jenkins_state_destroy(jenkins_state_t *state); + +#endif diff --git a/src/main.c b/src/main.c new file mode 100644 index 0000000..bab4c97 --- /dev/null +++ b/src/main.c @@ -0,0 +1,282 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include 
"cmph.h" +#include "hash.h" +#include "../config.h" + +void usage(const char *prg) +{ + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-g [-s seed] ] [-m file.mph] [-a algorithm] keysfile\n", prg); +} +void usage_long(const char *prg) +{ + uint32 i; + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-g [-s seed] ] [-m file.mph] [-a algorithm] keysfile\n", prg); + fprintf(stderr, "Minimum perfect hashing tool\n\n"); + fprintf(stderr, " -h\t print this help message\n"); + fprintf(stderr, " -a\t algorithm - valid values are\n"); + for (i = 0; i < MPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", mph_names[i]); + fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n"); + for (i = 0; i < HASH_COUNT; ++i) fprintf(stderr, " \t * %s\n", hash_names[i]); + fprintf(stderr, " -V\t print version number and exit\n"); + fprintf(stderr, " -v\t increase verbosity (may be used multiple times)\n"); + fprintf(stderr, " -g\t generation mode\n"); + fprintf(stderr, " -s\t random seed\n"); + fprintf(stderr, " -m\t minimum perfect hash function file \n"); + fprintf(stderr, " keysfile\t line separated file with keys\n"); +} + +static int key_read(void *data, char **key, uint32 *keylen) +{ + FILE *fd = (FILE *)data; + *key = NULL; + *keylen = 0; + while(1) + { + char buf[BUFSIZ]; + char *c = fgets(buf, BUFSIZ, fd); + if (c == NULL) return -1; + if (feof(fd)) return -1; + *key = (char *)realloc(*key, *keylen + strlen(buf) + 1); + memcpy(*key + *keylen, buf, strlen(buf)); + *keylen += strlen(buf); + if (buf[strlen(buf) - 1] != '\n') continue; + break; + } + if ((*keylen) && (*key)[*keylen - 1] == '\n') + { + (*key)[(*keylen) - 1] = 0; + --(*keylen); + } + return *keylen; +} + +static void key_dispose(void *data, char *key, uint32 keylen) +{ + free(key); +} +static void key_rewind(void *data) +{ + FILE *fd = (FILE *)data; + rewind(fd); +} + +static uint32 count_keys(FILE *fd) +{ + uint32 count = 0; + rewind(fd); + while(1) + { + char buf[BUFSIZ]; + char *c = fgets(buf, 
BUFSIZ, fd); + if (feof(fd)) break; + if (buf[strlen(buf) - 1] != '\n') continue; + ++count; + } + rewind(fd); + return count; +} + +int main(int argc, char **argv) +{ + char verbosity = 0; + char generate = 0; + char *mphf_file = NULL; + FILE *mphf_fd = stdout; + const char *keys_file = NULL; + FILE *keys_fd; + uint32 seed = UINT_MAX; + CMPH_HASH *hashes = NULL; + uint32 nhashes = 0; + uint32 i; + MPH_ALGO mph_algo = MPH_CZECH; + mph_t *mph = NULL; + mphf_t *mphf = NULL; + + key_source_t source; + + while (1) + { + char c = getopt(argc, argv, "hVva:f:gm:s:"); + if (c == -1) break; + switch (c) + { + case 's': + { + char *cptr; + seed = strtoul(optarg, &cptr, 10); + if(*cptr != 0) { + fprintf(stderr, "Invalid seed %s\n", optarg); + exit(1); + } + } + break; + case 'g': + generate = 1; + break; + case 'm': + mphf_file = strdup(optarg); + break; + case 'v': + ++verbosity; + break; + case 'V': + printf("%s\n", VERSION); + return 0; + case 'h': + usage_long(argv[0]); + return 0; + case 'a': + { + char valid = 0; + for (i = 0; i < MPH_COUNT; ++i) + { + if (strcmp(mph_names[i], optarg) == 0) + { + mph_algo = i; + valid = 1; + break; + } + } + if (!valid) + { + fprintf(stderr, "Invalid mph algorithm: %s\n", optarg); + return -1; + } + } + break; + case 'f': + { + char valid = 0; + for (i = 0; i < HASH_COUNT; ++i) + { + if (strcmp(hash_names[i], optarg) == 0) + { + hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 )); + hashes[nhashes] = i; + hashes[nhashes + 1] = HASH_COUNT; + ++nhashes; + valid = 1; + break; + } + } + if (!valid) + { + fprintf(stderr, "Invalid hash function: %s\n", optarg); + return -1; + } + } + break; + default: + usage(argv[0]); + return 1; + } + } + + if (optind != argc - 1) + { + usage(argv[0]); + return 1; + } + keys_file = argv[optind]; + if (seed == UINT_MAX) seed = time(NULL); + srand(seed); + + if (mphf_file == NULL) + { + mphf_file = (char *)malloc(strlen(keys_file) + 5); + memcpy(mphf_file, keys_file, strlen(keys_file)); 
+ memcpy(mphf_file + strlen(keys_file), ".mph\0", 5); + } + + keys_fd = fopen(keys_file, "r"); + if (keys_fd == NULL) + { + fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno)); + return -1; + } + + source.data = (void *)keys_fd; + source.nkeys = count_keys(keys_fd); + source.read = key_read; + source.dispose = key_dispose; + source.rewind = key_rewind; + + if (generate) + { + //Create mphf + + mph = mph_new(mph_algo, &source); + if (nhashes) mph_set_hashfuncs(mph, hashes); + mph_set_verbosity(mph, verbosity); + mphf = mph_create(mph); + + if (mphf == NULL) + { + fprintf(stderr, "Unable to create minimum perfect hashing function\n"); + mph_destroy(mph); + free(mphf_file); + return -1; + } + + mphf_fd = fopen(mphf_file, "w"); + if (mphf_fd == NULL) + { + fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno)); + free(mphf_file); + return -1; + } + mphf_dump(mphf, mphf_fd); + mphf_destroy(mphf); + fclose(mphf_fd); + } + else + { + uint8 * hashtable = NULL; + mphf_fd = fopen(mphf_file, "r"); + if (mphf_fd == NULL) + { + fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno)); + free(mphf_file); + return -1; + } + mphf = mphf_load(mphf_fd); + fclose(mphf_fd); + if (!mphf) + { + fprintf(stderr, "Unable to parser input file %s\n", mphf_file); + free(mphf_file); + return -1; + } + hashtable = (uint8*)malloc(source.nkeys*sizeof(uint8)); + memset(hashtable, 0, source.nkeys); + //check all keys + for (i = 0; i < source.nkeys; ++i) + { + uint32 h; + char *buf; + uint32 buflen = 0; + source.read(source.data, &buf, &buflen); + h = mphf_search(mphf, buf, buflen); + if(hashtable[h])fprintf(stderr, "collision: %u\n",h); + assert(hashtable[h]==0); + hashtable[h] = 1; + if (verbosity) + { + printf("%s -> %u\n", buf, h); + } + source.dispose(source.data, buf, buflen); + } + mphf_destroy(mphf); + free(hashtable); + } + fclose(keys_fd); + free(mphf_file); + return 0; +} diff --git a/src/sdbm_hash.c 
b/src/sdbm_hash.c new file mode 100644 index 0000000..391d912 --- /dev/null +++ b/src/sdbm_hash.c @@ -0,0 +1,42 @@ +#include "sdbm_hash.h" +#include + +sdbm_state_t *sdbm_state_new() +{ + sdbm_state_t *state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t)); + state->hashfunc = HASH_SDBM; + return state; +} + +void sdbm_state_destroy(sdbm_state_t *state) +{ + free(state); +} + +uint32 sdbm_hash(sdbm_state_t *state, const char *k, uint32 keylen) +{ + register unsigned int hash = 0; + const unsigned char *ptr = k; + int i = 0; + + while(i < keylen) { + hash = *ptr + (hash << 6) + (hash << 16) - hash; + ++ptr, ++i; + } + return hash; +} + + +void sdbm_state_dump(sdbm_state_t *state, char **buf, uint32 *buflen) +{ + *buf = NULL; + *buflen = 0; + return; +} + +sdbm_state_t *sdbm_state_load(const char *buf, uint32 buflen) +{ + sdbm_state_t *state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t)); + state->hashfunc = HASH_SDBM; + return state; +} diff --git a/src/sdbm_hash.h b/src/sdbm_hash.h new file mode 100644 index 0000000..9e27f0f --- /dev/null +++ b/src/sdbm_hash.h @@ -0,0 +1,17 @@ +#ifndef __SDBM_HASH_H__ +#define __SDBM_HASH_H__ + +#include "hash.h" + +typedef struct __sdbm_state_t +{ + CMPH_HASH hashfunc; +} sdbm_state_t; + +sdbm_state_t *sdbm_state_new(); +uint32 sdbm_hash(sdbm_state_t *state, const char *k, uint32 keylen); +void sdbm_state_dump(sdbm_state_t *state, char **buf, uint32 *buflen); +sdbm_state_t *sdbm_state_load(const char *buf, uint32 buflen); +void sdbm_state_destroy(sdbm_state_t *state); + +#endif diff --git a/src/vqueue.c b/src/vqueue.c new file mode 100644 index 0000000..af07e0a --- /dev/null +++ b/src/vqueue.c @@ -0,0 +1,49 @@ +#include "vqueue.h" +#include +#include +struct __vqueue_t +{ + uint32 * values; + uint32 beg, end, capacity; +}; + +vqueue_t * vqueue_new(uint32 capacity) +{ + vqueue_t *q = (vqueue_t *)malloc(sizeof(vqueue_t)); + assert(q); + q->values = (uint32 *)calloc(capacity+1, sizeof(uint32)); + q->beg = q->end = 0; + q->capacity = 
capacity+1; + return q; +} + +uint8 vqueue_is_empty(vqueue_t * q) +{ + return (q->beg == q->end); +} + +void vqueue_insert(vqueue_t * q, uint32 val) +{ + assert((q->end + 1)%q->capacity != q->beg); // Is queue full? + q->end = (q->end + 1)%q->capacity; + q->values[q->end] = val; +} + +uint32 vqueue_remove(vqueue_t * q) +{ + assert(!vqueue_is_empty(q)); // Is queue empty? + q->beg = (q->beg + 1)%q->capacity; + return q->values[q->beg]; +} + +void vqueue_print(vqueue_t * q) +{ + uint32 i; + for (i = q->beg; i != q->end; i = (i + 1)%q->capacity) + fprintf(stderr, "%u\n", q->values[(i + 1)%q->capacity]); +} + +void vqueue_destroy(vqueue_t *q) +{ + free(q->values); q->values = NULL; +} diff --git a/src/vqueue.h b/src/vqueue.h new file mode 100644 index 0000000..cd870bb --- /dev/null +++ b/src/vqueue.h @@ -0,0 +1,18 @@ +#ifndef __CMPH_VQUEUE_H__ +#define __CMPH_VQUEUE_H__ + +#include "cmph_types.h" +typedef struct __vqueue_t vqueue_t; + +vqueue_t * vqueue_new(uint32 capacity); + +uint8 vqueue_is_empty(vqueue_t * q); + +void vqueue_insert(vqueue_t * q, uint32 val); + +uint32 vqueue_remove(vqueue_t * q); + +void vqueue_print(vqueue_t * q); + +void vqueue_destroy(vqueue_t * q); +#endif diff --git a/src/vstack.c b/src/vstack.c new file mode 100644 index 0000000..67b2945 --- /dev/null +++ b/src/vstack.c @@ -0,0 +1,79 @@ +#include "vstack.h" + +#include +#include + +//#define DEBUG +#include "debug.h" + +struct __vstack_t +{ + uint32 pointer; + uint32 *values; + uint32 capacity; +}; + +vstack_t *vstack_new() +{ + vstack_t *stack = (vstack_t *)malloc(sizeof(vstack_t)); + assert(stack); + stack->pointer = 0; + stack->values = NULL; + stack->capacity = 0; + return stack; +} + +void vstack_destroy(vstack_t *stack) +{ + assert(stack); + free(stack->values); + free(stack); +} + +void vstack_push(vstack_t *stack, uint32 val) +{ + assert(stack); + vstack_reserve(stack, stack->pointer + 1); + stack->values[stack->pointer] = val; + ++(stack->pointer); +} +void vstack_pop(vstack_t 
*stack) +{ + assert(stack); + assert(stack->pointer > 0); + --(stack->pointer); +} + +uint32 vstack_top(vstack_t *stack) +{ + assert(stack); + assert(stack->pointer > 0); + return stack->values[(stack->pointer - 1)]; +} +int vstack_empty(vstack_t *stack) +{ + assert(stack); + return stack->pointer == 0; +} +uint32 vstack_size(vstack_t *stack) +{ + return stack->pointer; +} +void vstack_reserve(vstack_t *stack, uint32 size) +{ + assert(stack); + if (stack->capacity < size) + { + uint32 new_capacity = stack->capacity + 1; + DEBUGP("Increasing current capacity %u to %u\n", stack->capacity, size); + while (new_capacity < size) + { + new_capacity *= 2; + } + stack->values = (uint32 *)realloc(stack->values, sizeof(uint32)*new_capacity); + assert(stack->values); + stack->capacity = new_capacity; + DEBUGP("Increased\n"); + } +} + diff --git a/src/vstack.h b/src/vstack.h new file mode 100644 index 0000000..85893c4 --- /dev/null +++ b/src/vstack.h @@ -0,0 +1,18 @@ +#ifndef __CMPH_VSTACK_H__ +#define __CMPH_VSTACK_H__ + +#include "cmph_types.h" +typedef struct __vstack_t vstack_t; + +vstack_t *vstack_new(); +void vstack_destroy(vstack_t *stack); + +void vstack_push(vstack_t *stack, uint32 val); +uint32 vstack_top(vstack_t *stack); +void vstack_pop(vstack_t *stack); +int vstack_empty(vstack_t *stack); +uint32 vstack_size(vstack_t *stack); + +void vstack_reserve(vstack_t *stack, uint32 size); + +#endif diff --git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 0000000..50cfcd6 --- /dev/null +++ b/tests/Makefile.am @@ -0,0 +1,4 @@ +noinst_PROGRAMS = graph_tests + +graph_tests_SOURCES = graph_tests.c +graph_tests_LDADD = ../src/libcmph.la diff --git a/tests/graph_tests.c b/tests/graph_tests.c new file mode 100644 index 0000000..35476aa --- /dev/null +++ b/tests/graph_tests.c @@ -0,0 +1,67 @@ +#include "../src/graph.h" + +#define DEBUG +#include "../src/debug.h" + +int main(int argc, char **argv) +{ + graph_iterator_t it; + uint32 i, neighbor; + graph_t *g = 
graph_new(5, 10); + + fprintf(stderr, "Building random graph\n"); + for (i = 0; i < 10; ++i) + { + uint32 v1 = i % 5; + uint32 v2 = (i*2) % 5; + if (v1 == v2) continue; + graph_add_edge(g, v1, v2); + DEBUGP("Added edge %u %u\n", v1, v2); + } + graph_print(g); + graph_del_edge(g, 4, 3); + graph_print(g); + graph_clear_edges(g); + graph_print(g); + graph_destroy(g); + + fprintf(stderr, "Building cyclic graph\n"); + g = graph_new(4, 5); + graph_add_edge(g, 0, 3); + graph_add_edge(g, 0, 1); + graph_add_edge(g, 1, 2); + graph_add_edge(g, 2, 0); + if (!graph_is_cyclic(g)) + { + return 1; + } + graph_destroy(g); + + fprintf(stderr, "Building non-cyclic graph\n"); + g = graph_new(5, 4); + graph_add_edge(g, 0, 1); + graph_add_edge(g, 1, 2); + graph_add_edge(g, 2, 3); + graph_add_edge(g, 3, 4); + + if (graph_is_cyclic(g)) + { + return 1; + } + + fprintf(stderr, "Checking neighbors iterator\n"); + it = graph_neighbors_it(g, 1); + neighbor = graph_next_neighbor(g, &it); + DEBUGP("Neighbor is %u\n", neighbor); + if (neighbor != 0 && neighbor != 2) return 1; + neighbor = graph_next_neighbor(g, &it); + DEBUGP("Neighbor is %u\n", neighbor); + if (neighbor != 0 && neighbor != 2) return 1; + neighbor = graph_next_neighbor(g, &it); + DEBUGP("Neighbor is %u\n", neighbor); + if (neighbor != GRAPH_NO_NEIGHBOR) return 1; + + + graph_destroy(g); + return 0; +}