%%%%%%%%%%%%%%%%%%%%%%% file template.tex %%%%%%%%%%%%%%%%%%%%%%%%%
%
% Main file of the article.  It is based on the template for the
% LaTeX package SVJour2 for the Springer journal "The VLDB Journal"
% (Springer Heidelberg 2004/12/03).
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Example EPS file from the template.  Every line of it is commented
% out, so nothing is extracted; processing starts at \documentclass.
%
%\begin{filecontents*}{figs/minimalperfecthash-ph-mph.ps}
%!PS-Adobe-3.0 EPSF-3.0
%%BoundingBox: 19 19 221 221
%%CreationDate: Mon Sep 29 1997
%%Creator: programmed by hand (JK)
%%EndComments
%gsave
%newpath
%  20 20 moveto
%  20 220 lineto
%  220 220 lineto
%  220 20 lineto
%closepath
%2 setlinewidth
%gsave
%  .4 setgray fill
%grestore
%stroke
%grestore
%\end{filecontents*}
%
% Document class: two-column layout, flush-left equations, running heads.
\documentclass[twocolumn,fleqn,runningheads]{svjour2}
%
% Flush-right qed marks, e.g. at the end of a proof.
\smartqed
%
% Packages required by the article.
\usepackage{graphicx}
\usepackage{listings}
% NOTE(review): epsfig is a legacy interface on top of graphicx; it is
% kept because the \input files may still call \epsfig -- confirm
% before removing.
\usepackage{epsfig}
\usepackage{textcomp}
\usepackage[latin1]{inputenc}
\usepackage{amssymb}
% Only .png is tried when \includegraphics gets a name without extension.
\DeclareGraphicsExtensions{.png}
%
% Optional packages left disabled:
% \usepackage{mathptmx}      % use Times fonts if available on your TeX system
%\usepackage{latexsym}
%
% Article-specific definitions.  As the template requests, macros are
% declared with \newcommand* instead of \def, so that an accidental clash
% with an existing command is reported instead of silently overwritten.
%
% Settings shared by all listings in the article (Pascal-style pseudo code
% inside a single-rule frame; `@' escapes to LaTeX, `$...$' to math).
\lstset{
  language=Pascal,
  basicstyle=\fontsize{9}{9}\selectfont,
  captionpos=t,
  aboveskip=1mm,
  belowskip=1mm,
  abovecaptionskip=1mm,
  belowcaptionskip=1mm,
%  numbers = left,
  mathescape=true,
  escapechar=@,
  extendedchars=true,
  showstringspaces=false,
  columns=fixed,
  basewidth=0.515em,
  frame=single,
  framesep=2mm,
  xleftmargin=2mm,
  xrightmargin=2mm,
  framerule=0.5pt
}
%
% Notation macros.  Each one is defined exactly once, here in the preamble.
% The outer brace group in each body is kept so that the macro can be used
% directly as a sub- or superscript (e.g. G_\crit).  {\rm ...} is kept
% rather than \mathrm because the \input files are not visible from here
% and may use these macros in text mode as well.
\newcommand*{\cG}{{\mathcal G}}
\newcommand*{\crit}{{\rm crit}}
\newcommand*{\ncrit}{{\rm ncrit}}
\newcommand*{\scrit}{{\rm scrit}}
\newcommand*{\bedges}{{\rm bedges}}
\newcommand*{\ZZ}{{\mathbb Z}}
\newcommand*{\BSmax}{\mathit{BS}_{\mathit{max}}}
\newcommand*{\Bi}{\mathop{\rm Bi}\nolimits}
%
\journalname{The VLDB Journal}
%
\begin{document}

% The trailing % signs keep stray spaces out of the footnote and the title.
\title{Space and Time Efficient Minimal Perfect Hash \\[0.2cm]
Functions for Very Large Databases\thanks{%
This work was supported in part by
GERINDO Project--grant MCT/CNPq/CT-INFO 552.087/02-5,
CAPES/PROF Scholarship (Fabiano C. Botelho),
FAPESP Proj.\ Tem.\ 03/09925-5 and CNPq Grant 30.0334/93-1
(Yoshiharu Kohayakawa),
and CNPq Grant 30.5237/02-0 (Nivio Ziviani).}%
}
%\subtitle{Do you have a subtitle?\\ If so, write it here}

%\titlerunning{Short form of title}        % if too long for running head

\author{Fabiano C. Botelho \and Davi C. Reis \and Yoshiharu Kohayakawa \and Nivio Ziviani}
%\authorrunning{Short form of author list} % if too long for running head

% Every entry has the form  <names> \at <address> \\ \email{...};
% entries are separated by \and.
\institute{
F. C. Botelho \and N. Ziviani \at
Dept.\ of Computer Science, Federal Univ.\ of Minas Gerais, Belo Horizonte, Brazil\\
\email{\{fbotelho,nivio\}@dcc.ufmg.br}
\and
D. C. Reis \at
Google, Brazil \\
\email{davi.reis@gmail.com}
\and
Y. Kohayakawa \at
Dept.\ of Computer Science, Univ.\ of S\~ao Paulo, S\~ao Paulo, Brazil\\
\email{yoshi@ime.usp.br}
}

\date{Received: date / Accepted: date}
% The correct dates will be entered by the editor

\maketitle

\begin{abstract}
We propose a novel external memory based algorithm for constructing minimal
perfect hash functions~$h$ for huge sets of keys.
For a set of~$n$ keys, our algorithm outputs~$h$ in time~$O(n)$.
The algorithm needs a small vector of one byte entries
in main memory to construct~$h$.
The evaluation of~$h(x)$ requires three memory accesses for any key~$x$.
The description of~$h$ takes a constant number of bits
for each key, which is optimal, i.e., the theoretical lower bound is $1/\ln 2$ bits
per key.
In our experiments, we used a collection of 1 billion URLs collected
from the web, each URL 64 characters long on average.
For this collection, our algorithm
(i) finds a minimal perfect hash function in approximately
3 hours using a commodity PC,
(ii) needs just 5.45 megabytes of internal memory to generate~$h$
and (iii) takes 8.1 bits per key for the description of~$h$.
\keywords{Minimal Perfect Hashing \and Large Databases}
\end{abstract}

% main text
% Sections of the article; commented-out \input lines are sections that
% are currently excluded from the build.
\input{introduction}
%\input{terminology}
\input{relatedwork}
\input{thealgorithm}
\input{partitioningthekeys}
\input{searching}
%\input{computingoffset}
%\input{hashingbuckets}
\input{determiningb}
%\input{analyticalandexperimentalresults}
\input{analyticalresults}
%\input{results}
\input{conclusions}

%\input{acknowledgments}
%\begin{acknowledgements}
%If you'd like to thank anyone, place your comments here
%and remove the percent signs.
%\end{acknowledgements}

% BibTeX users please use
%\bibliographystyle{spmpsci}
%\bibliography{}   % name your BibTeX data base
\bibliographystyle{plain}
\bibliography{references}
\input{appendix}

\end{document}