114 lines
7.7 KiB
TeX
114 lines
7.7 KiB
TeX
|
% Nivio: 29/jan/06
|
||
|
% Time-stamp: <Sunday 29 Jan 2006 11:58:28pm EST yoshi@flare>
|
||
|
\vspace{-2mm}
|
||
|
\subsection{Controlling disk accesses}
|
||
|
\label{sec:contr-disk-access}
|
||
|
|
||
|
In order to bring down the number of seek operations on disk
|
||
|
we benefit from the fact that our algorithm leaves almost all main
|
||
|
memory available to be used as disk I/O buffer.
|
||
|
In this section we evaluate how much the parameter $\mu$
|
||
|
affects the runtime of our algorithm.
|
||
|
For that we fixed $n$ at 1 billion URLs,
|
||
|
set the main memory of the machine used for the experiments
|
||
|
to 1 gigabyte and used $\mu$ equal to $100, 200, 300, 400, 500$ and $600$
|
||
|
megabytes.
|
||
|
|
||
|
\enlargethispage{2\baselineskip}
|
||
|
Table~\ref{tab:diskaccess} presents the number of files $N$,
|
||
|
the buffer size used for all files, the number of seeks in the worst case considering
|
||
|
the pessimistic assumption mentioned in Section~\ref{sec:linearcomplexity}, and
|
||
|
the time to generate an MPHF for 1 billion keys as a function of the amount of internal
|
||
|
memory available. Observing Table~\ref{tab:diskaccess}, we notice that the time spent in the construction
|
||
|
decreases as the value of $\mu$ increases. However, for $\mu > 400$, the variation
|
||
|
in the time is not as significant as for $\mu \leq 400$.
|
||
|
This can be explained by the fact that the I/O scheduler of the Linux 2.6 kernel
|
||
|
has smart policies
|
||
|
for avoiding seeks and diminishing the average seek time
|
||
|
(see \texttt{http://www.linuxjournal.com/article/6931}).
|
||
|
\begin{table*}[ht]
|
||
|
\vspace{-2mm}
|
||
|
\begin{center}
|
||
|
{\scriptsize
|
||
|
\begin{tabular}{|l|c|c|c|c|c|c|}
|
||
|
\hline
|
||
|
$\mu$ (MB) & $100$ & $200$ & $300$ & $400$ & $500$ & $600$ \\
|
||
|
\hline
|
||
|
$N$ (files) & $619$ & $310$ & $207$ & $155$ & $124$ & $104$ \\
|
||
|
%\hline
|
||
|
\textbaht~(buffer size in KB) & $165$ & $661$ & $1{,}484$ & $2{,}643$ & $4{,}129$ & $5{,}908$ \\
|
||
|
%\hline
|
||
|
$\beta$/\textbaht~(\# of seeks in the worst case) & $384{,}478$ & $95{,}974$ & $42{,}749$ & $24{,}003$ & $15{,}365$ & $10{,}738$ \\
|
||
|
% \hline
|
||
|
% \raisebox{-0.2em}{\# of seeks performed in} & \raisebox{-0.7em}{$383,056$} & \raisebox{-0.7em}{$95,919$} & \raisebox{-0.7em}{$42,700$} & \raisebox{-0.7em}{$23,980$} & \raisebox{-0.7em}{$15,347$} & \raisebox{-0.7em}{$xx,xxx$} \\
|
||
|
% \raisebox{0.2em}{statement 1.3 of Figure~\ref{fig:readingbucket}} & & & & & & \\
|
||
|
% \hline
|
||
|
Time (hours) & $4.04$ & $3.64$ & $3.34$ & $3.20$ & $3.13$ & $3.09$ \\
|
||
|
\hline
|
||
|
\end{tabular}
|
||
|
\vspace{-1mm}
|
||
|
}
|
||
|
\end{center}
|
||
|
\caption{Influence of the internal memory area size ($\mu$) on the runtime of our algorithm.}
|
||
|
\label{tab:diskaccess}
|
||
|
\vspace{-14mm}
|
||
|
\end{table*}
|
||
|
|
||
|
|
||
|
|
||
|
% \begin{table*}[ht]
|
||
|
% \begin{center}
|
||
|
% {\scriptsize
|
||
|
% \begin{tabular}{|l|c|c|c|c|c|c|c|c|c|c|c|}
|
||
|
% \hline
|
||
|
% $\mu$ (MB) & $100$ & $150$ & $200$ & $250$ & $300$ & $350$ & $400$ & $450$ & $500$ & $550$ & $600$ \\
|
||
|
% \hline
|
||
|
% $N$ (files) & $619$ & $413$ & $310$ & $248$ & $207$ & $177$ & $155$ & $138$ & $124$ & $113$ & $103$ \\
|
||
|
% \hline
|
||
|
% \textbaht~(buffer size in KB) & $165$ & $372$ & $661$ & $1,033$ & $1,484$ & $2,025$ & $2,643$ & $3,339$ & & & \\
|
||
|
% \hline
|
||
|
% \# of seeks (Worst case) & $384,478$ & $170,535$ & $95,974$ & $61,413$ & $42,749$ & $31,328$ & $24,003$ & $19,000$ & & & \\
|
||
|
% \hline
|
||
|
% \raisebox{-0.2em}{\# of seeks performed in} & \raisebox{-0.7em}{$383,056$} & \raisebox{-0.7em}{$170,385$} & \raisebox{-0.7em}{$95,919$} & \raisebox{-0.7em}{$61,388$} & \raisebox{-0.7em}{$42,700$} & \raisebox{-0.7em}{$31,296$} & \raisebox{-0.7em}{$23,980$} & \raisebox{-0.7em}{$18,978$} & \raisebox{-0.7em}{$xx,xxx$} & \raisebox{-0.7em}{$xx,xxx$} & \raisebox{-0.7em}{$xx,xxx$} \\
|
||
|
% \raisebox{0.2em}{statement 1.3 of Figure~\ref{fig:readingbucket}} & & & & & & & & & & & \\
|
||
|
% \hline
|
||
|
% Time (hours) & $4.04$ & $3.93$ & $3.64$ & $3.46$ & $3.34$ & $3.26$ & $3.20$ & $3.13$ & & & \\
|
||
|
% \hline
|
||
|
% \end{tabular}
|
||
|
% }
|
||
|
% \end{center}
|
||
|
% \caption{Influence of the internal memory area size ($\mu$) in our algorithm runtime.}
|
||
|
% \label{tab:diskaccess}
|
||
|
% \end{table*}
|
||
|
|
||
|
|
||
|
|
||
|
% \begin{table*}[htb]
|
||
|
% \begin{center}
|
||
|
% {\scriptsize
|
||
|
% \begin{tabular}{|l|c|c|c|c|c|}
|
||
|
% \hline
|
||
|
% $n$ (millions) & 1 & 2 & 4 & 8 & 16 \\
|
||
|
% \hline % Part. 16 \% 16 \% 16 \% 18 \% 20\%
|
||
|
% Average time (s) & $14.124 \pm 0.128$ & $28.301 \pm 0.140$ & $56.807 \pm 0.312$ & $117.286 \pm 0.997$ & $241.086 \pm 0.936$ \\
|
||
|
% SD & $0.179$ & $0.196$ & $0.437$ & $1.394$ & $1.308$ \\
|
||
|
% \hline
|
||
|
% \hline
|
||
|
% $n$ (millions) & 32 & 64 & 128 & 512 & 1000 \\
|
||
|
% \hline % Part. 20 \% 20\% 20\% 18\% 18\%
|
||
|
% Average time (s) & $492.430 \pm 1.565$ & $1006.307 \pm 1.425$ & $2081.208 \pm 0.740$ & $9253.188 \pm 4.406$ & $19021.480 \pm 13.850$ \\
|
||
|
% SD & $2.188$ & $1.992$ & $1.035$ & $ 6.160$ & $18.016$ \\
|
||
|
% \hline
|
||
|
|
||
|
% \end{tabular}
|
||
|
% }
|
||
|
% \end{center}
|
||
|
% \caption{The runtime averages in seconds,
|
||
|
% the standard deviation (SD), and
|
||
|
% the confidence intervals given by the average time $\pm$
|
||
|
% the distance from average time considering
|
||
|
% a confidence level of $95\%$.
|
||
|
% }
|
||
|
% \label{tab:mediasbrz}
|
||
|
% \end{table*}
|