turbonss/vldb07/diskaccess.tex

114 lines
7.7 KiB
TeX
Raw Normal View History

2006-08-11 20:32:31 +03:00
% Nivio: 29/jan/06
% Time-stamp: <Sunday 29 Jan 2006 11:58:28pm EST yoshi@flare>
\vspace{-2mm}
\subsection{Controlling disk accesses}
\label{sec:contr-disk-access}
In order to bring down the number of seek operations on disk,
we benefit from the fact that our algorithm leaves almost all main
memory available to be used as disk I/O buffer.
In this section we evaluate how much the parameter $\mu$
affects the runtime of our algorithm.
For that we fixed $n$ at 1 billion URLs,
set the main memory of the machine used for the experiments
to 1 gigabyte and used $\mu$ equal to $100, 200, 300, 400, 500$ and $600$
megabytes.
\enlargethispage{2\baselineskip}
Table~\ref{tab:diskaccess} presents the number of files $N$,
the buffer size used for all files, the number of seeks in the worst case considering
the pessimistic assumption mentioned in Section~\ref{sec:linearcomplexity}, and
the time to generate an MPHF for 1 billion keys as a function of the amount of internal
memory available. Observing Table~\ref{tab:diskaccess}, we notice that the time spent in the construction
decreases as the value of $\mu$ increases. However, for $\mu > 400$, the variation
in the time is not as significant as for $\mu \leq 400$.
This can be explained by the fact that the I/O scheduler of the Linux 2.6 kernel
has smart policies
for avoiding seeks and diminishing the average seek time
(see \texttt{http://www.linuxjournal.com/article/6931}).
\begin{table*}[ht]
\vspace{-2mm}
\begin{center}
{\scriptsize
\begin{tabular}{|l|c|c|c|c|c|c|}
\hline
$\mu$ (MB) & $100$ & $200$ & $300$ & $400$ & $500$ & $600$ \\
\hline
$N$ (files) & $619$ & $310$ & $207$ & $155$ & $124$ & $104$ \\
%\hline
\textbaht~(buffer size in KB) & $165$ & $661$ & $1,484$ & $2,643$ & $4,129$ & $5,908$ \\
%\hline
$\beta$/\textbaht~(\# of seeks in the worst case) & $384,478$ & $95,974$ & $42,749$ & $24,003$ & $15,365$ & $10,738$ \\
% \hline
% \raisebox{-0.2em}{\# of seeks performed in} & \raisebox{-0.7em}{$383,056$} & \raisebox{-0.7em}{$95,919$} & \raisebox{-0.7em}{$42,700$} & \raisebox{-0.7em}{$23,980$} & \raisebox{-0.7em}{$15,347$} & \raisebox{-0.7em}{$xx,xxx$} \\
% \raisebox{0.2em}{statement 1.3 of Figure~\ref{fig:readingbucket}} & & & & & & \\
% \hline
Time (hours) & $4.04$ & $3.64$ & $3.34$ & $3.20$ & $3.13$ & $3.09$ \\
\hline
\end{tabular}
\vspace{-1mm}
}
\end{center}
\caption{Influence of the internal memory area size ($\mu$) on the runtime of our algorithm.}
\label{tab:diskaccess}
\vspace{-14mm}
\end{table*}
% \begin{table*}[ht]
% \begin{center}
% {\scriptsize
% \begin{tabular}{|l|c|c|c|c|c|c|c|c|c|c|c|}
% \hline
% $\mu$ (MB) & $100$ & $150$ & $200$ & $250$ & $300$ & $350$ & $400$ & $450$ & $500$ & $550$ & $600$ \\
% \hline
% $N$ (files) & $619$ & $413$ & $310$ & $248$ & $207$ & $177$ & $155$ & $138$ & $124$ & $113$ & $103$ \\
% \hline
% \textbaht~(buffer size in KB) & $165$ & $372$ & $661$ & $1,033$ & $1,484$ & $2,025$ & $2,643$ & $3,339$ & & & \\
% \hline
% \# of seeks (Worst case) & $384,478$ & $170,535$ & $95,974$ & $61,413$ & $42,749$ & $31,328$ & $24,003$ & $19,000$ & & & \\
% \hline
% \raisebox{-0.2em}{\# of seeks performed in} & \raisebox{-0.7em}{$383,056$} & \raisebox{-0.7em}{$170,385$} & \raisebox{-0.7em}{$95,919$} & \raisebox{-0.7em}{$61,388$} & \raisebox{-0.7em}{$42,700$} & \raisebox{-0.7em}{$31,296$} & \raisebox{-0.7em}{$23,980$} & \raisebox{-0.7em}{$18,978$} & \raisebox{-0.7em}{$xx,xxx$} & \raisebox{-0.7em}{$xx,xxx$} & \raisebox{-0.7em}{$xx,xxx$} \\
% \raisebox{0.2em}{statement 1.3 of Figure~\ref{fig:readingbucket}} & & & & & & & & & & & \\
% \hline
% Time (hours) & $4.04$ & $3.93$ & $3.64$ & $3.46$ & $3.34$ & $3.26$ & $3.20$ & $3.13$ & & & \\
% \hline
% \end{tabular}
% }
% \end{center}
% \caption{Influence of the internal memory area size ($\mu$) in our algorithm runtime.}
% \label{tab:diskaccess}
% \end{table*}
% \begin{table*}[htb]
% \begin{center}
% {\scriptsize
% \begin{tabular}{|l|c|c|c|c|c|}
% \hline
% $n$ (millions) & 1 & 2 & 4 & 8 & 16 \\
% \hline % Part. 16 \% 16 \% 16 \% 18 \% 20\%
% Average time (s) & $14.124 \pm 0.128$ & $28.301 \pm 0.140$ & $56.807 \pm 0.312$ & $117.286 \pm 0.997$ & $241.086 \pm 0.936$ \\
% SD & $0.179$ & $0.196$ & $0.437$ & $1.394$ & $1.308$ \\
% \hline
% \hline
% $n$ (millions) & 32 & 64 & 128 & 512 & 1000 \\
% \hline % Part. 20 \% 20\% 20\% 18\% 18\%
% Average time (s) & $492.430 \pm 1.565$ & $1006.307 \pm 1.425$ & $2081.208 \pm 0.740$ & $9253.188 \pm 4.406$ & $19021.480 \pm 13.850$ \\
% SD & $2.188$ & $1.992$ & $1.035$ & $ 6.160$ & $18.016$ \\
% \hline
% \end{tabular}
% }
% \end{center}
% \caption{The runtime averages in seconds,
% the standard deviation (SD), and
% the confidence intervals given by the average time $\pm$
% the distance from average time considering
% a confidence level of $95\%$.
% }
% \label{tab:mediasbrz}
% \end{table*}