diff --git a/mwd_gpu/figs/amdNormTimeCost_full.png b/mwd_gpu/figs/amdNormTimeCost_full.png
new file mode 100644
index 0000000..774d3f3
Binary files /dev/null and b/mwd_gpu/figs/amdNormTimeCost_full.png differ
diff --git a/mwd_gpu/figs/amdTimeCost_full.png b/mwd_gpu/figs/amdTimeCost_full.png
new file mode 100644
index 0000000..12fd7ef
Binary files /dev/null and b/mwd_gpu/figs/amdTimeCost_full.png differ
diff --git a/mwd_gpu/figs/mwdInputOutput.png b/mwd_gpu/figs/mwdInputOutput.png
index 5752955..8729dc3 100644
Binary files a/mwd_gpu/figs/mwdInputOutput.png and b/mwd_gpu/figs/mwdInputOutput.png differ
diff --git a/mwd_gpu/mwdGpu.tex b/mwd_gpu/mwdGpu.tex
index c90d1d3..af26539 100644
--- a/mwd_gpu/mwdGpu.tex
+++ b/mwd_gpu/mwdGpu.tex
@@ -55,38 +55,21 @@ detector.
 \label{sec:set_up}
 \subsection{Hardware}%
 \label{sub:hardware}
-There are two consumer computers used in this study:
+A consumer computer is used in this study:
 \begin{itemize}
-  \item PC 1\@:
-  \begin{itemize}
-    \item CPU\@: AMD Ryzen 5 2400G, running at \SI{3.60}{\giga\hertz},
-      maximum frequency \SI{3.90}{\giga\hertz}
-    \item GPU\@: GeForce GTX 1060, DDR5 memory \SI{6}{GB},
-      maximum frequency \SI{1.70}{\giga\hertz}
-  \end{itemize}
-  \item PC 2\@:
-  \begin{itemize}
-    \item CPU\@: Intel Core i5\hyp{}4590 CPU, running at \SI{3.30}{\giga\hertz},
-      maximum frequency \SI{3.70}{\giga\hertz}
-    \item GPU\@: GeForce GTX 1650, DDR5 memory \SI{4}{GB},
-      maximum frequency \SI{1.70}{\giga\hertz}
-  \end{itemize}
+  \item CPU\@: AMD Ryzen 5 2400G, running at \SI{3.60}{\giga\hertz},
+    maximum frequency \SI{3.90}{\giga\hertz}
+  \item RAM\@: \SI{16}{GB} DDR4 3000
+  \item GPU\@: GeForce GTX 1060, GDDR5 memory \SI{6}{GB},
+    maximum frequency \SI{1.70}{\giga\hertz}
 \end{itemize}
 
 \subsection{Software}%
 \label{sub:software}
-The computers run two different versions of Linux:
+The OS is Debian 10.2, with the following compilers:
 \begin{itemize}
-  \item PC 1\@: CentOS 7.2
-    \begin{itemize}
-      \item gcc
-      \item CUDA
-    \end{itemize}
-  \item PC 2\@: Debian 10.2
-    \begin{itemize}
-      \item gcc (Debian 8.3.0\hyp{}6) 8.3.0
-      \item nvcc V9.2.148, CUDA 10.1, driver version 418.74
-    \end{itemize}
+  \item gcc (Debian 8.3.0\hyp{}6) 8.3.0
+  \item nvcc V9.2.148, CUDA 10.1, driver version 418.74
 \end{itemize}
 
 \section{Implementations}%
@@ -104,7 +87,8 @@ There are two implementations of the MWD algorithm:
 This implementation uses raw array wrapped in a \codeword{struct} to represent
 waveforms, pointers are managed manually. There are 3 methods
 \codeword{Deconvolute}, \codeword{OffsetDifferentiate}, and
-\codeword{MovingAverage} corresponds to 3 stages of the MWD algorithm.
+\codeword{MovingAverage} that correspond to the 3 stages of the MWD algorithm.
+The code is run on a single thread.
 
 The related files are:
 
@@ -182,12 +166,44 @@ Related files are:
   ]
 \end{forest}
 
+\subsection{Benchmarking}%
+\label{sub:benchmarking}
+Several tests are done with different waveform lengths from
+\numrange{246}{250000} to compare the performance of these two implementations.
+When a waveform length less than that of the original input is requested,
+successive sub\hyp{}waveforms from the original are fed into the algorithms. At
+each waveform length, the calculations are repeated from \numrange{1000}{3000}
+times, and the time costs of I/O and of each MWD stage are recorded.
+
 \section{Results}%
 \label{sec:results}
+Average time costs of the two implementations as functions of waveform length are
+shown in \cref{fig:amdTimeCost_full}. The CPU is much quicker than the GPU when
+the data size is small, but the CPU time grows linearly much faster than the GPU
+time. For waveforms about \num{10000} samples long, the two codes take roughly
+the same time to complete. The GPU is about 30\% faster than the CPU when
+processing \num{250000}-sample waveforms. The growth rates are shown more
+clearly in \cref{fig:amdNormTimeCost_full}: the CPU time cost grows about twice
+as fast as the GPU time cost does.
+
+\begin{figure}[tbp]
+  \centering
+  \includegraphics[width=0.9\linewidth]{figs/amdTimeCost_full.png}
+  \caption{Time costs of C++ (blue) and CUDA (red) codes as functions of
+    waveform length.}% 
+  \label{fig:amdTimeCost_full}
+\end{figure}
+
+\begin{figure}[tbp]
+  \centering
+  \includegraphics[width=0.9\linewidth]{figs/amdNormTimeCost_full.png}
+  \caption{Normalized time costs of C++ (blue) and CUDA (red) codes as
+    functions of waveform length.}% 
+  \label{fig:amdNormTimeCost_full}
+\end{figure}
 
 \section{Code}%
 \label{sec:code}
 A tarball of the code is attached as \codeword{mwd.tar.gz}
 
-
 \end{document}