\documentclass[11pt]{article}
\usepackage{mhchem}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{textcomp}
\usepackage{epsfig}
\usepackage{hyperref}
\usepackage{hyphenat}
\usepackage[noabbrev, capitalize]{cleveref} % hyperref must be loaded first
\usepackage[detect-weight=true, per=slash, detect-family=true, separate-uncertainty=true]{siunitx}
\usepackage[dvipsnames]{xcolor}
\usepackage{upquote}
\usepackage[framemethod=tikz]{mdframed}
\usepackage{adjustbox}
\usepackage{listings}
\usepackage{xparse}
\NewDocumentCommand{\codeword}{v}{%
  \texttt{\textcolor{blue}{#1}}%
}
\lstset{language=C++,keywordstyle={\bfseries \color{blue}}}
\usepackage{forest}

\begin{document}

\title{Moving window deconvolution implementation using a GPU}
\author{Nam H. Tran \\ Boston University}
\date{\today}
\maketitle

\section{Introduction}%
\label{sec:introduction}

The purpose of this work is to assess the feasibility and performance of using a GPU to process raw waveforms from an HPGe detector. The expected result of the moving window deconvolution (MWD) algorithm is shown in \cref{fig:mwdInputOutput}. The input is a \num{250000}-sample waveform taken at the preamplifier output of the HPGe detector. The MWD algorithm transforms each step in the input waveform into a flat-top peak whose height is proportional to the charge deposited in the detector.
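For orientation, a common formulation from the literature combines deconvolution of the preamplifier decay (decay constant $\tau$) with differentiation over a window of $M$ samples,
\begin{equation}
  \mathrm{MWD}_M(n) = x(n) - x(n-M) + \frac{1}{\tau}\sum_{k=n-M}^{n-1} x(k),
\end{equation}
followed by a moving average over $L \leq M$ samples that forms the flat top. This formulation is shown only for illustration; the exact expressions used in the code may differ.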
\begin{figure}[tbp]
  \centering
  \includegraphics[width=0.90\linewidth]{figs/mwdInputOutput}
  \caption{MWD algorithm: input waveform on the left, and expected output on the right.}%
  \label{fig:mwdInputOutput}
\end{figure}

\section{Setup}%
\label{sec:set_up}

\subsection{Hardware}%
\label{sub:hardware}

A consumer-grade computer is used in this study:
\begin{itemize}
  \item CPU\@: AMD Ryzen 5 2400G, running at \SI{3.60}{\giga\hertz}, maximum frequency \SI{3.90}{\giga\hertz}
  \item RAM\@: \SI{16}{GB} DDR4-3000
  \item GPU\@: GeForce GTX 1060, \SI{6}{GB} GDDR5 memory, maximum frequency \SI{1.70}{\giga\hertz}
\end{itemize}

\subsection{Software}%
\label{sub:software}

The OS is Debian 10.2, with the following compilers:
\begin{itemize}
  \item gcc (Debian 8.3.0\hyp{}6) 8.3.0
  \item nvcc V9.2.148, CUDA 10.1, driver version 418.74
\end{itemize}

\section{Implementations}%
\label{sec:implementations}

There are two implementations of the MWD algorithm:
\begin{itemize}
  \item a C++ implementation that does all calculations on the CPU\@. It is used both to verify the accuracy of the other code and as a performance baseline;
  \item a CUDA implementation that offloads the digital pulse processing onto the GPU, while the CPU handles only input/output-related tasks.
\end{itemize}

\subsection{C++ code}%
\label{sub:c_code}

This implementation uses raw arrays wrapped in a \codeword{struct} to represent waveforms; pointers are managed manually. Three functions, \codeword{Deconvolute}, \codeword{OffsetDifferentiate}, and \codeword{MovingAverage}, correspond to the three stages of the MWD algorithm. The code runs on a single thread.
The related files are:

\begin{forest}
  for tree={
    font=\ttfamily, grow'=0, child anchor=west, parent anchor=south, anchor=west, calign=first,
    edge path={ \noexpand\path [draw, \forestoption{edge}] (!u.south west) +(7.5pt,0) |- node[fill,inner sep=1.25pt] {} (.child anchor)\forestoption{edge label}; },
    before typesetting nodes={ if n=1 {insert before={[,phantom]}} {} },
    fit=band, before computing xy={l=15pt},
  }
  [mwd
    [mwd.c]
    [srcs
      [vector.h]
      [vector.c]
      [algo.h]
      [algo.c]
    ]
  ]
\end{forest}

\subsection{CUDA code}%
\label{sub:cuda_code}

The CUDA code implements three GPU functions, \codeword{gpuDeconvolute}, \codeword{gpuOffsetDifferentiate}, and \codeword{gpuMovingAverage}, which replace the three C++ functions of the other implementation. There are also helpers for moving data between main memory and GPU memory, for error checking, and for timekeeping. The I/O part is the same as in the C++ version. The related files are:

\begin{forest}
  for tree={
    font=\ttfamily, grow'=0, child anchor=west, parent anchor=south, anchor=west, calign=first,
    edge path={ \noexpand\path [draw, \forestoption{edge}] (!u.south west) +(7.5pt,0) |- node[fill,inner sep=1.25pt] {} (.child anchor)\forestoption{edge label}; },
    before typesetting nodes={ if n=1 {insert before={[,phantom]}} {} },
    fit=band, before computing xy={l=15pt},
  }
  [mwd
    [gmwd.cu]
    [srcs
      [gpuAlgo.cu]
      [gpuAlgo.h]
      [gpuTimer.h]
      [gpuUtils.h]
      [prefixScan.cu]
      [prefixScan.h]
    ]
  ]
\end{forest}

\subsection{Benchmarking}%
\label{sub:benchmarking}

To compare the performance of the two implementations, tests are run with waveform lengths ranging from \numrange{246}{250000} samples. When a waveform length shorter than the original input is requested, successive sub\hyp{}waveforms of that length are cut from the original and fed into the algorithms. At each waveform length, the calculations are repeated \numrange{1000}{3000} times, and the time costs of I/O and of each MWD stage are recorded.
\section{Results}%
\label{sec:results}

Average time costs of the two implementations as functions of waveform length are shown in \cref{fig:amdTimeCost_full}. The CPU is much quicker than the GPU when the data size is small, but the CPU time grows much faster with waveform length than the GPU time does. For waveforms about \num{10000} samples long, the two codes take roughly the same time to complete. The GPU is about 30\% faster than the CPU when processing \num{250000}-sample waveforms. The growth rates are shown more clearly in \cref{fig:amdNormTimeCost_full}: the CPU time cost grows about twice as fast as the GPU time cost.

\begin{figure}[tbp]
  \centering
  \includegraphics[width=0.9\linewidth]{figs/amdTimeCost_full.png}
  \caption{Time costs of the C++ (blue) and CUDA (red) codes as functions of waveform length.}%
  \label{fig:amdTimeCost_full}
\end{figure}

\begin{figure}[tbp]
  \centering
  \includegraphics[width=0.9\linewidth]{figs/amdNormTimeCost_full.png}
  \caption{Normalized time costs of the C++ (blue) and CUDA (red) codes as functions of waveform length.}%
  \label{fig:amdNormTimeCost_full}
\end{figure}

\section{Code}%
\label{sec:code}

A tarball of the code is attached as \codeword{mwd.tar.gz}.

\end{document}