%%%%%%%%%%%%%%%%%%%%%%% file template.tex %%%%%%%%%%%%%%%%%%%%%%%%%
%
% This is a general template file for the LaTeX package SVJour3
% for Springer journals. Springer Heidelberg 2010/09/16
%
% Copy it to a new file with a new name and use it as the basis
% for your article. Delete % signs as needed.
%
% This template includes a few options for different layouts and
% content for various journals. Please consult a previous issue of
% your journal as needed.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% First comes an example EPS file -- just ignore it and
% proceed on the \documentclass line
% your LaTeX will extract the file if required
\begin{filecontents*}{example.eps}
%!PS-Adobe-3.0 EPSF-3.0
%%BoundingBox: 19 19 221 221
%%CreationDate: Mon Sep 29 1997
%%Creator: programmed by hand (JK)
%%EndComments
gsave
newpath
20 20 moveto
20 220 lineto
220 220 lineto
220 20 lineto
closepath
2 setlinewidth
gsave
.4 setgray fill
grestore
stroke
grestore
\end{filecontents*}
%
\RequirePackage{fix-cm}
%
%\documentclass{svjour3} % onecolumn (standard format)
%\documentclass[smallcondensed]{svjour3} % onecolumn (ditto)
%\documentclass[smallextended]{svjour3} % onecolumn (second format)
\documentclass[smallextended,twocolumn]{svjour3} % twocolumn
%
\journalname{Behavior Research Methods}
\smartqed % flush right qed marks, e.g. at end of proof
%
\usepackage{graphicx}
%
%\usepackage{mathptmx} % use Times fonts if available on your TeX system
%
%\usepackage[natbibapa]{apacite}
\usepackage{natbib}
%\usepackage{tabularx}
\usepackage{amsmath}
\usepackage{textcomp}
\usepackage{booktabs}
\usepackage{units}
\usepackage[draft]{hyperref} %tmp added draft option for spilling refs
\usepackage{wrapfig}
\usepackage{todonotes}
\newcommand{\eg}{e.g., }
\newcommand{\ie}{i.e., }
\newcommand{\remodnav}{REMoDNaV}
\newcommand{\fig}[1]{{Figure~\ref{fig:#1}}}
\newcommand{\tab}[1]{{Table~\ref{tab:#1}}}
\newcommand{\param}[1]{{\texttt{#1}}}
\begin{document}
\input{results_def.tex}
\onecolumn
\title{REMoDNaV: Robust Eye-Movement Classification for Dynamic Stimulation} %\\ (remodnav)
% \titlenote{The title should be detailed enough for someone to know whether
% the article would be of interest to them, but also concise. Please ensure the
% broadness and claims within the title are appropriate to the content of the
% article itself.}
\author{%
Asim~H.~Dar\textsuperscript{*} \and
Adina~S.~Wagner\textsuperscript{*} \and
Michael~Hanke\\
{\small \textsuperscript{*} Both authors have contributed equally}}
\institute{Asim~H.~Dar \at
Special Lab Non-Invasive Brain Imaging, Leibniz Institute for Neurobiology, Brenneckestra{\ss}e~6, Magde\-burg, Germany
%Tel.: +123-45-678910\\
%Fax: +123-45-678910\\
%\email{fauthor@example.com} % \\
% \emph{Present address:} of F. Author % if needed
\and
Adina Wagner \at
Psychoinformatics lab, Institute of Neuroscience and Medicine (INM-7: Brain and Behaviour), Research Centre Jülich, Germany
\and
Michael Hanke \at
Psychoinformatics lab, Institute of Neuroscience and Medicine (INM-7: Brain and Behaviour), Research Centre Jülich and Institute of Systems Neuroscience, Medical Faculty,
Heinrich Heine University Düsseldorf, Germany
\email{michael.hanke@gmail.com}
}
%\affil[1]{Psychoinformatics Lab, Institute of Psychology, Otto-von-Guericke University, Magdeburg, Germany}
\date{Received: date / Accepted: date}
\maketitle
% Please list all authors that played a significant role in the research
% involved in the article. Please provide full affiliation information
% (including full institutional address, ZIP code and e-mail address) for all
% authors, and identify who is/are the corresponding author(s).
\begin{abstract}
% Abstracts should be up to 300 words and provide a succinct summary of the
% article. Although the abstract should explain why the article might be
% interesting, care should be taken not to inappropriately over-emphasize the
% importance of the work described in the article. Citations should not be used
% in the abstract, and the use of abbreviations should be minimized. If you are
% writing a Research or Systematic Review article, please structure your
% abstract into Background, Methods, Results, and Conclusions.
Tracking of eye movements is an established measurement for many types of
experimental paradigms.
More complex and more prolonged visual stimuli have made algorithmic approaches to
eye movement event classification the most pragmatic option.
A recent analysis revealed that many current algorithms perform poorly on
data from viewing dynamic stimuli such as video sequences.
Here we present an event classification algorithm---built on an existing
velocity-based approach---that is suitable for both static and dynamic
stimulation, and is capable of classifying saccades, post-saccadic
oscillations, fixations, and smooth pursuit events.
We validated classification performance and robustness on three public datasets:
1)~manually annotated, trial-based gaze trajectories for viewing static images,
moving dots, and short video sequences, 2)~lab-quality gaze recordings for a
feature length movie, and 3)~gaze recordings acquired under suboptimal lighting
conditions inside the bore of a magnetic resonance imaging (MRI) scanner for
the same full-length movie.
We found that the proposed algorithm performs on par with or better than
state-of-the-art alternatives for static stimulation. Moreover, it yields
eye movement events with biologically plausible characteristics on prolonged
dynamic recordings. Lastly, algorithm performance is robust
on data acquired under suboptimal conditions that exhibit a temporally
varying noise level.
These results indicate that the proposed algorithm is a robust tool with
improved classification accuracy across a range of use cases.
The algorithm is cross-platform compatible, implemented using the
Python programming language, and readily available as free and open source software
from public sources.
\keywords{%
eye tracking \and
adaptive classification algorithm \and
saccade classification algorithm \and
statistical saccade analysis \and
glissade classification \and
adaptive threshold algorithm \and
data preprocessing
}
\end{abstract}
% \todo[inline]{The scope of the article is a "Data Note" that describes new
% "derived" data generated from the raw eyetracking data released by the
% studyforrest project. I propose to produce two types of artifacts: 1.
% filtered/preprocessed eyetracking data, and 2. a list with detected saccades
% for each recording.}
% \todo[inline]{It would be good to also release fully preprocessed data. Apart
% from applying the chosen filter, it would also make sense to me to temporally
% down-sample the data. What would be a practical sampling rate that reduces
% the data size (and some noise), but does not negatively impact most potential
% analyses? 100 Hz? 200 Hz? Even in the latter case it would still be a 5x
% reduction in size.}
% \todo[inline, backgroundcolor = green]{250 Hz is the lower limit to detect
% most saccades (Kern 2000). However,the downsampled data did not reach the
% same accuracy level as the higher frequency data, probably because the
% algorithm was designed to work on higher frequencies (as stated by the
% authors). Therefore we left it at the 1000 Hz frequency}
\twocolumn
\section*{Introduction}\label{intro}
% \todo[inline]{\textit{make connection to studyforrest. studyforrest has
% eyetracking data. why is it necessary to have that preprocessed?}}
% The data used for this thesis originates from the open science project
% 'studyforrest'. It centers around two large data acquisition phases employing
% the movie 'Forrest Gump' as stimulus. \cite{Hanke.2014,Hanke.2016} The
% project provides a large variety of collections of data to enable fellow
% researchers to build upon existing knowledge and further extend the dataset.
% Along other measures, the eye gaze coordinates were being recorded during the
% original sessions. Contrarily to the standard automatic detection process we
% applied an adaptive algorithm to the eye movement data to provide a more
% precise computation of saccades and fixations.
A growing theme in cognitive neuroscience is the use of dynamic and naturalistic
stimuli, such as video clips or movies, as opposed to static and isolated
stimuli \citep{real_world}. Dynamic stimuli promise to reveal
the nuances of cognition in a more life-like environment \citep{maguire2012studying}.
Interesting applications include determining the neural
response to changes in facial expression \citep{Harris2014}, understanding
complex social interactions through videos \citep{Tikka2012}, and less
explored topics such as the processing of music
\citep{Toiviainen2014}. In such studies, an unobtrusive behavioral measurement
is required to quantify the relationship between stimulus and response.
Tracking the focus of participants' gaze is a suitable, well-established
method that has been employed successfully in studies of visual attention
\citep{HantaoLiu2011}, memory \citep{Hannula2010}, and language comprehension
\citep{Gordon2006}.
%
Regardless of use case, the raw eye tracking data (gaze position coordinates)
provided by eye tracking devices are rarely used ``as is''. Instead, in order
to disentangle different cognitive, oculomotor, or perceptive states
associated with different types of eye movements, most research relies on the
classification of eye gaze data into distinct eye movement event categories
\citep{Schutz2011}. The most feasible approach for doing this lies in the
application of appropriate event classification algorithms.
However, a recent comparison of algorithms found that while many readily
available algorithms for eye movement classification performed well on data
from static stimulation or short trial-based acquisitions with simplified
moving stimuli, none worked particularly well on data from complex
dynamic stimulation, such as video clips, when compared to human coders
\citep{Andersson2017}.
%
And indeed, when we evaluated an algorithm by \citet{Nystrom2010AnData}, one of
the winners in the aforementioned comparison, on data from prolonged
stimulation (\unit[$\approx$15]{min}) with a feature film, we found the
average and median durations of labeled fixations to exceed literature
reports \citep[\eg][]{holmqvist2011eye,dorr2010variability} by up to a factor
of two. Additionally, and in particular for increasing levels of noise in the
data, the algorithm classified too few fixations, as also noted by
\citet{Friedman2018}, because it discarded potential fixation events that
contained data artifacts such as signal-loss and distortion associated with
blinks.
%
%However, robust performance on noisy data is of particular relevance in the
%context of ``natural stimulation'', as the ultimate natural stimulation is the
%actual natural environment, and data acquired outdoors or with mobile
%devices typically does not match the quality achieved in dedicated lab
%setups.
Therefore our objective was to improve upon the available eye movement
classification algorithms, and develop a tool that performs
robustly on data from dynamic, feature-rich stimulation, without sacrificing classification
accuracy for static and simplified stimulation. Importantly, we aimed for
applicability to prolonged recordings that potentially exhibit periods of
signal-loss and non-stationary noise levels.
Finally, one of our main objectives was to keep the algorithm as accessible
and readily available as possible, in order to avoid the difficulties associated
with closed-source software or the non-public source code of published
algorithms.
% maybe work in \citep{Hooge2018} again
Following the best practices proposed by \citet{hessels2018eye},
we define the different eye movements supported by our algorithm
along functional and oculomotor dimensions as follows:
A \textit{fixation} is a period of time during which a part of the visual stimulus
is looked at and thereby projected to a relatively constant location on the retina.
This type of eye movement is necessary for visual intake, and characterized by a
relatively still gaze position with respect to the world (e.g., a computer screen
used for stimulus presentation) in the eye-tracker signal.
A fixation event therefore excludes periods of \textit{smooth pursuit}.
These events are eye movements during which a
part of the visual stimulus that moves with respect to the world is looked at for
visual intake (e.g., a moving dot on a computer screen). As during fixations,
the stimulus is projected to a relatively constant location on the retina
\citep{carl1987pursuits}; however, the event is characterized by a steadily changing
gaze position in the eye-tracker signal.
If this type of eye movement is not properly classified,
erroneous fixation and saccade events (which smooth pursuits would be classified into
instead) are introduced \citep{Andersson2017}. Contemporary algorithms rarely provide
this functionality \citep[but see \eg][for existing algorithms with
smooth pursuit classification]{LARSSON2015145,Komogortsev2013}.
\textit{Saccades}, on the other hand, are also characterized by changing gaze positions,
but their velocities are usually higher than those of pursuit movements.
They serve to shift the position of the eye to a target region, and, unlike
during pursuit or fixation events, visual intake
is suppressed \citep{Schutz2011}. Lastly, \textit{post-saccadic oscillations} are periods of
ocular instability after a saccade \citep{Nystrom2010AnData}.
Here we introduce \remodnav\ (robust eye movement classification for dynamic
stimulation), a new tool that aims to meet our objectives and classifies the
eye movement events defined above. It is built on the
aforementioned algorithm by \citet{Nystrom2010AnData} (subsequently labeled NH)
that employs an adaptive approach to velocity based eye movement event
classification. \remodnav\ enhances NH with the use of robust
statistics, and a compartmentalization of prolonged time series into short,
more homogeneous segments with more uniform noise levels.
Furthermore, it adds support for pursuit event classification.
Like the original algorithm, its frame of reference is world-centered,
\ie the gaze coordinates refer to a stimulation set-up with a fixed
position in the world, such as x and y coordinates in pixels on a computer screen,
and it is meant to be used with eye tracking data from participants viewing
static (\eg images) or dynamic (\eg videos) stimuli, recorded with remote or
tower-mounted eye trackers.
Importantly, it is built and distributed as free, open source software,
and can be easily obtained and executed with free tools.
We evaluated \remodnav\ on three different
datasets from conventional paradigms and from dynamic, feature-rich stimulation (high and lower
quality), and relate its performance to the algorithm comparison by
\cite{Andersson2017}.
\section*{Methods}\label{methods}
% Methods (3 sections; our algo, comparison to current algos, application on
% studyforrest dataset)
%\ todo[inline]{\textit{Elaborate on how the algorithm works;For software tool
%papers, this section should address how the tool works and any relevant
%technical details required for implementation of the tool by other
%developers.}}
Event classification algorithms can be broadly grouped into \textit{velocity-} and
\textit{dispersion-}based algorithms. The former rely on velocity thresholds to
differentiate between different eye movement events, while the latter classify
eye movements based on the size of the region the recorded data falls
into for a given amount of time \citep{holmqvist2011eye}. Both types of algorithms
are common (see e.g., \citet{hessels2017noise} for a recent dispersion-based,
and e.g., \citet{van2018gazepath} for a recent velocity-based solution,
and see \citet{dalveren2019evaluation} for an evaluation of common algorithms
of both types).
Like NH, \remodnav\ is a \textit{velocity-based} event classification algorithm.
The algorithm comprises two major steps: preprocessing and event classification. The following
sections detail the individual analysis steps. For each step, the relevant algorithm
parameters are given in parentheses.
\fig{alg} provides an overview of the algorithm's main components.
\tab{parameters} summarizes all parameters, and lists their default values.
The computational definitions of the different eye movements
\citep{hessels2018eye} are given within the event classification description.
Note, however, that some of the computational definitions of eye movements can be
adjusted to comply with alternative definitions by changing the algorithm's
parameters.
\subsection*{Preprocessing}
The goal of data preprocessing is to compute a time series of eye movement
velocities on which the event classification algorithm can be executed, while jointly
reducing non-eyemovement-related noise in the data as much as possible.
First, implausible spikes in the coordinate time series are removed with a
heuristic spike filter \citep{stampe1993} (\fig{alg}, P1). This filter is
standard in many eye tracking toolboxes and often used for preprocessing
\citep[\eg][]{Friedman2018}.
%
Data samples around signal loss (\eg eye blinks) can be set to non-numeric values (NaN)
in order to eliminate spurious movement signals without shortening the time series
(\param{dilate\_nan}, \param{min\_blink\_duration}; \fig{alg}, P2). This is
motivated by the fact that blinks can produce artifacts in the eye-tracking signal when the
eyelid closes and re-opens \citep{choe2016pupil}.
%
Coordinate time series are temporally filtered in two different ways
(\fig{alg}, P3). A relatively large median filter
(\param{median\_filter\_length}) is used to emphasize large amplitude saccades. This type of
filtered data is later used for a coarse segmentation of a time series into
shorter intervals between major saccades.
%
Separately, data are also smoothed with a Savitzky-Golay filter
(\param{savgol\_ \{length,polyord\}}). All event classification beyond the
localization of major saccades for time series chunking is performed on this
type of filtered data.
After spike-removal and temporal filtering, movement velocities are computed.
To disregard biologically implausible measurements, a
configurable maximum velocity (\param{max\_vel}) is enforced---any samples
exceeding this threshold are replaced by this set value.
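For illustration, a minimal sketch of this smoothing and velocity computation is
given below, using standard NumPy/SciPy tools and the parameter names from
\tab{parameters}. The function name and exact filter handling are illustrative
and simplified relative to the actual \remodnav\ implementation.
\begin{verbatim}
# Simplified sketch, not the REMoDNaV source:
# smoothing and velocity computation.
import numpy as np
from scipy.signal import savgol_filter

def compute_velocities(x, y, px2deg,
                       sampling_rate,
                       savgol_length=0.019,
                       savgol_polyord=2,
                       max_vel=1000.0):
    # Savitzky-Golay window in samples (odd)
    win = int(savgol_length * sampling_rate) | 1
    xs = savgol_filter(x, win, savgol_polyord)
    ys = savgol_filter(y, win, savgol_polyord)
    # sample-to-sample velocity in deg/s
    vel = np.hypot(np.diff(xs), np.diff(ys)) \
        * px2deg * sampling_rate
    # clamp implausible velocities at max_vel
    return np.minimum(vel, max_vel)
\end{verbatim}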
%The result of a default preprocessing procedure is displayed in \fig{preproc}.
%
%\begin{figure}
% \includegraphics[width=0.5\textwidth]{img/preproc.pdf}
% \caption{Examplary preprocessing.}
% \label{fig:preproc}
%\end{figure}
\begin{figure*}
\includegraphics[width=1\textwidth]{img/flowchart_2.pdf}
\caption{Schematic algorithm overview.
(A) Preprocessing. The two plots show raw (blue) and processed (black)
time series after preprocessing with the default parameter values
(see Table \ref{tab:parameters} for details).
(B) Adaptive saccade velocity computation and time series chunking.
Starting from an initial velocity threshold (\param{velthresh\_startvelocity}),
a global velocity threshold is iteratively determined. The time series is chunked
into intervals between the fastest saccades across the complete recording.
(C) Saccade and PSO classification.
Saccade on- and offsets, and PSO on- and offsets are classified based on adaptive
velocity thresholds computed within the respective event contexts.
The default context is either \unit[1]{s} centered on the peak velocity for saccadic
events used for time series chunking, or the entire time series chunk for
intersaccadic intervals. PSOs are classified into low- or high-velocity PSOs
depending on whether they exceed the saccade onset- or peak-velocity threshold.
(D) Fixation and pursuit classification.
Remaining unlabeled segments are filtered with a low-pass Butterworth filter. Samples
exceeding a configurable pursuit velocity threshold (\param{pursuit\_velthresh})
are classified as pursuits, and segments that do not qualify as pursuits are
classified as fixations.
}
\label{fig:alg}
\end{figure*}
\begin{table*}[tbp]
\caption{Exhaustive list of algorithm parameters, their default values, and units.}
\label{tab:parameters}
\small
\begin{tabular}{lp{85mm}l}
\textbf{Name} & \textbf{Description} & \textbf{Value} \\
& & \\
\multicolumn{3}{l}{\textit{Preprocessing (in order of application during processing)}} \\
\texttt{px2deg} &
size of a single (square) pixel &
no default [\unit{deg}]\\
\texttt{sampling\_rate} &
temporal data sampling rate/frequency &
no default [\unit{Hz}]\\
\texttt{min\_blink\_duration} &
missing data windows shorter than this duration will not be considered for \texttt{dilate\_nan}&
\unit[0.02]{s}\\
\texttt{dilate\_nan} &
duration for which to replace data by missing data markers on either side of a
signal-loss window (\fig{alg}, P2)&
\unit[0.01]{s}\\
\texttt{median\_filter\_length} &
smoothing median-filter size (for initial data chunking only) (\fig{alg}, P3)&
\unit[0.05]{s}\\
\texttt{savgol\_length} &
size of Savitzky-Golay filter for noise reduction (\fig{alg}, P3)&
\unit[0.019]{s}\\
\texttt{savgol\_polyord} &
polynomial order of Savitzky-Golay filter for noise reduction (\fig{alg}, P3)&
2\\
\texttt{max\_vel} &
maximum velocity threshold, will replace value with maximum, and issue
warning if exceeded to inform about
potentially inappropriate filter settings
\citep[default value based on ][]{holmqvist2011eye}&
\unit[1000]{deg/s}\\
\\\multicolumn{3}{l}{\textit{Event classification}} \\
\texttt{min\_saccade\_duration} &
minimum duration of a saccade event candidate (\fig{alg}, E3) &
\unit[0.01]{s}\\
\texttt{max\_pso\_duration} &
maximum duration of a post-saccadic oscillation (glissade) (\fig{alg}, E3) &
\unit[0.04]{s}\\
\texttt{min\_fixation\_duration} &
minimum duration of a fixation event candidate (\fig{alg}, E4)&
\unit[0.04]{s}\\
\texttt{min\_pursuit\_duration} &
minimum duration of a pursuit event candidate (\fig{alg}, E4)&
\unit[0.04]{s}\\
\texttt{min\_intersaccade\_duration} &
no saccade classification is performed in windows shorter than twice this value, plus minimum saccade and PSO duration (\fig{alg}, E2)&
\unit[0.04]{s}\\
\texttt{noise\_factor} &
adaptive saccade onset threshold velocity is the median absolute deviation of velocities in the window of interest, times this factor (peak velocity threshold is twice the onset velocity); increase for noisy data to reduce false positives \citep[equivalent: 3.0]{Nystrom2010AnData} (\fig{alg}, E1)&
5\\
\texttt{velthresh\_startvelocity} &
start value for adaptive velocity threshold algorithm \citep{Nystrom2010AnData}, should
be larger than any conceivable minimum saccade velocity (\fig{alg}, E1)&
\unit[300]{deg/s}\\
\texttt{max\_initial\_saccade\_freq} &
maximum saccade frequency for initial classification of major saccades, initial data
chunking is stopped if this frequency is reached (should be smaller than an expected
(natural) saccade frequency in a particular context), default based on literature reports of a natural, free-viewing saccade frequency of \unit[$\sim$1.7 $\pm$0.3]{Hz} during a movie stimulus \citep{amit2017temporal} (\fig{alg}E1)&
\unit[2]{Hz}\\
\texttt{saccade\_context\_window\_length} &
size of a window centered on any velocity peak for adaptive determination of
saccade velocity thresholds (for initial data chunking only) (\fig{alg}, E2)&
\unit[1]{s}\\
\texttt{lowpass\_cutoff\_freq} &
cut-off frequency of a Butterworth low-pass filter applied to determine drift
velocities in a pursuit event candidate (\fig{alg}, E4)&
\unit[4]{Hz}\\
\texttt{pursuit\_velthresh} &
fixed drift velocity threshold to distinguish periods of pursuit from periods of fixation; higher than natural ocular drift velocities during fixations \citep[\eg ][]{GOLTZ1997789,cherici2012} (\fig{alg}, E4)&
\unit[2]{deg/s}\\
\end{tabular}
\end{table*}
\subsection*{Event classification}
\subsubsection*{Saccade velocity threshold}
Except for a few modifications, \remodnav\ employs the adaptive saccade
classification algorithm proposed by \cite{Nystrom2010AnData}, where saccades are
initially located by thresholding the velocity time series by a critical value.
Starting from an initial velocity threshold (\param{velthresh\_startvelocity},
termed $PT_1$ in NH), the critical value is determined adaptively by computing
the variance of sub-threshold velocities ($V$), and placing the new velocity
threshold at:
%
\begin{equation}
PT_n = \overline{V}_{n-1} + F \times
\sqrt{\frac{\sum(V_{n-1} - \overline{V}_{n-1})^2}{N-1}}
\end{equation}
%
where $F$ determines how many standard deviations above the average velocity
the new threshold is located. This procedure is repeated until the threshold
velocity stabilizes, \ie until
%
\begin{equation} |PT_n - PT_{n-1}| < 1^\circ/s. \end{equation}
\remodnav\ alters this algorithm by using robust statistics that are more
suitable for the non-normal distribution of velocities \citep{Friedman2018},
such that the new threshold is computed by:
%
\begin{equation}\label{eq:threshold}
PT_n = median({V}_{n-1}) + F \times MAD({V}_{n-1})
\end{equation}
%
where $MAD$ is the median absolute deviation, and $F$ is a
scalar parameter of the algorithm.
This iterative process is illustrated in \fig{alg}, E1 (upper panel).
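As an illustration, the sketch below implements the iteration of equation
\ref{eq:threshold} with plain NumPy. The function name and the termination
tolerance argument are illustrative, and the actual \remodnav\ implementation
may differ in detail.
\begin{verbatim}
# Simplified sketch of the adaptive velocity
# threshold iteration (equation 3).
import numpy as np

def adaptive_threshold(vel, start=300.0,
                       noise_factor=5.0,
                       tol=1.0):
    pt = start
    while True:
        sub = vel[vel < pt]
        med = np.median(sub)
        mad = np.median(np.abs(sub - med))
        new_pt = med + noise_factor * mad
        if abs(new_pt - pt) < tol:
            return new_pt
        pt = new_pt
\end{verbatim}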
% Adina: removed in favor of new algorithm overview
%\begin{figure}
% \includegraphics[width=0.5\textwidth]{img/vel_est_1.pdf}
% \caption{Iterative, global estimation of velocity thresholds
% for saccades (SACC), and high/low velocity post saccadic oscillations (HPSO/LPSO).
% The method is adapted from \cite{Nystrom2010AnData}, but is modified to use robust statistics
% with median absolute deviation (MAD) as a measure of variability, more suitable
% for data with a non-normal distribution.}
% \label{fig:velest1}
%\end{figure}
\subsection*{Time series chunking}
As the algorithm aims to be applicable to prolonged recordings with
potentially inhomogeneous noise levels, the time series needs
to be split into shorter chunks to prevent the negative impact of sporadic
noise flares on the aforementioned adaptive velocity thresholding procedure.
\remodnav\ implements this time-series chunking by determining a critical velocity on a
median-filtered (\param{median\_filter\_length}) time series comprising the
full duration of a recording (\fig{alg}, E2). Due to potentially elevated noise
levels, the resulting threshold tends to overestimate an optimal threshold.
Consequently, only periods of fastest eye movements will exceed this threshold.
All such periods of consecutive above-threshold velocities are weighted by the
sum of these velocities. Boundaries of time series chunks are determined by
selecting such events sequentially (starting with the largest sums), until a
maximum average frequency across the whole time series is reached
(\param{max\_initial\_saccade\_ freq}). The resulting chunks represent data
intervals between saccades of maximum magnitude in the respective data.
\fig{alg}, E3 (right) exemplifies event classification within such an intersaccadic interval.
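The chunking logic can be sketched as follows; variable and function names are
illustrative, and the actual implementation differs in detail.
\begin{verbatim}
# Simplified sketch of time series chunking.
import numpy as np

def major_saccade_periods(vel, threshold,
                          sampling_rate,
                          max_saccade_freq=2.0):
    # runs of consecutive above-threshold samples
    # (zero-padding handles series boundaries)
    above = np.r_[0, (vel > threshold).astype(int), 0]
    edges = np.diff(above)
    starts = np.where(edges == 1)[0]
    ends = np.where(edges == -1)[0]
    runs = list(zip(starts, ends))
    # weight each run by the sum of its velocities
    weights = [vel[s:e].sum() for s, e in runs]
    # keep the largest runs until the maximum
    # average saccade frequency is reached
    n_max = int(max_saccade_freq
                * len(vel) / sampling_rate)
    keep = np.argsort(weights)[::-1][:n_max]
    return sorted(runs[i] for i in keep)
\end{verbatim}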
\subsection*{Classification of saccades and post-saccadic oscillations}
Classification of these event types is identical to the NH algorithm, only the data
context and metrics for determining the velocity thresholds differ. For
saccades that also represent time series chunk boundaries (event label
\texttt{SACC}), a context of \unit[1]{s}
(\param{saccade\_context\_window\_ length}) centered on the peak velocity is
used by default, for any other saccade (event label \texttt{ISAC}) the entire
time series chunk represents that context (\fig{alg}, E3).
Peak velocity threshold and on/offset velocity threshold are then determined by
equation \ref{eq:threshold} with $F$ set to $2\times\mathtt{noise\_factor}$ and
\param{noise\_factor}, respectively. Starting from a velocity peak, the
immediately preceding and the following velocity minima that do not exceed the
on/offset threshold are located and used as event boundaries. Qualifying events
are rejected if they do not exceed a configurable minimum duration or violate
the set saccade maximum proximity criterion (\param{min\_ saccade\_duration},
\param{min\_intersaccade\_duration}).
As in NH, post-saccadic oscillations are events that immediately follow a
saccade, where the velocity exceeds the saccade velocity threshold within a short
time window (\param{max\_pso\_duration}). \remodnav\ distinguishes low-velocity
(event label \texttt{LPSO} for chunk boundary event, \texttt{ILPS} otherwise)
and high-velocity oscillations (event label \texttt{HPSO} or \texttt{IHPS}),
where the velocity exceeds the saccade onset or peak velocity threshold,
respectively.
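A sketch of the on- and offset search around a velocity peak is shown below; it
is a simplified illustration with assumed function names, not the exact
\remodnav\ routine.
\begin{verbatim}
# Simplified sketch: locate saccade on-/offset
# as the closest sub-threshold velocity minima
# around a velocity peak.
def saccade_bounds(vel, peak, thresh):
    i = peak
    while i > 0 and not (vel[i] < thresh
                         and vel[i] <= vel[i - 1]):
        i -= 1
    j = peak
    while j < len(vel) - 1 and not (
            vel[j] < thresh
            and vel[j] <= vel[j + 1]):
        j += 1
    return i, j
\end{verbatim}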
% Adina: Removed in favor of new algorithm overview
%\begin{figure}
% \includegraphics[width=0.5\textwidth]{img/vel_est_2.pdf}
% \caption{Iterative event classification between major saccades (SACC).
% The algorithm reports saccades within major saccade windows (ISAC),
% high/low velocity post saccadic oscillations after ISAC events (IHPS/ILPS),
% fixations (FIXA), and smooth pursuits (PURS).}
% \label{fig:velest2}
%\end{figure}
\subsection*{Pursuit and fixation classification}
For all remaining, unlabeled time series segments that are longer than a
minimum duration (\param{min\_fixation\_ duration}), velocities are low-pass
filtered (Butterworth, \param{lowpass\_cutoff\_freq}). Any segments
exceeding a minimum velocity threshold (\param{pursuit\_velthresh}) are
classified as pursuit (event label \texttt{PURS}). Pursuit on/offset classification
uses the same approach as that for saccades: search for local minima preceding
and following the above-threshold velocities.
%
Any remaining segment that does not qualify as a pursuit event is classified
as a fixation (event label \texttt{FIXA}) (\fig{alg}, E4).
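The following sketch illustrates this decision for a single unlabeled segment;
the Butterworth filter order is an assumption made for this illustration and is
not taken from the \remodnav\ source.
\begin{verbatim}
# Simplified sketch: label one unlabeled segment
# as pursuit or fixation based on low-pass
# filtered (drift) velocities.
from scipy.signal import butter, filtfilt

def label_segment(vel, sampling_rate,
                  lowpass_cutoff_freq=4.0,
                  pursuit_velthresh=2.0):
    # filter order of 2 is an assumption
    b, a = butter(2, lowpass_cutoff_freq,
                  fs=sampling_rate)
    drift = filtfilt(b, a, vel)
    if (drift > pursuit_velthresh).any():
        return 'PURS'
    return 'FIXA'
\end{verbatim}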
\subsection*{Operation}\label{op}
\remodnav\ is free and open-source software, written in the Python language and
released under the terms of the MIT license. In addition to the Python standard
library it requires the Python packages
%
NumPy \citep{oliphant2006guide},
Matplotlib \citep{hunter2007matplotlib},
statsmodels \citep{seabold2010statsmodels},
and SciPy \citep{JOP+2001} as software dependencies.
Furthermore, DataLad \citep{HH+2013},
and Pandas \citep{mckinney2010data}
%
have to be available to run the test
battery. \remodnav\ itself, and all software dependencies are available on all
major operating systems. There are no particular hardware requirements for
running the software other than sufficient memory to load and process the data.
A typical program invocation looks like
%
\begin{verbatim}
remodnav <inputfile> <outputfile> \
<px2deg> <samplingrate>
\end{verbatim}
%
where \texttt{<inputfile>} is the name of a tab-separated-value (TSV) text file
with one gaze coordinate sample per line. An input file can have any number of
columns, only the first two columns are read and interpreted as $X$ and $Y$
coordinates. Note that this constrains input data to a dense data representation,
i.e. either data from eye trackers with fixed sampling frequency throughout the
recording, or sparse data that has been transformed into a dense representation
beforehand.
The second argument \texttt{<outputfile>} is the file name of a
BIDS-compliant \citep{gorgolewski2016brain} TSV text file that will contain a
report on one classified eye movement event per line, with onset and offset time,
onset and offset coordinates, amplitude, peak velocity, median velocity and
average velocity. The remaining arguments are the only two mandatory
parameters: the conversion factor from pixels to visual degrees, \ie the visual
angle of a single (square) pixel (\texttt{<px2deg>} in \unit{deg}), and the
temporal sampling rate (\texttt{<sampling\_rate>} in \unit{Hz}).
Any other supported parameter can be added to the program invocation to override
the default values.
A complete list of supported parameters (sorted by algorithm step), with their
descriptions and default values, is given in \tab{parameters}.
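For example, a hypothetical invocation that overrides two defaults could look
as follows; the option names and values shown here are purely illustrative, and
the exact spelling should be taken from the program's help output.
\begin{verbatim}
remodnav <inputfile> <outputfile> \
    <px2deg> <samplingrate> \
    --min-fixation-duration 0.04 \
    --pursuit-velthresh 5.0
\end{verbatim}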
While the required user input is kept minimal, the number of configurable
parameters is purposefully large to facilitate optimal parameterization for
data with specific properties. Besides the list of classified events, a
visualization of the classification results, together with the time courses of
horizontal and vertical gaze position and velocity, is provided for
illustration and initial quality assessment of algorithm performance on each
input data file.
\section*{Validation analyses}\label{ana}
% \todo[inline]{three major types of comparison: with andersson human labeling,
% stats of forrest lab recording with andersson video data stats, forrest lab
% vs forrest mri stats. the goal is to show that we are similar to humans, as
% good (or better) as other algorithms (by comparison with scores in
% andersson2017), and proceduce "similar" results on a different movie dataset,
% and similar results across two different qualities of recordings with the
% same stimulus (lab vs MRI). No more, no less IMHO. This all translates to
% three use cases: trial-by-trial data (from anderson), good movie data without
% trial structure (forrest lab), bad movie data (forrest mri)}
% THIS SECTION WILL BASICALLY SHOW THE INPUTS AND THE OUTPUTS(RESULTS
% BASICALLY)
The selection of datasets and analyses for validating algorithm performance was
guided by three objectives: 1) compare to other existing
solutions; 2) demonstrate plausible results on data from prolonged gaze
coordinate recordings during viewing of dynamic, feature-rich stimuli; and 3) illustrate result
robustness on lower-quality data. The following three sections each introduce a
dataset and present the validation results for these objectives. All analysis
presented here are performed using default parameters (\tab{parameters}), with
no dataset-specific tuning other than the built-in adaptive behavior.
\subsection*{Algorithm comparison}\label{ana_1}
Presently, \cite{Andersson2017} represents the most comprehensive comparative
study on eye movement classification algorithms. Moreover, the dataset employed
in that study was made publicly available. Consequently, evaluating \remodnav\
performance on these data and using their metrics offers a straightforward
approach to relate this new development to alternative solutions.
% dataset
The dataset provided by
\cite{Andersson2017}\footnote{github.com/richardandersson/EyeMovementDetector\linebreak[0]Evaluation}
consists of monocular eye gaze data produced from viewing stimuli from three
distinct categories---images, moving dots and videos. The data release contains
gaze coordinate time series (\unit[500]{Hz} sampling rate), and metadata on
stimulus size and viewing distance. Importantly, each time point was manually
classified by two expert human raters as one of six event categories: fixation,
saccade, PSO, smooth pursuit, blink and undefined (a sample that did not fit
any other category). A minor labeling mistake reported in \cite{Zemblys2018}
was fixed prior to this validation analysis.
For each stimulus category, we computed the proportion of misclassifications
per event type, comparing \remodnav\ to each of the human coders, and, as a
baseline measure, the human coders against each other.
%
A time point was counted as misclassified if the two compared classifications
did not assign the same label. We limited this analysis to all time points that
were labeled as fixation, saccade, PSO, or pursuit by any method, hence
ignoring the rarely used NaN/blinks or ``undefined" category. For a direct
comparison with the results in \cite{Andersson2017}, the analysis was repeated
while also excluding samples labeled as pursuit.
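Operationally, this amounts to a simple sample-wise comparison, as sketched
below with illustrative label names.
\begin{verbatim}
# Simplified sketch of the misclassification
# rate for two label sequences
# (illustrative label names).
import numpy as np

def misclassification(a, b,
                      events=('FIX', 'SAC',
                              'PSO', 'SP')):
    a, b = np.asarray(a), np.asarray(b)
    # keep samples labeled with any of the event
    # categories by either classification
    mask = np.isin(a, events) | np.isin(b, events)
    return np.mean(a[mask] != b[mask])
\end{verbatim}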
In the labeled data, there was no distinction made between high- and low-velocity
PSOs, potentially because the literature following \citet{Nystrom2010AnData}
did not adopt their differentiation of PSOs into velocity categories.
All high- and low-velocity PSOs classified by \remodnav\ were therefore
collapsed into a single PSO category. \tab{mclf} shows the
misclassification rates for all pairwise comparisons, in all stimulus types.
In comparison to the NH algorithm, on which the proposed work is modelled,
\remodnav\ performed consistently better (32/93/70\% average misclassification for NH,
vs. \imgMNALMclfWOP/\dotsRAALMclfWOP/\videoRAALMclfWOP\% worst
misclassification for \remodnav\ in the images, dots, and videos categories). Compared to all ten
algorithms evaluated in \citet{Andersson2017}, \remodnav\ exhibits the lowest
misclassification rates across all stimulus categories.
%
When taking smooth pursuit events into account, the misclassification rate
naturally increases, but remains comparably low. Importantly, \remodnav\ still exceeds the
performance of all algorithms tested in \citet{Andersson2017} in the dots
and videos categories, and performs among the best in the images category.
Additionally, both with and without smooth pursuit, \remodnav's performance
also exceeds that of a recent deep neural network trained specifically on
video clips \citep[compare Table 7: 34\% misclassification versus \videoMNALMCLF\%
for \remodnav]{Startsev2018}.
\begin{table}[tbp]
% table caption is above the table
\caption{Proportion of samples in each stimulus category classified in
disagreement between human coders (MN, RA) and the \remodnav\ algorithm
(AL). The MC (misclassification) column lists proportions considering
all four event categories (fixation, saccade, PSO, pursuit), while
the w/oP (without pursuit) column excludes pursuit events for a direct
comparison with \citet[][Tables 8-10]{Andersson2017}.
The remaining columns show the percentage of labels assigned to incongruent
time points by each rater (deviation of their sum from 100\% is due to
rounding).
}
\label{tab:mclf} % Give a unique label
% For LaTeX tables use
\begin{tabular}{llllllll}
\textbf{Images}&&&&&&&\\
\hline\noalign{\smallskip}
Comp & MC & w/oP & Coder & Fix & Sac & PSO & SP \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN-RA & \imgMNRAMCLF & \imgMNRAMclfWOP & MN & \imgMNRAFIXref & \imgMNRASACref & \imgMNRAPSOref & \imgMNRASPref \\
--- & --- & --- & RA & \imgMNRAFIXcod & \imgMNRASACcod & \imgMNRAPSOcod & \imgMNRASPcod \\
MN-AL & \imgMNALMCLF & \imgMNALMclfWOP & MN & \imgMNALFIXref & \imgMNALSACref & \imgMNALPSOref & \imgMNALSPref \\
--- & --- & --- & AL & \imgMNALFIXcod & \imgMNALSACcod & \imgMNALPSOcod & \imgMNALSPcod \\
RA-AL & \imgRAALMCLF & \imgRAALMclfWOP & RA & \imgRAALFIXref & \imgRAALSACref & \imgRAALPSOref & \imgRAALSPref \\
---& ---& ---& AL & \imgRAALFIXcod & \imgRAALSACcod & \imgRAALPSOcod & \imgRAALSPcod \\
\noalign{\smallskip}
\textbf{Dots}&&&&&&&\\
\hline\noalign{\smallskip}
Comp & MC & w/oP & Coder & Fix & Sac & PSO & SP \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN-RA & \dotsMNRAMCLF & \dotsMNRAMclfWOP & MN & \dotsMNRAFIXref & \dotsMNRASACref & \dotsMNRAPSOref & \dotsMNRASPref \\
--- & --- & --- & RA & \dotsMNRAFIXcod & \dotsMNRASACcod & \dotsMNRAPSOcod & \dotsMNRASPcod \\
MN-AL & \dotsMNALMCLF & \dotsMNALMclfWOP & MN & \dotsMNALFIXref & \dotsMNALSACref & \dotsMNALPSOref & \dotsMNALSPref \\
--- & --- & --- & AL & \dotsMNALFIXcod & \dotsMNALSACcod & \dotsMNALPSOcod & \dotsMNALSPcod\\
RA-AL & \dotsRAALMCLF & \dotsRAALMclfWOP & RA & \dotsRAALFIXref & \dotsRAALSACref & \dotsRAALPSOref & \dotsRAALSPref \\
---& ---& ---& AL & \dotsRAALFIXcod & \dotsRAALSACcod & \dotsRAALPSOcod & \dotsRAALSPcod \\
\noalign{\smallskip}
\textbf{Videos}&&&&&&&\\
\hline\noalign{\smallskip}
Comp & MC & w/oP & Coder & Fix & Sac & PSO & SP \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN-RA & \videoMNRAMCLF & \videoMNRAMclfWOP & MN & \videoMNRAFIXref & \videoMNRASACref & \videoMNRAPSOref & \videoMNRASPref \\
--- & --- & --- & RA & \videoMNRAFIXcod & \videoMNRASACcod & \videoMNRAPSOcod & \videoMNRASPcod \\
MN-AL & \videoMNALMCLF & \videoMNALMclfWOP & MN & \videoMNALFIXref & \videoMNALSACref & \videoMNALPSOref & \videoMNALSPref \\
--- & --- & --- & AL & \videoMNALFIXcod & \videoMNALSACcod & \videoMNALPSOcod & \videoMNALSPcod\\
RA-AL & \videoRAALMCLF & \videoRAALMclfWOP & RA & \videoRAALFIXref & \videoRAALSACref & \videoRAALPSOref & \videoRAALSPref \\
---& ---& ---& AL & \videoRAALFIXcod & \videoRAALSACcod & \videoRAALPSOcod & \videoRAALSPcod \\
\noalign{\smallskip}\hline
\end{tabular}
\end{table}
\fig{conf} shows confusion patterns for a comparison of algorithm
classifications with human labeling and displays the similarity between
classification decisions with Jaccard indices \citep[JI; ][]{jaccard1901etude}.
The JI is bounded in the range [0, 1], with higher values indicating higher similarity.
A value of 0.93 in the upper left cell of the very first matrix in \fig{conf},
for example, indicates that 93\% of the samples labeled as fixation by either
human coder (RA or MN) were assigned this label by both. This index quantifies the
similarity of classifications independently of the values in other cells.
While \remodnav\ does not achieve a
labeling similarity that reaches the human inter-rater agreement, it still
performs well. In particular, the relative magnitude of agreement with each
individual human coder for fixations, saccades, and PSOs, resembles the
agreement between the human coders. Classification of smooth
pursuits is consistent with human labels for the moving dots and videos
categories. However, there is substantial confusion of fixation and pursuit for
the static images. In a real-world application of \remodnav, pursuit classification
could be disabled (by setting a high pursuit velocity threshold) for data from
static images, if the occurrence of pursuit events can be ruled out a priori.
For this evaluation, however, no such intervention was made.
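The sample-based JI underlying \fig{conf} can be expressed compactly as
follows; this is a sketch with an assumed label encoding, not the analysis code
used here.
\begin{verbatim}
# Simplified sketch: sample-based Jaccard index
# for one event category and two label sequences.
import numpy as np

def jaccard(labels_a, labels_b, category):
    a = np.asarray(labels_a) == category
    b = np.asarray(labels_b) == category
    union = np.logical_or(a, b).sum()
    if union == 0:
        return np.nan
    return np.logical_and(a, b).sum() / union
\end{verbatim}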
\begin{figure*}
% Use the relevant command to insert your figure file.
% For example, with the graphicx package use
% TODO make final figure and switch
%\includegraphics[width=1\textwidth]{img/conf_drawing.eps}
\includegraphics[trim=0 0 0 0,clip,width=1\textwidth]{img/confusion_MN_RA.pdf} \\
\includegraphics[trim=0 0 0 6.6mm,clip,width=1\textwidth]{img/confusion_MN_AL.pdf} \\
\includegraphics[trim=0 0 0 6.6mm,clip,width=1\textwidth]{img/confusion_RA_AL.pdf}
% figure caption is below the figure
\caption{Confusion patterns for pairwise eye movement classification
comparison of both human raters \citep[MN and RA; ][]{Andersson2017} and the
\remodnav\ algorithm (AL) for gaze recordings from stimulation with static
images (left column), moving dots (middle column), and video clips (right
column). All matrices present gaze sample based Jaccard indices \citep[JI;
][]{jaccard1901etude}. Consequently, the diagonals depict the fraction of
time points labeled congruently by both raters in relation to the number of
timepoints assigned to a particular event category by any rater.}
% Give a unique label
\label{fig:conf}
\end{figure*}
\begin{table}[tbp]
% table caption is above the table
\caption{Cohen's Kappa reliability between human coders (MN, RA), and \remodnav\ (AL)
with each of the human coders.
}
\label{tab:kappa} % Give a unique label
% For LaTeX tables use
\begin{tabular*}{0.5\textwidth}{c @{\extracolsep{\fill}}llll}
\textbf {Fixations} & & & \\
\hline\noalign{\smallskip}
Comparison & Images & Dots & Videos \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN versus RA & \kappaRAMNimgFix & \kappaRAMNdotsFix & \kappaRAMNvideoFix \\
AL versus RA & \kappaALRAimgFix & \kappaALRAdotsFix & \kappaALRAvideoFix \\
AL versus MN & \kappaALMNimgFix & \kappaALMNdotsFix & \kappaALMNvideoFix \\
\noalign{\smallskip}
\textbf{Saccades} & & & \\
\hline\noalign{\smallskip}
Comparison & Images & Dots & Videos \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN versus RA & \kappaRAMNimgSac & \kappaRAMNdotsSac & \kappaRAMNvideoSac \\
AL versus RA & \kappaALRAimgSac & \kappaALRAdotsSac & \kappaALRAvideoSac \\
AL versus MN & \kappaALMNimgSac & \kappaALMNdotsSac & \kappaALMNvideoSac \\
\noalign{\smallskip}
\textbf{PSOs} & & & \\
\hline\noalign{\smallskip}
Comparison & Images & Dots & Videos \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN versus RA & \kappaRAMNimgPSO & \kappaRAMNdotsPSO & \kappaRAMNvideoPSO \\
AL versus RA & \kappaALRAimgPSO & \kappaALRAdotsPSO & \kappaALRAvideoPSO \\
AL versus MN & \kappaALMNimgPSO & \kappaALMNdotsPSO & \kappaALMNvideoPSO \\
\noalign{\smallskip}\hline
\end{tabular*}
\end{table}
In addition to the confusion analysis and again following \citet{Andersson2017},
we computed Cohen's Kappa \citep{cohen1960coefficient} as an additional measure
of similarity between human and algorithm performance. It quantifies the
sample-by-sample agreement between two ratings following equation \ref{eq:kappa}:
%
\begin{equation}\label{eq:kappa}
K = \frac{P_o - P_c}{1- P_c}
\end{equation}
%
where $P_o$ is the observed proportion of agreement between the ratings, and
$P_c$ is the proportion of chance agreement. A value of $K=1$ indicates perfect
agreement, and $K=0$ indicates chance level agreement.
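For reference, equation \ref{eq:kappa} can be computed from two label sequences
as sketched below; this is an illustrative implementation, not the code used
for the reported analyses.
\begin{verbatim}
# Simplified sketch of Cohen's Kappa for two
# sample-by-sample label sequences.
import numpy as np

def cohen_kappa(a, b):
    a, b = np.asarray(a), np.asarray(b)
    p_o = np.mean(a == b)
    # chance agreement from the raters' marginal
    # label frequencies
    p_c = sum(np.mean(a == c) * np.mean(b == c)
              for c in np.union1d(a, b))
    return (p_o - p_c) / (1 - p_c)
\end{verbatim}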
Table \ref{tab:kappa} displays the resulting values
between the two human experts, and \remodnav\ with each of the experts, for
each stimulus category and the three event types used in \citet{Andersson2017},
namely fixations, saccades, and PSOs (compare to \citet{Andersson2017}, table 7).
For all event types and stimulus categories, \remodnav\ performs on par with or better
than the original NH algorithm, and in many cases on par with or better than the best
of all algorithms evaluated in \citet{Andersson2017} within an event or stimulus type.
In order to further rank the performance of the proposed algorithm with respect
to the ten algorithms studied in \citet{Andersson2017}, we followed their
approach to compute root mean square deviations (RMSD) from human labels for
event duration distribution characteristics (mean and standard deviation of
durations, plus number of events) for each stimulus category (images, dots,
videos) and event type (fixations, saccades, PSOs, pursuits). This measure
represents a scalar distribution dissimilarity score that can be used as an
additional comparison metric of algorithm performance that focuses on overall
number and durations of classified events, instead of sample-by-sample
misclassification. The RMSD measure has a lower bound of $0.0$ (identical to
the average of both human raters), with higher values indicating larger
differences \citep[for detailed information on the calculation of this metric
see][]{Andersson2017}.
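The following sketch formalizes this metric as we understand it from
\citet{Andersson2017}: deviations of an algorithm's duration statistics from
the human average, scaled by the respective maximum of each characteristic,
aggregated as a root mean square, and converted into zero-based ranks. Names
and the exact normalization are illustrative assumptions.
\begin{verbatim}
# Simplified sketch of the RMSD-based ranking.
import numpy as np

def rmsd(alg_stats, human_stats, stat_max):
    # each argument: (mean duration, SD, # events)
    d = ((np.asarray(alg_stats)
          - np.asarray(human_stats))
         / np.asarray(stat_max))
    return np.sqrt(np.mean(d ** 2))

def to_ranks(scores):
    # zero-based ranks: 0 = most human-like
    return np.argsort(np.argsort(scores))
\end{verbatim}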
\tab{rmsd} is modelled after \citet[Tables
3-6]{Andersson2017}, appended with \remodnav, showing RMSD based on the scores of human raters given in the original tables. As
acknowledged by the authors, the absolute value of the RMSD scores is not
informative due to scaling with respect to the respective maximum value of each
characteristic. Therefore, we converted RMSDs for each algorithm and event
type into zero-based ranks (lower is more human-like).
The LNS algorithm \citep{Larsson2013} was found to have the most human-like
performance for saccade and PSO classification in \cite{Andersson2017}. \remodnav\
performs comparably to LNS for both event types (saccades: $2.0$ vs. $3.3$;
PSOs: $2.3$ vs. $2.0$, mean rank across stimulus categories for LNS and \remodnav,
respectively).
Depending on the stimulus type, different algorithms performed best for
fixation classification. NH performed best for images and videos, but worst for
moving dots. \remodnav\ outperforms all other algorithms in the dots category,
and achieves rank 5 and 6 (middle range) for videos and images, respectively.
Across all stimulus and event categories, \remodnav\ achieves a mean ranking
of $2.9$, and a mean ranking of $3.2$ when not taking smooth pursuit into account.
\begin{table*}[p]
% table caption is above the table
\caption{Comparison of event duration statistics (mean, standard deviation, and number
of events) for image, dot, and video
stimuli. This table is modeled after \citet[Tables 3-6]{Andersson2017}, and
root-mean-square-deviations (RMSD) from human raters are shown
for fixations, saccades, PSOs, and pursuit as zero-based ranks (rank zero
is closest to the average of the two human raters). Summary statistics for
all algorithms used in \citet{Andersson2017} were taken from their publicly
available GitHub repository
(github.com/richardandersson/EyeMovementDetectorEvaluation). Cohen's Kappa
was computed for the complete set of algorithms in \citet{Andersson2017} and
\remodnav .}
\label{tab:rmsd} % Give a unique label
% For LaTeX tables use
\begin{small}
\begin{tabular*}{\textwidth}{c @{\extracolsep{\fill}}lllllllllllll}
\multicolumn{13}{l}{\textit{Fixations}}\\
\toprule
& \multicolumn{4}{l}{Images} & \multicolumn{4}{l}{Dots} & \multicolumn{4}{l}{Videos}\\
Algorithm & Mean & SD & \# & rank & Mean & SD & \# & rank & Mean & SD & \# & rank \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN & \FIXimgmnMN & \FIXimgsdMN & \FIXimgnoMN & \rankFIXimgMN & \FIXdotsmnMN & \FIXdotssdMN & \FIXdotsnoMN & \rankFIXdotsMN & \FIXvideomnMN & \FIXvideosdMN & \FIXvideonoMN & \rankFIXvideoMN \\
RA & \FIXimgmnRA & \FIXimgsdRA & \FIXimgnoRA & \rankFIXimgRA & \FIXdotsmnRA & \FIXdotssdRA & \FIXdotsnoRA & \rankFIXdotsRA & \FIXvideomnRA & \FIXvideosdRA & \FIXvideonoRA & \rankFIXvideoRA \\
CDT & \FIXimgmnCDT & \FIXimgsdCDT & \FIXimgnoCDT & \rankFIXimgCDT & \FIXdotsmnCDT & \FIXdotssdCDT & \FIXdotsnoCDT & \rankFIXdotsCDT & \FIXvideomnCDT & \FIXvideosdCDT & \FIXvideonoCDT & \rankFIXvideoCDT \\
EM & - & - & - & - & - & - & - & - & - & - & - & - \\
IDT & \FIXimgmnIDT & \FIXimgsdIDT & \FIXimgnoIDT & \rankFIXimgIDT & \FIXdotsmnIDT & \FIXdotssdIDT & \FIXdotsnoIDT & \rankFIXdotsIDT & \FIXvideomnIDT & \FIXvideosdIDT & \FIXvideonoIDT & \rankFIXvideoIDT \\
IKF & \FIXimgmnIKF & \FIXimgsdIKF & \FIXimgnoIKF & \rankFIXimgIKF & \FIXdotsmnIKF & \FIXdotssdIKF & \FIXdotsnoIKF & \rankFIXdotsIKF & \FIXvideomnIKF & \FIXvideosdIKF & \FIXvideonoIKF & \rankFIXvideoIKF \\
IMST & \FIXimgmnIMST & \FIXimgsdIMST & \FIXimgnoIMST & \rankFIXimgIMST & \FIXdotsmnIMST & \FIXdotssdIMST & \FIXdotsnoIMST & \rankFIXdotsIMST & \FIXvideomnIMST & \FIXvideosdIMST & \FIXvideonoIMST & \rankFIXvideoIMST \\
IHMM & \FIXimgmnIHMM & \FIXimgsdIHMM & \FIXimgnoIHMM & \rankFIXimgIHMM & \FIXdotsmnIHMM & \FIXdotssdIHMM & \FIXdotsnoIHMM & \rankFIXdotsIHMM & \FIXvideomnIHMM & \FIXvideosdIHMM & \FIXvideonoIHMM & \rankFIXvideoIHMM \\
IVT & \FIXimgmnIVT & \FIXimgsdIVT & \FIXimgnoIVT & \rankFIXimgIVT & \FIXdotsmnIVT & \FIXdotssdIVT & \FIXdotsnoIVT & \rankFIXdotsIVT & \FIXvideomnIVT & \FIXvideosdIVT & \FIXvideonoIVT & \rankFIXvideoIVT \\
NH & \FIXimgmnNH & \FIXimgsdNH & \FIXimgnoNH & \rankFIXimgNH & \FIXdotsmnNH & \FIXdotssdNH & \FIXdotsnoNH & \rankFIXdotsNH & \FIXvideomnNH & \FIXvideosdNH & \FIXvideonoNH & \rankFIXvideoNH \\
BIT & \FIXimgmnBIT & \FIXimgsdBIT & \FIXimgnoBIT & \rankFIXimgBIT & \FIXdotsmnBIT & \FIXdotssdBIT & \FIXdotsnoBIT & \rankFIXdotsBIT & \FIXvideomnBIT & \FIXvideosdBIT & \FIXvideonoBIT & \rankFIXvideoBIT \\
LNS & - & - & - & - & - & - & - & - & - & - & - & - \\
\remodnav\ & \FIXimgmnRE & \FIXimgsdRE & \FIXimgnoRE & \rankFIXimgRE & \FIXdotsmnRE & \FIXdotssdRE & \FIXdotsnoRE & \rankFIXdotsRE & \FIXvideomnRE & \FIXvideosdRE & \FIXvideonoRE & \rankFIXvideoRE \\
\noalign{\smallskip}\bottomrule
\\
\multicolumn{13}{l}{\textit{Saccades}}\\
\toprule\noalign{\smallskip}
& \multicolumn{4}{l}{Images} & \multicolumn{4}{l}{Dots} & \multicolumn{4}{l}{Videos}\\
Algorithm & Mean & SD & \# & rank & Mean & SD & \# & rank & Mean & SD & \# & rank \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN & \SACimgmnMN & \SACimgsdMN & \SACimgnoMN & \rankSACimgMN & \SACdotsmnMN & \SACdotssdMN & \SACdotsnoMN & \rankSACdotsMN & \SACvideomnMN & \SACvideosdMN & \SACvideonoMN & \rankSACvideoMN \\
RA & \SACimgmnRA & \SACimgsdRA & \SACimgnoRA & \rankSACimgRA & \SACdotsmnRA & \SACdotssdRA & \SACdotsnoRA & \rankSACdotsRA & \SACvideomnRA & \SACvideosdRA & \SACvideonoRA & \rankSACvideoRA \\
CDT & - & - & - & - & - & - & - & - & - & - & - & - \\
EM & \SACimgmnEM & \SACimgsdEM & \SACimgnoEM & \rankSACimgEM & \SACdotsmnEM & \SACdotssdEM & \SACdotsnoEM & \rankSACdotsEM & \SACvideomnEM & \SACvideosdEM & \SACvideonoEM & \rankSACvideoEM \\
IDT & \SACimgmnIDT & \SACimgsdIDT & \SACimgnoIDT & \rankSACimgIDT & \SACdotsmnIDT & \SACdotssdIDT & \SACdotsnoIDT & \rankSACdotsIDT & \SACvideomnIDT & \SACvideosdIDT & \SACvideonoIDT & \rankSACvideoIDT \\
IKF & \SACimgmnIKF & \SACimgsdIKF & \SACimgnoIKF & \rankSACimgIKF & \SACdotsmnIKF & \SACdotssdIKF & \SACdotsnoIKF & \rankSACdotsIKF & \SACvideomnIKF & \SACvideosdIKF & \SACvideonoIKF & \rankSACvideoIKF \\
IMST & \SACimgmnIMST & \SACimgsdIMST & \SACimgnoIMST & \rankSACimgIMST & \SACdotsmnIMST & \SACdotssdIMST & \SACdotsnoIMST & \rankSACdotsIMST & \SACvideomnIMST & \SACvideosdIMST & \SACvideonoIMST & \rankSACvideoIMST \\
IHMM & \SACimgmnIHMM & \SACimgsdIHMM & \SACimgnoIHMM & \rankSACimgIHMM & \SACdotsmnIHMM & \SACdotssdIHMM & \SACdotsnoIHMM & \rankSACdotsIHMM & \SACvideomnIHMM & \SACvideosdIHMM & \SACvideonoIHMM & \rankSACvideoIHMM \\
IVT & \SACimgmnIVT & \SACimgsdIVT & \SACimgnoIVT & \rankSACimgIVT & \SACdotsmnIVT & \SACdotssdIVT & \SACdotsnoIVT & \rankSACdotsIVT & \SACvideomnIVT & \SACvideosdIVT & \SACvideonoIVT & \rankSACvideoIVT \\
NH & \SACimgmnNH & \SACimgsdNH & \SACimgnoNH & \rankSACimgNH & \SACdotsmnNH & \SACdotssdNH & \SACdotsnoNH & \rankSACdotsNH & \SACvideomnNH & \SACvideosdNH & \SACvideonoNH & \rankSACvideoNH \\
BIT & - & - & - & - & - & - & - & - & - & - & - & - \\
LNS & \SACimgmnLNS & \SACimgsdLNS & \SACimgnoLNS & \rankSACimgLNS & \SACdotsmnLNS & \SACdotssdLNS & \SACdotsnoLNS & \rankSACdotsLNS & \SACvideomnLNS & \SACvideosdLNS & \SACvideonoLNS & \rankSACvideoLNS \\
\remodnav\ & \SACimgmnRE & \SACimgsdRE & \SACimgnoRE & \rankSACimgRE & \SACdotsmnRE & \SACdotssdRE & \SACdotsnoRE & \rankSACdotsRE & \SACvideomnRE & \SACvideosdRE & \SACvideonoRE & \rankSACvideoRE \\
\noalign{\smallskip}\bottomrule
\\
\multicolumn{13}{l}{\textit{Post-saccadic oscillations}}\\
\toprule\noalign{\smallskip}
& \multicolumn{4}{l}{Images} & \multicolumn{4}{l}{Dots} & \multicolumn{4}{l}{Videos}\\
Algorithm & Mean & SD & \# & rank & Mean & SD & \# & rank & Mean & SD & \# & rank \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN & \PSOimgmnMN & \PSOimgsdMN & \PSOimgnoMN & \rankPSOimgMN & \PSOdotsmnMN & \PSOdotssdMN & \PSOdotsnoMN & \rankPSOdotsMN & \PSOvideomnMN & \PSOvideosdMN & \PSOvideonoMN & \rankPSOvideoMN \\
RA & \PSOimgmnRA & \PSOimgsdRA & \PSOimgnoRA & \rankPSOimgRA & \PSOdotsmnRA & \PSOdotssdRA & \PSOdotsnoRA & \rankPSOdotsRA & \PSOvideomnRA & \PSOvideosdRA & \PSOvideonoRA & \rankPSOvideoRA \\
NH & \PSOimgmnNH & \PSOimgsdNH & \PSOimgnoNH & \rankPSOimgNH & \PSOdotsmnNH & \PSOdotssdNH & \PSOdotsnoNH & \rankPSOdotsNH & \PSOvideomnNH & \PSOvideosdNH & \PSOvideonoNH & \rankPSOvideoNH \\
LNS & \PSOimgmnLNS & \PSOimgsdLNS & \PSOimgnoLNS & \rankPSOimgLNS & \PSOdotsmnLNS & \PSOdotssdLNS & \PSOdotsnoLNS & \rankPSOdotsLNS & \PSOvideomnLNS & \PSOvideosdLNS & \PSOvideonoLNS & \rankPSOvideoLNS \\
\remodnav\ & \PSOimgmnRE & \PSOimgsdRE & \PSOimgnoRE & \rankPSOimgRE & \PSOdotsmnRE & \PSOdotssdRE & \PSOdotsnoRE & \rankPSOdotsRE & \PSOvideomnRE & \PSOvideosdRE & \PSOvideonoRE & \rankPSOvideoRE \\
\noalign{\smallskip}\hline
\\
\multicolumn{13}{l}{\textit{Pursuit}}\\
\toprule\noalign{\smallskip}
& \multicolumn{4}{l}{Images} & \multicolumn{4}{l}{Dots} & \multicolumn{4}{l}{Videos}\\
Algorithm & Mean & SD & \# & rank & Mean & SD & \# & rank & Mean & SD & \# & rank \\
\noalign{\smallskip}\hline\noalign{\smallskip}
MN & \PURimgmnMN & \PURimgsdMN & \PURimgnoMN & \rankPURimgMN & \PURdotsmnMN & \PURdotssdMN & \PURdotsnoMN & \rankPURdotsMN & \PURvideomnMN & \PURvideosdMN & \PURvideonoMN & \rankPURvideoMN \\
RA & \PURimgmnRA & \PURimgsdRA & \PURimgnoRA & \rankPURimgRA & \PURdotsmnRA & \PURdotssdRA & \PURdotsnoRA & \rankPURdotsRA & \PURvideomnRA & \PURvideosdRA & \PURvideonoRA & \rankPURvideoRA \\
\remodnav\ & \PURimgmnRE & \PURimgsdRE & \PURimgnoRE & \rankPURimgRE & \PURdotsmnRE & \PURdotssdRE & \PURdotsnoRE & \rankPURdotsRE & \PURvideomnRE & \PURvideosdRE & \PURvideonoRE & \rankPURvideoRE \\
\noalign{\smallskip}\bottomrule
\end{tabular*}
\end{small}