@COMMENT This file was generated by bib2html.pl version 0.90
@COMMENT written by Patrick Riley
@COMMENT This file came from Peter Stone's publication pages at
@COMMENT http://www.cs.utexas.edu/~pstone/papers

@InProceedings{NeurIPS2020-Pavse,
  author = {Brahma S. Pavse and Josiah P. Hanna and Ishan Durugkar and Peter Stone},
  title = {On Sampling Error in Batch Action-Value Prediction Algorithms},
  booktitle = {Offline Reinforcement Learning Workshop at Neural Information Processing Systems (NeurIPS)},
  location = {Remote (Virtual Conference)},
  month = {December},
  year = {2020},
  wwwnote = {5-minute Video Presentation},
  abstract = {
    Estimating a policy's action-values is a fundamental aspect of reinforcement
    learning. In this work, we study the application of TD methods for learning
    action-values in an offline setting with a fixed batch of data. Motivated by
    recent work \citep{pavse2020psectd}, we observe that a fixed batch of offline
    data may contain two forms of distribution shift: the data may be collected
    by a behavior policy different from the target policy (off-policy data), and
    the empirical distribution of the data may differ from the sampling
    distribution of the data (sampling error). In this work, we focus on the
    second problem by analyzing the sampling error that arises from the variance
    of sampling a finite-sized batch of data in the \emph{on-policy offline} RL
    setting. We study how action-value learning algorithms suffer from this
    \emph{sampling error} by considering their so-called
    \emph{certainty-equivalence estimates} \citep{sutton1988learning,pavse2020psectd}.
    We prove that each algorithm uses its certainty-equivalence estimates of the
    policy and transition dynamics to converge to its respective fixed point. We
    then empirically evaluate each algorithm's performance by measuring the
    mean-squared value error on Gridworld. Ultimately, we find that by reducing
    sampling error, an algorithm can produce significantly more accurate
    action-value estimates.
  },
}