@inproceedings{kaufmann2020adaptive,
  abstract = {Reward-free exploration is a reinforcement learning setting recently studied by Jin et al., who address it by running several algorithms with regret guarantees in parallel. In our work, we instead propose a more adaptive approach for reward-free exploration which directly reduces upper bounds on the maximum MDP estimation error. We show that, interestingly, our reward-free UCRL algorithm can be seen as a variant of an algorithm of Fiechter from 1994, originally proposed for a different objective that we call best-policy identification. We prove that RF-UCRL needs $O((SAH^4/\epsilon^2)\ln(1/\delta))$ episodes to output, with probability $1-\delta$, an $\epsilon$-approximation of the optimal policy for any reward function. We empirically compare it to oracle strategies using a generative model.},
  author = {Kaufmann, Emilie and M{\'{e}}nard, Pierre and Domingues, Omar Darwiche and Jonsson, Anders and Leurent, Edouard and Valko, Michal},
  booktitle = {Algorithmic Learning Theory},
  title = {{Adaptive reward-free exploration}},
  url = {https://arxiv.org/pdf/2006.06294.pdf},
  year = {2021}
}