@inproceedings{valko2012semi-supervised,
  abstract = {In apprenticeship learning we aim to learn a good policy by observing the behavior of an expert or a set of experts. In particular, we consider the case where the expert acts so as to maximize an unknown reward function defined as a linear combination of a set of state features. In this paper, we consider the setting where we observe many sample trajectories (i.e., sequences of states) but only one or a few of them are labeled as experts' trajectories. We investigate the conditions under which the remaining unlabeled trajectories can help in learning a policy with a good performance. In particular, we define an extension to the max-margin inverse reinforcement learning proposed by Abbeel and Ng (2004) where, at each iteration, the max-margin optimization step is replaced by a semi-supervised optimization problem which favors classifiers separating clusters of trajectories. Finally, we report empirical results on two grid-world domains showing that the semi-supervised algorithm is able to output a better policy in fewer iterations than the related algorithm that does not take the unlabeled trajectories into account.},
  author = {Valko, Michal and Ghavamzadeh, Mohammad and Lazaric, Alessandro},
  booktitle = {Proceedings of the Tenth European Workshop on Reinforcement Learning (EWRL)},
  series = {JMLR Workshop and Conference Proceedings},
  volume = {24},
  pages = {131--142},
  month = jun,
  year = {2012},
  title = {{Semi-Supervised Apprenticeship Learning}},
  url = {http://researchers.lille.inria.fr/~valko/hp/serve.php?what=publications/valko2012semi-supervised.pdf}
}