@mastersthesis {xNietoa, title = {Discovery and Learning of Navigation Goals from Pixels in Minecraft}, year = {2021}, abstract = {
Pre-training Reinforcement Learning (RL) agents in a task-agnostic manner has shown promising results. However, previous works still struggle to learn and discover meaningful skills in high-dimensional state spaces. We approach the problem by leveraging unsupervised skill discovery and self-supervised learning of state representations. In our work, we learn a compact latent representation using variational or contrastive techniques. We demonstrate that both allow an agent to learn a set of basic navigation skills by maximizing an information-theoretic objective. We assess our method on Minecraft 3D maps of varying complexity. Our results show that representations and conditioned policies learned from pixels suffice for toy examples, but do not scale to realistic and complex maps. We also explore alternative rewards and input observations to overcome these limitations.
Defining a reward function in Reinforcement Learning (RL) is not always possible, or can be very costly. For this reason, there is great interest in training agents in a task-agnostic manner, making use of intrinsic motivation and unsupervised techniques. Because learning useful behaviours in pixel-based domains is difficult, the results obtained in RL still lag far behind the remarkable ones achieved in domains such as computer vision and natural language processing. We hypothesize that RL agents will also benefit from unsupervised pre-training with no extrinsic rewards, analogously to how humans mostly learn, especially in the early stages of life. Our main contribution is the extension of the Explore, Discover and Learn (EDL) paradigm for unsupervised skill learning to the pixel space. In particular, our work focuses on the MineRL environment, where the agent's observation consists of: (a) its spatial coordinates in the Minecraft virtual world, and (b) an image from an egocentric viewpoint.
}, author = {Nieto, Juan Jos{\'e} and Creus, Roger and Gir{\'o}-i-Nieto, Xavier} }
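@comment{ The information-theoretic objective referenced in the abstract above (and in the EDL-based entries that follow) is the mutual information between visited states $S$ and latent skills $Z$. As a minimal sketch of the EDL formulation (our gloss, not text from the cited works):
\[ I(S;Z) = \mathcal{H}(Z) - \mathcal{H}(Z \mid S) = \mathcal{H}(S) - \mathcal{H}(S \mid Z), \]
where EDL optimizes the second decomposition: a skill-conditioned policy $\pi(a \mid s, z)$ is trained with the intrinsic reward $r(s, z) = \log q(s \mid z)$, for a decoder $q$ learned during the discovery stage. }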
@conference {cCreus, title = {PixelEDL: Unsupervised Skill Discovery and Learning from Pixels}, booktitle = {CVPR 2021 Embodied AI Workshop}, year = {2021}, month = {06/2021}, abstract = {We tackle embodied visual navigation in a task-agnostic setup by focusing on the unsupervised discovery of skills (or options) that provide good coverage of the state space. Our approach intersects with empowerment: we address the reward-free skill discovery and learning tasks to discover {\textquotedblleft}what{\textquotedblright} can be done in an environment and {\textquotedblleft}how{\textquotedblright}. To this end, we adopt the existing Explore, Discover and Learn (EDL) paradigm, previously tested only in toy example mazes, and extend it to the pixel-based state representations available to embodied AI agents.
}, author = {Creus, Roger and Nieto, Juan Jos{\'e} and Gir{\'o}-i-Nieto, Xavier} }
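@comment{ PixelEDL replaces EDL's low-dimensional states with learned embeddings of raw frames. Below is a minimal PyTorch sketch of such a pixel pipeline; all layer sizes, names and the nearest-code assignment are our illustrative assumptions, not the authors' released code.

import torch
import torch.nn as nn

class PixelEncoder(nn.Module):
    # Conv encoder mapping 64x64 RGB frames to a compact latent vector.
    def __init__(self, latent_dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(3, 32, 4, stride=2), nn.ReLU(),    # 64x64 -> 31x31
            nn.Conv2d(32, 64, 4, stride=2), nn.ReLU(),   # 31x31 -> 14x14
            nn.Conv2d(64, 128, 4, stride=2), nn.ReLU(),  # 14x14 -> 6x6
            nn.Flatten(),
            nn.Linear(128 * 6 * 6, latent_dim),
        )

    def forward(self, obs):  # obs: (B, 3, 64, 64), float in [0, 1]
        return self.net(obs)

def assign_skills(latents, codebook):
    # Discovery step in the VQ spirit: map each encoded state to the nearest
    # of K skill codes, so every code indexes a region of the state space.
    # latents: (B, D); codebook: (K, D) -> returns (B,) code indices.
    dists = torch.cdist(latents, codebook)
    return dists.argmin(dim=1)
}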
@mastersthesis {xCreus, title = {Unsupervised skill learning from pixels}, year = {2021}, abstract = {This work focuses on the autonomous acquisition of the fundamental task-agnostic knowledge available within an environment. The aim is to discover and learn baseline representations and behaviours that can later be useful for solving downstream embodied visual navigation tasks. Specifically, the presented approach extends the idea of the {\textquotedblleft}Explore, Discover and Learn{\textquotedblright} (EDL) paradigm to the pixel domain. This way, the work centers on the representations and behaviours that can be learnt by an agent equipped only with an image capture sensor. Both the agents and the environment used in this work run on the Habitat AI simulator, developed by Facebook AI, which renders photorealistic 3D views of apartment interiors.
Pre-training Reinforcement Learning agents in a task-agnostic manner has shown promising results. However, previous works still struggle to learn and discover meaningful skills in high-dimensional state spaces, such as pixel spaces. We approach the problem by leveraging unsupervised skill discovery and self-supervised learning of state representations. In our work, we learn a compact latent representation using variational and contrastive techniques. We demonstrate that both enable RL agents to learn a set of basic navigation skills by maximizing an information-theoretic objective. We assess our method on Minecraft 3D pixel maps of varying complexity. Our results show that representations and conditioned policies learned from pixels suffice for toy examples, but do not scale to realistic and complex maps. To overcome these limitations, we explore alternative input observations, such as the relative position of the agent along with the raw pixels.
}, author = {Nieto, Juan Jos{\'e} and Creus, Roger and Gir{\'o}-i-Nieto, Xavier} }
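@comment{ The thesis above learns its compact latents with variational and contrastive techniques. As an illustration of the contrastive side, here is a minimal InfoNCE-style loss in PyTorch; the function name and batch conventions are our assumptions, not the author's implementation.

import torch
import torch.nn.functional as F

def info_nce_loss(anchors, positives, temperature=0.1):
    # Each anchor embedding should match its own positive (e.g. an augmented
    # view or a temporally nearby frame) against the rest of the batch.
    # anchors, positives: (B, D) embedding tensors.
    a = F.normalize(anchors, dim=1)
    p = F.normalize(positives, dim=1)
    logits = a @ p.t() / temperature                    # (B, B) similarities
    targets = torch.arange(a.size(0), device=a.device)  # positives on diagonal
    return F.cross_entropy(logits, targets)
}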
@conference {cLinardosa, title = {Simple vs complex temporal recurrences for video saliency prediction}, booktitle = {British Machine Vision Conference (BMVC)}, year = {2019}, month = {09/2019}, publisher = {British Machine Vision Association}, organization = {British Machine Vision Association}, address = {Cardiff, Wales / UK}, abstract = {This paper investigates modifying an existing neural network architecture for static saliency prediction using two types of recurrences that integrate information from the temporal domain. The first modification is the addition of a ConvLSTM within the architecture, while the second is a computationally simple exponential moving average of an internal convolutional state. We use weights pre-trained on the SALICON dataset and fine-tune our model on DHF1K. Our results show that both modifications achieve state-of-the-art performance and produce similar saliency maps.
Saliency prediction is a topic undergoing intense study in computer vision, with a broad range of applications. It consists of predicting where a human's attention will be drawn in an image or a video. Our work is based on a deep neural network named SalGAN, which was trained on a saliency-annotated dataset of static images. In this thesis we investigate different approaches for extending SalGAN to the video domain. To this end, we use the recently proposed saliency-annotated video dataset DHF1K to train and evaluate our models. The obtained results indicate that techniques such as depth estimation or CoordConv can effectively be used as additional modalities to enhance the saliency prediction of static images obtained with SalGAN, achieving encouraging results on the DHF1K benchmark. Our work is based on PyTorch and is publicly available online.