@conference {cHerrera-Palacioa, title = {Recurrent Instance Segmentation using Sequences of Referring Expressions}, booktitle = {NeurIPS workshop on Visually Grounded Interaction and Language (ViGIL)}, year = {2019}, month = {09/2019}, address = {Vancouver, Canada}, abstract = {

The goal of this work is to segment the objects in an image that are referred to by a sequence of linguistic descriptions (referring expressions). We propose a deep neural network with recurrent layers that outputs a sequence of binary masks, one for each referring expression provided by the user. The recurrent layers in the architecture allow the model to condition each predicted mask on the previous ones, from a spatial perspective within the same image. Our multimodal approach uses off-the-shelf architectures to encode both the image and the referring expressions. The visual branch provides a tensor of pixel embeddings that are concatenated with the phrase embeddings produced by a language encoder. Our experiments on the RefCOCO dataset for still images indicate that the proposed architecture successfully exploits the sequences of referring expressions to solve the pixel-wise task of instance segmentation.

Recurrent Instance Segmentation with Linguistic Referring Expressions from Universitat Polit{\`e}cnica de Catalunya
}, author = {Herrera-Palacio, Alba and Ventura, C. and Silberer, Carina and Sorodoc, Ionut-Teodor and Boleda, Gemma and Gir{\'o}-i-Nieto, Xavier} } @mastersthesis {xHerrera-Palacio, title = {Recurrent Instance Segmentation with Linguistic Referring Expressions}, year = {2019}, abstract = {

The goal of this work is to segment the object in an image or video that is referred to by a linguistic description (referring expression). We propose a deep neural network with recurrent layers that outputs a sequence of binary masks, one for each referring expression provided by the user. The recurrent layers in the architecture allow the model to condition each predicted mask on the previous ones, from a spatial perspective within the same image. Our multimodal approach uses off-the-shelf architectures to encode both the image and the referring expressions. The visual branch provides a tensor of pixel embeddings that are concatenated with the phrase embeddings produced by a language encoder. We focus our study on comparing different configurations for encoding and combining the visual and linguistic representations. Our experiments on the RefCOCO dataset for still images indicate that the proposed architecture successfully exploits the referring expressions to solve the pixel-wise task of instance segmentation.

}, school = {Universitat Polit{\`e}cnica de Catalunya}, author = {Herrera-Palacio, Alba}, editor = {Gir{\'o}-i-Nieto, Xavier and Ventura, C. and Silberer, Carina} } @conference {cHerrera-Palacio, title = {Video Object Linguistic Grounding}, booktitle = {ACM Multimedia Workshop on Multimodal Understanding and Learning for Embodied Applications (MULEA)}, year = {2019}, month = {10/2019}, publisher = {ACM}, organization = {ACM}, address = {Nice, France}, abstract = {

The goal of this work is to segment, in a video sequence, the objects that are mentioned in a linguistic description of the scene. We have adapted an existing deep neural network that achieves state-of-the-art performance in semi-supervised video object segmentation, adding a linguistic branch that generates an attention map over the video frames and makes the segmentation of the objects temporally consistent along the sequence.

}, author = {Herrera-Palacio, Alba and Ventura, C. and Gir{\'o}-i-Nieto, Xavier} }