@article {cBellver-Bueno20, title = {RefVOS: A Closer Look at Referring Expressions for Video Object Segmentation}, journal = {Multimedia Tools and Applications}, year = {2022}, month = {07/2022}, abstract = {

The task of video object segmentation with referring expressions (language-guided VOS) is, given a video and a linguistic phrase, to generate binary masks for the object to which the phrase refers. Our work argues that the existing benchmarks for this task are mainly composed of trivial cases, in which referents can be identified with simple phrases. Our analysis relies on a new categorization of the phrases in the DAVIS-2017 and Actor-Action datasets into trivial and non-trivial REs, where the non-trivial REs are annotated with seven RE semantic categories. We leverage this data to analyze the results of RefVOS, a novel neural network that obtains competitive results for the task of language-guided image segmentation and state-of-the-art results for language-guided VOS. Our study indicates that the major challenges for the task are related to understanding motion and static actions.

}, doi = {https://doi.org/10.1007/s11042-022-13413-x}, author = {Bellver-Bueno, M{\'\i}riam and Ventura, C. and Silberer, Carina and Kazakos, Ioannis and Torres, Jordi and Gir{\'o}-i-Nieto, Xavier} } @conference {cKazakos, title = {SynthRef: Generation of Synthetic Referring Expressions for Object Segmentation}, booktitle = {NAACL Visually Grounded Interaction and Language (ViGIL) Workshop}, year = {2021}, month = {06/2021}, address = {Virtual}, abstract = {

Recent advances in deep learning have brought significant progress in visual grounding tasks such as language-guided video object segmentation. However, collecting large datasets for these tasks is expensive in terms of annotation time, which represents a bottleneck. To address this, we propose SynthRef, a novel method for generating synthetic referring expressions for an image (or video frame), and we present and disseminate the first large-scale dataset with synthetic referring expressions for video object segmentation. Our experiments demonstrate that training with our synthetic dataset improves the ability of a model to generalize across different datasets, without any additional annotation cost. Moreover, our formulation can be applied to any object detection or segmentation dataset.
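As an illustration of the idea only (not the SynthRef implementation itself), the sketch below composes a simple synthetic referring expression from COCO-style object annotations; the annotation fields, function name, and disambiguation rule are assumptions made for the example.

# Illustrative sketch only -- not the SynthRef code. Assumes COCO-style
# annotations where each object carries a category name and a bounding box
# (x, y, w, h); all names below are hypothetical.
def synthesize_referring_expression(target, all_objects, image_width):
    category = target["category"]
    same_class = [o for o in all_objects
                  if o["category"] == category and o is not target]
    if not same_class:
        # The category alone is unambiguous, e.g. "the dog".
        return f"the {category}"
    # Otherwise disambiguate with a coarse horizontal position.
    x, _, w, _ = target["bbox"]
    center = x + w / 2.0
    if center < image_width / 3.0:
        side = "on the left"
    elif center > 2.0 * image_width / 3.0:
        side = "on the right"
    else:
        side = "in the middle"
    return f"the {category} {side}"

# Example usage with toy annotations:
objects = [
    {"category": "person", "bbox": (10, 40, 80, 200)},
    {"category": "person", "bbox": (500, 60, 90, 210)},
    {"category": "dog", "bbox": (300, 220, 120, 90)},
]
print(synthesize_referring_expression(objects[0], objects, image_width=640))
# -> "the person on the left"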

}, author = {Kazakos, Ioannis and Bellver-Bueno, M{\'\i}riam and Ventura, C. and Silberer, Carina and Gir{\'o}-i-Nieto, Xavier} } @conference {cHerrera-Palacioa, title = {Recurrent Instance Segmentation using Sequences of Referring Expressions}, booktitle = {NeurIPS workshop on Visually Grounded Interaction and Language (ViGIL)}, year = {2019}, month = {09/2019}, address = {Vancouver, Canada}, abstract = {

The goal of this work is to segment the objects in an image that are referred to by a sequence of linguistic descriptions (referring expressions). We propose a deep neural network with recurrent layers that output a sequence of binary masks, one for each referring expression provided by the user. The recurrent layers in the architecture allow the model to condition each predicted mask on the previous ones, from a spatial perspective within the same image. Our multimodal approach uses off-the-shelf architectures to encode both the image and the referring expressions. The visual branch provides a tensor of pixel embeddings that are concatenated with the phrase embeddings produced by a language encoder. Our experiments on the RefCOCO dataset for still images indicate that the proposed architecture successfully exploits the sequences of referring expressions to solve the pixel-wise task of instance segmentation.
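For a concrete picture of this fusion and recurrence, the following is a minimal PyTorch-style sketch under assumed shapes and layer choices; it is not the architecture released with the paper.

# Minimal PyTorch-style sketch of the fusion and recurrence described above.
# Not the authors' implementation: dimensions and layer choices are assumptions.
import torch
import torch.nn as nn

class RecurrentReferringSegmenter(nn.Module):
    def __init__(self, visual_dim=256, phrase_dim=300, hidden_dim=128):
        super().__init__()
        self.hidden_dim = hidden_dim
        # Fuse per-pixel visual features with the tiled phrase embedding.
        self.fuse = nn.Conv2d(visual_dim + phrase_dim, hidden_dim, kernel_size=1)
        # Convolutional recurrence conditions each mask on the previous state.
        self.step = nn.Conv2d(2 * hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.head = nn.Conv2d(hidden_dim, 1, kernel_size=1)

    def forward(self, pixel_feats, phrase_embs):
        # pixel_feats: (B, visual_dim, H, W) from an off-the-shelf visual encoder.
        # phrase_embs: list of (B, phrase_dim) vectors, one per referring expression.
        B, _, H, W = pixel_feats.shape
        state = pixel_feats.new_zeros(B, self.hidden_dim, H, W)
        masks = []
        for emb in phrase_embs:
            tiled = emb[:, :, None, None].expand(-1, -1, H, W)
            fused = torch.relu(self.fuse(torch.cat([pixel_feats, tiled], dim=1)))
            state = torch.tanh(self.step(torch.cat([fused, state], dim=1)))
            masks.append(torch.sigmoid(self.head(state)))  # one (B, 1, H, W) mask per phrase
        return masks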

}, author = {Herrera-Palacio, Alba and Ventura, C. and Silberer, Carina and Sorodoc, Ionut-Teodor and Boleda, Gemma and Gir{\'o}-i-Nieto, Xavier} } @mastersthesis {xHerrera-Palacio, title = {Recurrent Instance Segmentation with Linguistic Referring Expressions}, year = {2019}, abstract = {

The goal of this work is to segment the object in an image or video that is referred to by a linguistic description (referring expression). We propose a deep neural network with recurrent layers that output a sequence of binary masks, one for each referring expression provided by the user. The recurrent layers in the architecture allow the model to condition each predicted mask on the previous ones, from a spatial perspective within the same image. Our multimodal approach uses off-the-shelf architectures to encode both the image and the referring expressions. The visual branch provides a tensor of pixel embeddings that are concatenated with the phrase embeddings produced by a language encoder. We focus our study on comparing different configurations to encode and combine the visual and linguistic representations. Our experiments on the RefCOCO dataset for still images indicate that the proposed architecture successfully exploits the referring expressions to solve the pixel-wise task of instance segmentation.
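As a hypothetical illustration of what such configuration comparisons could look like (not the configurations studied in the thesis), two common ways of combining a per-pixel visual tensor with a phrase embedding are channel concatenation and a learned element-wise gating:

# Two illustrative vision-language fusion configurations -- hypothetical examples,
# not the thesis configurations. pixel_feats: (B, C, H, W); phrase_emb: (B, D).
import torch
import torch.nn as nn

def fuse_concat(pixel_feats, phrase_emb):
    # Tile the phrase embedding over the spatial grid and concatenate channels.
    B, _, H, W = pixel_feats.shape
    tiled = phrase_emb[:, :, None, None].expand(-1, -1, H, W)
    return torch.cat([pixel_feats, tiled], dim=1)   # (B, C + D, H, W)

def fuse_gated(pixel_feats, phrase_emb, proj: nn.Linear):
    # Project the phrase embedding to C channels and gate the visual features.
    gate = torch.sigmoid(proj(phrase_emb))[:, :, None, None]   # (B, C, 1, 1)
    return pixel_feats * gate                        # (B, C, H, W)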

}, author = {Herrera-Palacio, Alba}, editor = {Gir{\'o}-i-Nieto, Xavier and Ventura, C. and Silberer, Carina} }