% Workshop paper: MULEA @ ACM Multimedia 2019.
% Cleaned from an auto-export: @conference -> @inproceedings, month macro,
% dropped organization duplicated by publisher, de-garbled abstract,
% normalised all author names to "Last, First" form.
@inproceedings{cHerrera-Palacio,
  title     = {Video Object Linguistic Grounding},
  booktitle = {{ACM} Multimedia Workshop on Multimodal Understanding and Learning for Embodied Applications ({MULEA})},
  year      = {2019},
  month     = oct,
  publisher = {ACM},
  address   = {Nice, France},
  abstract  = {The goal of this work is segmenting on a video sequence the objects which are mentioned in a linguistic description of the scene. We have adapted an existing deep neural network that achieves state of the art performance in semi-supervised video object segmentation, to add a linguistic branch that would generate an attention map over the video frames, making the segmentation of the objects temporally consistent along the sequence.},
  author    = {Herrera-Palacio, Alba and Ventura, Carles and Gir{\'o}-i-Nieto, Xavier},
}