% Workshop paper: MULEA @ ACM Multimedia 2019.
% Cleaned from an auto-export: @conference -> @inproceedings, month macro,
% dropped organization duplicated by publisher, de-garbled abstract,
% normalised all author names to "Last, First" form.
@inproceedings{cHerrera-Palacio,
  title     = {Video Object Linguistic Grounding},
  booktitle = {{ACM} Multimedia Workshop on Multimodal Understanding and Learning for Embodied Applications ({MULEA})},
  year      = {2019},
  month     = oct,
  publisher = {ACM},
  address   = {Nice, France},
  abstract  = {The goal of this work is segmenting on a video sequence the objects which are mentioned in a linguistic description of the scene. We have adapted an existing deep neural network that achieves state of the art performance in semi-supervised video object segmentation, to add a linguistic branch that would generate an attention map over the video frames, making the segmentation of the objects temporally consistent along the sequence.},
  author    = {Herrera-Palacio, Alba and Ventura, Carles and Gir{\'o}-i-Nieto, Xavier},
}