@conference {cKazakos, title = {SynthRef: Generation of Synthetic Referring Expressions for Object Segmentation}, booktitle = {NAACL Visually Grounded Interaction and Language (ViGIL) Workshop}, year = {2021}, month = {06/2021}, address = {Virtual}, abstract = {

Recent advances in deep learning have brought significant progress in visual grounding tasks such as language-guided video object segmentation. However, collecting large datasets for these tasks is expensive in terms of annotation time, which represents a bottleneck. To address this, we propose SynthRef, a novel method for generating synthetic referring expressions for an image (or video frame), and we present and disseminate the first large-scale dataset of synthetic referring expressions for video object segmentation. Our experiments demonstrate that training with our synthetic dataset improves a model's ability to generalize across different datasets, without any additional annotation cost. Moreover, our formulation allows its application to any object detection or segmentation dataset.

}, author = {Kazakos, Ioannis and Bellver-Bueno, M{\'\i}riam and Ventura, C. and Silberer, Carina and Gir{\'o}-i-Nieto, Xavier} }