@conference {cFernandez, title = {More cat than cute? Interpretable Prediction of Adjective-Noun Pairs}, booktitle = {ACM Multimedia 2017 Workshop on Multimodal Understanding of Social, Affective and Subjective Attributes}, year = {2017}, month = {10/2017}, publisher = {ACM SIGMM}, organization = {ACM SIGMM}, address = {Mountain View, CA (USA)}, abstract = {

The increasing availability of affect-rich multimedia resources has bolstered interest in understanding sentiment and emotions in and from visual content. Adjective-noun pairs (ANPs) are a popular mid-level semantic construct for capturing affect via visually detectable concepts such as {\textquoteleft}{\textquoteleft}cute dog{\textquoteright}{\textquoteright} or {\textquoteleft}{\textquoteleft}beautiful landscape{\textquoteright}{\textquoteright}. Current state-of-the-art methods approach ANP prediction by treating each of these compound concepts as an individual token, ignoring the underlying relationships within ANPs. This work aims at disentangling the contributions of the {\textquoteleft}adjectives{\textquoteright} and {\textquoteleft}nouns{\textquoteright} in the visual prediction of ANPs. Two specialised classifiers, one trained to detect adjectives and another to detect nouns, are fused to predict 553 different ANPs. The resulting ANP prediction model is more interpretable, as it allows us to study the contributions of the adjective and noun components.

}, doi = {10.1145/3132515.3132520}, author = {Fern{\`a}ndez, D{\`e}lia and Woodward, Alejandro and Campos, V{\'\i}ctor and Jou, Brendan and Gir{\'o}-i-Nieto, Xavier and Chang, Shih-Fu} } @conference {cFernandeza, title = {ViTS: Video Tagging System from Massive Web Multimedia Collections}, booktitle = {ICCV 2017 Workshop on Web-scale Vision and Social Media}, year = {2017}, month = {10/2017}, address = {Venice, Italy}, abstract = {

The popularization of multimedia content on the Web has given rise to the need to automatically understand, index and retrieve it. In this paper we present ViTS, an automatic Video Tagging System that learns from videos, their web context and the comments shared on social networks. ViTS analyses massive multimedia collections gathered by crawling the Internet, and maintains a knowledge base that is updated in real time without human supervision. As a result, each video is indexed with a rich set of labels and linked with other related content. ViTS is an industrial product in commercial exploitation, with a vocabulary of over 2.5M concepts and the capacity to index more than 150k videos per month. We compare the quality and completeness of our tags with those in the YouTube-8M dataset, and show that ViTS enriches the semantic annotation of the videos with a larger number of labels (10.04 tags/video) at an accuracy of 80.87\%.

}, author = {Fern{\`a}ndez, D{\`e}lia and Varas, David and Espadaler, Joan and Ferreira, Jordi and Woodward, Alejandro and Rodr{\'\i}guez, David and Gir{\'o}-i-Nieto, Xavier and Riveiro, Juan Carlos and Bou, Elisenda} }
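
A minimal sketch of the adjective-noun fusion described in the abstract of {\textquoteleft}{\textquoteleft}More cat than cute?{\textquoteright}{\textquoteright} above, kept outside any entry so BibTeX treats it as a comment. It assumes a product rule over the two specialised classifiers' output probabilities; the function names, variable names and fusion rule are illustrative assumptions, not taken from the paper.

# Hypothetical sketch (not the authors' code): late fusion of an
# adjective classifier and a noun classifier into ANP scores.
def fuse_anp_scores(adj_probs, noun_probs, anp_pairs):
    """Score each ANP as p(adjective) * p(noun).

    adj_probs:  dict mapping adjective -> probability from the adjective classifier
    noun_probs: dict mapping noun -> probability from the noun classifier
    anp_pairs:  list of (adjective, noun) tuples in the ANP vocabulary
    """
    return {(a, n): adj_probs[a] * noun_probs[n] for a, n in anp_pairs}

# Toy example with two of the 553 ANPs:
adj = {"cute": 0.7, "beautiful": 0.2}
noun = {"dog": 0.9, "landscape": 0.05}
scores = fuse_anp_scores(adj, noun, [("cute", "dog"), ("beautiful", "landscape")])
print(scores)  # ('cute', 'dog') scores 0.63, ('beautiful', 'landscape') scores 0.01

Because each ANP score factors into its adjective and noun terms, one can read off which component drives a prediction, which is the interpretability property the abstract highlights.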