@conference {cTarresa, title = {Sign Language Translation from Instructional Videos}, booktitle = {CVPR 2023 Women in Computer Vision Workshop}, year = {2023}, month = {04/2023}, publisher = {Computer Vision Foundation / IEEE}, organization = {Computer Vision Foundation / IEEE}, address = {Vancouver, Canada}, abstract = {

The advances in automatic sign language translation (SLT) to spoken languages have mostly been benchmarked with datasets of limited size and restricted domains. Our work advances the state of the art by providing the first baseline results on How2Sign, a large and broad dataset. We train a Transformer over I3D video features, using the reduced BLEU score as the validation metric instead of the widely used BLEU. Our system reaches 8.03 BLEU, and we publish the first open-source implementation of its kind to promote further advances.
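
For illustration, a minimal, hypothetical sketch of the pipeline described above: a Transformer trained over precomputed I3D clip features and evaluated with corpus-level BLEU. Module names, dimensions and the use of standard BLEU (rather than the reduced variant used for validation) are assumptions, not the released implementation.

# Hypothetical sketch: Transformer over precomputed I3D features for sign language translation.
import torch
import torch.nn as nn
import sacrebleu

class SignTranslationTransformer(nn.Module):
    def __init__(self, feat_dim=1024, d_model=256, vocab_size=8000):
        super().__init__()
        self.proj = nn.Linear(feat_dim, d_model)        # map I3D features to the model dimension
        self.transformer = nn.Transformer(d_model=d_model, batch_first=True)
        self.embed = nn.Embedding(vocab_size, d_model)  # target-token embeddings
        self.out = nn.Linear(d_model, vocab_size)       # vocabulary logits

    def forward(self, video_feats, target_tokens):
        src = self.proj(video_feats)                    # (batch, frames, d_model)
        tgt = self.embed(target_tokens)                 # (batch, tokens, d_model)
        hidden = self.transformer(src, tgt)
        return self.out(hidden)

# Validation metric: corpus-level BLEU over decoded hypotheses vs. reference translations.
def corpus_bleu(hypotheses, references):
    return sacrebleu.corpus_bleu(hypotheses, [references]).score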

}, author = {Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Amanda Duarte and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dDuarte, title = {Data and methods for a visual understanding of sign languages}, volume = {PhD}, year = {2022}, type = {International Mention}, abstract = {

Signed languages are complete and natural languages used as the first or preferred mode of communication by millions of people worldwide. However, they unfortunately continue to be marginalized languages. Designing, building, and evaluating models that work on sign languages presents compelling research challenges and requires interdisciplinary and collaborative efforts. The recent advances in Machine Learning (ML) and Artificial Intelligence (AI) have the power to enable better accessibility for sign language users and to narrow the existing communication barrier between the Deaf community and non-sign language users. However, recent AI-powered technologies still do not account for sign language in their pipelines. This is mainly because sign languages are visual languages that use manual and non-manual features to convey information, and do not have a standard written form. Thus, the goal of this thesis is to contribute to the development of new technologies that account for sign language by creating large-scale multimodal resources suitable for training modern data-hungry machine learning models, and by developing automatic systems for computer vision tasks that aim at a better visual understanding of sign languages.

In Part I, we introduce the How2Sign dataset, a large-scale collection of multimodal and multiview sign language videos in American Sign Language. In Part II, we contribute to the development of technologies that account for sign languages: in Chapter 4 we present Spot-Align, a framework based on sign spotting methods to automatically annotate sign instances in continuous sign language, present its benefits, and establish a baseline for the sign language recognition task on the How2Sign dataset. In Chapter 5 we leverage the different annotations and modalities of How2Sign to explore sign language video retrieval by learning cross-modal embeddings. Finally, in Chapter 6 we explore sign language video generation by applying Generative Adversarial Networks to the sign language domain, and assess if and how well sign language users can understand automatically generated sign language videos by proposing an evaluation protocol based on How2Sign topics and English translation.

}, author = {Amanda Duarte}, editor = {Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @article {cBellver-Bueno20, title = {RefVOS: A Closer Look at Referring Expressions for Video Object Segmentation}, journal = {Multimedia Tools and Applications}, year = {2022}, month = {07/2022}, abstract = {

The task of video object segmentation with referring expressions (language-guided VOS) is to, given a linguistic phrase and a video, generate binary masks for the object to which the phrase refers. Our work argues that existing benchmarks used for this task are mainly composed of trivial cases, in which referents can be identified with simple phrases. Our analysis relies on a new categorization of the phrases in the DAVIS-2017 and Actor-Action datasets into trivial and non-trivial REs, with the non-trivial REs annotated with seven RE semantic categories. We leverage this data to analyze the results of RefVOS, a novel neural network that obtains competitive results for the task of language-guided image segmentation and state of the art results for language-guided VOS. Our study indicates that the major challenges for the task are related to understanding motion and static actions.

}, doi = {https://doi.org/10.1007/s11042-022-13413-x}, author = {Bellver-Bueno, M{\'\i}riam and Ventura, C. and Silberer, Carina and Kazakos, Ioannis and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cTarres22, title = {Tackling Low-Resourced Sign Language Translation: UPC at WMT-SLT 22}, booktitle = {EMNLP 2022 Seventh Conference on Machine Translation (WMT22)}, year = {2022}, month = {10/2022}, abstract = {

This paper describes the system developed at the Universitat Polit{\`e}cnica de Catalunya for the Workshop on Machine Translation 2022 Sign Language Translation Task, in particular for the sign-to-text direction. We use a Transformer model implemented with the Fairseq modeling toolkit. We have experimented with the vocabulary size, data augmentation techniques and pretraining the model with the PHOENIX-14T dataset. Our system obtains a 0.50 BLEU score on the test set, improving the organizers{\textquoteright} baseline by 0.38 BLEU. We note the poor results for both the baseline and our system, and thus the limited reliability of our findings.


}, author = {Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @conference {cBudria, title = {Topic Detection in Continuous Sign Language Videos}, booktitle = {Accessibility, Vision, and Autonomy Meet (AVA) CVPR Workshop}, year = {2022}, month = {06/2022}, abstract = {

Significant progress has been made recently on challenging tasks in automatic sign language understanding, such as sign language recognition, translation and production. However, these works have focused on datasets with relatively few samples, short recordings and limited vocabulary and signing space. In this work, we introduce the novel task of sign language topic detection. We base our experiments on How2Sign, a large-scale video dataset spanning multiple semantic domains. We provide strong baselines for the task of topic detection, and present a comparison between different visual features commonly used in the domain of sign language.
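
A minimal sketch, not from the paper, of how topic detection can be cast as video-level classification over pooled visual features; the pooling choice, dimensions and class count are illustrative.

# Hypothetical sketch: topic detection as video-level classification over pooled sign language features.
import torch
import torch.nn as nn

class TopicClassifier(nn.Module):
    def __init__(self, feat_dim=1024, num_topics=10):
        super().__init__()
        self.head = nn.Sequential(
            nn.Linear(feat_dim, 512), nn.ReLU(),
            nn.Linear(512, num_topics))                 # one logit per semantic domain

    def forward(self, clip_feats):                      # clip_feats: (batch, frames, feat_dim)
        pooled = clip_feats.mean(dim=1)                 # temporal average pooling over the video
        return self.head(pooled)

logits = TopicClassifier()(torch.randn(4, 128, 1024))   # toy batch of 4 videos, 128 feature frames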

}, author = {{\'A}lvaro Budria and Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Moreno, Francesc and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cDuarte20, title = {How2Sign: A Large-scale Multimodal Dataset for Continuous American Sign Language}, booktitle = {CVPR 2021}, year = {2021}, month = {06/2021}, abstract = {

Sign Language is the primary means of communication for the majority of the Deaf community. One of the factors that has hindered the progress in the areas of automatic sign language recognition, generation, and translation is the absence of large annotated datasets, especially continuous sign language datasets, i.e. datasets that are annotated and segmented at the sentence or utterance level. Towards this end, in this work we introduce How2Sign, a work-in-progress dataset collection. How2Sign consists of a parallel corpus of 80 hours of sign language videos (collected with multi-view RGB and depth sensor data) with corresponding speech transcriptions and gloss annotations. In addition, a three-hour subset was further recorded in a geodesic dome setup using hundreds of cameras and sensors, which enables detailed 3D reconstruction and pose estimation and paves the way for vision systems to understand the 3D geometry of sign language.

}, author = {Amanda Duarte and S. Palaskar and Lucas Ventura and Ghadiyaram, Deepti and DeHaan, Kenneth and F. Metze and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dBellver, title = {Image and Video Object Segmentation in Low Supervision Scenarios}, volume = {PhD}, year = {2021}, month = {03/2021}, school = {Universitat Politecnica de Catalunya}, type = {Academic}, address = {Barcelona}, abstract = {

Image and video segmentation are central tasks within the computer vision field. Nevertheless, deep learning solutions for segmentation typically rely on pixel-level annotations, which are very costly to collect. Likewise, some segmentation systems require human interaction at inference time, which involves effort for the end-user. In this thesis, we look into diverse supervision scenarios for image and video object segmentation. We discern between supervision when learning the model, i.e., which type of annotations are used during training, and supervision at inference, namely which kind of human input is required when running the system. Our target are models that require low forms of supervision.

In the first part of the thesis we present a novel recurrent architecture for video object segmentation that is end-to-end trainable in a fully-supervised setup, and that does not require any post-processing step, i.e., the output of the model directly solves the addressed task. The second part of the thesis aims at lowering the annotation cost, in terms of labeling time, needed to train image segmentation models. We explore semi-supervised pipelines and show results when a very limited budget is available. The third part of the dissertation attempts to alleviate the supervision required by semi-automatic systems at inference time. Particularly, we focus on semi-supervised video object segmentation, which typically requires generating a binary mask for each instance to be tracked. In contrast, we present a model for language-guided video object segmentation, which identifies the object to segment with a natural language expression. We study current benchmarks, propose a novel categorization of referring expressions for video, and identify the main challenges posed by the video task.

Evaluation committee: Zeynep Akata (University of T{\"u}bingen), Francesc Moreno-Noguer (UPC IRI-CSIC) and Yannis Kalantidis (Naver Labs Europe).

}, author = {M{\'\i}riam Bellver}, editor = {Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dCampos20, title = {Deep Learning that Scales: Leveraging Compute and Data}, year = {2020}, month = {12/2020}, school = {Universitat Polit{\`e}cnica de Catalunya}, address = {Barcelona, Catalonia}, abstract = {

Deep learning has revolutionized the field of artificial intelligence in the past decade. Although the development of these techniques spans over several years, the recent advent of deep learning is explained by an increased availability of data and compute that have unlocked the potential of deep neural networks. They have become ubiquitous in domains such as natural language processing, computer vision, speech processing, and control, where enough training data is available. Recent years have seen continuous progress driven by ever-growing neural networks that benefited from large amounts of data and computing power. This thesis is motivated by the observation that scale is one of the key factors driving progress in deep learning research, and aims at devising deep learning methods that scale gracefully with the available data and compute. We narrow down this scope into two main research directions. The first of them is concerned with designing hardware-aware methods which can make the most of the computing resources in current high performance computing facilities. We then study bottlenecks preventing existing methods from scaling up as more data becomes available, providing solutions that contribute towards enabling training of more complex models. This dissertation studies the aforementioned research questions for two different learning paradigms, each with its own algorithmic and computational characteristics. The first part of this thesis studies the paradigm where the model needs to learn from a collection of examples, extracting as much information as possible from the given data. The second part is concerned with training agents that learn by interacting with a simulated environment, which introduces unique challenges such as efficient exploration and simulation.

}, url = { http://hdl.handle.net/10803/670372}, author = {V{\'\i}ctor Campos}, editor = {Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cCamposb, title = {Explore, Discover and Learn: Unsupervised Discovery of State-Covering Skills}, booktitle = {International Conference on Machine Learning (ICML) 2020}, year = {2020}, month = {07/2020}, abstract = {

Acquiring abilities in the absence of a task-oriented reward function is at the frontier of reinforcement learning research. This problem has been studied through the lens of empowerment, which draws a connection between option discovery and information theory. Information-theoretic skill discovery methods have garnered much interest from the community, but little research has been conducted in understanding their limitations. Through theoretical analysis and empirical evidence, we show that existing algorithms suffer from a common limitation -- they discover options that provide a poor coverage of the state space. In light of this, we propose {\textquoteright}Explore, Discover and Learn{\textquoteright} (EDL), an alternative approach to information-theoretic skill discovery. Crucially, EDL optimizes the same information-theoretic objective derived from the empowerment literature, but addresses the optimization problem using different machinery. We perform an extensive evaluation of skill discovery methods on controlled environments and show that EDL offers significant advantages, such as overcoming the coverage problem, reducing the dependence of learned skills on the initial state, and allowing the user to define a prior over which behaviors should be learned.
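
For reference, a hedged sketch of the information-theoretic objective this family of methods builds on; the exact variational treatment in the paper may differ in its details. With states $S$ and latent skills $Z$, the forward decomposition of mutual information gives

\[ I(S;Z) \;=\; \mathcal{H}(S) - \mathcal{H}(S \mid Z) \;\geq\; \mathcal{H}(S) + \mathbb{E}_{z \sim p(z),\, s \sim p(s \mid z)}\!\left[ \log q_{\phi}(s \mid z) \right], \]

where $p(s)$ is fixed by a separate exploration stage and $q_{\phi}(s \mid z)$ is a learned decoder, so that skill policies can be trained with a reward proportional to $\log q_{\phi}(s \mid z)$.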

}, author = {V{\'\i}ctor Campos and Trott, Alexander and Xiong, Caiming and Socher, Richard and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @article {aBellver, title = {Mask-guided sample selection for Semi-Supervised Instance Segmentation}, journal = {Multimedia Tools and Applications}, year = {2020}, month = {07/2020}, abstract = {

Image segmentation methods are usually trained with pixel-level annotations, which require significant human effort to collect. The most common solution to address this constraint is to implement weakly-supervised pipelines trained with lower forms of supervision, such as bounding boxes or scribbles. Another option is semi-supervised methods, which leverage a large amount of unlabeled data and a limited number of strongly-labeled samples. In this second setup, samples to be strongly-annotated can be selected randomly or with an active learning mechanism that chooses the ones that will maximize the model performance. In this work, we propose a sample selection approach to decide which samples to annotate for semi-supervised instance segmentation. Our method consists of first predicting pseudo-masks for the unlabeled pool of samples, together with a score predicting the quality of each mask. This score is an estimate of the Intersection over Union (IoU) of the segment with the ground-truth mask. We study which samples are better to annotate given the quality score, and show how our approach outperforms a random selection, leading to improved performance for semi-supervised instance segmentation with low annotation budgets.
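
A minimal sketch, assuming a pool of pseudo-masks with predicted IoU scores, of how an annotation budget could be spent according to the quality score; the strategy names and the comparison of both extremes are hypothetical, not the paper's exact selection rule.

# Hypothetical sketch: pick the unlabeled samples to annotate based on a predicted mask-quality (IoU) score.
import numpy as np

def select_for_annotation(quality_scores, budget, strategy="highest"):
    # quality_scores: predicted IoU of each pseudo-mask; budget: number of samples to annotate.
    order = np.argsort(quality_scores)
    if strategy == "highest":       # trust the score: annotate samples with the best pseudo-masks
        return order[-budget:]
    if strategy == "lowest":        # alternative: annotate samples where the model is weakest
        return order[:budget]
    raise ValueError("unknown strategy")

chosen = select_for_annotation(np.random.rand(1000), budget=50)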

}, doi = {10.1007/s11042-020-09235-4}, url = {http://link.springer.com/article/10.1007/s11042-020-09235-4}, author = {M{\'\i}riam Bellver and Amaia Salvador and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cBellverb, title = {Budget-aware Semi-Supervised Semantic and Instance Segmentation}, booktitle = {CVPR 2019 DeepVision Workshop}, year = {2019}, month = {05/2019}, publisher = {OpenCVF}, organization = {OpenCVF}, address = {Long Beach, CA, USA}, abstract = {

Methods that move towards less supervised scenarios are key for image segmentation, as dense labels demand significant human intervention. Generally, the annotation burden is mitigated by labeling datasets with weaker forms of supervision, e.g. image-level labels or bounding boxes. Another option is semi-supervised settings, which commonly leverage a few strong annotations and a huge number of unlabeled/weakly-labeled data. In this paper, we revisit semi-supervised segmentation schemes and narrow down significantly the annotation budget (in terms of total labeling time of the training set) compared to previous approaches. With a very simple pipeline, we demonstrate that at low annotation budgets, semi-supervised methods outperform weakly-supervised ones by a wide margin for both semantic and instance segmentation. Our approach also outperforms previous semi-supervised works at a much reduced labeling cost. We present results for the Pascal VOC benchmark and unify weakly and semi-supervised approaches by considering the total annotation budget, thus allowing a fairer comparison between methods.

Winners of the best paper award at the $\#$CVPR2019 DeepVision workshop

}, url = {https://arxiv.org/abs/1905.05880}, author = {M{\'\i}riam Bellver and Amaia Salvador and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cDuarteb, title = {Cross-modal Neural Sign Language Translation}, booktitle = {Proceedings of the 27th ACM International Conference on Multimedia - Doctoral Symposium}, year = {2019}, month = {10/2019}, publisher = {ACM}, organization = {ACM}, address = {Nice, France}, abstract = {

Sign Language is the primary means of communication for the majority of the Deaf and hard-of-hearing communities. Current computational approaches in this general research area have focused specifically on sign language recognition and the translation of sign language to text. However, the reverse problem of translating from spoken to sign language has so far not been widely explored.

The goal of this doctoral research is to explore sign language translation in this generalized setting, i.e. translating from spoken language to sign language and vice versa. Towards that end, we propose a concrete methodology for tackling the problem of speech to sign language translation and introduce How2Sign, the first public, continuous American Sign Language dataset that enables such research. With a parallel corpus of almost 60 hours of sign language videos (collected with both RGB and depth sensor data) and the corresponding speech transcripts for over 2500 instructional videos, How2Sign is a public dataset of unprecedented scale that can be used to advance not only sign language translation, but also a wide range of sign language understanding tasks.


}, doi = {10.1145/3343031.3352587}, url = {https://dl.acm.org/citation.cfm?id=3352587}, author = {Amanda Duarte}, editor = {Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cDuartea, title = {Wav2Pix: Speech-conditioned Face Generation using Generative Adversarial Networks}, booktitle = {ICASSP}, year = {2019}, month = {05/2019}, publisher = {IEEE}, organization = {IEEE}, address = {Brighton, UK}, abstract = {

Speech is a rich biometric signal that contains information about the identity, gender and emotional state of the speaker. In this work, we explore its potential to generate face images of a speaker by conditioning a Generative Adversarial Network (GAN) with raw speech input. We propose a deep neural network that is trained from scratch in an end-to-end fashion, generating a face directly from the raw speech waveform without any additional identity information (e.g. reference image or one-hot encoding). Our model is trained in a self-supervised fashion by exploiting the audio and visual signals naturally aligned in videos. With the purpose of training from video data, we present a novel dataset collected for this work, with high-quality videos of ten youtubers with notable expressiveness in both the speech and visual signals.
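
A minimal, hypothetical sketch of the overall idea of conditioning a generator only on raw speech; the layers and sizes below are illustrative and not the Wav2Pix implementation.

# Hypothetical sketch: condition a GAN generator directly on a raw speech waveform.
import torch
import torch.nn as nn

class SpeechEncoder(nn.Module):
    def __init__(self, emb_dim=128):
        super().__init__()
        self.conv = nn.Sequential(                      # strided 1D convolutions over the raw waveform
            nn.Conv1d(1, 32, kernel_size=31, stride=4), nn.LeakyReLU(),
            nn.Conv1d(32, 64, kernel_size=31, stride=4), nn.LeakyReLU(),
            nn.AdaptiveAvgPool1d(1))
        self.fc = nn.Linear(64, emb_dim)

    def forward(self, wav):                             # wav: (batch, 1, samples)
        return self.fc(self.conv(wav).squeeze(-1))      # speech embedding used to condition the generator

class Generator(nn.Module):
    def __init__(self, emb_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.ConvTranspose2d(emb_dim, 128, 4), nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(64, 3, 4, stride=2, padding=1), nn.Tanh())

    def forward(self, speech_emb):                      # image conditioned only on the speech embedding
        return self.net(speech_emb.unsqueeze(-1).unsqueeze(-1))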

}, doi = {10.1109/ICASSP.2019.8682970}, url = {http://hdl.handle.net/2117/167073}, author = {Amanda Duarte and Rold{\'a}n, Francisco and Tubau, Miquel and Escur, Janna and Pascual-deLaPuente, Santiago and Amaia Salvador and Mohedano, Eva and McGuinness, Kevin and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cSuris, title = {Cross-modal Embeddings for Video and Audio Retrieval}, booktitle = {ECCV 2018 Women in Computer Vision Workshop}, year = {2018}, month = {09/2018}, publisher = {Springer}, organization = {Springer}, address = {Munich, Germany}, abstract = {

The increasing amount of online videos brings several opportunities for training self-supervised neural networks. The creation of large-scale video datasets such as YouTube-8M allows us to deal with this large amount of data in a manageable way. In this work, we find new ways of exploiting this dataset by taking advantage of the multi-modal information it provides. By means of a neural network, we are able to create links between audio and visual documents, by projecting them into a common region of the feature space, obtaining joint audio-visual embeddings. These links are used to retrieve audio samples that fit well to a given silent video, and also to retrieve images that match a given query audio. The results in terms of Recall@K obtained over a subset of YouTube-8M videos show the potential of this unsupervised approach for cross-modal feature learning. We train embeddings at both the clip and the frame (temporal window) scales provided by the dataset and assess their quality in a retrieval problem, formulated as using the features extracted from one modality to retrieve the most similar videos based on the features computed in the other modality.
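
A minimal sketch of the Recall@K evaluation described above, assuming paired audio and visual embeddings that share the same index; this is not the authors' evaluation code.

# Hypothetical sketch: Recall@K for cross-modal retrieval with joint audio-visual embeddings.
import numpy as np

def recall_at_k(audio_emb, video_emb, k=10):
    # Row i of each matrix is the embedding of clip i; the matching pair shares the index.
    audio_emb = audio_emb / np.linalg.norm(audio_emb, axis=1, keepdims=True)
    video_emb = video_emb / np.linalg.norm(video_emb, axis=1, keepdims=True)
    sims = audio_emb @ video_emb.T                      # cosine similarities between modalities
    ranks = (-sims).argsort(axis=1)                     # best-matching videos first for each audio query
    hits = [i in ranks[i, :k] for i in range(len(ranks))]
    return float(np.mean(hits))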

}, isbn = {978-3-030-11018-5}, doi = {10.1007/978-3-030-11018-5_62}, url = {https://doi.org/10.1007/978-3-030-11018-5_62}, author = {Sur{\'\i}s, D{\'\i}dac and Amanda Duarte and Amaia Salvador and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cCamposa, title = {Importance Weighted Evolution Strategies}, booktitle = {NeurIPS 2018 Deep Reinforcement Learning Workshop }, year = {2018}, month = {11/2018}, address = {Montreal, Quebec}, abstract = {

Evolution Strategies (ES) emerged as a scalable alternative to popular Reinforcement Learning (RL) techniques, providing an almost perfect speedup when distributed across hundreds of CPU cores thanks to a reduced communication overhead. Despite providing large improvements in wall-clock time, ES is data inefficient when compared to competing RL methods. One of the main causes of such inefficiency is the collection of large batches of experience, which are discarded after each policy update. In this work, we study how to perform more than one update per batch of experience by means of Importance Sampling while preserving the scalability of the original method. The proposed method, Importance Weighted Evolution Strategies (IW-ES), shows promising results and is a first step towards designing efficient ES algorithms.
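
A simplified, hypothetical sketch of reusing one batch of ES perturbations through importance weighting; it illustrates the idea rather than reproducing the exact IW-ES estimator, and all inputs are assumed to be NumPy arrays.

# Hypothetical sketch: reuse a batch of evaluated ES perturbations for an extra update via importance sampling.
import numpy as np

def gaussian_log_density(perturbed, mean, sigma):
    # Log-density (up to a constant) of perturbed parameters under an isotropic Gaussian search distribution.
    return -0.5 * ((perturbed - mean) ** 2).sum(axis=1) / sigma ** 2

def iw_es_update(theta, perturbed, returns, sigma=0.1, lr=0.01, theta_sampled_from=None):
    # One ES step; if the batch was sampled around a previous theta, reweight it by the density ratio.
    if theta_sampled_from is None:
        weights = np.ones(len(returns))
    else:
        weights = np.exp(gaussian_log_density(perturbed, theta, sigma)
                         - gaussian_log_density(perturbed, theta_sampled_from, sigma))
    directions = (perturbed - theta) / sigma            # score-function direction for each sample
    grad = (weights[:, None] * returns[:, None] * directions).mean(axis=0) / sigma
    return theta + lr * grad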

}, author = {V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @conference {cSalvadord, title = {Recurrent Neural Networks for Semantic Instance Segmentation}, booktitle = {ECCV 2018 Women in Computer Vision (WiCV) Workshop}, year = {2018}, month = {12/2017}, abstract = {

We present a recurrent model for semantic instance segmentation that sequentially generates pairs of masks and their associated class probabilities for every object in an image. Our proposed system is trainable end-to-end, does not require post-processing steps on its output and is conceptually simpler than current methods relying on object proposals. We observe that our model learns to follow a consistent pattern to generate object sequences, which correlates with the activations learned in the encoder part of our network. We achieve competitive results on three different instance segmentation benchmarks (Pascal VOC 2012, Cityscapes and CVPPP Plant Leaf Segmentation).

}, url = {https://imatge-upc.github.io/rsis/}, author = {Amaia Salvador and M{\'\i}riam Bellver and Baradad, Manel and V{\'\i}ctor Campos and Marqu{\'e}s, F. and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cSalvadore, title = {Recurrent Neural Networks for Semantic Instance Segmentation}, booktitle = {CVPR 2018 DeepVision Workshop}, year = {2018}, month = {06/2018}, abstract = {

We present a recurrent model for semantic instance segmentation that sequentially generates binary masks and their associated class probabilities for every object in an image. Our proposed system is trainable end-to-end from an input image to a sequence of labeled masks and, compared to methods relying on object proposals, does not require post-processing steps on its output. We study the suitability of our recurrent model on three different instance segmentation benchmarks, namely Pascal VOC 2012, CVPPP Plant Leaf Segmentation and Cityscapes. Further, we analyze the object sorting patterns generated by our model and observe that it learns to follow a consistent pattern, which correlates with the activations learned in the encoder part of our network.
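
A minimal, hypothetical sketch of the sequential decoding idea (one mask and class per recurrent step); the paper uses a convolutional recurrent decoder over spatial features, whereas this toy version uses a plain LSTM cell, a coarse mask head and an explicit stop signal.

# Hypothetical sketch: emit one binary mask and class per recurrent step until a stop signal fires.
import torch
import torch.nn as nn

class RecurrentMaskDecoder(nn.Module):
    def __init__(self, feat_ch=256, hidden=256, num_classes=21, max_objects=10):
        super().__init__()
        self.hidden = hidden
        self.rnn = nn.LSTMCell(feat_ch, hidden)         # simplification of the convolutional recurrence
        self.mask_head = nn.Linear(hidden, 32 * 32)     # coarse 32x32 mask per step (toy resolution)
        self.cls_head = nn.Linear(hidden, num_classes)
        self.stop_head = nn.Linear(hidden, 1)
        self.max_objects = max_objects

    def forward(self, pooled_feats):                    # pooled_feats: (batch, feat_ch) from the encoder
        h = torch.zeros(pooled_feats.size(0), self.hidden)
        c = torch.zeros_like(h)
        outputs = []
        for _ in range(self.max_objects):               # one object instance per time step
            h, c = self.rnn(pooled_feats, (h, c))
            outputs.append((torch.sigmoid(self.mask_head(h)).view(-1, 32, 32),
                            self.cls_head(h),
                            torch.sigmoid(self.stop_head(h))))
        return outputs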

}, author = {Amaia Salvador and M{\'\i}riam Bellver and Baradad, Manel and V{\'\i}ctor Campos and Marqu{\'e}s, F. and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @inbook {bCampos, title = {Sentiment concept embedding for visual affect recognition}, booktitle = {Multimodal Behavior Analysis in theWild}, year = {2018}, publisher = {Elsevier}, organization = {Elsevier}, edition = {1}, chapter = {16}, abstract = {

Automatic sentiment and emotion understanding of general visual content has recently garnered much research attention. However, the large visual variance associated with high-level affective concepts presents a challenge when designing systems with high-performance requirements. One popular approach to bridge the {\textquotedblleft}affective gap{\textquotedblright} between low-level visual features and affective semantics consists of using Adjective Noun Pair (ANP) semantic constructs for concepts, e.g. {\textquotedblleft}beautiful landscape{\textquotedblright} or {\textquotedblleft}scary face{\textquotedblright}, which act as a mid-level representation that can be recognized by visual classifiers while still carrying an affective bias. In this work, we formulate the ANP detection task in images over a continuous space defined by an embedding that captures the inter-concept relationships between ANPs. We show how the compact representations obtained from the embedding extend the discrete concepts in the ontology and can be used for improved visual sentiment and emotion prediction, as well as new applications such as zero-shot ANP detection.

}, url = {https://www.elsevier.com/books/multimodal-behavior-analysis-in-the-wild/alameda-pineda/978-0-12-814601-9}, author = {V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto and Jou, Brendan and Jordi Torres and Chang, Shih-Fu} } @conference {cCampos18, title = {Skip RNN: Learning to Skip State Updates in Recurrent Neural Networks}, booktitle = {International Conference on Learning Representations (ICLR)}, year = {2018}, month = {01/2018}, abstract = {

Recurrent Neural Networks (RNNs) continue to show outstanding performance in sequence modeling tasks. However, training RNNs on long sequences often faces challenges like slow inference, vanishing gradients and difficulty in capturing long-term dependencies. In backpropagation through time settings, these issues are tightly coupled with the large, sequential computational graph resulting from unfolding the RNN in time. We introduce the Skip RNN model, which extends existing RNN models by learning to skip state updates and thus shortens the effective size of the computational graph. This model can also be encouraged to perform fewer state updates through a budget constraint. We evaluate the proposed model on various tasks and show how it can reduce the number of required RNN updates while preserving, and sometimes even improving, the performance of the baseline RNN models.
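
A minimal sketch of the state-skipping mechanism described above, written here around a GRU cell; the straight-through estimator used to backpropagate through the binarized gate is omitted, and the caller is assumed to carry the hidden state and update probability across time steps.

# Hypothetical sketch: a binary gate decides whether to update or copy the hidden state at each step.
import torch
import torch.nn as nn

class SkipGRUCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.cell = nn.GRUCell(input_size, hidden_size)
        self.update_prob = nn.Linear(hidden_size, 1)    # emits the probability of updating at the next step

    def forward(self, x, h, u_prob):                    # u_prob: (batch, 1) accumulated update probability
        u = torch.round(u_prob)                         # binarized gate (straight-through in the real model)
        h_candidate = self.cell(x, h)
        h_new = u * h_candidate + (1.0 - u) * h         # skip the update (and copy the state) when u == 0
        delta = torch.sigmoid(self.update_prob(h_new))
        u_prob_new = u * delta + (1.0 - u) * torch.clamp(u_prob + delta, max=1.0)
        return h_new, u_prob_new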

}, author = {V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto and Jordi Torres and Chang, Shih-Fu} } @conference {cDuarte, title = {Towards Speech to Sign Language Translation}, booktitle = {ECCV 2018 Workshop on Shortcomings in Vision and Language}, year = {2018}, month = {08/2018}, abstract = {

Sign Language (SL) is the primary means of communication for a majority of the hearing-impaired community. Current computational approaches in this research area have focused specifically on Sign Language Recognition (SLR) and Sign Language Translation from SL to text (SLT). However, the reverse problem of translating from spoken language to sign language has so far been unexplored. The goal of our ongoing project is to make the audio tracks of online videos accessible to people with hearing disabilities by automatically generating a video-based speech to sign language translation. In this paper, we point out the shortcomings that limit the advances of this research area and propose first steps towards this end.

}, author = {Amanda Duarte and Camli, Gorkem and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cBellvera, title = {Detection-aided liver lesion segmentation using deep learning}, booktitle = {ML4H: Machine Learning for Health Workshop at NIPS 2017}, year = {2017}, month = {11/2017}, abstract = {

A fully automatic technique for segmenting the liver and localizing its unhealthy tissues is a convenient tool for diagnosing hepatic diseases and assessing the response to the corresponding treatments. In this work we propose a method to segment the liver and its lesions from Computed Tomography (CT) scans using Convolutional Neural Networks (CNNs), which have proven successful in a variety of computer vision tasks, including medical imaging. The network that segments the lesions consists of a cascaded architecture, which first focuses on the region of the liver in order to segment the lesions on it. Moreover, we train a detector to localize the lesions, and mask the results of the segmentation network with the positive detections. The segmentation architecture is based on DRIU, a Fully Convolutional Network (FCN) with side outputs that work on feature maps of different resolutions, to finally benefit from the multi-scale information learned by different stages of the network. The main contribution of this work is the use of a detector to localize the lesions, which we show to be beneficial to remove false positives triggered by the segmentation network.
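
A minimal sketch, not from the paper, of the final masking step in which segmentation output is kept only inside positively detected regions; the thresholds and box format are assumptions.

# Hypothetical sketch: keep segmented lesion pixels only inside boxes flagged as positive by the detector.
import numpy as np

def mask_with_detections(lesion_probs, boxes, scores, prob_thr=0.5, det_thr=0.5):
    # lesion_probs: (H, W) segmentation output; boxes: list of (x1, y1, x2, y2); scores: detector confidences.
    keep = np.zeros_like(lesion_probs, dtype=bool)
    for (x1, y1, x2, y2), s in zip(boxes, scores):
        if s >= det_thr:                                # only positive detections contribute
            keep[y1:y2, x1:x2] = True
    return (lesion_probs >= prob_thr) & keep            # final binary lesion mask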

}, author = {M{\'\i}riam Bellver and Kevis-Kokitsi Maninis and Jordi Pont-Tuset and Jordi Torres and Xavier Gir{\'o}-i-Nieto and Luc van Gool} } @mastersthesis {xBellvera, title = {Detection-aided medical image segmentation using deep learning}, year = {2017}, abstract = {

Program: Master{\textquoteright}s Degree in Telecommunications Engineering

Grade: A with honours (10.0/10.0)

A fully automatic technique for segmenting the liver and localizing its unhealthy tissues is a convenient tool for diagnosing hepatic diseases and also for assessing the response to the corresponding treatments. In this thesis we propose a method to segment the liver and its lesions from Computed Tomography (CT) scans, as well as other anatomical structures and organs of the human body. We have used Convolutional Neural Networks (CNNs), which have proven successful in a variety of tasks, including medical imaging. The network to segment the lesions consists of a cascaded architecture, which first focuses on the liver region in order to segment the lesion. Moreover, we train a detector to localize the lesions and just keep those pixels from the output of the segmentation network where a lesion is detected. The segmentation architecture is based on DRIU [24], a Fully Convolutional Network (FCN) with side outputs that work on feature maps of different resolutions, to finally benefit from the multi-scale information learned by different stages of the network. Our pipeline is 2.5D, as the input of the network is a stack of consecutive slices of the CT scans. We also study different methods to benefit from the liver segmentation in order to delineate the lesion. The main focus of this work is to use the detector to localize the lesions, as we demonstrate that it helps to remove false positives triggered by the segmentation network. The benefit of using a detector on top of the segmentation network is that the detector acquires a more global insight of the healthiness of a liver tissue compared to the segmentation network, whose final output is pixel-wise and is not forced to take a global decision over a whole liver patch. We show experiments with the LiTS dataset for the lesion and liver segmentation. In order to prove the generality of the segmentation network, we also segment several anatomical structures from the Visceral dataset.

}, author = {M{\'\i}riam Bellver}, editor = {Kevis-Kokitsi Maninis and Jordi Pont-Tuset and Luc van Gool and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @conference {cLina, title = {Disentangling Motion, Foreground and Background Features in Videos}, booktitle = {CVPR 2017 Workshop Brave New Motion Representations}, year = {2017}, month = {05/2017}, abstract = {

This paper introduces an unsupervised framework to extract semantically rich features for video representation. Inspired by how the human visual system groups objects based on motion cues, we propose a deep convolutional neural network that disentangles motion, foreground and background information. The proposed architecture consists of a 3D convolutional feature encoder for blocks of 16 frames, which is trained on reconstruction tasks over the first and last frames of the sequence. The model is trained with a fraction of videos from the UCF-101 dataset, taking as ground truth the bounding boxes around the activity regions. Qualitative results indicate that the network can successfully update the foreground appearance based on pure-motion features. The benefits of these learned features are shown in a discriminative classification task when compared with a random initialization of the network weights, providing an accuracy gain of over 10\%.

}, author = {Lin, Xunyu and V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto and Jordi Torres and Cristian Canton-Ferrer} } @conference {c, title = {Distributed training strategies for a computer vision deep learning algorithm on a distributed GPU cluster}, booktitle = {International Conference on Computational Science (ICCS)}, year = {2017}, month = {06/2017}, publisher = {Elsevier}, organization = {Elsevier}, address = {Zurich, Switzerland}, abstract = {

Deep learning algorithms base their success on building high learning capacity models with millions of parameters that are tuned in a data-driven fashion. These models are trained by processing millions of examples, so that the development of more accurate algorithms is usually limited by the throughput of the computing devices on which they are trained. In this work, we explore how the training of a state-of-the-art neural network for computer vision can be parallelized on a distributed GPU cluster. The effect of distributing the training process is addressed from two different points of view. First, the scalability of the task and its performance in the distributed setting are analyzed. Second, the impact of distributed training methods on the final accuracy of the models is studied.
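
A generic sketch of the synchronous data-parallel scheme described above, written with torch.distributed instead of the TensorFlow setup used in the work; names and the surrounding training loop are illustrative.

# Hypothetical sketch of synchronous data-parallel training: average gradients across workers each step.
import torch
import torch.distributed as dist

def allreduce_gradients(model):
    world_size = dist.get_world_size()
    for p in model.parameters():
        if p.grad is not None:
            dist.all_reduce(p.grad, op=dist.ReduceOp.SUM)   # sum gradients from all workers
            p.grad /= world_size                            # average so the update matches single-worker SGD

# Each worker per step: forward on its data shard, backward, allreduce_gradients(model), optimizer.step()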


}, keywords = {distributed computing; parallel systems; deep learning; Convolutional Neural Networks}, doi = {https://doi.org/10.1016/j.procs.2017.05.074}, url = {http://www.sciencedirect.com/science/article/pii/S1877050917306129}, author = {V{\'\i}ctor Campos and Sastre, Francesc and Yag{\"u}es, Maurici and M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @inbook {bBellver17, title = {Hierarchical Object Detection with Deep Reinforcement Learning}, booktitle = {Deep Learning for Image Processing Applications}, volume = {31}, year = {2017}, publisher = {IOS Press}, organization = {IOS Press}, address = {Amsterdam, The Netherlands}, abstract = {

This work introduces a model for Hierarchical Object Detection with Deep Reinforcement Learning (HOD-DRL). The key idea is to focus on those parts of the image that contain richer information and zoom on them. We train an intelligent agent that, given an image window, is capable of deciding where to focus the attention among five different predefined region candidates (smaller windows). This procedure is iterated, providing a hierarchical image analysis. We compare two different candidate proposal strategies to guide the object search: with and without overlap. Moreover, our work compares two different strategies to extract features from a convolutional neural network for each region proposal: a first one that computes new feature maps for each region proposal, and a second one that computes the feature maps for the whole image to later generate crops for each region proposal. Experiments indicate better results for the overlapping candidate proposal strategy and a loss of performance for the cropped image features due to the loss of spatial resolution. We argue that, while this loss seems unavoidable when working with a large number of object candidates, the much smaller number of region proposals generated by our reinforcement learning agent makes it feasible to extract features for each location without sharing convolutional computation among regions. Source code and models are available at https://imatge-upc.github.io/detection-2016-nipsws/.
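
A minimal, hypothetical sketch of the hierarchical zoom loop; the action space (five predefined sub-windows plus a stop action) follows the description above, while q_network and candidate_fn are placeholders, not the released models.

# Hypothetical sketch: an agent repeatedly picks one of five predefined sub-windows, or stops.
import numpy as np

def hierarchical_search(image, q_network, candidate_fn, max_steps=5):
    # q_network maps (image, window) to 6 action values: zoom into one of 5 sub-windows, or stop.
    window = (0, 0, image.shape[1], image.shape[0])     # start from the full image (x, y, w, h)
    for _ in range(max_steps):
        q_values = q_network(image, window)             # array of 6 Q-values
        action = int(np.argmax(q_values))
        if action == 5:                                 # "trigger" action: the object is considered found
            break
        window = candidate_fn(window)[action]           # zoom into the chosen predefined sub-window
    return window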

}, keywords = {computer Vision, Object detection, reinforcement learning}, issn = {978-1-61499-822-8}, doi = {10.3233/978-1-61499-822-8-164}, url = {http://ebooks.iospress.nl/volumearticle/48029}, author = {M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Jordi Torres} } @conference {cPana, title = {SalGAN: Visual Saliency Prediction with Generative Adversarial Networks}, booktitle = {CVPR 2017 Scene Understanding Workshop (SUNw)}, year = {2017}, address = {Honolulu, Hawaii, USA}, abstract = {

We introduce SalGAN, a deep convolutional neural network for visual saliency prediction trained with adversarial examples. The first stage of the network consists of a generator model whose weights are learned by back-propagation computed from a binary cross entropy (BCE) loss over downsampled versions of the saliency maps. The resulting prediction is processed by a discriminator network trained to solve a binary classification task between the saliency maps generated by the generative stage and the ground truth ones. Our experiments show how adversarial training allows reaching state-of-the-art performance across different metrics when combined with a widely-used loss function like BCE.
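
A minimal sketch of a generator objective mixing the BCE content loss with an adversarial term, as described above; the weighting and exact formulation used in the paper may differ, and the names below are illustrative.

# Hypothetical sketch: generator loss combining pixel-wise BCE with an adversarial term.
import torch
import torch.nn.functional as F

def generator_loss(pred_saliency, gt_saliency, disc_on_pred, alpha=1.0):
    # alpha balances the two terms (illustrative value); disc_on_pred is D's probability for the predicted map.
    content = F.binary_cross_entropy(pred_saliency, gt_saliency)                       # BCE vs. ground truth
    adversarial = F.binary_cross_entropy(disc_on_pred, torch.ones_like(disc_on_pred))  # fool the discriminator
    return content + alpha * adversarial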

}, url = {https://arxiv.org/abs/1701.01081}, author = {Pan, Junting and Cristian Canton-Ferrer and McGuinness, Kevin and O{\textquoteright}Connor, N. and Jordi Torres and Elisa Sayrol and Xavier Gir{\'o}-i-Nieto} } @conference {cTorres, title = {Scaling a Convolutional Neural Network for classification of Adjective Noun Pairs with TensorFlow on GPU Clusters}, booktitle = {17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGrid)}, year = {2017}, month = {05/2017}, publisher = {IEEE}, organization = {IEEE}, address = {Madrid, Spain}, abstract = {

Deep neural networks have gained popularity in recent years, obtaining outstanding results in a wide range of applications such as computer vision, in both academia and multiple industry areas. The progress made in recent years cannot be understood without taking into account the technological advancements seen in key domains such as High Performance Computing, and more specifically in the Graphics Processing Unit (GPU) domain. These kinds of deep neural networks need massive amounts of data to effectively train the millions of parameters they contain, and this training can take up to days or weeks depending on the computer hardware being used. In this work, we present how the training of a deep neural network can be parallelized on a distributed GPU cluster. The effect of distributing the training process is addressed from two different points of view. First, the scalability of the task and its performance in the distributed setting are analyzed. Second, the impact of distributed training methods on the training times and final accuracy of the models is studied. We used TensorFlow on top of a GPU cluster of servers with 2 K80 GPU cards each, at the Barcelona Supercomputing Center (BSC). The results show an improvement in both focus areas. On the one hand, the experiments show promising results towards training a neural network faster: the training time is decreased from 106 hours to 16 hours in our experiments. On the other hand, we observe how increasing the number of GPUs in one node raises the throughput (images per second) in a near-linear way. Moreover, an additional distributed speedup of 10.3 is achieved with 16 nodes, taking the speedup of one node as baseline.


}, url = {http://easychair.org/smart-program/CCGRID2017/2017-05-15.html$\#$session:13550}, author = {Jordi Torres and Sastre, Francesc and Yag{\"u}es, Maurici and V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto} } @conference {cCampos, title = {Skip RNN: Learning to Skip State Updates in Recurrent Neural Networks}, booktitle = {NIPS Time Series Workshop 2017}, year = {2017}, month = {08/2017}, address = {Long Beach, CA, USA}, abstract = {

Recurrent Neural Networks (RNNs) continue to show outstanding performance in sequence modeling tasks. However, training RNNs on long sequences often faces challenges like slow inference, vanishing gradients and difficulty in capturing long-term dependencies. In backpropagation through time settings, these issues are tightly coupled with the large, sequential computational graph resulting from unfolding the RNN in time. We introduce the Skip RNN model, which extends existing RNN models by learning to skip state updates and thus shortens the effective size of the computational graph. This model can also be encouraged to perform fewer state updates through a budget constraint. We evaluate the proposed model on various tasks and show how it can reduce the number of required RNN updates while preserving, and sometimes even improving, the performance of the baseline RNN models.

}, url = {https://imatge-upc.github.io/skiprnn-2017-telecombcn/}, author = {V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto and Jordi Torres and Chang, Shih-Fu} } @conference {xDuarte, title = {Temporal-aware Cross-modal Embeddings for Video and Audio Retrieval}, booktitle = {NIPS 2017 Women in Machine Learning Workshop (WiML)}, year = {2017}, month = {12/2017}, publisher = {NIPS 2017 Women in Machine Learning Workshop}, organization = {NIPS 2017 Women in Machine Learning Workshop}, address = {Long Beach, CA, USA}, abstract = {

The increasing amount of videos online brings several opportunities for training self-supervised neural networks. In this work, we explore cross-modal embeddings between audio and vision by exploiting their alignment on YouTube videos.

Joint audio-visual embeddings allow creating links between audio and visual documents by projecting them to a common region of the feature space. They can be applied to enriching radio broadcasts with images, finding soundtracks for user-generated videos or simply enriching a topic search with both audio and video documents.

The idea of creating a joint embedding space across modalities has been exploited in other areas [3, 4]. However, joint representations between video frames and their audio have yet to be fully exploited. A similar approach to the one we propose was presented in [2], where a soundtrack was retrieved to match a music video; however, that work did not target a synchronization between both modalities.

We aim at training a temporal-aware embedding which can align both audio and visual tracks. We use the visual and audio features provided in the YouTube-8M dataset [1]. The dataset includes features at both the clip and frame (temporal window) level. We train embeddings for both scales and assess their quality in a retrieval problem, formulated as using the feature extracted from one modality to retrieve the most similar videos based on the features computed in the other modality.

We aim not only at finding related documents, but also at synchronizing both sequences. The alignment between the two sequences will rely on computing temporal-aware features with recurrent neural networks at different scales. At retrieval time, different scales will be assessed and results will be evaluated both with ranking metrics and on Amazon Mechanical Turk.

References

[1] Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan. Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675, 2016.

[2] Sungeun Hong, Woobin Im, and Hyun S Yang. Deep learning for content-based, cross-modal retrieval of videos and music. arXiv preprint arXiv:1704.06761, 2017.

[3] Amaia Salvador, Nicholas Hynes, Yusuf Aytar, Javier Marin, Ferda Ofli, Ingmar Weber, and Antonio Torralba. Learning cross-modal embeddings for cooking recipes and food images. In CVPR, 2017.

[4] Liwei Wang, Yin Li, and Svetlana Lazebnik. Learning deep structure-preserving image-text embeddings. In CVPR, 2016.

}, author = {Amanda Duarte and Sur{\'\i}s, D{\'\i}dac and Amaia Salvador and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cBellver, title = {Hierarchical Object Detection with Deep Reinforcement Learning}, booktitle = {Deep Reinforcement Learning Workshop, NIPS 2016}, year = {2016}, month = {12/2016}, abstract = {

We present a method for performing hierarchical object detection in images guided by a deep reinforcement learning agent. The key idea is to focus on those parts of the image that contain richer information and zoom on them. We train an intelligent agent that, given an image window, is capable of deciding where to focus the attention among five different predefined region candidates (smaller windows). This procedure is iterated providing a hierarchical image analysis.

We compare two different candidate proposal strategies to guide the object search: with and without overlap. Moreover, our work compares two different strategies to extract features from a convolutional neural network for each region proposal: a first one that computes new feature maps for each region proposal, and a second one that computes the feature maps for the whole image to later generate crops for each region proposal.

Experiments indicate better results for the overlapping candidate proposal strategy and a loss of performance for the cropped image features due to the loss of spatial resolution. We argue that, while this loss seems unavoidable when working with a large number of object candidates, the much smaller number of region proposals generated by our reinforcement learning agent makes it feasible to extract features for each location without sharing convolutional computation among regions.


}, author = {M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Jordi Torres} } @mastersthesis {xFerri16, title = {Object Tracking in Video with TensorFlow}, year = {2016}, abstract = {


}, author = {Ferri, Andrea}, editor = {Xavier Gir{\'o}-i-Nieto and Jordi Torres and Amaia Salvador} } @conference {cAnton05, title = {Collaborative Network Space: Infrastructure and Learning Application}, booktitle = {IEEE Region 8 EUROCON 2005 Conference: Computer as a tool.}, year = {2005}, pages = {803{\textendash}806}, isbn = {1-4244-0050-3}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=01630054}, author = {Anton, M. and Vall-Llosera, M. and Jordi Torres and Romeu, J. and Jofre, L. and Sole, F. and Marqu{\'e}s, F. and Sabate, F. and Berenguer, J.} } @conference {cMarques05, title = {El concepto NetCampus}, booktitle = {3es Jornadas de la C{\'a}tedra Telef{\'o}nica-UPC}, year = {2005}, pages = {15{\textendash}20}, url = {https://catedratelefonica.upc.edu/documents/llibres/docs/el_espacio_innovador_y_la_red.pdf}, author = {Marqu{\'e}s, F. and Jofre, L. and Sole, F. and Sabate, F. and Berenguer, J. and Romeu, J. and Jordi Torres} } @book {eJofre05, title = {El "Espacio Innovador" y la red}, year = {2005}, url = {http://www.upc.edu/web/CatedraTelefonicaUPC}, author = {Jofre, L. and Sole, F. and Sabate, F. and Berenguer, J. and Marqu{\'e}s, F. and Romeu, J. and Jordi Torres} } @book {eJofre04, title = {El {\textquoteright}Profesional Innovador{\textquoteright} y la red}, year = {2004}, url = {http://catedratelefonica.upc.edu/documents/llibres/docs/jornada_2004_catedra_telf_upc.pdf}, author = {Jofre, L. and Sole, F. and Sabate, F. and Marqu{\'e}s, F. and Romeu, J. and Jordi Torres} } @conference {cJofre04, title = {Una Enginyeria per a la Societat del Coneixement}, booktitle = {II Congr{\'e}s d{\textquoteright}Enginyeria en Llengua Catalana}, year = {2004}, url = {http://www.eicc.cat/celc/formacio.htm}, author = {Jofre, L. and Sole, F. and Sabate, F. and Marqu{\'e}s, F. and Romeu, J. and Jordi Torres} }