@article {aGeleta, title = {Deep Variational Autoencoders for Population Genetics}, year = {Submitted}, abstract = {

Motivation: Modern biobanks provide numerous high-resolution genomic sequences from diverse populations. These datasets enable a better understanding of genotype-phenotype interactions through genome-wide association studies (GWAS) and power a new personalized precision medicine with polygenic risk scores (PRS). To account for diverse and admixed populations, new algorithmic tools are needed to properly capture the genetic composition of populations. Here we explore deep learning techniques, namely variational autoencoders (VAEs), to process genomic data from a population perspective. We hope this work will encourage the adoption of deep neural networks in the population genetics community.

Results: In this paper, we show the power of VAEs for a variety of tasks relating to the interpretation, classification, simulation, and compression of genomic data, using several worldwide whole genome datasets from both humans and canids, and we evaluate the performance of the proposed applications with and without ancestry conditioning. The unsupervised setting of autoencoders allows for the detection of granular population structure and the inference of informative latent factors. The learned latent spaces of VAEs are able to capture and represent differentiated, Gaussian-like clusters of samples with similar genetic composition at a fine scale from single nucleotide polymorphisms (SNPs), enabling applications in dimensionality reduction, data simulation, and imputation. Individual genotype sequences can then be decomposed into latent representations and reconstruction errors (residuals), which provide a sparse representation useful for lossless compression. We show that different population groups have differentiated compression ratios and classification accuracies. Additionally, we analyze the entropy of the SNP data, its effect on compression across populations, and its relation to historical migrations, and we show how to introduce autoencoders into existing compression pipelines.

}, doi = {https://doi.org/10.1101/2023.09.27.558320}, url = {https://www.biorxiv.org/content/10.1101/2023.09.27.558320v1}, author = {Geleta, Margarita and Mas-Montserrat, Daniel and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @article {aPina, title = {Feature propagation as self-supervision signals on graphs}, journal = {Knowledge-Based Systems}, year = {Submitted}, author = {Oscar Pina and Ver{\'o}nica Vilaplana} } @inbook {bHernandez, title = {Interpreting Machine Learning Models for Survival Analysis: A Study of Cutaneous Melanoma Using the SEER Database}, booktitle = {Explainable Artificial Intelligence and Process Mining Applications for Healthcare}, volume = {2020}, number = {Communications in Computer and Information Science}, year = {Submitted}, abstract = {

In this study, we train and compare three types of machine learning algorithms for Survival Analysis: Random Survival Forest, DeepSurv and DeepHit, using the SEER database to model cutaneous malignant melanoma. Additionally, we employ the SurvLIMEpy library, a Python package designed to provide explainability for survival machine learning models, to analyse feature importance. The results demonstrate that the machine learning algorithms outperform the Cox Proportional Hazards Model. Our work underscores the importance of explainability methods for interpreting black-box models and provides insights into important features related to melanoma prognosis.

}, author = {Carlos Hernandez and Cristian Pach{\'o}n-Garc{\'\i}a and Pedro Delicado and Ver{\'o}nica Vilaplana} } @article {aCumplido-Mayoral, title = {The mediating role of neuroimaging-derived biological brain age between risk factors for dementia and cognitive decline in middle/late-aged asymptomatic individuals: a cohort study}, journal = {The Lancet Healthy Longevity}, year = {Submitted}, author = {Irene Cumplido-Mayoral and Anna Brugulat-Serrat and Gonzalo S{\'a}nchez-Benavides and Jose Luis Molinuevo and Marc Suarez-Calvet and Ver{\'o}nica Vilaplana and Juan Domingo Gispert} } @article {aGene-Molad, title = {AmodalAppleSize_RGB-D dataset: RGB-D images of apple trees annotated with modal and amodal segmentation masks for fruit detection, visibility and size estimation}, journal = {Data in Brief}, volume = {52}, year = {2024}, month = {02/2024}, abstract = {

The present dataset comprises a collection of RGB-D apple tree images that can be used to train and test computer vision-based fruit detection and sizing methods. This dataset encompasses two distinct sets of data obtained from a Fuji and an Elstar apple orchard. The Fuji apple orchard subset consists of 3925 RGB-D images containing a total of 15335 apples annotated with both modal and amodal apple segmentation masks. Modal masks denote the visible portions of the apples, whereas amodal masks encompass both visible and occluded apple regions. Notably, this dataset is the first public resource to incorporate on-tree fruit amodal masks. This pioneering inclusion addresses a critical gap in existing datasets, enabling the development of robust automatic fruit sizing methods and accurate fruit visibility estimation, particularly in the presence of partial occlusions. Besides the fruit segmentation masks, the dataset also includes the fruit size (calliper) ground truth for each annotated apple. The second subset comprises 2731 RGB-D images capturing five Elstar apple trees at four distinct growth stages. This subset includes mean diameter information for each tree at every growth stage and serves as a valuable resource for evaluating fruit sizing methods trained with the first subset. The present data was employed in the research papers titled {\textquotedblleft}Looking behind occlusions: a study on amodal segmentation for robust on-tree apple fruit size estimation{\textquotedblright} [1] and {\textquotedblleft}Simultaneous fruit detection and size estimation using multitask deep neural networks{\textquotedblright} [2].

}, keywords = {Agricultural robotics, amodal segmentation, depth image, Fruit measurement, Fruit visibility, Instance Segmentation, modal segmentation, Yield prediction}, doi = {https://doi.org/10.1016/j.dib.2023.110000}, author = {Gen{\'e}-Mola, Jordi and Ferrer-Ferrer, M. and Hemming, J. and Dalfsen, P. and Hoog, D. and Sanz-Cortiella, R. and Rosell-Polo, Joan R. and Morros, J.R. and Ver{\'o}nica Vilaplana and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @conference {cBonet23, title = {HyperFast: Instant Classification for Tabular Data}, booktitle = {38th Annual AAAI Conference on Artificial Intelligence (AAAI)}, year = {2024}, month = {12/2024}, abstract = {

Training deep learning models and performing hyperparameter tuning can be computationally demanding and time-consuming. Meanwhile, traditional machine learning methods like gradient-boosting algorithms remain the preferred choice for most tabular data applications, as neural network alternatives require extensive hyperparameter tuning or work only on toy datasets under limited settings. In this paper, we introduce HyperFast, a meta-trained hypernetwork designed for instant classification of tabular data in a single forward pass. HyperFast generates a task-specific neural network tailored to an unseen dataset that can be used directly for classification inference, removing the need to train a model. We report extensive experiments with OpenML and genomic data, comparing HyperFast to competing tabular data neural networks, traditional ML methods, AutoML systems, and boosting machines. HyperFast shows highly competitive results while being significantly faster. Additionally, our approach demonstrates robust adaptability across a variety of classification tasks with little to no fine-tuning, positioning HyperFast as a strong solution for numerous applications and rapid model deployment. HyperFast introduces a promising paradigm for fast classification, with the potential to substantially decrease the computational burden of deep learning.

}, author = {Bonet, David and Mas-Montserrat, Daniel and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @article {aPachon-Garcia, title = {SurvLIMEpy: A Python package implementing SurvLIME}, journal = {Expert Systems With Applications}, volume = {237, Part C}, year = {2024}, month = {03/2024}, abstract = {

In this paper we present SurvLIMEpy, an open-source Python package that implements the SurvLIME algorithm. This method makes it possible to compute local feature importance for machine learning algorithms designed for modelling Survival Analysis data. Our implementation takes advantage of the parallelisation paradigm, as all computations are performed in a matrix-wise fashion, which speeds up execution time. Additionally, SurvLIMEpy assists the user with visualization tools to better understand the results of the algorithm. The package supports a wide variety of survival models, from the Cox Proportional Hazards Model to deep learning models such as DeepHit or DeepSurv. Two types of experiments are presented in this paper. First, by means of simulated data, we study the ability of the algorithm to capture the importance of the features. Second, we use three open-source survival datasets together with a set of survival algorithms in order to demonstrate how SurvLIMEpy behaves when applied to different models.

}, keywords = {eXplainable Artificial Intelligence, Interpretable Machine Learning, machine learning, python, Survival Analysis}, author = {Cristian Pach{\'o}n-Garc{\'\i}a and Carlos Hernandez and Pedro Delicado and Ver{\'o}nica Vilaplana} } @conference {cBarrabes, title = {Adversarial Learning for Feature Shift Detection and Correction}, booktitle = {Neural Information Processing Systems (NeurIPS)}, year = {2023}, month = {12/2023}, address = {New Orleans, USA}, abstract = {

TL;DR: We introduce a framework inspired by adversarial learning to detect and correct the features that originate a distribution shift between datasets.

Data shift is a phenomenon present in many real-world applications, and while there are multiple methods that try to detect shifts, the task of localizing and correcting the features originating such shifts has not been studied in depth. Feature shifts can occur in many datasets, including multi-sensor data, where some sensors are malfunctioning, and tabular and structured data, including biomedical, financial, and survey data, where faulty standardization and data processing pipelines can lead to erroneous features. In this work, we explore using the principles of adversarial learning, where the information from several discriminators trained to distinguish between two distributions is used both to detect the corrupted features and to fix them in order to remove the distribution shift between datasets. We show that mainstream supervised classifiers, such as random forests or gradient boosting trees, combined with simple iterative heuristics, can localize and correct feature shifts, outperforming current statistical and neural network-based techniques.

}, author = {Barrab{\'e}s, M{\'\i}riam and Mas-Montserrat, Daniel and Geleta, Margarita and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @article {aCumplido-Mayoral22, title = {Biological Brain Age Prediction Using Machine Learning on Structural Neuroimaging Data: Multi-Cohort Validation Against Biomarkers of Alzheimer{\textquoteright}s Disease and Neurodegeneration stratified by sex}, journal = {eLife}, volume = {12}, year = {2023}, month = {04/2023}, abstract = {

Brain-age can be inferred from structural neuroimaging and compared to chronological age (brain-age delta) as a marker of biological brain aging. Accelerated aging has been found in neurodegenerative disorders like Alzheimer{\textquoteright}s disease (AD), but its validation against markers of neurodegeneration and AD is lacking. Here, imaging-derived measures from the UK Biobank dataset (N=22,661) were used to predict brain-age in 2,314 cognitively unimpaired (CU) individuals at higher risk of AD and in patients with mild cognitive impairment (MCI) from four independent cohorts with available biomarker data: ALFA+, ADNI, EPAD and OASIS. Brain-age delta was associated with abnormal amyloid-β, more advanced stages (AT) of AD pathology and APOE-ε4 status. Brain-age delta was positively associated with plasma neurofilament light, a marker of neurodegeneration, and sex differences in the brain effects of this marker were found. These results validate brain-age delta as a non-invasive marker of biological brain aging related to markers of AD and neurodegeneration.

}, issn = {2050-084X}, doi = {https://doi.org/10.7554/eLife.81067}, author = {Irene Cumplido-Mayoral and Marina Garc{\'\i}a-Prat and Gregory Operto and Carles Falcon and Mahnaz Shekari and Raffaele Cacciaglia and Marta Mila-Aloma and Luigi Lorenzini and Carolina Minguillon and Jose Luis Molinuevo and Marc Suarez-Calvet and Ver{\'o}nica Vilaplana and Juan Domingo Gispert} } @conference {cCumplido-Mayoral23a, title = {Brain-age mediates the association between modifiable risk factors and cognitive decline early in the AD continuum}, booktitle = {Alzheimer{\textquoteright}s Association International Conference (AAIC)}, year = {2023}, month = {07/2023}, address = {Amsterdam, Netherlands}, author = {Irene Cumplido-Mayoral and Anna Brugulat-Serrat and Gonzalo S{\'a}nchez-Benavides and Armand Gonz{\'a}lez-Escalante and Federica Anastasi and Marta Mila-Aloma and Carles Falcon and Mahnaz Shekari and Raffaele Cacciaglia and Carolina Minguillon and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @conference {cCumplido-Mayoral23, title = {Brain-age prediction and its associations with glial and synaptic CSF markers}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2023}, month = {07/2023}, address = {Amsterdam, Netherlands}, author = {Irene Cumplido-Mayoral and Marta Mila-Aloma and Carles Falcon and Raffaele Cacciaglia and Carolina Minguillon and Karine Fauria and Jose Luis Molinuevo and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @article {ade-Mas-Gimenez23, title = {Gradient-Based Metrics for the Evaluation of Image Defogging}, journal = {World Electric Vehicle Journal}, volume = {14}, year = {2023}, month = {09/2023}, chapter = {254}, abstract = {

Fog, haze, or smoke are standard atmospheric phenomena that dramatically compromise the overall visibility of any scene, critically affecting features such as the illumination, contrast, and contour detection of objects. The decrease in visibility compromises the performance of computer vision algorithms such as pattern recognition and segmentation, some of which are very relevant to decision-making in the field of autonomous vehicles. Several dehazing methods have been proposed that either need to estimate fog parameters through physical models or are statistically based. However, physical parameters greatly depend on the scene conditions, and statistically based methods require large datasets of natural foggy images together with the original images without fog, i.e., the ground truth, for evaluation. Obtaining proper fog-less ground truth images for pixel-to-pixel evaluation is costly and time-consuming, and this fact hinders progress in the field. This paper aims to tackle this issue by proposing gradient-based metrics for image defogging evaluation that do not require a ground truth image without fog or a physical model. A comparison of the proposed metrics with metrics already used in the NTIRE 2018 defogging challenge, as well as several state-of-the-art defogging evaluation metrics, is performed to prove their effectiveness in a general situation, showing comparable results to conventional metrics and an improvement in the no-reference scenario. A Matlab implementation of the proposed metrics has been developed and is open-sourced in a public GitHub repository.

}, doi = {10.3390/wevj14090254}, url = {https://www.mdpi.com/2032-6653/14/9/254}, author = {Gerard de-Mas-Gim{\'e}nez and Pablo Garc{\'\i}a-G{\'o}mez and Casas, J. and S. Royo} } @phdthesis {dMosella-Montoro23, title = {Graph Convolutional Neural Networks for 3D Data Analysis}, volume = {Phd}, year = {2023}, school = {Universitat Polit{\`e}cnica de Catalunya}, address = {Barcelona}, abstract = {

Deep Learning allows the extraction of complex features directly from raw input data, eliminating the need for the hand-crafted features of the classical Machine Learning pipeline. This new paradigm brought a boost in performance across several domains, including computer vision, natural language processing and audio processing. However, there are still challenges when dealing with unorganized structures. This thesis addresses this challenge using Graph Convolutional Neural Networks, a new set of techniques capable of managing graph structures that can be used for processing 3D data.

The first part of the thesis focuses on the Graph Analysis task, in which we study the capabilities of Graph Convolutional Neural Networks to capture the intrinsic geometric information of 3D data. We propose the Attention Graph Convolution layer, which learns to infer the kernel used during the convolution, taking into account the particularities of each neighbourhood of the graph. We explore two variants of the Attention Graph Convolution layer: one that follows a residual approach and another that allows the convolution to combine different neighbourhood domains. Furthermore, we propose a set of 3D pooling layers that mimic the behaviour of the pooling layers found in common 2D Convolutional Neural Network architectures. Finally, we present a 2D-3D Fusion block capable of merging the 3D geometric information obtained from a Graph Convolutional Neural Network with the texture information obtained by a 2D Convolutional Neural Network. We evaluate the presented contributions on the RGB-D Scene Classification task.

The second part of this thesis focuses on the Node Analysis task, which consists of extracting features at the node level, taking into account the neighbourhood structure. We present the Multi-Aggregator Graph Convolution layer, which uses a multiple-aggregator approach to generalize better to unseen topologies and learn better local representations. In addition, it reduces the memory footprint with respect to the Attention Graph Convolution layer. Finally, we analyze the capabilities of our proposed Graph Convolution layers to deal with heterogeneous graphs, where the nodes of the graph may belong to different modalities. We evaluate the presented contributions on the Computer Graphics process of skinning a character mesh. Specifically, we propose a Two-Stream Graph Neural Network capable of predicting the skinning weights of a 3D character.

}, url = {http://hdl.handle.net/10803/689400}, author = {Mosella-Montoro, Albert}, editor = {Ruiz-Hidalgo, J.} } @conference {cHernandez23, title = {Interpreting Machine Learning models for Survival Analysis: A study of Cutaneous Melanoma using the SEER Database}, booktitle = {XAI-Healthcare 2023 Workshop at 21st International Conference of Artificial Intelligence in Medicine (AIME 2023)}, year = {2023}, month = {06/2023}, address = {Portoroz, Slovenia}, author = {Carlos Hernandez and Cristian Pach{\'o}n-Garc{\'\i}a and Pedro Delicado and Ver{\'o}nica Vilaplana} } @phdthesis {dFernandez23, title = {Knowledge graph population from news streams}, volume = {Doctorate}, year = {2023}, month = {10/2023}, school = {Universitat Polit{\`e}cnica de Catalunya}, type = {Industrial}, address = {Barcelona, Catalonia}, abstract = {

Media producers publish large amounts of multimedia content online: text, audio, images and video. As the online media market grows, the management and delivery of contents becomes a challenge. Semantic and linking technologies can be used to organize and exploit these contents through the use of knowledge graphs. This industrial doctorate dissertation addresses the problem of constructing knowledge resources and integrating them into a system used by media producers to manage and explore their contents. For that purpose, knowledge graphs and their maintenance through Information Extraction (IE) from news streams are studied. This thesis presents solutions for multimedia understanding and knowledge extraction from online news, and their exploitation in real product applications, and it is structured in three parts.

The first part consists of the construction of IE tools that will be used for knowledge graph population. For that, we built a holistic Entity Linking (EL) system capable of combining multimodal data inputs to extract a set of semantic entities that describe news content. The EL system is followed by a Relation Extraction (RE) model that predicts relations between pairs of entities with a novel method based on entity-type knowledge. The final system is capable of extracting triples describing the contents of a news article.

The second part focuses on the automatic construction of a news event knowledge graph. We present an online multilingual system for event detection and comprehension from media feeds, called VLX-Stories. The system retrieves information from news sites, aggregates it into events (event detection), and summarizes them by extracting semantic labels of their most relevant entities (event representation) in order to answer the four Ws of journalism: who, what, when and where. This part of the thesis deals with the problems of Topic Detection and Tracking (TDT), topic modeling and event representation.

The third part of the thesis builds on top of the models developed in the two previous parts to populate a knowledge graph from aggregated news.
The system is completed with an emerging entity detection module, which detects mentions of previously unseen people appearing in the news and creates new knowledge graph entities for them. Finally, data validation and triple classification tools are added to increase the quality of the knowledge graph population.

This dissertation addresses many general knowledge graph and information extraction problems, such as knowledge dynamicity, self-learning, and quality assessment. Moreover, as this is an industrial work, we provide solutions that were deployed in production and verify our methods with real customers.

}, keywords = {Entity Linking, Information Extraction, Knowledge Graph Population, Named Entity Disambiguation, Named Entity Recognition, Natural Language Processing, Relation Extraction, Topic Detection and Tracking, Topic Modeling, Triple Validation}, author = {Fern{\`a}ndez, D{\`e}lia and Marqu{\'e}s, F. and Xavier Gir{\'o}-i-Nieto and Bou-Balust, Elisenda} } @conference {cPinab, title = {Layer-wise self-supervised learning on graphs}, booktitle = {KDD 2023 Workshop on Deep Learning on Graphs: Methods and Applications (DLG-KDD 2023)}, year = {2023}, month = {08/2023}, address = {Long Beach, USA}, abstract = {

End-to-end training of graph neural networks (GNN) on large graphs presents several memory and computational challenges, and limits the application to shallow architectures, as depth exponentially increases the memory and space complexities. In this manuscript, we propose Layer-wise Regularized Graph Infomax, an algorithm to train GNNs layer by layer in a self-supervised manner. We decouple the feature propagation and feature transformation carried out by GNNs to learn node representations in order to derive a loss function based on the prediction of future inputs. We evaluate the algorithm on large inductive graphs and show performance similar to other end-to-end methods with substantially increased efficiency, which enables the training of more sophisticated models on a single device. We also show that our algorithm avoids the oversmoothing of the representations, another common challenge of deep GNNs.

}, author = {Oscar Pina and Ver{\'o}nica Vilaplana} } @article {aGene-Mola23, title = {Looking behind occlusions: A study on amodal segmentation for robust on-tree apple fruit size estimation}, journal = {Computers and Electronics in Agriculture}, volume = {209}, year = {2023}, month = {04/2023}, abstract = {

The detection and sizing of fruits with computer vision methods is of interest because it provides relevant information to improve the management of orchard farming. However, the presence of partially occluded fruits limits the performance of existing methods, making reliable fruit sizing a challenging task. While previous fruit segmentation works limit segmentation to the visible region of fruits (known as modal segmentation), in this work we propose an amodal segmentation algorithm to predict the complete shape, which includes its visible and occluded regions. To do so, an end-to-end convolutional neural network (CNN) for simultaneous modal and amodal instance segmentation was implemented. The predicted amodal masks were used to estimate the fruit diameters in pixels. Modal masks were used to identify the visible region and measure the distance between the apples and the camera using the depth image. Finally, the fruit diameters in millimetres (mm) were computed by applying the pinhole camera model. The method was developed with a Fuji apple dataset consisting of 3925 RGB-D images acquired at different growth stages with a total of 15,335 annotated apples, and was subsequently tested in a case study to measure the diameter of Elstar apples at different growth stages. Fruit detection results showed an F1-score of 0.86, and the fruit diameter results reported a mean absolute error (MAE) of 4.5 mm and R2 = 0.80 irrespective of fruit visibility. Besides the diameter estimation, modal and amodal masks were used to automatically determine the percentage of visibility of measured apples. This feature was used as a confidence value, improving the diameter estimation to MAE = 2.93 mm and R2 = 0.91 when limiting the size estimation to fruits detected with a visibility higher than 60\%. The main advantages of the present methodology are its robustness for measuring partially occluded fruits and the capability to determine the visibility percentage. The main limitation is that depth images were generated by means of photogrammetry methods, which limits the efficiency of data acquisition. To overcome this limitation, future works should consider the use of commercial RGB-D sensors. The code and the dataset used to evaluate the method have been made publicly available at https://github.com/GRAP-UdL-AT/Amodal_Fruit_Sizing.

}, keywords = {deep learning, Fruit detection, Fruit measurement, Fruit visibility, Precision agriculture, Yield estimation}, issn = {0168-1699}, doi = {https://doi.org/10.1016/j.compag.2023.107854}, url = {https://authors.elsevier.com/sd/article/S0168-1699(23)00242-9}, author = {Gen{\'e}-Mola, Jordi and Ferrer-Ferrer, M. and Gregorio, Eduard and Blok, P. M. and Hemming, J. and Morros, J.R. and Rosell-Polo, Joan R. and Ver{\'o}nica Vilaplana and Ruiz-Hidalgo, J.} } @article {aDominguez, title = {Neural ADMIXTURE: rapid population clustering with autoencoders}, journal = {Nature Computational Science}, year = {2023}, month = {07/2023}, abstract = {

Characterizing the genetic substructure of large cohorts has become increasingly important as genetic association and prediction studies are extended to massive, increasingly diverse biobanks. ADMIXTURE and STRUCTURE are widely used unsupervised clustering algorithms for characterizing such ancestral genetic structure. These methods decompose individual genomes into fractional cluster assignments, with each cluster representing a vector of DNA marker frequencies. The assignments, and clusters, provide an interpretable representation for geneticists to describe population substructure at the sample level. However, with the rapidly increasing size of population biobanks and the growing numbers of variants genotyped (or sequenced) per sample, such traditional methods become computationally intractable. Furthermore, multiple runs with different hyperparameters are required to properly depict the population clustering using these traditional methods, increasing the computational burden; this can lead to days of compute. In this work we present Neural ADMIXTURE, a neural network autoencoder that follows the same modeling assumptions as ADMIXTURE, providing similar (or better) clustering while reducing the compute time by orders of magnitude. Indeed, the equivalent of one month of continuous compute can be reduced to hours. In addition, Neural ADMIXTURE can include multiple outputs, providing results equivalent to running the original ADMIXTURE algorithm many times with different numbers of clusters. Our models can also be stored, allowing later cluster assignment to be performed in linear computational time.

}, author = {Dominguez, Albert and Mas-Montserrat, Daniel and Bustamante, Carlos and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @conference {cLozano23, title = {Optical Phased Array Antenna Apodization for Lidar in Autonomous Vehicles}, booktitle = {XIII Reuni{\'o}n OptoElectr{\'o}nica}, year = {2023}, month = {06/2023}, publisher = {OPTOEL}, organization = {OPTOEL}, address = {Sevilla, Spain}, abstract = {

This paper presents the specific design of an Optical Phased Array antenna (OPA) to apodize the emission of a lidar, in the context of a project where diverse optoelectronic sensors such as cameras, radars, and commercial lidars provide data to be fused in order to develop perception for robots such as future autonomous vehicles. While mechanically based lidars are already commercially available, this work focuses on designing much more robust and potentially cheaper lidars based on photonic integrated circuits, optimizing energy through the apodization of the emission of the OPA.

}, url = {https://www.optoel2023.es/94299/section/44205/optoel-2023.html}, author = {Jos{\'e} Lozano and Humberto Jim{\'e}nez and Sergio Torres and Pau Biosca and Bernat Fontanet and Jorge Pinazo and Adolfo Ler{\'\i}n and Federico Dios and Casas, J. and Jos{\'e} Antonio L{\'a}zaro} } @article {aPlasencia23, title = {A Preliminary Study of Deep Learning Sensor Fusion for Pedestrian Detection}, journal = {Sensors}, volume = {23}, year = {2023}, month = {04/2023}, chapter = {4167}, abstract = {

Most pedestrian detection methods focus on bounding boxes based on fusing RGB with lidar. These methods do not relate to how the human eye perceives objects in the real world. Furthermore, lidar and vision can have difficulty detecting pedestrians in scattered environments, and radar can be used to overcome this problem. Therefore, the motivation of this work is to explore, as a preliminary step, the feasibility of fusing lidar, radar, and RGB for pedestrian detection, with potential use in autonomous driving, by means of a fully connected convolutional neural network architecture for multimodal sensors. The core of the network is based on SegNet, a pixel-wise semantic segmentation network. In this context, lidar and radar were incorporated by transforming them from 3D pointclouds into 2D gray images with 16-bit depths, and RGB images were incorporated with three channels. The proposed architecture uses a single SegNet for each sensor reading, and the outputs are then applied to a fully connected neural network to fuse the three sensor modalities. Afterwards, an up-sampling network is applied to recover the fused data. Additionally, a custom dataset of 60 images was proposed for training the architecture, with an additional 10 for evaluation and 10 for testing, giving a total of 80 images. The experiment results show a training mean pixel accuracy of 99.7\% and a training mean intersection over union of 99.5\%. Also, the testing mean IoU was 94.4\% and the testing pixel accuracy was 96.2\%. These metric results have successfully demonstrated the effectiveness of using semantic segmentation for pedestrian detection with these three sensor modalities. Despite some overfitting in the model during experimentation, it performed well in detecting people in test mode. Therefore, it is worth emphasizing that the focus of this work is to show that this method is feasible, as it works regardless of the size of the dataset, although a bigger dataset would be necessary to achieve more appropriate training. This method gives the advantage of detecting pedestrians as the human eye does, thereby resulting in less ambiguity. Additionally, this work has also proposed an extrinsic calibration matrix method for sensor alignment between radar and lidar based on singular value decomposition.

}, keywords = {autonomous driving, Convolutional Neural Networks, sensor calibration, sensor fusion}, doi = {10.3390/s23084167}, url = {https://www.mdpi.com/1424-8220/23/8/4167}, author = {Alfredo Ch{\'a}vez Plasencia and Pablo Garc{\'\i}a-G{\'o}mez and Eduardo Bernal P{\'e}rez and Gerard de-Mas-Gim{\'e}nez and Casas, J. and S. Royo} } @conference {cMorros, title = {Real-time lane classification and accident detection for safer micromobility}, booktitle = {11th International Congress on Transportation Research}, year = {2023}, month = {09/2023}, address = {Heraklion, Crete}, abstract = {

E-scooter users{\textquoteright} lack of knowledge of micromobility regulations is an important factor behind some of the accidents involving these vehicles. We present two modules that can increase the safety of users and pedestrians. First, a computer vision model that analyses the video feed captured with a smartphone attached to the e-scooter and predicts in real time the type of lane in which the user is riding. This knowledge is used by an application that combines it with GNSS location information and a database of mobility regulations, and informs the user when he/she is not complying with these regulations. Second, an accident detection system, using the smartphone accelerometer, that detects whether there is a fall during the ride, so that the app can contact the authorities to determine the appropriate response. Experiments show excellent results for both modules.

}, keywords = {accident detection, computer Vision, deep learning, lane classification, Micromobility, RideSafeUM}, author = {Morros, J.R. and Broquetas, A. and Mateo, A. and Puig, J. and Davins, M.} } @conference {cTarresa, title = {Sign Language Translation from Instructional Videos}, booktitle = {CVPR 2023 Women in Computer Vision Workshop}, year = {2023}, month = {04/2023}, publisher = {Computer Vision Foundation / IEEE}, organization = {Computer Vision Foundation / IEEE}, address = {Vancouver, Canada}, abstract = {

The advances in automatic sign language translation (SLT) to spoken languages have mostly been benchmarked with datasets of limited size and restricted domains. Our work advances the state of the art by providing the first baseline results on How2Sign, a large and broad dataset. We train a Transformer over I3D video features, using the reduced BLEU as a reference metric for validation, instead of the widely used BLEU score. We report a BLEU score of 8.03, and publish the first open-source implementation of its kind to promote further advances.

}, author = {Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Amanda Duarte and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @article {aFerrer-Ferrer, title = {Simultaneous Fruit Detection and Size Estimation Using Multitask Deep Neural Networks }, journal = {Biosystems Engineering}, volume = {233}, year = {2023}, month = {09/2023}, pages = {63-75}, abstract = {

The measurement of fruit size is of great interest to estimate the yield and predict the harvest resources in advance. This work proposes a novel technique for in-field apple detection and measurement based on Deep Neural Networks. The proposed framework was trained with RGB-D data and consists of an end-to-end multitask Deep Neural Network architecture specifically designed to perform the following tasks: 1) detection and segmentation of each fruit from its surroundings; 2) estimation of the diameter of each detected fruit. The methodology was tested with a total of 15335 annotated apples at different growth stages, with diameters varying from 27 mm to 95 mm. Fruit detection results reported an F1-score for apple detection of 0.88 and a mean absolute error of diameter estimation of 5.64 mm. These are state-of-the-art results with the additional advantages of: a) using an end-to-end multitask trainable network; b) an efficient and fast inference speed; and c) being based on RGB-D data, which can be acquired with affordable depth cameras. On the contrary, the main disadvantage is the need to annotate a large amount of data with fruit masks and diameter ground truth to train the model. Finally, a fruit visibility analysis showed an improvement in the prediction when limiting the measurement to apples above 65\% visibility (mean absolute error of 5.09 mm). This suggests that future works should develop a method for automatically identifying the most visible apples and discarding the prediction of highly occluded fruits.

}, keywords = {deep learning, Fruit measurement, Fruit visibility, Precision agriculture, Yield estimation}, doi = {https://doi.org/10.1016/j.biosystemseng.2023.07.010}, author = {Ferrer-Ferrer, M. and Ruiz-Hidalgo, J. and Gregorio, Eduard and Ver{\'o}nica Vilaplana and Morros, J.R. and Gen{\'e}-Mola, Jordi} } @conference {cCaselles, title = {SIRA: Relightable Avatars from a Single Image}, booktitle = {Winter Conference on Applications of Computer Vision (WACV)}, year = {2023}, abstract = {

Recovering the geometry of a human head from a single image, while factorizing the materials and illumination, is a severely ill-posed problem that requires prior information to be solved. Methods based on 3D Morphable Models (3DMM), and their combination with differentiable renderers, have shown promising results. However, the expressiveness of 3DMMs is limited, and they typically yield over-smoothed and identity-agnostic 3D shapes limited to the face region. Highly accurate full head reconstructions have recently been obtained with neural fields that parameterize the geometry using multilayer perceptrons. The versatility of these representations has also proved effective for disentangling geometry, materials and lighting. However, these methods require several tens of input images. In this paper, we introduce SIRA, a method which, from a single image, reconstructs human head avatars with high fidelity geometry and factorized lights and surface materials. Our key ingredients are two data-driven statistical models based on neural fields that resolve the ambiguities of single-view 3D surface reconstruction and appearance factorization. Experiments show that SIRA obtains state-of-the-art results in 3D head reconstruction while at the same time it successfully disentangles the global illumination and the diffuse and specular albedos. Furthermore, our reconstructions are amenable to physically-based appearance editing and head model relighting.

}, author = {Caselles, Pol and Ramon, Eduard and Garcia, Jaime and Xavier Gir{\'o}-i-Nieto and Moreno, Francesc and Triginer, Gil} } @article {aPardas23, title = {Stromal tissue segmentation in Ki67 histology images based on cytokeratin-19 stain translation}, journal = {JOURNAL OF MEDICAL IMAGING}, volume = {10}, year = {2023}, month = {06/2023}, abstract = {

Purpose

The diagnosis and prognosis of breast cancer rely on histopathology image analysis. In this context, proliferation markers, especially Ki67, are increasingly important. The diagnosis using these markers is based on the quantification of proliferation, which implies counting Ki67-positive and Ki67-negative tumoral cells in epithelial regions, thus excluding stromal cells. However, stromal cells are often very difficult to distinguish from negative tumoral cells in Ki67 images, which often leads to errors when automatic analysis is used.

Approach

We study the use of automatic semantic segmentation based on convolutional neural networks (CNNs) to separate stromal and epithelial areas in Ki67-stained images. CNNs need to be accurately trained with extensive databases with associated ground truth. As such databases are not publicly available, we propose a method to produce them with minimal manual labeling effort. Inspired by the procedure used by pathologists, we have produced the database relying on knowledge transfer from cytokeratin-19 images to Ki67 using an image-to-image (I2I) translation network.

Results

The automatically produced stroma masks are manually corrected and used to train a CNN that predicts very accurate stroma masks for unseen Ki67 images. An F-score value of 0.87 is achieved. Examples of the effect on the Ki67 score show the importance of the stroma segmentation.

Conclusions

An I2I translation method has proved very useful for building ground-truth labeling in a task where manual labeling is unfeasible. With reduced correction effort, a dataset can be built to train neural networks for the difficult problem of separating epithelial regions from stroma in stained images, where separation is very hard without additional information.

}, isbn = {2329-4302}, doi = {https://doi.org/10.1117/1.JMI.10.3.037502}, author = {M. Pard{\`a}s and D. Anglada and M. Espina and F. Marques and Salembier, P.} } @conference {cHurtado23, title = {Study of Manifold Geometry using Multiscale Non-Negative Kernel Graphs}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, year = {2023}, month = {06/2023}, address = {Rhodes Island, Greece}, isbn = {978-1-7281-6327-7}, doi = {https://doi.org/10.1109/ICASSP49357.2023.10095956}, author = {Hurtado, C. and Shekkizhar, S. and Ruiz-Hidalgo, J. and Ortega, A.} } @article {xPerera22, title = {Ancestry-conditioned Generative Models for Genotyping}, year = {2022}, abstract = {

Local ancestry inference (LAI) identifies the ancestry of each segment of an individual{\textquoteright}s genome and is a critical step in the analysis of human genomes, with applications ranging from pharmacogenomics and personalized medicine to increasing the detection of genetic associations.

New LAI techniques are appearing at a fast pace in both industry and academic research, and large datasets of human genomic sequences from the ancestries of interest are required to train those methods. Usually, those datasets are protected by privacy regulations, are proprietary, or are accessible only with restrictions due to their sensitive nature. An interesting way to overcome those difficulties is the generation of data samples that are similar enough to real sequences from the ancestries of interest. A generalized model can be openly shared because it contains no real individual information.

Thus, we present a class-conditional Generative Adversarial Model and a Conditional Generative Moment-Matching Network intended to generate new realistic genotypes of a desired ancestry. In addition, we present a privacy mechanism that extracts features from the real data and uses them to generate new realistic genotypes.

}, author = {Perera, Maria}, editor = {Mas-Montserrat, Daniel and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @conference {cCombaliae, title = {Artificial intelligence to predict positivity of sentinel lymph node biopsy in melanoma patients}, booktitle = {European Association of Dermato Oncology (EADO 2022)}, year = {2022}, month = {04/2022}, author = {Marc Combalia and Sebastian Podlipnik and Carlos Hernandez and Sergio Garc{\'\i}a and Joan Ficapal and Julio Burgos and Ver{\'o}nica Vilaplana and Josep Malvehy} } @conference {cCumplido-Mayoral22, title = {Biological brain age prediction using machine learning on structural neuroimaging data: Multi-cohort validation against biomarkers of Alzheimer{\textquoteright}s disease and neurodegeneration stratified by sex}, booktitle = {15th Clinical Trials on Alzheimer{\textquoteright}s Disease Conference (CTAD)}, year = {2022}, month = {11/2022}, address = {San Francisco, USA}, author = {Irene Cumplido-Mayoral and Marta Mila-Aloma and Luigi Lorenzini and Alle Meije Wink and H. Mutsaerts and Sven Haller and Ga{\"e}l Chetelat and Frederik Barkhof and Marc Suarez-Calvet and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @conference {cCumplido-Mayoral, title = {Biological Brain Age Prediction Using Machine Learning on Structural Neuroimaging Data: Multi-Cohort Validation Against Biomarkers of Alzheimer{\textquoteright}s Disease and Neurodegeneration}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2022}, month = {07/2022}, author = {Irene Cumplido-Mayoral and Marina Garc{\'\i}a-Prat and Greg Operto and Carles Falcon and Mahnaz Shekari and Raffaele Cacciaglia and Marta Mila-Aloma and Marc Suarez Calvet and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @conference {cBonet22, title = {Channel Redundancy and Overlap in Convolutional Neural Networks with Channel-Wise NNK Graphs}, booktitle = {International Conference on Acoustics, Speech and Signal Processing}, year = {2022}, month = {05/2022}, abstract = {

Feature spaces in the deep layers of convolutional neural networks (CNNs) are often very high-dimensional and difficult to interpret. However, convolutional layers consist of multiple channels that are activated by different types of inputs, which suggests that more insights may be gained by studying the channels and how they relate to each other. In this paper, we first theoretically analyze channel-wise non-negative kernel (CW-NNK) regression graphs, which allow us to quantify the overlap between channels and, indirectly, the intrinsic dimension of the data representation manifold. We find that redundancy between channels is significant and varies with the layer depth and the level of regularization during training. Additionally, we observe that there is a correlation between channel overlap in the last convolutional layer and generalization performance. Our experimental results demonstrate that these techniques can lead to a better understanding of deep representations.

}, author = {Bonet, D. and Ortega, A. and Ruiz-Hidalgo, J. and Shekkizhar, S.} } @article {canet2022context, title = {Context-unsupervised adversarial network for video sensors}, journal = {Sensors}, volume = {22}, year = {2022}, pages = {3171}, abstract = {

Foreground object segmentation is a crucial first step for surveillance systems based on networks of video sensors. This problem in the context of dynamic scenes has been widely explored in the last two decades, but it still has open research questions due to challenges such as strong shadows, background clutter and illumination changes. After years of solid work based on statistical background pixel modeling, most current proposals use convolutional neural networks (CNNs) either to model the background or to make the foreground/background decision. Although these new techniques achieve outstanding results, they usually require specific training for each scene, which is unfeasible if we aim at designing software for embedded video systems and smart cameras. Our approach to the problem does not require specific context or scene training, and thus no manual labeling. We propose a network for a refinement step on top of conventional state-of-the-art background subtraction systems. By using a statistical technique to produce a rough mask, we do not need to train the network for each scene. The proposed method can take advantage of the specificity of the classic techniques, while obtaining the highly accurate segmentation that a deep learning system provides. We also show the advantage of using an adversarial network to improve the generalization ability of the network and produce more consistent results than an equivalent non-adversarial network. The results provided were obtained by training the network on a common database, without fine-tuning for specific scenes. Experiments on the unseen part of the CDNet database provided an F-score of 0.82, and 0.87 was achieved on the LASIESTA database, which is unrelated to the training one. On this last database, the results outperformed those available in the official table by 8.75\%. The results achieved for CDNet are well above those of the methods not based on CNNs and, according to the literature, among the best for context-unsupervised CNN systems.

}, author = {Canet Tarr{\'e}s, Gemma and Pard{\`a}s, Montse} } @conference {cHernandez22a, title = {Contrastive and attention-based multiple instance learning for the prediction of sentinel lymph node status from histopathologies of primary melanoma tumours.}, booktitle = {Cancer Prevention through early detecTion (Caption) Workshop at 25th International Conference on Medical Image Computing and Computer Assisted Intervention (MICCAI 2022)}, year = {2022}, month = {09/2022}, abstract = {

Sentinel lymph node status is a crucial prognostic factor for melanomas; nonetheless, the invasive surgery required to obtain it always puts the patient at risk. In this study, we develop a Deep Learning-based approach to predict lymph node metastasis from Whole Slide Images of primary tumours. Albeit very informative, these images come with complexities that hamper their use in machine learning applications, namely their large size and limited datasets. We propose a pre-training strategy based on self-supervised contrastive learning to extract better image feature representations and an attention-based Multiple Instance Learning approach to enhance the model{\textquoteright}s performance. With this work, we quantitatively demonstrate that combining both methods improves various classification metrics and qualitatively show that contrastive learning encourages the network to assign higher attention scores to tumour tissue and lower scores to image artifacts.

}, author = {Carlos Hernandez and Marc Combalia and Susana Puig and Josep Malvehy and Ver{\'o}nica Vilaplana} } @phdthesis {dDuarte, title = {Data and methods for a visual understanding of sign languages}, volume = {PhD}, year = {2022}, type = {International Mention}, abstract = {

Signed languages are complete and natural languages used as the first or preferred mode of communication by millions of people worldwide. However, they, unfortunately, continue to be marginalized languages. Designing, building, and evaluating models that work on sign languages presents compelling research challenges and requires interdisciplinary and collaborative efforts. The recent advances in Machine Learning (ML) and Artificial Intelligence (AI) have the power to enable better accessibility to sign language users and narrow down the existing communication barrier between the Deaf community and non-sign language users. However, recent AI-powered technologies still do not account for sign language in their pipelines. This is mainly because sign languages are visual languages that use manual and non-manual features to convey information and do not have a standard written form. Thus, the goal of this thesis is to contribute to the development of new technologies that account for sign language by creating large-scale multimodal resources suitable for training modern data-hungry machine learning models and by developing automatic systems that focus on computer vision tasks related to sign language, aiming at a better visual understanding of sign languages.

Thus, in Part I, we introduce the How2Sign dataset, which is a large-scale collection of multimodal and multiview sign language videos in American Sign Language. In Part II, we contribute to the development of technologies that account for sign languages by presenting in Chapter 4 a framework called Spot-Align, based on sign spotting methods, to automatically annotate sign instances in continuous sign language. We further present the benefits of this framework and establish a baseline for the sign language recognition task on the How2Sign dataset. In addition to that, in Chapter 5 we benefit from the different annotations and modalities of How2Sign to explore sign language video retrieval by learning cross-modal embeddings. Later, in Chapter 6, we explore sign language video generation by applying Generative Adversarial Networks to the sign language domain and assess if and how well sign language users can understand automatically generated sign language videos by proposing an evaluation protocol based on How2Sign topics and English translation.

}, author = {Amanda Duarte}, editor = {Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @article {aTemprana-Salvador22, title = {DigiPatICS: Digital Pathology Transformation of the Catalan Health Institute Network of 8 Hospitals - Planification, Implementation and Preliminary Results}, journal = {Diagnostics}, volume = {12}, year = {2022}, month = {03/2022}, chapter = {852}, abstract = {

Complete digital pathology transformation for primary histopathological diagnosis is a challenging yet rewarding endeavor. Its advantages are clear with more efficient workflows, but there are many technical and functional difficulties to be faced. The Catalan Health Institute (ICS) has started its DigiPatICS project, aiming to deploy digital pathology in an integrative, holistic, and comprehensive way within a network of 8 hospitals, over 168 pathologists, and over 1 million slides each year. We describe the bidding process and the careful planning that was required, followed by swift implementation in stages. The purpose of the DigiPatICS project is to increase patient safety and quality of care, improving diagnosis and the efficiency of processes in the pathological anatomy departments of the ICS through process improvement, digital pathology, and artificial intelligence tools.

}, keywords = {artificial intelligence, computational pathology, deep learning, digital pathology, implementation, LIS, primary diagnosis, telepathology, workflow}, doi = {10.3390/diagnostics12040852}, url = {https://www.mdpi.com/2075-4418/12/4/852}, author = {Jordi Temprana-Salvador and Pau L{\'o}pez-Garc{\'\i}a and Josep Castellv{\'\i} Vives and Llu{\'\i}s de Haro and Eudald Ballesta and Matias Rojas Abusleme and Miquel Arrufat and Ferran Marques and Casas, J. and Carlos Gallego and Laura Pons and Jos{\'e} Luis Mate and Pedro Luis Fern{\'a}ndez and Eugeni L{\'o}pez-Bonet and Ramon Bosch and Salom{\'e} Mart{\'\i}nez and Santiago Ram{\'o}n y Cajal and Xavier Matias-Guiu} } @article {aPedersen22, title = {Experimental confirmation of efficient island divertor operation and successful neoclassical transport optimization in Wendelstein 7-X}, journal = {Nuclear Fusion}, volume = {62}, year = {2022}, month = {04/2022}, issn = {1741-4326}, doi = {10.1088/1741-4326/ac2cf5}, author = {Thomas Sunn Pedersen and al et and Casas, J. and Salembier, P.} } @mastersthesis {xMohamed22, title = {Exploring Visual Representations for Sign Language Translation}, year = {2022}, abstract = {

The Sign Language Translation (SLT) task has been addressed with multiple approaches in recent years. In this work we aim to investigate the impact of using different types of visual sign language representations for SLT. For this investigation we use the state of the art in SLT, the Sign Language Transformers model. We compare the translation performance obtained with two types of body pose estimation models as skeleton extractors, and with 2D CNN features trained on the test dataset. The latter perform best, and I3D features outperform the pose estimation-based ones.

}, author = {Maram A. Mohamed}, editor = {Laia Tarr{\'e}s and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dRamon22, title = {Few-shot 3D Reconstruction of Body Parts with Deep Neural Networks}, volume = {Excellent}, year = {2022}, month = {09/2022}, type = {Industrial}, abstract = {

In this thesis, we study the problem of reconstructing objects from a concrete category in 3D when few images are available as input, i.e., fewer than 10. We apply our findings to digitalizing human body parts such as heads and torsos for medical applications. The first part of the thesis explores systems that rely on 3D Morphable Models. When approaching a concrete task, training such systems requires expensive manual hyperparameter tuning of both the architecture and the loss, which is time-consuming. We focus on designing novel losses without hyperparameters and modular architectures that allow models to be trained without tuning effort. We also aim at providing a fine alignment between the 3D space and the image space by estimating camera poses with a low re-projection error, which further improves the texturing process in 3D modelling applications or the rendering process in augmented reality applications. Our findings lead to systems that are very stable and that naturally scale to different scenes.

While 3D Morphable Models are fast and robust, they are still very limited in terms of accuracy and expressiveness, which can be prohibitive for applications that require high fidelity. A promising alternative to 3D Morphable Models are implicit functions, which in combination with differentiable rendering techniques have shown impressive results at reconstructing 3D surfaces. However, the latter require large sets of images at test time to obtain satisfactory results. In the second part of the thesis, we propose to use a probabilistic model that represents a distribution of implicit surfaces in combination with a differentiable renderer to reduce the number of images required at test time. The resulting 3D reconstruction system is highly accurate and can reconstruct a wide variety of human head shapes when only 3 images are available.

}, author = {Ramon, Eduard}, editor = {Moreno, Francesc and Garcia, Jaime and Xavier Gir{\'o}-i-Nieto} } @conference {cMas-Montserrat, title = {Generative Moment Matching Networks for Genotype Simulation}, booktitle = {44th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC{\textquoteright}22)}, year = {2022}, abstract = {

The generation of synthetic genomic sequences using neural networks has the potential to overcome privacy and data sharing restrictions and to mitigate potential bias within datasets due to under-representation of some population groups. However, there is no consensus on which architectures, training procedures, and evaluation metrics should be used when simulating single nucleotide polymorphism (SNP) sequences with neural networks. In this paper, we explore the use of Generative Moment Matching Networks (GMMNs) for SNP simulation, present some architectural and procedural changes to properly train the networks, and introduce an evaluation scheme to qualitatively and quantitatively assess the quality of the simulated sequences.
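
As an illustration of the training signal behind moment matching networks, the following NumPy sketch computes the (biased) squared Maximum Mean Discrepancy between a batch of real and a batch of generated sequences with an RBF kernel. The bandwidth, batch shapes, and toy binary data are our own assumptions, not taken from the paper.

import numpy as np

def rbf_kernel(a, b, bandwidth):
    # Pairwise squared Euclidean distances between rows of a and b.
    d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2.0 * bandwidth ** 2))

def mmd2(x, y, bandwidth=1.0):
    # Biased estimate of the squared Maximum Mean Discrepancy.
    kxx = rbf_kernel(x, x, bandwidth).mean()
    kyy = rbf_kernel(y, y, bandwidth).mean()
    kxy = rbf_kernel(x, y, bandwidth).mean()
    return kxx + kyy - 2.0 * kxy

rng = np.random.default_rng(0)
real = rng.binomial(1, 0.3, size=(64, 100))   # stand-in for real SNP rows
fake = rng.binomial(1, 0.5, size=(64, 100))   # stand-in for generated rows
print(mmd2(real.astype(float), fake.astype(float)))

A GMMN-style generator would minimize this quantity (implemented with a differentiable framework) between generated batches and real data.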

}, author = {Mas-Montserrat, Daniel and Perera, Maria and Barrab{\'e}s, M{\'\i}riam and Geleta, Margarita and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @mastersthesis {xDomenech, title = {Hiding Images in their Spoken Narratives}, year = {2022}, abstract = {

Steganography is the technique of hiding secret data within an ordinary, non-secret file or message in order to avoid its detection. In this work, we study the case where the hidden secret data is an image and the cover signal is an audio recording. To this end, we use a recently proposed residual architecture operating on top of short-time discrete cosine transform (STDCT) audio spectrograms. We evaluate this residual steganography architecture on the Localized Narratives dataset, explore the feasibility of using short-time Fourier transform (STFT) audio spectrograms instead of STDCTs to improve the efficiency of the system, investigate permuting the hidden signals to spread the audio corruption across the revealed images, apply averaged audio windows to improve quality, and test the system under real-world distortions.
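
To make the spectrogram domain concrete, here is a minimal NumPy/SciPy sketch of a short-time DCT round trip with a toy additive embedding. The frame length, embedding strength, and non-overlapping frames are our own simplifications; unlike the learned residual network, this naive scheme also needs the original spectrogram at reveal time.

import numpy as np
from scipy.fft import dct, idct

def stdct(signal, frame_len=1024):
    # Short-time DCT: type-II DCT of non-overlapping frames (a simplification;
    # the thesis pipeline and its window/hop settings may differ).
    n = (len(signal) // frame_len) * frame_len
    frames = signal[:n].reshape(-1, frame_len)
    return dct(frames, type=2, norm='ortho', axis=1)

def istdct(spec):
    frames = idct(spec, type=2, norm='ortho', axis=1)
    return frames.reshape(-1)

rng = np.random.default_rng(0)
audio = rng.standard_normal(16384)
spec = stdct(audio)
secret = rng.random(spec.shape)             # stand-in for a flattened image
alpha = 1e-2                                # embedding strength
container = istdct(spec + alpha * secret)   # additive (residual-style) embedding
revealed = (stdct(container) - spec) / alpha
print(np.allclose(revealed, secret, atol=1e-6))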

}, author = {Teresa Domenech}, editor = {McGuinness, Kevin and Pons, Jordi and Xavier Gir{\'o}-i-Nieto} } @conference {cSchurholt22, title = {Hyper-Representations as Generative Models: Sampling Unseen Neural Network Weights}, booktitle = {NeurIPS 2022 - Neural Information Processing Systems}, year = {2022}, abstract = {

Learning representations of neural network weights given a model zoo is an emerging and challenging area with many potential applications, from model inspection to neural architecture search and knowledge distillation. Recently, an autoencoder trained on a model zoo was able to learn a hyper-representation, which captures intrinsic and extrinsic properties of the models in the zoo. In this work, we extend hyper-representations for generative use, sampling new model weights for pre-training. We propose layer-wise loss normalization, which we demonstrate is key to generating high-performing models, and a sampling method based on the empirical density of hyper-representations. The models generated using our methods are diverse, performant, and capable of outperforming conventional baselines for transfer learning. Our results indicate the potential of knowledge aggregation from model zoos to new models via hyper-representations, paving the way for novel research directions.
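
A minimal sketch of the sampling idea, assuming a trained encoder has already produced latent hyper-representations: fit an empirical density with a kernel density estimator and decode draws from it into candidate weight vectors. The decoder below is a placeholder, not the paper's model.

import numpy as np
from scipy.stats import gaussian_kde

rng = np.random.default_rng(0)
latents = rng.standard_normal((200, 8))   # stand-in hyper-representations
kde = gaussian_kde(latents.T)             # empirical density over latent codes
new_z = kde.resample(16, seed=0).T        # 16 sampled latent codes

def decode(z):
    # Placeholder for the trained hyper-representation decoder, which
    # would map latent codes back to flattened NN weights.
    return np.tanh(z @ rng.standard_normal((8, 1024)))

new_weights = decode(new_z)
print(new_weights.shape)                  # (16, 1024) candidate weight vectors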

}, author = {Sch{\"u}rholt, Konstantin and Knyazev, Boris and Xavier Gir{\'o}-i-Nieto and Borth, Damian} } @article {xBarrabes22, title = {Machine Learning for Genomic Sequence Processing}, year = {2022}, abstract = {

Introduction to Research, BSc Data Science and Engineering, Autumn 2021:

Predicting Dog Phenotypes from Genotypes

In this paper, we analyze dog genotypes {\textendash} positions of DNA sequences that often vary between different dogs {\textendash} in order to predict the corresponding phenotypes {\textendash} unique characteristics that result from different genetic code. More specifically, given chromosome data from a dog, we aim to predict its breed category, height, and weight. We explore a variety of linear and non-linear classification and regression techniques to accomplish these three tasks. We also investigate the use of a neural network (in both linear and non-linear modes) for breed classification and compare its performance to traditional statistical methods. We show that linear methods generally outperform or match the performance of non-linear methods for breed classification; the reverse is true for height and weight regression. We also evaluate the results of all of these methods based on the number of input features used in the analysis and demonstrate that phenotypes can be predicted with as few as 0.5\% of the input features, and dog breeds can be classified with 50\% balanced accuracy with as few as 0.02\% of the full genomic sequences used in our analysis.

MergeGenome. A Python-based Toolkit for Merging VCF files

A challenge of genomic studies is the lack of easy-to-access and properly formatted datasets. When more than one dataset is available, it is often desirable to combine them, yet there is a lack of tools to duly merge genomic datasets without losing all non-matching features. To fill this gap, we present the MergeGenome toolkit, designed to integrate DNA sequences from two files in variant call format (VCF) while targeting data quality. MergeGenome is a robust pipeline of comprehensive steps to standardize nomenclature, remove ambiguities, correct flips, eliminate mismatches, select important features, and filter likely erroneous features (the latter with machine learning). MergeGenome is Python-based and relies on pre-existing software for manipulation and imputation of VCF data. We evaluate the result of merging two datasets of dog DNA sequences of dissimilar lengths and observe that genotype imputation with Beagle v5.1 usually fails for low-frequency alleles. As an alternative, we explore several multi-label machine learning classifiers; although K-Nearest Neighbors achieves competitive results, none of the methods tried outperforms Beagle v5.1.
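
As a toy illustration of the kind of bookkeeping such a merge involves (not MergeGenome's actual API), the sketch below aligns two variant lists on chromosome and position, separating exact matches, correctable strand flips, and true mismatches:

COMPLEMENT = {"A": "T", "T": "A", "C": "G", "G": "C"}

def match_variants(query, reference):
    # Each variant is a (chrom, pos, ref, alt) tuple, a toy stand-in
    # for the VCF records the toolkit processes.
    index = {(c, p): (r, a) for c, p, r, a in reference}
    kept, flipped, mismatched = [], [], []
    for chrom, pos, ref, alt in query:
        hit = index.get((chrom, pos))
        if hit is None:
            continue
        if (ref, alt) == hit:
            kept.append((chrom, pos))
        elif (COMPLEMENT.get(ref), COMPLEMENT.get(alt)) == hit:
            flipped.append((chrom, pos))     # correctable strand flip
        else:
            mismatched.append((chrom, pos))  # candidate for removal
    return kept, flipped, mismatched

q = [("1", 100, "A", "G"), ("1", 200, "T", "C"), ("1", 300, "A", "C")]
r = [("1", 100, "A", "G"), ("1", 200, "A", "G"), ("1", 300, "G", "T")]
print(match_variants(q, r))   # ([('1', 100)], [('1', 200)], [('1', 300)])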

}, author = {Barrab{\'e}s, M{\'\i}riam}, editor = {Mas-Montserrat, Daniel and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @article {pTerradas22, title = {A method, system and computer programs to automatically transform an image}, journal = {European Patent Office}, volume = {21382176}, year = {2022}, abstract = {

The present invention is directed, in general, to a method and a system to automatically transform an image using neural networks. More specifically, the invention relates to a controllable image generation through an image representation and several conditions using a conditional Neural Network.

The method comprises receiving, by a processing unit, at least one image and processing the received image to obtain an image representation thereof (i.e. an intermediate representation of the initial image that captures high-level features and low-level properties of the image and that is structured in a way understandable to a conditional Neural Network such as a deep generative Neural Network). The method also includes receiving, by an encoding unit, one or more references (e.g. other images, text, labels, combinations thereof, or even other data describing how the received image should be transformed) and encoding the received one or more references into one or more features, the latter being further provided to a conditional Neural Network as a condition(s). In addition, the method further applies the conditional Neural Network to transform the obtained image representation into a resulting conditioned image based on said condition(s).

}, issn = {21382176.2}, url = {https://patentscope.wipo.int/search/es/detail.jsf?docId=EP373278976}, author = {Terradas, R. and Pau Domingo and Grau, M. and Alarc{\'o}n, E. and Ruiz-Hidalgo, J.} } @conference {cSchurholt22a, title = {Model Zoos: A Dataset of Diverse Populations of Neural Network Models}, booktitle = {NeurIPS}, year = {2022}, month = {09/2022}, abstract = {

In recent years, neural networks (NN) have evolved from laboratory environments to the state-of-the-art for many real-world problems. It has been shown that NN models (i.e., their weights and biases) evolve on unique trajectories in weight space during training. It follows that a population of such neural network models (referred to as a model zoo) would form structures in weight space. We think that the geometry, curvature and smoothness of these structures contain information about the state of training and can reveal latent properties of individual models. With such model zoos, one could investigate novel approaches for (i) model analysis, (ii) discovering unknown learning dynamics, (iii) learning rich representations of such populations, or (iv) exploiting the model zoos for generative modelling of NN weights and biases. Unfortunately, the lack of standardized model zoos and available benchmarks significantly increases the friction for further research on populations of NNs. With this work, we publish a novel dataset of model zoos containing systematically generated and diverse populations of NN models for further research. In total the proposed model zoo dataset is based on eight image datasets, consists of 27 model zoos trained with varying hyperparameter combinations and includes 50,360 unique NN models as well as their sparsified twins, resulting in over 3,844,360 collected model states. In addition to the model zoo data, we provide an in-depth analysis of the zoos and provide benchmarks for multiple downstream tasks.

}, author = {Sch{\"u}rholt, Konstantin and Taskiran, Diyar and Knyazev, Boris and Xavier Gir{\'o}-i-Nieto and Borth, Damian} } @conference {cSchurholt22b, title = {Model Zoos: A Dataset of Diverse Populations of Neural Network Models}, booktitle = {NeurIPS 2022 Track Datasets and Benchmarks}, year = {2022}, month = {12/2022}, address = {New Orleans, Louisiana, USA.}, abstract = {

In recent years, neural networks (NN) have evolved from laboratory environments to the state-of-the-art for many real-world problems. It has been shown that NN models (i.e., their weights and biases) evolve on unique trajectories in weight space during training. It follows that a population of such neural network models (referred to as a model zoo) would form structures in weight space. We think that the geometry, curvature and smoothness of these structures contain information about the state of training and can reveal latent properties of individual models. With such model zoos, one could investigate novel approaches for (i) model analysis, (ii) discovering unknown learning dynamics, (iii) learning rich representations of such populations, or (iv) exploiting the model zoos for generative modelling of NN weights and biases. Unfortunately, the lack of standardized model zoos and available benchmarks significantly increases the friction for further research on populations of NNs. With this work, we publish a novel dataset of model zoos containing systematically generated and diverse populations of NN models for further research. In total the proposed model zoo dataset is based on eight image datasets, consists of 27 model zoos trained with varying hyperparameter combinations and includes 50,360 unique NN models as well as their sparsified twins, resulting in over 3,844,360 collected model states. In addition to the model zoo data, we provide an in-depth analysis of the zoos and provide benchmarks for multiple downstream tasks. The dataset can be found at www.modelzoos.cc.

}, author = {Sch{\"u}rholt, Konstantin and Taskiran, Diyar and Knyazev, Boris and Xavier Gir{\'o}-i-Nieto and Borth, Damian} } @article {xBudria22, title = {Multimodal 3D Hand Pose Enhancement for Sign Language}, year = {2022}, abstract = {

The application of recent deep learning breakthroughs to the domain of sign language has yielded very promising results. However, sign language processing systems depend on large amounts of labeled high-quality data to work properly. Current hand pose estimation methods are often unreliable and do not always produce estimations of sufficient quality. To mitigate this issue, we explore the applicability of the novel Body2Hands method to obtain high-quality hand pose estimations.

}, author = {{\'A}lvaro Budria}, editor = {Laia Tarr{\'e}s and Moreno, Francesc and Xavier Gir{\'o}-i-Nieto} } @conference {cGarcia-Gomez22, title = {Multimodal imaging System based on Solid-State LiDAR for Advanced perception applications}, booktitle = {10th International Symposium on Optronics in defence \& security}, year = {2022}, month = {06/2022}, publisher = {3AF OPTRO2022}, organization = {3AF OPTRO2022}, address = {Versailles, France}, url = {https://www.3af-optro.com/}, author = {Pablo Garc{\'\i}a-G{\'o}mez and S. Royo and Noel Rodrigo and Casas, J. and Jordi Riu} } @conference {cPinazo22, title = {Perception in the era of Autonomous Vehicles}, booktitle = {Photonics 4 Smart Cities, SCEWC 2022}, year = {2022}, month = {11/2022}, publisher = {Photonics21}, organization = {Photonics21}, address = {Barcelona}, url = {https://www.fotonica21.org/photonics-4-smart-cities}, author = {Jorge Pinazo and Adolfo Ler{\'\i}n and Francesc Xavier de Gibert and {\'A}lvaro Moliner and Daniel Sevilla and Antonio Jurado and Iv{\'a}n R{\'\i}os and Rodrigo Jerez and Jaime Santiago and {\'A}lvaro Linuesa and Antonio Cano and Federico Dios and Adolfo Comer{\'o}n and Casas, J. and Jos{\'e} Antonio L{\'a}zaro} } @conference {cGeleta21, title = {PixInWav: Residual Steganography for Hiding Pixels in Audio}, booktitle = {ICASSP}, year = {2022}, month = {06/2021}, abstract = {

Steganography comprises the mechanics of hiding data in a host medium that may be publicly available. While previous works focused on unimodal setups (e.g., hiding images in images, or hiding audio in audio), PixInWav targets the multimodal case of hiding images in audio. To this end, we propose a novel residual architecture operating on top of short-time discrete cosine transform (STDCT) audio spectrograms. Among our results, we find that the residual audio steganography setup we propose allows encoding the hidden image independently from the host audio without compromising quality. Accordingly, while previous works require both host and hidden signals to hide a signal, PixInWav can encode images offline --- which can be later hidden, in a residual fashion, into any audio signal. Finally, we test our scheme in a lab setting, transmitting images over airwaves from a loudspeaker to a microphone, verifying our theoretical insights and obtaining promising results.

Deep Learning Barcelona Symposium 2022

Presentation from the early stages of the project (January 2021).

}, author = {Geleta, Margarita and Punt{\'\i}, Cristina and McGuinness, Kevin and Pons, Jordi and Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} } @conference {cBartusiak, title = {Predicting Dog Phenotypes from Genotypes}, booktitle = {44th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC{\textquoteright}22)}, year = {2022}, abstract = {

In this paper, we analyze dog genotypes - positions of DNA sequences that often vary between different dogs - in order to predict the corresponding phenotypes - unique characteristics that result from different genetic code. More specifically, given chromosome data from a dog, we aim to predict its breed category, height, and weight. We explore a variety of linear and non-linear classification and regression techniques to accomplish these three tasks. We show that linear methods generally outperform or match non-linear methods for breed classification. However, the reverse case is true for height and weight regression. We also evaluate the performance of all of these methods based on the number of input features used in the analysis. We conduct experiments using different fractions of the full genomic sequences and demonstrate that phenotypes can be predicted with as few as 0.5\% of the input features available for our analysis, and dog breeds can be classified with 50\% balanced accuracy with as few as 0.02\% of the features.
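
A hedged sketch of this kind of comparison on synthetic stand-in data: a linear and a non-linear classifier evaluated with balanced accuracy on a toy SNP matrix. The models, data, and hyper-parameters below are illustrative only, not the paper's experimental configuration.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.integers(0, 2, size=(300, 500))   # toy SNP matrix (individuals x variants)
y = rng.integers(0, 4, size=300)          # toy breed labels

for name, clf in [("linear", LogisticRegression(max_iter=1000)),
                  ("non-linear", RandomForestClassifier(n_estimators=100))]:
    acc = cross_val_score(clf, X, y, cv=3, scoring="balanced_accuracy").mean()
    print(name, round(acc, 3))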

}, doi = {https://doi.org/10.1101/2022.04.13.488108 }, author = {Bartusiak, Emily and Barrab{\'e}s, M{\'\i}riam and Aigerim Rymbe and J{\'u}lia Gimbernat and Cayetana L{\'o}pez and Lorenzo Barberis and Mas-Montserrat, Daniel and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @article {aa, title = {QU-BraTS: MICCAI BraTS 2020 Challenge on Quantifying Uncertainty in Brain Tumor Segmentation--Analysis of Ranking Metrics and Benchmarking Results}, journal = {Journal of Machine Learning for Biomedical Imaging}, year = {2022}, month = {08/2022}, abstract = {

Deep learning (DL) models have provided state-of-the-art performance in various medical imaging benchmarking challenges, including the Brain Tumor Segmentation (BraTS) challenges. However, the task of focal pathology multi-compartment segmentation (e.g., tumor and lesion sub-regions) is particularly challenging, and potential errors hinder translating DL models into clinical workflows. Quantifying the reliability of DL model predictions in the form of uncertainties could enable clinical review of the most uncertain regions, thereby building trust and paving the way toward clinical translation. Several uncertainty estimation methods have recently been introduced for DL medical image segmentation tasks. Developing scores to evaluate and compare the performance of uncertainty measures will assist the end-user in making more informed decisions. In this study, we explore and evaluate a score developed during the BraTS 2019 and BraTS 2020 task on uncertainty quantification (QU-BraTS), designed to assess and rank uncertainty estimates for brain tumor multi-compartment segmentation. This score (1) rewards uncertainty estimates that produce high confidence in correct assertions and that assign low confidence levels to incorrect assertions, and (2) penalizes uncertainty measures that lead to a higher percentage of under-confident correct assertions. We further benchmark the segmentation uncertainties generated by 14 independent participating teams of QU-BraTS 2020, all of which also participated in the main BraTS segmentation task. Overall, our findings confirm the importance and complementary value that uncertainty estimates provide to segmentation algorithms, highlighting the need for uncertainty quantification in medical image analyses. Finally, in favor of transparency and reproducibility, our evaluation code is made publicly available at: this https URL.

}, url = {https://www.melba-journal.org/papers/2022:026.html}, author = {Raghav Metha and Angelos Filos and Ujjwal Baid and Laura Mora and Ver{\'o}nica Vilaplana and Christos Davatzikos and Bjoern Menze and Spyridon Bakas and Yarin Gal and Tar Arbel} } @article {cBellver-Bueno20, title = {RefVOS: A Closer Look at Referring Expressions for Video Object Segmentation}, journal = {Multimedia Tools and Applications}, year = {2022}, month = {07/2022}, abstract = {

The task of video object segmentation with referring expressions (language-guided VOS) is, given a linguistic phrase and a video, to generate binary masks for the object to which the phrase refers. Our work argues that existing benchmarks used for this task are mainly composed of trivial cases, in which referents can be identified with simple phrases. Our analysis relies on a new categorization of the phrases in the DAVIS-2017 and Actor-Action datasets into trivial and non-trivial referring expressions (REs), with the non-trivial REs annotated with seven RE semantic categories. We leverage this data to analyze the results of RefVOS, a novel neural network that obtains competitive results for the task of language-guided image segmentation and state-of-the-art results for language-guided VOS. Our study indicates that the major challenges for the task are related to understanding motion and static actions.

}, doi = {https://doi.org/10.1007/s11042-022-13413-x}, author = {Bellver-Bueno, M{\'\i}riam and Ventura, C. and Silberer, Carina and Kazakos, Ioannis and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @article {aBernal22, title = {SALAI-Net: species-agnostic local ancestry inference network}, journal = {Bioinformatics}, volume = {38}, year = {2022}, month = {09/2022}, chapter = {ii27}, abstract = {

Motivation
Local ancestry inference (LAI) is the high-resolution prediction of ancestry labels along a DNA sequence. LAI is important in the study of human history and migrations, and it is beginning to play a role in precision medicine applications, including ancestry-adjusted genome-wide association studies (GWASs) and polygenic risk scores (PRSs). Existing LAI models do not generalize well between species, chromosomes or even ancestry groups, requiring re-training for each different setting. Furthermore, such methods can lack interpretability, which is an important element in each of these applications.

Results
We present SALAI-Net, a portable statistical LAI method that can be applied to any set of species and ancestries (species-agnostic), requiring only haplotype data and no other biological parameters. Inspired by identity-by-descent methods, SALAI-Net estimates population labels for each segment of DNA by performing a reference matching approach, which leads to an interpretable and fast technique. We benchmark our models on whole-genome data of humans, and we test the models{\textquoteright} ability to generalize to dog breeds when trained on human data. SALAI-Net outperforms previous methods in terms of balanced accuracy, while generalizing between different settings, species and datasets. Moreover, it is up to two orders of magnitude faster and uses considerably less RAM than competing methods.
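
A toy rendition of the reference-matching idea (not SALAI-Net's implementation): assign each window of a query haplotype the ancestry of the reference haplotype that matches it at the most positions.

import numpy as np

def match_ancestry(query, panel, labels, window=50):
    # Simplified, illustrative reference-matching LAI: per window,
    # count allele matches against each labeled reference haplotype.
    n_win = query.shape[0] // window
    out = np.empty(n_win, dtype=labels.dtype)
    for w in range(n_win):
        s = slice(w * window, (w + 1) * window)
        matches = (panel[:, s] == query[s]).sum(axis=1)  # per reference
        out[w] = labels[matches.argmax()]                # label of best match
    return out

rng = np.random.default_rng(0)
panel = rng.integers(0, 2, size=(40, 1000))   # reference haplotypes
labels = np.repeat(np.array([0, 1]), 20)      # two ancestries, 20 refs each
query = panel[3].copy()                       # query drawn from ancestry 0
print(match_ancestry(query, panel, labels))   # all zeros, as expected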

}, doi = {https://doi.org/10.1093/bioinformatics/btac464}, url = {https://academic.oup.com/bioinformatics/article/38/Supplement_2/ii27/6701999?login=false$\#$373158629}, author = {Bernal, Oriol and Mas-Montserrat, Daniel and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @article {aSalgueirob, title = {SEG-ESRGAN: A multi-task network for super-resolution and semantic segmentation of remote sensing images}, journal = {Remote Sensing}, volume = {14}, year = {2022}, month = {2022}, chapter = {5862}, abstract = {

The production of highly accurate land cover maps is one of the primary challenges in remote sensing and depends on the spatial resolution of the input images. Sometimes, high-resolution imagery is not available or is too expensive to cover large areas or to perform multitemporal analysis. In this context, we propose a multi-task network that takes advantage of the freely available Sentinel-2 imagery to produce a super-resolution image, with a scaling factor of 5, and the corresponding high-resolution land cover map. Our proposal, named SEG-ESRGAN, consists of two branches: the super-resolution branch, which produces Sentinel-2 multispectral images at 2 m resolution, and an encoder{\textendash}decoder architecture for the semantic segmentation branch, which generates the enhanced land cover map. From the super-resolution branch, several skip connections are retrieved and concatenated with features from the different stages of the encoder part of the segmentation branch, promoting the flow of meaningful information to boost the accuracy of the segmentation task. Our model is trained with a multi-loss approach, using a novel dataset developed from Sentinel-2 and WorldView-2 image pairs to train and test the super-resolution stage. In addition, we generated a dataset with ground-truth labels for the segmentation task. To assess the super-resolution improvement, the PSNR, SSIM, ERGAS, and SAM metrics were considered, while to measure the classification performance, we used the IoU, confusion matrix and F1-score. Experimental results demonstrate that the SEG-ESRGAN model outperforms different full segmentation and dual network models (U-Net, DeepLabV3+, HRNet and Dual_DeepLab), allowing the generation of high-resolution land cover maps in challenging scenarios using Sentinel-2 10 m bands.
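
The two-branch layout can be sketched in a few lines of PyTorch. Layer widths, the scale factor of 2, and the single skip connection below are illustrative simplifications of the published architecture, not its actual configuration.

import torch
import torch.nn as nn

class DualBranch(nn.Module):
    # Minimal sketch: an SR branch whose intermediate features are
    # concatenated into a segmentation encoder-decoder.
    def __init__(self, in_ch=4, classes=8, scale=2):
        super().__init__()
        self.sr_feat = nn.Sequential(nn.Conv2d(in_ch, 32, 3, padding=1), nn.ReLU())
        self.sr_head = nn.Sequential(nn.Conv2d(32, in_ch * scale**2, 3, padding=1),
                                     nn.PixelShuffle(scale))
        self.enc = nn.Sequential(nn.Conv2d(in_ch + 32, 64, 3, padding=1), nn.ReLU())
        self.dec = nn.Conv2d(64, classes, 3, padding=1)
        self.up = nn.Upsample(scale_factor=scale, mode="bilinear",
                              align_corners=False)

    def forward(self, x):
        f = self.sr_feat(x)                     # shared SR features
        sr = self.sr_head(f)                    # super-resolved image
        seg = self.dec(self.enc(torch.cat([x, f], dim=1)))
        return sr, self.up(seg)                 # high-res image + high-res map

model = DualBranch()
sr, seg = model(torch.randn(1, 4, 64, 64))
print(sr.shape, seg.shape)   # (1, 4, 128, 128) and (1, 8, 128, 128)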

}, issn = {2072-4292}, doi = {https://doi.org/10.3390/rs14225862}, url = {https://doi.org/10.3390/rs14225862}, author = {Luis Salgueiro and Javier Marcello and Ver{\'o}nica Vilaplana} } @conference {cPinaa, title = {Self-supervised graph representations of WSIs}, booktitle = {Geometric Deep Learning in Medical Image Analysis}, year = {2022}, month = {2022}, abstract = {

In this manuscript, we propose a framework for the analysis of whole slide images (WSI) in the cell entity space with self-supervised deep learning on graphs, and we explore its representation quality at different levels of application. It consists of a two-step process in which the cell-level analysis is performed locally, on clusters of nearby cells that can be seen as small regions of the image, in order to learn representations that capture the cell environment and distribution. In a second stage, a WSI graph is generated with these regions as nodes and the learned representations as initial node embeddings. The graph is leveraged for a downstream task, region of interest (ROI) detection, addressed as graph clustering. The representations outperform the evaluation baselines at both levels of application, evaluated by predicting whether a cell or region is tumor or not from its learned representations with a logistic regressor.

}, url = {https://proceedings.mlr.press/v194/pina22a/pina22a.pdf}, author = {Oscar Pina and Ver{\'o}nica Vilaplana} } @conference {cHernandeza, title = {Sentinel lymph node status prediction using self-attention networks and contrastive learning from routine histology images of primary tumours}, booktitle = {Medical Imaging with Deep Learning MIDL 2022}, year = {2022}, month = {07/2022}, abstract = {

Deep learning-based computational pathology approaches are becoming increasingly prominent in histopathology image analysis. However, these images typically come with drawbacks that hamper automatic analysis, including labeled sample scarcity and the extremely large size of the images. Nonetheless, they have proven to be a powerful tool for diagnosis and risk prevention. One such prevention is reducing the number of patients who undergo surgeries that do not benefit them. This study develops a pipeline for predicting sentinel lymph node (SLN) metastasis non-invasively from digitised Whole Slide Images (WSI) of primary melanoma tumours. Furthermore, we combine the use of a weakly supervised architecture with self-supervised contrastive pre-training. We experimentally demonstrate that 1) the use of self-attention improves sentinel lymph node status prediction and 2) self-supervised contrastive learning improves the quality of the learned representations compared to standard ImageNet pre-training, which boosts the model{\textquoteright}s performance.

}, author = {Carlos Hernandez and Marc Combalia and Josep Malvehy and Ver{\'o}nica Vilaplana} } @conference {cHernandez22, title = {Sentinel lymph node status prediction with self-attention neural networks using histologies of primary melanoma tumours}, booktitle = {European Association of Dermato Oncology (EADO 2022)}, year = {2022}, month = {04/2022}, author = {Carlos Hernandez and Ver{\'o}nica Vilaplana and Marc Combalia and Sergio Garc{\'\i}a and Sebastian Podlipnik and Julio Burgos and Susana Puig and Josep Malvehy} } @article {xCabot22, title = {Sign Language Translation based on Transformers for the How2Sign Dataset}, year = {2022}, abstract = {

Introduction to Research, BSc Data Science and Engineering, Autumn 2021:

The end goal of Sign Language Translation is to either produce spoken sentences from sign videos or generate sign videos from their corresponding written transcriptions. This task has been addressed with multiple approaches in recent years. Moreover, it has been shown that taking advantage of sign gloss representations substantially improves model performance on this task. Therefore, in this work we replicate the state-of-the-art Transformer-based approach to the task and evaluate it on the multimodal American Sign Language How2Sign dataset. Furthermore, we provide baseline recognition and translation results that represent a starting point for further research on the topic. In addition, we provide a new sentence-based alignment for the How2Sign videos, as their existing alignment was with speech, which we have used to tackle the Sign Language Translation task properly.

}, author = {Patricia Cabot}, editor = {Laia Tarr{\'e}s and Xavier Gir{\'o}-i-Nieto} } @conference {cDuartec, title = {Sign Language Video Retrieval with Free-Form Textual Queries}, booktitle = {CVPR 2022 - CVF/IEEE Conference on Computer Vision and Pattern Recognition}, year = {2022}, abstract = {

Systems that can efficiently search collections of sign language videos have been highlighted as a useful application of sign language technology. However, the problem of searching videos beyond individual keywords has received limited attention in the literature. To address this gap, in this work we introduce the task of sign language retrieval with free-form textual queries: given a written query (e.g., a sentence) and a large collection of sign language videos, the objective is to find the signing video in the collection that best matches the written query. We propose to tackle this task by learning cross-modal embeddings on the recently introduced large-scale How2Sign dataset of American Sign Language (ASL). We identify that a key bottleneck in the performance of the system is the quality of the sign video embedding, which suffers from a scarcity of labeled training data. We, therefore, propose SPOT-ALIGN, a framework for interleaving iterative rounds of sign spotting and feature alignment to expand the scope and scale of available training data. We validate the effectiveness of SPOT-ALIGN for learning a robust sign video embedding through improvements in both sign recognition and the proposed video retrieval task.

}, author = {Amanda Duarte and Samuel Albanie and Xavier Gir{\'o}-i-Nieto and G{\"u}l Varol} } @mastersthesis {xCabot22a, title = {Sign-Language Translation with Pseudo-Glosses}, year = {2022}, abstract = {

Sign Language Translation is an open problem whose goal is to generate written sentences from sign videos. In recent years, much of the research developed in this field has addressed the Sign Language Recognition task, which consists in understanding the input signs and transcribing them into sequences of annotations. Current studies show that taking advantage of the latter task helps to learn meaningful representations and can be seen as an intermediate step towards the end goal of translation.


In this work, we present a method to generate automatic pseudo-glosses from written sentences, which can work as a replacement for real glosses. This addresses the issue of their collection, as real glosses must be manually annotated, which is extremely costly.

Furthermore, we introduce a new implementation, built on Fairseq, of the Transformer-based approach introduced by Camgoz et al., which is jointly trained to solve the recognition and translation tasks. In addition, we provide new baseline results for both implementations: first, on the Phoenix dataset, we present results that outperform those reported by Camgoz et al., and, second, on the How2Sign dataset, we present the first results on the translation task. These results can serve as a baseline for future research in the field.

Patricia Cabot with her advisors Laia Tarr{\'e}s, Gerard I. G{\'a}llego and Xavier Gir{\'o}-i-Nieto.
}, author = {Patricia Cabot and Laia Tarr{\'e}s and Xavier Gir{\'o}-i-Nieto} } @conference {cMosella-Montoro22, title = {SkinningNet: Two-Stream Graph Convolutional Neural Network for Skinning Prediction of Synthetic Characters}, booktitle = {IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)}, year = {2022}, month = {06/2022}, address = {New Orleans, USA}, abstract = {

This work presents SkinningNet, an end-to-end Two-Stream Graph Neural Network architecture that computes skinning weights from an input mesh and its associated skeleton, without making any assumptions about the shape class or structure of the provided mesh. Whereas previous methods pre-compute handcrafted features that relate the mesh and the skeleton or assume a fixed topology of the skeleton, the proposed method extracts this information in an end-to-end learnable fashion by jointly learning the best relationship between mesh vertices and skeleton joints. The proposed method exploits the benefits of the novel Multi-Aggregator Graph Convolution, which combines the results of different aggregators during the summarizing step of the Message-Passing scheme, helping the operation to generalize to unseen topologies. Experimental results demonstrate the effectiveness of the contributions of our novel architecture, with SkinningNet outperforming current state-of-the-art alternatives.
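
A toy sketch of the multi-aggregator idea (illustrative, not the paper's exact operator): concatenate the outputs of several aggregators over each node's neighbourhood before the learned update.

import numpy as np

def multi_aggregator_conv(node_feats, neighbors, weight):
    # Toy message-passing step combining mean, max and sum aggregators
    # before the update, in the spirit of Multi-Aggregator Graph Convolution.
    out = []
    for i, nbrs in enumerate(neighbors):
        msgs = node_feats[nbrs]                   # messages from neighbours
        agg = np.concatenate([msgs.mean(0), msgs.max(0), msgs.sum(0)])
        out.append(np.tanh(agg @ weight))         # update with learnable weight
    return np.stack(out)

rng = np.random.default_rng(0)
feats = rng.standard_normal((5, 8))               # 5 nodes, 8-dim features
nbrs = [[1, 2], [0, 2], [0, 1, 3], [2, 4], [3]]   # adjacency lists
w = rng.standard_normal((24, 8))                  # 3 aggregators x 8 dims -> 8
print(multi_aggregator_conv(feats, nbrs, w).shape)   # (5, 8)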

https://imatge-upc.github.io/skinningnet/

}, url = {https://imatge-upc.github.io/skinningnet/}, author = {Mosella-Montoro, Albert and Ruiz-Hidalgo, J.} } @conference {cPina, title = {Structural Networks for Brain Age Prediction}, booktitle = {Medical Imaging with Deep Learning (MIDL 2022)}, year = {2022}, month = {08/2022}, abstract = {

Biological networks have gained considerable attention within the Deep Learning community because of the promising framework of Graph Neural Networks (GNN), neural models that operate on complex networks. In the context of neuroimaging, GNNs have successfully been employed for functional MRI processing, but their application to ROI-level structural MRI (sMRI) remains mostly unexplored. In this work we analyze the implementation of these geometric models with sMRI by building graphs of ROIs (ROI graphs) using tools from the Graph Signal Processing literature and evaluate their performance in a downstream supervised task, age prediction. We first make a qualitative and quantitative comparison of the networks obtained with common graph topology learning strategies. In a second stage, we train GNN-based models for brain age prediction. Since the order of every ROI graph is exactly the same and each vertex is an entity by itself (an ROI), we evaluate whether including ROI information during message-passing or global pooling operations is beneficial, and we compare the performance of GNNs against a Fully-Connected Neural Network baseline. The results show that ROI-level information is needed during the global pooling operation in order to achieve competitive results; however, no relevant improvement was detected when it is incorporated during message passing. These models achieve an MAE of 4.27 on held-out test data, very similar to the baseline, suggesting that the inductive bias included with the obtained graph connectivity is relevant and useful for reducing the dimensionality of the problem.

}, author = {Oscar Pina and Irene Cumplido-Mayoral and Raffaele Cacciaglia and Jos{\'e} Mar{\'\i}a Gonz{\'a}lez-de-Ech{\'a}varri and Juan D. Gispert and Ver{\'o}nica Vilaplana} } @phdthesis {dSalgueiro22, title = {Super-resolution and semantic segmentation of remote sensing images using deep learning techniques}, volume = {PhD}, year = {2022}, month = {10/2022}, abstract = {

Remote sensing for Earth observation is a growing scientific field essential for many human activities. Among the different applications in the Remote Sensing domain, the production of thematic maps, such as Land Cover and Land Use maps, is among the most relevant, as this information plays a critical role in management, planning and monitoring activities at different levels. In this context, the Sentinel-2 satellites are of great importance, since they provide open data on land and coastal areas at different spatial resolutions (10, 20, and 60 m), democratizing usability and creating high potential for the generation of valuable information useful in many scenarios, such as agriculture, forestry, land cover and urban planning, among others.

In this thesis, we aim to exploit the usability of Sentinel-2 data by applying deep learning techniques, which are revolutionizing the world of computer vision and, recently, remote sensing. First, we propose super-resolution models to improve the spatial details of the different Sentinel-2 bands, and second, we propose the conjunction of semantic segmentation with super-resolution to generate improved land cover maps that benefit from the enhanced spatial details of the bands.

We first address super-resolution by proposing two different models, one for the 10 m/pixel bands to reach 2 m/pixel and another for the 20 and 60 m/pixel bands to achieve 10 m/pixel. Then, we propose two different multitasking models to derive land cover maps: the first extends a semantic segmentation model to produce an additional super-resolution image, and the second improves our first super-resolution approach to also provide a semantic segmentation map. We combine features of the different tasks within a single model to improve performance and to generate a high-resolution image with the corresponding high-quality land cover map. All models developed were evaluated, quantitatively and qualitatively, using different datasets, showing excellent performance in diverse complex scenarios.

}, author = {Luis Salgueiro}, editor = {Ver{\'o}nica Vilaplana and Javier Marcello} } @conference {cTarres22, title = {Tackling Low-Resourced Sign Language Translation: UPC at WMT-SLT 22}, booktitle = {EMNLP 2022 Seventh Conference on Machine Translation (WMT22)}, year = {2022}, month = {10/2022}, abstract = {

This paper describes the system developed at the Universitat Polit{\`e}cnica de Catalunya for the Workshop on Machine Translation 2022 Sign Language Translation Task, in particular, for the sign-to-text direction. We use a Transformer model implemented with the Fairseq modeling toolkit. We have experimented with the vocabulary size, data augmentation techniques and pretraining the model with the PHOENIX-14T dataset. Our system obtains a 0.50 BLEU score on the test set, improving the organizers{\textquoteright} baseline by 0.38 BLEU. We remark on the poor results of both the baseline and our system, and thus the unreliability of our findings.


}, author = {Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @mastersthesis {xBudria, title = {Topic Detection from Sign Language Videos}, year = {2022}, abstract = {

Significant progress has been made recently on challenging tasks in automatic sign language understanding, such as sign language recognition, translation and production. However, most works have focused on datasets with relatively few samples, short recordings, and limited vocabulary and signing space. Moreover, they have neglected the less complex task of sign language video classification, whose analogue in spoken language, namely text classification, has been widely addressed. For this reason, in this work we introduce the novel task of sign language topic detection. We base our experiments on How2Sign, a large-scale video dataset spanning multiple semantic domains. The contributions of this thesis are twofold. First, we present the first study of sign language topic detection in continuous sign language videos, providing baseline models for this task. Second, we perform a comparison between different visual features and deep learning architectures commonly employed in the sign language understanding literature. We implement our modelling pipelines in Fairseq, a PyTorch library that is extensively used in the spoken language community. Modular, extensible code for running our experiments is provided along with this thesis.

}, author = {{\'A}lvaro Budria}, editor = {Laia Tarr{\'e}s and Xavier Gir{\'o}-i-Nieto} } @conference {cBudria, title = {Topic Detection in Continuous Sign Language Videos}, booktitle = {Accessibility, Vision, and Autonomy Meet (AVA) CVPR Workshop}, year = {2022}, month = {06/2022}, abstract = {

Significant progress has been made recently on challenging tasks in automatic sign language understanding, such as sign language recognition, translation and production. However, these works have focused on datasets with relatively few samples, short recordings and limited vocabulary and signing space. In this work, we introduce the novel task of sign language topic detection. We base our experiments on How2Sign, a large-scale video dataset spanning multiple semantic domains. We provide strong baselines for the task of topic detection, and present a comparison between different visual features commonly used in the domain of sign language.

}, author = {{\'A}lvaro Budria and Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Moreno, Francesc and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @article {xGiro-i-Nieto22, title = {Towards Sign Language Translation and Production}, year = {2022}, abstract = {

Machine translation and computer vision have greatly benefited from the advances in deep learning. Large and diverse amounts of textual and visual data have been used to train neural networks, whether in a supervised or self-supervised manner. Nevertheless, the convergence of the two fields in sign language translation and production still poses multiple open challenges, such as scarce video resources, limitations in hand pose estimation, and 3D spatial grounding from poses.

}, author = {Xavier Gir{\'o}-i-Nieto and Amanda Duarte} } @article {xAguilar21, title = {2D-to-3D Lifting of Sign Language Body Poses with Recurrent Neural Networks}, year = {2021}, month = {01/2021}, institution = {UPC ETSETB TelecomBCN}, address = {Barcelona}, abstract = {

This paper aims at improving the quality of a dataset that contains multiple sequences of 3D poses extracted from American Sign Language videos. Each pose consists of 147 points with three coordinates each. We propose an algorithm able to correct missing points as well as to enforce constraints such as bone lengths. To prove the quality of the algorithm{\textquoteright}s outcome, we evaluate the task of lifting 2D to 3D poses with a deep learning model trained on raw data and another trained on the preprocessed data.

}, author = {Aguilar, Jordi}, editor = {Xavier Gir{\'o}-i-Nieto and Amanda Duarte} } @article {aMosella-Montoro21, title = {2D{\textendash}3D Geometric Fusion network using Multi-Neighbourhood Graph Convolution for RGB-D indoor scene classification}, journal = {Information Fusion}, volume = {76}, year = {2021}, month = {12/2021}, chapter = {46-54}, abstract = {

Multi-modal fusion has been shown to enhance the performance of scene classification tasks. This paper presents a 2D-3D Fusion stage that combines 3D Geometric Features with 2D Texture Features obtained by 2D Convolutional Neural Networks. To obtain a robust 3D Geometric embedding, a network that uses two novel layers is proposed. The first layer, Multi-Neighbourhood Graph Convolution, aims to learn a more robust geometric descriptor of the scene by combining two different neighbourhoods: one in the Euclidean space and the other in the Feature space. The second proposed layer, Nearest Voxel Pooling, improves the performance of the well-known Voxel Pooling. Experimental results on the NYU-Depth-V2 and SUN RGB-D datasets show that the proposed method outperforms the current state-of-the-art in the RGB-D indoor scene classification task.
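
An illustrative NumPy sketch of the two-neighbourhood idea (not the published layer): aggregate each point's features over both its spatial k-NN and its feature-space k-NN, then concatenate the two aggregates.

import numpy as np

def knn(points, k):
    # Indices of the k nearest neighbours of each row (self excluded).
    d = ((points[:, None, :] - points[None, :, :]) ** 2).sum(-1)
    np.fill_diagonal(d, np.inf)
    return np.argsort(d, axis=1)[:, :k]

def multi_neighbourhood_step(xyz, feats, k=3):
    geo = knn(xyz, k)       # spatially nearby points
    sem = knn(feats, k)     # points with similar features
    geo_agg = feats[geo].mean(axis=1)
    sem_agg = feats[sem].mean(axis=1)
    return np.concatenate([geo_agg, sem_agg], axis=1)

rng = np.random.default_rng(0)
xyz = rng.random((10, 3))                # point positions
feats = rng.standard_normal((10, 16))    # point descriptors
print(multi_neighbourhood_step(xyz, feats).shape)   # (10, 32)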

https://imatge-upc.github.io/munegc/

}, doi = {10.1016/j.inffus.2021.05.002}, url = {https://imatge-upc.github.io/munegc/}, author = {Mosella-Montoro, Albert and Ruiz-Hidalgo, J.} } @conference {cMayoral21a, title = {Brain structural alterations in cognitively unimpaired individuals with discordant amyloid-β PET and CSF Aβ42 status: findings using Machine Learning}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2021}, month = {07/2021}, author = {Irene Cumplido-Mayoral and Mahnaz Shekari and Gemma Salvad{\'o} and Greg Operto and Raffaele Cacciaglia and Carles Falcon and Aida Ni{\~n}erola Baiz{\'a}n and Andr{\'e}s Perissinotti and Carolina Minguillon and Karine Fauria and Ivonne Suridjan and Gwendlyn Kollmorgen and Jose Luis Molinuevo and Henrik Zetterberg and Kaj Blennow and Marc Suarez Calvet and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @conference {cBonet21, title = {Channel-Wise Early Stopping without a Validation Set via NNK Polytope Interpolation}, booktitle = {Asia Pacific Signal and Information Processing Association Annual Summit, APSIPA}, year = {2021}, month = {12/2021}, address = {Tokyo, Japan}, abstract = {

State-of-the-art neural network architectures continue to scale in size and deliver impressive generalization results, although this comes at the expense of limited interpretability. In particular, a key challenge is to determine when to stop training the model, as this has a significant impact on generalization. Convolutional neural networks (ConvNets) comprise high-dimensional feature spaces formed by the aggregation of multiple channels, where analyzing intermediate data representations and the model{\textquoteright}s evolution can be challenging owing to the curse of dimensionality. We present channel-wise DeepNNK (CW-DeepNNK), a novel channel-wise generalization estimate based on non-negative kernel regression (NNK) graphs with which we perform local polytope interpolation on low-dimensional channels. This method leads to instance-based interpretability of both the learned data representations and the relationship between channels. Motivated by our observations, we use CW-DeepNNK to propose a novel early stopping criterion that (i) does not require a validation set, (ii) is based on a task performance metric, and (iii) allows stopping to be reached at different points for each channel. Our experiments demonstrate that our proposed method has advantages as compared to the standard criterion based on validation set performance.
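
The per-channel stopping logic can be sketched independently of the NNK estimate itself. The bookkeeping below assumes the caller supplies a per-channel generalization error each epoch; it only illustrates criterion (iii), not the paper's estimator.

class ChannelEarlyStopper:
    # A channel freezes once its generalization estimate stops improving
    # for `patience` epochs; training ends when all channels have stopped.
    def __init__(self, n_channels, patience=3):
        self.best = [float("inf")] * n_channels
        self.bad_epochs = [0] * n_channels
        self.stopped = [False] * n_channels
        self.patience = patience

    def update(self, channel_errors):
        for c, err in enumerate(channel_errors):
            if self.stopped[c]:
                continue
            if err < self.best[c]:
                self.best[c], self.bad_epochs[c] = err, 0
            else:
                self.bad_epochs[c] += 1
                self.stopped[c] = self.bad_epochs[c] >= self.patience
        return all(self.stopped)

stopper = ChannelEarlyStopper(n_channels=2)
for errors in [[0.9, 0.8], [0.7, 0.9], [0.8, 0.95], [0.85, 0.96], [0.9, 0.97]]:
    done = stopper.update(errors)
print(stopper.stopped, done)   # channel-wise flags; True once all have stopped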

}, url = {https://arxiv.org/abs/2107.12972}, author = {Bonet, D. and Ortega, A. and Ruiz-Hidalgo, J. and Shekkizhar, S.} } @article {xGiro-i-Nieto19, title = {Deep Learning Representations for All (a.ka. the AI hype)}, year = {2021}, abstract = {

Deep neural networks have revolutionized the data analytics scene by improving results in several diverse benchmarks with the same recipe: learning feature representations from data. These achievements have raised interest across multiple scientific fields, especially in those where large amounts of data and computation are available. This change of paradigm in data analytics has several ethical and economic implications that are driving large investments, political debates and extensive press coverage under the generic label of artificial intelligence (AI). This talk will present the fundamentals of deep learning through the classic example of image classification, and point out how the same principle has been adopted for several other tasks. Finally, some of the forthcoming potentials and risks of AI will be outlined.


}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cGeleta, title = {Deep variational autoencoders for population genetics: applications in classification, imputation, dimensionality reduction, and novel lossless data compression}, booktitle = {American Society of Human Genetics (ASHG)}, year = {2021}, month = {10/2021}, publisher = {ASHG}, organization = {ASHG}, address = {Virtual}, abstract = {

In this study we show the power of variational autoencoders (VAEs) for a variety of tasks relating to the interpretation and compression of genomic data. The unsupervised setting allows for the detection and learning of granular population structure and the inference of new informative latent factors, opening up an avenue for applications in dimensionality reduction, data simulation, population classification, imputation, and lossless genomic data compression. The latent spaces of VAEs are able to capture and represent clearly differentiated Gaussian-like clusters of similar genetic composition on a fine scale with a relatively small number of SNPs as input. Furthermore, sequences can be decomposed into latent representations and reconstruction errors (residuals), yielding a sparse representation that enables efficient lossless compression.

Identifying genetic clusters can be important when performing genome-wide association studies and provides an alternative to self-reported ethnic labels, which are culturally constructed and vary according to the location and individual. A variety of unsupervised dimensionality reduction methods have been explored in the past for such applications, including PCA, MDS, t-SNE, and UMAP. Our proposed VAE can represent the population structure as a Gaussian-distributed continuous multi-dimensional representation and as classification probabilities providing flexible and interpretable population descriptors.

We train our VAE method with several worldwide whole genome datasets from both humans and canids and evaluate the performance of the different proposed applications with networks with and without ancestry conditioning. Our experiments show that different population groups have significantly differentiated compression ratios and classification accuracies. Additionally, we analyze the entropy of the SNP data, noting its effect on compression across populations and connecting these patterns to historical migrations and ancestral relationships.
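
A minimal sketch of the latent-plus-residual decomposition behind the lossless compression claim, with placeholder encoder and decoder standing in for the trained VAE: storing the latent code together with the sparse XOR residual reconstructs the sequence exactly.

import numpy as np

rng = np.random.default_rng(0)
x = rng.binomial(1, 0.3, size=(1, 1000)).astype(np.int8)   # one SNP sequence

def encode(x):
    # Placeholder for the trained VAE encoder (returns a latent code).
    return x[:, :16].astype(float)

def decode(z):
    # Placeholder for the trained VAE decoder (returns SNP probabilities).
    return np.clip(z.mean() + np.zeros((1, 1000)), 0, 1)

z = encode(x)
x_hat = (decode(z) > 0.5).astype(np.int8)   # rounded reconstruction
residual = x ^ x_hat                         # sparse XOR residual
# Storing (z, residual) losslessly reconstructs x: the sparser the
# residual, the better an entropy coder compresses it.
assert np.array_equal(x_hat ^ residual, x)
print(residual.mean())                       # fraction of mismatching positions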

Video from the related BSc thesis at UPC Data Science Engineering (2021).

}, author = {Geleta, Margarita and Mas-Montserrat, Daniel and Bustamante, Carlos and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @mastersthesis {xNietoa, title = {Discovery and Learning of Navigation Goals from Pixels in Minecraft}, year = {2021}, abstract = {

Pre-training Reinforcement Learning (RL) agents in a task-agnostic manner has shown promising results. However, previous works still struggle to learn and discover meaningful skills in high-dimensional state-spaces. We approach the problem by leveraging unsupervised skill discovery and self-supervised learning of state representations. In our work, we learn a compact latent representation by making use of variational or contrastive techniques. We demonstrate that both allow learning a set of basic navigation skills by maximizing an information theoretic objective. We assess our method in Minecraft 3D maps with different complexities. Our results show that representations and conditioned policies learned from pixels are enough for toy examples, but do not scale to realistic and complex maps. We also explore alternative rewards and input observations to overcome these limitations.



}, author = {Nieto, Juan Jos{\'e}}, editor = {V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xCaselles21, title = {Disentangling neural network structure from the weights space}, year = {2021}, abstract = {

Deep Neural Networks have been used to tackle a wide variety of tasks, achieving great performance. However, there is still a lack of knowledge of how the training of these models converges and how their weights relate to their properties. In this thesis we investigate the structure of the weight space and try to disentangle its properties. Attention mechanisms are introduced to capture relations among neurons{\textquoteright} weights that help in weight reconstruction, hyper-parameter classification and accuracy prediction. Our approach further has the potential to work with variable input sizes, allowing for different network widths, depths, or even architecture types.

}, author = {Caselles, Pol}, editor = {Sch{\"u}rholt, Konstantin and Borth, Damian and Xavier Gir{\'o}-i-Nieto} } @article {aAbadal, title = {A Dual Network for Super-Resolution and Semantic Segmentation of Sentinel-2 imagery}, journal = {Remote Sensing}, volume = {13}, year = {2021}, month = {2021}, pages = {4547}, abstract = {

There is a growing interest in the development of automated data processing workflows that provide reliable, high spatial resolution land cover maps. However, high-resolution remote sensing images are not always affordable. Taking into account the free availability of Sentinel-2 satellite data, in this work we propose a deep learning model to generate high-resolution segmentation maps from low-resolution inputs in a multi-task approach. Our proposal is a dual-network model with two branches: the Single Image Super-Resolution branch, that reconstructs a high-resolution version of the input image, and the Semantic Segmentation Super-Resolution branch, that predicts a high-resolution segmentation map with a scaling factor of 2. We performed several experiments to find the best architecture, training and testing on a subset of the S2GLC 2017 dataset. We based our model on the DeepLabV3+ architecture, enhancing the model and achieving an improvement of 5\% on IoU and almost 10\% on the recall score. Furthermore, our qualitative results demonstrate the effectiveness and usefulness of the proposed approach.

}, author = {Sa{\"u}c Abadal and Luis Salgueiro and Javier Marcello and Ver{\'o}nica Vilaplana} } @conference {cTarres, title = {GAN-based Image Colourisation with Feature Reconstruction Loss}, booktitle = {CVPR 2021 Women in Computer Vision Workshop}, year = {2021}, month = {06/2021}, address = {Virtual}, abstract = {

Image colourisation is the task of adding plausible colour to grayscale images. This transformation requires obtaining a three-dimensional colour-valued mapping from a real-valued grayscale image, which leads to an underdetermined problem because the grayscale semantics and texture provide cues for multiple possible colour mappings. The goal of image colourisation is not to recover the ground-truth colour, but to produce a colourisation that is perceived as natural by a human observer. Our work takes as a baseline a scheme based on an end-to-end trainable convolutional neural network (CNN) trained with a smooth L1 loss to predict the $ab$ channels of a colour image given the $L$ channel. We introduce an extra perceptual reconstruction loss during training to improve the capabilities of the adversarial model that we adopt as a baseline.
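
A short PyTorch sketch of a feature (perceptual) reconstruction loss of this kind; the small untrained extractor below stands in for a pretrained perceptual network so the example runs without downloads.

import torch
import torch.nn as nn

# Frozen feature extractor standing in for a pretrained perceptual network.
feat = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
                     nn.Conv2d(16, 32, 3, padding=1)).eval()
for p in feat.parameters():
    p.requires_grad_(False)

def feature_reconstruction_loss(pred_rgb, target_rgb):
    # Perceptual loss: distance between feature maps rather than pixels.
    return nn.functional.l1_loss(feat(pred_rgb), feat(target_rgb))

pred = torch.rand(2, 3, 64, 64, requires_grad=True)
target = torch.rand(2, 3, 64, 64)
loss = feature_reconstruction_loss(pred, target)
loss.backward()          # gradient flows back to the generator output
print(float(loss))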

}, author = {Laia Tarr{\'e}s and G{\'o}rriz, Marc and Xavier Gir{\'o}-i-Nieto and Mrak, Marta} } @mastersthesis {xTarres21, title = {GAN-based Image Colourisation with Feature Reconstruction Loss}, year = {2021}, abstract = {

Automatic image colourisation is a complex and ambiguous task due to having multiple correct solutions. Previous approaches have produced desaturated results unless they relied on significant user interaction. In this thesis we study the state of the art in colourisation and propose an automatic colourisation approach based on generative adversarial networks that incorporates a feature reconstruction loss during training. The generative network is framed in an adversarial model that learns how to colourise by incorporating a perceptual understanding of colour. Qualitative and quantitative results show the capacity of the proposed method to colourise images in a realistic way, boosting the colourfulness and perceptual realism of previous GAN-based methodologies. We also study and propose a second approach that incorporates segmentation information into the GAN framework, and we report quantitative and qualitative results.

}, author = {Laia Tarr{\'e}s}, editor = {Mrak, Marta and Xavier Gir{\'o}-i-Nieto} } @conference {cRamonb, title = {H3D-Net: Few-Shot High-Fidelity 3D Head Reconstruction}, booktitle = {International Conference on Computer Vision (ICCV)}, year = {2021}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Virtual}, abstract = {

Recent learning approaches that implicitly represent surface geometry using coordinate-based neural representations have shown impressive results in the problem of multi-view 3D reconstruction. The effectiveness of these techniques is, however, subject to the availability of a large number (several tens) of input views of the scene, and computationally demanding optimizations. In this paper, we tackle these limitations for the specific problem of few-shot full 3D head reconstruction, by endowing coordinate-based representations with a probabilistic shape prior that enables faster convergence and better generalization when using few input images (down to three). First, we learn a shape model of 3D heads from thousands of incomplete raw scans using implicit representations. At test time, we jointly overfit two coordinate-based neural networks to the scene, one modeling the geometry and another estimating the surface radiance, using implicit differentiable rendering. We devise a two-stage optimization strategy in which the learned prior is used to initialize and constrain the geometry during an initial optimization phase. Then, the prior is unfrozen and fine-tuned to the scene. By doing this, we achieve high-fidelity head reconstructions, including hair and shoulders, and with a high level of detail that consistently outperforms both state-of-the-art 3D Morphable Models methods in the few-shot scenario, and non-parametric methods when large sets of views are available.

}, author = {Ramon, Eduard and Triginer, Gil and Escur, Janna and Pumarola, Albert and Garcia, Jaime and Xavier Gir{\'o}-i-Nieto and Moreno, Francesc} } @conference {cDuarte20, title = {How2Sign: A Large-scale Multimodal Dataset for Continuous American Sign Language}, booktitle = {CVPR 2021}, year = {2021}, month = {06/2021}, abstract = {

Sign Language is the primary means of communication for the majority of the Deaf community. One of the factors that has hindered the progress in the areas of automatic sign language recognition, generation, and translation is the absence of large annotated datasets, especially continuous sign language datasets, i.e. datasets that are annotated and segmented at the sentence or utterance level. Towards this end, in this work we introduce How2Sign, a work-in-progress dataset collection. How2Sign consists of a parallel corpus of 80 hours of sign language videos (collected with multi-view RGB and depth sensor data) with corresponding speech transcriptions and gloss annotations. In addition, a three-hour subset was further recorded in a geodesic dome setup using hundreds of cameras and sensors, which enables detailed 3D reconstruction and pose estimation and paves the way for vision systems to understand the 3D geometry of sign language.

}, author = {Amanda Duarte and S. Palaskar and Lucas Ventura and Ghadiyaram, Deepti and DeHaan, Kenneth and F. Metze and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dBellver, title = {Image and Video Object Segmentation in Low Supervision Scenarios}, volume = {PhD}, year = {2021}, month = {03/2021}, school = {Universitat Politecnica de Catalunya}, type = {Academic}, address = {Barcelona}, abstract = {

Image and video segmentation are central tasks within the computer vision field. Nevertheless, deep learning solutions for segmentation typically rely on pixel-level annotations, which are very costly to collect. Likewise, some segmentation systems require human interaction at inference time, which involves effort for the end-user. In this thesis, we look into diverse supervision scenarios for image and video object segmentation. We discern between supervision when learning the model, i.e., which type of annotations are used during training, and supervision at inference, namely which kind of human input is required when running the system. We target models that require low levels of supervision.

In the first part of the thesis we present a novel recurrent architecture for video object segmentation that is end-to-end trainable in a fully-supervised setup, and that does not require any post-processing step, i.e., the output of the model directly solves the addressed task. The second part of the thesis aims at lowering the annotation cost, in terms of labeling time, needed to train image segmentation models. We explore semi-supervised pipelines and show results when a very limited budget is available. The third part of the dissertation attempts to alleviate the supervision required by semi-automatic systems at inference time. Particularly, we focus on semi-supervised video object segmentation, which typically requires generating a binary mask for each instance to be tracked. In contrast, we present a model for language-guided video object segmentation, which identifies the object to segment with a natural language expression. We study current benchmarks, propose a novel categorization of referring expressions for video, and identify the main challenges posed by the video task.

Evaluation committee: Zeynep Akata (University of T{\"u}bingen), Francesc Moreno-Noguer (UPC IRI-CSIC) and Yannis Kalantidis (Naver Labs Europe).

}, author = {M{\'\i}riam Bellver}, editor = {Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cHernandez, title = {Implementation of personalized medicine in cutaneous melanoma patients aided by artificial intelligence}, booktitle = {10th World Congress of Melanoma / 17th EADO Congress}, year = {2021}, month = {04/2021}, author = {Carlos Hernandez and Anil Kiroglu and Sergio Garc{\'\i}a and Joan Ficapal and Julio Burgos and Sebastian Podlipnik and Neus Calbet and Susana Puig and Josep Malvehy and Ver{\'o}nica Vilaplana and Marc Combalia} } @unpublished {xBonet21, title = {Improved Neural Network Generalization using Channel-Wise NNK Graph Constructions}, year = {2021}, publisher = {Final Year Project, UPC}, abstract = {

State-of-the-art neural network architectures continue to scale in size and deliver impressive results on unseen data points at the expense of poor interpretability. In the deep layers of these models we often encounter very high dimensional feature spaces, where constructing graphs from intermediate data representations can lead to the well-known curse of dimensionality. We propose a channel-wise graph construction method that works on lower dimensional subspaces and provides a new channel-based perspective that leads to better interpretability of the data and relationship between channels. In addition, we introduce a novel generalization estimate based on the proposed graph construction method with which we perform local polytope interpolation. We show its potential to replace the standard generalization estimate based on validation set performance to perform progressive channel-wise early stopping without requiring a validation set.

}, author = {Bonet, David}, editor = {Ortega, Antonio and Ruiz-Hidalgo, J. and Shekkizhar, Sarath} } @article {xGiro-i-Nieto21b, title = {Learning Representations for Sign Language Videos}, year = {2021}, abstract = {

These slides review the research of our lab since 2016 on applied deep learning, starting from our participation in the TRECVID Instance Search 2014, moving into video analysis with CNN+RNN architectures, and our current efforts in sign language translation and production.

}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cMayoral21, title = {Machine learning on combined neuroimaging and plasma biomarkers for triaging participants of secondary prevention trials in Alzheimer{\textquoteright}s Disease}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2021}, month = {07/2021}, author = {Irene Cumplido-Mayoral and Gemma Salvad{\'o} and Mahnaz Shekari and Carles Falcon and Marta Mil{\`a} Alom{\`a} and Aida Ni{\~n}erola Baiz{\'a}n and Jose Luis Molinuevo and Henrik Zetterberg and Kaj Blennow and Marc Suarez Calvet and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @inbook {bMora, title = {MRI brain tumor segmentation and uncertainty estimation using 3D-UNet architectures}, booktitle = {Brainlesion: Glioma, Multiple Sclerosis, Stroke and Traumatic Brain Injuries. BrainLes 2020}, volume = {12658}, number = {Lecture Notes in Computer Science}, year = {2021}, pages = {376-390}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {

Automation of brain tumor segmentation in 3D magnetic resonance images (MRIs) is key to assessing the diagnosis and treatment of the disease. In recent years, convolutional neural networks (CNNs) have shown improved results in the task. However, high memory consumption is still a problem in 3D-CNNs. Moreover, most methods do not include uncertainty information, which is especially critical in medical diagnosis. This work studies 3D encoder-decoder architectures trained with patch-based techniques to reduce memory consumption and decrease the effect of unbalanced data. The different trained models are then used to create an ensemble that leverages the properties of each model, thus increasing the performance. We also introduce voxel-wise uncertainty information, both epistemic and aleatoric, using test-time dropout (TTD) and data augmentation (TTA), respectively. In addition, a hybrid approach is proposed that helps increase the accuracy of the segmentation. The model and uncertainty estimation measurements proposed in this work have been used in the BraTS{\textquoteright}20 Challenge for tasks 1 and 3 regarding tumor segmentation and uncertainty estimation.

}, isbn = {978-3-030-72083-4}, issn = {978-3-030-72084-1}, doi = {https://doi.org/10.1007/978-3-030-72084-1_34}, author = {Laura Mora and Ver{\'o}nica Vilaplana} } @conference {cGarcia-Gomez21, title = {Multimodal solid-state LiDAR for advanced perception applications}, booktitle = {OPTOEL}, year = {2021}, month = {06/2021}, abstract = {

Perception of the environment is an essential requirement for the fields of autonomous vehicles and robotics. Consequently, LiDAR imaging sensors have become crucial sensors for such applications due to their 3D geometry sensing capability. However, autonomous systems demand high amounts of data to make reliable decisions, so many different sensors are often combined. In this context, we present a multimodal imaging system based on a solid-state LiDAR combined with three other imaging sensors that provides multimodal information with low parallax fusion error.

}, keywords = {artificial intelligence, autonomous navigation, computer Vision, enhanced perception, robotics, sensor fusion, solid-state LiDAR}, author = {Pablo Garc{\'\i}a-G{\'o}mez and Noel Rodrigo and Jordi Riu and Casas, J. and S. Royo} } @conference {cGirbau21, title = {Multiple Object Tracking with Mixture Density Networks for Trajectory Estimation}, booktitle = {CVPR 2021 Robust Video Scene Understanding: Tracking and Video Segmentation (RVSU) Workshop}, year = {2021}, abstract = {

Multiple object tracking faces several challenges that may be alleviated with trajectory information. Knowing the posterior locations of an object helps disambiguate and resolve situations such as occlusions, re-identification, and identity switching. In this work, we show that trajectory estimation can become a key factor for tracking, and present TrajE, a trajectory estimator based on recurrent mixture density networks, as a generic module that can be added to existing object trackers. To provide several trajectory hypotheses, our method uses beam search. Also, relying on the same estimated trajectory, we propose to reconstruct a track after an occlusion occurs. We integrate TrajE into two state-of-the-art tracking algorithms, CenterTrack [63] and Tracktor [3]. Their respective performances on the MOTChallenge 2017 test set are boosted by 6.3 and 0.3 points in MOTA score, and 1.8 and 3.1 in IDF1, setting a new state of the art for the CenterTrack+TrajE configuration.

}, url = {https://arxiv.org/abs/2106.10950}, author = {Girbau, A. and Xavier Gir{\'o}-i-Nieto and Rius, Ignasi and Marqu{\'e}s, F.} } @conference {cDominguez, title = {Neural ADMIXTURE: rapid population clustering with autoencoders}, booktitle = {2021 Society for Molecular Biology \& Evolution meeting (SMBEv2021)}, year = {2021}, month = {06/2021}, address = {Virtual}, abstract = {

Characterizing the genetic substructure of large cohorts has become increasingly important as genetic association and prediction studies are extended to massive, increasingly diverse biobanks. ADMIXTURE and STRUCTURE are widely used unsupervised clustering algorithms for characterizing such ancestral genetic structure. These methods decompose individual genomes into fractional cluster assignments, with each cluster representing a vector of DNA marker frequencies. The assignments, and clusters, provide an interpretable representation for geneticists to describe population substructure at the sample level. However, with the rapidly increasing size of population biobanks and the growing numbers of variants genotyped (or sequenced) per sample, such traditional methods become computationally intractable. Furthermore, multiple runs with different hyperparameters are required to properly depict the population clustering using these traditional methods, increasing the computational burden. This can lead to days of compute. In this work we present Neural ADMIXTURE, a neural network autoencoder that follows the same modeling assumptions as ADMIXTURE, providing similar (or better) clustering, while reducing the compute time by orders of magnitude. In addition, this network can include multiple outputs, providing results equivalent to running the original ADMIXTURE algorithm many times with different numbers of clusters. These models can also be stored, allowing later cluster assignment to be performed in linear computational time.

}, author = {Dominguez, Albert and Mas-Montserrat, Daniel and Bustamante, Carlos and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @article {xGiro-i-Nieto21a, title = {Object Detection with Deep Learning}, year = {2021}, abstract = {

Object detection in computer vision is the task of localizing and categorizing object instances in still images. This talk reviews the main approaches for solving the task with deep neural networks, following a historical perspective that starts from the image classification task.

}, author = {Xavier Gir{\'o}-i-Nieto and Ventura, C.} } @article {xEscobar21, title = {Object Model Adaptation for Multiple Object Tracking}, year = {2021}, abstract = {

Multiple object tracking is a broadly used task in multiple applications, all the way from bioengineering to security. In this paper we propose a variation of RVOS by adding the center estimation of detected instances, by means of a second head in the decoder which is assigned the task of detecting the corresponding object{\textquoteright}s bounding box arithmetic center. We have trained the model using three variants of the cross-entropy loss, which has been adapted to tackle the class imbalance caused by the fact that the center of an object is represented by only one pixel of the image, and have obtained some promising results.

}, author = {Escobar, Miquel}, editor = {Girbau, A. and Ventura, C. and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cPodlipnik21, title = {Personalized medicine in melanoma patients aided by artificial intelligence}, booktitle = {Clinical Translation of Medical Image Computing and Computer Assisted Interventions (CLINICCAI) Workshop at MICCAI}, year = {2021}, month = {09/2021}, abstract = {

The 8th Edition of the American Joint Committee on Cancer (AJCC) staging system1 is the current standard for classifying patients into prognostic and treatment groups. This classification is used to predict the evolution of the patient, and therefore the treatment actions provided to the individual. However, patients at the same stage behave differently, indicating that the current classification system is often insufficient to provide a customized prognosis for each patient2. It is, therefore, necessary to improve patient classification into prognostic groups. Furthermore, patients{\textquoteright} systemic and surgical treatments often involve significant toxicities and morbidities that impact their quality of life (i.e., sentinel node biopsy is not needed for 80\% of the melanoma patients, 50\% of patients do not benefit from adjuvant treatment)3. Therefore, melanoma patients should benefit from a more precise risk estimation.

We create a survival dataset for melanoma risk estimation and train survival XGBoost algorithms4 to predict mortality, relapse, and metastasis risk. We compare their performance to the AJCC 2018 risk stratification system. Furthermore, we train classifiers to predict the risk of a positive lymph node biopsy and distant metastasis in melanoma patients and compare the performance of the proposed system to the clinical practice.

}, author = {Sebastian Podlipnik and Carlos Hernandez and Anil Kiroglu and Sergio Garc{\'\i}a and Joan Ficapal and Julio Burgos and Neus Calbet and Susana Puig and Josep Malvehy and Ver{\'o}nica Vilaplana and Marc Combalia} } @conference {cNieto, title = {PiCoEDL: Discovery and Learning of Minecraft Navigation Goals from Pixels and Coordinates}, booktitle = {CVPR 2021 Embodied AI Workshop}, year = {2021}, month = {06/2021}, abstract = {

Defining a reward function in Reinforcement Learning (RL) is not always possible, or can be very costly. For this reason, there is great interest in training agents in a task-agnostic manner, making use of intrinsic motivations and unsupervised techniques. Due to the complexity of learning useful behaviours in pixel-based domains, the results obtained in RL are still far from the remarkable results obtained in domains such as computer vision and natural language processing. We hypothesize that RL agents will also benefit from unsupervised pre-training with no extrinsic rewards, analogously to how humans mostly learn, especially in the early stages of life. Our main contribution is the deployment of the Explore, Discover and Learn (EDL) paradigm for unsupervised learning to the pixel space. In particular, our work focuses on the MineRL environment, where the observation of the agent is represented by: (a) its spatial coordinates in the Minecraft virtual world, and (b) an image from an egocentric viewpoint.

}, author = {Nieto, Juan Jos{\'e} and Creus, Roger and Xavier Gir{\'o}-i-Nieto} } @conference {cCreus, title = {PixelEDL: Unsupervised Skill Discovery and Learning from Pixels}, booktitle = {CVPR 2021 Embodied AI Workshop}, year = {2021}, month = {06/2021}, abstract = {

We tackle embodied visual navigation in a task-agnostic set-up by putting the focus on the unsupervised discovery of skills (or options) that provide a good coverage of states. Our approach intersects with empowerment: we address the reward-free skill discovery and learning tasks to discover {\textquotedblleft}what{\textquotedblright} can be done in an environment and {\textquotedblleft}how{\textquotedblright}. For this reason, we adopt the existing Explore, Discover and Learn (EDL) paradigm, tested only in toy example mazes, and extend it to pixel-based state representations available for embodied AI agents.

}, author = {Creus, Roger and Nieto, Juan Jos{\'e} and Xavier Gir{\'o}-i-Nieto} } @conference {cMayoral21b, title = {Prediction of amyloid pathology in cognitively unimpaired individuals using structural MRI}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2021}, month = {07/2021}, author = {Irene Cumplido-Mayoral and Silvia Ingala and Luigi Lorenzini and Alle Meije Wink and Sven Haller and Jose Luis Molinuevo and Robin Wolz and Alessandro Palombit and Adam J Schwarz and Ga{\"e}l Chetelat and Pierre Payoux and Pablo Martinez-Lage and Giovanni Frisoni and Nick C Fox and Craig W Ritchie and Joanna M Wardlaw and Adam Waldman and Frederik Barkhof and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @article {aPuig-Sitjes21, title = {Real-time detection of overloads on the plasma-facing components of Wendelstein 7-X}, journal = {Applied sciences (Basel)}, volume = {11}, year = {2021}, month = {12/2021}, chapter = {1}, issn = {2076-3417}, doi = {10.3390/app112411969}, url = {http://hdl.handle.net/2117/361558}, author = {Puig-Sitjes, A. and Jakubowski, M. and Naujoks, D. and Gao, Y. and Drewelow, P. and Niemann, H. and Felinger, J. and Casas, J. and Salembier, P. and Clemente, R.} } @conference {cManas, title = {Seasonal Contrast: Unsupervised Pre-Training from Uncurated Remote Sensing Data}, booktitle = {International Conference in Computer Vision (ICCV)}, year = {2021}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Virtual}, abstract = {

Remote sensing and automatic earth monitoring are key to solve global-scale challenges such as disaster prevention, land use monitoring, or tackling climate change. Although there exist vast amounts of remote sensing data, most of it remains unlabeled and thus inaccessible for supervised learning algorithms. Transfer learning approaches can reduce the data requirements of deep learning algorithms. However, most of these methods are pre-trained on ImageNet and their generalization to remote sensing imagery is not guaranteed due to the domain gap. In this work, we propose Seasonal Contrast (SeCo), an effective pipeline to leverage unlabeled data for in-domain pre-training of remote sensing representations. The SeCo pipeline is composed of two parts. First, a principled procedure to gather large-scale, unlabeled and uncurated remote sensing datasets containing images from multiple Earth locations at different timestamps. Second, a self-supervised algorithm that takes advantage of time and position invariance to learn transferable representations for remote sensing applications. We empirically show that models trained with SeCo achieve better performance than their ImageNet pre-trained counterparts and state-of-the-art self-supervised learning methods on multiple downstream tasks. The datasets and models in SeCo will be made public to facilitate transfer learning and enable rapid progress in remote sensing applications.

}, url = {https://arxiv.org/abs/2103.16607}, author = {Ma{\~n}as,Oscar and Lacoste, Alexandre and Xavier Gir{\'o}-i-Nieto and Vazquez, David and Rodr{\'\i}guez, Pau} } @article {xGiro-i-Nieto21, title = {Sign Language Translation and Production Multimedia and Multimodal Challenges for All}, year = {2021}, abstract = {

Machine translation and computer vision have greatly benefited from the advances in deep learning. The large and diverse amounts of textual and visual data have been used to train neural networks, whether in a supervised or self-supervised manner. Nevertheless, the convergence of the two fields in sign language translation and production still poses multiple open challenges, such as low video resources, limitations in hand pose estimation, or 3D spatial grounding from poses. This talk will present these challenges and the How2Sign dataset recorded at CMU in collaboration with UPC, BSC, Gallaudet University and Facebook.

}, author = {Xavier Gir{\'o}-i-Nieto} } @article {aSalgueiroa, title = {Single-image super-resolution of Sentinel-2 low resolution bands with residual dense convolutional neural networks}, journal = {Remote Sensing}, volume = {13}, year = {2021}, month = {2021}, pages = {5007}, abstract = {

Sentinel-2 satellites have become one of the main resources for Earth observation images because they are free of charge, have great spatial coverage and a high temporal revisit. Sentinel-2 senses the same location providing different spatial resolutions as well as generating a multi-spectral image with 13 bands of 10, 20, and 60 m/pixel. In this work, we propose a single-image super-resolution model based on convolutional neural networks that enhances the low-resolution bands (20 m and 60 m) to reach the maximal resolution sensed (10 m) at the same time, whereas other approaches provide two independent models for each group of LR bands. Our proposed model, named Sen2-RDSR, is made up of Residual in Residual blocks that produce two final outputs at maximal resolution, one for the 20 m/pixel bands and the other for the 60 m/pixel bands. The training is done in two stages, first focusing on the 20 m bands and then on the 60 m bands. Experimental results using 6 quality metrics (RMSE, SRE, SAM, PSNR, SSIM, ERGAS) show that our model has superior performance compared to other state-of-the-art approaches, and it is very effective and suitable as a preliminary step for land and coastal applications, such as studies involving pixel-based classification for Land-Use-Land-Cover or the generation of vegetation indices.

}, author = {Luis Salgueiro and Javier Marcello and Ver{\'o}nica Vilaplana} } @conference {cPuig-Sitjes21, title = {Spatio-temporal Detection and Tracking of Thermal Events on the Plasma Facing Components of Wendelstein 7-X}, booktitle = {4th IAEA Technical Meeting on Fusion Data Processing, Validation and Analysis}, year = {2021}, month = {11/2021}, publisher = {iaea.org}, organization = {iaea.org}, address = {Shanghai (online)}, abstract = {

In steady-state fusion devices like Wendelstein 7-X (W7-X), the active control of heat loads is mandatory to attain long-plasma operation. An intelligent feedback control system that mitigates the risk of overheating is required to avoid a premature plasma termination by the safety system. To keep the plasma within the safe operational limits of the plasma facing components, the feedback control system must be informed of the ongoing thermal events and their evolution in time. Then it can take effective countermeasures to prevent the thermal events from reaching a critical point. These countermeasures may include reducing the heating power, changing the strike-line position or inducing detachment. With reaction times of the order of a hundred milliseconds, a fully automated real-time image analysis algorithm is required.
In this work, we present a spatio-temporal algorithm to detect, classify and track the thermal events observed by the thermography diagnostic on the plasma facing components of W7-X. The system detects and distinguishes between strike-lines and isolated hot spots, as well as leading edges. The segmentation of the strike-line is especially challenging at W7-X. Since W7-X is a 3-dimensional, helically-shaped stellarator equipped with 10 island divertors, the strike-lines have a complex heat load distribution with a high dynamic range. The use of morphological tools and, in particular, the use of the Max-tree transform allow us to segment the thermal events in a hierarchical way, preserving the inclusion relationship between different events, like hot spots and leading edges embedded in the strike-line structure. The thermal events are segmented for each frame and tracked over time in order to forecast their temporal evolution and to evaluate their risk. To this end, a spatio-temporal graph is built and spatio-temporal connected components are used to track the thermal events across the sequence frames. The spatio-temporal components in the graph are used to label the events in the sequence, preserving temporal coherence and minimizing discontinuities, solving splits and merges. Spatio-temporal descriptors are then generated for each event to assess their risk.
The algorithm was tested offline on the infrared data acquired during the last operation phase OP1.2 and the results are presented here. Further work will follow to accelerate the code with GPUs to reach real-time processing and be ready to protect the water-cooled plasma facing components in the forthcoming operation phase OP2.

}, url = {https://conferences.iaea.org/event/251/contributions/20704/}, author = {Puig-Sitjes, A. and Casas, J. and Salembier, P. and Vizcarro, D. and Clemente, R. and Jakubowski, M. and Gao, Y. and W7-X Team} } @mastersthesis {xOriola, title = {Species-agnostic Local Ancestry Inference on Genomic Data with Convolutions}, year = {2021}, abstract = {

Local Ancestry Inference (LAI) is the high resolution prediction of ancestry (African, European, ...) across a DNA sequence. LAI is becoming increasingly important in DNA sequence analysis for the study of human ancestry and migrations. It is also necessary for polygenic risk score research (prediction of traits and disease risk). Most current LAI models are built for a specific species, set of ancestries and chromosomes, hence a new model needs to be trained from scratch for every slightly different setting. This creates a big barrier for research and industry to shift across different LAI scenarios. In this thesis we present SALAI-Net, the first statistical method for LAI with a reference panel that can be used on any set of species and ancestries (species-agnostic). Loter is the state of the art in species-agnostic models with a reference panel, and is based on a dynamic programming algorithm. However, it is slow and does not perform very well in small reference panel settings. Our model is based on a novel hand-engineered template matching block followed by a convolutional smoothing filter optimized to minimize cross-entropy loss on a training dataset. The right choice of DNA sequence encoding, similarity features and architecture is what makes our model able to generalize well to unseen ancestries, species, and different chromosomes. We benchmark our models on whole genome data of humans and test their ability to generalize to dog species when trained on human data. Our models outperform the state-of-the-art method by a large margin in terms of accuracy, tested in different settings and datasets. Moreover, they are up to two orders of magnitude faster. Our model also shows close to no generalization gap when switching between species.

}, author = {Oriol, Benet}, editor = {Mas-Montserrat, Daniel and Ioannidis, Alexander G. and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dGirbau21, title = {Sports broadcasting and multiple object tracking with deep learning methods}, volume = {PhD}, year = {2021}, month = {03/2021}, type = {Industrial}, abstract = {

Less than a decade ago, deep learning techniques started to dominate many different fields, revolutionizing the possibilities of artificial intelligence. Seeing their potential, industrial sectors started to invest in applying such technologies as key components of the company strategy. This thesis has been developed in an industrial context, in AutomaticTV. The main focus along this period has been the transfer of knowledge and know-how between academia and industry, the development of tools to exploit this knowledge, the exploration of new techniques for future challenges, and, from an academic research perspective, contributions to the multiple object tracking problem.

The first part of the thesis is devoted to the introduction of deep learning technologies to AutomaticTV, a company dedicated to automatic sports analysis and broadcasting, and the development of tools and tasks that surround the application.

The second part of this thesis introduces the contributions to the multiple object tracking challenge. We present TrajE, a trajectory estimator based on mixture density networks and beam search, used to boost the performance of existing multiple object trackers, and introduce an occlusion reconstruction step using the estimated trajectory information. By adding TrajE to an existing multiple object tracker, we boost its performance by 6.3 and 1.8 points in MOTA and IDF1 scores respectively, becoming the new state of the art in the MOTChallenge dataset.

}, author = {Girbau, A.}, editor = {Rius, Ignasi and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cKazakos, title = {SynthRef: Generation of Synthetic Referring Expressions for Object Segmentation}, booktitle = {NAACL Visually Grounded Interaction and Language (ViGIL) Workshop}, year = {2021}, month = {06/2021}, address = {Virtual}, abstract = {

Recent advances in deep learning have brought significant progress in visual grounding tasks such as language-guided video object segmentation. However, collecting large datasets for these tasks is expensive in terms of annotation time, which represents a bottleneck. To this end, in our work we propose a novel method, namely SynthRef, for generating synthetic referring expressions for an image (or video frame), and we also present and disseminate the first large-scale dataset with synthetic referring expressions for video object segmentation. Our experiments demonstrate that by training with our synthetic dataset one can improve the ability of a model to generalize across different datasets, without any additional annotation cost. Moreover, our formulation allows its application to any object detection or segmentation dataset.

}, author = {Kazakos, Ioannis and Bellver-Bueno, M{\'\i}riam and Ventura, C. and Silberer, Carina and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xIturralde, title = {Towards video alignment across cameras with sign language 2D poses}, year = {2021}, abstract = {

This thesis is part of a project from the Image Group at UPC focused on sign language translation using deep learning technologies. It builds on top of an existing database called How2Sign, which contains more than 83 hours of sign language translation videos. This database has textual annotations aligned to a front RGB camera. The same scenes are also captured by a side RGB camera and a front RGB-D camera. These three cameras are not synchronized, so it is necessary to align the segments annotated on the front RGB camera to the other cameras. This thesis explores a solution based on the cross-correlation operator. Our work processes the coordinates of the joints of the subject appearing in the videos, rather than operating on pixels as in classic image or video processing. The first part of this thesis investigates the properties of the cross-correlation function by locating short video segments of a long recording based on automatically extracted 2D human poses, with experiments studying the impact of adding noise. The second part applies cross-correlation to align two videos of the same scene recorded with different cameras from different points of view.

}, author = {Andrea Iturralde}, editor = {Laia Tarr{\'e}s and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {dGeleta21, title = {Unsupervised learning with applications in genomics}, volume = {BSc Data Science Engineering}, year = {2021}, abstract = {

In this study we show the power of variational autoencoders (VAEs) for a variety of tasks relating to the interpretation and compression of genomic data. The unsupervised setting allows for the detection and learning of granular population structure and the inference of new informative latent factors, opening up an avenue for applications in dimensionality reduction, data simulation, population classification, imputation, and lossless genomic data compression. The latent spaces of VAEs are able to capture and represent clearly differentiated Gaussian-like clusters of similar genetic composition on a fine-scale with a relatively small number of Single Nucleotide Polymorphisms (SNPs) as input. Furthermore, sequences can be decomposed into latent representations and reconstruction errors (residuals), yielding a sparse representation that enables efficient lossless compression.

Identifying genetic clusters can be important when performing genome-wide association studies and provides an alternative to self-reported ethnic labels, which are culturally constructed and vary according to the location and individual. A variety of unsupervised dimensionality reduction methods have been explored in the past for such applications, including PCA, MDS, t-SNE, and UMAP. Our proposed VAE can represent the population structure as a Gaussian-distributed continuous multi-dimensional representation and as classification probabilities providing flexible and interpretable population descriptors.

We train our VAE method with several worldwide whole genome datasets from both humans and canids, and evaluate the performance of the different proposed applications with networks with and without ancestry conditioning. Our experiments show that different population groups have significantly differentiated compression ratios and classification accuracies. Additionally, we analyze the entropy of the SNP data, noting its effect on compression across populations and connect these patterns to historical migrations and ancestral relationships.

}, author = {Geleta, Margarita}, editor = {Mas-Montserrat, Daniel and Ioannidis, Alexander G. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xCreus, title = {Unsupervised skill learning from pixels}, year = {2021}, abstract = {

This work focuses on the self-acquisition of the fundamental task-agnostic knowledge available within an environment. The aim is to discover and learn baseline representations and behaviours that can later be useful for solving embodied visual navigation downstream tasks. Specifically, the presented approach extends the idea of the "Explore, Discover and Learn" (EDL) paradigm to the pixel domain. This way, this work is centered on the representations and behaviours that can be learnt by an agent that only integrates an image capture sensor. Both the agents and the environment used in this work run on the Habitat AI simulator, which is developed by Facebook AI and renders photorealistic 3D views of the insides of apartments.

}, author = {Creus, Roger}, editor = {Nieto, Juan Jos{\'e} and Xavier Gir{\'o}-i-Nieto} } @conference {cNietoa, title = {Unsupervised Skill-Discovery and Skill-Learning in Minecraft}, booktitle = {ICML 2021 Workshop on Unsupervised Reinforcement Learning (URL)}, year = {2021}, month = {07/2021}, abstract = {

Pre-training Reinforcement Learning agents in a task-agnostic manner has shown promising results. However, previous works still struggle in learning and discovering meaningful skills in high-dimensional state-spaces, such as pixel-spaces. We approach the problem by leveraging unsupervised skill discovery and self-supervised learning of state representations. In our work, we learn a compact latent representation by making use of variational and contrastive techniques. We demonstrate that both enable RL agents to learn a set of basic navigation skills by maximizing an information theoretic objective. We assess our method in Minecraft 3D pixel maps with different complexities. Our results show that representations and conditioned policies learned from pixels are enough for toy examples, but do not scale to realistic and complex maps. To overcome these limitations, we explore alternative input observations such as the relative position of the agent along with the raw pixels.

}, author = {Nieto, Juan Jos{\'e} and Creus, Roger and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xGraneroa, title = {2D to 3D body pose estimation for sign language with Deep Learning}, year = {2020}, abstract = {

This project aims at addressing the challenge of using 3D poses for Sign Language translation or animation by transforming 2D pose datasets into 3D ones. The goal is, using a 3D dataset of American Sign Language, to train a deep neural network that predicts the depth coordinates of the skeleton keypoints from 2D coordinates. Specifically, a Long Short-Term Memory network, an architecture broadly used for sequence-to-sequence tasks, is explored. The conclusions of this report are that, despite some of the results being good enough to be used for actual 3D SL annotation, the majority of them lack the precision to do so, and they vary too much with respect to the dataset split. It is also concluded that the solutions approached here could be improved by adding regularization methods, more powerful hardware to run better experiments, and new input features such as keypoint visibility.

}, author = {P{\'e}rez-Granero, Pol}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xEscur, title = {Attention-based multi-view 3D reconstruction models}, year = {2020}, abstract = {

The attention mechanism has been successfully used in multiple tasks in the fields of Computer Vision and Natural Language Processing, but has never been applied to 3D reconstruction problems. In this work, we explore the potential of attention in a multi-view 3D face reconstruction pipeline. On one hand, we use spatial attention when extracting the features of the input images, taking advantage of the interpretability it provides us. This allows us to validate the proper behaviour of the model. On the other hand, we want to make this multi-view setup invariant to the order of the input views. To do so, instead of concatenating the features of the different views, we use part of the Transformer architecture as a symmetric merging function, which is based on a multi-head self-attention mechanism, showing an improvement in performance.

}, author = {Escur, Janna}, editor = {Ramon, Eduard and Xavier Gir{\'o}-i-Nieto} } @conference {xCaros19, title = {Automatic Reminiscence Therapy for Dementia}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)}, year = {2020}, month = {06/2020}, publisher = {ACM}, organization = {ACM}, address = {Dublin, Ireland}, abstract = {

With people living longer than ever, the number of cases of dementia, such as Alzheimer{\textquoteright}s disease, increases steadily. Dementia affects more than 46 million people worldwide, and it is estimated that in 2050 more than 100 million will be affected. While there are no effective treatments for these terminal diseases, therapies such as reminiscence, which stimulates memories from the past, are recommended. Currently, reminiscence therapy takes place in care homes and is guided by a therapist or a carer. In this work, we present an AI-based solution to automatize reminiscence therapy, which consists of a dialogue system that uses photos as input to generate questions. We ran a usability case study with patients diagnosed with mild cognitive impairment that shows they found the system very entertaining and challenging. Overall, this paper presents how reminiscence therapy can be automatized by using machine learning and deployed to smartphones and laptops, making the therapy more accessible to every person affected by dementia. (demo paper)

}, doi = {https://doi.org/10.1145/3372278.3391927}, url = {https://arxiv.org/abs/1910.11949}, author = {Caros, Mariona and Garolera, Maite and Radeva, Petia and Xavier Gir{\'o}-i-Nieto} } @conference {cMora20, title = {Brain Tumor Segmentation using 3D-CNNs with Uncertainty Estimation}, booktitle = {MICCAI 2020 - Brain Lesion Workshop (BrainLes), Multimodal Brain Tumor Segmentation Challenge (BRATS)}, year = {2020}, month = {10/2020}, abstract = {

Automation of brain tumor segmentation in 3D magnetic resonance images (MRIs) is key to assessing the diagnosis and treatment of the disease. In recent years, convolutional neural networks (CNNs) have shown improved results in the task. However, high memory consumption is still a problem in 3D-CNNs. Moreover, most methods do not include uncertainty information, which is especially critical in medical diagnosis. This work proposes a 3D encoder-decoder architecture, based on V-Net, which is trained with patch-based techniques to reduce memory consumption and decrease the effect of unbalanced data. We also introduce voxel-wise uncertainty, both epistemic and aleatoric, using test-time dropout and data augmentation, respectively. Uncertainty maps can provide extra information to expert neurologists, useful for detecting when the model is not confident in the provided segmentation.

}, author = {Laura Mora and Ver{\'o}nica Vilaplana} } @conference {cVentura20, title = {Can Everybody Sign Now? Exploring Sign Language Video Generation from 2D Poses}, booktitle = {ECCV 2020 Workshop on Sign Language recognition, Production and Translation (SLRTP)}, year = {2020}, month = {08/2020}, abstract = {

Recent work has addressed the generation of human poses represented by 2D/3D coordinates of human joints for sign language. We use the state of the art in Deep Learning for motion transfer and evaluate it on How2Sign, an American Sign Language dataset, to generate videos of signers performing sign language given a 2D pose skeleton. We evaluate the generated videos quantitatively and qualitatively, showing that the current models are not enough to generate adequate videos for Sign Language due to the lack of detail in the hands.

}, author = {Lucas Ventura and Amanda Duarte and Xavier Gir{\'o}-i-Nieto} } @conference {cGonzalez-i-Calabuig, title = {Curriculum Learning for Recurrent Video Object Segmentation}, booktitle = {ECCV 2020 Women in Computer Vision Workshop}, year = {2020}, month = {08/2020}, abstract = {

Video object segmentation can be understood as a sequence-to-sequence task that can benefit from curriculum learning strategies for better and faster training of deep neural networks. This work explores different schedule sampling and frame skipping variations to significantly improve the performance of a recurrent architecture. Our results on the car class of the KITTI-MOTS challenge indicate that, surprisingly, an inverse schedule sampling is a better option than a classic forward one, and that a progressive skipping of frames during training is beneficial, but only when training with the ground truth masks instead of the predicted ones.

}, author = {Gonzalez-i-Calabuig, Maria and Ventura, C. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xGonzalez-i-Calabuig20, title = {Curriculum Learning for Recurrent Video Object Segmentation}, year = {2020}, abstract = {

Video object segmentation (VOS) is a computer vision task that aims at determining the pixels of an object of interest along a video sequence. This thesis explores different curriculum learning strategies for a deep neural network trained to solve this task.

Curriculum learning defines a methodology where the training data are not randomly presented to the model; instead, they are organized in a meaningful way. Simple concepts are presented first and gradually become more complex. Four different curriculum strategies are explored: schedule sampling, frame skipping, the effect of temporal and spatial recurrence variations, and loss penalization by the object{\textquoteright}s area.

This work focuses on the RVOS neural architecture, a recurrent architecture originally tested on the DAVIS and YouTube-VOS datasets for one-shot video object segmentation, applied over the car class of the KITTI-MOTS dataset. Even though this architecture is a fast solution for the VOS task, the model struggles with the KITTI-MOTS dataset, whose videos are more crowded and challenging.

For the schedule sampling curriculum, both the classic and inverse implementations are evaluated. Results show how inverse schedule sampling strategies improve the model{\textquoteright}s performance over the classic forward approach. The different frame skipping schemes are also beneficial, but only when training with the ground truth masks instead of the predicted ones. Lastly, the curricula that vary the temporal and spatial recurrence or penalize the loss by the object{\textquoteright}s area have both shown poor performance.

These results show how curriculum learning strategies greatly affect the performance of recurrent neural networks. Moreover, the results on the inverse schedule sampling and frame skipping strategies invite further exploration of these schemes to exploit their benefits.

}, author = {Gonzalez-i-Calabuig, Maria}, editor = {Xavier Gir{\'o}-i-Nieto and Ventura, C.} } @phdthesis {dCampos20, title = {Deep Learning that Scales: Leveraging Compute and Data}, year = {2020}, month = {12/2020}, school = {Universitat Polit{\`e}cnica de Catalunya}, address = {Barcelona, Catalonia}, abstract = {

Deep learning has revolutionized the field of artificial intelligence in the past decade. Although the development of these techniques spans over several years, the recent advent of deep learning is explained by an increased availability of data and compute that have unlocked the potential of deep neural networks. They have become ubiquitous in domains such as natural language processing, computer vision, speech processing, and control, where enough training data is available. Recent years have seen continuous progress driven by ever-growing neural networks that benefited from large amounts of data and computing power. This thesis is motivated by the observation that scale is one of the key factors driving progress in deep learning research, and aims at devising deep learning methods that scale gracefully with the available data and compute. We narrow down this scope into two main research directions. The first of them is concerned with designing hardware-aware methods which can make the most of the computing resources in current high performance computing facilities. We then study bottlenecks preventing existing methods from scaling up as more data becomes available, providing solutions that contribute towards enabling training of more complex models. This dissertation studies the aforementioned research questions for two different learning paradigms, each with its own algorithmic and computational characteristics. The first part of this thesis studies the paradigm where the model needs to learn from a collection of examples, extracting as much information as possible from the given data. The second part is concerned with training agents that learn by interacting with a simulated environment, which introduces unique challenges such as efficient exploration and simulation.

}, url = { http://hdl.handle.net/10803/670372}, author = {V{\'\i}ctor Campos}, editor = {Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @article {xGiro-i-Nieto20, title = {Deep Self-Supervised Learning for All}, year = {2020}, abstract = {

Deep neural networks have achieved outstanding results in various applications such as vision, language, audio, speech, or reinforcement learning. These powerful function approximators typically require large amounts of data to be trained, which poses a challenge in the usual case where little labeled data is available. During the last year, multiple solutions have been proposed to alleviate this problem, based on the concept of self-supervised learning, which can be understood as a specific case of unsupervised learning. This talk will cover its basic principles and provide examples in the field of multimedia.

}, url = {https://deep-self-supervised-learning.carrd.co/}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cFernandezf, title = {Enhancing Online Knowledge Graph Population with Semantic Knowledge}, booktitle = {19th International Semantic Web Conference (ISWC)}, year = {2020}, month = {11/2020}, address = {Virtual}, abstract = {

Knowledge Graphs (KG) are becoming essential to organize, represent and store the world{\textquoteright}s knowledge, but they still rely heavily on humanly-curated structured data. Information Extraction (IE) tasks, like disambiguating entities and relations from unstructured text, are key to automate KG population. However, Natural Language Processing (NLP) methods alone cannot guarantee the validity of the facts extracted and may introduce erroneous information into the KG. This work presents an end-to-end system that combines Semantic Knowledge and Validation techniques with NLP methods to provide KG population of novel facts from clustered news events. The contributions of this paper are two-fold: First, we present a novel method for including entity-type knowledge into a Relation Extraction model, improving F1-Score over the baseline on the TACRED and TypeRE datasets. Second, we increase the precision by adding data validation on top of the Relation Extraction method. These two contributions are combined in an industrial pipeline for automatic KG population over aggregated news, demonstrating increased data validity when performing online learning from unstructured web data. Finally, the TypeRE and AggregatedNewsRE datasets, built to benchmark these results, are also published to foster future research in this field.

}, keywords = {Data Validation, Knowledge Graph, Relation Extraction}, author = {Fern{\`a}ndez, D{\`e}lia and Rimmek, Joan Marco and Espadaler, Joan and Garolera, Blai and Barja, Adri{\`a} and Codina, Marc and Sastre, Marc and Xavier Gir{\'o}-i-Nieto and Riveiro, Juan Carlos and Bou-Balust, Elisenda} } @conference {cCamposb, title = {Explore, Discover and Learn: Unsupervised Discovery of State-Covering Skills}, booktitle = {International Conference on Machine Learning (ICML) 2020}, year = {2020}, month = {07/2020}, abstract = {

Acquiring abilities in the absence of a task-oriented reward function is at the frontier of reinforcement learning research. This problem has been studied through the lens of empowerment, which draws a connection between option discovery and information theory. Information-theoretic skill discovery methods have garnered much interest from the community, but little research has been conducted in understanding their limitations. Through theoretical analysis and empirical evidence, we show that existing algorithms suffer from a common limitation -- they discover options that provide a poor coverage of the state space. In light of this, we propose {\textquoteright}Explore, Discover and Learn{\textquoteright} (EDL), an alternative approach to information-theoretic skill discovery. Crucially, EDL optimizes the same information-theoretic objective derived from the empowerment literature, but addresses the optimization problem using different machinery. We perform an extensive evaluation of skill discovery methods on controlled environments and show that EDL offers significant advantages, such as overcoming the coverage problem, reducing the dependence of learned skills on the initial state, and allowing the user to define a prior over which behaviors should be learned.

}, author = {V{\'\i}ctor Campos and Trott, Alexander and Xiong, Caiming and Socher, Richard and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @article {aGene-Molac, title = {Fruit detection and 3D location using instance segmentation neural networks and structure-from-motion photogrammetry}, journal = {Computers and Electronics in Agriculture}, volume = {169}, year = {2020}, month = {02/2020}, abstract = {

The development of remote fruit detection systems able to identify and 3D locate fruits provides opportunities to improve the efficiency of agriculture management. Most of the current fruit detection systems are based on 2D image analysis. Although the use of 3D sensors is emerging, precise 3D fruit location is still a pending issue. This work presents a new methodology for fruit detection and 3D location consisting of: (1) 2D fruit detection and segmentation using the Mask R-CNN instance segmentation neural network; (2) 3D point cloud generation of detected apples using structure-from-motion (SfM) photogrammetry; (3) projection of 2D image detections onto 3D space; (4) false positive removal using a trained support vector machine. This methodology was tested on 11 Fuji apple trees containing a total of 1455 apples. Results showed that, by combining instance segmentation with SfM, the system performance increased from an F1-score of 0.816 (2D fruit detection) to 0.881 (3D fruit detection and location) with respect to the total amount of fruits. The main advantages of this methodology are the reduced number of false positives and the higher detection rate, while the main disadvantage is the high processing time required for SfM, which makes it presently unsuitable for real-time work. From these results, it can be concluded that the combination of instance segmentation and SfM provides high performance fruit detection with high 3D data precision. The dataset has been made publicly available and an interactive visualization of fruit detection results is accessible at http://www.grap.udl.cat/documents/photogrammetry_fruit_detection.html

}, keywords = {Fruit detection, Fruit location, Mask R-CNN, Structure-from-motion, Terrestrial remote sensing}, issn = {ISSN: 0168-1699}, doi = {https://doi.org/10.1016/j.compag.2019.105165}, url = {https://doi.org/10.1016/j.compag.2019.105165}, author = {Gen{\'e}-Mola, Jordi and Sanz, Ricardo and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Gregorio, Eduard} } @article {aRey-Arena20, title = {FuCiTNet: Improving the generalization of deep learning networks by the fusion of learned class-inherent transformations}, journal = {Information Fusion}, volume = {63}, year = {2020}, month = {10/2020}, chapter = {188}, abstract = {

It is widely known that very small datasets produce overfitting in Deep Neural Networks (DNNs), i.e., the network becomes highly biased to the data it has been trained on. This issue is often alleviated using transfer learning, regularization techniques and/or data augmentation. This work presents a new approach, independent of but complementary to the previously mentioned techniques, for improving the generalization of DNNs on very small datasets in which the involved classes share many visual features. The proposed model, called FuCiTNet (Fusion Class inherent Transformations Network), inspired by GANs, creates as many generators as classes in the problem. Each generator, k, learns the transformations that bring the input image into the k-class domain. We introduce a classification loss in the generators to drive the learning of specific k-class transformations. Our experiments demonstrate that the proposed transformations improve the generalization of the classification model in three diverse datasets.
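The core idea of per-class generators driven by a classification loss can be sketched as follows. This is an assumed form for illustration (the residual generator, layer sizes and the shared classifier are hypothetical; the published model also fuses the transformed outputs):

import torch
import torch.nn as nn

# Generator k learns a transformation that brings the input into the
# k-class domain; a shared classifier and a cross-entropy loss on target
# class k drive that learning.
class ClassGenerator(nn.Module):
    def __init__(self, channels=3):
        super().__init__()
        self.net = nn.Sequential(nn.Conv2d(channels, 16, 3, padding=1), nn.ReLU(),
                                 nn.Conv2d(16, channels, 3, padding=1))

    def forward(self, x):
        return x + self.net(x)  # residual transformation

def class_transformation_loss(generators, classifier, x, y):
    ce = nn.CrossEntropyLoss()
    loss = 0.0
    for k, gen in enumerate(generators):
        logits = classifier(gen(x))
        target = torch.full_like(y, k)  # push generator k towards class k
        loss = loss + ce(logits, target)
    return loss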

}, doi = {10.1016/j.inffus.2020.06.015}, author = {Rey-Arena, M. and Guirado, E. and Tabik, S. and Ruiz-Hidalgo, J.} } @article {aGene-Mola20, title = {Fuji-SfM dataset: A collection of annotated images and point clouds for Fuji apple detection and location using structure-from-motion photogrammetry}, journal = {Data in Brief}, year = {2020}, month = {06/2020}, keywords = {Fruit detection, Mask R-CNN, Photogrammetry, Structure-from-motion, Terrestrial remote sensing, Yield mapping, Yield prediction}, doi = {https://doi.org/10.1016/j.dib.2020.105591}, author = {Gen{\'e}-Mola, Jordi and Sanz, Ricardo and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Gregorio, Eduard} } @mastersthesis {xKazakos20, title = {Generation of Synthetic Referring Expressions for Object Segmentation in Videos}, year = {2020}, abstract = {

Integrating computer vision with natural language processing has achieved significant progress over the last years owing to the continuous evolution of deep learning. A novel vision and language task, which is tackled in the present Master thesis is referring video object segmentation, in which a language query defines which instance to segment from a video sequence. One of the biggest challenges for this task is the lack of relatively large annotated datasets since a tremendous amount of time and human effort is required for annotation. Moreover, existing datasets suffer from poor quality annotations in the sense that approximately one out of ten language expressions fails to uniquely describe the target object.

The purpose of the present Master thesis is to address these challenges by proposing a novel method for generating synthetic referring expressions for an image (video frame). This method produces synthetic referring expressions by using only the ground-truth annotations of the objects as well as their attributes, which are detected by a state-of-the-art object detection deep neural network. One of the advantages of the proposed method is that its formulation allows its application to any object detection or segmentation dataset.

By using the proposed method, the first large-scale dataset with synthetic referring expressions for video object segmentation is created, based on an existing large benchmark dataset for video instance segmentation. A statistical analysis and comparison of the created synthetic dataset with existing ones is also provided in the present Master thesis.

The conducted experiments on three different datasets used for referring video object segmentation prove the efficiency of the generated synthetic data. More specifically, the obtained results demonstrate that by pre-training a deep neural network with the proposed synthetic dataset one can improve the ability of the network to generalize across different datasets, without any additional annotation cost.

}, author = {Kazakos, Ioannis}, editor = {Xavier Gir{\'o}-i-Nieto} } @article {aGarcia-Gomez20, title = {Geometric Model and Calibration Method for a Solid-State LiDAR}, journal = {Sensors}, volume = {20}, year = {2020}, month = {05/2020}, pages = {2898}, abstract = {

This paper presents a novel calibration method for solid-state LiDAR devices based on a geometrical description of their scanning system, which has variable angular resolution. Determining this distortion across the entire Field-of-View of the system yields accurate and precise measurements which enable it to be combined with other sensors. On the one hand, the geometrical model is formulated using the well-known Snell{\textquoteright}s law and the intrinsic optical assembly of the system, whereas on the other hand the proposed method describes the scanned scenario with an intuitive camera-like approach relating pixel locations with scanning directions. Simulations and experimental results show that the model fits with real devices and the calibration procedure accurately maps their varying resolution so that undistorted representations of the observed scenario can be provided. Thus, the calibration method proposed in this work is applicable to and valid for existing scanning systems, improving their precision and accuracy by an order of magnitude.
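At the heart of the geometrical model is Snell's law, n1 sin(theta1) = n2 sin(theta2). A minimal numeric helper illustrates the relation (the paper's model additionally encodes the scanner's intrinsic optical assembly, which is not reproduced here):

import math

def refracted_angle(n1, n2, theta1):
    # Snell's law: n1*sin(theta1) = n2*sin(theta2); angles in radians.
    s = n1 * math.sin(theta1) / n2
    if abs(s) > 1.0:
        raise ValueError("total internal reflection")
    return math.asin(s)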

}, keywords = {solid-state LiDAR; LiDAR calibration; distortion correction; FOV mapping}, issn = {1424-8220}, doi = {10.3390/s20102898}, url = {https://www.mdpi.com/1424-8220/20/10/2898}, author = {Pablo Garc{\'\i}a-G{\'o}mez and S. Royo and Noel Rodrigo and Casas, J.} } @article {9103248, title = {Grounded Sequence to Sequence Transduction}, journal = {IEEE Journal of Selected Topics in Signal Processing}, volume = {14}, year = {2020}, month = {05/2020}, pages = {577-591}, abstract = {

Speech recognition and machine translation have made major progress over the past decades, providing practical systems to map one language sequence to another. Although multiple modalities such as sound and video are becoming increasingly available, the state-of-the-art systems are inherently unimodal, in the sense that they take a single modality {\textemdash} either speech or text {\textemdash} as input. Evidence from human learning suggests that additional modalities can provide disambiguating signals crucial for many language tasks. In this article, we describe the How2 dataset, a large, open-domain collection of videos with transcriptions and their translations. We then show how this single dataset can be used to develop systems for a variety of language tasks and present a number of models meant as starting points. Across tasks, we find that building multimodal architectures that perform better than their unimodal counterpart remains a challenge. This leaves plenty of room for the exploration of more advanced solutions that fully exploit the multimodal nature of the How2 dataset, and the general direction of multimodal learning with other datasets as well.

}, author = {L. Specia and L. Barrault and O. Caglayan and Amanda Duarte and D. Elliott and S. Gella and N. Holzenberger and C. Lala and S. J. Lee and J. Libovicky and P. Madhyastha and F. Metze and K. Mulligan and A. Ostapenko and S. Palaskar and R. Sanabria and J. Wang and R. Arora} } @article {xGiro-i-Nieto20b, title = {Image and Video Object Segmentation with Low Supervision}, year = {2020}, abstract = {

Invited talk at VI-Lab, University of Bristol (November 3rd, 2020 - 2pm)

Image and video segmentation are central tasks within the computer vision field. Nevertheless, deep learning solutions for segmentation typically rely on pixel-level annotations, which are very costly to collect. Likewise, some segmentation systems require human interaction at inference time, which involves some effort for the end-user. In this talk, we look into diverse supervision scenarios for image and video object segmentation. We discern between supervision when learning the model, i.e., which type of annotations are used during training, and supervision at inference, namely which kind of human input is required when running the system. We target models that require low forms of supervision.

In the first part of the talk we present recurrent architectures for image and video object segmentation that are end-to-end trainable in a fully-supervised setup, and that do not require any post-processing step, i.e., the output of the model directly solves the addressed task. The second part of the talk aims at lowering the annotation cost, in terms of labelling time, needed to train image segmentation models. We explore semi-supervised pipelines and show results when a very limited budget is available. The third part of the session attempts to alleviate the supervision required by semi-automatic systems at inference time. Particularly, we focus on semi-supervised video object segmentation, which typically requires generating a binary mask for each instance to be tracked. In contrast, we present a model for language-guided video object segmentation, which identifies the object to segment with a natural language expression. We study current benchmarks, propose a novel categorization of referring expressions for video, and present a method to generate synthetic referring expressions.

}, author = {Xavier Gir{\'o}-i-Nieto} } @article {xGiro-i-Nieto20a, title = {Image Segmentation with Deep Learning}, year = {2020}, abstract = {

Image segmentation is a classic computer vision task that aims at labeling pixels with semantic classes. These slides provide an overview of the basic approaches applied from the deep learning field to tackle this challenge and present the basic subtasks (semantic, instance and panoptic segmentation) and related datasets.

Presented at the International Summer School on Deep Learning (ISSonDL) 2020, held online and organized by the University of Gdansk (Poland) between 30 August and 2 September.

}, author = {Xavier Gir{\'o}-i-Nieto and Ventura, C.} } @mastersthesis {xMuschik20, title = {Learn2Sign : sign language recognition and translation using human keypoint estimation and transformer model}, year = {2020}, abstract = {

Sign language recognition and translation has been an active research field in recent years, with most approaches using deep neural networks to extract information from sign language data. This work investigates the mostly disregarded approach of using human keypoint estimation from image and video data with OpenPose in combination with a transformer network architecture. Firstly, it was shown that it is possible to recognize individual signs (4.5\% word error rate (WER)). Continuous sign language recognition, however, was more error-prone (77.3\% WER), and sign language translation was not possible using the proposed methods, which might be due to the low accuracy of the human keypoint estimation by OpenPose and the accompanying loss of information, or to insufficient capacity of the transformer model used. Results may improve with the use of datasets containing higher repetition rates of individual signs or focusing more precisely on keypoint extraction of hands.
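The WER figures quoted above follow the standard definition, (substitutions + deletions + insertions) divided by the reference length, computed by Levenshtein alignment over words. A generic implementation (independent of the thesis code):

def word_error_rate(reference, hypothesis):
    r, h = reference.split(), hypothesis.split()
    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
    for i in range(len(r) + 1):
        d[i][0] = i
    for j in range(len(h) + 1):
        d[0][j] = j
    for i in range(1, len(r) + 1):
        for j in range(1, len(h) + 1):
            cost = 0 if r[i - 1] == h[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,          # deletion
                          d[i][j - 1] + 1,          # insertion
                          d[i - 1][j - 1] + cost)   # substitution
    return d[len(r)][len(h)] / max(len(r), 1)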

}, doi = {10.18419/opus-11197}, url = {https://elib.uni-stuttgart.de/handle/11682/11214}, author = {Muschik, Peter}, editor = {A. Duarte and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dPujol-Miro20, title = {Learning to extract features for 2D-3D multimodal registration}, year = {2020}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

The ability to capture depth information from a scene has greatly increased in recent years. 3D sensors, traditionally high-cost and low-resolution devices, are being democratized, and 3D scans of indoor and outdoor scenes are becoming more and more common.

However, there is still a great data gap between the number of captures performed with 2D and 3D sensors. Although 3D sensors provide more information about the scene, 2D sensors are still more accessible and widely used. This trade-off between availability and information brings us to a multimodal scenario of mixed 2D and 3D data.

This thesis explores the fundamental block of this multimodal scenario: the registration between a single 2D image and a single unorganized point cloud. An unorganized 3D point cloud is the basic representation of a 3D capture. In this representation the surveyed points are represented only by their real-world coordinates and, optionally, by their colour information. This simplistic representation brings multiple challenges to the registration, since most state-of-the-art works leverage the existence of metadata about the scene or prior knowledge.

Two different techniques are explored to perform the registration: a keypoint-based technique and an edge-based technique. The keypoint-based technique estimates the transformation by means of correspondences detected using Deep Learning, whilst the edge-based technique refines a transformation using a multimodal edge detection to establish anchor points to perform the estimation.

An extensive evaluation of the proposed methodologies is performed. Although further research is needed to achieve adequate performance, the obtained results show the potential of using deep learning techniques to learn 2D and 3D similarities. The results also show the good performance of the proposed 2D-3D iterative refinement, which is on par with the state of the art on 3D-3D registration.

}, url = {http://hdl.handle.net/2117/330132}, author = {A. Pujol-Mir{\'o}}, editor = {Casas, J. and Ruiz-Hidalgo, J.} } @article {aBellver, title = {Mask-guided sample selection for Semi-Supervised Instance Segmentation}, journal = {Multimedia Tools and Applications}, year = {2020}, month = {07/2020}, abstract = {

Image segmentation methods are usually trained with pixel-level annotations, which require significant human effort to collect. The most common solution to address this constraint is to implement weakly-supervised pipelines trained with lower forms of supervision, such as bounding boxes or scribbles. Another option is semi-supervised methods, which leverage a large amount of unlabeled data and a limited number of strongly-labeled samples. In this second setup, samples to be strongly-annotated can be selected randomly or with an active learning mechanism that chooses the ones that will maximize the model performance. In this work, we propose a sample selection approach to decide which samples to annotate for semi-supervised instance segmentation. Our method consists of first predicting pseudo-masks for the unlabeled pool of samples, together with a score predicting the quality of each mask. This score is an estimate of the Intersection over Union (IoU) of the segment with the ground-truth mask. We study which samples are better to annotate given the quality score, and show how our approach outperforms a random selection, leading to improved performance for semi-supervised instance segmentation with low annotation budgets.
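The selection step can be sketched as follows; this is an illustrative reduction (the names and the choice of which end of the ranking to annotate are assumptions, since studying that choice is precisely the contribution of the paper):

def select_for_annotation(samples, quality_scores, budget, strategy="hardest"):
    # Rank the unlabeled pool by the predicted IoU of its pseudo-masks and
    # pick which samples to send to human annotators, given a budget.
    ranked = sorted(zip(samples, quality_scores), key=lambda p: p[1])
    if strategy == "hardest":    # annotate samples with the worst pseudo-masks
        chosen = ranked[:budget]
    else:                        # "easiest": annotate the most confident ones
        chosen = ranked[-budget:]
    return [s for s, _ in chosen]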

}, doi = {10.1007/s11042-020-09235-4}, url = {http://link.springer.com/article/10.1007/s11042-020-09235-4}, author = {M{\'\i}riam Bellver and Amaia Salvador and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @article {aCasamitjanab, title = {NeAT: a nonlinear analysis toolbox for neuroimaging}, journal = {Neuroinformatics}, year = {2020}, month = {03/2020}, abstract = {

NeAT is a modular, flexible and user-friendly neuroimaging analysis toolbox for modeling linear and nonlinear effects, overcoming the limitations of standard neuroimaging methods, which are solely based on linear models. NeAT provides a wide range of statistical and machine learning nonlinear methods for model estimation, several metrics based on curve fitting and complexity for model inference, and a graphical user interface (GUI) for visualization of results. We illustrate its usefulness on two case studies where nonlinear effects have been previously established. Firstly, we study the nonlinear effects of Alzheimer{\textquoteright}s disease on brain morphology (volume and cortical thickness). Secondly, we analyze the effect of the apolipoprotein APOE-ε4 genotype on brain aging and its interaction with age. NeAT is fully documented and publicly distributed at https://imatge-upc.github.io/neat-tool/.

}, keywords = {Alzheimer{\textquoteright}s disease, APOE, GAM, GLM, inference, neuroimaging, nonlinear, SVR}, doi = {10.1007/s12021-020-09456-w}, url = {https://link.springer.com/article/10.1007/s12021-020-09456-w}, author = {Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and Santi Puch and Asier Aduriz and Carlos Lopez and G. Operto and R. Cacciaglia and C. Falcon and J.L. Molinuevo and Juan D. Gispert} } @conference {cGiro-i-Nieto, title = {One Perceptron to Rule Them All: Language, Vision, Audio and Speech (tutorial)}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR) 2020}, year = {2020}, month = {06/2020}, publisher = {ACM}, organization = {ACM}, address = {Dublin, Ireland}, abstract = {

Deep neural networks have boosted the convergence of multimedia data analytics in a unified framework shared by practitioners in natural language, vision and speech. Image captioning, lip reading or video sonorization are some of the first applications of a new and exciting field of research exploiting the generalization properties of deep neural representation. This tutorial will firstly review the basic neural architectures to encode and decode vision, text and audio, and later review those models that have successfully translated information across modalities.

The tutorial materials are organized in parts: II. Neural Encoders \& Decoders; III. Language \& Vision; IV. Audio \& Vision; V. Speech \& Vision.
}, keywords = {deep learning; multimodal; cross-modal; joint embeddings}, doi = {https://doi.org/10.1145/3372278.3390740}, author = {Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xPunti20, title = {PixInPix: Hidding Pixels in Pixels}, year = {2020}, abstract = {

PixInPix is a steganography system for hiding images within other images. The designed system is able to create, from a cover image and a message, a new steganographic image. This new stego-image looks as similar as possible to the cover but has the message hidden in it. Our approach adopts the U-net architecture and combines two reconstruction losses to provide a simple yet effective approach, tested on low-resolution images from MNIST, CIFAR and ImageNet.
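The combination of the two reconstruction losses can be sketched as follows (an assumed form, not the thesis code; the weights alpha and beta are hypothetical):

import torch.nn.functional as F

def pixinpix_loss(stego, cover, decoded, message, alpha=1.0, beta=1.0):
    cover_loss = F.mse_loss(stego, cover)        # imperceptibility: stego looks like cover
    message_loss = F.mse_loss(decoded, message)  # recoverability: message survives hiding
    return alpha * cover_loss + beta * message_loss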

}, author = {Punt{\'\i}, Cristina}, editor = {McGuinness, Kevin and Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} } @article {aCasamitjanac, title = {Projection to Latent Spaces disentangles pathological effects on brain morphology in the asymptomatic phase of Alzheimer{\textquoteright}s disease}, journal = {Frontiers in Neurology, section Applied Neuroimaging}, volume = {11}, year = {2020}, month = {07/2020}, chapter = {648}, abstract = {

Alzheimer{\textquoteright}s disease (AD) continuum is defined as a cascade of several neuropathological processes that can be measured using biomarkers such as cerebrospinal fluid (CSF) levels of Aβ, p-tau and t-tau. In parallel, brain anatomy can be characterized through imaging techniques such as magnetic resonance imaging (MRI). In this work, we relate both sets of measurements seeking associations between biomarkers and brain structure that can be indicative of AD progression. The goal is to uncover underlying multivariate effects of AD pathology on regional brain morphological information. For this purpose, we use the projection to latent structures (PLS) method. Using PLS, we find a low dimensional latent space that best describes the covariance between both sets of measurements on the same subjects. Possible confounder effects (age and sex) on brain morphology are included in the model and regressed out using an orthogonal PLS model. We look for statistically significant correlations between brain morphology and CSF biomarkers that explain part of the volumetric variance at each region-of-interest (ROI). Furthermore, we use a clustering technique to discover a small set of CSF-related patterns describing the AD continuum. We apply this technique to the study of subjects in the whole AD continuum from the preclinical asymptomatic stages all through to the symptomatic groups. Subsequent analyses involve splitting the course of the disease into diagnostic categories: cognitively unimpaired subjects (CU), mild cognitive impaired subjects (MCI) and dementia subjects (AD-dementia) where all symptoms are due to AD.
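The two-step logic, regressing out confounders and then fitting a PLS model between biomarkers and regional volumes, can be sketched with scikit-learn (a conceptual analogue only: the paper uses an orthogonal PLS variant, and all data and names here are synthetic placeholders):

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))    # CSF biomarkers per subject
Y = rng.normal(size=(100, 20))   # ROI morphology measurements
C = rng.normal(size=(100, 2))    # confounders (age, sex)

Y_resid = Y - LinearRegression().fit(C, Y).predict(C)  # regress out confounders
pls = PLSRegression(n_components=2).fit(X, Y_resid)
X_scores, Y_scores = pls.transform(X, Y_resid)         # shared latent space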

}, keywords = {Alzheimer{\textquoteright}s disease, brain morphology, CSF biomarkers, Latent model, PLS, preclinical AD}, author = {Adri{\`a} Casamitjana and Paula Petrone and Jose Luis Molinuevo and Juan D. Gispert and Ver{\'o}nica Vilaplana} } @conference {cPardas20, title = {Refinement network for unsupervised on the scene foreground segmentation}, booktitle = {EUSIPCO European Signal Processing Conference}, year = {2020}, month = {08/2020}, publisher = {European Association for Signal Processing (EURASIP)}, organization = {European Association for Signal Processing (EURASIP)}, abstract = {

In this paper we present a network for foreground segmentation based on background subtraction which does not require scene-specific training. The network is built as a refinement step on top of classic state-of-the-art background subtraction systems. In this way, the system combines the possibility to define application-oriented specifications, as background subtraction systems do, with the highly accurate object segmentation abilities of deep learning systems. The refinement system is based on a semantic segmentation network. The network is trained on a common database and is not fine-tuned for the specific scenes, unlike existing solutions for foreground segmentation based on CNNs. Experiments on available databases show top results among unsupervised methods.

}, url = {https://www.eurasip.org/Proceedings/Eusipco/Eusipco2020/pdfs/0000705.pdf}, author = {M. Pard{\`a}s and G. Canet} } @mastersthesis {xManas, title = {Self-Supervised Visual Representation Learning for Remote Sensing}, year = {2020}, abstract = {

With the creation of large-scale annotated datasets such as the ImageNet, fully-supervised machine learning methods have become the standard for solving computer vision tasks. These methods require large amounts of labeled data, which are usually obtained with crowdsourcing tools or social media tags. However, these approaches do not scale for specialized domains, such as medical or satellite imaging, where annotations must be provided by experts at a prohibitive cost. Recently, self-supervised learning has emerged as an alternative for obtaining transferable visual representations from unlabeled data. Models based on these representations match the performance of fully-supervised models while only requiring a small fraction of the annotations. In this work, we aim to explore the application of self-supervised learning methods in the remote sensing domain. We propose a contrastive approach for learning visual representations by exploiting the multi-spectral information of satellite images. These representations serve as a good starting point for a variety of downstream tasks that involve remote sensing imagery, accelerating convergence with fewer labeled examples.
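A common instance of such a contrastive objective is the InfoNCE (NT-Xent) loss over two views of the same image; in remote sensing, the views could be, for example, different spectral-band subsets (an assumption here, not necessarily the thesis' augmentation scheme):

import torch
import torch.nn.functional as F

def info_nce(z1, z2, temperature=0.1):
    # z1, z2: embeddings of two views of the same batch of images (BxD).
    # Positives sit on the diagonal; other images in the batch are negatives.
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / temperature
    labels = torch.arange(z1.size(0))
    return F.cross_entropy(logits, labels)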

Best thesis award 2020 (tied with four other works)

}, author = {Ma{\~n}as,Oscar}, editor = {Rodr{\'\i}guez, Pau and Xavier Gir{\'o}-i-Nieto} } @conference {cPuig-Sitjes20, title = {Strategy for the real-time detection of thermal events on the plasma facing components of Wendelstein 7-X}, booktitle = {31st Symposium on Fusion Technology (SOFT2020)}, year = {2020}, month = {09/2020}, address = {Dubrovnik, Croatia}, abstract = {

Wendelstein 7-X (W7-X), the most advanced fusion experiment in the stellarator line, aims at demonstrating the feasibility of the stellarator concept as a future fusion power plant. It is planned to restart operation by the end of 2021 with a high heat flux divertor and water-cooled plasma facing components (PFCs) to demonstrate steady-state operation. With plasma energy limits starting at 1 GJ and gradually increasing to 18 GJ over several experimental campaigns, the PFCs have to be protected from overheating. For that, a fully autonomous system is required in order to prevent damage to the plasma facing components due to thermal events.
During the last experimental campaign, when W7-X was equipped with inertially cooled test divertor units, extensive experience was gained with the preliminary design of the thermal event detection system. By then, the system was not yet real-time capable and not fully automated, requiring manual supervision between discharges. This experience, however, allowed us to prove the validity of some design concepts and to define the new strategy towards the protection of the machine in steady-state operation, when the system will be connected to the Interlock System and the feedback control.
In this work, the design of the real-time thermal event detection system for W7-X steady-state operation is presented. The system is based on thermography and video diagnostics to monitor the divertor units, the baffles, and the wall heat-shields and panels. It will be implemented on a real-time system and integrated into CoDaC{\textquoteright}s safety infrastructure. The system relies on computer vision and machine learning techniques to perform a spatio-temporal analysis to detect and classify thermal events and to perform a risk evaluation. The results and the main conclusions drawn from the analysis of the data from the past campaign are reported.

}, url = {http://hdl.handle.net/21.11116/0000-0007-7FF5-7}, author = {Puig-Sitjes, A. and Jakubowski, M. and Fellinger, J. and Drewelow, P. and Gao, Y. and Niemann, H. and Sunn-Pedersen, T. and K{\"o}nig, R. and Naujoks, D. and Winter, A. and Laqua, H. and Dumke, S. and Moncada, V. and Belafdil, C. and Mitteau, R. and Aumeunier, M.-H. and Pisano, F. and Aymerich, E. and Cannas, B. and Kocsis, G. and Szepesi, T. and Cseh, G. and Szabolics, T. and Casas, J. and Morros, J.R. and Salembier, P. and Clemente, R. and Cobos, M. and I. Caminal and Palacios Corral, A. and Moreno Punzano, A. and Quiceno Lopera, S.} } @article {aSalgueiro, title = {Super-Resolution of Sentinel-2 Imagery Using Generative Adversarial Networks}, journal = {Remote Sensing}, volume = {12}, year = {2020}, month = {01/2020}, abstract = {

Sentinel-2 satellites provide multi-spectral optical remote sensing images with four bands at 10 m of spatial resolution. These images, due to the open data distribution policy, are becoming an important resource for several applications. However, for small scale studies, the spatial detail of these images might not be sufficient. On the other hand, WorldView commercial satellites offer multi-spectral images with a very high spatial resolution, typically less than 2 m, but their use can be impractical for large areas or multi-temporal analysis due to their high cost. To exploit the free availability of Sentinel imagery, it is worth considering deep learning techniques for single-image super-resolution tasks, allowing the spatial enhancement of low-resolution (LR) images by recovering high-frequency details to produce high-resolution (HR) super-resolved images. In this work, we implement and train a model based on the Enhanced Super-Resolution Generative Adversarial Network (ESRGAN) with pairs of WorldView-Sentinel images to generate a super-resolved multispectral Sentinel-2 output with a scaling factor of 5. Our model, named RS-ESRGAN, removes the upsampling layers of the network to make it feasible to train with co-registered remote sensing images. Results obtained outperform state-of-the-art models using standard metrics like PSNR, SSIM, ERGAS, SAM and CC. Moreover, qualitative visual analysis shows spatial improvements as well as the preservation of the spectral information, allowing the super-resolved Sentinel-2 imagery to be used in studies requiring very high spatial resolution.
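Of the metrics reported, PSNR has the simplest closed form; the standard definition (independent of the paper's implementation) is:

import numpy as np

def psnr(reference, estimate, max_value=1.0):
    # PSNR = 10 * log10(MAX^2 / MSE), in decibels.
    mse = np.mean((np.asarray(reference, dtype=np.float64) -
                   np.asarray(estimate, dtype=np.float64)) ** 2)
    return float("inf") if mse == 0 else 10.0 * np.log10(max_value ** 2 / mse)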

}, author = {Luis Salgueiro and Javier Marcello and Ver{\'o}nica Vilaplana} } @conference {cBernal, title = {Transcription-Enriched Joint Embeddings for Spoken Descriptions of Images and Videos}, booktitle = {CVPR 2020 Workshop on Egocentric Perception, Interaction and Computing}, year = {2020}, month = {06/2020}, publisher = {arXiv}, organization = {arXiv}, address = {Seattle, WA, USA}, abstract = {

In this work, we propose an effective approach for training unique embedding representations by combining three simultaneous modalities: image and spoken and textual narratives. The proposed methodology departs from a baseline system that spawns an embedding space trained with only spoken narratives and image cues. Our experiments on the EPIC-Kitchen and Places Audio Caption datasets show that introducing the human-generated textual transcriptions of the spoken narratives helps the training procedure, yielding better embedding representations. The triad of speech, image and words allows for a better estimate of the point embedding and shows an improvement in performance on tasks like image and speech retrieval, even when the third modality, text, is not present in the task.

}, author = {Oriol, Benet and Luque, J. and Diego, Ferran and Xavier Gir{\'o}-i-Nieto} } @conference {cCombaliad, title = {Uncertainty Estimation in Deep Neural Networks for Dermoscopic Image Classification}, booktitle = {CVPR 2020, ISIC Skin Image Analysis Workshop}, year = {2020}, month = {2020}, abstract = {

The high performance of machine learning algorithms for the task of skin lesion classification has been shown over the past few years. However, real-world implementations are still scarce. One of the reasons could be that most methods do not quantify the uncertainty in the predictions and are not able to detect data that is anomalous or significantly different from that used in training, which may lead to a lack of confidence in the automated diagnosis or errors in the interpretation of results.

In this work, we explore the use of uncertainty estimation techniques and metrics for deep neural networks based on Monte-Carlo sampling and apply them to the problem of skin lesion classification on data from ISIC Challenges 2018 and 2019.

Our results show that uncertainty metrics can be successfully used to detect difficult and out-of-distribution samples.
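One common instance of the Monte-Carlo sampling techniques referred to above is MC dropout; a minimal sketch (not necessarily the authors' exact setup) keeps dropout active at test time and scores uncertainty with the predictive entropy:

import torch

def mc_dropout_predict(model, x, n_samples=20):
    model.train()  # keeps dropout layers stochastic at inference
    with torch.no_grad():
        probs = torch.stack([torch.softmax(model(x), dim=-1)
                             for _ in range(n_samples)])
    mean = probs.mean(dim=0)
    entropy = -(mean * mean.clamp_min(1e-12).log()).sum(dim=-1)
    return mean, entropy  # averaged prediction and per-sample uncertainty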

}, author = {Marc Combalia and Ferran Hueto and Susana Puig and Josep Malvehy and Ver{\'o}nica Vilaplana} } @conference {cMoliner, title = {Weakly Supervised Semantic Segmentation for Remote Sensing Hyperspectral Imaging}, booktitle = {International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2020)}, year = {2020}, month = {05/2020}, abstract = {

This paper studies the problem of training a semantic segmentation neural network with weak annotations, in order to be applied to aerial vegetation images from Teide National Park. It proposes a Deep Seeded Region Growing system, which consists of training a semantic segmentation network from a set of seeds generated by a Support Vector Machine. A region growing algorithm module is applied to the seeds to progressively increase the pixel-level supervision. The proposed method performs better than an SVM, which is one of the most popular segmentation tools in remote sensing image applications.

}, author = {Eloi Moliner and Luis Salgueiro and Ver{\'o}nica Vilaplana} } @conference {cGorrizc, title = {Assessing Knee OA Severity with CNN attention-based end-to-end architectures}, booktitle = {International Conference on Medical Imaging with Deep Learning (MIDL) 2019}, year = {2019}, month = {02/2019}, publisher = {JMLR}, organization = {JMLR}, address = {London, United Kingdom}, abstract = {

This work proposes a novel end-to-end convolutional neural network (CNN) architecture to automatically quantify the severity of knee osteoarthritis (OA) using X-Ray images, which incorporates trainable attention modules acting as unsupervised fine-grained detectors of the region of interest (ROI). The proposed attention modules can be applied at different levels and scales across any CNN pipeline, helping the network to learn relevant attention patterns over the most informative parts of the image at different resolutions. We test the proposed attention mechanism on existing state-of-the-art CNN architectures as our base models, achieving promising results on the benchmark knee OA datasets from the osteoarthritis initiative (OAI) and multicenter osteoarthritis study (MOST). All the code from our experiments will be publicly available in the github repository: https://github.com/marc-gorriz/KneeOA-CNNAttention
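A trainable attention module of the kind described can be sketched in a few lines (an assumed form for illustration; the published architecture may differ):

import torch.nn as nn

class AttentionModule(nn.Module):
    # A 1x1 convolution produces a spatial attention map that reweights
    # the feature maps at a given level and scale of the CNN pipeline.
    def __init__(self, channels):
        super().__init__()
        self.gate = nn.Sequential(nn.Conv2d(channels, 1, kernel_size=1),
                                  nn.Sigmoid())

    def forward(self, features):
        return features * self.gate(features)  # Bx1xHxW map applied channel-wise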

}, url = {http://proceedings.mlr.press/v102/gorriz19a.html}, author = {G{\'o}rriz, Marc and Antony, Joseph and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @conference {cLopez-Palmab, title = {Audience measurement using a top-view camera and oriented trajectories}, booktitle = {IEEE IECON 2019}, year = {2019}, month = {10/2019}, address = {Lisbon, Portugal}, abstract = {

One of the points of maximum interest in the valuation of areas for commercial advertising is the probability with which that publicity will be seen. This paper presents a method based on top-view camera measurement, where the probability of viewing is estimated from the trajectories and head movements of the individuals passing through the area of interest. Using a camera with a depth sensor, the heads of the people in the range of view can be detected and modeled. This allows determining the orientation of the head, which is used to estimate the direction of vision. A tracking-by-detection algorithm computes the trajectory of each user. The attention at each advertising point is estimated based on the trajectories and head orientations of the individuals in the area of interest.

}, keywords = {attention time, depth sensor, top-view camera, tracking}, author = {Manuel L{\'o}pez-Palma and Morros, J.R. and Montserrat Corbal{\'a}n and Javier Gago} } @conference {cCombaliac, title = {BCN20000: Dermoscopic Lesions in the Wild}, booktitle = {International Skin Imaging Collaboration (ISIC) Challenge on Dermoscopic Skin Lesion Analysis 2019}, year = {2019}, month = {10/2019}, abstract = {

This article summarizes the BCN20000 dataset, composed of 19424 dermoscopic images of skin lesions captured from 2010 to 2016 in the facilities of the Hospital Cl{\'\i}nic in Barcelona. With this dataset, we aim to study the problem of unconstrained classification of dermoscopic images of skin cancer, including lesions found in hard-to-diagnose locations (nails and mucosa), large lesions which do not fit in the aperture of the dermoscopy device, and hypo-pigmented lesions. The BCN20000 will be provided to the participants of the ISIC Challenge 2019 \cite{ISIC2019}, where they will be asked to train algorithms to classify dermoscopic images of skin cancer automatically.

}, author = {Marc Combalia and Noel C. F. Codella and Veronica Rotemberg and Brian Helba and Ver{\'o}nica Vilaplana and Ofer Reiter and Cristina Carrera and Alicia Barreiro and Allan C. Halpern and Susana Puig and Josep Malvehy} } @article {aWang19, title = {Benchmark on Automatic 6-month-old Infant Brain Segmentation Algorithms: The iSeg-2017 Challenge}, journal = {IEEE Transactions on Medical Imaging}, year = {2019}, month = {2019/2/27}, abstract = {

Accurate segmentation of infant brain magnetic resonance (MR) images into white matter (WM), gray matter (GM), and cerebrospinal fluid (CSF) is an indispensable foundation for the early study of brain growth patterns and morphological changes in neurodevelopmental disorders. Nevertheless, in the isointense phase (approximately 6-9 months of age), due to the inherent myelination and maturation process, WM and GM exhibit similar levels of intensity in both T1-weighted (T1w) and T2-weighted (T2w) MR images, making tissue segmentation very challenging. Although many efforts have been devoted to brain segmentation, only a few studies have focused on the segmentation of 6-month infant brain images. With the idea of boosting methodological development in the community, the iSeg-2017 challenge (http://iseg2017.web.unc.edu) provides a set of 6-month infant subjects with manual labels for training and testing the participating methods. Among the 21 automatic segmentation methods participating in iSeg-2017, we review the 8 top-ranked teams, in terms of Dice ratio, modified Hausdorff distance and average surface distance, and introduce their pipelines, implementations, as well as source codes. We further discuss limitations and possible future directions. We hope the dataset in iSeg-2017 and this review article can provide insights into methodological development for the community.
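The Dice ratio used to rank the teams measures the overlap between a predicted mask A and the manual label B as 2|A intersect B| / (|A| + |B|); a generic implementation:

import numpy as np

def dice(a, b):
    a, b = np.asarray(a, dtype=bool), np.asarray(b, dtype=bool)
    denom = a.sum() + b.sum()
    return 2.0 * np.logical_and(a, b).sum() / denom if denom else 1.0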

}, doi = {10.1109/TMI.2019.2901712}, author = {Li Wang and Dong Nie and Guannan Li and Elodie Puybareau and Jose Dolz and Qian Zhang and Fan Wang and Jing Xia and Zhengwang Wu and Jiawei Chen and Kim-HanThung and Toan Duc Bui and Jitae Shin and Guodong Zeng and Guoyan Zheng and Vladimir S. Fonov and Andrew Doyle and Yongchao Xu and Pim Moeskops and Josien Pluim and Christian Desrosiers and Ismail Ben Ayed and Gerard Sanroma and Oualid Benkarim and Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and Weili Lin and Gang Li and Dinggang Shen} } @conference {cBellverb, title = {Budget-aware Semi-Supervised Semantic and Instance Segmentation}, booktitle = {CVPR 2019 DeepVision Workshop}, year = {2019}, month = {05/2019}, publisher = {OpenCVF}, organization = {OpenCVF}, address = {Long Beach, CA, USA}, abstract = {

Methods that move towards less supervised scenarios are key for image segmentation, as dense labels demand significant human intervention. Generally, the annotation burden is mitigated by labeling datasets with weaker forms of supervision, e.g. image-level labels or bounding boxes. Another option is semi-supervised settings, which commonly leverage a few strong annotations and a huge amount of unlabeled/weakly-labeled data. In this paper, we revisit semi-supervised segmentation schemes and narrow down significantly the annotation budget (in terms of total labeling time of the training set) compared to previous approaches. With a very simple pipeline, we demonstrate that at low annotation budgets, semi-supervised methods outperform by a wide margin weakly-supervised ones for both semantic and instance segmentation. Our approach also outperforms previous semi-supervised works at a much reduced labeling cost. We present results for the Pascal VOC benchmark and unify weakly and semi-supervised approaches by considering the total annotation budget, thus allowing a fairer comparison between methods.

Winners of the best paper award at the $\#$CVPR2019 DeepVision workshop

Budget-aware Semi-Supervised Semantic and Instance Segmentation from Universitat Polit{\`e}cnica de Catalunya
}, url = {https://arxiv.org/abs/1905.05880}, author = {M{\'\i}riam Bellver and Amaia Salvador and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xDomenech19, title = {Clasificaci{\'o}n de im{\'a}genes dermatosc{\'o}picas utilizando Redes Neuronales Convolucionales e informaci{\'o}n de metadatos}, year = {2019}, abstract = {

Convolutional Neural Networks (CNNs) are a technology that is evolving very quickly. CNNs are used in image recognition to improve computer vision systems in image classification, object detection and segmentation tasks. Thanks to these capabilities, CNNs can be applied in the field of medicine to automate diagnostic techniques and facilitate the work of doctors.

The purpose of this project is to classify images of different skin lesions in order to detect possible diseases such as skin cancer (melanoma) through CNNs and metadata information. The images used in this project have been provided by the Dermatology Department of the Hospital Cl{\'\i}nic de Barcelona.

}, author = {Teresa Domenech}, editor = {Ver{\'o}nica Vilaplana} } @mastersthesis {xTarres19, title = {Clasificaci{\'o}n de lesiones de piel con un ensemble de redes neuronales residuales}, year = {2019}, abstract = {

Convolutional Neural Networks have gained popularity in the recent years due to their performance regarding image analysis, both in classification and segmentation. Especially in the medical field, it is increasingly common to use automatic techniques to help specialists with the diagnosis.

In this thesis, the problem of skin lesion classification is studied. The study is based on the ISIC Challenges, given the collaboration with Hospital Cl{\'\i}nic de Barcelona, and we helped in the development of the database for the ISIC Challenge 2019.

One of the key points of the development is obtaining a model that manages to classify a provided database with accuracy. To do so, we study residual neural networks and an ensemble of them to further improve the results.

The purpose of this project, therefore, is the study, analysis and evaluation of the variants and modifications of residual neural networks that adapt them to our problem, using an ensemble of them. In the process, the neural network will have to tackle the problem of class imbalance.

}, author = {Laia Tarr{\'e}s}, editor = {Ver{\'o}nica Vilaplana and Marc Combalia} } @conference {cIsart, title = {CNN-based bacilli detection in sputum samples for tuberculosis diagnosis}, booktitle = {International Symposium on Biomedical Imaging (ISBI 2019)}, year = {2019}, month = {04/2019}, abstract = {

Tuberculosis (TB) is one of the infectious diseases that cause the most deaths in low- and middle-income countries. A low-cost method to diagnose TB consists of analyzing sputum smear samples through microscope observation. Manual identification and counting of bacilli is a very time-consuming task, and the sensitivity of the diagnosis depends on the availability of skilled technicians. We propose a computer vision technique based on a convolutional neural network (CNN) to automatically segment and count bacilli in sputum samples and predict the infection level.

}, author = {Antoni Isart and Mateu Espasa and Ver{\'o}nica Vilaplana and Elisa Sayrol} } @conference {cSalgueiro, title = {Comparative study of upsampling methods for super-resolution in remote sensing}, booktitle = {International Conference on Machine Vision}, year = {2019}, month = {11/2019}, abstract = {

Many remote sensing applications require high spatial resolution images, but the elevated cost of these images makes some studies unfeasible. Single-image super-resolution algorithms can improve the spatial resolution of a low-resolution image by recovering feature details learned from pairs of low- and high-resolution images. In this work, several configurations of ESRGAN, a state-of-the-art algorithm for image super-resolution, are tested. We compare several scenarios, with different modes of upsampling and channels involved. The best results are obtained by training a model with RGB-IR channels and using progressive upsampling.

}, keywords = {deep learning, Remote sensing, Super-resolution, WorldView-2}, author = {Luis Salgueiro and Javier Marcello and Ver{\'o}nica Vilaplana} } @phdthesis {dSalvador, title = {Computer Vision beyond the visible: Image understanding through language}, volume = {Phd}, year = {2019}, school = {Universitat Politecnica de Catalunya}, address = {Barcelona}, abstract = {

In the past decade, deep neural networks have revolutionized computer vision. High-performing deep neural architectures trained for visual recognition tasks have pushed the field towards methods relying on learned image representations instead of hand-crafted ones, seeking to design end-to-end learning methods that solve challenging tasks, ranging from long-standing ones such as image classification to newly emerging tasks like image captioning.

As this thesis is framed in the context of the rapid evolution of computer vision, we present contributions that are aligned with three major changes in paradigm that the field has recently experienced, namely 1) the power of re-utilizing deep features from pre-trained neural networks for different tasks, 2) the advantage of formulating problems with end-to-end solutions given enough training data, and 3) the growing interest of describing visual data with natural language rather than pre-defined categorical label spaces, which can in turn enable visual understanding beyond scene recognition.

The first part of the thesis is dedicated to the problem of visual instance search, where we particularly focus on obtaining meaningful and discriminative image representations which allow efficient and effective retrieval of similar images given a visual query. Contributions in this part of the thesis involve the construction of sparse Bag-of-Words image representations from convolutional features from a pre-trained image classification neural network, and an analysis of the advantages of fine-tuning a pre-trained object detection network using query images as training data.

The second part of the thesis presents contributions to the problem of image-to-set prediction, understood as the task of predicting a variable-sized collection of unordered elements for an input image. We conduct a thorough analysis of current methods for multi-label image classification, which are able to solve the task in an end-to-end manner by simultaneously estimating both the label distribution and the set cardinality. Further, we extend the analysis of set prediction methods to semantic instance segmentation, and present an end-to-end recurrent model that is able to predict sets of objects (binary masks and categorical labels) in a sequential manner.

Finally, the third part of the dissertation takes insights learned in the previous two parts in order to present deep learning solutions to connect images with natural language in the context of cooking recipes and food images. First, we propose a retrieval-based solution in which the written recipe and the image are encoded into compact representations that allow the retrieval of one given the other. Second, as an alternative to the retrieval approach, we propose a generative model to predict recipes directly from food images, which first predicts ingredients as sets and subsequently generates the rest of the recipe one word at a time by conditioning both on the image and the predicted ingredients.

}, url = {https://www.tdx.cat/handle/10803/667162}, author = {Amaia Salvador}, editor = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @article {aPujol-Miro19, title = {Correspondence matching in unorganized 3D point clouds using Convolutional Neural Networks}, journal = {Image and Vision Computing}, volume = {83-84}, year = {2019}, month = {04/2019}, chapter = {51-60}, abstract = {

This document presents a novel method based on Convolutional Neural Networks (CNN) to obtain correspondence matchings between sets of keypoints of several unorganized 3D point cloud captures, independently of the sensor used. The proposed technique extends a state-of-the-art method for correspondence matching in standard 2D images to sets of unorganized 3D point clouds. The strategy consists of projecting the 3D neighborhood of the keypoint onto an RGBD patch, and classifying patch pairs using CNNs. The objective evaluation of the proposed CNN-based 3D point matching outperforms existing 3D feature descriptors, especially when intensity or color data is available.

}, doi = {https://doi.org/10.1016/j.imavis.2019.02.013}, author = {A. Pujol-Mir{\'o} and Casas, J. and Ruiz-Hidalgo, J.} } @conference {cDuarteb, title = {Cross-modal Neural Sign Language Translation}, booktitle = {Proceedings of the 27th ACM International Conference on Multimedia - Doctoral Symposium}, year = {2019}, month = {10/2019}, publisher = {ACM}, organization = {ACM}, address = {Nice, France}, abstract = {

Sign Language is the primary means of communication for the majority of the Deaf and hard-of-hearing communities. Current computational approaches in this general research area have focused specifically on sign language recognition and the translation of sign language to text. However, the reverse problem of translating from spoken to sign language has so far not been widely explored.

The goal of this doctoral research is to explore sign language translation in this generalized setting, i.e. translating from spoken language to sign language and vice versa. Towards that end, we propose a concrete methodology for tackling the problem of speech to sign language translation and introduce How2Sign, the first public, continuous American Sign Language dataset that enables such research. With a parallel corpus of almost 60 hours of sign language videos (collected with both RGB and depth sensor data) and the corresponding speech transcripts for over 2500 instructional videos, How2Sign is a public dataset of unprecedented scale that can be used to advance not only sign language translation, but also a wide range of sign language understanding tasks.

}, doi = {10.1145/3343031.3352587}, url = {https://dl.acm.org/citation.cfm?id=3352587}, author = {Amanda Duarte}, editor = {Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xBalibrea19, title = {Deep learning for semantic segmentation of airplane hyperspectral imaging}, year = {2019}, abstract = {

Given their success, both qualitative and quantitative, Deep Neural Networks have been used to approach classification and segmentation problems for images, especially during the last few years, when it has become possible to design computers with sufficient capacity to run quick and efficient experiments.

In this work, we will study the use of two Convolutional Neural Networks (CNNs) to segment the ground of a land section of Maspalomas{\textquoteright} Park using an image taken from an airplane.

The comparison will be made in terms of computational cost, complexity and the results obtained while testing different algorithms, loss functions and optimizers, and while tuning other parameters. The results will also be compared with past work done on the same dataset with another methodology (SVM).

}, author = {Mar Balibrea}, editor = {Luis Salgueiro and Ver{\'o}nica Vilaplana} } @article {x19, title = {Deep Learning Representations for All (a.k.a. the AI hype)}, year = {2019}, abstract = {

Deep neural networks have revolutionized the data analytics scene by improving results in several and diverse benchmarks with the same recipe: learning feature representations from data. These achievements have raised interest across multiple scientific fields, especially in those where large amounts of data and computation are available. This change of paradigm in data analytics has several ethical and economic implications that are driving large investments, political debates and resounding press coverage under the generic label of artificial intelligence (AI). This talk will present the fundamentals of deep learning through the classic example of image classification, and point at how the same principle has been adopted for several other tasks. Finally, some of the forthcoming potentials and risks of AI will be pointed out.

} } @article {aLin19, title = {Depth Estimation and Semantic Segmentation from a Single RGB Image Using a Hybrid Convolutional Neural Network}, journal = {Sensors}, volume = {19}, year = {2019}, month = {04/2019}, chapter = {1795}, abstract = {

Semantic segmentation and depth estimation are two important tasks in computer vision, and many methods have been developed to tackle them. Commonly these two tasks are addressed independently, but recently the idea of merging them into a single framework has been studied under the assumption that integrating two highly correlated tasks may allow each to benefit the other and improve estimation accuracy. In this paper, depth estimation and semantic segmentation are jointly addressed from a single RGB input image under a unified convolutional neural network. We analyze two different architectures to evaluate which features are more relevant when shared by the two tasks and which features should be kept separated to achieve a mutual improvement. Likewise, our approaches are evaluated under two different scenarios designed to compare our results against single-task and multi-task methods. Qualitative and quantitative experiments demonstrate that our methodology outperforms the state of the art in single-task approaches, while obtaining competitive results compared with other multi-task methods.
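The joint objective behind such hybrid networks can be sketched as a weighted sum of the two task losses over shared features (an assumed form, not the paper's exact formulation; the weight lam is hypothetical):

import torch.nn.functional as F

def multitask_loss(seg_logits, seg_labels, depth_pred, depth_gt, lam=0.5):
    seg_loss = F.cross_entropy(seg_logits, seg_labels)  # per-pixel classification
    depth_loss = F.l1_loss(depth_pred, depth_gt)        # per-pixel regression
    return seg_loss + lam * depth_loss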

}, keywords = {depth estimation; semantic segmentation; convolutional neural networks; hybrid architecture}, issn = {1424-8220}, doi = {10.3390/s19081795}, url = {https://www.mdpi.com/1424-8220/19/8/1795}, author = {X. Lin and D. Sanchez-Escobedo and Casas, J. and M. Pard{\`a}s} } @conference {cCasamitjana19, title = {Detection of Amyloid Positive Cognitively unimpaired individuals using voxel-based machine learning on structural longitudinal brain MRI}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2019}, month = {07/2019}, author = {Adri{\`a} Casamitjana and P. Petrone and C. Falcon and M. Artigues and G. Operto and R. Cacciaglia and J.L. Molinuevo and Ver{\'o}nica Vilaplana and J.D. Gispert} } @article {aCasamitjana19, title = {Detection of Amyloid-Positive Cognitively Unimpaired Individuals Using Voxel-Based Machine Learning on Structural Longitudinal Brain MRI}, journal = {Alzheimer{\textquoteright}s \& Dementia}, volume = {15}, year = {2019}, month = {07/2019}, chapter = {752}, abstract = {

Magnetic resonance imaging (MRI) has unveiled specific AD alterations at different stages of the AD pathophysiologic continuum that constitute what has been established as the {\textquoteleft}AD signature{\textquoteright}. To what extent MRI can detect amyloid-related cerebral changes from structural MRI in unimpaired individuals is still an open area for exploration.

}, issn = {1552-5260}, doi = {10.1016/j.jalz.2019.06.2796}, author = {Adri{\`a} Casamitjana and P. Petrone and C. Falcon and M. Artigues and G. Operto and R. Cacciaglia and J.L. Molinuevo and Ver{\'o}nica Vilaplana and J.D. Gispert} } @article {aRoisman18, title = {Differential expression of long non-coding RNAs related to proliferation and histological diversity in follicular lymphomas}, journal = {British Journal of Haematology}, volume = {184}, year = {2019}, month = {Feb 2019}, pages = {373-383}, issn = {ISSN:1365-2141}, doi = {DOI: 10.1111/bjh.15656}, author = {A. Roisman and A. Navarro and G. Clot and G. Castellano and B. Gonzalez-Farre and P. P{\'e}rez-Galan and A. Esteve and M. Dabad and S. Heath and M. Gut and Bosio, M. and P. Bellot and Salembier, P. and Albert Oliveras and I. Slavusky and L. Magnano and H. Horn and A. Rosenwald and G. Ott and M. Aymerich and A. L{\'o}pez-Guillermo and P. Jares and J.I. Mart{\'\i}n-Subero and E. Campo and L. Hern{\'a}ndez} } @conference {cCombaliab, title = {Digitally Stained Confocal Microscopy through Deep Learning}, booktitle = {International Conference on Medical Imaging with Deep Learning (MIDL 2019)}, year = {2019}, month = {07/2019}, address = {London}, abstract = {

Specialists have used confocal microscopy in the ex-vivo modality to identify tumors with an overall sensitivity of 96.6\% and specificity of 89.2\%. However, this technology has not yet been established in standard clinical practice because most pathologists lack the knowledge to interpret its output. In this paper we propose a combination of deep learning and computer vision techniques to digitally stain confocal microscopy images into H\&E-like slides, enabling pathologists to interpret these images without specific training. We use a fully convolutional neural network with a multiplicative residual connection to denoise the confocal microscopy images, and then stain them using a Cycle Consistency Generative Adversarial Network.

}, author = {Marc Combalia and Javiera P{\'e}rez-Anker and Adriana Garc{\'\i}a-Herrera and Ll{\'u}cia Alos and Ver{\'o}nica Vilaplana and Ferran Marques and Susana Puig and Josep Malvehy} } @mastersthesis {xComas19, title = {Exploring Methods for Enhancing Linear Prediction of Video Sequences}, year = {2019}, abstract = {

Video prediction has long received attention within the field of computer vision, and it has gained importance during the last decade with the popularization of deep neural networks and their applications to computer vision. In this thesis, the main focus is to linearize the dynamics of time sequences by exploiting the spatial context that video offers, with the final goal of obtaining better predictions. First, we provide the theoretical basis for the dynamics. Then, we present several modifications of an existing deterministic predictor network called Dynamical Atoms-based Network (DYAN) [1], which models time sequences as the output of Linear Time-Invariant (LTI) systems using system identification and dynamics foundations. The solutions present different levels of success, and in some cases they beat the State Of The Art (SOTA) for at least one dataset in the SSIM, MSE and MMF metrics. We also present two novel convolutional autoencoder architectures (LODAEs) for low-order dynamics manifold embedding, strongly based on deep neural networks, with the primary aim of providing a generalized solution for mapping video sequences into a new manifold, to adapt them to the pipeline of predictors such as DYAN, based on system identification. The results for the LODAEs are promising, as they seem to achieve their goal for a very simple synthetic dataset by lowering the order of the latent space sequences and providing good reconstructions and, in some cases, predictions.
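
As a toy illustration of the LTI view of prediction (this is generic least-squares autoregression, not DYAN itself), one can fit a linear model to a 1-D sequence and predict its next sample:

import numpy as np

def fit_ar(x, order=2):
    # Least-squares fit of x[t] = a1*x[t-1] + ... + ap*x[t-p].
    X = np.stack([x[i:len(x) - order + i] for i in range(order)], axis=1)
    a, *_ = np.linalg.lstsq(X[:, ::-1], x[order:], rcond=None)
    return a  # [a1, ..., ap]

x = np.sin(0.3 * np.arange(50))       # a simple signal with linear dynamics
a = fit_ar(x)
next_sample = a @ x[-2:][::-1]        # one-step-ahead linear prediction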

}, author = {Comas, Armand}, editor = {Camps, Octavia and Xavier Gir{\'o}-i-Nieto} } @article {aGene-Molab, title = {Fruit Detection in an Apple Orchard Using a Mobile Terrestrial Laser Scanner}, journal = {Biosystems Engineering}, volume = {187}, year = {2019}, month = {09/2019}, chapter = {171}, abstract = {

The development of reliable fruit detection and localization systems provides an opportunity to improve crop value and management by limiting fruit spoilage and optimizing harvesting practices. Most proposed systems for fruit detection are based on RGB cameras and are thus affected by intrinsic constraints, such as variable lighting conditions. This work presents a new technique that uses a mobile terrestrial laser scanner (MTLS) to detect and localise Fuji apples. An experimental test focused on Fuji apple trees (Malus domestica Borkh. cv. Fuji) was carried out. A 3D point cloud of the scene was generated using an MTLS composed of a Velodyne VLP-16 LiDAR sensor synchronized with an RTK-GNSS satellite navigation receiver. A reflectance analysis of tree elements was performed, obtaining mean apparent reflectance values of 28.9\%, 29.1\%, and 44.3\% for leaves, branches and trunks, and apples, respectively. These results suggest that the apparent reflectance parameter (at 905 nm wavelength) can be useful for detecting apples. For that purpose, a four-step fruit detection algorithm was developed. By applying this algorithm, a localization success of 87.5\%, an identification success of 82.4\%, and an F1-score of 0.858 were obtained with respect to the total number of fruits. These detection rates are similar to those obtained by RGB-based systems, but with the additional advantage of providing direct 3D fruit location information, which is not affected by sunlight variations. From the experimental results, it can be concluded that LiDAR-based technology and, particularly, its reflectance information, has potential for remote apple detection and 3D location.
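
As a schematic of how the reflectance gap can be exploited (the threshold and arrays below are assumptions, not the paper's four-step algorithm):

import numpy as np

points = np.random.rand(1000, 3)       # stand-in x, y, z LiDAR returns
reflectance = np.random.rand(1000)     # apparent reflectance per point, in [0, 1]
# Apples reflect about 44% at 905 nm vs about 29% for leaves, branches and
# trunks, so a threshold between the two separates candidate apple points.
apple_candidates = points[reflectance > 0.35]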

}, issn = {1537-5110}, doi = {10.1016/j.biosystemseng.2019.08.017}, url = {https://authors.elsevier.com/c/1Zmc45Tbkk9EHW}, author = {Gen{\'e}-Mola, Jordi and Gregorio, Eduard and Guevara, Javier and Auat Cheein, Fernando and Sanz, Ricardo and Escol{\`a}, Alexandre and Llorens Calveras, Jordi and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R.} } @mastersthesis {xCaros, title = {A Generative Dialogue System for Reminiscence Therapy}, year = {2019}, abstract = {

With people living longer than ever, the number of cases of neurodegenerative diseases such as Alzheimer{\textquoteright}s or cognitive impairment increases steadily. In Spain it affects more than 1.2 million patients, and it is estimated that in 2050 more than 100 million people will be affected. While there are no effective treatments for this terminal disease, therapies such as reminiscence, which stimulate memories of the patient{\textquoteright}s past, are recommended, as they encourage communication and produce mental and emotional benefits for the patient. Currently, reminiscence therapy takes place in hospitals or residences, where the therapists are located. Since people who receive this therapy are old and may have mobility difficulties, we present an AI solution to guide older adults through reminiscence sessions using their laptop or smartphone.

Our solution consists of a generative dialogue system composed of two deep learning architectures to recognize image and text content. An Encoder-Decoder with Attention is trained to generate questions from photos provided by the user; it is composed of a pretrained Convolutional Neural Network to encode the picture and a Long Short-Term Memory to decode the image features and generate the question. The second architecture is a sequence-to-sequence model that provides feedback to engage the user in the conversation.

From the experiments, we find that the best performance is obtained by training the dialogue model with Persona-Dataset and fine-tuning it with the Cornell Movie-Dialogues dataset. Finally, we integrate Telegram as the interface for the user to interact with Elisabot, our trained conversational agent.
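
A minimal sketch of the question-generation branch described above (vocabulary, sizes and weights are placeholders, not the trained Elisabot model): a pretrained CNN encodes the photo and an LSTM decodes the image feature into a question.

import torch
import torch.nn as nn
from torchvision.models import resnet18

class QuestionGenerator(nn.Module):
    def __init__(self, vocab_size=1000, hidden=256):
        super().__init__()
        cnn = resnet18(weights=None)                     # stand-in image encoder
        self.encoder = nn.Sequential(*list(cnn.children())[:-1])
        self.proj = nn.Linear(512, hidden)
        self.embed = nn.Embedding(vocab_size, hidden)
        self.lstm = nn.LSTM(hidden, hidden, batch_first=True)
        self.out = nn.Linear(hidden, vocab_size)

    def forward(self, image, tokens):
        feat = self.proj(self.encoder(image).flatten(1)).unsqueeze(1)
        h, _ = self.lstm(torch.cat([feat, self.embed(tokens)], dim=1))
        return self.out(h)                               # next-token logits

logits = QuestionGenerator()(torch.randn(1, 3, 224, 224), torch.randint(0, 1000, (1, 5)))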

}, author = {Caros, Mariona}, editor = {Radeva, Petia and Xavier Gir{\'o}-i-Nieto} } @conference {cOriol, title = {Hate Speech in Pixels: Detection of Offensive Memes towards Automatic Moderation}, booktitle = {NeurIPS 2019 Workshop on AI for Social Good}, year = {2019}, month = {09/2019}, address = {Vancouver, Canada}, abstract = {

This work addresses the challenge of hate speech detection in Internet memes and, unlike any previous work we are aware of, attempts to use visual information to automatically detect hate speech. Memes are pixel-based multimedia documents that contain photos or illustrations together with phrases which, when combined, usually adopt a funny meaning. However, hate memes are also used to spread hate through social networks, so their automatic detection would help reduce their harmful societal impact. In our experiments, we built a dataset of 5,020 memes to train and evaluate a multi-layer perceptron over the visual and language representations, whether independently or fused. Our results indicate that the model can learn to detect some of the memes, but that the task is far from being solved with this simple architecture. While previous work focuses on linguistic hate speech, our experiments indicate how the visual modality can be much more informative for hate speech detection than the linguistic one in memes.
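
A minimal sketch of that baseline (feature dimensions are assumptions): a multi-layer perceptron over the concatenation of a visual embedding and a language embedding.

import torch
import torch.nn as nn

fusion_mlp = nn.Sequential(
    nn.Linear(2048 + 768, 256),   # e.g. CNN image feature + sentence embedding
    nn.ReLU(),
    nn.Linear(256, 1))            # hate / not-hate logit

img_feat, txt_feat = torch.randn(8, 2048), torch.randn(8, 768)
logit = fusion_mlp(torch.cat([img_feat, txt_feat], dim=1))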

}, author = {Oriol, Benet and Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} } @conference {cRamon, title = {Hyperparameter-Free Losses for Model-Based Monocular Reconstruction}, booktitle = {ICCV 2019 Workshop on Geometry Meets Deep Learning}, year = {2019}, month = {11/2019}, publisher = {IEEE / Computer Vision Foundation}, organization = {IEEE / Computer Vision Foundation}, address = {Seoul, South Corea}, abstract = {

This work proposes novel hyperparameter-free losses for single-view 3D reconstruction with morphable models (3DMM). We dispense with the hyperparameters used in other works by exploiting geometry, so that the shape of the object and the camera pose are jointly optimized in a single-term expression. This simplification reduces the optimization time and its complexity. Moreover, we propose a novel implicit regularization technique based on random virtual projections that does not require additional 2D or 3D annotations. Our experiments suggest that minimizing a shape reprojection error together with the proposed implicit regularization is especially suitable for applications that require precise alignment between geometry and image spaces, such as augmented reality. We evaluate our losses on a large-scale dataset with 3D ground truth and publish our implementations to facilitate reproducibility and public benchmarking in this field.
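
A schematic single-term reprojection loss under a simplified scaled-orthographic camera (an assumption for illustration; the paper's formulation differs in detail): shape and pose enter one expression, with no weighting hyperparameters.

import torch

def reprojection_loss(shape3d, R, t, s, landmarks2d):
    # shape3d: (N, 3) vertices; R: (3, 3) rotation; t: (2,) translation; s: scale.
    projected = s * (shape3d @ R.T)[:, :2] + t      # project vertices to the image plane
    return ((projected - landmarks2d) ** 2).mean()  # the sole optimized term

loss = reprojection_loss(torch.randn(68, 3), torch.eye(3),
                         torch.zeros(2), 1.0, torch.randn(68, 2))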

}, author = {Ramon, Eduard and Ruiz, Guillermo and Batard, Thomas and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xCaselles, title = {Integrating low-level motion cues in deep video saliency}, year = {2019}, abstract = {

This thesis investigates the importance of motion when predicting saliency in videos. Naturally, humans observe both dynamic and static objects. When we are focused on watching a video, we tend to keep our eyes on the objects that are moving in the scene, on items that we quickly recognize, and on those that attract our attention. In this work, different experiments are presented to corroborate this implication. Several approaches are shown that implement an adaptation of the SalBCE neural network using only motion. A simple implementation is proposed for the generation of saliency maps using previously extracted static and dynamic information from the images. The DHF1K dataset has been used to carry out the experiments.

}, keywords = {Motion, Saliency, video}, author = {Caselles, Pol}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xDomingo19, title = {Interpretability of Deep Learning Models}, year = {2019}, abstract = {

In recent years we have seen growing interest in Deep Learning (DL) algorithms for a variety of problems, due to their outstanding performance. This is most palpable in a multitude of fields, where self-learning algorithms are becoming indispensable tools to help professionals solve complex problems.

However, as these models get better, they also tend to be more complex and are sometimes referred to as "Black Boxes". The lack of explanations for the resulting predictions and the inability of humans to understand those decisions is problematic.

In this project, different methods to increase the interpretability of Deep Neural Networks (DNN), such as Convolutional Neural Networks (CNN), are studied. Additionally, we evaluate how these interpretability methods and techniques can be implemented, measured and applied to real-world problems, by creating a Python toolbox.

}, author = {Pau Domingo}, editor = {Ver{\'o}nica Vilaplana} } @conference {cSalvadorf, title = {Inverse Cooking: Recipe Generation from Food Images}, booktitle = {CVPR}, year = {2019}, month = {06/2019}, publisher = {OpenCVF / IEEE}, organization = {OpenCVF / IEEE}, address = {Long Beach, CA, USA}, abstract = {

People enjoy food photography because they appreciate food. Behind each meal there is a story described in a complex recipe and, unfortunately, by simply looking at a food image we do not have access to its preparation process. Therefore, in this paper we introduce an inverse cooking system that recreates cooking recipes given food images. Our system predicts ingredients as sets by means of a novel architecture, modeling their dependencies without imposing any order, and then generates cooking instructions by attending to both image and its inferred ingredients simultaneously. We extensively evaluate the whole system on the large-scale Recipe1M dataset and show that (1) we improve performance w.r.t. previous baselines for ingredient prediction; (2) we are able to obtain high quality recipes by leveraging both image and ingredients; (3) our system is able to produce more compelling recipes than retrieval-based approaches according to human judgment.

}, url = {http://openaccess.thecvf.com/content_CVPR_2019/html/Salvador_Inverse_Cooking_Recipe_Generation_From_Food_Images_CVPR_2019_paper.html}, author = {Amaia Salvador and Drozdzal, Michal and Xavier Gir{\'o}-i-Nieto and Romero, Adriana} } @article {aGene-Molaa, title = {KFuji RGB-DS database: Fuji apple multi-modal images for fruit detection with color, depth and range-corrected IR data}, journal = {Data in Brief}, year = {2019}, month = {07/2019}, abstract = {

This article contains data related to the research article entitled {\textquotedblleft}Multi-modal Deep Learning for Fruit Detection Using RGB-D Cameras and their Radiometric Capabilities{\textquotedblright} [1]. The development of reliable fruit detection and localization systems is essential for future sustainable agronomic management of high-value crops. RGB-D sensors have shown potential for fruit detection and localization since they provide 3D information with color data. However, the lack of substantial datasets is a barrier to exploiting the use of these sensors. This article presents the KFuji RGB-DS database, which is composed of 967 multi-modal images of Fuji apples on trees captured using a Microsoft Kinect v2 (Microsoft, Redmond, WA, USA). Each image contains information from 3 different modalities: color (RGB), depth (D) and range-corrected IR intensity (S). Ground truth fruit locations were manually annotated, labeling a total of 12,839 apples in the whole dataset. The dataset is publicly available at http://www.grap.udl.cat/publicacions/datasets.html.

}, keywords = {Depth cameras; RGB-D, Fruit detection, Fruit reflectance, Fuji apple, Multi-modal dataset}, doi = {10.1016/j.dib.2019.104289}, author = {Gen{\'e}-Mola, Jordi and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @article {aSala, title = {Measuring traffic lane-changing by converting video into space-time still images}, journal = {Computer-Aided Civil and Infrastructure Engineering}, year = {2019}, month = {01/2019}, abstract = {

Empirical data are needed in order to extend our knowledge of traffic behavior. Video recordings are used to enrich typical data from loop detectors. In this context, data extraction from videos becomes a challenging task. Setting up automatic video processing systems is costly and complex, and the accuracy achieved is usually not enough to improve traffic flow models. In contrast, "visual" data extraction by watching the recordings requires extensive human intervention.

A semi-automatic video processing methodology to count lane changes on freeways is proposed. The method allows counting lane changes faster than with the visual procedure without falling into the complexities and errors of full automation. It consists of converting the video into a set of space-time still images from which to count visually. This methodology has been tested at several freeway locations near Barcelona (Spain) with good results.
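
The core transformation can be sketched in a few lines (array shapes are assumptions): sample one scan-line per frame across the lane marking and stack the lines over time, so that lane changes appear as trajectories crossing the resulting still image.

import numpy as np

video = np.random.rand(300, 480, 640)    # frames x height x width, grayscale stand-in
scanline_row = 240                       # pixel row crossing the lane boundary
space_time = video[:, scanline_row, :]   # (time, width) still image to count from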

}, doi = {https://doi.org/10.1111/mice.12430}, url = {https://onlinelibrary.wiley.com/doi/full/10.1111/mice.12430}, author = {Marcel Sala and Francesc Soriguera and Kevin Huillca and Ver{\'o}nica Vilaplana} } @article {aGene-Mola, title = {Multi-modal Deep Learning for Fuji Apple Detection Using RGB-D Cameras and their Radiometric Capabilities}, journal = {Computers and Electronics in Agriculture}, volume = {162}, year = {2019}, month = {07/2019}, chapter = {689-698}, abstract = {

Fruit detection and localization will be essential for future agronomic management of fruit crops, with applications in yield prediction, yield mapping and automated harvesting. RGB-D cameras are promising sensors for fruit detection given that they provide geometrical information with color data. Some of these sensors work on the principle of time-of-flight (ToF) and, besides color and depth, provide the backscatter signal intensity. However, this radiometric capability has not been exploited for fruit detection applications. This work presents the KFuji RGB-DS database, composed of 967 multi-modal images containing a total of 12,839 Fuji apples. Compilation of the database allowed a study of the usefulness of fusing RGB-D and radiometric information obtained with Kinect v2 for fruit detection. To do so, the signal intensity was range corrected to overcome signal attenuation, obtaining an image that was proportional to the reflectance of the scene. A registration between RGB, depth and intensity images was then carried out. The Faster R-CNN model was adapted for use with five-channel input images: color (RGB), depth (D) and range-corrected intensity signal (S). Results show an improvement of 4.46\% in F1-score when adding depth and range-corrected intensity channels, obtaining an F1-score of 0.898 and an AP of 94.8\% when all channels are used. From our experimental results, it can be concluded that the radiometric capabilities of ToF sensors give valuable information for fruit detection.
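
The five-channel adaptation can be sketched as widening the backbone's first convolution from 3 to 5 input channels (a common recipe, shown here on a VGG-16 backbone; the initialization details are assumptions):

import torch
import torch.nn as nn
from torchvision.models import vgg16

backbone = vgg16(weights=None).features
old = backbone[0]                                    # Conv2d(3, 64, ...)
new = nn.Conv2d(5, old.out_channels, old.kernel_size, old.stride, old.padding)
with torch.no_grad():
    new.weight[:, :3] = old.weight                   # keep the RGB filters
    new.weight[:, 3:] = old.weight[:, :2]            # seed the D and S channels
    new.bias.copy_(old.bias)
backbone[0] = new
features = backbone(torch.randn(1, 5, 224, 224))     # now accepts RGB-D-S input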

}, keywords = {Agricultural robotics, Convolutional Neural Networks, Fruit detection, Fruit reflectance, Multi-modal faster R-CNN, RGB-D}, doi = {10.1016/j.compag.2019.05.016}, author = {Gen{\'e}-Mola, Jordi and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @mastersthesis {xOriol, title = {Multimodal Hate Speech Detection in Memes}, year = {2019}, abstract = {

This thesis explores a multimodal approach to hate speech detection, involving vision and language (text). More specifically, we deal with the context of memes, a form of internet humour which presents additional challenges. We first gather meme data from different sources to create a hate memes dataset for this task. Then, we use this data for the training and evaluation of statistical models based on state-of-the-art neural networks. We study different ways to fine-tune pretrained descriptors for our specific task. We propose a way to add expert knowledge into the system and orient it towards a real-world issue-solving system. We also discuss ways to deal with the reduced amount of data, experimenting with a self-supervised learning approach for pre-training. Finally, we compare the effect and contribution of each modality on the overall performance of the model.

}, author = {Oriol, Benet}, editor = {Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} } @article {aVentura, title = {Multiresolution co-clustering for uncalibrated multiview segmentation}, journal = {Signal Processing: Image Communication}, year = {2019}, abstract = {

We propose a technique for coherently co-clustering uncalibrated views of a scene with a contour-based representation. Our work extends a previous framework, an iterative algorithm for segmenting sequences with small variations, where the partition solution space is too restrictive for scenarios where consecutive images present larger variations. To deal with a more flexible scenario, we present three main contributions. First, motion information has been considered both for region adjacency and region similarity. Second, a two-step iterative architecture is proposed to increase the partition solution space. Third, a feasible global optimization that allows all the views to be processed jointly has been implemented. In addition to these contributions, which are based on low-level features, we have also considered introducing higher-level features, such as semantic information, in the co-clustering algorithm. We evaluate these techniques on multiview and temporal datasets, showing that they outperform state-of-the-art approaches.

}, keywords = {Co-clustering techniques, Image segmentation, Multiview segmentation, Object segmentation}, doi = {10.1016/j.image.2019.04.010}, url = {https://www.sciencedirect.com/science/article/abs/pii/S0923596518302054}, author = {Ventura, C. and David Varas and Ver{\'o}nica Vilaplana and Xavier Gir{\'o}-i-Nieto and Ferran Marques} } @conference {cRamona, title = {Multi-View 3D Face Reconstruction in the Wild using Siamese Networks}, booktitle = {ICCV 2019 Workshop on 3D Face Alignment in the Wild Challenge Workshop (3DFAW)}, year = {2019}, month = {11/2019}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Seoul, South Corea}, abstract = {

In this work, we present a novel learning-based approach to reconstruct 3D faces from a single or multiple images. Our method uses a simple yet powerful architecture based on siamese neural networks that helps to extract relevant features from each view while keeping the models small. Instead of minimizing multiple objectives, we propose to simultaneously learn the 3D shape and the individual camera poses by using a single-term loss based on the reprojection error, which generalizes from one to multiple views. This allows globally optimizing the whole scene without having to tune any hyperparameters and achieving low reprojection errors, which are important for further texture generation. Finally, we train our model on a large-scale dataset with more than 6,000 facial scans. We report competitive results in the 3DFAW 2019 challenge, showing the effectiveness of our method.

}, author = {Ramon, Eduard and Escur, Janna and Xavier Gir{\'o}-i-Nieto} } @article {xGiro-i-Nieto19a, title = {One Perceptron to Rule Them All: Language and Vision}, year = {2019}, abstract = {

Deep neural networks have boosted the convergence of multimedia data analytics in a unified framework shared by practitioners in natural language and vision. Image captioning, visual question answering and multimodal translation are some of the first applications of a new and exciting field that exploits the generalization properties of deep neural representations. This talk provides an overview of how vision and language problems are addressed with deep neural networks, and of the exciting challenges being addressed nowadays by the research community.

}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cLin19, title = {One Shot Learning for Generic Instance Segmentation in RGBD Videos}, booktitle = {International Conference on Computer Vision, Theory and Applications}, year = {2019}, month = {02/2019}, publisher = {SciTePress}, organization = {SciTePress}, address = {Prague}, abstract = {

Hand-crafted features employed in classical generic instance segmentation methods have limited discriminative power to distinguish different objects in the scene, while Convolutional Neural Network (CNN)-based semantic segmentation is restricted to predefined semantics and is not aware of object instances. In this paper, we combine the advantages of the two methodologies and apply the combined approach to solve a generic instance segmentation problem in RGBD video sequences. In practice, a classical generic instance segmentation method is employed to initially detect object instances and build temporal correspondences, whereas instance models are trained based on the few detected instance samples via CNNs to generate robust features for instance segmentation. We exploit the idea of one-shot learning to deal with the small training sample size problem when training CNNs. Experimental results illustrate the promising performance of the proposed approach.

}, keywords = {Convolutional Neural Network, Instance Segmentation, One Shot Learning}, isbn = {978-989-758-354-4}, doi = {10.5220/0007259902330239}, author = {X. Lin and Casas, J. and M. Pard{\`a}s} } @conference {cMas, title = {Picking groups instead of samples: A close look at Static Pool-based Meta-Active Learning}, booktitle = {ICCV Workshop - MDALC 2019}, year = {2019}, month = {10/2019}, address = {Seoul, South Korea}, abstract = {

Active Learning techniques are used to tackle learning problems where obtaining training labels is costly. In this work we use Meta-Active Learning to learn to select a subset of samples from a pool of unlabeled inputs for further annotation. This scenario is called Static Pool-based Meta-Active Learning. We propose to extend existing approaches by performing the selection in a manner that, unlike previous works, can handle the selection of each sample based on the whole selected subset.

}, author = {Mas, Ignasi and Morros, J.R. and Ver{\'o}nica Vilaplana} } @article {xLLados19, title = {The pillars of the Computer Vision Catalan Alliance}, year = {2019}, abstract = {

Panel at the 6th Annual Catalan Meeting on Computer Vision.

}, author = {LLad{\'o}s, Josep and Bou, Elisenda and Bressan, Marco and Sala, Oscar and Xavier Gir{\'o}-i-Nieto} } @unpublished {cRamon19, title = {Plug-and-Train Loss for Model-Based Single View 3D Reconstruction}, journal = {BMVA Technical Meeting: 3D vision with Deep Learning}, year = {2019}, month = {02/2019}, publisher = {UPC}, address = {London, UK}, abstract = {

Obtaining 3D geometry from images is a well-studied problem in the computer vision community. In the concrete case of a single image, a considerable amount of prior knowledge is often required to obtain plausible reconstructions. Recently, deep neural networks in combination with 3D morphable models (3DMM) have been used to address the lack of scene information, leading to more accurate results. Nevertheless, the losses employed during the training process are usually a linear combination of terms whose coefficients, also called hyperparameters, must be carefully tuned for each dataset to obtain satisfactory results. In this work we propose a hyperparameter-free loss that exploits the geometry of the problem for learning 3D reconstruction from a single image. The proposed formulation is not dataset dependent, is robust against very large camera poses and jointly optimizes the shape of the object and the camera pose.

}, author = {Ramon, Eduard and Villar, Jordi and Ruiz, Guillermo and Batard, Thomas and Xavier Gir{\'o}-i-Nieto} } @conference {cPareto, title = {Prediction of a second clinical event in CIS patients by combining lesion and brain features}, booktitle = {Congress of the European Comitee for Treatment and Research in Multiple Sclerosis (ECTRIMS 2019)}, year = {2019}, month = {09/2019}, abstract = {

Prediction of a second clinical event in clinically isolated syndrome (CIS) patients, leading to clinically definite multiple sclerosis (CDMS), is still a matter of investigation. The aim of the current study was to predict conversion to CDMS after a first attack, by means of a machine-learning approach and MRI-derived brain features. For that purpose, lesion-based features (volume and distribution within brain lobes) as well as cortical thickness (CT) and deep grey matter volumes (DGMV) were used. The final goal was to determine which features were more frequently found in the classification between converters and non-converters.

}, author = {Deborah Pareto and Pau Vidal and M. Alberich and Carlos Lopez and C. Auger and M. Tintor{\'e} and X. Montalban and J. Sastre-Garriga and Ver{\'o}nica Vilaplana and Alex Rovira} } @article {aPetrone, title = {Prediction of amyloid pathology in cognitively unimpaired individuals using voxelwise analysis of longitudinal structural brain MRI}, journal = {Alzheimer{\textquoteright}s Research \& Therapy}, volume = {11}, year = {2019}, month = {12/2019}, abstract = {

Background: Magnetic resonance imaging (MRI) has unveiled specific alterations at different stages of the Alzheimer{\textquoteright}s disease (AD) pathophysiologic continuum, constituting what has been established as the {\textquoteleft}AD signature{\textquoteright}. To what extent structural MRI can detect amyloid-related cerebral changes in cognitively unimpaired individuals is still an open area for exploration.

Method: Longitudinal 3D-T1 MRI scans were acquired from a subset of the ADNI cohort comprising 403 subjects: 79 controls (Ctrls), 50 preclinical AD (PreAD), and 274 MCI and dementia due to AD (MCI/AD). CSF amyloid was used as the gold-standard measure with established cut-offs (\<192pg/mL) to define diagnostic categories. Cognitively unimpaired individuals were defined as Ctrls if they were amyloid negative, and as PreAD otherwise. The MCI/AD group was amyloid positive. Only subjects with the same diagnostic category at baseline and follow-up visits were considered for the study. Longitudinal morphometric analysis was performed using SPM12 to calculate Jacobian determinant maps. Statistical analysis was carried out on these Jacobian maps to identify structural changes that were significantly different between diagnostic categories. A machine learning classifier was applied on the Jacobian determinant maps to predict the presence of abnormal amyloid levels in cognitively unimpaired individuals. The performance of this classifier was evaluated using receiver operating characteristic curve analysis and as a function of the follow-up time between MRI scans. We applied a cost function to assess the benefit of using this classifier in the triaging of individuals in a clinical-trial recruitment setting.

Results: The optimal follow-up time for classification of Ctrls vs PreAD was Δt\>2.5 years and hence, only subjects within this temporal span are used for evaluation (15 Ctrls, 10 PreAD). The longitudinal voxel-based classifier achieved an AUC=0.87 (95\%CI:0.72-0.97). The brain regions that showed the highest discriminative power to detect amyloid abnormalities were the medial, inferior and lateral temporal lobes, precuneus, caudate heads, basal forebrain and lateral ventricles.

Conclusions: Our work shows that machine learning applied to longitudinal brain volumetric changes can be used to predict, with high precision, the presence of amyloid abnormalities in cognitively unimpaired subjects. Used as a triaging method to identify a fixed number of amyloid-positive individuals, this longitudinal voxelwise classifier is expected to avoid 55\% of unnecessary CSF and/or PET scans and reduce economic costs by 40\%.

}, doi = {https://doi.org/10.1186/s13195-019-0526-8}, url = {https://link.springer.com/article/10.1186/s13195-019-0526-8}, author = {Paula Petrone and Adri{\`a} Casamitjana and Carles Falcon and Miguel Artigues C{\`a}naves and G. Operto and R. Cacciaglia and Jose Luis Molinuevo and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @conference {cSalembier19, title = {Processing Radar Images with Hierarchical Region-Based Representations and Graph Signal Processing Tools}, booktitle = {Workshop on Digital Topology and Mathematical Morphology associated to the 21st IAPR International Conference on Discrete Geometry for Computer Imagery, Keynote speech}, year = {2019}, address = {Paris, France}, author = {Salembier, P.} } @conference {cHerrera-Palacioa, title = {Recurrent Instance Segmentation using Sequences of Referring Expressions}, booktitle = {NeurIPS workshop on Visually Grounded Interaction and Language (ViGIL)}, year = {2019}, month = {09/2019}, address = {Vancouver, Canada}, abstract = {

The goal of this work is segmenting the objects in an image which are referred to by a sequence of linguistic descriptions (referring expressions). We propose a deep neural network with recurrent layers that output a sequence of binary masks, one for each referring expression provided by the user. The recurrent layers in the architecture allow the model to condition each predicted mask on the previous ones, from a spatial perspective within the same image. Our multimodal approach uses off-the-shelf architectures to encode both the image and the referring expressions. The visual branch provides a tensor of pixel embeddings that are concatenated with the phrase embeddings produced by a language encoder. Our experiments on the RefCOCO dataset for still images indicate how the proposed architecture successfully exploits the sequences of referring expressions to solve a pixel-wise task of instance segmentation.
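
The fusion step can be sketched as follows (dimensions are assumptions): the phrase embedding is tiled over the spatial grid and concatenated with the pixel embeddings before the recurrent decoder predicts one mask per expression.

import torch

pixel_emb = torch.randn(1, 256, 32, 32)       # visual branch: per-pixel embeddings
phrase_emb = torch.randn(1, 128)              # language encoder: one referring expression
tiled = phrase_emb[:, :, None, None].expand(-1, -1, 32, 32)
fused = torch.cat([pixel_emb, tiled], dim=1)  # (1, 384, 32, 32) multimodal tensor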

}, author = {Herrera-Palacio, Alba and Ventura, C. and Silberer, Carina and Sorodoc, Ionut-Teodor and Boleda, Gemma and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xHerrera-Palacio, title = {Recurrent Instance Segmentation with Linguistic Referring Expressions}, year = {2019}, abstract = {

The goal of this work is segmenting the object in an image or video which is referred to by a linguistic description (referring expression). We propose a deep neural network with recurrent layers that output a sequence of binary masks, one for each referring expression provided by the user. The recurrent layers in the architecture allow the model to condition each predicted mask on the previous ones, from a spatial perspective within the same image. Our multimodal approach uses off-the-shelf architectures to encode both the image and the referring expressions. The visual branch provides a tensor of pixel embeddings that are concatenated with the phrase embeddings produced by a language encoder. We focus our study on comparing different configurations to encode and combine the visual and linguistic representations. Our experiments on the RefCOCO dataset for still images indicate how the proposed architecture successfully exploits the referring expressions to solve a pixel-wise task of instance segmentation.

}, author = {Herrera-Palacio, Alba}, editor = {Xavier Gir{\'o}-i-Nieto and Ventura, C. and Silberer, Carina} } @conference {cRuiz-Hidalgo19, title = {Residual Attention Graph Convolutional Network for Geometric 3D Scene Classification}, booktitle = {IEEE Conference on Computer Vision Workshop (ICCVW)}, year = {2019}, month = {11/2019}, publisher = {IEEE}, organization = {IEEE}, address = {Seoul, Korea}, abstract = {

Geometric 3D scene classification is a very challenging task. Current methodologies extract the geometric information using only a depth channel provided by an RGBD sensor. These kinds of methodologies introduce possible errors due to missing local geometric context in the depth channel. This work proposes a novel Residual Attention Graph Convolutional Network that exploits the intrinsic geometric context inside a 3D space without using any kind of point features, allowing the use of organized or unorganized 3D data. Experiments are done in NYU Depth v1 and SUN-RGBD datasets to study the different configurations and to demonstrate the effectiveness of the proposed method. Experimental results show that the proposed method outperforms current state-of-the-art in geometric 3D scene classification tasks.\ 

}, doi = {10.1109/ICCVW.2019.00507}, url = {https://imatge-upc.github.io/ragc/}, author = {Mosella-Montoro, Albert and Ruiz-Hidalgo, J.} } @conference {cGullon, title = {Retinal lesions segmentation using CNNs and adversarial training}, booktitle = {International Symposium on Biomedical Imaging (ISBI 2019)}, year = {2019}, month = {04/2019}, abstract = {

Diabetic retinopathy (DR) is an eye disease associated with diabetes mellitus that affects retinal blood vessels. Early detection is crucial to prevent vision loss. The most common method for detecting the disease is the analysis of digital fundus images, which show lesions of small vessels and functional abnormalities.

Manual detection and segmentation of lesions is a time-consuming task requiring proficient skills. Automatic methods for retinal image analysis could help ophthalmologists in large-scale screening programs of population with diabetes mellitus allowing cost-effective and accurate diagnosis.

In this work we propose a fully convolutional neural network with adversarial training to automatically segment DR lesions in funduscopy images.

}, author = {Nat{\`a}lia Gull{\'o}n and Ver{\'o}nica Vilaplana} } @conference {cVenturaa, title = {RVOS: End-to-End Recurrent Network for Video Object Segmentation}, booktitle = {CVPR}, year = {2019}, month = {06/2019}, publisher = {OpenCVF / IEEE}, organization = {OpenCVF / IEEE}, address = {Long Beach, CA, USA}, abstract = {

Multiple-object video object segmentation is a challenging task, especially in the zero-shot case, when no object mask is given at the initial frame and the model has to find the objects to be segmented along the sequence. In our work, we propose RVOS, a recurrent network that is fully end-to-end trainable for multiple-object video object segmentation, with a recurrence module working on two different domains: (i) the spatial, which allows discovering the different object instances within a frame, and (ii) the temporal, which allows keeping the coherence of the segmented objects along time. We train RVOS for zero-shot video object segmentation and are the first to report quantitative results for the DAVIS-2017 and YouTube-VOS benchmarks. Further, we adapt RVOS for one-shot video object segmentation by using the masks obtained in previous time-steps as inputs to be processed by the recurrent module. Our model reaches results comparable to state-of-the-art techniques on the YouTube-VOS benchmark and outperforms all previous video object segmentation methods not using online learning on the DAVIS-2017 benchmark.

}, url = {http://openaccess.thecvf.com/content_CVPR_2019/html/Ventura_RVOS_End-To-End_Recurrent_Network_for_Video_Object_Segmentation_CVPR_2019_paper.html}, author = {Ventura, C. and M{\'\i}riam Bellver and Girbau, A. and Amaia Salvador and Marqu{\'e}s, F. and Xavier Gir{\'o}-i-Nieto} } @conference {cGarcia-Gomez19, title = {Self-registered lidar and polarimetric images in real-time: application to detection of small objects at sea}, booktitle = {7th Workshop on Active Imaging}, year = {2019}, month = {11/2019}, publisher = {French-German Research Institute of Saint-Louis (ISL)}, organization = {French-German Research Institute of Saint-Louis (ISL)}, address = {Saint-Louis, France}, author = {Pablo Garc{\'\i}a-G{\'o}mez and Jordi Riu and Casas, J. and S. Royo} } @article {aCasamitjanaa, title = {Shared latent structures between imaging features and biomarkers in early stages of Alzheimer{\textquoteright}s disease: a predictive study}, journal = {IEEE Journal of Biomedical and Health Informatics}, year = {2019}, month = {08/2019}, abstract = {

Magnetic resonance imaging (MRI) provides high-resolution brain morphological information and is used as a biomarker in neurodegenerative diseases. Population studies of brain morphology often seek to identify pathological structural changes related to different diagnostic categories (e.g. controls, mild cognitive impairment or dementia), which normally describe highly heterogeneous groups with a single categorical variable. Instead, multiple biomarkers are used as a proxy for pathology and are more powerful in capturing structural variability. Hence, using the joint modeling of brain morphology and biomarkers, we aim at describing structural changes related to any brain condition by means of a few underlying processes. In this regard, we use a multivariate approach based on Projection to Latent Structures in its regression variant (PLSR) to study structural changes related to aging and AD pathology. MRI volumetric and cortical thickness measurements are used for brain morphology, and cerebrospinal fluid (CSF) biomarkers (t-tau, p-tau and amyloid-beta) are used as a proxy for AD pathology. By relating both sets of measurements, PLSR finds a low-dimensional latent space describing AD pathological effects on brain structure. The proposed framework allows separately modeling aging effects on brain morphology as a confounder variable orthogonal to the pathological effect. The predictive power of the associated latent spaces (i.e. the capacity of predicting biomarker values) is assessed in a cross-validation framework.
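
A minimal PLS-regression sketch with scikit-learn on toy data (the study's actual inputs are MRI volumetric and cortical thickness measurements and CSF biomarkers):

import numpy as np
from sklearn.cross_decomposition import PLSRegression

X = np.random.rand(100, 50)          # brain morphology features per subject
Y = np.random.rand(100, 3)           # t-tau, p-tau, amyloid-beta (stand-ins)
pls = PLSRegression(n_components=2)  # low-dimensional latent space
pls.fit(X, Y)
Y_hat = pls.predict(X)               # biomarker prediction from morphology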

}, keywords = {CSF biomarkers, Latent model, MRI, PLS, preclinical AD}, doi = {10.1109/JBHI.2019.2932565}, author = {Adri{\`a} Casamitjana and Paula Petrone and J.L. Molinuevo and Juan D. Gispert and Ver{\'o}nica Vilaplana} } @article {aSalembier18, title = {Ship Detection in SAR Images Based on Maxtree Representation and Graph Signal Processing}, journal = {IEEE Transactions on Geoscience and Remote Sensing}, volume = {57}, year = {2019}, month = {05/2019}, pages = {2709 - 2724}, author = {Salembier, P. and Liesegang, S. and L{\'o}pez-Mart{\'\i}nez, C.} } @conference {cLinardosa, title = {Simple vs complex temporal recurrences for video saliency prediction}, booktitle = {British Machine Vision Conference (BMVC)}, year = {2019}, month = {09/2019}, publisher = {British Machine Vision Association}, organization = {British Machine Vision Association}, address = {Cardiff, Wales / UK.}, abstract = {

This paper investigates modifying an existing neural network architecture for static saliency prediction using two types of recurrences that integrate information from the temporal domain. The first modification is the addition of a ConvLSTM within the architecture, while the second is a computationally simple exponential moving average of an internal convolutional state. We use weights pre-trained on the SALICON dataset and fine-tune our model on DHF1K. Our results show that both modifications achieve state-of-the-art results and produce similar saliency maps.
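
The second recurrence is simple enough to sketch directly (the smoothing factor and feature shapes are assumptions): an exponential moving average of an internal convolutional state across frames.

import torch

alpha, state = 0.1, None
for frame_feat in torch.randn(16, 256, 28, 28):   # per-frame convolutional features
    state = frame_feat if state is None else alpha * frame_feat + (1 - alpha) * state
# `state` now integrates temporal context for the saliency output.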

}, url = {https://bmvc2019.org/wp-content/uploads/papers/0952-paper.pdf}, author = {Linardos, Panagiotis and Mohedano, Eva and Nieto, Juan Jos{\'e} and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto and McGuinness, Kevin} } @article {aKuijf, title = {Standardized Assessment of Automatic Segmentation of White Matter Hyperintensities; Results of the WMH Segmentation Challenge}, journal = {IEEE Transactions on Medical Imaging}, year = {2019}, month = {03/2019}, abstract = {

Quantification of white matter hyperintensities (WMH) of presumed vascular origin is of key importance in many neurological research studies. Advanced measurements are obtained from manual segmentations on brain MR images, which is a laborious procedure. Automatic WMH segmentation methods exist, but a standardized comparison of such methods is lacking. We organized a scientific challenge, in which developers could evaluate their method on a standardized multi-center/-scanner image dataset, giving an objective comparison: the WMH Segmentation Challenge (http://wmh.isi.uu.nl/). Sixty T1+FLAIR images from three MR scanners were released with manual WMH segmentations. A secret test set of 110 images from five MR scanners was used for evaluation. Methods had to be containerized and submitted to the challenge organizers. Five evaluation metrics were used to rank the methods: (1) Dice Similarity Coefficient, (2) modified Hausdorff distance (95th percentile), (3) absolute percentage volume difference, (4) sensitivity for detecting individual lesions, and (5) F1-score for individual lesions. Additionally, methods were ranked on their inter-scanner robustness. Twenty participants submitted their method for evaluation. This paper provides a detailed analysis of the results. In brief, there is a cluster of four methods that rank significantly better than the other methods. There is one clear winner, which also has the best inter-scanner robustness. The challenge remains open for future submissions and provides a public platform for method evaluation.
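
For reference, the first of the five ranking metrics, the Dice Similarity Coefficient, can be computed as follows for binary masks (a generic implementation, not the challenge's evaluation code):

import numpy as np

def dice(pred, truth):
    pred, truth = pred.astype(bool), truth.astype(bool)
    intersection = np.logical_and(pred, truth).sum()
    return 2.0 * intersection / (pred.sum() + truth.sum())

print(dice(np.ones((4, 4)), np.ones((4, 4))))   # 1.0 for a perfect segmentation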

}, keywords = {brain, Evaluation and performance, Magnetic resonance imaging (MRI), segmentation}, issn = {0278-0062}, doi = {10.1109/TMI.2019.2905770}, author = {Hugo Kuijf and Matthijs Biesbroek and Jeroen de Bresser and Rutger Heinen and Simon Andermatt and Mariana Bento and Matt Berseth and Mikhail Belyaev and Jorge Cardoso and Adri{\`a} Casamitjana and Louis Collins and Mahsa Dadar and Achileas Georgiou and Mohsen Ghafoorian and Dakai Jin and April Khademi and Jesse Knight and Hongwei Li and Xavier Llado and Miguel Luna and Qaiser Mahmood and Richard McKinley and Alireza Mehrtash and Sebastien Ourselin and Bo-yong Park and Hyunkin Park and Sang Hyun Park and Simon Pezold and Elodie Puybareau and Leticia Rittner and Carole Sudre and Sergi Valverde and Ver{\'o}nica Vilaplana and Rolan Wiest and Yongchao Xu and Ziyue Xu and Guodong Zeng and Jianguo Zhang and Guoyan Zheng and Christoper Chen and Wiesje van der Flier and Frederik Barkhof and Max Viergever and Geert Jan Biessels} } @phdthesis {dCasamitjana19, title = {Study of early stages of Alzheimer{\textquoteright}s disease using magnetic resonance imaging}, volume = {PhD}, year = {2019}, school = {Universitat Polit{\`e}cnica de Catalunya}, address = {Barcelona}, abstract = {

Alzheimer{\textquoteright}s disease (AD) is a neurodegenerative disorder that constitutes the most common dementia pathology. It represents a global epidemic that expands exponentially as life expectancy increases, with no useful treatment yet. Currently, it represents a huge social and economic burden for our societies, and it is expected to strain public health infrastructures and finances in the near future.

AD is characterized by amyloid plaque deposition and neurofibrillary tangles measured by ex-vivo examination of the brain. Recent developments in fluid biomarkers and brain imaging allow in-vivo quantification of pathophysiological processes of amyloid deposition or tau tangle formation in the brain, providing the community with highly sensitive and specific in-vivo biomarkers for Alzheimer{\textquoteright}s disease diagnosis. Abnormal levels of these biomarkers are thought of as the initiating event of a cascade of subsequent events that continues with synapse loss, cell death, memory impairment, functional dysfunction and cognitive decline. All these events constitute the Alzheimer{\textquoteright}s continuum, which can be broadly split into two main parts: an initial long and silent preclinical stage, characterized by abnormal AD biomarkers and cognition within the normal range, that could last from 15 to 30 years, and a posterior clinical stage where subjects develop dementia symptoms.

The etiology of AD is still poorly understood, even though several risk factors have been identified. Large observational studies can help the study of AD and its related biomarkers and risk factors. In this thesis we provide methodological tools for the analysis of Alzheimer{\textquoteright}s disease using magnetic resonance imaging (MRI). We focus on the study of subjects within the preclinical stage of AD, using statistical learning and pattern recognition frameworks to perform inferential statistics and develop predictive models.

The main outcomes of this thesis are three-fold. Firstly, we develop an open-source toolbox for nonlinear neuroimage analysis in population studies. While nonlinear association between medical images and several factors is already known, standard neuroimaging software only provides linear statistical frameworks that limit the analyses. Secondly, we study the relationship between brain structure, using MRI, and the underlying Alzheimer{\textquoteright}s pathology along the disease continuum and at different stages. The close relationship between MRI and clinical symptoms has been widely studied, but describing AD using biomarkers instead of clinical phenotypes allows us to study the preclinical stages of AD. Finally, we present a framework to predict cognitively unimpaired, amyloid-positive subjects using MR imaging and machine learning. We report the results in a cross-sectional study and in a longitudinal study that compares the volumetric rate of change between subjects with different amyloid status. We further test the proposed methodology as part of the triaging process in clinical trials, showing great potential benefits.

}, url = {http://hdl.handle.net/10803/668251}, author = {Adri{\`a} Casamitjana}, editor = {Ver{\'o}nica Vilaplana} } @mastersthesis {xCasals19, title = {Synthesis of acne images for data augmentation with generative adversarial networks}, year = {2019}, abstract = {

Generative Adversarial Networks (GANs) are deep learning architectures known for their usefulness in synthesizing new images. Conditioned image generation and the synthesis of super-resolution images are some of their main uses, but they are also helpful when tackling particular image classification and segmentation problems. The latter application is the motivation for the work presented in this document.

This work studies the synthesis of acne images for data augmentation, a procedure validated by using the synthetic images to tackle an image classification problem.

The main challenge is to work around the instability in the training of GANs. Therefore, different known solutions are implemented in order to overcome this problem.

}, author = {Roger Casals}, editor = {Ver{\'o}nica Vilaplana} } @inbook {bBellot19, title = {Unsupervised GRN Ensemble}, booktitle = {Sanguinetti G., Huynh-Thu V. (eds) Methods in Molecular Biology }, volume = {1883}, year = {2019}, pages = {283-302}, publisher = { Springer science, Humana Press}, organization = { Springer science, Humana Press}, address = {New York, NY}, issn = {978-1-4939-8881-5}, doi = {https://doi-org.recursos.biblioteca.upc.edu/10.1007/978-1-4939-8882-2}, author = {P. Bellot and Salembier, P. and Pham, N.C. and Meyer, P. E.} } @conference {cGene-Mola19, title = {Uso de redes neuronales convolucionales para la detecci{\'o}n remota de frutos con c{\'a}maras RGB-D}, booktitle = {Congreso Ib{\'e}rico de Agroingenier{\'\i}a}, year = {2019}, month = {09/2019}, publisher = { Universidad de Zaragoza (UZA)}, organization = { Universidad de Zaragoza (UZA)}, address = {Huesca}, abstract = {

Remote fruit detection will be an indispensable tool for the optimized and sustainable agronomic management of the fruit plantations of the future, with applications in harvest forecasting, harvesting robotization and the production of yield maps. This work proposes the use of RGB-D depth cameras for the detection and subsequent 3D localization of fruits. The equipment used for data acquisition consists of a self-propelled terrestrial platform equipped with two Microsoft Kinect v2 sensors and an RTK-GNSS positioning system. With this equipment, 3 rows of Fuji apple trees in a commercial orchard were scanned. The acquired dataset is composed of 110 captures containing a total of 12,838 Fuji apples. Fruit detection was performed on the RGB data (color images provided by the sensor). To this end, the Faster R-CNN object detection convolutional neural network was implemented and trained; it is composed of two modules: a region proposal network and a classification network. Both modules share the first convolutional layers, following the VGG-16 model pre-trained on the ImageNet database. Test results show a detection rate of 91.4\% of the fruits with 15.9\% false positives (F1-score = 0.876). Qualitative evaluation of the detections shows that the false positives correspond to image areas that present a pattern very similar to an apple, where, even to the human eye, it is difficult to determine whether there is an apple or not. On the other hand, the undetected apples correspond to those that were almost entirely occluded by other vegetative organs (leaves or branches) or to apples cut off by the image margins. From the experimental results, it is concluded that the Kinect v2 sensor has great potential for the detection and 3D localization of fruits. The main limitation of the system is that the performance of the depth sensor is affected under high-illumination conditions.

}, keywords = {C{\'a}maras de profundidad, Detecci{\'o}n de frutos, Redes neuronales convolucionales, RGB-D, Rob{\'o}tica agr{\'\i}cola}, doi = {https://doi.org/10.26754/c_agroing.2019.com.3325}, author = {Gen{\'e}-Mola, Jordi and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @mastersthesis {xGranero, title = {A Video Database for Analyzing Affective Physiological Responses}, year = {2019}, abstract = {

Affective computing, leveraged by machine learning techniques, is advancing rapidly in the task of affect recognition in videos. However, there is a need for more annotated data. Several studies have built huge video datasets with emotion annotations. Others have collected music video or film scene datasets with physiological signals. However, none of them approached a solution with both physiological signals and user-generated videos. The work introduced here presents GALLUS, a novel database of user-generated videos with affective physiological responses. The database is composed of 775 videos that have been previously annotated through an online crowdsourcing platform. Physiological responses such as electroencephalography, electrocardiography, galvanic skin response, facial emotion recognition, and eye-gaze have been collected from 30 participants while they watched the stimuli. Our dataset will be made public to foster research in affect recognition.

}, author = {Granero, Marcel}, editor = {Borth, Damian and Weber, Barbara and Xavier Gir{\'o}-i-Nieto} } @conference {cHerrera-Palacio, title = {Video Object Linguistic Grounding}, booktitle = {ACM Multimedia Workshop on Multimodal Understanding and Learning for Embodied Applications (MULEA)}, year = {2019}, month = {10/2019}, publisher = {ACM}, organization = {ACM}, address = {Nice, France}, abstract = {

The goal of this work is to segment, in a video sequence, the objects that are mentioned in a linguistic description of the scene. We have adapted an existing deep neural network that achieves state-of-the-art performance in semi-supervised video object segmentation, adding a linguistic branch that generates an attention map over the video frames, making the segmentation of the objects temporally consistent along the sequence.

}, author = {Herrera-Palacio, Alba and Ventura, C. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xNieto, title = {Video Saliency Prediction with Deep Neural Networks}, year = {2019}, abstract = {

Saliency prediction is a topic undergoing intense study in computer vision, with a broad range of applications. It consists in predicting where the attention of a human observer will land in an image or a video. Our work is based on a deep neural network named SalGAN, which was trained on a saliency-annotated dataset of static images. In this thesis we investigate different approaches for extending SalGAN to the video domain. To this end, we use the recently proposed saliency-annotated video dataset DHF1K to train and evaluate our models. The obtained results indicate that techniques such as depth estimation or coordconv can effectively be used as additional modalities to enhance the saliency prediction of static images obtained with SalGAN, achieving encouraging results in the DHF1K benchmark. Our work is based on PyTorch and it is publicly available here.
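
The coordconv idea mentioned above can be sketched as appending normalized coordinate channels to the input before a convolution (channel counts are assumptions):

import torch
import torch.nn as nn

def add_coords(x):
    b, _, h, w = x.shape
    ys = torch.linspace(-1, 1, h).view(1, 1, h, 1).expand(b, 1, h, w)
    xs = torch.linspace(-1, 1, w).view(1, 1, 1, w).expand(b, 1, h, w)
    return torch.cat([x, ys, xs], dim=1)    # two extra location channels

conv = nn.Conv2d(3 + 2, 16, 3, padding=1)
out = conv(add_coords(torch.randn(1, 3, 64, 64)))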

}, author = {Nieto, Juan Jos{\'e}}, editor = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @conference {cFernandeze, title = {VLX-Stories: a Semantically Linked Event platform for media publishers}, booktitle = {Proceedings of the ISWC 2019 Satellite Tracks (Posters \& Demonstrations, Industry, and Outrageous Ideas) co-located with 18th International Semantic Web Conference (ISWC 2019)}, year = {2019}, month = {10/2019}, publisher = {CEUR Workshop Proceeedings}, organization = {CEUR Workshop Proceeedings}, address = {Auckland, New Zealand}, abstract = {

In this article we present a web platform used by media producers to monitor world events detected by VLX-Stories. The event detector system retrieves multi-regional articles from news sites, aggregates them by topic, and summarizes them by extracting and structuring their most relevant entities in order to answer the journalistic W{\textquoteright}s: who, what, when and where. The dashboard displays detected events in a semantically linked space which allows navigation among trending news stories across countries, categories and time. Moreover, detected events are linked to customer contents, helping the editorial process by providing real-time access to breaking news related to their contents. (Demo paper)

}, url = {http://ceur-ws.org/Vol-2456/paper61.pdf}, author = {Fern{\`a}ndez, D{\`e}lia and Bou, Elisenda and Xavier Gir{\'o}-i-Nieto} } @conference {cFernandezd, title = {VLX-Stories: building an online Event Knowledge Base with Emerging Entity detection}, booktitle = {The Semantic Web {\textendash} ISWC 2019}, year = {2019}, month = {10/2019}, pages = {382-399}, publisher = {Springer, Cham}, organization = {Springer, Cham}, chapter = {24}, address = {Auckland, New Zealand}, abstract = {

We present an online multilingual system for event detection and comprehension from media feeds. The system retrieves information from news sites and social networks, aggregates it into events (event detection), and summarizes them by extracting semantic labels of their most relevant entities (event representation) in order to answer the journalistic W{\textquoteright}s: who, what, when and where. The generated events populate VLX-Stories - an event Knowledge Base (KB) - transforming unstructured text data into a structured knowledge base representation. Our system exploits an external entity Knowledge Base (VLX-KG) to help populate VLX-Stories. At the same time, this external knowledge base can also be extended with a Dynamic Entity Linking (DEL) module, which detects Emerging Entities (EE) in unstructured data and adds them to VLX-KG. The system is currently used in production, detecting over 6000 monthly events from over 3500 news feeds from seven different countries and in three different languages.

}, keywords = {emerging entities, Entity Linking, event encoding, knowledge base population. knowledge graph, topic detection}, issn = {978-3-030-30796-7}, doi = {10.1007/978-3-030-30796-7_24}, url = {https://link.springer.com/chapter/10.1007/978-3-030-30796-7_24}, author = {Fern{\`a}ndez, D{\`e}lia and Bou, Elisenda and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xTubau, title = {Wav2Pix: Enhancement and Evaluation of a Speech-conditioned Image Generator}, year = {2019}, abstract = {

Interest in image synthesis has grown exponentially over the last few years. Not long ago, a very powerful tool was invented for this task: Generative Adversarial Networks (GANs). Since their high performance in generating realistic images has been proven, many researchers are now putting the focus on cross-modal learning.

Taking advantage of the large amount of information we can extract from speech (such as identity, gender or emotional state), in this work we explore its potential to generate face images of a speaker by conditioning a GAN with his/her voice. We propose the enhancement and evaluation of a deep neural network that is trained from scratch in an end-to-end fashion, generating a face directly from the raw speech waveform without any additional identity information (e.g., reference image or one-hot encoding).

This project focuses on the enhancement of a previous model proposed by Francisco Rold{\'a}n. As a result of a deep analysis of the former project{\textquoteright}s strengths and weaknesses, we present a novel dataset collected for this work, with high-quality videos of ten YouTubers with notable expressiveness in both the speech and visual signals. In addition, unlike in the preliminary project, four different techniques are proposed in order to assess the results.

}, author = {Tubau, Miquel}, editor = {Amanda Duarte and Xavier Gir{\'o}-i-Nieto} } @conference {cDuartea, title = {Wav2Pix: Speech-conditioned Face Generation using Generative Adversarial Networks}, booktitle = {ICASSP}, year = {2019}, month = {05/2019}, publisher = {IEEE}, organization = {IEEE}, address = {Brighton, UK}, abstract = {

Speech is a rich biometric signal that contains information about the identity, gender and emotional state of the speaker. In this work, we explore its potential to generate face images of a speaker by conditioning a Generative Adversarial Network (GAN) with raw speech input. We propose a deep neural network that is trained from scratch in an end-to-end fashion, generating a face directly from the raw speech waveform without any additional identity information (e.g., reference image or one-hot encoding). Our model is trained in a self-supervised fashion by exploiting the audio and visual signals naturally aligned in videos. With the purpose of training from video data, we present a novel dataset collected for this work, with high-quality videos of ten YouTubers with notable expressiveness in both the speech and visual signals.
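
A minimal sketch of this conditioning scheme, assuming PyTorch and illustrative layer sizes (this is not the published Wav2Pix architecture): a 1D convolutional encoder summarizes the raw waveform into an embedding that seeds a deconvolutional image generator. In the full adversarial setup, a discriminator over real and generated faces would drive training; only the conditioning path is sketched here.

import torch
import torch.nn as nn

class SpeechEncoder(nn.Module):
    # Summarizes a raw waveform (batch, 1, samples) into an embedding.
    def __init__(self, embed_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(1, 32, 31, stride=4, padding=15), nn.ReLU(),
            nn.Conv1d(32, 64, 31, stride=4, padding=15), nn.ReLU(),
            nn.AdaptiveAvgPool1d(1))
        self.fc = nn.Linear(64, embed_dim)

    def forward(self, wav):
        return self.fc(self.net(wav).squeeze(-1))

class FaceGenerator(nn.Module):
    # Decodes the speech embedding into a 64x64 RGB image.
    def __init__(self, embed_dim=128):
        super().__init__()
        self.net = nn.Sequential(
            nn.ConvTranspose2d(embed_dim, 256, 4), nn.ReLU(),
            nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1), nn.ReLU(),
            nn.ConvTranspose2d(32, 3, 4, stride=2, padding=1), nn.Tanh())

    def forward(self, speech_embed):
        z = speech_embed[:, :, None, None]  # (batch, embed_dim, 1, 1)
        return self.net(z)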

}, doi = {10.1109/ICASSP.2019.8682970}, url = {http://hdl.handle.net/2117/167073}, author = {Amanda Duarte and Rold{\'a}n, Francisco and Tubau, Miquel and Escur, Janna and Pascual-deLaPuente, Santiago and Amaia Salvador and Mohedano, Eva and McGuinness, Kevin and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cXu, title = {Action Tube Extraction based 3D -CNN for RGB-D Action Recognition}, booktitle = {International Conference on Content-Based Multimedia Indexing CBMI 2018}, year = {2018}, month = {09/2018}, abstract = {

In this paper we propose a novel action tube extractor for RGB-D action recognition in trimmed videos. The action tube extractor takes as input a video and outputs an action tube. The method consists of two parts: spatial tube extraction and temporal sampling. The first part is built upon MobileNet-SSD and its role is to define the spatial region where the action takes place. The second part is based on the structural similarity index (SSIM) and is designed to remove frames without obvious motion from the primary action tube. The final extracted action tube has two benefits: 1) a higher ratio of ROI (subjects of action) to background; 2) most frames contain obvious motion change. We propose to use a two-stream (RGB and Depth) I3D architecture as our 3D-CNN model. Our approach outperforms the state-of-the-art methods on the OA and NTU RGB-D datasets.
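
A minimal sketch of the SSIM-based temporal sampling step, assuming uint8 grayscale frames and an illustrative threshold (the criterion and threshold used by the authors may differ):

import numpy as np
from skimage.metrics import structural_similarity

def temporal_sampling(frames, ssim_threshold=0.9):
    # frames: list of 2D uint8 grayscale numpy arrays.
    # A high SSIM with the last kept frame means almost no motion,
    # so such frames are dropped from the primary action tube.
    kept = [frames[0]]
    for frame in frames[1:]:
        if structural_similarity(kept[-1], frame) < ssim_threshold:
            kept.append(frame)
    return kept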

}, author = {Zineng Xu and Ver{\'o}nica Vilaplana and Morros, J.R.} } @inbook {bTochon18, title = {Advances in utilization of hierarchical representations in remote sensing data analysis}, booktitle = {Reference Module in Earth Systems and Environmental Sciences}, volume = {2}, year = {2018}, pages = {77-107}, publisher = {Elsevier}, organization = {Elsevier}, issn = {978-0-12-409548-9}, author = {G. Tochon and Dalla Mura, M. and Veganzones, M.A. and Valero, S. and Salembier, P. and Chanussot, J.} } @mastersthesis {xArtigot18, title = {Automatic fruit classification using deep learning}, year = {2018}, abstract = {

To achieve greater efficiency in harvesting tasks, their mechanization is unavoidable. Apart from the mechanical aspects, harvesting systems need software that can locate the fruit to be harvested.

The use of machine learning and deep learning techniques to build such software was studied in this thesis. The results showed that an accuracy similar to that of other studies is feasible with a limited number of training samples using deep learning techniques.

From this thesis we conclude that the mechanization of harvesting labour is possible, at least from the software point of view, while the crop estimation application may need some more work before becoming feasible.

}, author = {Joaquim Mart{\'\i}nez Artigot}, editor = {Josep Ramon Morros and Ver{\'o}nica Vilaplana} } @mastersthesis {xRoca, title = {Block-based Speech-to-Speech Translation}, year = {2018}, abstract = {

This bachelor{\textquoteright}s thesis explores different ways of building a block-based Speech Translation system with the aim of generating large amounts of parallel speech data. The first goal is to research and manage to run suitable tools to implement each of the three blocks that make up the Speech Translation system: Speech Recognition, Translation and Speech Synthesis. We experiment with some open-source toolkits and manage to train a speech recognition system and a neural machine translation system. Then, we test them to evaluate their performance. As an alternative, we use the cloud computing solutions provided by Google Cloud to implement the three sequential blocks and successfully build the overall system. Finally, we make a comparative study of an in-house software development versus a cloud computing implementation.

}, author = {Roca, Sandra}, editor = {Amanda Duarte and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xRosello18, title = {Brain lesion segmentation using Convolutional Neuronal Networks}, year = {2018}, abstract = {

Convolutional neural networks (CNN) are powerful tools for learning representations from images. They are being used in a large range of applications, being the state of the art in many computer vision tasks. In this work, we study the brain tumor segmentation problem using CNNs and the publicly available BraTS dataset. One of the key factors for this task is the training scheme used, since it should deal with memory constraints and should alleviate the high imbalance between healthy and lesion tissue in the brain.

Thus, the purpose of this project is to propose a comparison between several training schemes and to extensively analyze and evaluate them in terms of the Dice score. We evaluate dense training against patch sampling and, in particular, fixed-rule against adaptive sampling schemes. Furthermore, variants and modifications of the existing training schemes have been proposed in order to enhance their performance. Finally, several loss functions for each training scheme have been analyzed.
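
For reference, the Dice score rewards overlap: twice the intersection of prediction and ground truth divided by the sum of their sizes. A minimal NumPy version follows (an illustrative sketch, not the thesis code):

import numpy as np

def dice_score(pred, target, eps=1e-7):
    # pred, target: binary masks of the same shape.
    pred = pred.astype(bool)
    target = target.astype(bool)
    intersection = np.logical_and(pred, target).sum()
    return 2.0 * intersection / (pred.sum() + target.sum() + eps)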

}, author = {Clara Bon{\'\i}n Rosell{\'o}}, editor = {Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana} } @conference {cSancheza, title = {Brain MRI Super-Resolution using Generative Adversarial Networks}, booktitle = {International Conference on Medical Imaging with Deep Learning}, year = {2018}, month = {07/2018}, address = {Amsterdam, The Netherlands}, author = {Irina S{\'a}nchez and Ver{\'o}nica Vilaplana} } @inbook {bCasamitjana18, title = {Cascaded V-Net Using ROI Masks for Brain Tumor Segmentation}, booktitle = {Brainlesion: Glioma, Multiple Sclerosis, Stroke and Traumatic Brain Injuries. BrainLes 2017}, volume = { 10670}, number = {Lecture Notes in Computer Science}, year = {2018}, pages = {381-391}, publisher = {Springer}, organization = {Springer}, edition = {Crimi A., Bakas S., Kuijf H., Menze B., Reyes M. (eds)}, address = {Cham}, abstract = {

In this work we approach the brain tumor segmentation problem with a cascade of two CNNs inspired by the V-Net architecture, reformulating residual connections and making use of ROI masks to constrain the networks to train only on relevant voxels. This architecture allows dense training on problems with highly skewed class distributions, such as brain tumor segmentation, by focusing training only on the vicinity of the tumor area. We report results on the BraTS2017 Training and Validation sets.

}, isbn = {978-3-319-75238-9}, issn = {978-3-319-75237-2}, doi = {https://doi.org/10.1007/978-3-319-75238-9_33}, url = {https://link.springer.com/chapter/10.1007/978-3-319-75238-9_33}, author = {Adri{\`a} Casamitjana and Marcel Cat{\`a} and Irina S{\'a}nchez and Marc Combalia and Ver{\'o}nica Vilaplana} } @article {aPetrone18, title = {Characteristic Brain Volumetric Changes in the AD Preclinical Signature.}, journal = {Alzheimer{\textquoteright}s \& Dementia: The Journal of the Alzheimer{\textquoteright}s Association}, volume = {14}, year = {2018}, month = {07/2018}, pages = {P1235}, doi = {10.1016/j.jalz.2018.06.1737}, author = {P. Petrone and Adri{\`a} Casamitjana and C. Falcon and M. Artigues and G. Operto and S. Skouras and J.L. Molinuevo and Ver{\'o}nica Vilaplana and J.D. Gispert} } @conference {cPetrone, title = {Characteristic Brain Volumetric Changes in the AD Preclinical Signature}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2018}, month = {07/2018}, address = {Chicago, USA}, abstract = {

In the last decade, magnetic resonance imaging has unveiled specific AD alterations at different stages of the AD pathophysiologic continuum that make up what has been established as the AD signature. To what extent MRI can detect structural changes at the preclinical asymptomatic stage of AD - the preclinical AD signature - is still an open area of exploration. Our work supports the idea that there are brain volumetric changes specific to preclinical AD subjects and defines the preclinical AD signature based on longitudinal data. While some regions show a pattern of atrophy that overlaps with the AD signature, other specific regions exhibit changes that are unique to this early asymptomatic AD stage.

}, author = {P. Petrone and Adri{\`a} Casamitjana and C. Falcon and M. Artigues and G. Operto and S. Skouras and R. Cacciaglia and J.L. Molinuevo and Ver{\'o}nica Vilaplana and J.D. Gispert} } @mastersthesis {xPelegri18, title = {Clasificaci{\'o}n de im{\'a}genes histol{\'o}gicas mediante redes neuronales convolucionales}, year = {2018}, author = {Joel Bustos Pelegr{\'\i}}, editor = {Marc Combalia and Ver{\'o}nica Vilaplana} } @conference {cFojo, title = {Comparing Fixed and Adaptive Computation Time for Recurrent Neural Network}, booktitle = {International Conference on Learning Representations (ICLR)}, year = {2018}, month = {02/2018}, address = {Vancouver, Canada}, abstract = {

Deep networks commonly perform better than shallow ones, but allocating the proper amount of computation for each particular input sample remains an open problem. This issue is particularly challenging in sequential tasks, where the required complexity may vary for different tokens in the input sequence. Adaptive Computation Time (ACT) was proposed as a method for dynamically adapting the computation at each step for Recurrent Neural Networks (RNN). ACT introduces two main modifications to the regular RNN formulation: (1) more than one RNN step may be executed between when an input sample is fed to the layer and when this layer generates an output, and (2) this number of steps is dynamically predicted depending on the input token and the hidden state of the network. In our work, we aim at gaining intuition about the contribution of these two factors to the overall performance boost observed when augmenting RNNs with ACT. We design a new baseline, Repeat-RNN, which performs a constant number of RNN state updates larger than one before generating an output. Surprisingly, such uniform distribution of the computational resources matches the performance of ACT in the studied tasks. We hope that this finding motivates new research efforts towards designing RNN architectures that are able to dynamically allocate computational resources.
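
A rough PyTorch sketch of the Repeat-RNN baseline described above (assumed details, not the released implementation): every input token is fed through the recurrent cell a fixed number of times before an output is emitted.

import torch
import torch.nn as nn

class RepeatRNN(nn.Module):
    def __init__(self, input_size, hidden_size, repeats=2):
        super().__init__()
        self.cell = nn.GRUCell(input_size, hidden_size)
        self.repeats = repeats  # constant number of state updates per token

    def forward(self, x):  # x: (seq_len, batch, input_size)
        h = x.new_zeros(x.size(1), self.cell.hidden_size)
        outputs = []
        for x_t in x:
            for _ in range(self.repeats):
                h = self.cell(x_t, h)  # repeated update on the same token
            outputs.append(h)
        return torch.stack(outputs)  # (seq_len, batch, hidden_size)

Unlike ACT, the number of updates here is a fixed hyperparameter rather than a quantity predicted from the input token and the hidden state.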

}, author = {Fojo, Daniel and V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto} } @conference {cSuris, title = {Cross-modal Embeddings for Video and Audio Retrieval}, booktitle = {ECCV 2018 Women in Computer Vision Workshop}, year = {2018}, month = {09/2018}, publisher = {Springer}, organization = {Springer}, address = {Munich, Germany}, abstract = {

The increasing amount of online videos brings several opportunities for training self-supervised neural networks. The creation of large-scale video datasets such as YouTube-8M allows us to deal with this large amount of data in a manageable way. In this work, we find new ways of exploiting this dataset by taking advantage of the multi-modal information it provides. By means of a neural network, we are able to create links between audio and visual documents by projecting them into a common region of the feature space, obtaining joint audio-visual embeddings. These links are used to retrieve audio samples that fit well to a given silent video, and also to retrieve images that match a given query audio. The results in terms of Recall@K obtained over a subset of YouTube-8M videos show the potential of this unsupervised approach for cross-modal feature learning. We train embeddings for both scales and assess their quality in a retrieval problem, formulated as using the feature extracted from one modality to retrieve the most similar videos based on the features computed in the other modality.
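
A hedged sketch of the joint embedding idea, with assumed feature dimensionalities: each modality is projected into a shared L2-normalized space, and retrieval quality is measured with Recall@K over aligned pairs.

import torch
import torch.nn as nn
import torch.nn.functional as F

class JointEmbedding(nn.Module):
    def __init__(self, visual_dim=1024, audio_dim=128, embed_dim=256):
        super().__init__()
        self.visual_proj = nn.Linear(visual_dim, embed_dim)
        self.audio_proj = nn.Linear(audio_dim, embed_dim)

    def forward(self, v, a):
        # L2-normalize so the dot product equals cosine similarity.
        return (F.normalize(self.visual_proj(v), dim=-1),
                F.normalize(self.audio_proj(a), dim=-1))

def recall_at_k(queries, gallery, k=10):
    # Ground truth: query i matches gallery item i (aligned pairs).
    sims = queries @ gallery.t()
    topk = sims.topk(k, dim=1).indices
    hits = (topk == torch.arange(len(sims)).unsqueeze(1)).any(dim=1)
    return hits.float().mean().item()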

}, isbn = {978-3-030-11018-5}, doi = {10.1007/978-3-030-11018-5_62}, url = {https://doi.org/10.1007/978-3-030-11018-5_62}, author = {Sur{\'\i}s, D{\'\i}dac and Amanda Duarte and Amaia Salvador and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @unpublished {xRamon, title = {Deep Learning algorithms for 3D Reconstruction and Simulation of Aesthetic Procedures}, year = {2018}, month = {07/2018}, type = {Phd Thesis Proposal}, abstract = {

3D technology is key for a wide range of industries. Medicine, construction, cinema and many other disciplines can nowadays digitalize the world we perceive using 3D reconstruction algorithms, create new objects by means of 3D printers or analyze the world using 3D detection and segmentation techniques.

These tools are transforming how research and industry problems are addressed. Concretely, in the field of aesthetic surgery, fluent communication between doctors and patients is crucial in order to maximize the success of the operations and the satisfaction of the patients. A new trend in the industry is to incorporate 3D technology during the consultation, with the goal of improving that communication. By reconstructing the body of the patient and simulating aesthetic procedures on it, he or she can see what a concrete modification would look like when applied to his or her body. It is also beneficial for the physicians, since they can measure the effectiveness of the applied treatments and also convert more consultations into operations due to an increase in the patient{\textquoteright}s confidence.

This thesis proposal defines the research directions to follow during an industrial doctorate to be developed at Crisalix Labs, in collaboration with the Image Processing Group at the Universitat Polit{\`e}cnica de Catalunya. Industrial doctorates are supported by the Government of Catalonia to promote the transfer of knowledge from universities to local industry as an element of innovation and technical excellence.

}, author = {Ramon, Eduard}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cGomez, title = {Demonstration of an Open Source Framework for Qualitative Evaluation of CBIR Systems}, booktitle = {ACM Multimedia}, year = {2018}, month = {10/2018}, publisher = {ACM}, organization = {ACM}, address = {Seoul, South Korea}, abstract = {

Evaluating image retrieval systems in a quantitative way, for example by computing measures like mean average precision, allows for objective comparisons with a ground-truth. However, in cases where ground-truth is not available, the only alternative is to collect feedback from a user. Thus, qualitative assessments become important to better understand how the system works. Visualizing the results could be, in some scenarios, the only way to evaluate the results obtained and also the only opportunity to identify that a system is failing. This necessitates developing a User Interface (UI) for a Content Based Image Retrieval (CBIR) system that allows visualization of results and improvement via capturing user relevance feedback. A well-designed UI facilitates understanding of the performance of the system, both in cases where it works well and, perhaps more importantly, those which highlight the need for improvement. Our open-source system implements three components to help researchers quickly develop these capabilities for their retrieval engine. We present: a web-based user interface to visualize retrieval results and collect user annotations; a server that simplifies connection with any underlying CBIR system; and a server that manages the search engine data.

}, doi = {10.1145/3240508.3241395}, url = {https://dl.acm.org/citation.cfm?id=3241395}, author = {Gomez, Paula and Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @unpublished {xMoreno18, title = {English to ASL Translator for Speech2Signs}, year = {2018}, month = {01/2018}, type = {Internal report}, abstract = {

This paper illustrates the work around English - American Sign Language (ASL) data generation for the speech2signs system, which is devoted to the generation of a sign language interpreter. The current work will be, first, an approximation to the speech2signs system and, second, a video-to-video corpus generator for an end-to-end approximation of speech2signs. In order to generate the desired corpus data, the Google Transformer (a Neural Machine Translation system based completely on attention) will be trained to translate from English to ASL. The dataset used to train the Transformer is ASLG-PC12.

}, author = {Moreno, Daniel and Costa-juss{\`a}, Marta R. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {x, title = {Exploring Automatic Speech Recognition with TensorFlow}, year = {2018}, abstract = {

Advisors: Marta R. Costa-juss{\`a} (TALP, UPC) and Xavier Giro-i-Nieto (GPI, UPC)

Grade: A (9.8/10)

Speech recognition is the task aiming to identify words in spoken language and convert them into text. This bachelor{\textquoteright}s thesis focuses on using deep learning techniques to build an end-to-end Speech Recognition system. As a preliminary step, we overview the most relevant methods carried out over the last several years. Then, we study one of the latest proposals for this end-to-end approach, which uses a sequence-to-sequence model with attention-based mechanisms. Next, we successfully reproduce the model and test it on the TIMIT database. We analyze the similarities and differences between the current implementation and the original theoretical work. Finally, we experiment with different parameters (e.g., number of layer units, learning rates and batch sizes) and reduce the Phoneme Error Rate by almost 12\% relative.

}, author = {Escur, Janna}, editor = {Costa-juss{\`a}, Marta R. and Xavier Gir{\'o}-i-Nieto} } @conference {cGene-Mola18, title = {Fruit Detection Using Mobile Terrestrial Laser Scanning}, booktitle = {AgEng 2018,}, year = {2018}, month = {07/2018}, address = {Wageningen (Netherlands)}, abstract = {

The development of reliable fruit detection and localization systems is essential for future sustainable agronomic management of high-value crops. To date, most proposed systems for fruit detection and characterization are based on RGB cameras and are thus affected by intrinsic constraints, such as variable lighting conditions and camera calibration. This work presents a new technique that uses a mobile terrestrial laser scanner to detect and localize fruits regardless of the prevailing lighting conditions and without the need for a previous calibration. An experimental test focused on two Fuji apple trees (containing 139 and 145 apples each) was carried out. A 3D point cloud of this scene was generated using a Velodyne VLP-16 LiDAR sensor synchronized with a RTK-GNSS receiver. A reflectivity analysis of tree elements was performed, obtaining mean reflectivity values of 28.9\%, 29.1\%, and 44.3\% for leaves, trunks, and fruits, respectively. These results suggest that the reflectivity parameter can be useful for localizing fruits in the tree. From this knowledge, a three-step fruit detection algorithm has been developed: 1) reflectivity thresholding to remove most of the leaves and trunks from the original point cloud; 2) statistical outlier removal to reduce noise; 3) connected components clustering using a density-based algorithm. By applying this algorithm to our dataset, a localization success of 85\%, a detachment success of 78.8\%, and a false detection rate of 15.2\% were obtained. These detection rates are similar to those obtained by current RGB-based systems, but with the additional advantage of providing direct 3D fruit location information (global coordinates) which is not affected by sunlight variations. It can be concluded that LiDAR technology and, particularly, its reflectivity information might have potential use in fruit detection. Future work should include the application of this fruit detection technique to a wider range of crop types.
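
The three-step algorithm lends itself to a compact sketch; the version below assumes an (N, 3) point array with per-point reflectivity in percent, and uses illustrative thresholds and clustering parameters rather than the values tuned in the paper.

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

def detect_fruits(points, refl, refl_thresh=40.0, k=8, std_ratio=1.0,
                  eps=0.04, min_samples=10):
    # 1) Reflectivity thresholding: keep points likely to be fruit.
    pts = points[refl > refl_thresh]
    # 2) Statistical outlier removal: drop points whose mean distance
    #    to their k nearest neighbours is abnormally large.
    dists, _ = NearestNeighbors(n_neighbors=k + 1).fit(pts).kneighbors(pts)
    mean_d = dists[:, 1:].mean(axis=1)  # skip the zero self-distance
    pts = pts[mean_d < mean_d.mean() + std_ratio * mean_d.std()]
    # 3) Density-based clustering: each cluster is a fruit candidate.
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(pts)
    return [pts[labels == c] for c in set(labels) if c != -1]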

}, author = {Gen{\'e}-Mola, Jordi and Gregorio, Eduard and Guevara, Javier and Auat, Fernando and Escol{\`a}, Alexandre and Morros, J.R. and Rosell-Polo, Joan R.} } @mastersthesis {xRos18, title = {Generative Adversarial Networks for Anomaly Detection in Images}, year = {2018}, abstract = {

Anomaly detection is used to identify abnormal observations that don{\textquoteright}t follow a normal pattern. In this work, we use the power of Generative Adversarial Networks in sampling from image distributions to perform anomaly detection with images and to identify local anomalous segments within these images. We also explore a potential application of this method to support the pathological analysis of biological tissues.

}, author = {Guillem Batiste}, editor = {Ver{\'o}nica Vilaplana} } @conference {cSanchez-Escobedo, title = {HybridNet for Depth Estimation and Semantic Segmentation}, booktitle = {ICASSP 2018}, year = {2018}, month = {04/2018}, publisher = {IEEE}, organization = {IEEE}, address = {Calgary, Alberta, Canada}, abstract = {

Semantic segmentation and depth estimation are two important tasks in the area of image processing. Traditionally, these two tasks have been addressed in an independent manner. However, for applications where geometric and semantic information is required, such as robotics or autonomous navigation, depth estimation or semantic segmentation alone are not sufficient. In this paper, depth estimation and semantic segmentation are addressed together from a single input image through a hybrid convolutional network. Different from state-of-the-art methods, where features are extracted by a sole feature extraction network for both tasks, the proposed HybridNet improves feature extraction by separating the features relevant to one task from those relevant to both. Experimental results demonstrate that HybridNet is comparable with state-of-the-art methods, as well as with the single-task methods it is based on.
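
A minimal sketch of the shared-plus-task-specific idea, with assumed toy layers rather than the published HybridNet architecture: a single encoder produces features consumed by both a depth head and a segmentation head.

import torch
import torch.nn as nn

class HybridSketch(nn.Module):
    def __init__(self, num_classes=19):
        super().__init__()
        self.shared = nn.Sequential(  # features relevant to both tasks
            nn.Conv2d(3, 64, 3, padding=1), nn.ReLU(),
            nn.Conv2d(64, 64, 3, padding=1), nn.ReLU())
        self.depth_head = nn.Conv2d(64, 1, 1)          # depth regression
        self.seg_head = nn.Conv2d(64, num_classes, 1)  # class logits

    def forward(self, image):
        f = self.shared(image)
        return self.depth_head(f), self.seg_head(f)

Training such a network would combine a regression loss on the depth output with a cross-entropy loss on the segmentation logits.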

}, doi = {10.1109/ICASSP.2018.8462433}, url = {https://ieeexplore.ieee.org/abstract/document/8462433}, author = {D. Sanchez-Escobedo and X. Lin and Casas, J. and M. Pard{\`a}s} } @conference {cBakas19, title = {Identifying the best machine learning algorithms for brain tumor segmentation, progression assessment, and overall survival prediction in the BRATS challenge}, booktitle = {MICCAI - Multimodal Brain Tumor Segmentation Challenge}, year = {2018}, month = {12/2018}, abstract = {

Gliomas are the most common primary brain malignancies, with different degrees of aggressiveness, variable prognosis and various heterogeneous histologic sub-regions, i.e., peritumoral edematous/invaded tissue, necrotic core, active and non-enhancing core. This intrinsic heterogeneity is also portrayed in their radio-phenotype, as their sub-regions are depicted by varying intensity profiles disseminated across multi-parametric magnetic resonance imaging (mpMRI) scans, reflecting varying biological properties. Their heterogeneous shape, extent, and location are some of the factors that make these tumors difficult to resect, and in some cases inoperable. The amount of resected tumor is a factor also considered in longitudinal scans, when evaluating the apparent tumor for potential diagnosis of progression. Furthermore, there is mounting evidence that accurate segmentation of the various tumor sub-regions can offer the basis for quantitative image analysis towards prediction of patient overall survival. This study assesses the state-of-the-art machine learning (ML) methods used for brain tumor image analysis in mpMRI scans, during the last seven instances of the International Brain Tumor Segmentation (BraTS) challenge, i.e., 2012-2018. Specifically, we focus on i) evaluating segmentations of the various glioma sub-regions in pre-operative mpMRI scans, ii) assessing potential tumor progression by virtue of longitudinal growth of tumor sub-regions, beyond use of the RECIST/RANO criteria, and iii) predicting the overall survival from pre-operative mpMRI scans of patients that underwent gross total resection. Finally, we investigate the challenge of identifying the best ML algorithms for each of these tasks, considering that apart from being diverse on each instance of the challenge, the multi-institutional mpMRI BraTS dataset has also been a continuously evolving/growing dataset.

}, url = {https://arxiv.org/pdf/1811.02629.pdf}, author = {Spyridon Bakas and Mauricio Reyes and Andras Jakab and Stefan Bauer and Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and et. Al.} } @mastersthesis {xColl-Pol, title = {The Importance of Time in Visual Attention Models}, year = {2018}, abstract = {

Predicting visual attention is a very active field in the computer vision community. Visual attention is a mechanism of the visual system that can select relevant areas within a scene. Models for saliency prediction are intended to automatically predict which regions are likely to be attended by a human observer. Traditionally, ground truth saliency maps are built using only the spatial position of the fixation points, these fixation points being the locations where an observer fixates the gaze when viewing a scene. In this work we explore encoding the temporal information as well, and assess it in the application of predicting saliency maps with deep neural networks. It has been observed that the later fixations in a scanpath are usually selected randomly during visualization, especially in those images with few regions of interest. Therefore, computer vision models have difficulties learning to predict them. In this work, we explore a temporal weighting over the saliency maps to better cope with this random behaviour. The newly proposed saliency representation assigns different weights depending on the position in the sequence of gaze fixations, giving more importance to early timesteps than later ones. We used these maps to train MLNet, a state-of-the-art model for predicting saliency maps. MLNet predictions were evaluated and compared to the results obtained when the model was trained using traditional saliency maps. Finally, we show how the temporally weighted saliency maps brought some improvement when used to weight the visual features in an image retrieval task.
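
A small sketch of the temporally weighted ground truth described above, assuming an exponential decay over fixation order (the exact weighting scheme in the thesis may differ):

import numpy as np
from scipy.ndimage import gaussian_filter

def temporally_weighted_saliency(fixations, height, width,
                                 decay=0.8, sigma=19.0):
    # fixations: ordered list of (row, col) gaze points for one image.
    sal = np.zeros((height, width), dtype=np.float32)
    for t, (r, c) in enumerate(fixations):
        sal[r, c] += decay ** t  # earlier fixations receive more weight
    sal = gaussian_filter(sal, sigma)  # spread points into a smooth map
    return sal / sal.max() if sal.max() > 0 else sal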

}, author = {Coll-Pol, Marta}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @conference {cCamposa, title = {Importance Weighted Evolution Strategies}, booktitle = {NeurIPS 2018 Deep Reinforcement Learning Workshop }, year = {2018}, month = {11/2018}, address = {Montreal, Quebec}, abstract = {

Evolution Strategies (ES) emerged as a scalable alternative to popular Reinforcement Learning (RL) techniques, providing an almost perfect speedup when distributed across hundreds of CPU cores thanks to a reduced communication overhead. Despite providing large improvements in wall-clock time, ES is data inefficient when compared to competing RL methods. One of the main causes of such inefficiency is the collection of large batches of experience, which are discarded after each policy update. In this work, we study how to perform more than one update per batch of experience by means of Importance Sampling while preserving the scalability of the original method. The proposed method, Importance Weighted Evolution Strategies (IW-ES), shows promising results and is a first step towards designing efficient ES algorithms.
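
A NumPy sketch of the mechanism under assumed hyperparameters: after a standard ES update, the same batch of perturbations is reused for extra updates by importance-weighting each sample with the density ratio between the new and the old search distributions (a sketch of the idea, not the exact algorithm in the paper).

import numpy as np

def iwes(theta, returns_fn, pop=64, sigma=0.1, lr=0.02, extra_updates=1):
    eps = np.random.randn(pop, theta.size)  # Gaussian perturbations
    cand = theta + sigma * eps              # sampled candidate parameters
    rewards = np.array([returns_fn(c) for c in cand])
    adv = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
    theta_old = theta.copy()
    theta = theta + lr * (adv[:, None] * eps).mean(axis=0) / sigma
    for _ in range(extra_updates):  # reuse the batch via importance sampling
        # w_i proportional to N(cand_i; theta, sigma) / N(cand_i; theta_old, sigma)
        logw = (((cand - theta_old) ** 2).sum(axis=1)
                - ((cand - theta) ** 2).sum(axis=1)) / (2 * sigma ** 2)
        w = np.exp(logw - logw.max())
        w /= w.sum()  # self-normalized importance weights
        grad = (w[:, None] * adv[:, None] * (cand - theta)).sum(axis=0)
        theta = theta + lr * grad / sigma ** 2
    return theta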

}, author = {V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @mastersthesis {xAlsina, title = {An interactive Lifelog Search Engine for LSC2018}, year = {2018}, abstract = {

This thesis consists of developing an interactive lifelog search engine for the LSC 2018 search challenge at ACM ICMR 2018. This search engine is created in order to browse images from a given lifelog dataset and display them along with some written information related to them, plus four other images providing context about the searched one.

First of all, the work introduces the relevance of this project, presenting the reader with the main social problems addressed and the aim of our project in dealing with them. It then moves on to the scope of the project, introducing the main objectives that were set. The work also reviews the current state of similar prototypes that already exist, so the reader can see the differences that our project presents.

Once the project approach is laid out, the thesis walks through the methodology and creation process, going deep into the main aspects and explaining every choice and decision, while also noting the limits of the current prototype.

Additionally, the project concludes with a results section where the system is tested by six users, who are asked to find three specific images using the search engine. This test is divided into two parts: first, a qualitative part where the users are asked to try the system and fill out a survey on how comfortable it is for them; and a second, more quantitative part, where they rate the speed of our system.

Finally, the project goes through the present and future ethics of lifelogging in general and closes with a final conclusion on further investigation and future improvements.

}, author = {Alsina, Adri{\`a}}, editor = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @conference {cAlsina, title = {An Interactive Lifelog Search Engine for LSC2018}, booktitle = {Lifelog Search Challenge workshop at ICMR2018}, year = {2018}, month = {06/2018}, publisher = {ACM}, organization = {ACM}, address = {Yokohama, Japan}, abstract = {

In this work, we describe an interactive lifelog search engine developed for the LSC 2018 search challenge at ACM ICMR 2018. The paper introduces the four-step process required to support lifelog search engines and describes the source data for the search engine as well as the approach to ranking chosen for the iterative search engine. Finally the interface used is introduced before we highlight the limits of the current prototype and suggest opportunities for future work.

}, doi = {10.1145/3210539.3210546}, url = {https://dl.acm.org/citation.cfm?id=3210546}, author = {Alsina, Adri{\`a} and Xavier Gir{\'o}-i-Nieto and Gurrin, Cathal} } @article {aDimiccoli18, title = {Introduction to the special issue: Egocentric Vision and Lifelogging}, journal = {Journal of Visual Communication and Image Representation}, year = {2018}, month = {06/2018}, abstract = {

}, doi = {10.1016/j.jvcir.2018.06.010}, url = {https://www.sciencedirect.com/science/article/pii/S1047320318301354}, author = {Dimiccoli, M. and Gurrin, Cathal and Crandall, David and Xavier Gir{\'o}-i-Nieto and Radeva, Petia} } @article {xGiro-i-Nieto18, title = {Learning Where and When to Look}, year = {2018}, abstract = {

Deep learning models not only achieve superior performance in image recognition tasks, but also in predicting where and when users focus their attention. This talk will provide an overview of how convolutional neural networks have been trained to predict saliency maps that describe the probability of fixating the gaze on each image location. Different solutions have been proposed for this task, and our recent work has added a temporal dimension by predicting the gaze scanpath over 360-degree images for VR/AR. These techniques allow simulating eye-tracker data with no need for user data collection.

}, url = {https://www.re-work.co/events/deep-learning-in-retail-summit-london-2018}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cGorrizb, title = {Leishmaniasis Parasite Segmentation and Classification Using Deep Learning}, booktitle = {International Conference on Articulated Motion and Deformable Objects}, year = {2018}, address = {Palma, Spain}, abstract = {

Leishmaniasis is considered a neglected disease that causes thousands of deaths annually in some tropical and subtropical countries. There are various techniques to diagnose leishmaniasis, of which manual microscopy is considered to be the gold standard. There is a need for the development of automatic techniques that are able to detect parasites in a robust and unsupervised manner. In this paper we present a procedure for automating the detection process based on a deep learning approach. We train a U-net model that successfully segments Leishmania parasites and classifies them into promastigotes, amastigotes and adhered parasites.

}, author = {G{\'o}rriz, Marc and Albert Aparicio and Berta Ravent{\'o}s and Daniel L{\'o}pez-Codina and Ver{\'o}nica Vilaplana and Elisa Sayrol} } @inbook {bGorriz18, title = {Leishmaniasis Parasite Segmentation and Classification Using Deep Learning}, booktitle = { Articulated Motion and Deformable Objects}, volume = {10945}, number = {Lecture Notes in Computer Science}, year = {2018}, pages = {53-62}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {

Leishmaniasis is considered a neglected disease that causes thousands of deaths annually in some tropical and subtropical countries. There are various techniques to diagnose leishmaniasis, of which manual microscopy is considered to be the gold standard. There is a need for the development of automatic techniques that are able to detect parasites in a robust and unsupervised manner. In this paper we present a procedure for automating the detection process based on a deep learning approach. We train a U-net model that successfully segments Leishmania parasites and classifies them into promastigotes, amastigotes and adhered parasites.

}, issn = {978-3-319-94544-6}, doi = {10.1007/978-3-319-94544-6}, author = {G{\'o}rriz, Marc and Albert Aparicio and Berta Ravent{\'o}s and Ver{\'o}nica Vilaplana and Elisa Sayrol and Daniel L{\'o}pez-Codina} } @conference {cFernandezc, title = {Linking Media: adopting Semantic Technologies for multimodal media connection}, booktitle = {International Semantic Web Conference - ISWC (Industry Track)}, year = {2018}, month = {08/2018}, address = {Monterey, CA, USA}, abstract = {

Today{\textquoteright}s media and news organizations are constantly generating large amounts of multimedia content, mostly delivered online. As the online media market grows, the management and delivery of contents is becoming a challenge. Computational approaches can help to overcome this challenge by governing different applications such as content creation, production, search, and its promotion and distribution to different audiences. In this abstract we present a success story of the adoption of semantic technologies for the aforementioned applications, which are built on top of a semantic tagging framework based on a Knowledge Graph (KG). The presented pipeline combines multimodal inputs into a contextual entity linking module, which indexes documents and links them to trends and stories developing in the news. We will describe how documents are linked and provided to media producers through Vilynx{\textquoteright}s platform, which is currently indexing over 20k media documents a day.

}, keywords = {Knowledge graph; Linked data; Multimedia; Semantic web Linked data; Computational approach; Content creation; Knowledge graphs; Multimedia; Multimedia contents; Multimodal inputs; Semantic tagging; Semantic technologies; Semantic Web}, url = {http://ceur-ws.org/Vol-2180/}, author = {Fern{\`a}ndez, D{\`e}lia and Bou-Balust, Elisenda and Xavier Gir{\'o}-i-Nieto} } @article {pPerez-Pellitero17, title = {Method for upscaling an image and apparatus for upscaling an image}, number = {US 20170132759 A1}, year = {2018}, month = {05/2017}, address = {US 20170132759 A1}, abstract = {

Image super-resolution (SR) generally enhances the resolution of images. One of SR{\textquoteright}s main challenges is discovering mappings between low-resolution (LR) and high-resolution (HR) image patches. The invention learns patch upscaling projection matrices from a training set of images. Input images are divided into overlapping patches, which are normalized and transformed to a defined orientation. Different transformations can be recognized and dealt with by using a simple 2D projection. The transformed patches are clustered, and cluster-specific upscaling projection matrices and corresponding cluster centroids determined during training are applied to obtain upscaled patches. The upscaled patches are assembled into an upscaled image.
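
As described in the abstract, the upscaling step reduces to a nearest-centroid lookup followed by a linear projection; the NumPy sketch below is illustrative (assumed normalization scheme, not the claimed implementation).

import numpy as np

def upscale_patch(lr_patch, centroids, projections):
    # lr_patch: flattened LR patch of length d; centroids: (K, d);
    # projections: (K, D, d) learned per-cluster matrices with D > d.
    mean = lr_patch.mean()
    v = lr_patch - mean                 # remove the DC component
    norm = np.linalg.norm(v) + 1e-8
    v = v / norm                        # normalize the patch
    k = np.argmin(((centroids - v) ** 2).sum(axis=1))  # nearest cluster
    hr = projections[k] @ v             # cluster-specific upscaling
    return hr * norm + mean             # undo the normalization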

https://worldwide.espacenet.com/publicationDetails/biblio?II=2\&ND=3\&adjacent=true\&locale=en_EP\&FT=D\&date=20170511\&CC=US\&NR=2017132759A1\&KC=A1

}, issn = {US 20170132759 A1}, url = {https://register.epo.org/ipfwretrieve?apn=US.201615341080.A\&lng=en}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @conference {cCombaliaa, title = {Monte-Carlo Sampling applied to Multiple Instance Learning for Histological Image Classification}, booktitle = {Workshop on Deep Learning in Medical Image Analysis, MICCAI}, year = {2018}, month = {2018}, address = {Granada, Spain}, abstract = {

We propose a patch sampling strategy based on a sequential Monte-Carlo method for high resolution image classification in the context of Multiple Instance Learning. When compared with grid sampling and uniform sampling techniques, it achieves higher generalization performance. We validate the strategy on two artificial datasets and two histological datasets for breast cancer and sun exposure classification.
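
A hedged sketch of a sequential Monte-Carlo patch sampler of this kind, where score_fn is a hypothetical callable returning the relevance assigned by the current model to a patch centred at a location (the resampling details in the paper may differ):

import numpy as np

def mc_sample_patches(image, score_fn, n_patches=32, iters=3, jitter=32):
    h, w = image.shape[:2]
    locs = np.column_stack([np.random.randint(0, h, n_patches),
                            np.random.randint(0, w, n_patches)])
    for _ in range(iters):
        scores = np.array([score_fn(image, r, c) for r, c in locs])
        p = np.exp(scores - scores.max())
        p /= p.sum()
        # Resample locations in proportion to their instance scores,
        # then perturb them so sampling concentrates on informative areas.
        idx = np.random.choice(len(locs), size=n_patches, p=p)
        locs = locs[idx] + np.random.randint(-jitter, jitter + 1,
                                             size=(n_patches, 2))
        locs[:, 0] = locs[:, 0].clip(0, h - 1)
        locs[:, 1] = locs[:, 1].clip(0, w - 1)
    return locs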

}, author = {Marc Combalia and Ver{\'o}nica Vilaplana} } @inbook {bCombalia18, title = {Monte-Carlo Sampling Applied to Multiple Instance Learning for Histological Image Classification}, booktitle = {Deep Learning in Medical Image Analysis and Multimodal Learning for Clinical Decision Support}, number = {11045}, year = {2018}, pages = {274-281}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {

We propose a patch sampling strategy based on a sequential Monte-Carlo method for high resolution image classification in the context of Multiple Instance Learning. When compared with grid sampling and uniform sampling techniques, it achieves higher generalization performance. We validate the strategy on two artificial datasets and two histological datasets for breast cancer and sun exposure classification.

}, isbn = {978-3-030-00889-5}, issn = {10.1007/978-3-030-00889-5}, doi = {10.1007/978-3-030-00889-5}, author = {Marc Combalia and Ver{\'o}nica Vilaplana} } @conference {cCombalia, title = {Monte-Carlo Sampling applied to Multiple Instance Learning for Whole Slide Image Classification}, booktitle = {International Conference on Medical Imaging with Deep Learning}, year = {2018}, month = {07/2018}, address = {Amsterdam, The Netherlands}, author = {Marc Combalia and Ver{\'o}nica Vilaplana} } @article {aCasamitjana, title = {MRI-Based Screening of Preclinical Alzheimer{\textquoteright}s Disease for Prevention Clinical Trials}, journal = {Journal of Alzheimer{\textquoteright}s Disease}, volume = {64}, year = {2018}, month = {07/2018}, chapter = {1099}, abstract = {

The identification of healthy individuals harboring amyloid pathology constitutes one important challenge for secondary prevention clinical trials in Alzheimer{\textquoteright}s disease (AD). Consequently, noninvasive and cost-efficient techniques to detect preclinical AD constitute an unmet need of critical importance. In this manuscript, we apply machine learning to structural MRI (T1 and DTI) of 96 cognitively normal subjects to identify amyloid-positive ones. Models were trained on public ADNI data and validated on an independent local cohort. Used for subject classification in a simulated clinical trial setting, the proposed method is able to save 60\% of unnecessary CSF/PET tests and to reduce the recruitment cost by 47\%. This recruitment strategy capitalizes on already acquired MRIs to reduce the overall amount of invasive PET/CSF tests in prevention trials, demonstrating potential value as a tool for AD screening. This protocol could foster the development of secondary prevention strategies for AD.

}, author = {Adri{\`a} Casamitjana and Paula Petrone and Alan Tucholka and Carles Falcon and Stavros Skouras and Jose Luis Molinuevo and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @unpublished {xFernandezb, title = {Multimodal Knowledge Base Population from News Streams for Media Applications}, year = {2018}, month = {07/2018}, type = {Phd thesis proposal}, abstract = {

Media producers publish large amounts of multimedia content online - text, audio and video. As the online media market grows, the management and delivery of contents is becoming a challenge. Semantic and linking technologies can be used to organize and exploit these contents. This dissertation addresses the problem of integrating Semantic Web technologies and linked data technologies into Vilynx{\textquoteright}s platform, a system used by media producers to manage and exploit their contents. For that purpose, Knowledge Graphs (KG) and their maintenance through multimodal Knowledge Base Population (KBP) from online data extracted from the Web are studied. The Web is a very large unstructured data source with millions of texts, images, videos and audios. This thesis aims to generate solutions that facilitate automatic learning from these multimodal data and their use in real product applications for media.

This thesis is going to be structured in three parts. The first part will cover the construction of a multimodal KG, which will be the core of the system for knowledge extraction, standardization and contextualization.

The second part will consist of the construction of the tools that will be used for KBP. For that, we will construct a multimodal semantic tagging framework based on the previously mentioned KG. This block addresses some typical challenges of KBP and data mining, such as named entity recognition (NER), entity linking (EL), context set construction (CSC), structured data creation, standardization, entity matching and data fusion.

The third part will focus on the extraction of knowledge from the Web to populate the knowledge base. As the KG domain is media, we will populate the KG using events detected from news streams from a multimodal perspective. To detect events we will construct a news aggregator system. This part will deal with the problems of Topic Detection and Tracking (TDT), Topic Modeling (TM) and multi-document summarization. From these data we will learn relations between world entities that will populate our KG, dealing with the automatic detection and updating of concepts and relations. Social media information will also be analyzed to understand trendiness and world interests.

}, keywords = {Entity Detection, Entity Linking, Knowledge Base Population, Knowledge Graph, Linked Technologies, Multi-document Summarization, multimedia, Multimodal Systems, Natural Language Processing, Semantic Web, Topic Detection and Tracking, Topic Modeling}, author = {Fern{\`a}ndez, D{\`e}lia and Bou-Balust, Elisenda and Xavier Gir{\'o}-i-Nieto} } @article {xGiro-i-Nieto18a, title = {One Perceptron to Rule them All}, year = {2018}, abstract = {
}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cShou, title = {Online Action Detection in Untrimmed, Streaming Videos}, booktitle = {European Conference on Computer Vision (ECCV)}, year = {2018}, month = {02/2018}, address = {Munich, German}, abstract = {

}, url = {https://arxiv.org/abs/1802.06822}, author = {Shou, Zheng and Pan, Junting and Chan, Johnatan and Miyazawa, Kazuyuki and Mansour, Hassan and Vetro, Anthony and Xavier Gir{\'o}-i-Nieto and Chang, Shih-Fu} } @conference {cLopez-Palma, title = {Oriented trajectories as a method for audience measurement}, booktitle = {27th International Symposium on Industrial Electronics (ISIE)}, year = {2018}, month = {06/2018}, publisher = {IEEE}, organization = {IEEE}, address = {Cairns, Australia}, abstract = {

The quantification of the attention received by advertisements is of paramount importance to determine their effectiveness. In this work, a simple and effective objective method for assessing the attention given to advertisements is provided. The proposed method is based on computing the oriented trajectories of the different test persons along with their head pose. This way, it is possible to determine whether a given person is looking towards the advertisement. While other similar methods use more complex setups, requiring a camera at each advertisement location, our method needs only a single ceiling camera (or a few). Despite its apparent simplicity, the method can compute attention measures at each point of the scene.

}, author = {Manuel L{\'o}pez-Palma and Morros, J.R. and Javier Gago and Montserrat Corbal{\'a}n} } @conference {cAssensa, title = {PathGAN: Visual Scanpath Prediction with Generative Adversarial Networks}, booktitle = {ECCV 2018 Workshop on Egocentric Perception, Interaction and Compution (EPIC)}, year = {2018}, month = {07/2018}, publisher = {Springer}, organization = {Springer}, address = {Munich, Germany}, abstract = {

We introduce PathGAN, a deep neural network for visual scanpath prediction trained on adversarial examples. A visual scanpath is defined as the sequence of fixation points over an image defined by a human observer with their gaze. PathGAN is composed of two parts, the generator and the discriminator. Both parts extract features from images using off-the-shelf networks, and train recurrent layers to generate or discriminate scanpaths accordingly. In scanpath prediction, the stochastic nature of the data makes it very difficult to generate realistic predictions using supervised learning strategies, so we adopt adversarial training as a suitable alternative. Our experiments show how PathGAN improves the state of the art in visual scanpath prediction on the Salient360! dataset.

This work obtained the 2nd award in Prediction of Head-gaze Scan-paths for Images and the 2nd award in Prediction of Eye-gaze Scan-paths for Images at the IEEE ICME 2018 Salient360! Challenge.

}, doi = {10.1007/978-3-030-11021-5_25}, url = {https://doi.org/10.1007/978-3-030-11021-5_25}, author = {Assens, Marc and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @mastersthesis {xCanaves18, title = {Prevention of Alzheimer{\textquoteright}s Disease: a contribution from MRI and machine learning}, year = {2018}, abstract = {

Alzheimer{\textquoteright}s disease (AD) is a neurodegenerative disease and the leading cause of dementia (50-70\% of cases). Despite worldwide efforts, there is no progress in developing a cure for AD and dementia. Machine learning, hand in hand with magnetic resonance imaging (MRI), comes to the aid of disease diagnostics. In the scope of AD, many efforts have been dedicated to the automated detection of mild cognitive impairment and dementia. In our research, we instead focus on the prediction of AD in its preclinical stage using machine learning classification. Another key innovation is that we work with a longitudinal pipeline. In addition to classification, the project focuses on detecting the most relevant imaging voxels for classification, that is, helping us locate where AD-specific structural brain changes occur. We have improved classification performance in comparison with results obtained with cross-sectional datasets in previous studies, and we have identified possible regions of interest based on feature scores obtained from feature selection.

}, author = {Miguel Artigues C{\`a}naves}, editor = {Paula Petrone and Ver{\'o}nica Vilaplana} } @conference {cCasamitjanab, title = {Projection to Latent Spaces Disentangles Specific Cerebral Morphometric Patterns Associated to Aging and Preclinical AD}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2018}, month = {07/2018}, address = {Chicago, USA}, abstract = {

Partial Least Squares (PLS) is a mathematical technique that relates two sets of observable variables by means of a few latent explanatory factors. The aim of this study is to use PLS to discover the associations between CSF biomarkers and structural brain imaging in preclinical AD, and to disentangle their specific contribution from confounding demographic factors. PLS is able to disentangle the cerebral morphometric patterns associated with preclinical AD stages from other demographic factors. Results with both cortical thickness and volumetric data present significant overlap, thus showing the robustness of this approach. Interestingly, volumetric data showed more significant correlations with CSF Abeta than cortical thickness.

}, author = {Adri{\`a} Casamitjana and P. Petrone and M. Artigues and J.L. Molinuevo and J.D. Gispert and Ver{\'o}nica Vilaplana} } @article {aCasamitjana18, title = { Projection to Latent Spaces Disentangles Specific Cerebral Morphometric Patterns Associated to Aging and Preclinical AD}, journal = {Alzheimer{\textquoteright}s \& Dementia: The Journal of the Alzheimer{\textquoteright}s Association }, volume = {14}, year = {2018}, month = {07/2018}, pages = {P869-P870}, doi = {10.1016/j.jalz.2018.06.1111}, author = {Adri{\`a} Casamitjana and P. Petrone and M. Artigues and J.L. Molinuevo and J.D. Gispert and Ver{\'o}nica Vilaplana} } @conference {cSalvadord, title = {Recurrent Neural Networks for Semantic Instance Segmentation}, booktitle = {ECCV 2018 Women in Computer Vision (WiCV) Workshop}, year = {2018}, month = {12/2017}, abstract = {

We present a recurrent model for semantic instance segmentation that sequentially generates pairs of masks and their associated class probabilities for every object in an image. Our proposed system is trainable end-to-end, does not require post-processing steps on its output and is conceptually simpler than current methods relying on object proposals. We observe that our model learns to follow a consistent pattern to generate object sequences, which correlates with the activations learned in the encoder part of our network. We achieve competitive results on three different instance segmentation benchmarks (Pascal VOC 2012, Cityscapes and CVPPP Plant Leaf Segmentation).

}, url = {https://imatge-upc.github.io/rsis/}, author = {Amaia Salvador and M{\'\i}riam Bellver and Baradad, Manel and V{\'\i}ctor Campos and Marqu{\'e}s, F. and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cSalvadore, title = {Recurrent Neural Networks for Semantic Instance Segmentation}, booktitle = {CVPR 2018 DeepVision Workshop}, year = {2018}, month = {06/2018}, abstract = {

We present a recurrent model for semantic instance segmentation that sequentially generates binary masks and their associated class probabilities for every object in an image. Our proposed system is trainable end-to-end from an input image to a sequence of labeled masks and, compared to methods relying on object proposals, does not require post-processing steps on its output. We study the suitability of our recurrent model on three different instance segmentation benchmarks, namely Pascal VOC 2012, CVPPP Plant Leaf Segmentation and Cityscapes. Further, we analyze the object sorting patterns generated by our model and observe that it learns to follow a consistent pattern, which correlates with the activations learned in the encoder part of our network.

}, author = {Amaia Salvador and M{\'\i}riam Bellver and Baradad, Manel and V{\'\i}ctor Campos and Marqu{\'e}s, F. and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xFojo, title = {Reproducing and Analyzing Adaptive Computation Time in PyTorch and TensorFlow}, year = {2018}, abstract = {

The complexity of solving a problem can differ greatly from the complexity of posing it. Building a neural network capable of dynamically adapting to the complexity of its inputs would be a great feat for the machine learning community. One of the most promising approaches is Adaptive Computation Time for Recurrent Neural Networks (ACT) \parencite{act}. In this thesis, we implement ACT in two of the most widely used deep learning frameworks, PyTorch and TensorFlow. Both implementations are open source and publicly available. We use these implementations to evaluate the capability of ACT to learn algorithms from examples. We compare ACT with a proposed baseline in which each input sample of the sequence is read a fixed number of times, learned as a hyperparameter during training. Surprisingly, we do not observe any benefit from ACT when compared with this baseline solution, which opens new and unexpected directions for future research.

}, author = {Fojo, Daniel}, editor = {V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xGullon18, title = {Retinal lesions segmentation using CNNs and adversarial training}, year = {2018}, abstract = {

Convolutional Neural Networks (CNNs) are frequently used to tackle image classification and segmentation problems due to their proven success. In the medical domain in particular, automated techniques that help doctors in their diagnosis are increasingly common. In this work, we study the retinal lesion segmentation problem using CNNs on the Indian Diabetic Retinopathy Image Dataset (IDRiD). Additionally, the idea of adversarial training used by Generative Adversarial Networks (GANs) is added to the CNN to improve its results, making the segmentation maps more accurate and realistic. A comparison between these two architectures is made. One of the main challenges we face is the high imbalance between lesions and healthy parts of the retina, and the fact that some lesion classes are scattered in small fragments. Thus, different loss functions, optimizers and training schemes are studied and evaluated to see which one best addresses our problem.

}, author = {Nat{\`a}lia Gull{\'o}n}, editor = {Ver{\'o}nica Vilaplana} } @conference {cMohedanob, title = {Saliency Weighted Convolutional Features for Instance Search}, booktitle = {Content-Based Multimedia Indexing - CBMI}, year = {2018}, month = {09/2018}, publisher = {IEEE}, organization = {IEEE}, address = {La Rochelle, France}, abstract = {

This work explores attention models to weight the contribution of local convolutional representations for the instance search task. We present a retrieval framework based on bags of local convolutional features (BLCF) that benefits from saliency weighting to build an efficient image representation. The use of human visual attention models (saliency) allows significant improvements in retrieval performance without the need to conduct region analysis or spatial verification, and without requiring any feature fine tuning. We investigate the impact of different saliency models, finding that higher performance on saliency benchmarks does not necessarily equate to improved performance when used in instance search tasks. The proposed approach outperforms the state-of-the-art on the challenging INSTRE benchmark by a large margin, and provides similar performance on the Oxford and Paris benchmarks compared to more complex methods that use off-the-shelf representations.
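A minimal sketch of the saliency-weighted bag-of-local-convolutional-features (BLCF) idea follows; the feature map, saliency map and codebook are random stand-ins for a CNN's activations, a saliency model's output and a learned visual vocabulary:

    import numpy as np

    rng = np.random.default_rng(0)
    H, W, D, K = 14, 14, 512, 1000           # feature grid, channels, codebook size
    feats = rng.normal(size=(H, W, D))       # local convolutional features (stand-in)
    saliency = rng.random(size=(H, W))       # saliency map resized to the feature grid
    centroids = rng.normal(size=(K, D))      # visual-word codebook (stand-in)

    # Assign each spatial location to its nearest visual word.
    flat = feats.reshape(-1, D)
    d2 = (flat ** 2).sum(1)[:, None] - 2 * flat @ centroids.T + (centroids ** 2).sum(1)[None, :]
    words = d2.argmin(axis=1)

    # Accumulate saliency weights instead of raw counts, then L2-normalise.
    bow = np.zeros(K)
    np.add.at(bow, words, saliency.ravel())
    bow /= np.linalg.norm(bow) + 1e-12       # final image descriptor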

}, author = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @article {aAssens, title = {Scanpath and Saliency Prediction on 360 Degree Images}, journal = {Elsevier Signal Processing: Image Communication}, year = {2018}, abstract = {

We introduce deep neural networks for scanpath and saliency prediction trained on 360-degree images. The scanpath prediction model, called SaltiNet, is based on a novel temporally-aware representation of saliency information named the saliency volume. The first part of the network consists of a model trained to generate saliency volumes, whose parameters are fit by back-propagation using a binary cross-entropy (BCE) loss over downsampled versions of the saliency volumes. Sampling strategies over these volumes are used to generate scanpaths over the 360-degree images. Our experiments show the advantages of using saliency volumes, and how they can be used for related tasks. We also show how a similar architecture achieves state-of-the-art performance on the related task of saliency map prediction. Our source code and trained models are available online.
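A toy illustration of the sampling step: treating each temporal slice of a saliency volume as a probability map and drawing one fixation per slice (the random volume below stands in for the network's prediction, and the one-fixation-per-slice scheme is a simplification of the paper's sampling strategies):

    import numpy as np

    rng = np.random.default_rng(0)
    T, H, W = 20, 30, 60                  # temporal slices x spatial grid (toy sizes)
    volume = rng.random(size=(T, H, W))   # predicted saliency volume (stand-in)

    scanpath = []
    for t in range(T):
        p = volume[t].ravel()
        p = p / p.sum()                   # normalise the slice into a distribution
        idx = rng.choice(H * W, p=p)      # sample one fixation location
        scanpath.append(divmod(idx, W))   # (row, col) of the t-th fixation
    print(scanpath[:5])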

}, url = {https://www.sciencedirect.com/science/article/pii/S0923596518306209}, author = {Assens, Marc and McGuinness, Kevin and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dLin18, title = {Semantic and Generic Object Segmentation for Scene Analysis Using RGB-D Data}, year = {2018}, month = {07/2018}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {PhD}, abstract = {

In this thesis, we study RGB-D based segmentation problems from different perspectives in terms of the input data. Apart from the basic photometric and geometric information contained in the RGB-D data, semantic and temporal information are also usually considered in an RGB-D based segmentation system.

The first part of this thesis focuses on an RGB-D based semantic segmentation problem, where predefined semantics and annotated training data are available. First, we review how RGB-D data has been exploited in the state-of-the-art to help train classifiers for the semantic segmentation task. Inspired by these works, we follow a multi-task learning schema, where semantic segmentation and depth estimation are jointly tackled in a Convolutional Neural Network (CNN). Since semantic segmentation and depth estimation are two highly correlated tasks, approaching them jointly can be mutually beneficial. In this case, depth information, along with the segmentation annotation in the training data, helps to better define the target of the classifier's training process, instead of blindly feeding the system an extra input channel. We design a novel hybrid CNN architecture by investigating the common attributes as well as the distinctions between depth estimation and semantic segmentation. The proposed architecture is tested and compared with state-of-the-art approaches on different datasets.

Although outstanding results are achieved in semantic segmentation, the limitations of these approaches are also obvious. Semantic segmentation strongly relies on predefined semantics and a large amount of annotated data, which may not be available in more general applications. On the other hand, classical image segmentation tackles the segmentation task in a more general way, but classical approaches hardly obtain object-level segmentation due to the lack of higher-level knowledge. Thus, in the second part of this thesis, we focus on an RGB-D based generic instance segmentation problem where temporal information is available from the RGB-D video while no semantic information is provided. We present a novel generic segmentation approach for 3D point cloud video (stream data) thoroughly exploiting the explicit geometry and temporal correspondences in RGB-D. The proposed approach is validated and compared with state-of-the-art generic segmentation approaches on different datasets.

Finally, in the third part of this thesis, we present a method which combines the advantages of both semantic segmentation and generic segmentation: we discover object instances using the generic approach and model them by learning from the few discovered examples with the semantic segmentation approach. To do so, we employ a one-shot learning technique, which transfers knowledge from a generally trained model to a specific instance model. The learned instance models generate features that are robust for distinguishing different instances, which are fed to the generic segmentation approach to perform improved segmentation. The approach is validated with experiments conducted on a carefully selected dataset.

}, url = {http://hdl.handle.net/10803/620762}, author = {X. Lin}, editor = {Casas, J. and M. Pard{\`a}s} } @inbook {bCampos, title = {Sentiment concept embedding for visual affect recognition}, booktitle = {Multimodal Behavior Analysis in theWild}, year = {2018}, publisher = {Elsevier}, organization = {Elsevier}, edition = {1}, chapter = {16}, abstract = {

Automatic sentiment and emotion understanding of general visual content has recently garnered much research attention. However, the large visual variance associated with high-level affective concepts presents a challenge when designing systems with high-performance requirements. One popular approach to bridge the {\textquotedblleft}affective gap{\textquotedblright} between low-level visual features and affective semantics consists of using Adjective Noun Pair (ANP) semantic constructs, e.g. {\textquotedblleft}beautiful landscape{\textquotedblright} or {\textquotedblleft}scary face{\textquotedblright}, which act as a mid-level representation that can be recognized by visual classifiers while still carrying an affective bias. In this work, we formulate the ANP detection task in images over a continuous space defined by an embedding that captures the inter-concept relationships between ANPs. We show how the compact representations obtained from the embedding extend the discrete concepts in the ontology and can be used for improved visual sentiment and emotion prediction, as well as new applications such as zero-shot ANP detection.

}, url = {https://www.elsevier.com/books/multimodal-behavior-analysis-in-the-wild/alameda-pineda/978-0-12-814601-9}, author = {V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto and Jou, Brendan and Jordi Torres and Chang, Shih-Fu} } @inbook {bCasamitjana18a, title = {Shared Latent Structures Between Imaging Features and Biomarkers in Early Stages of Alzheimer{\textquoteright}s Disease}, booktitle = {PRedictive Intelligence in MEdicine}, volume = {11121}, year = {2018}, pages = {60-67}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {

In this work, we identify meaningful latent patterns in MR images for patients across the Alzheimer{\textquoteright}s disease (AD) continuum. For this purpose, we apply the Projection to Latent Structures (PLS) method using cerebrospinal fluid (CSF) biomarkers (t-tau, p-tau, amyloid-beta) and age as response variables and imaging features as explanatory variables. The FreeSurfer pipeline is used to compute MRI surface and volumetric features, resulting in 68 cortical ROIs and 84 cortical and subcortical ROIs, respectively. The main assumption of this work is that there are two main underlying processes governing brain morphology along the AD continuum: brain aging and dementia. We use two different and orthogonal PLS models to describe each process: PLS-aging and PLS-dementia. To define the PLS-aging model we use normal aging subjects and age as predictor and response variables, respectively, while for PLS-dementia we only use demented subjects and biomarkers as response variables.

}, issn = {978-3-030-00320-3}, doi = {10.1007/978-3-030-00320-3}, author = {Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and Paula Petrone and Jose Luis Molinuevo and Juan D. Gispert} } @conference {cCasamitjanac, title = {Shared latent structures between imaging features and biomarkers in early stages of Alzheimer{\textquoteright}s disease}, booktitle = {Workshop on Predictive Intelligence in Medicine (PRIME), MICCAI}, year = {2018}, month = {2018}, address = {Granada, Spain}, abstract = {

In this work, we identify meaningful latent patterns in MR images for patients across the Alzheimer{\textquoteright}s disease (AD) continuum. For this purpose, we apply the Projection to Latent Structures (PLS) method using cerebrospinal fluid (CSF) biomarkers (t-tau, p-tau, amyloid-beta) and age as response variables and imaging features as explanatory variables. The FreeSurfer pipeline is used to compute MRI surface and volumetric features, resulting in 68 cortical ROIs and 84 cortical and subcortical ROIs, respectively. The main assumption of this work is that there are two main underlying processes governing brain morphology along the AD continuum: brain aging and dementia. We use two different and orthogonal PLS models to describe each process: PLS-aging and PLS-dementia. To define the PLS-aging model we use normal aging subjects and age as predictor and response variables, respectively, while for PLS-dementia we only use demented subjects and biomarkers as response variables.

}, author = {Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and Paula Petrone and Jose Luis Molinuevo and Juan D. Gispert} } @conference {cCampos18, title = {Skip RNN: Learning to Skip State Updates in Recurrent Neural Networks}, booktitle = {International Conference on Learning Representations (ICLR)}, year = {2018}, month = {01/2018}, abstract = {

Recurrent Neural Networks (RNNs) continue to show outstanding performance in sequence modeling tasks. However, training RNNs on long sequences often faces challenges like slow inference, vanishing gradients and difficulty in capturing long-term dependencies. In backpropagation through time settings, these issues are tightly coupled with the large, sequential computational graph resulting from unfolding the RNN in time. We introduce the Skip RNN model which extends existing RNN models by learning to skip state updates, thus shortening the effective size of the computational graph. This model can also be encouraged to perform fewer state updates through a budget constraint. We evaluate the proposed model on various tasks and show how it can reduce the number of required RNN updates while preserving, and sometimes even improving, the performance of the baseline RNN models.
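As a rough illustration of the mechanism (not the authors' released code), a numpy sketch of a skip-gated recurrent step: a learned gate is binarised to decide, at each step, whether to run the cell or to copy the previous state forward; the straight-through gradient estimator and the cumulative update probability used for training are omitted here:

    import numpy as np

    rng = np.random.default_rng(0)
    D, T = 8, 12                                   # state size, sequence length (toy)
    Wx, Ws = rng.normal(size=(D, D)), rng.normal(size=(D, D))
    w_u, b_u = rng.normal(size=D), 0.0             # update-gate parameters

    def sigmoid(z):
        return 1.0 / (1.0 + np.exp(-z))

    s, updates = np.zeros(D), 0
    for t in range(T):
        x = rng.normal(size=D)                     # input at step t
        u = round(float(sigmoid(w_u @ s + b_u)))   # binary "update or copy" gate
        if u:
            s = np.tanh(Wx @ x + Ws @ s)           # update: run the RNN cell
            updates += 1
        # else: skip -- the state is carried over and the cell is not evaluated
    print(f"{updates}/{T} state updates performed")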

}, author = {V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto and Jordi Torres and Chang, Shih-Fu} } @conference {cCaminal18, title = {SLAM-based 3D outdoor reconstructions from LIDAR data}, booktitle = {IC3D}, year = {2018}, month = {12/2018}, publisher = {IEEE}, organization = {IEEE}, address = {Brussels, Belgium}, abstract = {

The use of depth (RGBD) cameras to reconstruct large outdoor environments is not feasible due to lighting conditions and their low depth range. LIDAR sensors can be used instead. Most state-of-the-art SLAM methods are devoted to indoor environments and depth (RGBD) cameras. We have adapted two SLAM systems to work with LIDAR data. Quantitative evaluations have been performed with LIDAR and RGBD data, allowing the systems to be compared. Results show that the best method with LIDAR is RTAB-Map, by a clear margin. Additionally, RTAB-Map has been used to create 3D reconstructions with and without the use of a visible color camera. This proves the potential of LIDAR sensors for the reconstruction of outdoor environments for immersion or audiovisual production applications.

}, keywords = {3D imaging, Lidar cameras, mapping, point-cloud processing, SLAM, time-of-flight}, url = {http://www.stereopsia.com/international-conference-3d-immersion-ic3d}, author = {I. Caminal and Casas, J. and S. Royo} } @mastersthesis {xRoldana, title = {Speech-conditioned Face Generation with Deep Adversarial Networks}, year = {2018}, abstract = {

Image synthesis has been a trending task for the AI community in recent years. Many works have shown the potential of Generative Adversarial Networks (GANs) to deal with tasks such as text- or audio-to-image synthesis. In particular, recent advances in deep learning with audio have inspired many works involving both visual and auditory information. In this work we propose a face synthesis method using audio and/or language representations as inputs. Furthermore, we have built a dataset that relates speech utterances with a face and an identity, suitable for tasks beyond face synthesis, such as speaker recognition or voice conversion.

}, author = {Rold{\'a}n, Francisco}, editor = {Pascual-deLaPuente, Santiago and Amaia Salvador and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @conference {cLinardos, title = {Temporal Saliency Adaptation in Egocentric Videos}, booktitle = {ECCV 2018 Workshop on Egocentric Perception, Interaction and Computing}, year = {2018}, month = {07/2018}, publisher = {Extended abstract}, organization = {Extended abstract}, address = {Munich, Germany}, abstract = {

}, author = {Linardos, Panagiotis and Mohedano, Eva and Chert{\'o}, M{\`o}nica and Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @article {aLin, title = {Temporally Coherent 3D Point Cloud Video Segmentation in Generic Scenes}, journal = {IEEE Transactions on Image Processing}, volume = {27}, year = {2018}, month = {06/2018}, pages = {3087 - 3099}, abstract = {

Video segmentation is an important building block for high-level applications such as scene understanding and interaction analysis. While outstanding results are achieved in this field by state-of-the-art learning and model-based methods, they are restricted to certain types of scenes or require a large amount of annotated training data to achieve object segmentation in generic scenes. On the other hand, RGBD data, widely available with the introduction of consumer depth sensors, provides actual 3D world geometry, in contrast to 2D images. The explicit geometry in RGBD data greatly helps in computer vision tasks, but the lack of annotations in this type of data may also hinder the extension of learning-based methods to RGBD. In this paper, we present a novel generic segmentation approach for 3D point cloud video (stream data) thoroughly exploiting the explicit geometry in RGBD. Our proposal is based only on low-level features, such as connectivity and compactness. We exploit temporal coherence by representing the rough estimation of objects in a single frame with a hierarchical structure, and propagating this hierarchy along time. The hierarchical structure provides an efficient way to establish temporal correspondences at different scales of object connectivity, and to temporally manage the splits and merges of objects. This allows updating the segmentation according to the evidence observed in the history. The proposed method is evaluated on several challenging datasets, with promising results for the presented approach.

}, issn = {1057-7149}, doi = {10.1109/TIP.2018.2811541}, url = {https://ieeexplore.ieee.org/document/8306148/}, author = {X. Lin and Casas, J. and M. Pard{\`a}s} } @conference {cDuarte, title = {Towards Speech to Sign Language Translation}, booktitle = {ECCV 2018 Workshop on Shortcomings in Vision and Language}, year = {2018}, month = {08/2018}, abstract = {

Sign Language (SL) is the primary means of communication for a majority of the hearing-impaired community. Current computational approaches in this research area have focused specifically on Sign Language Recognition (SLR) and Sign Language Translation (from SL to text, SLT). However, the reverse problem of translating from spoken language to sign language has so far been unexplored. The goal of our ongoing project is to make the audio tracks of online videos accessible to people with hearing disabilities by automatically generating a video-based speech to sign language translation. In this paper, we point out the shortcomings that limit the advances of this research area and propose first steps towards this end.

}, author = {Amanda Duarte and Camli, Gorkem and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cIndia18, title = {UPC Multimodal Speaker Diarization System for the 2018 Albayzin Challenge}, booktitle = {IberSpeech 2018}, year = {2018}, month = {11/2018}, address = {Barcelona}, abstract = {

This paper presents the UPC system proposed for the Multimodal Speaker Diarization task of the 2018 Albayzin Challenge. This approach works by processing the speech and image signals individually. In the speech domain, speaker diarization is performed using identity embeddings created by a triplet-loss DNN that uses i-vectors as input. The triplet DNN is trained with an additional regularization loss that minimizes the variance of both positive and negative distances. A sliding window is then used to compare speech segments with enrollment speaker targets using the cosine distance between the embeddings. To detect identities from the face modality, a face detector followed by a face tracker has been used on the videos. For each cropped face, a feature vector is obtained using a Deep Neural Network based on the ResNet-34 architecture, trained using a metric-learning triplet loss (available from the dlib library). For each track, the face feature vector is obtained by averaging the features obtained for each of the frames of that track. Then, this feature vector is compared with the features extracted from the images of the enrollment identities. The proposed system is evaluated on the RTVE2018 database.

}, keywords = {Face Diarization, Multimodal Systems, Speaker Diarization}, author = {India, M. and Sagastiberri, I. and Palau, P. and Elisa Sayrol and Morros, J.R. and Hernando, J.} } @mastersthesis {xArenas, title = {Video Understanding through the Disentanglement of Appearance and Motion}, year = {2018}, abstract = {

Understanding the inner workings of deep learning algorithms is key to efficiently exploiting the large number of videos that are generated every day. For the self-supervised learning of the spatio-temporal information contained within these videos, there are several types of algorithms based on convolutional neural networks (CNNs) following an auto-encoder style architecture. However, we have verified that models of this type, trained for the frame prediction task, learn spatial and temporal information jointly, so the model is not able to recognize appearance-motion combinations not seen during training. Our proposed model, called DisNet, can learn appearance and motion separately through disentanglement, thereby solving the generalization and scalability problems. To demonstrate this, we conducted numerous experiments under highly controlled conditions, generating specific datasets that make the conventional model fail on the appearance and motion classification tasks, and analyzing how well our proposal behaves under the same conditions.

}, author = {Arenas, Carlos}, editor = {Xavier Gir{\'o}-i-Nieto and V{\'\i}ctor Campos and Palacio, Sebastian} } @conference {cFernandezb, title = {What is going on in the world? A display platform for media understanding}, booktitle = {IEEE Multimedia Information Processing and Retrieval (MIPR) Conference}, year = {2018}, month = {04/2018}, publisher = {IEEE}, organization = {IEEE}, address = {Miami, FL (USA)}, abstract = {

News broadcasters and on-line publishers daily generate a large amount of articles and videos describing events currently happening in the world. In this work, we present a system that automatically indexes videos from a library and links them to stories developing in the news. The user interface displays the links between videos and stories in an intuitive manner and allows navigation through related content by using associated tags. This interface is a powerful industrial tool for publishers to index, retrieve and visualize their video content. It helps them identify which topics require more attention or retrieve related content that{\textquoteright}s already been published about the stories.

}, doi = {https://doi.org/10.1109/MIPR.2018.00045}, url = {https://www.youtube.com/watch?v=eaXcB2X-5xY}, author = {Fern{\`a}ndez, D{\`e}lia and David Varas and Bou, Elisenda and Xavier Gir{\'o}-i-Nieto} } @conference {cLopez-Palmaa, title = {Who watches the watchers? Quality control of the human inspection in production lines using Visual Intensity of Attention}, booktitle = {SAAEI 2018}, year = {2018}, month = {07/2018}, address = {Barcelona}, abstract = {

On multiple occasions, production lines require inspectors: human operators who visualize certain steps of the production and determine the quality of the resulting products. However, inspectors are subject to errors. We propose a method based on computer vision to decide whether the inspector has paid adequate attention at the different inspection points, so that pieces that have not been verified can be marked for rejection or re-inspection. The method uses a top-view ceiling camera from which the trajectories and areas of vision of the inspector are computed, to determine which products have received the correct amount of attention. The resulting attention figure can be compared with the acceptance range in the inspection protocol to determine whether the inspection is valid.

}, author = {Manuel L{\'o}pez-Palma and Morros, J.R. and Javier Gago and Montserrat Corbal{\'a}n} } @inbook {bCasamitjana, title = {3D Convolutional Neural Networks for Brain Tumor Segmentation: a comparison of multi-resolution architectures}, booktitle = {Lecture Notes in Computer Science}, volume = {Brainlesion: Glioma, Multiple Sclerosis, Stroke and Traumatic Brain Injuries}, year = {2017}, pages = {150-161}, publisher = {Springer}, organization = {Springer}, issn = {978-3-319-55524-9}, author = {Adri{\`a} Casamitjana and Santi Puch and Asier Aduriz and Ver{\'o}nica Vilaplana} } @article {aMaceiraa, title = {3D hierarchical optimization for multi-view depth map coding}, journal = {Multimedia Tools and Applications}, year = {2017}, month = {12/2017}, abstract = {

Full version available at: http://rdcu.be/zHtU

Depth data has become widespread since the popularization of high-resolution 3D sensors. In multi-view sequences, depth information is used to supplement the color data of each view. This article proposes a joint encoding of multiple depth maps with a unique representation. Color and depth images of each view are segmented independently and combined in an optimal rate-distortion fashion. The resulting partitions are projected to a reference view, where a coherent hierarchy for the multiple views is built. A rate-distortion optimization is applied to obtain the final segmentation by choosing nodes of the hierarchy. The consistent segmentation is used to robustly encode depth maps of multiple views, obtaining results competitive with the HEVC coding standard.

}, keywords = {3D representation, Depth coding, Multiview coding, segmentation based coding}, doi = {10.1007/s11042-017-5409-z}, url = {http://rdcu.be/zHtU}, author = {Maceira, M. and David Varas and Morros, J.R. and Ruiz-Hidalgo, J. and Marqu{\'e}s, F.} } @conference {cSanchez, title = {3D Medical Image Synthesis using Generative Adversarial Networks}, booktitle = {ACM Europe Celebration of Women in Computing, womENcourage 2017, Barcelona, Spain}, year = {2017}, month = {09/2017}, author = {Irina S{\'a}nchez and Ver{\'o}nica Vilaplana} } @conference {cLinb, title = {3D Point Cloud Segmentation Using a Fully Connected Conditional Random Field}, booktitle = {The 25th European Signal Processing Conference (EUSIPCO 2017)}, year = {2017}, month = {08/2017}, publisher = {Eurasip/IEEE}, organization = {Eurasip/IEEE}, address = {Kos island, Greece}, abstract = {

Traditional image segmentation methods working with low-level image features are usually difficult to adapt to higher-level tasks, such as object recognition and scene understanding. Object segmentation emerges as a new challenge in this research field. It aims at obtaining more meaningful segments related to semantic objects in the scene by analyzing a combination of different information. 3D point cloud data obtained from consumer depth sensors has been exploited to tackle many computer vision problems due to its richer information about the geometry of 3D scenes compared to 2D images. Meanwhile, new challenges have also emerged, as the depth information is usually noisy, sparse and unorganized. In this paper, we present a novel point cloud segmentation approach for segmenting interacting objects in a stream of point clouds by exploiting spatio-temporal coherence. We pose the problem as an energy minimization task in a fully connected conditional random field with the energy function defined based on both current and previous information. We compare different methods and demonstrate the better segmentation performance and robustness of the proposed approach in sequences with over 2k frames.
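For reference, a fully connected pairwise CRF of this kind minimises an energy of the standard form below; this is the generic formulation, and the paper's exact unary and pairwise terms (which also draw on the previous frames) may differ:

    E(\mathbf{x}) = \sum_i \psi_u(x_i) + \sum_{i<j} \psi_p(x_i, x_j),
    \qquad
    \psi_p(x_i, x_j) = \mu(x_i, x_j)\, k(\mathbf{f}_i, \mathbf{f}_j)

where the unary potential \psi_u encodes per-point evidence (here, from both current and previous observations), \mu is a label-compatibility function, and k is a kernel over point features \mathbf{f}_i such as 3D position and colour.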

}, keywords = {Image color analysis, Image segmentation, Labeling, Object segmentation, Three-dimensional displays, Two dimensional displays}, doi = {10.23919/EUSIPCO.2017.8081170}, url = {https://www.eusipco2017.org/}, author = {X. Lin and Casas, J. and M. Pard{\`a}s} } @mastersthesis {xGorriz, title = {Active Deep Learning for Medical Imaging Segmentation}, year = {2017}, abstract = {

Grade: A (9.7/10)

This thesis proposes a novel active learning framework capable of effectively training a convolutional neural network for semantic segmentation of medical imaging with a limited amount of labeled training data. Our approach applies existing active learning techniques to segmentation, an increasingly important topic given the many problems caused by the lack of large amounts of data. We explore different strategies to study the image information and introduce a previously used cost-effective active learning method based on the selection of high-confidence predictions to automatically assign pseudo-labels, with the aim of reducing the manual annotations. First, we built a simple application for handwritten digit classification to get started with the methodology, and then we tested the system on a medical image database for the treatment of melanoma skin cancer. Finally, we compared the traditional training methods with our active learning proposals, specifying the conditions and parameters required for them to be optimal.

}, url = {http://hdl.handle.net/2117/109304}, author = {G{\'o}rriz, Marc}, editor = {Xavier Gir{\'o}-i-Nieto and Carlier, Axel and Faure, Emmanuel} } @conference {cGorriz, title = {Active Deep Learning for Medical Imaging Segmentation}, booktitle = {Medical Image meets NIPS 2017 Workshop}, year = {2017}, month = {11/2017}, abstract = {

We propose a novel Active Learning framework capable of effectively training a convolutional neural network for semantic segmentation of medical imaging with a limited amount of labeled training data. Our contribution is a practical Cost-Effective Active Learning approach that uses Dropout at test time as Monte Carlo sampling to model the pixel-wise uncertainty and analyzes the image information to improve the training performance.
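A minimal sketch of the Monte Carlo part, assuming a hypothetical stochastic forward function mc_forward (any segmentation network with dropout kept active at test time); the dummy implementation below only mimics its interface:

    import numpy as np

    rng = np.random.default_rng(0)

    def mc_forward(image):
        # Hypothetical stand-in for a CNN forward pass with dropout ON at
        # test time; returns an (H, W) map of foreground probabilities.
        return np.clip(0.5 + 0.1 * rng.normal(size=image.shape), 0.0, 1.0)

    image = np.zeros((64, 64))
    T = 20                                    # number of stochastic passes
    samples = np.stack([mc_forward(image) for _ in range(T)])
    mean_pred = samples.mean(axis=0)          # pixel-wise prediction
    uncertainty = samples.std(axis=0)         # pixel-wise MC-dropout uncertainty
    score = uncertainty.mean()                # e.g. rank unlabeled images by this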

}, author = {G{\'o}rriz, Marc and Xavier Gir{\'o}-i-Nieto and Carlier, Axel and Faure, Emmanuel} } @article {xSalvador17, title = {Artificial intelligence suggests recipes based on food photos}, volume = {2017}, year = {2017}, publisher = {MIT News}, address = {Boston}, abstract = {

Given a still image of a dish filled with food, the CSAIL team{\textquoteright}s deep-learning algorithm recommends ingredients and recipes.

}, author = {Amaia Salvador} } @conference {cCasamitjanaa, title = {Augmented V-Net for infant brain segmentation}, booktitle = {MICCAI Grand Challenge on 6-month Infant Brain MRI Segmentation, MICCAI 2017}, year = {2017}, month = {09/2017}, author = {Adri{\`a} Casamitjana and Irina S{\'a}nchez and Marc Combalia and Ver{\'o}nica Vilaplana} } @conference {cCasamitjana, title = {Augmented V-Net for White Matter Hyperintensities segmentation}, booktitle = {WMH Segmentation Challenge, Brain-lesion Workshop, MICCAI 2017}, year = {2017}, month = {09/2017}, author = {Adri{\`a} Casamitjana and Irina S{\'a}nchez and Marc Combalia and Ver{\'o}nica Vilaplana} } @mastersthesis {xJimenez, title = {Class Weighted Convolutional Features for Image Retrieval}, year = {2017}, abstract = {

Program: Master{\textquoteright}s Degree in Telecommunications Engineering

Grade: A with honours (10.0/10.0)

Image retrieval in realistic scenarios targets large dynamic datasets of unlabeled images. In these cases, training or fine-tuning a model every time new images are added to the database is neither efficient nor scalable. Convolutional Neural Networks trained for image classification over large datasets have been proven effective feature extractors when transferred to the task of image retrieval. The most successful approaches are based on encoding the activations of convolutional layers, as they convey the image spatial information. Our proposal goes beyond and aims at a local-aware encoding of these features depending on the predicted image semantics, with the advantage of using only the knowledge contained inside the network. In particular, we employ Class Activation Maps (CAMs) to obtain the most discriminative regions of the image from a semantic perspective. Additionally, CAMs are also used to generate object proposals during an unsupervised re-ranking stage after a first fast search. Our experiments on two publicly available datasets for instance retrieval, Oxford5k and Paris6k, demonstrate that our system is competitive and even outperforms the current state-of-the-art when using off-the-shelf models trained on the object classes of ImageNet.

}, keywords = {Convolutional Neural Networks, deep learning, image retrieval, Transfer Learning, Visual Instance Search}, author = {Jim{\'e}nez, Albert}, editor = {Xavier Gir{\'o}-i-Nieto and {\'A}lvarez, Jose M} } @conference {cJimenez, title = {Class Weighted Convolutional Features for Visual Instance Search}, booktitle = {28th British Machine Vision Conference (BMVC)}, year = {2017}, month = {09/2017}, address = {London, UK}, abstract = {

Image retrieval in realistic scenarios targets large dynamic datasets of unlabeled images. In these cases, training or fine-tuning a model every time new images are added to the database is neither efficient nor scalable. Convolutional neural networks trained for image classification over large datasets have been proven effective feature extractors when transferred to the task of image retrieval. The most successful approaches are based on encoding the activations of convolutional layers, as they convey the image spatial information. Our proposal goes beyond and aims at a local-aware encoding of these features depending on the predicted image semantics, with the advantage of using only the knowledge contained inside the network. In particular, we employ Class Activation Maps (CAMs) to obtain the most discriminative regions from a semantic perspective. Additionally, CAMs are also used to generate object proposals during an unsupervised re-ranking stage after a first fast search. Our experiments on two publicly available datasets for instance retrieval, Oxford5k and Paris6k, demonstrate that our system is competitive and even outperforms the current state-of-the-art when using off-the-shelf models trained on the object classes of ImageNet.
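A numpy sketch of the CAM-weighted encoding; the activations and classifier weights are random stand-ins for those of a pre-trained global-average-pooling network:

    import numpy as np

    rng = np.random.default_rng(0)
    H, W, D, C = 14, 14, 512, 1000            # spatial grid, channels, classes
    feats = rng.normal(size=(H, W, D))        # last conv activations (stand-in)
    w_fc = rng.normal(size=(D, C))            # classifier weights (stand-in)

    logits = feats.mean(axis=(0, 1)) @ w_fc   # GAP + linear classifier
    top = int(logits.argmax())                # most confident class

    cam = np.maximum(feats @ w_fc[:, top], 0.0)              # (H, W) class activation map
    cam /= cam.max() + 1e-12
    descriptor = (feats * cam[..., None]).sum(axis=(0, 1))   # CAM-weighted pooling
    descriptor /= np.linalg.norm(descriptor) + 1e-12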

}, author = {Jim{\'e}nez, Albert and {\'A}lvarez, Jose M and Xavier Gir{\'o}-i-Nieto} } @article {aMartinez17, title = {A closed-loop approach for tracking a humanoid robot using particle filtering and depth data}, journal = {Intelligent Service Robotics}, volume = {10}, year = {2017}, month = {10/2017}, pages = {297{\textendash}312}, abstract = {

Humanoid robots introduce instabilities during biped march that complicate the process of estimating their position and orientation over time. Tracking humanoid robots may be useful not only in typical applications such as navigation, but also in tasks that require benchmarking the multiple processes involved in registering measures of the humanoid's performance during walking. Small robots represent an additional challenge due to their size and mechanical limitations, which may generate unstable swinging while walking. This paper presents a strategy for the active localization of a humanoid robot in environments that are monitored by external devices. The problem is faced using a particle filter method over depth images captured by an RGB-D sensor in order to effectively track the position and orientation of the robot during its march. The tracking stage is coupled with a locomotion system controlling the stepping of the robot toward a given oriented target. We present an integral communication framework between the tracking and the locomotion control of the robot based on the Robot Operating System, which is capable of achieving real-time locomotion tasks using a NAO humanoid robot.

}, keywords = {Humanoid robot, Particle Filter, RGB-D sensor, ROS, Tracking}, issn = {1861-2776}, doi = {10.1007/s11370-017-0230-0}, url = {http://rdcu.be/s1RE}, author = {Mart{\'\i}nez, P. A. and X. Lin and Castel{\'a}n, M and Casas, J. and Arechavaleta, G.} } @conference {cvan Sabben17, title = {Collaborative voting of 3D features for robust gesture estimation}, booktitle = {International Conference on Acoustics, Speech and Signal Processing}, year = {2017}, month = {03/2017}, address = {New Orleans, USA}, abstract = {

Human body analysis attracts special interest because it enables a wide range of interactive applications. In this paper we present a gesture estimator that discriminates body poses in depth images. A novel collaborative method is proposed to learn 3D features of the human body and, later, to estimate specific gestures. The collaborative estimation framework is inspired by decision forests, where each selected point (anchor point) contributes to the estimation by casting votes. The main idea is to detect a body part by accumulating the inference from other trained body parts, so that the collaborative voting encodes the global context of human pose, while the 3D features represent local appearance. Experimental results with different 3D features prove the validity of the proposed algorithm.

}, author = {van Sabben, D. and Ruiz-Hidalgo, J. and Suau, X. and Casas, J.} } @conference {cGorriza, title = {Cost-Effective Active Learning for Melanoma Segmentation}, booktitle = {ML4H: Machine Learning for Health Workshop at NIPS 2017}, year = {2017}, month = {11/2017}, address = {Long Beach, CA, USA}, abstract = {

We propose a novel Active Learning framework capable of effectively training a convolutional neural network for semantic segmentation of medical imaging with a limited amount of labeled training data. Our contribution is a practical Cost-Effective Active Learning approach that uses Dropout at test time as Monte Carlo sampling to model the pixel-wise uncertainty and analyzes the image information to improve the training performance.

}, url = {https://arxiv.org/abs/1711.09168}, author = {G{\'o}rriz, Marc and Xavier Gir{\'o}-i-Nieto and Carlier, Axel and Faure, Emmanuel} } @conference {cBellvera, title = {Detection-aided liver lesion segmentation using deep learning}, booktitle = {ML4H: Machine Learning for Health Workshop at NIPS 2017}, year = {2017}, month = {11/2017}, abstract = {

A fully automatic technique for segmenting the liver and localizing its unhealthy tissues is a convenient tool for diagnosing hepatic diseases and assessing the response to the corresponding treatments. In this work we propose a method to segment the liver and its lesions from Computed Tomography (CT) scans using Convolutional Neural Networks (CNNs), which have shown good results in a variety of computer vision tasks, including medical imaging. The network that segments the lesions consists of a cascaded architecture, which first focuses on the region of the liver in order to segment the lesions in it. Moreover, we train a detector to localize the lesions, and mask the results of the segmentation network with the positive detections. The segmentation architecture is based on DRIU, a Fully Convolutional Network (FCN) with side outputs that work on feature maps of different resolutions, to finally benefit from the multi-scale information learned by different stages of the network. The main contribution of this work is the use of a detector to localize the lesions, which we show to be beneficial in removing false positives triggered by the segmentation network.

}, author = {M{\'\i}riam Bellver and Kevis-Kokitsi Maninis and Jordi Pont-Tuset and Jordi Torres and Xavier Gir{\'o}-i-Nieto and Luc van Gool} } @mastersthesis {xBellvera, title = {Detection-aided medical image segmentation using deep learning}, year = {2017}, abstract = {

Program: Master{\textquoteright}s Degree in Telecommunications Engineering

Grade: A with honours (10.0/10.0)

A fully automatic technique for segmenting the liver and localizing its unhealthy tissues is a convenient tool for diagnosing hepatic diseases and also for assessing the response to the corresponding treatments. In this thesis we propose a method to segment the liver and its lesions from Computed Tomography (CT) scans, as well as other anatomical structures and organs of the human body. We have used Convolutional Neural Networks (CNNs), which have shown good results in a variety of tasks, including medical imaging. The network that segments the lesions consists of a cascaded architecture, which first focuses on the liver region in order to segment the lesion. Moreover, we train a detector to localize the lesions and keep only those pixels from the output of the segmentation network where a lesion is detected. The segmentation architecture is based on DRIU [24], a Fully Convolutional Network (FCN) with side outputs that work on feature maps of different resolutions, to finally benefit from the multi-scale information learned by different stages of the network. Our pipeline is 2.5D, as the input of the network is a stack of consecutive slices of the CT scans. We also study different methods to benefit from the liver segmentation in order to delineate the lesion. The main focus of this work is the use of the detector to localize the lesions, as we demonstrate that it helps to remove false positives triggered by the segmentation network. The benefit of using a detector on top of the segmentation network is that the detector acquires a more global insight into the healthiness of a liver tissue than the segmentation network, whose final output is pixel-wise and is not forced to take a global decision over a whole liver patch. We show experiments with the LiTS dataset for the lesion and liver segmentation. In order to prove the generality of the segmentation network, we also segment several anatomical structures from the Visceral dataset.

}, author = {M{\'\i}riam Bellver}, editor = {Kevis-Kokitsi Maninis and Jordi Pont-Tuset and Luc van Gool and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @conference {cLina, title = {Disentangling Motion, Foreground and Background Features in Videos}, booktitle = {CVPR 2017 Workshop Brave New Motion Representations}, year = {2017}, month = {05/2017}, abstract = {

This paper introduces an unsupervised framework to extract semantically rich features for video representation. Inspired by how the human visual system groups objects based on motion cues, we propose a deep convolutional neural network that disentangles motion, foreground and background information. The proposed architecture consists of a 3D convolutional feature encoder for blocks of 16 frames, which is trained for reconstruction tasks over the first and last frames of the sequence. The model is trained with a fraction of videos from the UCF-101 dataset, taking as ground truth the bounding boxes around the activity regions. Qualitative results indicate that the network can successfully update the foreground appearance based on pure-motion features. The benefits of these learned features are shown in a discriminative classification task when compared with a random initialization of the network weights, providing a gain in accuracy above 10\%.

}, author = {Lin, Xunyu and V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto and Jordi Torres and Cristian Canton-Ferrer} } @conference {c, title = {Distributed training strategies for a computer vision deep learning algorithm on a distributed GPU cluster}, booktitle = {International Conference on Computational Science (ICCS)}, year = {2017}, month = {06/2017}, publisher = {Elsevier}, organization = {Elsevier}, address = {Zurich, Switzerland}, abstract = {

Deep learning algorithms base their success on building high learning capacity models with millions of parameters that are tuned in a data-driven fashion. These models are trained by processing millions of examples, so that the development of more accurate algorithms is usually limited by the throughput of the computing devices on which they are trained. In this work, we explore how the training of a state-of-the-art neural network for computer vision can be parallelized on a distributed GPU cluster. The effect of distributing the training process is addressed from two different points of view. First, the scalability of the task and its performance in the distributed setting are analyzed. Second, the impact of distributed training methods on the final accuracy of the models is studied.

}, keywords = {distributed computing, parallel systems, deep learning, Convolutional Neural Networks}, doi = {https://doi.org/10.1016/j.procs.2017.05.074}, url = {http://www.sciencedirect.com/science/article/pii/S1877050917306129}, author = {V{\'\i}ctor Campos and Sastre, Francesc and Yag{\"u}es, Maurici and M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @inbook {bBellot17, title = {Efficient Combination of Pairwise Feature Networks}, booktitle = {Neural Connectomics Challenge}, year = {2017}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, chapter = {7}, issn = {978-3-319-53069-7}, doi = {10.1007/978-3-319-53070-3}, author = {P Bellot and Patrick Meyer}, editor = {Battaglia, D and Guyon, I. and Lemaire, V. and Orlandi, J. and Ray, B. and Soriano, J.} } @mastersthesis {xCastello17, title = {Extracci{\'o}n de cr{\'a}neo en im{\'a}genes de resonancia magn{\'e}tica del cerebro utilizando una red neuronal convolucional 3D}, year = {2017}, author = {David Rodr{\'\i}guez Castell{\'o}}, editor = {Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana} } @unpublished {xMohedanoa, title = {Fine-tuning of CNN models for Instance Search with Pseudo-Relevance Feedback}, year = {2017}, publisher = {NIPS 2017 Women in Machine Learning Workshop}, address = {Long Beach, CA, USA}, abstract = {

CNN classification models trained on millions of labeled images have been proven to encode {\textquotedblleft}general purpose{\textquotedblright} descriptors in their intermediate layers. These descriptors are useful for a diverse range of computer vision problems~\cite{1}. However, the target task of these models is substantially different from the instance search task. While classification is concerned with distinguishing between different classes, instance search is concerned with identifying concrete instances of a particular class.

In this work we propose an unsupervised approach to fine-tune a model for similarity learning~\cite{2}. For that, we combine two different search engines: one based on off-the-shelf CNN features, and another one based on the popular SIFT features. We observe that the information from pre-trained CNN representations and SIFT is in most cases complementary, which allows the generation of high-quality ranked lists. The fusion of the two rankings is used to generate training data for a particular dataset. A pseudo-relevance feedback strategy~\cite{3} is used to sample images from the rankings, considering top-ranked images as positive examples of a particular instance and middle-to-low ranked images as negative examples.

}, author = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @article {xCampos, title = {From Pixels to Sentiment: Fine-tuning CNNs for Visual Sentiment Prediction}, journal = {Image and Vision Computing}, year = {2017}, abstract = {

Visual multimedia have become an inseparable part of our digital social lives, and they often capture moments tied with deep affections. Automated visual sentiment analysis tools can provide a means of extracting the rich feelings and latent dispositions embedded in these media. In this work, we explore how Convolutional Neural Networks (CNNs), a now de facto computational machine learning tool particularly in the area of Computer Vision, can be specifically applied to the task of visual sentiment prediction. We accomplish this through fine-tuning experiments using a state-of-the-art CNN and via rigorous architecture analysis, we present several modifications that lead to accuracy improvements over prior art on a dataset of images from a popular social media platform. We additionally present visualizations of local patterns that the network learned to associate with image sentiment for insight into how visual positivity (or negativity) is perceived by the model.

}, doi = {http://dx.doi.org/10.1016/j.imavis.2017.01.011}, url = {http://arxiv.org/abs/1604.03489}, author = {V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto} } @inbook {bBellver17, title = {Hierarchical Object Detection with Deep Reinforcement Learning}, booktitle = {Deep Learning for Image Processing Applications}, volume = {31}, year = {2017}, publisher = {IOS Press}, organization = {IOS Press}, address = {Amsterdam, The Netherlands}, abstract = {

This work introduces a model for Hierarchical Object Detection with Deep Reinforcement Learning (HOD-DRL). The key idea is to focus on those parts of the image that contain richer information and zoom in on them. We train an intelligent agent that, given an image window, is capable of deciding where to focus its attention among five different predefined region candidates (smaller windows). This procedure is iterated, providing a hierarchical image analysis. We compare two different candidate proposal strategies to guide the object search: with and without overlap. Moreover, our work compares two different strategies to extract features from a convolutional neural network for each region proposal: a first one that computes new feature maps for each region proposal, and a second one that computes the feature maps for the whole image to later generate crops for each region proposal. Experiments indicate better results for the overlapping candidate proposal strategy and a loss of performance for the cropped image features due to the loss of spatial resolution. We argue that, while this loss seems unavoidable when working with a large number of object candidates, the much more reduced number of region proposals generated by our reinforcement learning agent makes it feasible to extract features for each location without sharing convolutional computation among regions. Source code and models are available at https://imatge-upc.github.io/detection-2016-nipsws/.

}, keywords = {computer Vision, Object detection, reinforcement learning}, issn = {978-1-61499-822-8}, doi = {10.3233/978-1-61499-822-8-164}, url = {http://ebooks.iospress.nl/volumearticle/48029}, author = {M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Jordi Torres} } @article {aFrias-Velazquez, title = {Hierarchical stack filtering: a bitplane-based algorithm for massively parallel processors}, journal = {Journal of Real-Time Image Processing}, year = {2017}, month = {03/2017}, abstract = {

Full version available at http://rdcu.be/p6w1

With the development of novel parallel architectures for image processing, the implementation of well-known image operators needs to be reformulated to take advantage of so-called massive parallelism. In this work, we propose a general algorithm that implements a large class of nonlinear filters, called stack filters, with a 2D-array processor. The proposed method consists of decomposing an image into bitplanes with the bitwise decomposition, and then processing every bitplane hierarchically. The filtered image is reconstructed by simply stacking the filtered bitplanes according to their order of significance. Owing to its hierarchical structure, our algorithm allows us to trade off image quality against processing time, and to significantly reduce the computation time of low-entropy images. Also, experimental tests show that the processing time of our method is substantially lower than that of classical methods when using large structuring elements. All these features are of interest to a variety of real-time applications based on morphological operations, such as video segmentation and video enhancement.
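The bitwise decomposition and stacking at the core of the method can be sketched in a few lines of numpy; note that the per-bitplane binary median below is only a stand-in for the paper's hierarchical processing, which conditions each plane on the more significant ones:

    import numpy as np
    from scipy.ndimage import median_filter

    rng = np.random.default_rng(0)
    img = rng.integers(0, 256, size=(32, 32)).astype(np.uint8)

    # Bitwise decomposition into 8 binary planes, most significant first.
    planes = [(img >> k) & 1 for k in range(7, -1, -1)]

    # Filter every bitplane (here: a 3x3 binary median), then stack the
    # filtered planes back according to their order of significance.
    filtered = [median_filter(p, size=3) for p in planes]
    out = sum(f.astype(np.uint8) << k for k, f in zip(range(7, -1, -1), filtered))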

}, keywords = {Array processors, Bitwise decomposition, Morphological operators, Smart camera, Stack filters}, url = {http://rdcu.be/p6w1}, author = {Frias-Velazquez, A. and Morros, J.R. and Garc{\'\i}a, M. and Philips, Wilfried} } @conference {cRomero-Lopeza, title = {The Impact of Segmentation on the Accuracy and Sensitivity of a Melanoma Classifier based on Skin Lesion Images}, booktitle = {Annual Meeting of the Society of Imaging Informatics in Medicine (SIIM)}, year = {2017}, month = {06/2017}, publisher = {Society of Imaging Informatics for Medicine}, organization = {Society of Imaging Informatics for Medicine}, address = {Pittsburgh, PA, USA}, abstract = {

The accuracy and sensitivity of a Deep Learning based approach for a 2-class classifier for early melanoma detection based on skin lesion dermoscopic images increases when the classifier is trained with segmented inputs (i.e., images containing only the lesions as binary masks, without the surrounding context) instead of entire images.

}, url = {http://hdl.handle.net/2117/105582}, author = {Romero-Lopez, Adria and Burdick, Jack and Xavier Gir{\'o}-i-Nieto and Marques, Oge} } @mastersthesis {xArazo, title = {The impact of visual saliency prediction in image classification}, year = {2017}, abstract = {

Advisors: Eva Mohedano, Kevin McGuinness and Xavier Giro-i-Nieto

Program: Master{\textquoteright}s degree in Telecommunications Engineering (MET)

Grade: A (9.7/10.0)

This thesis introduces an architecture that improves the accuracy of a Convolutional Neural Network (CNN) trained for image classification by exploiting visual saliency predictions from the original images. The network has an AlexNet architecture and is trained using 1.2 million images from the ImageNet dataset. Two methodologies were explored to exploit the information from the visual saliency predictions. The first applies the saliency maps directly to existing layers of the CNN, which in some cases were already trained for classification and in others were initialized with random weights. In the second, the information from the saliency maps is merged through a new branch, trained at the same time as the initial CNN. To speed up training, the experiments were run on images reduced to 128x128; at this size, the proposed model achieves a 12.39\% increase in Top-1 accuracy with respect to the original CNN, while also reducing the number of parameters compared to AlexNet. For the original 227x227 images, a model that increases Top-1 accuracy by 1.72\% is proposed. All the proposed methodologies were implemented on a network previously trained for classification, and the most successful ones were additionally applied during the training of a network, comparing the results with those of the network trained only on the original images. The results provide information about the best way to add saliency maps to improve classification accuracy.

}, author = {Arazo, Eric}, editor = {McGuinness, Kevin and Mohedano, Eva and Xavier Gir{\'o}-i-Nieto} } @article {xGiro-i-Nieto17, title = {La meitat de les not{\'\i}cies que consumirem el 2022 seran falses}, year = {2017}, publisher = {Corporaci{\'o} Catalana de Mitjans Audiovisuals}, address = {Sant Joan Desp{\'\i}}, abstract = {

News report broadcast on the Telenot{\'\i}cies Vespre evening news of Televisi{\'o} de Catalunya on Sunday, 26 November 2017.

Artificial intelligence programs are capable of creating increasingly realistic images and voices, and open the door to generating lies in a more automated way.

}, keywords = {deep learning, fake news, gan}, url = {http://www.ccma.cat/324/la-meitat-de-les-noticies-que-consumirem-el-2022-seran-falses/noticia/2823178/}, author = {Xavier Gir{\'o}-i-Nieto and Pascual-deLaPuente, Santiago and Mir{\'o}, Vict{\`o}ria and Esteve, Oriol} } @conference {cSalvadorc, title = {Learning Cross-modal Embeddings for Cooking Recipes and Food Images}, booktitle = {CVPR}, year = {2017}, month = {03/2017}, publisher = {CVF / IEEE}, organization = {CVF / IEEE}, address = {Honolulu, Hawaii, USA}, abstract = {

In this paper, we introduce Recipe1M, a new large-scale, structured corpus of over 1m cooking recipes and 800k food images. As the largest publicly available collection of recipe data, Recipe1M affords the ability to train high-capacity models on aligned, multi-modal data. Using these data, we train a neural network to find a joint embedding of recipes and images that yields impressive results on an image-recipe retrieval task. Additionally, we demonstrate that regularization via the addition of a high-level classification objective both improves retrieval performance to rival that of humans and enables semantic vector arithmetic. We postulate that these embeddings will provide a basis for further exploration of the Recipe1M dataset and food and cooking in general.

}, doi = {10.1109/CVPR.2017.327}, url = {http://openaccess.thecvf.com/content_cvpr_2017/html/Salvador_Learning_Cross-Modal_Embeddings_CVPR_2017_paper.html}, author = {Amaia Salvador and Hynes, Nicholas and Aytar, Yusuf and Marin, Javier and Ofli, Ferda and Weber, Ingmar and Torralba, Antonio} } @mastersthesis {xCampos17, title = {Learning to Skip State Updates in Recurrent Neural Networks}, year = {2017}, abstract = {

Program: Master{\textquoteright}s Degree in Telecommunications Engineering

Grade: A with honours (10.0/10.0)

Recurrent Neural Networks (RNNs) continue to show outstanding performance in sequence modeling tasks. However, training RNNs on long sequences often faces challenges like slow inference, vanishing gradients and difficulty in capturing long-term dependencies. In backpropagation through time settings, these issues are tightly coupled with the large, sequential computational graph resulting from unfolding the RNN in time. We introduce the Skip RNN model, which extends existing RNN models by learning to skip state updates, shortening the effective size of the computational graph. This network can be encouraged to perform fewer state updates through a novel loss term. We evaluate the proposed model on various tasks and show how it can reduce the number of required RNN updates while preserving, and sometimes even improving, the performance of the baseline models.

}, keywords = {conditional computation, deep learning, machine learning, recurrent neural networks, sequence modeling}, url = {https://imatge-upc.github.io/skiprnn-2017-telecombcn/}, author = {V{\'\i}ctor Campos}, editor = {Jou, Brendan and Chang, Shih-Fu and Xavier Gir{\'o}-i-Nieto} } @conference {cGurrina, title = {LTA 2017: The Second Workshop on Lifelogging Tools and Applications}, booktitle = {ACM Multimedia}, year = {2017}, month = {10/2017}, publisher = {ACM}, organization = {ACM}, address = {Mountain View, California USA}, abstract = {

The organisation of personal data is receiving increasing research attention due to the challenges we face in gathering, enriching, searching, and visualising such data. Given the increasing ease with which personal data is being gathered by individuals, the concept of a lifelog digital library of rich multimedia and sensory content for every individual is fast becoming a reality. The LTA 2017 workshop aims to bring together academics and practitioners to discuss approaches to lifelog data analytics and applications, and to debate the opportunities and challenges for researchers in this new and challenging area.

}, doi = {10.1145/3123266.3132050}, author = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto and Radeva, Petia and Dimiccoli, M. and Dang-Nguyen, Duc-Tien and Joho, H.} } @article {aPetrone17, title = {Magnetic Resonance Imaging as a valuable tool for Alzheimer{\textquoteright}s disease screening}, journal = {Alzheimer{\textquoteright}s \& Dementia: The Journal of the Alzheimer{\textquoteright}s Association}, volume = {13}, year = {2017}, month = {07/2017}, pages = {P1245}, doi = {10.1016/j.jalz.2017.07.457}, url = {https://doi.org/10.1016/j.jalz.2017.07.457}, author = {P. Petrone and Ver{\'o}nica Vilaplana and Adri{\`a} Casamitjana and D. Sanchez-Escobedo and A. Tucholka and R. Cacciaglia and G. Operto and S. Skouras and C. Falcon and J.L. Molinuevo and J.D. Gispert} } @conference {cPetrone17, title = {Magnetic Resonance Imaging as a valuable tool for Alzheimer{\textquoteright}s disease screening}, booktitle = {Alzheimer{\textquoteright}s Association International Conference, London, 2017}, year = {2017}, month = {07/2017}, author = {P. Petrone and Ver{\'o}nica Vilaplana and Adri{\`a} Casamitjana and A. Tucholka and C. Falcon and R. Cacciaglia and G. Operto and S. Skouras and J.L. Molinuevo and J.D. Gispert} } @phdthesis {dPerez-Pellitero17, title = {Manifold Learning for Super Resolution}, year = {2017}, school = {Leibniz Universit{\"a}t Hannover}, address = {Hannover}, abstract = {

The development pace of high-resolution displays has been so fast in recent years that many images acquired with low-end capture devices are already outdated or soon will be. Super Resolution is central to matching the resolution of already existing image content to that of current and future high-resolution displays and applications. This dissertation is focused on learning how to upscale images from the statistics of natural images. We build on a sparsity model that uses learned coupled low- and high-resolution dictionaries in order to upscale images.

Firstly, we study how to adaptively build coupled dictionaries so that their content is semantically related with the input image. We do so by using a Bayesian selection stage which finds the best-fitted texture regions from the training dataset for each input image. The resulting adapted subset of patches is compressed into a coupled dictionary via sparse coding techniques.

We then shift from l1 to a more efficient l2 regularization, as introduced by Timofte et al. Instead of using their patch-to-dictionary decomposition, we propose a fully collaborative neighbor embedding approach. In this novel scheme, for each atom in the dictionary we create a densely populated neighborhood from an extensive training set of raw patches (i.e. in the order of hundreds of thousands). This generates more accurate regression functions.

We additionally propose using sublinear search structures such as spherical hashing and trees to speed up the nearest neighbor search involved in regression-based Super Resolution. We study the positive impact of antipodally invariant metrics for linear regression frameworks, and we propose two efficient solutions: (a) the Half Hypersphere Confinement, which enables antipodal invariance within the Euclidean space, and (b) the bimodal tree, whose split functions are designed to be antipodally invariant and which we use in the context of a Bayesian Super Resolution forest.

In our last contribution, we extend antipodal invariance by also taking into consideration the dihedral group of transforms (i.e. rotations and reflections). We study them as a group of symmetries within the high-dimensional manifold. We obtain the respective set of mirror-symmetry axes by means of a frequency analysis, and we use them to collapse the redundant variability, resulting in a reduced manifold span which, in turn, greatly improves quality performance and reduces the dictionary sizes.

}, author = {E. Perez-Pellitero}, editor = {Rosenhahn, B. and Ruiz-Hidalgo, J.} } @conference {cCata, title = {Masked V-Net: an approach to brain tumor segmentation}, booktitle = {Multimodal Brain Tumor Segmentation Benchmark, Brain-lesion Workshop, MICCAI 2017 }, year = {2017}, month = {09/2017}, author = {Marcel Cat{\`a} and Adri{\`a} Casamitjana and Irina S{\'a}nchez and Marc Combalia and Ver{\'o}nica Vilaplana} } @article {xSalvador17b, title = {MIT is building a system that can identify a recipe using pictures of food}, year = {2017}, publisher = {Techcrunch}, abstract = {

Mastering the ability to distinguish hot dogs from not hot dogs is truly one of mankind{\textquoteright}s greatest achievements. Attempting to one-up that sort of masterstroke would surely be flying too close to the sun {\textemdash} but researchers at MIT are trying nonetheless.

}, url = {https://techcrunch.com/2017/07/20/mit-is-building-a-system-that-can-identify-a-recipe-using-pictures-of-food/}, author = {Amaia Salvador} } @conference {cFernandez, title = {More cat than cute? Interpretable Prediction of Adjective-Noun Pairs}, booktitle = {ACM Multimedia 2017 Workshop on Multimodal Understanding of Social, Affective and Subjective Attributes}, year = {2017}, month = {10/2017}, publisher = {ACM SIGMM}, organization = {ACM SIGMM}, address = {Mountain View, CA (USA)}, abstract = {

The increasing availability of affect-rich multimedia resources has bolstered interest in understanding sentiment and emotions in and from visual content. Adjective-noun pairs (ANP) are a popular mid-level semantic construct for capturing affect via visually detectable concepts such as {\textquoteleft}{\textquoteleft}cute dog" or {\textquoteleft}{\textquoteleft}beautiful landscape". Current state-of-the-art methods approach ANP prediction by considering each of these compound concepts as individual tokens, ignoring the underlying relationships in ANPs. This work aims at disentangling the contributions of the {\textquoteleft}adjectives{\textquoteright} and {\textquoteleft}nouns{\textquoteright} in the visual prediction of ANPs. Two specialised classifiers, one trained for detecting adjectives and another for nouns, are fused to predict 553 different ANPs. The resulting ANP prediction model is more interpretable as it allows us to study contributions of the adjective and noun components.

}, doi = {10.1145/3132515.3132520}, author = {Fern{\`a}ndez, D{\`e}lia and Woodward, Alejandro and V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto and Chang, Shih-Fu} } @mastersthesis {xCompri, title = {Multi-label Remote Sensing Image Retrieval based on Deep Features}, year = {2017}, abstract = {

Student: Michele Compri

Advisors: Beg{\"u}m Demir (University of Trento) and Xavier Giro-i-Nieto (UPC)

Recent advances in satellite technology have led to an increased volume of remote sensing (RS) image archives, from which retrieving useful information is challenging. Thus, one important research area in RS is the content-based retrieval of RS images (CBIR). The performance of CBIR systems depends on the capability of the RS image features to model the content of the images, as well as on the retrieval algorithm that assesses the similarity among the features. Existing CBIR systems in the RS literature assume that each image is categorized by only a single label, in terms of a land-cover class associated to the most significant content of the image. However, RS images usually have complex content, i.e., there are usually several regions within each image related to multiple land-cover classes. Thus, available CBIR systems are not capable of accurately characterizing and exploiting the high-level semantic content of RS images for retrieval problems.

To overcome these problems and to effectively characterize the high-level semantic content of RS images, we investigate the effectiveness of different deep learning architectures in the framework of multi-label remote sensing image retrieval problems. This is achieved based on a two-step strategy. In the first step, a Convolutional Neural Network (CNN) pre-trained for image classification with the ImageNet dataset is used off-the-shelf as a feature extractor. In particular, three popular architectures are explored: 1) VGG16; 2) Inception V3; and 3) ResNet50. VGG16 is a CNN characterized by 16 convolutional layers of stacked 3x3 filters, with intermediate max pooling layers and 3 fully connected layers at the end. Inception V3 is an improved version of the former GoogleNet, which contains more layers but fewer parameters, achieved by removing fully connected layers and using global average pooling after the last convolutional layer. ResNet50 is even deeper thanks to the introduction of residual layers, which allow data to flow by skipping the convolutional blocks. In the second step of our research, we modify these three off-the-shelf models by fine-tuning their parameters with a subset of RS images and their multi-label information. Experiments carried out on an archive of aerial images show that fine-tuning CNN architectures with multi-label annotated images significantly improves the retrieval accuracy with respect to standard CBIR methods. We find that fine-tuning with a multi-class approach achieves better results than considering each label as an independent class. Due to space constraints, detailed information on each step of the proposed method will be given in the full version of the paper.

}, author = {Compri, Michele}, editor = {Demir, Beg{\"u}m and Xavier Gir{\'o}-i-Nieto} } @article {aPont-Tuset2015, title = {Multiscale Combinatorial Grouping for Image Segmentation and Object Proposal Generation}, journal = { IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)}, volume = {38}, year = {2017}, pages = {128-140}, abstract = {

We propose a unified approach for bottom-up hierarchical image segmentation and object proposal generation for recognition, called Multiscale Combinatorial Grouping (MCG). For this purpose, we first develop a fast normalized cuts algorithm. We then propose a high-performance hierarchical segmenter that makes effective use of multiscale information. Finally, we propose a grouping strategy that combines our multiscale regions into highly-accurate object proposals by efficiently exploring their combinatorial space. We also present Single-scale Combinatorial Grouping (SCG), a faster version of MCG that produces competitive proposals in under five seconds per image. We conduct an extensive and comprehensive empirical validation on the BSDS500, SegVOC12, SBD, and COCO datasets, showing that MCG produces state-of-the-art contours, hierarchical regions, and object proposals.

}, url = {http://arxiv.org/abs/1503.00848v1}, author = {Jordi Pont-Tuset and Pablo Arbelaez and Jonathan T. Barron and Marqu{\'e}s, F. and Jitendra Malik} } @phdthesis {dMaceira17, title = {Multi-view depth coding based on a region representation combining color and depth information}, year = {2017}, month = {06/2017}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {PhD}, abstract = {

Depth map data is used to supplement the color data in multi-view sequences. As depth maps present characteristics distinct from those of natural color images, new coding techniques are required to represent their smooth regions and sharp edges. In this thesis, segmentation-based coding techniques are proposed to encode depth maps by exploiting the redundancy between color and depth information. The methods developed combine partitions obtained from color and depth images to find efficient representations. The color image is assumed to be available before the depth map coding process, so a color partition can be obtained at the decoder without introducing coding cost.

Two hierarchical image segmentation algorithms are proposed to generate color and depth partitions for coding applications. The color segmentation obtains a super-pixel representation using color information, spatial distribution and shape complexity. The depth segmentation uses a 3D planar model for each region to extract the structure of the scene. Color and depth partitions are combined in depth map coding methods to find the final coding partition.
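The per-region planar model can be illustrated with a least-squares fit (a sketch under our own notation, not the thesis code; the back-projected 3D points of one region are assumed given):

    import numpy as np

    def fit_plane(points):
        # points: (N, 3) array of (x, y, z) samples from one region.
        # Fits z = a*x + b*y + c and returns the coefficients plus
        # the per-point residuals, which can inform rate-distortion
        # decisions over the region hierarchy.
        A = np.c_[points[:, 0], points[:, 1], np.ones(len(points))]
        coeffs, *_ = np.linalg.lstsq(A, points[:, 2], rcond=None)
        residuals = points[:, 2] - A @ coeffs
        return coeffs, residuals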

Different methods for texture representation have been explored in this thesis. Initial approaches used 2D coding methods, while a 3D representation has been proposed to represent depth maps from multiple views with a unique segmentation. This 3D representation is used to segment depth maps in single-view and multi-view configurations. Final coding partitions are obtained with a rate-distortion optimization over a hierarchy of regions. The proposed segmentation-based coding techniques obtain results competitive with the HEVC coding standard.

}, url = {http://hdl.handle.net/2117/110439}, author = {Maceira, M.}, editor = {Ruiz-Hidalgo, J. and Morros, J.R.} } @inbook {bMohedano17, title = {Object Retrieval with Deep Convolutional Features}, booktitle = {Deep Learning for Image Processing Applications}, volume = {31}, number = {Advances in Parallel Computing}, year = {2017}, publisher = {IOS Press}, organization = {IOS Press}, address = {Amsterdam, The Netherlands}, abstract = {

Image representations extracted from convolutional neural networks (CNNs) outperform hand-crafted features in several computer vision tasks, such as visual image retrieval. This chapter proposes a simple pipeline for encoding the local activations of a convolutional layer of a pretrained CNN using the well-known Bag of Words (BoW) aggregation scheme, called bag of local convolutional features (BLCF). Matching each local array of activations in a convolutional layer to a visual word results in an assignment map, which is a compact representation relating regions of an image with a visual word. We use the assignment map for fast spatial reranking, finding object localizations that are used for query expansion. We show the suitability of the BoW representation based on local CNN features for image retrieval, attaining state-of-the-art performance on the Oxford and Paris buildings benchmarks. We demonstrate that the BLCF system outperforms the latest procedures using sum pooling on a subset of the challenging TRECVid INS benchmark according to the mean Average Precision (mAP) metric.
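The encoding step can be sketched in a few lines (our own illustration, assuming a codebook of visual words learned beforehand, e.g. with k-means; all names are illustrative):

    import numpy as np

    def bow_encode(conv_features, centroids):
        # conv_features: (H, W, D) activations of one conv layer.
        # centroids: (K, D) visual-word codebook.
        H, W, D = conv_features.shape
        flat = conv_features.reshape(-1, D)
        # Assign each local feature to its nearest visual word.
        d2 = ((flat[:, None, :] - centroids[None, :, :]) ** 2).sum(-1)
        assignment_map = d2.argmin(axis=1).reshape(H, W)
        # BoW histogram over the K visual words.
        hist = np.bincount(assignment_map.ravel(), minlength=len(centroids))
        return assignment_map, hist / hist.sum()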

}, issn = {978-1-61499-822-8 }, doi = {10.3233/978-1-61499-822-8-137}, url = {http://ebooks.iospress.nl/volumearticle/48028}, author = {Mohedano, Eva and Amaia Salvador and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Marqu{\'e}s, F.} } @article {x17, title = {One Perceptron to Rule Them All}, year = {2017}, abstract = {

Deep neural networks have boosted the convergence of multimedia data analytics in a unified framework shared by practitioners in natural language, vision and speech. Image captioning, lip reading or video sonorization are some of the first applications of a new and exciting field of research exploiting the generalisation properties of deep learning. Get the latest results on how convolutional and recurrent neural networks are combined to find the most hidden patterns in multimedia.

} } @mastersthesis {xBozal, title = {Personalized Image Classification from EEG Signals using Deep Learning}, year = {2017}, abstract = {

This thesis explores the semantic classification of images based on the processing of electroencephalogram (EEG) signals generated by the viewer{\textquoteright}s brain. The work extends an existing solution by exploring the gains obtained when the parameters of the classifier are adapted to the user. Firstly, we developed a universal end-to-end model based on deep learning that extracts features from the raw EEG signals and predicts the semantic content of the image among 40 possible classes from the ImageNet dataset. Our main contribution aims at adapting this universal model to new users, in order to build a personalized model based on minimal feedback from the new user. We explored different deep learning architectures and hyperparameters to obtain a better accuracy than the baseline by Spampinato et al. (CVPR 2017). We achieve accuracies of 89.03\% and 90.34\% with the universal and personalized models, respectively.

}, author = {Bozal, Alberto}, editor = {Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xBernal, title = {Predicting emotion in movies: Recurrent and convolutional models applied to videos}, year = {2017}, abstract = {

This thesis explores different approaches using deep learning techniques to predict emotions in videos. Working with videos implies a huge amount of data, including visual frames and acoustic samples. The first step of the project is to extract features that represent the videos in small sets of arrays. This procedure is done using pre-trained models based on convolutional networks, the state of the art in visual recognition. Firstly, visual features are extracted using 3D convolutions, and acoustic features are extracted using VGG19, a pre-trained convolutional model for images fine-tuned to accept the audio inputs. Later, these features are fed into a recurrent model capable of exploiting the temporal information. Emotions are measured in terms of valence and arousal, with values in [-1, 1]. Additionally, the same techniques are also used to attempt to predict fear scenes. In consequence, this thesis deals with both regression and classification problems. Several architectures and different parameters have been tested in order to achieve the best performance. Finally, the results will be published in the MediaEval 2017 Challenge and compared to the state-of-the-art solutions.

}, author = {Bernal, Oriol}, editor = {Xavier Gir{\'o}-i-Nieto and Zaharieva, Maia} } @mastersthesis {xCardoner, title = {Predicting Media Interestingness}, year = {2017}, abstract = {

This thesis explores the application of a deep learning approach for the prediction of media interestingness. Two different models are investigated, one for the prediction of image interestingness and one for the prediction of video interestingness. For the prediction of image interestingness, the ResNet50 network is fine-tuned to obtain the best results. First, some layers are added. Next, the model is trained and fine-tuned using data augmentation, dropout, class weights, and changes to other hyperparameters. For the prediction of video interestingness, first, features are extracted with a 3D convolutional network. Next, an LSTM network is trained and fine-tuned on the features. The final result is a binary label for each image/video: 1 for interesting, 0 for not interesting. Additionally, a confidence value is provided for each prediction. Finally, the Mean Average Precision (MAP) is employed as the evaluation metric to estimate the quality of the final results.

}, author = {Cardoner, LLuc}, editor = {Zaharieva, Maia and Xavier Gir{\'o}-i-Nieto} } @conference {xSalvadora, title = {Recurrent Semantic Instance Segmentation}, booktitle = {NIPS 2017 Women in Machine Learning Workshop (WiML)}, year = {2017}, month = {12/2017}, publisher = {NIPS 2017 Women in Machine Learning Workshop}, organization = {NIPS 2017 Women in Machine Learning Workshop}, address = {Long Beach, CA, USA}, abstract = {

We present a recurrent model for end-to-end instance-aware semantic segmentation that is able to sequentially generate pairs of masks and class predictions. Our proposed system is trainable end-to-end for instance segmentation, does not require further post-processing steps on its output and is conceptually simpler than current methods relying on object proposals. While recent works have proposed recurrent architectures for instance segmentation, these are trained and evaluated for a single category.

Our model is composed of a series of Convolutional LSTMs that are applied in chain with upsampling layers in between to predict a sequence of binary masks and associated class probabilities. Skip connections are incorporated in our model by concatenating the output of the corresponding convolutional layer in the base model with the upsampled output of the ConvLSTM. Binary masks are finally obtained with a 1x1 convolution with sigmoid activation. We concatenate the side outputs of all ConvLSTM layers and apply a per-channel max-pooling operation followed by a single fully-connected layer with softmax activation to obtain the category for each predicted mask.
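The classification head described above can be sketched as follows (our simplification with hypothetical shapes, assuming the side outputs have already been upsampled to a common resolution):

    import numpy as np

    def classify_mask(side_outputs, W_fc, b_fc):
        # side_outputs: list of (C_i, H, W) ConvLSTM outputs for one mask.
        # W_fc: (sum of C_i, num_classes); b_fc: (num_classes,).
        feats = np.concatenate(side_outputs, axis=0)   # (C, H, W)
        pooled = feats.max(axis=(1, 2))                # per-channel max-pool
        logits = pooled @ W_fc + b_fc
        e = np.exp(logits - logits.max())              # stable softmax
        return e / e.sum()                             # class probabilities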

We train and evaluate our models on the Pascal VOC 2012 dataset. Future work will aim at analyzing and understanding the behavior of the network on other datasets, comparing the system with state-of-the-art solutions, and studying the relationship between the learned object discovery patterns of our model and those of humans.

}, author = {Amaia Salvador and Baradad, Manel and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cPujol-Miro17, title = {Registration of Images to Unorganized 3D Point Clouds Using Contour Cues}, booktitle = {The 25th European Signal Processing Conference (EUSIPCO 2017)}, year = {2017}, month = {08/2017}, publisher = {Eurasip}, organization = {Eurasip}, address = {Kos island, Greece}, abstract = {

Low-resolution commercial 3D sensors contribute even more to computer vision tasks when the analysis is carried out in combination with higher-resolution image data. This requires registration of 2D images to unorganized 3D point clouds. In this paper we present a framework for 2D-3D data fusion that directly obtains the camera pose of a 2D color image in relation to a 3D point cloud. It includes a novel multiscale intensity feature detection algorithm and a modified ICP procedure based on point-to-line distances. The framework is generic for several data types (such as CAD designs or LiDAR data without photometric information), and results show that performance is comparable to the state of the art, while avoiding manual markers or specific patterns on the data.

}, keywords = {Cameras, Feature extraction, Iterative closest point algorithm, Sensors, Signal processing algorithms, Three-dimensional displays}, doi = {10.23919/EUSIPCO.2017.8081173}, url = {https://www.eusipco2017.org/}, author = {A. Pujol-Mir{\'o} and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cPana, title = {SalGAN: Visual Saliency Prediction with Generative Adversarial Networks}, booktitle = {CVPR 2017 Scene Understanding Workshop (SUNw)}, year = {2017}, address = {Honolulu, Hawaii, USA}, abstract = {

We introduce SalGAN, a deep convolutional neural network for visual saliency prediction trained with adversarial examples. The first stage of the network consists of a generator model whose weights are learned by back-propagation computed from a binary cross entropy (BCE) loss over downsampled versions of the saliency maps. The resulting prediction is processed by a discriminator network trained to solve a binary classification task between the saliency maps generated by the generative stage and the ground truth ones. Our experiments show how adversarial training allows reaching state-of-the-art performance across different metrics when combined with a widely-used loss function like BCE.
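The objective can be written compactly (a simplified rendering of the two-term loss; the weight alpha is a hyperparameter we assume here, and the discriminator network itself is not shown):

    import numpy as np

    def bce(pred, target, eps=1e-7):
        # Binary cross entropy over a (downsampled) saliency map.
        p = np.clip(pred, eps, 1 - eps)
        return -(target * np.log(p) + (1 - target) * np.log(1 - p)).mean()

    def generator_loss(pred_map, gt_map, d_fake, alpha=0.05):
        # d_fake: discriminator output on the generated map; the
        # adversarial term pushes it towards the 'real' label.
        # alpha (an assumed value) weights the content term.
        adversarial = -np.log(np.clip(d_fake, 1e-7, 1.0))
        return alpha * bce(pred_map, gt_map) + adversarial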

}, url = {https://arxiv.org/abs/1701.01081}, author = {Pan, Junting and Cristian Canton-Ferrer and McGuinness, Kevin and O{\textquoteright}Connor, N. and Jordi Torres and Elisa Sayrol and Xavier Gir{\'o}-i-Nieto} } @conference {cAssens, title = {SaltiNet: Scan-path Prediction on 360 Degree Images using Saliency Volumes}, booktitle = {ICCV Workshop on Egocentric Perception, Interaction and Computing}, year = {2017}, month = {07/2017}, publisher = {IEEE}, organization = {IEEE}, address = {Venice, Italy}, abstract = {

We introduce SaltiNet, a deep neural network for scanpath prediction trained on 360-degree images. The first part of the network consists of a model trained to generate saliency volumes, whose parameters are learned by back-propagation computed from a binary cross entropy (BCE) loss over downsampled versions of the saliency volumes. Sampling strategies over these volumes are used to generate scanpaths over the 360-degree images. Our experiments show the advantages of using saliency volumes, and how they can be used for related tasks.
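A minimal sketch of the sampling step (our own illustration, not the released code): each temporal slice of the volume is normalized into a probability map from which one fixation is drawn:

    import numpy as np

    def sample_scanpath(saliency_volume, rng=None):
        # saliency_volume: (T, H, W) non-negative saliency volume.
        # Returns a list of T fixations as (row, col) coordinates.
        rng = rng or np.random.default_rng()
        T, H, W = saliency_volume.shape
        scanpath = []
        for t in range(T):
            p = saliency_volume[t].ravel()
            p = p / p.sum()                  # probability map for slice t
            idx = rng.choice(H * W, p=p)     # draw one fixation
            scanpath.append(divmod(idx, W))  # back to (row, col)
        return scanpath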

Winner of three awards at the Salient 360 Challenge at IEEE ICME 2017 (Hong Kong): Best Scan Path, Best Student Scan-path and Audience Award.

}, isbn = {978-1-5386-1034-3}, doi = {10.1109/ICCVW.2017.275}, url = {http://ieeexplore.ieee.org/document/8265485/}, author = {Assens, Marc and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @conference {cTorres, title = {Scaling a Convolutional Neural Network for classification of Adjective Noun Pairs with TensorFlow on GPU Clusters}, booktitle = {17th IEEE/ACM International Symposium on Cluster, Cloud and Grid Computing (CCGrid)}, year = {2017}, month = {05/2017}, publisher = {IEEE}, organization = {IEEE}, address = {Madrid, Spain}, abstract = {

Deep neural networks have gained popularity in recent years, obtaining outstanding results in a wide range of applications such as computer vision, in both academia and multiple industry areas. The progress made in recent years cannot be understood without taking into account the technological advancements seen in key domains such as High Performance Computing, more specifically in the Graphic Processing Unit (GPU) domain. These kinds of deep neural networks need massive amounts of data to effectively train their millions of parameters, and this training can take up to days or weeks depending on the computer hardware in use. In this work, we present how the training of a deep neural network can be parallelized on a distributed GPU cluster. The effect of distributing the training process is addressed from two different points of view. First, the scalability of the task and its performance in the distributed setting are analyzed. Second, the impact of distributed training methods on the training times and final accuracy of the models is studied. We used TensorFlow on top of a GPU cluster of servers with 2 K80 GPU cards each, at the Barcelona Supercomputing Center (BSC). The results show an improvement for both focused areas. On one hand, the experiments show promising results for training a neural network faster: the training time is decreased from 106 hours to 16 hours in our experiments. On the other hand, we observe how increasing the number of GPUs in one node raises the throughput (images per second) in a near-linear way. Moreover, an additional distributed speedup of 10.3 is achieved with 16 nodes, taking the speedup of one node as baseline.
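The distributed pattern measured here is synchronous data parallelism; its core idea fits in a few lines (a toy sketch with illustrative names, not the experimental code):

    import numpy as np

    def sync_data_parallel_step(params, worker_batches, grad_fn, lr=0.01):
        # Each worker computes gradients on its own data shard;
        # gradients are averaged (an all-reduce on a real cluster)
        # and a single shared update is applied.
        grads = [grad_fn(params, batch) for batch in worker_batches]
        mean_grad = np.mean(grads, axis=0)
        return params - lr * mean_grad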

}, url = {http://easychair.org/smart-program/CCGRID2017/2017-05-15.html$\#$session:13550}, author = {Jordi Torres and Sastre, Francesc and Yag{\"u}es, Maurici and V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto} } @conference {cBazazian16, title = {Segmentation-based Multi-Scale Edge Extraction to Measure the Persistence of Features in Unorganized Point Clouds}, booktitle = {International Conference on Computer Vision Theory and Applications}, year = {2017}, month = {02/2017}, address = {Porto, Portugal}, abstract = {

Edge extraction has attracted a lot of attention in computer vision. The accuracy of extracting edges in point clouds can be a significant asset for a variety of engineering scenarios. To address this problem, we propose a segmentation-based multi-scale edge extraction technique.

In this approach, different regions of a point cloud are segmented by a global analysis according to the geodesic distance. Afterwards, a multi-scale operator is defined according to local neighborhoods. Thereupon, by applying this operator at multiple scales of the point cloud, the persistence of features is determined. We illustrate the proposed method by computing a feature weight that measures the likelihood of a point to be an edge, then detects the edge points based on that value at both global and local scales. Moreover, we evaluate quantitatively and qualitatively our method. Experimental results show that the proposed approach achieves a superior accuracy. Furthermore, we demonstrate the robustness of our approach in noisier real-world datasets.
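One common way to score how edge-like a point is at a given scale, compatible with the multi-scale scheme above though not necessarily the paper's exact operator, uses the eigenvalues of the local covariance:

    import numpy as np

    def edge_weight(points, neighbor_idx):
        # points: (N, 3) cloud; neighbor_idx: indices of the neighbours
        # of one point within one scale's radius.
        # Returns the surface variation lambda_min / (sum of lambdas),
        # in [0, 1/3]; higher values indicate sharper geometry.
        nb = points[neighbor_idx]
        cov = np.cov(nb.T)
        eigvals = np.sort(np.linalg.eigvalsh(cov))  # ascending order
        return eigvals[0] / (eigvals.sum() + 1e-12)

Averaging such weights over several neighbourhood radii yields a persistence measure of the kind the paper exploits.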

}, doi = {10.5220/0006092503170325}, url = {http://www.scitepress.org/DigitalLibrary/Link.aspx?doi=10.5220\%2f0006092503170325}, author = {D. Bazazian and Casas, J. and Ruiz-Hidalgo, J.} } @conference {xLidona, title = {Semantic Summarization of Egocentric Photo Stream Events}, booktitle = {ACM Multimedia 2017 Workshop on Lifelogging Tools and Applications}, year = {2017}, month = {10/2017}, publisher = {ACM}, organization = {ACM}, address = {Mountain View, CA, USA}, abstract = {

With the rapid increase in recent years of users of wearable cameras, and of the amount of data they produce, there is a strong need for automatic retrieval and summarization techniques. This work addresses the problem of automatically summarizing egocentric photo streams captured through a wearable camera by taking an image retrieval perspective. After removing non-informative images with a new CNN-based filter, images are ranked by relevance to ensure semantic diversity and finally re-ranked by a novelty criterion to reduce redundancy. To assess the results, a new evaluation metric is proposed which takes into account the non-uniqueness of the solution. Experimental results on a database of 7,110 images from 6 different subjects, evaluated by experts, gave 95.74\% expert satisfaction and a Mean Opinion Score of 4.57 out of 5.0.
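The rank-then-rerank step can be sketched as a greedy trade-off between relevance and novelty (a maximal-marginal-relevance-style illustration under our own notation, not the paper's exact criterion):

    import numpy as np

    def rerank(relevance, embeddings, k, lam=0.7):
        # relevance: (N,) relevance scores; embeddings: (N, D) image
        # features used to measure redundancy; lam trades the two off.
        X = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        sims = X @ X.T
        selected = [int(np.argmax(relevance))]
        while len(selected) < k:
            redundancy = sims[:, selected].max(axis=1)
            score = lam * relevance - (1 - lam) * redundancy
            score[selected] = -np.inf  # never re-pick a selected image
            selected.append(int(np.argmax(score)))
        return selected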

}, doi = {10.1145/3133202.3133204}, url = {https://dl.acm.org/citation.cfm?id=3133204}, author = {Lidon, Aniol and Bola{\~n}os, Marc and Dimiccoli, M. and Radeva, Petia and Garolera, Maite and Xavier Gir{\'o}-i-Nieto} } @conference {cRomero-Lopez, title = {Skin Lesion Classification from Dermoscopic Images using Deep Learning}, booktitle = {The 13th IASTED International Conference on Biomedical Engineering (BioMed 2017)}, year = {2017}, month = {02/2017}, address = {Innsbruck Austria}, abstract = {

The recent emergence of deep learning methods for medical image analysis has enabled the development of intelligent medical imaging-based diagnosis systems that can assist the human expert in making better decisions about a patient{\textquoteright}s health. In this paper we focus on the problem of skin lesion classification, particularly early melanoma detection, and present a deep-learning based approach to solve the problem of classifying a dermoscopic image containing a skin lesion as malignant or benign. The proposed solution is built around the VGGNet convolutional neural network architecture and uses the transfer learning paradigm. Experimental results are encouraging: on the ISIC Archive dataset, the proposed method achieves a sensitivity value of 78.66\%, which is significantly higher than the current state of the art on that dataset.

}, keywords = {Convolutional Neural Networks, deep learning, machine learning, Medical Decision Support Systems, Medical Image Analysis, Skin Lesions}, url = {http://upcommons.upc.edu/handle/2117/103386}, author = {Romero-Lopez, Adria and Burdick, Jack and Xavier Gir{\'o}-i-Nieto and Marques, Oge} } @mastersthesis {xRomero-Lopez, title = {Skin Lesion Detection from Dermoscopic Images using Convolutional Neural Networks}, year = {2017}, abstract = {

Advisors: Oge Marques (Florida Atlantic University) and Xavier Giro-i-Nieto (UPC)

The recent emergence of machine learning and deep learning methods for medical image analysis has enabled the development of intelligent medical imaging-based diagnosis systems that can assist physicians in making better decisions about a patient{\textquoteright}s health. In particular, skin imaging is a field where these new methods can be applied with a high rate of success.

This thesis focuses on the problem of automatic skin lesion detection, particularly on melanoma detection, by applying semantic segmentation and classification from dermoscopic images using a deep learning based approach. For the first problem, a U-Net convolutional neural network architecture is applied for an accurate extraction of the lesion region. For the second problem, the current model performs a binary classification (benign versus malignant) that can be used for early melanoma detection. The model is general enough to be extended to multi-class skin lesion classification. The proposed solution is built around the VGG-Net ConvNet architecture and uses the transfer learning paradigm. Finally, this work performs a comparative evaluation of classification alone (using the entire image) against a combination of the two approaches (segmentation followed by classification) in order to assess which of them achieves better classification results.

}, author = {Romero-Lopez, Adria}, editor = {Xavier Gir{\'o}-i-Nieto and Marques, Oge} } @conference {cCampos, title = {Skip RNN: Learning to Skip State Updates in Recurrent Neural Networks}, booktitle = {NIPS Time Series Workshop 2017}, year = {2017}, month = {08/2017}, address = {Long Beach, CA, USA}, abstract = {

Recurrent Neural Networks (RNNs) continue to show outstanding performance in sequence modeling tasks. However, training RNNs on long sequences often faces challenges like slow inference, vanishing gradients and difficulty in capturing long-term dependencies. In backpropagation through time settings, these issues are tightly coupled with the large, sequential computational graph resulting from unfolding the RNN in time. We introduce the Skip RNN model, which extends existing RNN models by learning to skip state updates, shortening the effective size of the computational graph. This model can also be encouraged to perform fewer state updates through a budget constraint. We evaluate the proposed model on various tasks and show how it can reduce the number of required RNN updates while preserving, and sometimes even improving, the performance of the baseline RNN models.
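The mechanism can be sketched as a binary gate that either applies the RNN cell or copies the previous state, with a budget term penalizing updates (a simplified rendering; in the real model the gate is learned end-to-end, which this sketch does not show):

    import numpy as np

    def skip_rnn_forward(cell, x_seq, h0, update_gates, cost_per_update=1e-4):
        # x_seq: (T, D) inputs; update_gates: (T,) binary decisions u_t;
        # cell(h, x) -> new hidden state (user-supplied RNN cell).
        h = h0
        for x, u in zip(x_seq, update_gates):
            h = u * cell(h, x) + (1 - u) * h  # update or copy the state
        budget_loss = cost_per_update * float(np.sum(update_gates))
        return h, budget_loss  # the penalty is added to the task loss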

}, url = {https://imatge-upc.github.io/skiprnn-2017-telecombcn/}, author = {V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto and Jordi Torres and Chang, Shih-Fu} } @article {xSalvador17a, title = {Snap a photo, get a recipe: pic2recipe uses AI to predict food ingredients}, year = {2017}, publisher = {Digital Trends}, abstract = {

No recipe, no problem. Using image recognition, we can now learn how to cook something just by analyzing a photo.

}, url = {https://www.digitaltrends.com/photography/pics2recipe-mit-research/}, author = {Amaia Salvador} } @conference {cLuque17, title = {Spatio-Temporal Road Detection from Aerial Imagery using CNNs}, booktitle = {International Conference on Computer Vision Theory and Applications}, year = {2017}, month = {2/2017}, address = {Porto, Portugal}, abstract = {

The main goal of this paper is to detect roads in aerial imagery recorded by drones. To achieve this, we propose a modification of SegNet, a deep fully convolutional neural network for image segmentation. In order to train this neural network, we have put together a database containing videos of roads from the point of view of a small commercial drone. Additionally, we have developed an image annotation tool based on the watershed technique, in order to perform a semi-automatic labeling of the videos in this database. The experimental results using our modified version of SegNet show a big improvement in the performance of the neural network when using aerial imagery, obtaining over 90\% accuracy.

}, doi = {10.5220/0006128904930500}, author = {Luque, B. and Morros, J.R. and Ruiz-Hidalgo, J.} } @phdthesis {dBellot17, title = {Study of Gene Regulatory Networks Inference Methods from Gene Expression Data}, year = {2017}, author = {Pau Bellot}, editor = {Salembier, P.} } @mastersthesis {xAssens, title = {The Temporal Dimension of Visual Attention Models}, year = {2017}, abstract = {

Program: Bachelor Degree on Telecommunications Science and Technologies (CITTEL)

Grade: A with honours (10.0/10.0)

This thesis explores methodologies for scanpath prediction on images using deep learning frameworks. As a preliminary step, we analyze the characteristics of the data provided by different datasets. We then explore the use of Convolutional Neural Networks (CNN) and Long Short-Term Memory (LSTM) networks for scanpath prediction. We observe that these models fail due to the highly stochastic nature of the data. With the gained insight, we propose a novel time-aware visual saliency representation named Saliency Volume, which averages scanpaths over multiple observers. Next, we explore the SalNet network and adapt it for saliency volume prediction, and we find several ways of generating scanpaths from saliency volumes. Finally, we fine-tuned our model for scanpath prediction on 360-degree images and successfully submitted it to the Salient360! Challenge from ICME. The source code and models are publicly available at https://github.com/massens/saliency-360salient-2017.

}, author = {Assens, Marc}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and Noel E. O{\textquoteright}Connor} } @conference {xDuarte, title = {Temporal-aware Cross-modal Embeddings for Video and Audio Retrieval}, booktitle = {NIPS 2017 Women in Machine Learning Workshop (WiML)}, year = {2017}, month = {12/2017}, publisher = {NIPS 2017 Women in Machine Learning Workshop}, organization = {NIPS 2017 Women in Machine Learning Workshop}, address = {Long Beach, CA, USA}, abstract = {

The increasing amount of videos online brings several opportunities for training self-supervised neural networks. In this work, we explore cross-modal embeddings between audio and vision by exploiting their alignment on YouTube videos.

Joint audio-visual embeddings allow creating links between audio and visual documents by projecting them to a common region of the feature space. They can be applied to enriching radio broadcasts with images, finding soundtracks for user-generated videos or simply enriching a topic search with both audio and video documents.

The idea of creating a joint embedding space across modalities has been exploited by other areas [3, 4]. However, joint representations between video frames and their audio have yet to be fully exploited. A similar approach to the proposed one was presented in [2], where a soundtrack was retrieved to match a music video. However, that work did not target a synchronization between both modalities.

We aim at training a temporal-aware embedding which can align both audio and visual tracks. We use the visual and audio features provided in the YouTube-8M dataset [1]. The dataset includes features at both the clip and frame (temporal window) level. We train embeddings for both scales and assess their quality in a retrieval problem, formulated as using the feature extracted from one modality to retrieve the most similar videos based on the features computed in the other modality.

We aim not only at finding related documents, but also at synchronizing both sequences. The alignment between the two sequences will rely on computing temporal-aware features with recurrent neural networks at different scales. At retrieval time, different scales will be assessed, and results will be evaluated both with ranking metrics and on Amazon Mechanical Turk.
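Once such an embedding is trained, retrieval reduces to a nearest-neighbour search across modalities; a minimal sketch with illustrative names:

    import numpy as np

    def cross_modal_retrieve(query_audio_emb, visual_embs, top_k=5):
        # Rank videos by cosine similarity between one audio embedding
        # and the visual embeddings of the whole collection.
        q = query_audio_emb / np.linalg.norm(query_audio_emb)
        V = visual_embs / np.linalg.norm(visual_embs, axis=1, keepdims=True)
        scores = V @ q
        return np.argsort(-scores)[:top_k]  # indices of the best matches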

References

[1] Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan. Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675, 2016.

[2] Sungeun Hong, Woobin Im, and Hyun S Yang. Deep learning for content-based, cross-modal retrieval of videos and music. arXiv preprint arXiv:1704.06761, 2017.

[3] Amaia Salvador, Nicholas Hynes, Yusuf Aytar, Javier Marin, Ferda Ofli, Ingmar Weber, and Antonio Torralba. Learning cross-modal embeddings for cooking recipes and food images. In CVPR, 2017.

[4] Liwei Wang, Yin Li, and Svetlana Lazebnik. Learning deep structure-preserving image-text embeddings. In CVPR, 2016.

}, author = {Amanda Duarte and Sur{\'\i}s, D{\'\i}dac and Amaia Salvador and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cLe, title = {Towards large scale multimedia indexing: A case study on person discovery in broadcast news}, booktitle = {International Workshop on Content-Based Multimedia Indexing - CBMI 2017}, year = {2017}, month = {06/2017}, address = {Firenze, Italy}, abstract = {

The rapid growth of multimedia databases and the human interest in their peers make indices representing the location and identity of people in audio-visual documents essential for searching archives. Person discovery in the absence of prior identity knowledge requires accurate association of audio-visual cues and detected names. To this end, we present 3 different strategies to approach this problem: clustering-based naming, verification-based naming, and graph-based naming. Each of these strategies utilizes different recent advances in unsupervised face / speech representation, verification, and optimization. To have a better understanding of the approaches, this paper also provides a quantitative and qualitative comparative study of these approaches using the associated corpus of the Person Discovery challenge at MediaEval 2016. From the results of our experiments, we can observe the pros and cons of each approach, thus paving the way for future promising research directions.

}, author = {Nam Le and Herv{\'e} Bredin and Gabriel Sargent and Miquel India and Paula Lopez-Otero and Claude Barras and Camille Guinaudeau and Guillaume Gravier and Gabriel Barbosa da Fonseca and Izabela Lyon Freire and Zenilton Patroc{\'\i}nio Jr. and Silvio Jamil F. Guimaraes and Gerard Mart{\'\i} and Morros, J.R. and Javier Hernando and Laura Docio-Fernandez and Carmen Garcia-Mateo and Sylvain Meignier and Jean-Marc Odobez} } @mastersthesis {xRoldan, title = {Visual Question Answering 2.0}, year = {2017}, abstract = {

This bachelor{\textquoteright}s thesis explores different deep learning techniques to solve the Visual Question-Answering (VQA) task, whose aim is to answer questions about images. We study different Convolutional Neural Networks (CNN) to extract the visual representation from images: Kernelized-CNN (KCNN), VGG-16 and Residual Networks (ResNet). We also analyze the impact of using pre-computed word embeddings trained on large datasets (GloVe embeddings). Moreover, we examine different techniques for joining representations from different modalities. This work has been submitted to the second edition of the Visual Question Answering Challenge, and obtained an accuracy of 43.48\%.

}, author = {Rold{\'a}n, Francisco}, editor = {Xavier Gir{\'o}-i-Nieto and Masuda-Mora, Issey and Pascual-deLaPuente, Santiago} } @conference {cFernandeza, title = {ViTS: Video Tagging System from Massive Web Multimedia Collections}, booktitle = {ICCV 2017 Workshop on Web-scale Vision and Social Media }, year = {2017}, month = {10/2017}, address = {Venice, Italy}, abstract = {

The popularization of multimedia content on the Web has raised the need to automatically understand, index and retrieve it. In this paper we present ViTS, an automatic Video Tagging System which learns from videos, their web context and comments shared on social networks. ViTS analyses massive multimedia collections by Internet crawling, and maintains a knowledge base that updates in real time with no need of human supervision. As a result, each video is indexed with a rich set of labels and linked with other related contents. ViTS is an industrial product under exploitation with a vocabulary of over 2.5M concepts, capable of indexing more than 150k videos per month. We compare the quality and completeness of our tags with respect to the ones in the YouTube-8M dataset, and we show how ViTS enhances the semantic annotation of the videos with a larger number of labels (10.04 tags/video), with an accuracy of 80.87\%.

}, author = {Fern{\`a}ndez, D{\`e}lia and David Varas and Espadaler, Joan and Ferreira, Jordi and Woodward, Alejandro and Rodr{\'\i}guez, David and Xavier Gir{\'o}-i-Nieto and Riveiro, Juan Carlos and Bou, Elisenda} } @conference {cCasamitjana16, title = {3D Convolutional Neural Networks for Brain Tumor Segmentation}, booktitle = {MICCAI 2016 - Brain Lesion Workshop (BrainLes), Multimodal Brain Tumor Segmentation Challenge (BRATS)}, year = {2016}, month = {11/2016}, author = {Adri{\`a} Casamitjana and Santi Puch and Asier Aduriz and Elisa Sayrol and Ver{\'o}nica Vilaplana} } @conference {cLin16a, title = {3D Point Cloud Segmentation Oriented to The Analysis of Interactions}, booktitle = {The 24th European Signal Processing Conference (EUSIPCO 2016)}, year = {2016}, month = {08/2016}, publisher = {Eurasip}, organization = {Eurasip}, address = {Budapest, Hungary}, abstract = {

Given the widespread availability of point cloud data from consumer depth sensors, 3D point cloud segmentation becomes a promising building block for high-level applications such as scene understanding and interaction analysis. It benefits from the richer information contained in real world 3D data compared to 2D images. This also implies that the classical color segmentation challenges have shifted to RGBD data, and new challenges have also emerged as the depth information is usually noisy, sparse and unorganized. Meanwhile, the lack of 3D point cloud ground truth labeling also limits the development and comparison among methods in 3D point cloud segmentation. In this paper, we present two contributions: a novel graph-based point cloud segmentation method for RGBD stream data with interacting objects, and a new ground truth labeling for a previously published data set. This data set focuses on interaction (merge and split between {\textquoteright}object{\textquoteright} point clouds), which differentiates it from the few existing labeled RGBD data sets, which are more oriented to Simultaneous Localization And Mapping (SLAM) tasks. The proposed point cloud segmentation method is evaluated with the 3D point cloud ground truth labeling. Experiments show the promising results of our approach.

}, url = {http://www.eusipco2016.org/$\#$Wed1030}, author = {X. Lin and Casas, J. and M. Pard{\`a}s} } @conference {cLin, title = {3D Point Cloud Video Segmentation Based on Interaction Analysis}, booktitle = {ECCV 2016: Computer Vision {\textendash} ECCV 2016 Workshops}, volume = {III}, number = {III}, year = {2016}, month = {10/2016}, pages = {821 - 835}, publisher = {Springer}, organization = {Springer}, address = {Amsterdam}, abstract = {

Given the widespread availability of point cloud data from consumer depth sensors, 3D segmentation becomes a promising building block for high-level applications such as scene understanding and interaction analysis. It benefits from the richer information contained in real-world 3D data compared to apparent (projected) data in 2D images. This also implies that the classical color segmentation challenges have recently shifted to RGBD data, while new challenges emerge as 3D information from depth measurements is usually noisy, sparse and unorganized. We present a novel segmentation approach for 3D point cloud video based on low-level features and oriented to the analysis of object interactions. A hierarchical representation of the input point cloud is proposed to efficiently segment 3D data at the finer level, and to temporally establish the correspondence between segments, while dynamically managing object splits and merges at the coarser level. Experiments illustrate promising results and its potential application in object interaction analysis.

}, issn = {978-3-319-49409-8}, doi = {10.1007/978-3-319-49409-8}, url = {http://eecs.oregonstate.edu/IWVS_workshop/}, author = {X. Lin and Casas, J. and M. Pard{\`a}s} } @mastersthesis {xAduriz16, title = {Analysis of the dynamics of gray matter reduction in Alzheimer{\textquoteright}s Disease}, year = {2016}, abstract = {

Advisor: Ver{\'o}nica Vilaplana

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at Telecom BCN-ETSETB from the Technical University of Catalonia (UPC)

This project attempts to study the cerebral atrophy patterns in gray matter across the different stages of the Alzheimer{\textquoteright}s Disease (AD), or more specifically, along the entire AD continuum, in a voxelwise approach. To this end, we propose and implement an extensible toolbox that allows fitting different models to the data, hence defining a curve for each voxel that shows the evolution of the gray matter volume in the respective region as compared to the progression of the disease. The toolbox also includes several evaluation methods to estimate how closely the proposed model fits the data for each particular voxel. The resulting values, namely fitting-scores, serve as a basis to achieve two different goals: a) to identify the regions within the brain that are (most) likely to follow the curve shape specified in a given model, and b) to depict the model that best describes the behavior of the gray matter volume in each voxel from a fixed set of models.

}, author = {Asier Aduriz}, editor = {Ver{\'o}nica Vilaplana} } @article {aPerez-Pellitero16, title = {Antipodally Invariant Metrics For Fast Regression-Based Super-Resolution}, journal = {IEEE Transactions on Image Processing}, volume = {25}, year = {2016}, month = {06/2016}, pages = {2468}, chapter = {2456}, abstract = {

Dictionary-based Super-Resolution algorithms usually select dictionary atoms based on distance or similarity metrics. Although the optimal selection of nearest neighbors is of central importance for such methods, the impact of using proper metrics for Super-Resolution (SR) has been overlooked in the literature, mainly due to the vast usage of Euclidean distance. In this paper we present a very fast regression-based algorithm which builds on densely populated anchored neighborhoods and sublinear search structures. We perform a study of the nature of the features commonly used for SR, observing that those features usually lie on the unitary hypersphere, where every point has a diametrically opposite one, i.e. its antipode, with the same module and angle, but opposite direction. Even though we validate the benefits of using antipodally invariant metrics, most binary splits use Euclidean distance, which does not handle antipodes optimally. In order to benefit from both worlds, we propose a simple yet effective Antipodally Invariant Transform (AIT) that can easily be included in the Euclidean distance calculation. We modify the original Spherical Hashing algorithm with this metric in our Antipodally Invariant Spherical Hashing scheme, obtaining the same performance as a pure antipodally invariant metric. We round up our contributions with a novel feature transform that obtains a better coarse approximation of the input image thanks to Iterative Back Projection. Our method, which we named Antipodally Invariant Super-Resolution (AIS), improves quality (PSNR) and is faster than any other state-of-the-art method.
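The metric at stake is easy to state: for features on the unit hypersphere, an antipodally invariant distance treats x and -x as the same point (a sketch of the idea, not the paper's full AIT pipeline):

    import numpy as np

    def antipodal_distance(x, y):
        # Euclidean distance made invariant to antipodes:
        # d(x, y) == d(x, -y). For unit vectors this equals
        # sqrt(2 - 2 * abs(cos(angle between x and y))).
        return min(np.linalg.norm(x - y), np.linalg.norm(x + y))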

}, doi = {10.1109/TIP.2016.2549362}, url = {http://perezpellitero.github.io/project_websites/ais_sr.html}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @article {a, title = {Assessment of Crowdsourcing and Gamification Loss in User-Assisted Object Segmentation}, journal = {Multimedia Tools and Applications}, volume = {23}, year = {2016}, month = {11/2016}, chapter = {15901-15928}, abstract = {

There has been a growing interest in applying human computation -- particularly crowdsourcing techniques -- to assist in the solution of multimedia, image processing, and computer vision problems which are still too difficult to solve using fully automatic algorithms, and yet relatively easy for humans.

In this paper we focus on a specific problem -- object segmentation within color images -- and compare different solutions which combine color image segmentation algorithms with human efforts, either in the form of an explicit interactive segmentation task or through an implicit collection of valuable human traces with a game. We use Click{\textquoteright}n{\textquoteright}Cut, a friendly, web-based, interactive segmentation tool that allows segmentation tasks to be assigned to many users, and Ask{\textquoteright}nSeek, a game with a purpose designed for object detection and segmentation.

The two main contributions of this paper are: (i) We use the results of Click{\textquoteright}n{\textquoteright}Cut campaigns with different groups of users to examine and quantify the crowdsourcing loss incurred when an interactive segmentation task is assigned to paid crowd-workers, comparing their results to the ones obtained when computer vision experts are asked to perform the same tasks. (ii) Since interactive segmentation tasks are inherently tedious and prone to fatigue, we compare the quality of the results obtained with Click{\textquoteright}n{\textquoteright}Cut with the ones obtained using a (fun, interactive, and potentially less tedious) game designed for the same purpose. We call this contribution the assessment of the gamification loss, since it refers to how much quality of segmentation results may be lost when we switch to a game-based approach to the same task.

We demonstrate that the crowdsourcing loss is significant when using all the data points from workers, but decreases substantially (and becomes comparable to the quality of expert users performing similar tasks) after performing a modest amount of data analysis and filtering out users whose data are clearly not useful. We also show that -- on the other hand -- the gamification loss is significantly more severe: the quality of the results drops roughly by half when switching from a focused (yet tedious) task to a more fun and relaxed game environment.

}, keywords = {Crowdsourcing, GWAP, Object detection, Object segmentation, Serious games}, issn = {1573-7721}, doi = {10.1007/s11042-015-2897-6}, url = {http://dx.doi.org/10.1007/s11042-015-2897-6}, author = {Carlier, Axel and Amaia Salvador and Cabezas, Ferran and Xavier Gir{\'o}-i-Nieto and Charvillat, Vincent and Marques, Oge} } @conference {cMohedanoa, title = {Bags of Local Convolutional Features for Scalable Instance Search}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)}, year = {2016}, month = {06/2016}, publisher = {ACM}, organization = {ACM}, address = {New York City, NY; USA}, abstract = {

Image representations extracted from convolutional neural networks (CNNs) have been shown to outperform hand-crafted features in multiple computer vision tasks, such as visual image retrieval. This work proposes a simple pipeline for encoding the local activations of a convolutional layer of a pre-trained CNN using the well-known bag of words aggregation scheme (BoW). Assigning each local array of activations in a convolutional layer to a visual word produces an \textit{assignment map}, a compact representation that relates regions of an image with a visual word. We use the assignment map for fast spatial reranking, obtaining object localizations that are used for query expansion. We demonstrate the suitability of the Bag of Words representation based on local CNN features for image retrieval, achieving state-of-the-art performance on the Oxford and Paris buildings benchmarks. We show that our proposed system for CNN feature aggregation with BoW outperforms state-of-the-art techniques using sum pooling on a subset of the challenging TRECVid INS benchmark.
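
A minimal sketch of the assignment-map idea, assuming random stand-in activations and a k-means vocabulary (not the paper{\textquoteright}s code; all sizes are illustrative):

    import numpy as np
    from sklearn.cluster import KMeans

    # Toy conv activations: an H x W spatial grid with one D-dim local
    # feature per position (in practice, taken from a pre-trained CNN).
    H, W, D, K = 16, 16, 64, 25
    rng = np.random.default_rng(0)
    feats = rng.normal(size=(H * W, D))

    # Learn a visual vocabulary and assign each local feature to a word.
    kmeans = KMeans(n_clusters=K, n_init=4, random_state=0).fit(feats)
    assignment_map = kmeans.labels_.reshape(H, W)  # region -> visual word

    # BoW image descriptor: L2-normalized histogram of word assignments.
    bow = np.bincount(assignment_map.ravel(), minlength=K).astype(float)
    bow /= np.linalg.norm(bow)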

Best poster award at ACM ICMR 2016

Overall acceptance rate in ICMR 2016: 30\%

}, keywords = {Bag of Words, Convolutional Neural Networks, Instance Retrieval}, isbn = {978-1-4503-4359-6}, doi = {http://dx.doi.org/10.1145/2911996.2912061}, url = {http://dx.doi.org/10.1145/2911996.2912061}, author = {Mohedano, Eva and Amaia Salvador and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Marqu{\'e}s, F.} } @conference {cPoignant16, title = {The CAMOMILE Collaborative Annotation Platform for Multi-modal, Multi-lingual and Multi-media Documents}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, year = {2016}, month = {05/2016}, address = {Portoro{\v z} (Slovenia)}, abstract = {

In this paper, we describe the organization and the implementation of the CAMOMILE collaborative annotation framework for multimodal, multimedia, multilingual (3M) data. Given the versatile nature of the analysis which can be performed on 3M data, the structure of the server was kept intentionally simple in order to preserve its genericity, relying on standard Web technologies. Layers of annotations, defined as data associated to a media fragment from the corpus, are stored in a database and can be managed through standard interfaces with authentication. Interfaces tailored specifically to the needed task can then be developed in an agile way, relying on simple but reliable services for the management of the centralized annotations. We then present our implementation of an active learning scenario for person annotation in video, relying on the CAMOMILE server; during a dry run experiment, the manual annotation of 716 speech segments was thus propagated to 3504 labeled tracks. The code of the CAMOMILE framework is distributed in open source.

}, keywords = {active learning, Annotation tool, collaborative annotation, multimedia, person annotation}, isbn = {978-2-9517408-9-1}, url = {http://www.lrec-conf.org/proceedings/lrec2016/pdf/456_Paper.pdf}, author = {Johann Poignant and Mateusz Budnik and Herv{\'e} Bredin and Claude Barras and Mickael Stefas and Pierrick Bruneau and Gilles Adda and Laurent Besacier and Hazim Ekenel and Gil Francopoulo and Javier Hernando and Joseph Mariani and Morros, J.R. and Georges Qu{\'e}not and Sophie Rosset and Thomas Tamisier} } @mastersthesis {xFernandez, title = {Clustering and Prediction of Adjective-Noun Pairs for Affective Computing}, year = {2016}, abstract = {

Student: D{\`e}lia Fern{\`a}ndez

Advisors: V{\'\i}ctor Campos (UPC), Brendan Jou (Columbia University), Xavier Gir{\'o}-i-Nieto (UPC) and Shih-Fu Chang (Columbia University)

Grade: A+ (10.0/10.0) - Best Master Thesis award (Class 2016)

One of the main problems in visual affective computing is overcoming the affective gap between low-level visual features and the emotional content of the image. One rising method to capture visual affect is through the use of Adjective-Noun Pairs (ANP), a mid-level affect representation. This thesis addresses two challenges related to ANPs: representing ANPs in a structured ontology and improving ANP detectability. The first part develops two techniques to exploit relations between adjectives and nouns for automatic ANP clustering. The second part introduces and analyzes a novel deep neural network for ANP prediction. Based on the hypothesis of a different contribution of the adjective and the noun depending on the ANP, the novel network fuses the feature representations of adjectives and nouns from two independently trained convolutional neural networks.

}, author = {Fern{\`a}ndez, D{\`e}lia}, editor = {V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto and Chang, Shih-Fu} } @article {aMaceira, title = {Depth map compression via 3D region-based representation}, journal = {Multimedia Tools and Applications}, year = {2016}, month = {07/2016}, abstract = {

The final publication is available at Springer via http://dx.doi.org/10.1007/s11042-016-3727-1

In 3D video, view synthesis is used to create new virtual views between encoded camera views. Errors in the coding of the depth maps introduce geometry inconsistencies in synthesized views. In this paper, a new 3D plane representation of the scene is presented which improves the performance of current standard video codecs in the view synthesis domain. Two image segmentation algorithms are proposed for generating a color and depth segmentation. Using both partitions, depth maps are segmented into regions without sharp discontinuities without having to explicitly signal all depth edges. The resulting regions are represented using a planar model in the 3D world scene. This 3D representation allows an efficient encoding while preserving the 3D characteristics of the scene. The 3D planes open up the possibility of coding multiview images with a unique representation.
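
A minimal sketch of the planar-model idea, assuming a least-squares fit of d = a*u + b*v + c over the depth samples of one segmented region (an illustration, not the paper{\textquoteright}s encoder):

    import numpy as np

    def fit_plane(us, vs, depths):
        # Least-squares planar model d = a*u + b*v + c for one region
        # (us, vs: pixel coordinates; depths: depth values).
        A = np.column_stack([us, vs, np.ones_like(us, dtype=float)])
        coeffs, *_ = np.linalg.lstsq(A, depths, rcond=None)
        return coeffs  # (a, b, c): three numbers encode the whole region

    # Toy region: noisy samples from a true plane d = 0.1u - 0.05v + 2.
    rng = np.random.default_rng(0)
    us, vs = rng.integers(0, 100, 200), rng.integers(0, 100, 200)
    d = 0.1 * us - 0.05 * vs + 2 + rng.normal(scale=0.01, size=200)
    a, b, c = fit_plane(us, vs, d)  # recovers approx. (0.1, -0.05, 2)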

}, keywords = {3D representation, Data Compression, Depth map coding, Image segmentation}, doi = {10.1007/s11042-016-3727-1}, url = {http://rdcu.be/nqyE}, author = {Maceira, M. and Morros, J.R. and Ruiz-Hidalgo, J.} } @mastersthesis {xCarne-Herrera16, title = {Detect Snap Points in Egocentric Images with Physiological Signals}, year = {2016}, abstract = {

This project addresses a novel problem that has appeared in recent years. The use of egocentric cameras, devices that capture images of what we see, is growing. These images raise two main problems: big data (by the end of the day we can have thousands of images, some of them similar to each other and some of bad quality or low information) and image retrieval (given such volumes, finding a certain moment is very difficult, and if this problem is not addressed the properties of egocentric images become useless).

This work has two objectives. The first one is to explore images that have associated physiological signals, so that retrieval can rely on physiological features in addition to the visual features used in the current state of the art. For this part we treat memorable images as interesting ones, looking for a correlation between memorability and physiological signals. The second objective is to deal with the egocentric paradigm: recent works show that machine learning algorithms trained on intentionally taken images do not extend well to egocentric images because of how these images are composed. Based on previous work from MIT (Massachusetts Institute of Technology), I built a visual game that allows the memorability of images to be annotated through simple user interaction (users do not know they are annotating images while playing). From this game I computed memorability scores and compared them with the scores predicted by MemNet, the convolutional neural network presented in the MIT work, in order to decide whether these algorithms can be applied to egocentric images.

}, author = {Carn{\'e}-Herrera, Marc}, editor = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @conference {cMarsden, title = {Dublin City University and Partners{\textquoteright} Participation in the INS and VTT Tracks at TRECVid 2016}, booktitle = {TRECVID Workshop 2016}, year = {2016}, month = {11/2016}, address = {Gaithersburg, MD, USA}, abstract = {

DCU participated with a consortium of colleagues from NUIG and UPC in two tasks, INS and VTT. For the INS task we developed a framework consisting of face detection and representation and place detection and representation, with a user annotation of top-ranked videos. For the VTT task we ran 1,000 concept detectors from the VGG-16 deep CNN on 10 keyframes per video and submitted 4 runs for caption re-ranking, based on BM25, Fusion, Word2Vec and a fusion of baseline BM25 and Word2Vec. With the same pre-processing for caption generation we used an open source image-to-caption CNN-RNN toolkit, NeuralTalk2, to generate a caption for each keyframe and combine them.

}, url = {http://doras.dcu.ie/21484/}, author = {Marsden, Mark and Mohedano, Eva and McGuinness, Kevin and Calafell, Andrea and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Zhou, Jiang and Azevedo, Lucas and Daubert, Tobias and Davis, Brian and H{\"u}rlimann, Manuela and Afli, Haithem and Du, Jinhua and Ganguly, Debasis and Li, Wei and Way, Andy and Smeaton, Alan F.} } @unpublished {xBellver, title = {Efficient search of objects in images using deep reinforcement learning}, journal = {NIPS Women in Machine Learning Workshop}, year = {2016}, type = {Abstract}, address = {Barcelona.}, abstract = {

When we humans look at an image, we perform a sequential extraction of information in order to understand its content. First, we fix our gaze on the most salient part of the image, and from the information extracted we guide our look towards another point of the image, until we have analyzed all its relevant information. This is our natural and instinctive behaviour for gathering information from our surroundings. Traditionally in computer vision, images have been analysed at the local scale with a sliding-window scanning, often at different scales. This approach analyses the different parts of the image independently, without exploiting any correlation among them. Just by introducing a hierarchical partition of the image, we can more easily exploit the correlation between regions through a top-down scanning which first takes a global view of the image and then sequentially focuses on the local parts that contain the relevant information (e.g. objects or faces). Moreover, if we train a deep architecture that does not reward regions observed independently, such as traditional object proposals, but rewards successful long-term searches by connecting the different regions observed, we can achieve a sequential detection of objects, which is proven to be richer in information than using simple independent fixations.

The goal of this ongoing research is to perform an efficient detection of objects in images. In order to be efficient, the key idea is to focus on those parts of the image which contain richer information and zoom in on them, guiding a hierarchical search for objects. An intelligent entity capable of deciding where to focus attention in the image is trained using deep reinforcement learning techniques. This RL agent first looks at the whole image and decides which part of a quadtree partition is richest for finding a certain category of objects. The reinforcement learning agent is trained with deep Q-learning, using an architecture similar to the one used by DeepMind [1].

This work is based on the key idea that with reinforcement learning we can perform a sequential search that rewards short sequences of searches that obtain the highest long-term reward in terms of intersection over union of predicted bounding boxes and ground truth bounding boxes.

The input of the network is a convolutional descriptor of the region observed at the current step and a history vector that describes the previous steps of the search. This idea was also used in [2]. The main difference from that approach is that we use a fixed hierarchical partition to guide our sequential search. Furthermore, in order to be efficient, sharing convolutional features is a key aspect of our system{\textquoteright}s pipeline: convolutional features from VGG-16 [3] are extracted once from the initial full-resolution picture, and the descriptors for each subpartition are then cropped from this feature map.
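
A minimal sketch of this feature-sharing scheme, assuming mean-pooled crops from a single image-level feature map (the sizes and pooling are illustrative assumptions, not the exact pipeline):

    import numpy as np

    def quadtree_children(x0, y0, x1, y1):
        # Split a window into the four quadrants of a quadtree partition.
        mx, my = (x0 + x1) // 2, (y0 + y1) // 2
        return [(x0, y0, mx, my), (mx, y0, x1, my),
                (x0, my, mx, y1), (mx, my, x1, y1)]

    def region_descriptor(feature_map, box):
        # Crop a region's descriptor from the image-level conv feature map
        # (computed once for the whole image) instead of re-running the CNN.
        x0, y0, x1, y1 = box
        return feature_map[y0:y1, x0:x1].mean(axis=(0, 1))  # pooled vector

    # Stand-in for a VGG-16-style conv feature map of the full image.
    feature_map = np.random.default_rng(0).normal(size=(32, 32, 512))
    descs = [region_descriptor(feature_map, b)
             for b in quadtree_children(0, 0, 32, 32)]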

References:

[1] Mnih, V., Kavukcuoglu, K., Silver, D., Graves, A., Antonoglou, I., Wierstra, D., \& Riedmiller, M. (2013). Playing atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602.

[2] Caicedo, J. C., \& Lazebnik, S. (2015). Active object localization with deep reinforcement learning. In Proceedings of the IEEE International Conference on Computer Vision (pp. 2488-2496).

[3] Simonyan, K., \& Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. International Conference on Learning Representations (ICLR) 2015.

}, author = {M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @article {xCarne-Herrera, title = {EgoMemNet: Visual Memorability Adaptation to Egocentric Images}, year = {2016}, month = {06/2016}, institution = {4th Workshop on Egocentric (First-Person) Vision, CVPR 2016}, address = {Las Vegas, NV, USA}, abstract = {

This work explores the adaptation of visual memorability prediction from photos intentionally captured with handheld cameras to images passively captured from an egocentric point of view by wearable cameras. The estimation of a visual memorability score for egocentric images is a valuable cue when filtering among the large amounts of photos generated by wearable cameras. For this purpose, a new annotation tool and annotated dataset are presented and used to fine-tune a pre-trained convolutional neural network.

Extended abstract presented as a poster in the 4th Workshop on Egocentric (First-Person) Vision, CVPR 2016.

}, author = {Carn{\'e}-Herrera, Marc and Xavier Gir{\'o}-i-Nieto and Gurrin, Cathal} } @mastersthesis {xCherto, title = {EgoMon Gaze and Video Dataset for Visual Saliency Prediction}, year = {2016}, abstract = {

This project focuses on the creation of a new type of egocentric (first-person) vision dataset. For that purpose, the EgoMon Gaze \& Video Dataset is presented. The EgoMon dataset was recorded using eye-gaze tracking technology, which studies the movement and position of the eyes. The Tobii glasses (a wearable, head-mounted eye tracker) were the main tool used to record and extract the gaze data for this dataset. The dataset consists of 7 videos of 34 minutes each on average, 13,428 frames extracted from the videos (at a frequency of 1 fps), and 7 files with the gaze data (fixation points of the wearer of the glasses) for each frame and video. The videos were recorded in the city of Dublin (Ireland), both indoors and outdoors. The generated dataset has been used to evaluate the performance of a state-of-the-art model for visual saliency prediction on egocentric video.

Grade: B (8.2/10.0)

}, author = {Chert{\'o}, M{\`o}nica}, editor = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @conference {cSalvadorb, title = {Faster R-CNN Features for Instance Search}, booktitle = {CVPR Workshop Deep Vision}, year = {2016}, month = {06/2016}, abstract = {

Image representations derived from pre-trained Convolutional Neural Networks (CNNs) have become the new state of the art in computer vision tasks such as instance retrieval. This work explores the suitability for instance retrieval of image- and region-wise representations pooled from an object detection CNN such as Faster R-CNN. We take advantage of the object proposals learned by a Region Proposal Network (RPN) and their associated CNN features to build an instance search pipeline composed of a first filtering stage followed by a spatial reranking. We further investigate the suitability of Faster R-CNN features when the network is fine-tuned for the same objects one wants to retrieve. We assess the performance of our proposed system with the Oxford Buildings 5k, Paris Buildings 6k and a subset of TRECVid Instance Search 2013, achieving competitive results.
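
A minimal sketch of the two-stage search, assuming cosine similarity, random stand-in descriptors and max-pooling over region scores (an illustration of the filtering + reranking structure, not the paper{\textquoteright}s implementation):

    import numpy as np

    def cosine(a, b):
        return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

    def search(query_vec, image_vecs, region_vecs_per_image, top_k=100):
        # Stage 1: filter the database with image-level descriptors.
        first = np.argsort([-cosine(query_vec, v) for v in image_vecs])[:top_k]
        # Stage 2: rerank the short list by the best-matching region proposal.
        scored = [(max(cosine(query_vec, r) for r in region_vecs_per_image[i]), i)
                  for i in first]
        return [i for _, i in sorted(scored, reverse=True)]

    rng = np.random.default_rng(0)
    db_imgs = [rng.normal(size=256) for _ in range(500)]
    db_regions = [[rng.normal(size=256) for _ in range(10)] for _ in range(500)]
    ranking = search(rng.normal(size=256), db_imgs, db_regions)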

}, url = {http://www.cv-foundation.org/openaccess/content_cvpr_2016_workshops/w12/papers/Salvador_Faster_R-CNN_Features_CVPR_2016_paper.pdf}, author = {Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Satoh, Shin{\textquoteright}ichi} } @mastersthesis {xCata16, title = {Feature Selection Methods for Predicting Pre-Clinical Stage in Alzheimer{\textquoteright}s Disease}, year = {2016}, abstract = {

Advisors: Ver{\'o}nica Vilaplana, Adri{\`a} Casamitjana

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Alzheimer{\textquoteright}s disease is still an incurable disease. Nevertheless, some of its biomarkers undergo changes in the early stages of the disease, long before clinical symptoms appear. In order to determine how biomarkers obtained from magnetic resonance imaging (MRI) techniques affect the disease{\textquoteright}s evolution, machine learning techniques have been used to design and implement a classification system to predict the stage to which each patient belongs. One of the main objectives of this project is reducing the amount of data to manage, since MRI provides a large volume of data for each patient. As a result, we focus on the feature reduction and extraction stage of the classifier, which may be relevant for the mentioned problem. We carry out an exhaustive analysis of different feature selection methods applied to biomedical data related to Alzheimer{\textquoteright}s disease; the results obtained will also be applicable to other fields. Finally, we assess these methods on a multimodal database provided through the collaboration agreement with the Pasqual Maragall Foundation (FPM).

}, author = {Marcel Cat{\`a}}, editor = {Ver{\'o}nica Vilaplana and Adri{\`a} Casamitjana} } @conference {cLin16, title = {Graph based Dynamic Segmentation of Generic Objects in 3D}, booktitle = {CVPR SUNw: Scene Understanding Workshop}, year = {2016}, month = {06/2016}, address = {Las Vegas, US}, abstract = {

We propose a novel 3D segmentation method for RGBD stream data to deal with the 3D object segmentation task in a generic scenario with frequent object interactions. It makes two main contributions, while being generic and not requiring initialization: firstly, a novel tree structure representation for the point cloud of the scene is proposed; then, a dynamic management mechanism for connected component splits and merges exploits this tree structure representation.

}, url = {http://sunw.csail.mit.edu/posters.html}, author = {X. Lin and Casas, J. and M. Pard{\`a}s} } @conference {cPerez-Pellitero16, title = {Half Hypersphere Confinement for Piecewise Linear Regression}, booktitle = {IEEE Winter Conference on Applications of Computer Vision}, year = {2016}, month = {03/2016}, address = {Lake Placid, NY, USA}, abstract = {

In this paper we study the characteristics of the metrics best suited for piecewise regression algorithms, in which comparisons are usually made between normalized vectors that lie on the unitary hypersphere. Even though Euclidean distance has been widely used for this purpose, it is suboptimal since it does not handle antipodal points (i.e. diametrically opposite points) properly. Therefore, we propose the usage of antipodally invariant metrics and introduce the Half Hypersphere Confinement (HHC), a fast alternative to Multidimensional Scaling (MDS) that allows mapping antipodally invariant distances in the Euclidean space with very little approximation error. Our method, which we named HHC Regression (HHCR), applied to Super-Resolution (SR) improves in quality (PSNR) and is faster than any other state-of-the-art method. Additionally, under an application-agnostic interpretation of our regression framework, we also test our algorithm for denoising and depth upscaling with promising results.
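
A minimal sketch of the confinement idea, assuming a simple sign-flip with respect to a reference direction (an illustration only; the paper{\textquoteright}s HHC mapping is more elaborate):

    import numpy as np

    def half_hypersphere_confine(X, ref=None):
        # Map unit vectors so all lie on the same half of the hypersphere:
        # any vector on the far side (negative dot product with a reference
        # direction) is replaced by its antipode. After the mapping, plain
        # Euclidean distance approximates an antipodally invariant metric.
        X = X / np.linalg.norm(X, axis=1, keepdims=True)
        if ref is None:
            ref = X.mean(axis=0)  # a crude reference axis, for illustration
        signs = np.where(X @ ref >= 0, 1.0, -1.0)
        return X * signs[:, None]

    X = np.random.default_rng(0).normal(size=(100, 16))
    Xh = half_hypersphere_confine(X)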

}, doi = {10.1109/WACV.2016.7477651}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @unpublished {xFernandeza, title = {Is a {\textquotedblleft}happy dog{\textquotedblright} more {\textquotedblleft}happy{\textquotedblright} than {\textquotedblleft}dog{\textquotedblright}? - Adjective and Noun Contributions for Adjective-Noun Pair prediction}, journal = {NIPS Women in Machine Learning Workshop}, year = {2016}, month = {12/2016}, address = {Barcelona}, abstract = {

Computers are acquiring increasing ability to detect high level visual content such as objects in images, but often lack an affective comprehension of this content. Affective computing is useful for behavioral sciences, with applications in brand monitoring or advertisement effect. The main problem of the visual task of mapping affect or emotions to images is overcoming the affective gap between low-level features and the image emotional content.

One rising method to capture visual affections is through the use of Adjective-Noun Pair (ANP). ANPs were introduced as a mid-level affect representation to overcome the affective gap by combining nouns, which define the object content, and adjectives, which add a strong emotional bias, yielding concepts such as {\textquotedblleft}happy dog{\textquotedblright} or {\textquotedblleft}misty morning{\textquotedblright}.

Current state-of-the-art methods approach ANP prediction by training visual classifiers on these pairs. In this work, we hypothesize that the visual contribution of nouns and adjectives differs across ANPs. We propose a feature-based intermediate representation for ANP prediction using specialized convolutional networks for adjectives and nouns separately. By fusing a representation from nouns and adjectives, the network learns how much the nouns and adjectives contribute to each ANP, which a single-tower network does not allow.

The specialized noun and adjective networks follow an AlexNet-styled architecture. These networks are fused into an intermediate feature representation, and ANPs are then learned from it using a fully-connected network. We investigate noun and adjective contributions with two kinds of fusion. The first fusion uses the output of the softmax layer: these are class-probability features, so all dimensions have class correspondence to adjectives and nouns. The second fusion uses the fc7 layer output: these features contain visual information, allowing interpretation of adjective and noun visual relevance. For the feature contributions of each ANP, we compute a deep Taylor decomposition [1].
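
A minimal sketch of the two fusion variants, assuming random stand-in tower outputs (the class counts follow the abstract; everything else is illustrative):

    import numpy as np

    def softmax(z):
        e = np.exp(z - z.max())
        return e / e.sum()

    # Stand-in per-tower outputs for one image.
    rng = np.random.default_rng(0)
    adj_probs  = softmax(rng.normal(size=350))   # 350 adjective classes
    noun_probs = softmax(rng.normal(size=617))   # 617 noun classes

    # Fusion 1: concatenate class-probability features (each dimension
    # corresponds to a specific adjective or noun class).
    fused_probs = np.concatenate([adj_probs, noun_probs])   # 967-d

    # Fusion 2: concatenate fc7-style visual features from both towers;
    # an ANP classifier (a fully-connected net) is then trained on top.
    adj_fc7, noun_fc7 = rng.normal(size=4096), rng.normal(size=4096)
    fused_fc7 = np.concatenate([adj_fc7, noun_fc7])         # 8192-d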

For experiments, we use a subset of 1,200 ANPs from the tag-based English-MVSO [2] dataset. The ANPs are composed from combinations of 350 adjectives and 617 nouns. With identical settings to the adjective and noun networks, an ANP classification network is trained end-to-end as the baseline. Using the fc7 features, we improve over the baseline in both top-1 and top-5 accuracy. We also observe that adjectives and nouns contribute differently across ANPs; e.g. for the ANP {\textquotedblleft}pregnant woman{\textquotedblright} the adjective contributes the most, while for {\textquotedblleft}cute cat{\textquotedblright} the predominant contribution is in the noun. Using the probability features we find other insights, such as nouns or adjectives that co-occur: e.g. for {\textquotedblleft}happy halloween{\textquotedblright} the nouns {\textquotedblleft}blood{\textquotedblright} and {\textquotedblleft}cat{\textquotedblright} and the adjectives {\textquotedblleft}haunted{\textquotedblright} and {\textquotedblleft}dark{\textquotedblright} also contribute strongly.

Based on the experimental results, we confirm our hypothesis that adjectives and nouns contribute differently to ANP concepts. Furthermore, our architecture outperforms traditional methods while giving insights into the role of adjectives and nouns in the prediction.

[1] Montavon, Gr{\'e}goire, et al. "Deep Taylor Decomposition of Neural Networks." ICML Workshop on Visualization for Deep Learning, 2016.

[2] Jou, Brendan, et al. "Visual affect around the world: A large-scale multilingual visual sentiment ontology." ACMM, 2015.

}, author = {Fern{\`a}ndez, D{\`e}lia and V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto and Chang, Shih-Fu} } @conference {cBellver, title = {Hierarchical Object Detection with Deep Reinforcement Learning}, booktitle = {Deep Reinforcement Learning Workshop, NIPS 2016}, year = {2016}, month = {12/2016}, abstract = {

We present a method for performing hierarchical object detection in images guided by a deep reinforcement learning agent. The key idea is to focus on those parts of the image that contain richer information and zoom on them. We train an intelligent agent that, given an image window, is capable of deciding where to focus the attention among five different predefined region candidates (smaller windows). This procedure is iterated providing a hierarchical image analysis.

We compare two different candidate proposal strategies to guide the object search: with and without overlap. Moreover, our work compares two different strategies to extract features from a convolutional neural network for each region proposal: a first one that computes new feature maps for each region proposal, and a second one that computes the feature maps for the whole image to later generate crops for each region proposal.

Experiments indicate better results for the overlapping candidate proposal strategy and a loss of performance for the cropped image features due to the loss of spatial resolution. We argue that, while this loss seems unavoidable when working with large amounts of object candidates, the much smaller number of region proposals generated by our reinforcement learning agent makes it feasible to extract features for each location without sharing convolutional computation among regions.
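
A minimal sketch of five predefined region candidates with and without overlap (one plausible layout chosen for illustration; the paper defines its own candidates):

    def candidates(x0, y0, x1, y1, overlap=True):
        # Five sub-windows of a window: four corners plus a centre crop.
        # With overlap=True the sub-windows span 3/4 of each side (so they
        # overlap); with overlap=False the corners are disjoint quadrants.
        w, h = x1 - x0, y1 - y0
        f = 3 / 4 if overlap else 1 / 2
        cw, ch = int(w * f), int(h * f)
        return [
            (x0, y0, x0 + cw, y0 + ch),                 # top-left
            (x1 - cw, y0, x1, y0 + ch),                 # top-right
            (x0, y1 - ch, x0 + cw, y1),                 # bottom-left
            (x1 - cw, y1 - ch, x1, y1),                 # bottom-right
            (x0 + (w - cw) // 2, y0 + (h - ch) // 2,
             x0 + (w + cw) // 2, y0 + (h + ch) // 2),   # centre
        ]

    print(candidates(0, 0, 224, 224, overlap=True))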

}, author = {M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Jordi Torres} } @conference {cCasanova16, title = {Interactive Registration Method for 3D data Fusion}, booktitle = {IC3D}, year = {2016}, month = {12/2016}, publisher = {IEEE}, organization = {IEEE}, address = {Li{\`e}ge, Belgium}, abstract = {

Commercial depth sensors represent an opportunity for automation of certain 3D production and analysis tasks. One way to overcome some of their inherent limitations is by capturing the same scene with several depth sensors and merging their data, i.e. by performing 3D data fusion, which requires the registration of point clouds from different sensors. We propose a new interactive, fast and user-friendly method for depth sensor registration. We replace the traditional checkerboard pattern used to extract key points in the scene by a finger detector. This provides a main advantage: the method is easier to use and does not require external objects, while the elapsed time and the registration error are similar to those obtained through the classical method.

We test the proposed approach with an interactive hand-tracking application, improved to use more than a single sensor, and we show that the detection area increases by more than 70\%.

}, url = {http://www.3dstereomedia.eu/ic3d}, author = {A. Casanova and A. Pujol-Mir{\'o} and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cdeOliveira-Barraa, title = {Large Scale Content-Based Video Retrieval with LIvRE}, booktitle = {14th International Workshop on Content-based Multimedia Indexing (CBMI)}, year = {2016}, month = {06/2016}, publisher = {IEEE}, organization = {IEEE}, address = {Bucharest, Romania}, abstract = {

The fast growth of video data requires robust, efficient, and scalable systems to allow for indexing and retrieval. These systems must be accessible from lightweight, portable and usable interfaces to help users in management and search of video content. This demo paper presents LIvRE, an extension of an existing open source tool for image retrieval to support video indexing. LIvRE consists of three main system components (pre-processing, indexing and retrieval), as well as a scalable and responsive HTML5 user interface accessible from a web browser. LIvRE supports image-based queries, which are efficiently matched with the extracted frames of the indexed videos.

}, author = {Gabriel de Oliveira-Barra and Lux, Mathias and Xavier Gir{\'o}-i-Nieto} } @conference {cdeOliveira-Barra, title = {LEMoRe: A Lifelog Engine for Moments Retrieval at the NTCIR-Lifelog LSAT Task}, booktitle = {The 12th NTCIR Conference, Evaluation of Information Access Technologies}, year = {2016}, month = {06/2016}, publisher = {National Institute of Informatics (NII)}, organization = {National Institute of Informatics (NII)}, address = {Tokyo, Japan}, abstract = {

Semantic image retrieval from large amounts of egocentric visual data requires leveraging powerful techniques for filling in the semantic gap. This paper introduces LEMoRe, a Lifelog Engine for Moments Retrieval, developed in the context of the Lifelog Semantic Access Task (LSAT) of the NTCIR-12 challenge, and discusses its performance variation on different trials. LEMoRe integrates classical image descriptors with high-level semantic concepts extracted by Convolutional Neural Networks (CNN), powered by a graphical user interface that uses natural language processing. Although this is just a first attempt towards interactive image retrieval from large egocentric datasets and there is large room for improvement of the system components and the user interface, the structure of the system itself and the way the single components cooperate are very promising.

}, url = {http://research.nii.ac.jp/ntcir/workshop/OnlineProceedings12/NTCIR/toc_ntcir.html$\#$Lifelog}, author = {Gabriel de Oliveira-Barra and Xavier Gir{\'o}-i-Nieto and Cartas-Ayala, Alejandro and Radeva, Petia} } @conference {cGurrin, title = {LTA 2016 - The First Workshop on Lifelogging Tools and Applications}, booktitle = {ACM Multimedia}, year = {2016}, month = {10/2016}, publisher = {ACM}, organization = {ACM}, address = {Amsterdam, The Netherlands}, abstract = {

The organisation of personal data is receiving increasing research attention due to the challenges faced in gathering, enriching, searching and visualising these data. Given the increasing quantities of personal data being gathered by individuals, the concept of a lifelog digital library of rich multimedia and sensory content for every individual is fast becoming a reality. The LTA2016 lifelogging workshop at ACM MM 2016 aims to bring together academics and practitioners to discuss approaches to lifelog data analytics and their applications, and to debate the opportunities and challenges for researchers in this new and challenging area.

}, keywords = {lifelogging, Personal digital archives, Personal information management}, doi = {http://dx.doi.org/10.1145/2964284.2980534}, url = {http://lta2016.computing.dcu.ie/}, author = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto and Radeva, Petia and Dimiccoli, M. and Johansen, H. and Joho, H. and Singh, Vivek K} } @mastersthesis {xSantamarta16, title = {Machine learning for recommendation systems in job postings selection}, year = {2016}, abstract = {

This project is the result of the study, implementation and combination of several recommender system techniques, and it attempts to provide a solution to a challenge on job recommender systems. First, a study of the available recommender system techniques was carried out. After that, the information provided by the organizers was analyzed in order to design the most accurate system possible. The design and implementation of the recommender system models was the next step of the project. Finally, the results obtained were analyzed and discussed, identifying the best recommender system among those designed.

}, author = {Victor Marcos Santamarta}, editor = {Vidal, J. and Ver{\'o}nica Vilaplana} } @mastersthesis {xPuch16, title = {Nonlinear analysis toolbox for neurodegenerative diseases and aging}, year = {2016}, abstract = {

Advisor: Ver{\'o}nica Vilaplana

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Neurodegenerative diseases impose substantial public health burdens on populations throughout the world. Alzheimer{\textquoteright}s disease is among the major neurodegenerative diseases, and its causes and treatment are still unknown. Researchers around the world are conducting large data-driven studies in order to unveil the causes and biological mechanisms of such diseases, and for that reason automatic tools that allow uncovering statistically significant findings are needed.

To address this problem we present in this thesis a software toolbox that provides the tools to analyze the linear and nonlinear dynamics of gray matter and study the statistical significance of such dynamics at the voxel level. The toolbox features various fitting methods and fit evaluation metrics, an automatic hyperparameter look-up algorithm and several visualization and comparison tools.

All the features provided in this toolbox were tested on two real problems provided by the Pasqual Maragall Foundation, and the toolbox yielded results that were validated by the findings of the original studies.

}, author = {Santi Puch}, editor = {Ver{\'o}nica Vilaplana} } @mastersthesis {xFerri16, title = {Object Tracking in Video with TensorFlow}, year = {2016}, abstract = {

[Project repo]

[Additional repo for setting up the environment]

}, author = {Ferri, Andrea}, editor = {Xavier Gir{\'o}-i-Nieto and Jordi Torres and Amaia Salvador} } @mastersthesis {xMasuda-Mora, title = {Open-Ended Visual Question-Answering}, year = {2016}, abstract = {

Advisors: Santiago de la Puente and Xavier Gir{\'o}-i-Nieto

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Grade: A with honors (10/10.0)

This thesis studies methods to solve Visual Question-Answering (VQA) tasks with a Deep Learning framework. As a preliminary step, we explore Long Short-Term Memory (LSTM) networks used in Natural Language Processing (NLP) to tackle text-based Question-Answering. We then modify the previous model to accept an image as an input in addition to the question. For this purpose, we explore the VGG-16 and K-CNN convolutional neural networks to extract visual features from the image. These are merged with the word embedding or with a sentence embedding of the question to predict the answer. This work was successfully submitted to the Visual Question Answering Challenge 2016, where it achieved 53.62\% accuracy on the test dataset. The developed software has followed the best programming practices and Python code style, providing a consistent baseline in Keras for different configurations. The source code and models are publicly available at https://github.com/imatge-upc/vqa-2016-cvprw.

}, url = {https://upcommons.upc.edu/handle/2117/89671}, author = {Masuda-Mora, Issey}, editor = {Pascual-deLaPuente, Santiago and Xavier Gir{\'o}-i-Nieto} } @article {aSalembier, title = {Optimum Graph-Cuts for Pruning Binary Partition Trees of Polarimetric SAR images}, journal = {IEEE Transactions on Geoscience and Remote Sensing}, volume = {54}, year = {2016}, month = {05/2016}, pages = {5493 {\textendash} 5502}, abstract = {

This paper investigates several optimum graph-cut techniques for pruning Binary Partition Trees (BPTs) and their usefulness for low-level processing of Polarimetric SAR (PolSAR) images. BPTs group pixels to form homogeneous regions, which are hierarchically structured by inclusion in a binary tree. They provide multiple resolutions of description and easy access to subsets of regions. Once constructed, BPTs can be used for a large number of applications. Many of these applications consist of populating the tree with a specific feature and applying a graph-cut called pruning to extract a partition of the space. In this paper, different pruning examples involving the optimization of a global criterion are discussed and analyzed in the context of PolSAR image segmentation. Through objective evaluation of the resulting partitions by means of Precision and Recall curves for boundaries, the best pruning technique is identified and the influence of the tree construction on the performance is assessed.
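
For the common case of a global additive criterion, the optimal pruning can be found with a simple bottom-up dynamic program; a minimal sketch (an illustration, not one of the paper{\textquoteright}s specific techniques):

    class Node:
        def __init__(self, cost, left=None, right=None):
            self.cost, self.left, self.right = cost, left, right

    def optimal_pruning(node):
        # Either keep this node as a single region (its own cost), or cut
        # and take the optimal prunings of its two children (additive cost).
        if node.left is None:                    # leaf: must keep the region
            return node.cost, [node]
        lc, lr = optimal_pruning(node.left)
        rc, rr = optimal_pruning(node.right)
        if node.cost <= lc + rc:
            return node.cost, [node]             # keep the merged region
        return lc + rc, lr + rr                  # prune here: keep children

    # Toy BPT: the root is expensive as a single region, so the optimal
    # pruning cuts it into its two leaves (cost 2 + 3 = 5 < 10).
    tree = Node(10.0, Node(2.0), Node(3.0))
    cost, regions = optimal_pruning(tree)
    print(cost, len(regions))                    # 5.0 2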

}, author = {Salembier, P. and S. Foucher} } @conference {cPerez-Pellitero, title = {PSyCo: Manifold Span Reduction for Super Resolution}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition}, year = {2016}, month = {06/2016}, address = {Las Vegas, Nevada, USA}, abstract = {

The main challenge in Super Resolution (SR) is to discover the mapping between the low- and high-resolution manifolds of image patches, a complex ill-posed problem which has recently been addressed through piecewise linear regression with promising results. In this paper we present a novel regression-based SR algorithm that benefits from an extended knowledge of the structure of both manifolds. We propose a transform that collapses the 16 variations induced by the dihedral group of transforms (i.e. rotations, vertical and horizontal reflections) and antipodality (i.e. diametrically opposed points in the unitary sphere) into a single primitive. The key idea of our transform is to study the different dihedral elements as a group of symmetries within the high-dimensional manifold. We obtain the respective set of mirror-symmetry axes by means of a frequency analysis of the dihedral elements, and we use them to collapse the redundant variability through a modified symmetry distance. The experimental validation of our algorithm shows the effectiveness of our approach, which obtains competitive quality with a dictionary of as few as 32 atoms (reducing other methods{\textquoteright} dictionaries by at least a factor of 32) and further pushes the state of the art with a 1024-atom dictionary.
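
A minimal sketch of collapsing the 16 dihedral-and-antipodal variations into a single primitive, assuming a lexicographic canonical representative (an illustration; the paper uses a modified symmetry distance instead):

    import numpy as np

    def dihedral_antipodal_variants(patch):
        # All 16 variations of a patch: the 8 dihedral transforms
        # (4 rotations x optional flip) times antipodality (sign flip).
        variants = []
        for flip in (False, True):
            p = np.fliplr(patch) if flip else patch
            for k in range(4):
                r = np.rot90(p, k)
                variants.extend([r, -r])
        return variants

    def canonical(patch):
        # Collapse the redundant variants into one primitive by picking a
        # deterministic representative (lexicographically smallest here).
        return min(dihedral_antipodal_variants(patch),
                   key=lambda v: tuple(v.ravel()))

    a = np.arange(9.0).reshape(3, 3)
    assert np.allclose(canonical(a), canonical(-np.rot90(a)))  # same primitive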

}, url = {http://perezpellitero.github.io/}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @phdthesis {dVaras16, title = {Region-based Particle Filter Leveraged with a Hierarchical Co-clustering}, year = {2016}, month = {11/2016}, school = {UPC}, type = {PhD}, address = {Barcelona}, abstract = {

In this thesis, we exploit the hierarchical information associated with images to tackle two fundamental problems of computer vision: video object segmentation and video segmentation.

In the first part of the thesis, we present a video object segmentation approach that extends the well known particle filter algorithm to a region based image representation. Image partition is considered part of the particle filter measurement, which enriches the available information and leads to a reformulation of the particle filter theory. We define particles as unions of regions in the current image partition and their propagation is computed through a single optimization process. During this propagation, the prediction step is performed using a co-clustering between the previous image object partition and a partition of the current one, which allows us to tackle the evolution of non-rigid structures.

The second part of the thesis is devoted to the exploration of a co-clustering technique for video segmentation. This technique, given a collection of images and their associated hierarchies, clusters nodes from these hierarchies to obtain a coherent multiresolution representation of the image collection. We formalize the co-clustering as a Quadratic Semi-Assignment Problem and solve it with a linear programming relaxation approach that makes effective use of information from hierarchies. Initially, we address the problem of generating an optimal, coherent partition per image and, afterwards, we extend this method to a multiresolution framework. Finally, we particularize this framework to an iterative multiresolution video segmentation algorithm in sequences with small variations.

In the last part of the thesis, we validate the presented techniques for object and video segmentation, using the proposed algorithms as tools to tackle problems in a context for which they were not initially designed.

}, author = {David Varas}, editor = {Marqu{\'e}s, F.} } @conference {cPan, title = {Shallow and Deep Convolutional Networks for Saliency Prediction}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition, CVPR}, year = {2016}, month = {06/2016}, publisher = {Computer Vision Foundation / IEEE}, organization = {Computer Vision Foundation / IEEE}, address = {Las Vegas, NV, USA}, abstract = {

The prediction of salient areas in images has been traditionally addressed with hand-crafted features based on neuroscience principles. This paper, however, addresses the problem with a completely data-driven approach by training a convolutional neural network (convnet). The learning process is formulated as a minimization of a loss function that measures the Euclidean distance of the predicted saliency map with the provided ground truth. The recent publication of large datasets of saliency prediction has provided enough data to train end-to-end architectures that are both fast and accurate. Two designs are proposed: a shallow convnet trained from scratch, and another, deeper solution whose first three layers are adapted from another network trained for classification. To the authors{\textquoteright} knowledge, these are the first end-to-end CNNs trained and tested for the purpose of saliency prediction.
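
A minimal sketch of the training objective, assuming maps are compared with a per-image Euclidean (L2) loss (stand-in arrays; not the paper{\textquoteright}s training code):

    import numpy as np

    def euclidean_saliency_loss(pred, gt):
        # Squared Euclidean distance between each predicted saliency map
        # and its ground truth, averaged over the batch.
        pred = pred.reshape(pred.shape[0], -1)
        gt = gt.reshape(gt.shape[0], -1)
        return float(np.mean(np.sum((pred - gt) ** 2, axis=1)))

    rng = np.random.default_rng(0)
    pred = rng.random((4, 48, 48))    # batch of predicted maps
    gt = rng.random((4, 48, 48))      # ground-truth fixation maps
    print(euclidean_saliency_loss(pred, gt))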

}, url = {http://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Pan_Shallow_and_Deep_CVPR_2016_paper.pdf}, author = {Pan, Junting and McGuinness, Kevin and Elisa Sayrol and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xNespereira, title = {Siamese Convolutional Neural Network for Learning Object Similarities in RGB-D Images}, year = {2016}, abstract = {

Student: Alejandro Nespereira

Advisors: Farzad Husain (Catchoom), Tomasz Adamek (Catchoom) and Xavier Gir{\'o}-i-Nieto (UPC)

Program:\ Master in Computer Vision\ (Class of 2016)

This report explores the suitability of using a Siamese Convolutional Neural Network (CNN) for the task of false positive rejection. We present a Siamese CNN model trained with an in-house dataset of weakly textured objects. Our model is able to successfully verify the classifications of an object detection pipeline on unseen new objects. Additionally, we compare it with a hand-crafted method in order to benchmark its performance. We demonstrate the usage of our model by learning to discriminate between inter- and intra-object classes for a challenging dataset.

}, author = {Nespereira, Alejandro}, editor = {Husain, Farzad and Adamek, Tomasz and Xavier Gir{\'o}-i-Nieto} } @article {Pont-Tuset2015c, title = {Supervised Evaluation of Image Segmentation and Object Proposal Techniques}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI)}, volume = {38}, year = {2016}, pages = {1465 - 1478}, author = {Jordi Pont-Tuset and Marqu{\'e}s, F.} } @conference {cMontes, title = {Temporal Activity Detection in Untrimmed Videos with Recurrent Neural Networks}, booktitle = {1st NIPS Workshop on Large Scale Computer Vision Systems 2016}, year = {2016}, month = {12/2016}, abstract = {

This work proposes a simple pipeline to classify and temporally localize activities in untrimmed videos. Our system uses features from a 3D Convolutional Neural Network (C3D) as input to train a recurrent neural network (RNN) that learns to classify video clips of 16 frames. After clip prediction, we post-process the output of the RNN to assign a single activity label to each video, and determine the temporal boundaries of the activity within the video. We show how our system can achieve competitive results in both tasks with a simple architecture. We evaluate our method in the ActivityNet Challenge 2016, achieving a 0.5874 mAP and a 0.2237 mAP in the classification and detection tasks, respectively. Our code and models are publicly available at: https://imatge-upc.github.io/activitynet-2016-cvprw/
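
A minimal sketch of one plausible post-processing step, assuming per-clip class probabilities from the RNN (the label and segment heuristics are illustrative assumptions, not the paper{\textquoteright}s exact rules):

    import numpy as np

    def postprocess(clip_probs, min_len=2):
        # Pick the video-level label by mean probability over all clips,
        # then keep runs of clips predicted as that label as segments.
        video_label = int(clip_probs.mean(axis=0).argmax())
        active = clip_probs.argmax(axis=1) == video_label
        segments, start = [], None
        for t, a in enumerate(active):
            if a and start is None:
                start = t
            elif not a and start is not None:
                if t - start >= min_len:
                    segments.append((start * 16, t * 16))  # frame bounds
                start = None
        if start is not None and len(active) - start >= min_len:
            segments.append((start * 16, len(active) * 16))
        return video_label, segments

    probs = np.random.default_rng(0).random((30, 200))  # 30 clips, 200 classes
    print(postprocess(probs))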

}, author = {Montes, Alberto and Amaia Salvador and Pascual-deLaPuente, Santiago and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xMontes, title = {Temporal Activity Detection in Untrimmed Videos with Recurrent Neural Networks}, year = {2016}, abstract = {

Advisors: Amaia Salvador and Xavier Gir{\'o}-i-Nieto.

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Grade: A (9.8/10.0)

This thesis explores different approaches using Convolutional and Recurrent Neural Networks to classify and temporally localize activities in videos, and proposes an implementation to achieve it.

As a first step, features are extracted from the video frames using a state-of-the-art 3D Convolutional Neural Network. These features are fed into a recurrent neural network that solves the activity classification and temporal localization tasks in a simple and flexible way.

Different architectures and configurations have been tested in order to achieve the best performance and learning on the provided video dataset. In addition, different kinds of post-processing of the trained network{\textquoteright}s output have been studied to achieve better results on the temporal localization of activities in the videos.

The results provided by the neural network developed in this thesis were submitted to the ActivityNet Challenge 2016 of the CVPR, achieving competitive results using a simple and flexible architecture.

}, keywords = {deep learning, neural networks, videos}, author = {Montes, Alberto}, editor = {Amaia Salvador and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xReyesa, title = {Time-sensitive Egocentric Image Retrieval for Finding Objects in Lifelogs}, year = {2016}, abstract = {

Advisors: Eva Mohedano (Insight DCU), Kevin McGuinness (Insight DCU) and\ Xavier Gir{\'o}-i-Nieto\ (UPC)

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Grade: A (9.4/10.0)

This work explores diverse practices for conducting an object search over large amounts of egocentric images, taking into account their temporal information. The application of this technology is to identify where personal belongings were lost or forgotten. We develop a pipeline-structured system. Firstly, the images of the day being scanned are sorted by their probability of depicting the forgotten object. This stage is solved by applying an existing visual search engine based on deep learning features. Secondly, a learned threshold selects the top-ranked images as candidates to contain the object. Finally, the images are reranked based on temporal and diversity criteria. Furthermore, we build a validation environment for assessing the system{\textquoteright}s performance, aiming to find the optimal configuration of its parameters. Due to the lack of related works to compare with, this thesis proposes a novel evaluation framework and metric to assess the problem.
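
A minimal sketch of the threshold-and-rerank stages, assuming precomputed visual-search scores and image timestamps (the diversity heuristic is an illustrative assumption):

    import numpy as np

    def temporal_rerank(scores, timestamps, threshold, window=300.0):
        # Keep images whose visual-search score passes a learned threshold,
        # then demote images that fall close in time to an already selected
        # one, favouring temporal diversity in the top results.
        cand = [i for i in np.argsort(-scores) if scores[i] >= threshold]
        selected, deferred = [], []
        for i in cand:
            near = any(abs(timestamps[i] - timestamps[j]) < window
                       for j in selected)
            (deferred if near else selected).append(i)
        return selected + deferred

    rng = np.random.default_rng(0)
    scores, ts = rng.random(50), np.sort(rng.random(50) * 3600)
    print(temporal_rerank(scores, ts, threshold=0.5))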

}, author = {Reyes, Cristian}, editor = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @article {xMora, title = {Towards Automatic Generation of Question Answer Pairs from Images}, year = {2016}, institution = {Visual Question Answering Challenge Workshop, CVPR 2016}, address = {Las Vegas, NV, USA}, abstract = {

This extended abstract presents our research in the generic field of Visual Question-Answering (VQA), focusing on a new branch that aims to generate question-answer pairs based on an image. To do so, we use the VQA dataset provided for the VQA challenge to train a Deep Neural Network which has the image as an input and two different outputs, the question and its associated answer.

Extended abstract presented as poster in the Visual Question Answering Challenge Workshop, CVPR 2016.

}, url = {http://www.visualqa.org/abstracts.html}, author = {Masuda-Mora, Issey and Pascual-deLaPuente, Santiago and Xavier Gir{\'o}-i-Nieto} } @conference {cIndia, title = {UPC System for the 2016 MediaEval Multimodal Person Discovery in Broadcast TV task}, booktitle = {MediaEval 2016 Workshop}, year = {2016}, month = {10/2016}, address = {Hilversum, The Netherlands}, abstract = {

The UPC system works by extracting monomodal signal segments (face tracks, speech segments) that overlap with the person names overlaid in the video signal. These segments are assigned directly the name of the person and used as a reference to compare against the non-overlapping (unassigned) signal segments. This process is performed independently on both the speech and video signals. A simple fusion scheme is used to combine both monomodal annotations into a single one.

}, author = {India, M. and Mart{\'\i}, G. and Cotillas, C. and Bouritsas, G. and Elisa Sayrol and Morros, J.R. and Hernando, J.} } @mastersthesis {xCalafell, title = {Video Retrieval of Specific Persons in Specific Locations}, year = {2016}, abstract = {

Student: Andrea Calafell

Advisors: Eva Mohedano (Insight), Kevin McGuinness (Insight), Noel E. O{\textquoteright}Connor (Insight) and Xavier Gir{\'o}-i-Nieto (UPC)

Program: Master in Computer Vision (Class of 2016)

Grade: A (9.0/10.0)

This thesis explores good practices for improving the detection of specific people in specific places. An approach combining recurrent and convolutional neural networks has been considered for face detection; however, other more conventional methods have also been tested, with the best results obtained by a deformable part model approach. A CNN is also used to obtain the face feature vectors and, to help face recognition, an approach for query expansion has been developed. Furthermore, in order to evaluate the different configurations on our unlabelled dataset, a user interface has been used to annotate the images and measure the precision of the system. Finally, different fusion and normalization strategies have been explored with the aim of combining the scores obtained from face recognition with those obtained from place recognition.

}, author = {Calafell, Andrea}, editor = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xCarne-Herreraa, title = {Visual Memorability for Egocentric Cameras}, year = {2016}, abstract = {

This project explores the visual memorability of egocentric images, with three main contributions. The first and main contribution is a new tool for visual memorability annotation in egocentric images: a web application that allows the visual memorability associated with still images to be annotated through an online game. The second contribution is a convolutional neural network model for visual memorability prediction that adapts an off-the-shelf model to egocentric images. Moreover, a visualization study has been pursued to localize the regions of the images that are more memorable than others; with these maps, a comparison with saliency maps is explored. This part of the research opens a new branch in visual memorability that consists of using memorability maps for saliency prediction. The memorability of the images is also related to a sentiment analysis, applying a model that predicts that feature. The final contribution relates the visual memorability of images to human behaviour and physical state, finding a relation between memory and physiological signals such as heart rate, galvanic skin response and electroencephalographic signals.

Grade: A with honors (9.8/10.0)

}, author = {Carn{\'e}-Herrera, Marc}, editor = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dVentura16, title = {Visual Object Analysis using Regions and Local Features}, year = {2016}, abstract = {

Thesis submission: 10-06-2016

Defense date:\ 08-07-2016

Grade: Excellent Cum Laude \& International Mention

The first part of this dissertation focuses on an analysis of the spatial context in semantic image segmentation. First, we review how spatial context has been tackled in the literature by local features and spatial aggregation techniques. From a discussion about whether the context is beneficial or not for object recognition, we extend a Figure-Border-Ground segmentation for local feature aggregation with ground truth annotations to a more realistic scenario where object proposal techniques are used instead. Whereas the Figure and Ground regions represent the object and the surround respectively, the Border is a region around the object contour, which is found to be the region with the richest contextual information for object recognition. Furthermore, we propose a new contour-based spatial aggregation technique of the local features within the object region by a division of the region into four subregions. Both contributions have been tested on a semantic segmentation benchmark with a combination of free and non-free context local features that allows the models to automatically learn whether the context is beneficial or not for each semantic category.

The second part of this dissertation addresses the semantic segmentation of a set of closely-related images from an uncalibrated multiview scenario. State-of-the-art semantic segmentation algorithms fail to correctly segment the objects from some viewpoints when the techniques are independently applied to each viewpoint image. The lack of large annotated datasets for multiview segmentation does not allow obtaining a proper model that is robust to viewpoint changes. In this second part, we exploit the spatial correlation that exists between the different viewpoint images to obtain a more robust semantic segmentation. First, we review the state-of-the-art co-clustering, co-segmentation and video segmentation techniques that aim to segment the set of images in a generic way, i.e. without considering semantics. Then, a new architecture that considers motion information and provides a multiresolution segmentation is proposed for the co-clustering framework and outperforms state-of-the-art techniques for generic multiview segmentation. Finally, the proposed multiview segmentation is combined with the semantic segmentation results, giving a method for automatic resolution selection and a coherent semantic multiview segmentation.


}, url = {http://hdl.handle.net/10803/398407}, author = {Ventura, C.}, editor = {Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @conference {cPuch, title = {Voxelwise nonlinear regression toolbox for neuroimage analysis: Application to aging and neurodegenerative disease modeling}, booktitle = {NIPS 2016 Workshop on Machine Learning for Health}, year = {2016}, month = {12/2016}, author = {Santi Puch and Asier Aduriz and Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @article {xReyes, title = {Where did I leave my phone ?}, year = {2016}, month = {06/2016}, institution = {4th Workshop on Egocentric (First-Person) Vision, CVPR 2016}, type = {Extended abstract}, address = {Las Vegas, NV, USA}, abstract = {

The interest of users in having their lives digitally recorded has grown in the last years thanks to the advances in wearable sensors. Wearable cameras are among the most informative ones, but they generate large amounts of images that require automatic analysis to build useful applications upon them. In this work we explore the potential of these devices to find the last appearance of personal objects among the more than 2,000 images that are generated every day. This application could help in developing personal assistants capable of helping users when they do not remember where they left their personal objects. We adapt a previous work on instance search to the specific domain of egocentric vision.

Extended abstract presented as poster in the 4th Workshop on Egocentric (First-Person) Vision, CVPR 2016.

}, author = {Reyes, Cristian and Mohedano, Eva and McGuinness, Kevin and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto} } @conference {cReyes, title = {Where is my Phone? Personal Object Retrieval from Egocentric Images}, booktitle = {Lifelogging Tools and Applications Workshop in ACM Multimedia}, year = {2016}, month = {10/2016}, publisher = {ACM}, organization = {ACM}, address = {Amsterdam, The Netherlands}, abstract = {

This work presents a retrieval pipeline and evaluation scheme for the problem of finding the last appearance of personal objects in a large dataset of images captured from a wearable camera. Each personal object is modelled by a small set of images that define a query for a visual search engine. The retrieved results are reranked considering the temporal timestamps of the images to increase the relevance of the later detections. Finally, a temporal interleaving of the results is introduced for robustness against false detections. The Mean Reciprocal Rank is proposed as a metric to evaluate this problem. This application could help in developing personal assistants capable of helping users when they do not remember where they left their personal belongings.
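
As a minimal sketch of two of the ingredients named above, the Python below reranks results by recency and computes the Mean Reciprocal Rank; the exponential recency boost is an assumption of this sketch, not the paper's exact weighting:

    import math

    def rerank_by_time(results, half_life=3600.0):
        """Boost later detections: each (image_id, score, timestamp) result
        is re-scored with an exponential decay from the newest timestamp."""
        newest = max(t for _, _, t in results)
        rescored = [(i, s * math.exp(-(newest - t) / half_life), t)
                    for i, s, t in results]
        return sorted(rescored, key=lambda r: r[1], reverse=True)

    def mean_reciprocal_rank(rankings, relevant):
        """MRR: average over queries of 1/rank of the first relevant image."""
        total = 0.0
        for qid, ranking in rankings.items():
            for k, (img, _, _) in enumerate(ranking, start=1):
                if img in relevant[qid]:
                    total += 1.0 / k
                    break
        return total / len(rankings)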

}, doi = {http://dx.doi.org/10.1145/2983576.2983582}, url = {http://arxiv.org/abs/1608.08139}, author = {Reyes, Cristian and Mohedano, Eva and McGuinness, Kevin and Noel E. O{\textquoteright}Connor and Xavier Gir{\'o}-i-Nieto} } @conference {cPerez-Pellitero15, title = {Accelerating Super-Resolution for 4K Upscaling}, booktitle = {IEEE International Conference on Consumer Electronics}, year = {2015}, month = {01/2015}, address = {Las Vegas, NV, USA}, abstract = {

This paper presents a fast Super-Resolution (SR) algorithm based on selective patch processing. Motivated by the observation that some regions of images are smooth and unfocused and can be properly upscaled with fast interpolation methods, we locally estimate the probability of performing a degradation-free upscaling. Our proposed framework explores the usage of supervised machine learning techniques and tackles the problem using binary boosted tree classifiers. The applied upscaler is chosen based on the obtained probabilities: (1) a fast upscaler (e.g. bicubic interpolation) for those regions which are smooth, or (2) a linear regression SR algorithm for those which are ill-posed. The proposed strategy accelerates SR by only processing the regions which benefit from it, thus not compromising quality. Furthermore, all the algorithms composing the pipeline are naturally parallelizable and further speed-ups could be obtained.
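
A minimal sketch of the selective routing, assuming a scikit-learn-style classifier with predict_proba (the paper uses binary boosted trees; classifier, patch representation and threshold are placeholders here):

    import numpy as np

    def upscale_selective(patches, clf, fast_up, sr_up, threshold=0.5):
        """Route each low-resolution patch to the cheap upscaler when a
        degradation-free result is likely, else to the costly SR regressor."""
        feats = np.array([p.ravel() for p in patches])
        p_smooth = clf.predict_proba(feats)[:, 1]  # P(fast upscale suffices)
        return [fast_up(p) if pr >= threshold else sr_up(p)
                for p, pr in zip(patches, p_smooth)]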

}, doi = {10.1109/ICCE.2015.7066429}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @mastersthesis {xPanizo15, title = {Classification techniques for Alzheimer{\textquoteright}s disease early diagnosis}, year = {2015}, abstract = {

Advisor: Ver{\'o}nica Vilaplana

Studies: Telecommunication Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Alzheimer{\textquoteright}s disease currently affects more than 36 million people in the world. A patient{\textquoteright}s brain suffers changes during the earliest stages of the disease, long before showing any clinical symptoms. For that reason, researchers focus their efforts on defining which changes occur and where they take place, with the goal of detecting indicators to predict the development of the disease. Specifically, the entity Fundaci{\'o}n Pascual Maragall para la investigaci{\'o}n contra el Alzheimer studies the processes of the brain along the disease{\textquoteright}s stages using images obtained through different MRI techniques. The huge volume of data generated in this kind of investigation is a big obstacle to carrying out analyses and extracting conclusions. The aim of this thesis is to make this process easier by using data mining techniques. The goal is to develop a basic classification system to distinguish which stage of the disease a patient is in, using data extracted from cerebral images. This system must form the basis for a future data mining system that satisfies the needs of the Fundaci{\'o}n Pascual Maragall researchers. In addition to the classification system, this project focuses on distinguishing which is the most relevant data for the classification and on optimizing the classification in the pre-clinical stage of the disease.

}, author = {Eva Panizo}, editor = {Ver{\'o}nica Vilaplana} } @mastersthesis {xCabezas, title = {Co-filtering human interaction and object segmentation}, year = {2015}, abstract = {

Advisors: Axel Carlier and Vincent Charvillat (ENSEEIHT-Universit{\'e} de Toulouse) / Amaia Salvador and\ Xavier Gir{\'o}-i-Nieto\ (UPC)\ 

Degree: Audiovisual Engineering (4 years) at\ Telecom BCN-ETSETB\ (UPC)

Grade: A with honors (9.6/10)

The problem of object segmentation has been present in the image processing field for many years. Click{\textquoteright}n{\textquoteright}Cut, an existing web tool for interactive object segmentation, helps us obtain segmentations of objects by clicking in green (foreground clicks) inside the object to segment, and in red (background clicks) outside the object to segment. However, not all humans behave the same way in front of this web tool, and bad human interactions may prevent us from obtaining a good object segmentation. The main aim of this project is to implement techniques that allow us to deal with these bad human interactions in order to obtain the best object segmentation possible.

}, url = {http://hdl.handle.net/2099.1/25821}, author = {Cabezas, Ferran}, editor = {Carlier, Axel and Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Charvillat, Vincent} } @mastersthesis {xVela15, title = {Combinar im{\'a}genes y h{\'a}bitos musicales para mejorar los sistemas de recomendaci{\'o}n de m{\'u}sica}, year = {2015}, url = {http://hdl.handle.net/2099.1/26236}, author = {Enrique Sa{\~n}oso Vela}, editor = {Ver{\'o}nica Vilaplana and Josep Ramon Morros} } @conference {cSalvadora, title = {Cultural Event Recognition with Visual ConvNets and Temporal Models}, booktitle = {CVPR ChaLearn Looking at People Workshop 2015}, year = {2015}, month = {06/2015}, abstract = {

This paper presents our contribution to the ChaLearn Challenge 2015 on Cultural Event Classification. The challenge in this task is to automatically classify images from 50 different cultural events. Our solution is based on the combination of visual features extracted from convolutional neural networks with temporal information using a hierarchical classifier scheme. We extract visual features from the last three fully connected layers of both CaffeNet (pretrained with ImageNet) and our fine-tuned version for the ChaLearn challenge. We propose a late fusion strategy that trains a separate low-level SVM on each of the extracted neural codes. The class predictions of the low-level SVMs form the input to a higher-level SVM, which gives the final event scores. We achieve our best result by adding a temporal refinement step into our classification scheme, which is applied directly to the output of each low-level SVM. Our approach penalizes high classification scores based on visual features when their time stamp does not match well an event-specific temporal distribution learned from the training and validation data. Our system achieved the second best result in the ChaLearn Challenge 2015 on Cultural Event Classification with a mean average precision of 0.767 on the test set.
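
The hierarchical classifier can be pictured with the following scikit-learn sketch: one low-level SVM per set of neural codes, and a higher-level SVM over the concatenated low-level scores. This is a simplified stand-in (in practice the scores fed to the top SVM would come from held-out data, and the temporal refinement step is omitted):

    import numpy as np
    from sklearn.svm import SVC

    def train_hierarchy(neural_codes, labels):
        """neural_codes: dict layer_name -> (n_samples, dim) feature matrix."""
        low = {name: SVC(probability=True).fit(X, labels)
               for name, X in neural_codes.items()}
        scores = np.hstack([low[name].predict_proba(neural_codes[name])
                            for name in sorted(low)])
        top = SVC(probability=True).fit(scores, labels)  # final event scores
        return low, top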

}, url = {http://www.cv-foundation.org/openaccess/content_cvpr_workshops_2015/W09/papers/Salvador_Cultural_Event_Recognition_2015_CVPR_paper.pdf}, author = {Amaia Salvador and Zeppelzauer, Matthias and Manchon-Vizuete, Daniel and Calafell, Andrea and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xAlonso15, title = {Detecci{\'o} de Text utilitzant Xarxes Neuronals Convolucionals }, year = {2015}, abstract = {

Text detection in images is a challenging problem that has received a significant amount of attention. The inclusion of machine learning allows building accurate classifiers. The goal of this work is to detect text in images. In order to achieve this goal we analyze the Convolutional Neural Network (CNN). We use an implemented CNN to detect areas where there may be text. Then we analyze and filter these areas so we can obtain the detected text.

}, author = {Antea Herrera Alonso}, editor = {Ver{\'o}nica Vilaplana} } @conference {cCampos-Camunez, title = {Diving Deep into Sentiment: Understanding Fine-tuned CNNs for Visual Sentiment Prediction}, booktitle = {1st International Workshop on Affect and Sentiment in Multimedia}, year = {2015}, month = {10/2015}, publisher = {ACM}, organization = {ACM}, address = {Brisbane, Australia}, abstract = {

Visual media are powerful means of expressing emotions and sentiments. The constant generation of new content in social networks highlights the need of automated visual sentiment analysis tools. While Convolutional Neural Networks (CNNs) have established a new state-of-the-art in several vision problems, their application to the task of sentiment analysis is mostly unexplored and there are few studies regarding how to design CNNs for this purpose. In this work, we study the suitability of fine-tuning a CNN for visual sentiment prediction as well as explore performance boosting techniques within this deep learning setting. Finally, we provide a deep-dive analysis into a benchmark, state-of-the-art network architecture to gain insight about how to design patterns for CNNs on the task of visual sentiment prediction.

Acceptance rate at the ASM workshop{\textquoteright}15: 56\% (9/16)

}, doi = {10.1145/2813524.2813530}, url = {http://arxiv.org/abs/1508.05056}, author = {V{\'\i}ctor Campos and Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Jou, Brendan} } @conference {cTacchini, title = {Do you have a Pop face? Here is a Pop song. Using profile pictures to mitigate the cold-start problem in Music Recommender Systems}, booktitle = {9th ACM Conference on Recommender Systems}, year = {2015}, month = {09/2015}, author = {Tacchini, E. and Morros, J.R. and Ver{\'o}nica Vilaplana and Sa{\~n}oso, E.} } @mastersthesis {xBellver-Bueno, title = {Efficient Exploration of Region Hierarchies for Semantic Segmentation}, year = {2015}, abstract = {

Advisors: Carles Ventura-Royo (UPC) and Xavier Gir{\'o}-i-Nieto (UPC)

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Grade: A (9.0/10.0)

The motivation of this work is the efficient exploration of hierarchical partitions for semantic segmentation as a method for locating objects in images. While many efforts have been focused on efficient image search in large-scale databases, few works have addressed the problem of locating and recognizing objects efficiently within a given image. My work considers as an input a hierarchical partition of an image that defines a set of regions as candidate locations to contain an object. This approach will be compared to other state-of-the-art algorithms that extract object candidates from an image. The final goal of this work is to semantically segment images efficiently by exploiting the multiscale information provided by a hierarchical partition, maximizing the accuracy of the segmentation when only a few regions of the partition are analysed.


}, author = {M{\'\i}riam Bellver}, editor = {Ventura, C. and Xavier Gir{\'o}-i-Nieto} } @article {xPan, title = {End-to-end Convolutional Network for Saliency Prediction}, year = {2015}, month = {07/2015}, institution = {arXiv}, address = {Boston, MA (USA)}, abstract = {

The prediction of saliency areas in images has been traditionally addressed with hand-crafted features based on neuroscience principles. This paper, however, addresses the problem with a completely data-driven approach by training a convolutional network. The learning process is formulated as a minimization of a loss function that measures the Euclidean distance of the predicted saliency map with the provided ground truth. The recent publication of large datasets of saliency prediction has provided enough data to train a not very deep architecture which is both fast and accurate. The convolutional network in this paper, named JuntingNet, won the LSUN 2015 challenge on saliency prediction with a superior performance in all considered metrics.
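
The training objective reduces to minimizing the mean squared (Euclidean) error between predicted and ground-truth maps. Below is a minimal PyTorch stand-in with a toy architecture; it only illustrates the loss setup and is not the actual JuntingNet network:

    import torch
    import torch.nn as nn

    # Toy saliency regressor trained with a Euclidean (MSE) loss.
    model = nn.Sequential(
        nn.Conv2d(3, 32, 5, padding=2), nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
        nn.Conv2d(64, 1, 1), nn.Sigmoid(),      # one-channel saliency map
    )
    loss_fn = nn.MSELoss()                      # Euclidean distance to ground truth
    opt = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

    images = torch.rand(4, 3, 96, 96)           # dummy batch
    gt = torch.rand(4, 1, 48, 48)                # ground-truth maps at half resolution
    loss = loss_fn(model(images), gt)
    opt.zero_grad(); loss.backward(); opt.step()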



}, url = {http://arxiv.org/abs/1507.01422}, author = {Pan, Junting and Xavier Gir{\'o}-i-Nieto} } @conference {cRoldan-Carlosa, title = {Event Video Retrieval using Global and Local Descriptors in Visual Domain}, booktitle = {IEEE/ACM International Workshop on Content-Based Multimedia Indexing - CBMI 2015 }, year = {2015}, month = {06/2015}, abstract = {

With the advent of affordable multimedia smartphones, it has become common for people to take videos at events. The larger the event, the larger the amount of videos taken there, and the more videos get shared online. Searching in this mass of videos is a challenging topic. In this paper we present and discuss a prototype software for searching in such videos. We focus only on visual information, and we report on experiments based on a research dataset. With a small study we show that our prototype demonstrates promising results by identifying the same scene in different videos taken from different angles, solely based on content-based image retrieval.

}, url = {https://upcommons.upc.edu/handle/2117/76553}, author = {Roldan-Carlos, Jennifer and Lux, Mathias and Xavier Gir{\'o}-i-Nieto and Pia Mu{\~n}oz-Trallero and Anagnostopoulos, Nektarios} } @conference {cMohedano, title = {Exploring EEG for Object Detection and Retrieval}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR) }, year = {2015}, address = {Shanghai, China}, abstract = {

This paper explores the potential for using Brain Computer Interfaces (BCI) as a relevance feedback mechanism in content-based image retrieval. We investigate if it is possible to capture useful EEG signals to detect if relevant objects are present in a dataset of realistic and complex images. We perform several experiments using a rapid serial visual presentation (RSVP) of images at different rates (5Hz and 10Hz) on 8 users with different degrees of familiarization with BCI and the dataset. We then use the feedback from the BCI and mouse-based interfaces to retrieve objects in a subset of TRECVid images. We show that it is indeed possible to detect such objects in complex images and, also, that users with previous knowledge of the dataset or experience with the RSVP outperform others. When the users have limited time to annotate the images (100 seconds in our experiments) both interfaces are comparable in performance. Comparing our best users in a retrieval task, we found that EEG-based relevance feedback outperforms mouse-based feedback. The realistic and complex image dataset differentiates our work from previous studies on EEG for image retrieval.

[Extended version in arXiv:1504.02356]

Overall acceptance rate: 33\%

}, doi = {10.1145/2671188.2749368}, url = {http://arxiv.org/abs/1504.02356}, author = {Mohedano, Eva and Amaia Salvador and Porta, Sergi and Xavier Gir{\'o}-i-Nieto and Kevin McGuinness and Healy, Graham and O{\textquoteright}Connor, N.} } @conference {cBazazian15, title = {Fast and Robust Edge Extraction in Unorganized Point Clouds}, booktitle = {International Conference on Digital Image Computing: Techniques and Applications}, year = {2015}, month = {11/2015}, publisher = {DICTA/IEEE}, organization = {DICTA/IEEE}, address = {Adelaide, Australia}, abstract = {

Edges provide important visual information in scene surfaces. The need for fast and robust feature extraction from 3D data is nowadays fostered by the widespread availability of cheap commercial depth sensors and multi-camera setups. This article investigates the challenge of detecting edges in surfaces represented by unorganized point clouds. Generally, edge recognition requires the extraction of geometric features such as normal vectors and curvatures. Since the normals alone do not provide enough information about the geometry of the cloud, further analysis of the extracted normals is needed for edge extraction, such as a clustering method. Edge extraction through these techniques consists of several steps with parameters which depend on the density and the scale of the point cloud. In this paper we propose a fast and precise method to detect sharp edge features by analysing the eigenvalues of the covariance matrix defined by each point{\textquoteright}s k-nearest neighbors. Moreover, we evaluate the proposed methods for sharp edge extraction quantitatively and qualitatively using several dihedral angles and well-known examples of unorganized point clouds. Furthermore, we demonstrate the robustness of our approach on noisier real-world datasets.
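
A minimal NumPy/SciPy sketch of the core idea, scoring each point by the ratio of the smallest eigenvalue of its k-nearest-neighbour covariance matrix (the value of k and the threshold are placeholder choices of this sketch):

    import numpy as np
    from scipy.spatial import cKDTree

    def edge_scores(points, k=20):
        """Surface-variation score lambda_0 / (l0 + l1 + l2) per point, from
        the covariance of its k nearest neighbours; high values ~ sharp edges."""
        tree = cKDTree(points)
        _, idx = tree.query(points, k=k)
        scores = np.empty(len(points))
        for i, nbrs in enumerate(idx):
            cov = np.cov(points[nbrs].T)
            lam = np.sort(np.linalg.eigvalsh(cov))  # ascending: l0 <= l1 <= l2
            scores[i] = lam[0] / max(lam.sum(), 1e-12)
        return scores

    pts = np.random.rand(1000, 3)
    edge_points = pts[edge_scores(pts) > 0.05]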

}, doi = {10.1109/DICTA.2015.7371262}, url = {http://dictaconference.org/dicta2015/}, author = {D. Bazazian and Casas, J. and Ruiz-Hidalgo, J.} } @mastersthesis {xCalafell-Oros, title = {Fine-tuning a Convolutional Network for Cultural Event Recognition}, year = {2015}, abstract = {

Advisors: Amaia Salvador (UPC), Matthias Zeppelzauer (FH St P{\"o}lten), Xavier Gir{\'o}-i-Nieto (UPC)

Studies: Bachelor Degree in Audiovisual Systems Engineering at Telecom BCN-ETSETB from the Technical University of Catalonia (UPC)

Grade: A with honors (10/10)

This thesis explores good practices for improving the performance of an existing convnet trained with a dataset of clean data when an additional dataset of noisy data is available. We develop techniques to clean the noisy data with the help of the clean one, a family of solutions that we will refer to as denoising, and then we explore the best ordering of the clean and noisy datasets during the fine-tuning of a convnet. Then we study strategies to select the subset of images of the clean data that will improve the classification performance, a practice we will refer to as fracking. Next, we determine how many layers are actually better to fine-tune in our convnet, given our amount of data. And finally, we compare the classic convnet architecture, where a single network is fine-tuned to solve a multi-class problem, with the case of fine-tuning a convnet for binary classification for each considered class.


See https://imatge.upc.edu/web/publications/cultural-event-recognition-visual-convnets-and-temporal-models

}, url = {http://hdl.handle.net/2117/78391}, author = {Calafell, Andrea}, editor = {Amaia Salvador and Zeppelzauer, Matthias and Xavier Gir{\'o}-i-Nieto} } @article {xCanton-Ferrer15, title = {From Catalonia to America: notes on how to achieve a successful post-PhD career}, year = {2015}, abstract = {

Presented at the 2nd Annual Catalan Meeting on Computer Vision - ACMCV (September 18, 2015)

Talk summary

Reaching the green prairies where the big fat tech companies graze has historically been regarded as a hard and painful journey for young, freshly graduated PhDs. The amount of contenders is large and the competition may be overwhelming but, despite all, you often see researchers related to Catalan universities who have found a prominent role in companies like Microsoft, Google, Facebook, etc. By talking to them you start to detect some common factors that contributed to the success of their careers and some common views on the pros/cons related to their experience in big corporations. In this talk, I will present some hints/advice/directions on how to bridge the gap between the end of a PhD and the start of a professional career in the software industry. What is needed? What is the posture of big companies w.r.t. research? Which are our strong assets coming out from a Catalan university (and which are not!)?

Short bio about the speaker

Cristian Canton (Barcelona, 1980) got his MSc in Telecommunications Engineering from the Technical University of Catalonia (UPC) in 2003, carrying out his MSc Thesis at the Swiss Federal Institute of Technology of Lausanne (EPFL), covering topics in image processing and coding. After that, he pursued his PhD at UPC on computer vision and machine learning topics applied to human motion capture in multi-view scenarios. During this period, he was a visiting researcher at Technion (Haifa, Israel) and Koc University (Istanbul, Turkey) and took active roles in several EU-funded projects. In 2008, he was a Postdoc researcher at NXP in Paris where he developed gesture-based HCIs for mobile devices. In 2009, he joined VICON Ltd., a leading company in the field of vision-based motion capture systems for the entertainment and biomedical industries. During 2010-2012, he was an EU project reviewer. He joined Microsoft in 2012 as a senior researcher and developer, where he transfers technology into real-life products such as Surface Hub, Hololens and Skype.

}, keywords = {america, catalonia, computer Vision, job, microsoft}, author = {Cristian Canton-Ferrer} } @article {aMohedano, title = {Improving Object Segmentation by using EEG signals and Rapid Serial Visual Presentation}, journal = {Multimedia Tools and Applications}, year = {2015}, month = {07/2015}, abstract = {

This paper explores the potential of brain-computer interfaces in segmenting objects from images. Our approach is centered around designing an effective method for displaying the image parts to the users such that they generate measurable brain reactions. When a block of pixels is displayed, we estimate the probability of that block containing the object of interest using a score based on EEG activity. After several such blocks are displayed in rapid serial visual presentation, the resulting probability map is binarized and combined with the GrabCut algorithm to segment the image into object and background regions. This study extends our previous work that showed how BCI and simple EEG analysis are useful in locating object boundaries in images.
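
A minimal OpenCV sketch of the final fusion step: the EEG-derived probability map is thresholded into probable foreground/background and refined with GrabCut (the EEG scoring, map upsampling and threshold value are outside this sketch):

    import numpy as np
    import cv2

    def segment_from_eeg(image, prob_map, thresh=0.5):
        """Binarize the per-pixel EEG probability map (values in [0, 1],
        upsampled from the block scores) and refine it with GrabCut."""
        mask = np.where(prob_map >= thresh,
                        cv2.GC_PR_FGD, cv2.GC_PR_BGD).astype(np.uint8)
        bgd = np.zeros((1, 65), np.float64)
        fgd = np.zeros((1, 65), np.float64)
        cv2.grabCut(image, mask, None, bgd, fgd, 5, cv2.GC_INIT_WITH_MASK)
        return np.isin(mask, (cv2.GC_FGD, cv2.GC_PR_FGD)).astype(np.uint8)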

}, issn = {1573-7721}, doi = {10.1007/s11042-015-2805-0}, url = {http://dx.doi.org/10.1007/s11042-015-2805-0}, author = {Mohedano, Eva and Healy, Graham and Kevin McGuinness and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Smeaton, Alan F.} } @conference {cVentura, title = {Improving Spatial Codification in Semantic Segmentation}, booktitle = {IEEE International Conference on Image Processing (ICIP), 2015}, year = {2015}, month = {09/2015}, publisher = {IEEE}, organization = {IEEE}, address = {Quebec City}, abstract = {

This paper explores novel approaches for improving the spatial codification for the pooling of local descriptors to solve the semantic segmentation problem. We propose to partition the image into three regions for each object to be described: Figure, Border and Ground. This partition aims at minimizing the influence of the image context on the object description and vice versa by introducing an intermediate zone around the object contour. Furthermore, we also propose a richer visual descriptor of the object by applying a Spatial Pyramid over the Figure region. Two novel Spatial Pyramid configurations are explored: Cartesian-based and crown-based Spatial Pyramids. We test these approaches with state-of-the-art techniques and show that they improve the Figure-Ground based pooling in the Pascal VOC 2011 and 2012 semantic segmentation challenges.

}, url = {http://arxiv.org/abs/1505.07409}, author = {Ventura, C. and Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Kevin McGuinness and Marqu{\'e}s, F. and O{\textquoteright}Connor, N.} } @article {xVentura, title = {Improving Spatial Codification in Semantic Segmentation (Supplementary Material)}, year = {2015}, month = {09/2015}, abstract = {

This document contains supplementary material for the paper "Improving Spatial Codification in Semantic Segmentation" submitted to ICIP 2015. First, there is a section dedicated to the results obtained by category when ideal object candidates (ground truth masks) are used. Then, an analysis of the results using CPMC and MCG object candidates, also detailed by category, is presented. Finally, visual results for CPMC and MCG are shown.

}, author = {Ventura, C. and Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Kevin McGuinness and Marqu{\'e}s, F. and Noel E. O{\textquoteright}Connor} } @mastersthesis {xAlfaro15, title = {Inclusion of depth information on a temporal hierarchical co-clustering technique}, year = {2015}, abstract = {

Video segmentation aims to consistently group regions that are similar in appearance and movement along a sequence. This task is an essential step of video analysis and it has important applications in video coding, indexing and retrieval, 3D reconstruction, action recognition, etc. In particular, in this thesis, a multiresolution hierarchical co-clustering technique (MRHC) is analyzed in order to include depth information for improving image segmentation in sequences with small variations. This information is invariant to changes in brightness and/or texture and camera position. Thus, it may correct some errors that appear when segmentation techniques based only on color and movement are used, because depth is independent of the color information of the image and the movement that occurs in the scene.

Two considerations can be made. Firstly, all the regions of the same object should share similar depth values and, secondly, depth values from regions that do not belong to the same object show discontinuities. Three different ways of coding depth information in MRHC have been studied in this project. In the first approach, the similarity between regions is weighted according to the depth difference between them. The second approach determines the 3D-neighborhood between regions. Finally, a combination of the previous approaches is considered.

The Video Occlusion/Object Boundary Dataset has been used to evaluate the inclusion of depth in MRHC and to compare this method with state-of-the-art techniques in the field of video segmentation. The results obtained show that the use of depth information improves the video segmentation results obtained with MRHC, outperforming the state-of-the-art methods in this scenario.

}, author = {Alfaro, M.}, editor = {David Varas and Marqu{\'e}s, F.} } @conference {cMcGuinnessa, title = {Insight DCU at TRECVID 2015}, booktitle = {TRECVID 2015 Workshop}, year = {2015}, month = {11/2015}, publisher = {NIST}, organization = {NIST}, address = {Gaithersburg, MD, USA}, abstract = {

Insight-DCU participated in the instance search (INS), semantic indexing (SIN), and localization tasks (LOC) this year.

In the INS task we used deep convolutional network features trained on external data and the query data for this year to train our system. We submitted four runs, three based on convolutional network features, and one based on SIFT/BoW. F A insightdcu 1 was an automatic run using features from the last convolutional layer of a deep network with bag-of-words encoding and achieved 0.123 mAP. F A insightdcu 2 modified the previous run to use re-ranking based on an R-CNN model and achieved 0.111 mAP. I A insightdcu 3, our interactive run, achieved 0.269 mAP. Our SIFT-based run F A insightdcu 2 used weak geometric consistency to improve performance over the previous year to 0.187 mAP. Overall we found that using features from the convolutional layers improved performance over features from the fully connected layers used in previous years, and that weak geometric consistency improves performance for local feature ranking.

In the SIN task we again used convolutional network features, this time fine-tuning a network pretrained on external data for the task. We submitted four runs, 2C D A insightdcu.15 1..4, varying the top-level learning algorithm and the use of concept co-occurrence. 2C D A insightdcu.15 1 used a linear SVM top-level learner, and achieved 0.63 mAP. Exploiting concept co-occurrence improved the accuracy of our logistic regression run 2C D A insightdcu.15 3 from 0.058 mAP to 0.6.

Our LOC system used training data from IACC.1.B and features similar to our INS run, but using a VLAD encoding instead of a bag-of-words. Unfortunately there was a problem with the run that we are still investigating.

Note: UPC and NII participated only in the INS task of this submission.

}, url = {http://www-nlpir.nist.gov/projects/tvpubs/tv.pubs.15.org.html}, author = {Kevin McGuinness and Mohedano, Eva and Amaia Salvador and Zhang, ZhenXing and Marsden, Mark and Wang, Peng and Jargalsaikhan, Iveel and Antony, Joseph and Xavier Gir{\'o}-i-Nieto and Satoh, Shin{\textquoteright}ichi and O{\textquoteright}Connor, N. and Smeaton, Alan F.} } @mastersthesis {xRamos-Caballero15, title = {Keyframe-based Video Summarization Designer}, year = {2015}, abstract = {

Advisors: Xavier Gir{\'o}-i-Nieto (UPC) and Horst Eidenberger (TU Wien)

Studies: Bachelor Degree in Audiovisual Systems Engineering at Telecom BCN-ETSETB from the Technical University of Catalonia (UPC)

Grade: B (8.7/10)

This Final Degree Work extends two previous projects and consists in improving the video keyframe extraction module of one of them, called Designer Master, by integrating the algorithms that were developed in the other, Object Maps.

Firstly, the proposed solution is explained: a shot detection method in which the input video is sampled uniformly and, afterwards, a cumulative pixel-to-pixel difference is applied and a classifier decides which frames are keyframes.

Lastly, to validate our approach we conducted a user study in which both applications were compared. Users were asked to complete a survey regarding different summaries created with the original application and with the one developed in this project. The results obtained were analyzed and showed that the improvement made in the keyframe extraction module slightly improves the application performance and the quality of the generated summaries.
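
The shot/keyframe detection step described above can be sketched as follows, with the learned classifier replaced by a simple threshold on the accumulated difference (the step and threshold values are placeholders):

    import numpy as np

    def keyframe_candidates(frames, sample_step=5, diff_thresh=30.0):
        """Uniformly sample frames, accumulate the mean pixel-to-pixel
        difference and flag a candidate keyframe when the change is large."""
        sampled = frames[::sample_step]
        keyframes, acc = [0], 0.0
        for i in range(1, len(sampled)):
            acc += np.abs(sampled[i].astype(float)
                          - sampled[i - 1].astype(float)).mean()
            if acc >= diff_thresh:
                keyframes.append(i * sample_step)
                acc = 0.0
        return keyframes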

}, author = {Ramos-Caballero, Carlos}, editor = {Eidenberger, Horst and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xCampos-Camunez, title = {Layer-wise CNN Surgery for Visual Sentiment Prediction}, year = {2015}, abstract = {

Advisors: Amaia Salvador (UPC), Brendan Jou (Columbia University) and Xavier Gir{\'o}-i-Nieto (UPC)

Visual media are powerful means of expressing emotions and sentiments. The constant generation of new content in social networks highlights the need of automated visual sentiment analysis tools. While Convolutional Neural Networks (CNNs) have established a new state-of-the-art in several vision problems, their application to the task of sentiment analysis is mostly unexplored and there are few studies regarding how to design CNNs for this purpose. In this work, we study the suitability of fine-tuning a CNN for visual sentiment prediction as well as explore performance boosting techniques within this deep learning setting. Finally, we provide a deep-dive analysis into a benchmark, state-of-the-art network architecture to gain insight about how to design patterns for CNNs on the task of visual sentiment prediction.


}, author = {V{\'\i}ctor Campos}, editor = {Amaia Salvador and Jou, Brendan and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xBarra, title = {LIvRE: A Video Extension to the LIRE Content-Based Image Retrieval System}, year = {2015}, abstract = {

Advisors:\ Mathias Lux\ (Klagenfurt University) and\ Xavier Gir{\'o}-i-Nieto\ (UPC)

Degree: Electronic Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)

Grade: A (9.0/10.0)

This project explores the expansion of the Lucene Image Retrieval Engine (LIRE), an open-source Content-Based Image Retrieval (CBIR) system, for video retrieval on large-scale video datasets. The fast-growing need to store huge amounts of video on servers requires efficient, scalable search and indexing engines capable of assisting users in their management and retrieval. In our tool, queries are formulated by visual examples, allowing users to find the videos and the moment in time when the query image is matched. The video dataset used in this scenario comprises over 1,000 hours of different news broadcast channels. This thesis presents an extension and adaptation of LIRE and its plugin for Solr, an open-source enterprise search platform from the Apache Lucene project, for video retrieval based on visual features, as well as a web interface for users on different devices.

}, url = {http://upcommons.upc.edu/handle/2117/79052}, author = {Gabriel de Oliveira-Barra}, editor = {Xavier Gir{\'o}-i-Nieto and Lux, Mathias} } @conference {cHuang15, title = {Mode dependent vector quantization with a rate-distortion optimized codebook for residue coding in video compression}, booktitle = {IEEE Int. Conf. on Acoustics Speech and Signal Processing, ICASSP 2015}, year = {2015}, month = {04/2015}, publisher = {IEEE}, organization = {IEEE}, address = {Brisbane, Australia}, abstract = {

The High Efficiency Video Coding standard (HEVC) supports a total of 35 intra prediction modes which aim at reducing spatial redundancy by exploiting pixel correlation within a local neighborhood. In this paper, we show that spatial correlation remains after intra prediction, leading to high-energy prediction residues. We propose a novel scheme for encoding the prediction residues using a Mode Dependent Vector Quantization (MDVQ) which aims at reducing the redundancy in the residual domain. The MDVQ codebook is optimized in a rate-distortion (RD) sense. Experimental results show that the codebook can be independent of the quantization parameter (QP) with no loss in terms of coding efficiency. A bitrate reduction of 1.1\% on average compared to HEVC can be achieved, while further tests indicate that codebook adaptivity could substantially improve the performance.
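
The rate-distortion search behind such a scheme can be sketched in a few lines: each residue block is mapped to the codeword minimizing the Lagrangian cost J = D + lambda * R. This toy version assumes a fixed-length index code; the actual codebook training and mode dependence are beyond the sketch:

    import numpy as np

    def rd_best_codeword(residue, codebook, rates, lmbda):
        """Pick the codeword minimizing J = D + lambda * R."""
        dists = ((codebook - residue) ** 2).sum(axis=1)  # squared-error distortion
        costs = dists + lmbda * rates                    # rates: bits per index
        return int(np.argmin(costs))

    codebook = np.random.randn(64, 16)   # 64 codewords for 4x4 residue blocks
    rates = np.full(64, 6.0)             # fixed-length 6-bit indices
    print(rd_best_codeword(np.random.randn(16), codebook, rates, lmbda=0.1))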

}, author = {Huang, B. and Henry, F. and Guillemot, C. and Salembier, P.} } @conference {cVaras15, title = {Multiresolution hierarchy co-clustering for semantic segmentation in sequences with small variations}, booktitle = {ICCV - International Conference on Computer Vision}, year = {2015}, month = {12/2015}, abstract = {

This paper presents a co-clustering technique that, given a collection of images and their hierarchies, clusters nodes from these hierarchies to obtain a coherent multiresolution representation of the image collection. We formalize the co-clustering as a Quadratic Semi-Assignment Problem and solve it with a linear programming relaxation approach that makes effective use of information from hierarchies. Initially, we address the problem of generating an optimal, coherent partition per image and, afterwards, we extend this method to a multiresolution framework. Finally, we particularize this framework to an iterative multiresolution video segmentation algorithm in sequences with small variations. We evaluate the algorithm on the Video Occlusion/Object Boundary Detection Dataset, showing that it produces state-of-the-art results in these scenarios.\ 

}, url = {http://arxiv.org/abs/1510.04842}, author = {David Varas and Alfaro, M. and Marqu{\'e}s, F.} } @inbook {bDigne15, title = {Neighborhood Filters and the Recovery of 3D Information}, booktitle = {Handbook of Mathematical Methods in Imaging}, year = {2015}, pages = {1645-1673}, publisher = {Springer Verlag}, organization = {Springer Verlag}, edition = {2}, isbn = {978-1-4939-0789-2}, author = {J. Digne and Dimiccoli, M. and N. Sabater and Salembier, P.} } @article {aBellot Pujalte15, title = {NetBenchmark: a bioconductor package for reproducible benchmarks of gene regulatory network inference}, journal = {BMC Bioinformatics}, volume = {16}, year = {2015}, month = {09/2015}, abstract = {

Background:
In the last decade, a great number of methods for reconstructing gene regulatory networks from expression data have been proposed. However, very few tools and datasets allow those methods to be evaluated accurately and reproducibly. Hence, we propose here a new tool, able to perform a systematic, yet fully reproducible, evaluation of transcriptional network inference methods.

Results:
Our open-source and freely available Bioconductor package aggregates a large set of tools to assess the robustness of network inference algorithms against different simulators, topologies, sample sizes and noise intensities.

Conclusions:
The benchmarking framework that uses various datasets highlights the specialization of some methods toward network types and data. As a result, it is possible to identify the techniques that have broad overall performances.

}, keywords = {Benchmark, Bioconductor package, Gene expression, Gene regulation network reconstruction, Gene regulatory networks, Synthetic genetic networks}, doi = {10.1186/s12859-015-0728-4}, url = {http://bmcbioinformatics.biomedcentral.com/articles/10.1186/s12859-015-0728-4}, author = {P. Bellot and Olsen, Catharina and Salembier, P. and Albert Oliveras and Meyer, P. E.} } @conference {cNguyen, title = {NII-HITACHI-UIT at TRECVID 2015 Instance Search}, booktitle = {TRECVID 2015 Workshop}, year = {2015}, month = {11/2015}, publisher = {NIST}, organization = {NIST}, address = {Gaithersburg, MD, USA}, abstract = {

In this paper, we propose two methods to improve last year's instance search framework. Both are based on a post-processing scheme that reranks the top K shots returned from the BOW model. The first system proposes a query-adaptive weighting technique between the DPM object detector score and the BOW score. In order to find a good weight, we use a neural network which learns characteristics of the query, including the number of features, the number of shared words and the area of the query topic. The second system combines two state-of-the-art object detectors, DPM and Fast RCNN, to estimate object location and similarity score, respectively. The final score is computed using these components together with the BOW-based similarity score returned from the baseline system. The experimental results show that our system improved considerably, even with a smaller number of top K input ranked shots. Compared to other teams, we obtained second place with the same run.

}, url = {http://www-nlpir.nist.gov/projects/tvpubs/tv.pubs.15.org.html}, author = {Nguyen, Vinh-Tiep and Duy-Dinh-Le and Amaia Salvador and Caizhi-Zhu and Nguyen, Dinh-Luan and Tran, Minh-Triet and Ngo Duc, Thanh and Anh Duong, Duc and Satoh, Shin{\textquoteright}ichi and Xavier Gir{\'o}-i-Nieto} } @article {aValero15, title = {Object recognition in hyperspectral images using Binary Partition Tree representation}, journal = {Pattern Recognition Letters}, volume = {56}, year = {2015}, month = {04/2015}, pages = {45-51}, abstract = {

In this work, an image representation based on the Binary Partition Tree (BPT) is proposed for object detection in hyperspectral images. This hierarchical region-based representation can be interpreted as a set of hierarchical regions stored in a tree structure, which succeeds in presenting: (i) the decomposition of the image in terms of coherent regions, and (ii) the inclusion relations of the regions in the scene. Hence, the BPT representation defines a search space for constructing a robust object identification scheme. Spatial and spectral information are integrated in order to analyze hyperspectral images from a region-based perspective. For each region represented in the BPT, spatial and spectral descriptors are computed and the likelihood that they correspond to an instantiation of the object of interest is evaluated. Experimental results demonstrate the good performance of this BPT-based approach.
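
Schematically, detection amounts to traversing the BPT and keeping the regions whose descriptors score high under an object model. In this hypothetical sketch, describe() and likelihood() stand in for the paper's spatial/spectral descriptors and learned likelihood:

    class Node:
        """Minimal BPT node: a set of pixel indices plus child nodes."""
        def __init__(self, pixels, children=()):
            self.pixels, self.children = pixels, list(children)

    def detect_in_bpt(root, describe, likelihood, thresh=0.8):
        """Depth-first search returning nodes likely to be the object."""
        hits, stack = [], [root]
        while stack:
            node = stack.pop()
            if likelihood(describe(node)) >= thresh:
                hits.append(node)
            stack.extend(node.children)
        return hits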

}, issn = {0167-8655}, author = {Valero, S. and Salembier, P. and Chanussot, J.} } @mastersthesis {xGris-Sarabia, title = {Pyxel, una llibreria per a l{\textquoteright}anotació automàtica de fotografies}, year = {2015}, abstract = {

Advisor: Xavier Gir{\'o}-i-Nieto (UPC)

Degree: Audiovisual Systems Engineering (4 years) at the Escola d{\textquoteright}Enginyeria de Terrassa (UPC)

Given that the generation and diffusion of multimedia content has greatly increased, research is being carried out to find tools for the automatic or semi-automatic annotation of images, sounds, videos, etc.

This document gathers the information about the development and operation of the Pyxel library, oriented specifically to the annotation of images using the visual information of the image and the textual information that accompanies it, the metadata.

Pyxel is a set of classes developed in the Python programming language to carry out a complete image annotation chain: it allows the extraction of visual features with SIFT descriptors and the extraction of textual features from metadata using the TF-IDF text processing algorithm, as well as tools to create both vocabularies. It also provides tools for training and running an SVM classifier.

In order to handle large volumes of images, the Pyxel tools are designed to process images in parallel, which is very useful for optimally exploiting the resources of a computing service managed with SLURM.

Final grade: A (9/10)

}, keywords = {image classification, python}, author = {Gris-Sarabia, Irene}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cCabezas, title = {Quality Control in Crowdsourced Object Segmentation}, booktitle = {IEEE International Conference on Image Processing (ICIP), 2015}, year = {2015}, month = {09/2015}, abstract = {

This paper explores processing techniques to deal with noisy data in crowdsourced object segmentation tasks. We use the data collected with "Click{\textquoteright}n{\textquoteright}Cut", an online interactive segmentation tool, and we perform several experiments towards improving the segmentation results. First, we introduce different superpixel-based techniques to filter users{\textquoteright} traces, and assess their impact on the segmentation result. Second, we present different criteria to detect and discard the traces from potential bad users, resulting in a remarkable increase in performance. Finally, we show a novel superpixel-based segmentation algorithm which does not require any prior filtering and is based on weighting each user{\textquoteright}s contribution according to his/her level of expertise.
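
The expertise-weighted fusion in the last algorithm can be sketched as a weighted pixel-wise vote; here the weights would come from each user's accuracy, e.g. on a few gold-standard images, and superpixels are omitted for brevity:

    import numpy as np

    def weighted_consensus(masks, weights, thresh=0.5):
        """Fuse binary user masks (n_users, H, W) with per-user expertise
        weights into a single consensus segmentation."""
        weights = np.asarray(weights, float)
        vote = np.tensordot(weights / weights.sum(), masks, axes=1)
        return (vote >= thresh).astype(np.uint8)

    masks = np.random.randint(0, 2, (5, 4, 4))
    print(weighted_consensus(masks, [0.9, 0.8, 0.2, 0.9, 0.1]))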

Selected among Top 10\% papers in ICIP 2015 based on the reviewer scores and recommendations.


}, url = {http://arxiv.org/abs/1505.00145}, author = {Cabezas, Ferran and Carlier, Axel and Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Charvillat, Vincent} } @article {aBonet-Carne14a, title = {Quantitative Ultrasound Texture Analysis of Fetal Lungs To Predict Neonatal Respiratory Morbidity}, journal = {Ultrasound in Obstetrics and Gynecology, Wiley}, volume = {45}, year = {2015}, pages = {427{\textendash}433}, author = {E. Bonet-Carne and M. Palacio and T. Cobo and A. Perez-Moreno and M. Lopez and J. P. Piraquive and J. C. Ramirez and F. Marques and E. Gratacos} } @mastersthesis {xPorta, title = {Rapid Serial Visual Presentation for Relevance Feedback in Image Retrieval with EEG Signals}, year = {2015}, abstract = {

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Grade: A (9/10)

This thesis explores the potential of relevance feedback for image retrieval using EEG signals for human-computer interaction. This project aims at studying the optimal parameters of a rapid serial visual presentation (RSVP) of frames from a video database when the user is searching for an object instance. The simulations reported in this thesis assess the trade-off between using a small or a large amount of images in each RSVP round that captures the user feedback. While short RSVP rounds allow the system to quickly learn the user's intention, RSVP rounds must also be long enough to let users generate the P300 EEG signals which are triggered by relevant images. This work also addresses the problem of how to distribute potentially relevant and non-relevant images in an RSVP round to maximize the probability of displaying each relevant frame separated by at least 1 second from another relevant frame, as this configuration generates a cleaner P300 EEG signal. The presented simulations are based on a realistic setup for video retrieval with a subset of 1,000 frames from the TRECVID 2014 Instance Search task.
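
The spacing constraint can be met by interleaving fillers between likely-relevant frames, as in this sketch: at rate_hz images per second, inserting rate_hz - 1 non-relevant images after each relevant one keeps relevant frames at least one second apart (function and argument names are hypothetical):

    def schedule_rsvp(relevant, fillers, rate_hz=5):
        """Interleave images so consecutive likely-relevant frames are
        separated by rate_hz - 1 fillers, i.e. at least 1 s at rate_hz."""
        gap = rate_hz - 1
        sequence, fill = [], iter(fillers)
        for img in relevant:
            sequence.append(img)
            sequence.extend(next(fill) for _ in range(gap))
        return sequence

    print(schedule_rsvp(["r1", "r2"], [f"f{i}" for i in range(10)], rate_hz=5))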

}, keywords = {eeg, feedback, image, relevance, retrieval}, author = {Porta, Sergi}, editor = {Amaia Salvador and Mohedano, Eva and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @mastersthesis {xLopez15, title = {Reconstrucci{\'o} de la forma del rostre a partir de contorns}, year = {2015}, abstract = {

The reconstruction and modelling of 3D faces have become in recent years a very active line of research due to their use in a large number of applications such as 3D facial recognition, face modelling in video games, cosmetic surgery, etc. During the last decades, multiple computational techniques for 3D facial reconstruction have been developed. One of the most widely used technologies is based on 2D images and statistical methods (3D Morphable Models). The estimation of the face shape addressed with statistical models has as its main objective finding the set of face parameters that best fits an image or a set of images. For this project, a statistical model is available that is capable of estimating the 3D shape of the face when a face or a set of 2D contours is available from multiple viewpoints. This model directly estimates the 3D face structure using a regression matrix built through PLS (Partial Least Squares). After the validation of the model and the good results obtained in the prediction of synthetic subjects, this work proposes a new approach: training the model with real subjects from a video sequence. In this way, reconstructions of the 3D face shape are obtained with real data. Therefore, the objective of this project is the definition, software implementation and analysis of a procedure that allows us to fit a generic three-dimensional facial statistical model to the specific facial features of an individual from the contours of their face.

}, author = {Laia del Pino L{\'o}pez}, editor = {Ver{\'o}nica Vilaplana and Josep Ramon Morros} } @conference {cMaceira15, title = {Region-based depth map coding using a 3D scene representation}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing}, year = {2015}, month = {04/2015}, address = {Brisbane, Australia}, abstract = {

In 3D video, view synthesis is used to create new virtual views between encoded camera views. Errors in the coding of the depth maps introduce geometry inconsistencies in the synthesized views. In this paper, a 3D plane representation of the scene is presented which improves the performance of current standard video codecs in the view synthesis domain. Depth maps are segmented into regions without sharp edges and each region is represented with a plane in the 3D world scene coordinates. This allows an efficient representation while preserving the 3D characteristics of the scene. Experimental results are provided, obtaining bitrate gains from 10\% to 40\% compared to HEVC.
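
Per-region plane fitting can be illustrated with a least-squares fit; note that the paper fits planes in 3D world-scene coordinates, while this simplified sketch fits z = a*u + b*v + c directly in image coordinates:

    import numpy as np

    def fit_plane(us, vs, depths):
        """Least-squares plane z = a*u + b*v + c over one segmented region;
        the coefficients (a, b, c) replace the raw depth values."""
        A = np.column_stack([us, vs, np.ones_like(us)])
        coeffs, *_ = np.linalg.lstsq(A, depths, rcond=None)
        return coeffs

    us, vs = np.meshgrid(np.arange(8), np.arange(8))
    z = 0.1 * us + 0.02 * vs + 3.0
    print(fit_plane(us.ravel(), vs.ravel(), z.ravel()))   # ~ [0.1, 0.02, 3.0]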

}, author = {Maceira, M. and Morros, J.R. and Ruiz-Hidalgo, J.} } @mastersthesis {xGirbau15, title = {Region-based Particle Filter}, year = {2015}, abstract = {

In this project, the implementation of a video object tracking technique based on a particle filter that uses the partitions of the various frames in the video has been tackled. This is an extension of the standard particle filter tracker in which unions of regions of the image are used to generate particles. By doing so, the tracking of the object of interest through the video sequence is expected to be done in a more accurate and robust way. One of the main parts of this video object tracker is a co-clustering technique that allows having an initial estimation of the object in the current frame, relying on the instance of the same object in a previous frame. While developing the object tracker, we realized the importance of this co-clustering technique, not only in the context of the current video tracker but as a basic tool for several of the research projects in the image group. Therefore, we decided to concentrate on the implementation of a generic, versatile co-clustering technique instead of the simple version that was necessary for the tracking problem. Thus, the main goal of this project is to implement the presented co-clustering method accurately while keeping a low computation time. Moreover, the complete region-based particle filter for tracking purposes is presented. The aim of this Final Degree Project is, mainly, to give a guideline to future researchers who will use this algorithm, and to help them understand and apply the mentioned co-clustering in any project in need of this method.

}, url = {http://upcommons.upc.edu/bitstream/handle/2099.1/25370/Andreu_Girbau_Xalabarder_TFG.pdf?sequence=4\&isAllowed=y}, author = {Girbau Xalabarder, A.}, editor = {David Varas and Marqu{\'e}s, F.} } @mastersthesis {xBosch, title = {Region-oriented Convolutional Networks for Object Retrieval}, year = {2015}, abstract = {

Advisors: Amaia Salvador and\ Xavier Gir{\'o}-i-Nieto\ (UPC)\ 

Study program: Engineering on Audiovisual Systems (4 years) at Escola d{\textquoteright}Enginyeria de Terrassa\ (UPC)

Grade: A (9.6/10)

This thesis is framed in the computer vision field, addressing a challenge related to instance search. Instance search consists in searching for occurrences of a certain visual instance in a large collection of visual content, and generating a ranked list of results sorted according to their relevance to a user query. This thesis builds on existing work presented at the TRECVID Instance Search Task in 2014, and explores the use of local deep learning features extracted from object proposals. The performance of different deep learning architectures (at both global and local scales) is evaluated, and a thorough comparison of them is performed. Secondly, this thesis presents the guidelines to follow in order to fine-tune a convolutional neural network for tasks such as image classification, object detection and semantic segmentation. It does so with the final purpose of fine-tuning SDS, a CNN trained for both object detection and semantic segmentation, with the recently released Microsoft COCO dataset.

}, author = {Fontdevila-Bosch, Eduard}, editor = {Amaia Salvador and Xavier Gir{\'o}-i-Nieto} } @article {aVilaplana, title = {Saliency Maps on Image Hierarchies}, journal = {Signal Processing: Image Communication. Special Issue on Recent Advances in Saliency Models, Applications and Evaluations}, volume = {38}, year = {2015}, pages = {84-99}, abstract = {

In this paper we propose two saliency models for salient object segmentation based on a hierarchical image segmentation, a tree-like structure that represents regions at different scales from the details to the whole image (e.g. gPb-UCM, BPT). The first model is based on a hierarchy of image partitions. The saliency at each level is computed on a region basis, taking into account the contrast between regions. The maps obtained for the different partitions are then integrated into a final saliency map. The second model directly works on the structure created by the segmentation algorithm, computing saliency at each node and integrating these cues in a straightforward manner into a single saliency map. We show that the proposed models produce high quality saliency maps. Objective evaluation demonstrates that the two methods achieve state-of-the-art performance in several benchmark datasets.
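
As a minimal sketch of the region-contrast idea (assuming each region is summarized by a mean color and a size; the models in the paper combine richer cues), per-region saliency can be computed as size-weighted color contrast:

import numpy as np

def region_contrast_saliency(mean_colors, region_sizes):
    # Saliency of region i: color contrast to every other region, weighted
    # by region size (the distance of a region to itself is zero).
    sal = np.zeros(len(mean_colors))
    for i, c in enumerate(mean_colors):
        dist = np.linalg.norm(mean_colors - c, axis=1)
        sal[i] = np.dot(region_sizes, dist)
    return sal / (sal.max() + 1e-12)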

}, issn = {0923-5965}, author = {Ver{\'o}nica Vilaplana} } @mastersthesis {xLidon, title = {Semantic and Diverse Summarization of Egocentric Photo Events}, year = {2015}, abstract = {

Student: Aniol Lidon

Advisors: Petia Radeva (UB) and Xavier Gir{\'o}-i-Nieto (UPC)

Program: Master in Computer Vision

Grade: A (9.8/10.0)

This project generates visual summaries of events depicted in egocentric photos taken with a wearable camera. These summaries are addressed to mild-dementia patients in order to exercise their memory on a daily basis. The main contribution is an iterative approach that guarantees the semantic diversity of the summary, together with a novel soft metric to assess subjective results. Medical experts validated the proposed solution with a Mean Opinion Score of 4.6 out of 5.0. The flexibility and quality of the solution were also tested in the 2015 Retrieving Diverse Social Images Task of the international scientific benchmark MediaEval.

}, author = {Lidon, Aniol}, editor = {Radeva, Petia and Xavier Gir{\'o}-i-Nieto} } @conference {cSalembier15, title = {Study of Binary Partition Tree Pruning Techniques for Polarimetric SAR images}, booktitle = {International Symposium on Mathematical Morphology, ISMM 2015}, year = {2015}, month = {05/2015}, publisher = {Springer}, organization = {Springer}, address = {Reykjavik, Iceland}, isbn = {978-3-319-18719-8}, author = {Salembier, P.} } @conference {Bell1512:Study, title = {Study of Normalization and Aggregation Approaches for Consensus Network Estimation}, booktitle = {2015 IEEE Symposium Series on Computational Intelligence: IEEE Symposium on Artificial Life (2015 IEEE ALIFE)}, year = {2015}, address = {Cape Town, South Africa}, abstract = {

Inferring gene regulatory networks from expression data is a very difficult problem that has raised the interest of the scientific community. Different algorithms have been proposed to try to solve this issue, but it has been shown that the different methods have particular biases and strengths, and none of them is the best across all types of data and datasets. As a result, the idea of aggregating various network inferences through a consensus mechanism naturally arises. In this paper, a common framework to standardize already proposed consensus methods is presented, and based on this framework different proposals are introduced and analyzed in two different scenarios: Homogeneous and Heterogeneous. The first scenario reflects situations where the networks to be aggregated are rather similar because they are obtained with inference algorithms working on the same data, whereas the second scenario deals with very diverse networks because various sources of data are used to generate the individual networks. A procedure for combining multiple network inference algorithms is analyzed in a systematic way. The results show that there is a very significant difference between these two scenarios, and that the best way to combine networks in the Heterogeneous scenario is not the most commonly used one. We show in particular that aggregation in the Heterogeneous scenario can be very beneficial if the individual networks are combined with our newly proposed method ScaleLSum.
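
As a hedged illustration of consensus aggregation (a generic rank-average baseline; ScaleLSum itself is not reproduced here), edge-score matrices from different inference methods can be combined as follows:

import numpy as np
from scipy.stats import rankdata

def rank_average_consensus(networks):
    # Rank-transform each edge-score matrix (to make methods comparable)
    # and average the ranks into a single consensus network.
    ranks = [rankdata(w).reshape(w.shape) for w in networks]
    return np.mean(ranks, axis=0)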

}, author = {P. Bellot and Salembier, P. and Albert Oliveras and Meyer, P. E.} } @conference {cLin15, title = {Time consistent estimation of End-effectors from RGB-D data}, booktitle = {Image and Video Technology: 7th Pacific-Rim Symposium, PSIVT 2015, Auckland, New Zealand, November 25-27, 2015, Revised Selected Papers}, number = {9431}, year = {2015}, month = {12/2015}, pages = {529-543}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, address = {Cham}, abstract = {

End-effectors are usually related to the location of the free end of a kinematic chain. Each of them contains rich structural information about the entity. Hence, estimating stable end-effectors of different entities enables robust tracking as well as a generic representation. In this paper, we present a system for end-effector estimation from RGB-D stream data. Instead of relying on a specific pose or configuration for initialization, we exploit time coherence without making any assumptions about prior knowledge. This makes the estimation process more robust in a predict-update framework. Qualitative and quantitative experiments against the reference method show promising results.

}, keywords = {End-effector estimation; Time coherence; Topology representation}, isbn = {978-3-319-29451-3}, doi = {10.1007/978-3-319-29451-3_42}, author = {X. Lin and Casas, J. and M. Pard{\`a}s} } @conference {cIndia15, title = {UPC System for the 2015 MediaEval Multimodal Person Discovery in Broadcast TV task}, booktitle = {MediaEval 2015 Workshop}, year = {2015}, month = {09/2015}, address = {Wurzen, Germany}, abstract = {

This paper describes a system to identify people in broadcast TV shows in a purely unsupervised manner. The system outputs the identity of people that appear, talk and can be identified by using information appearing in the show (in our case, text with person names). Three types of monomodal technologies are used: speech diarization, video diarization and text detection / named entity recognition. These technologies are combined using a linear programming approach where some restrictions are imposed.

}, author = {India, M. and David Varas and Ver{\'o}nica Vilaplana and Morros, J.R. and Hernando, J.} } @conference {cLidon, title = {UPC-UB-STP @ MediaEval 2015 Diversity Task: Iterative Reranking of Relevant Images}, booktitle = {MediaEval 2015 Workshop}, year = {2015}, month = {09/2015}, abstract = {

This paper presents the results of the UPC-UB-STP team in the 2015 MediaEval Retrieving Diverse Images Task. The goal of the challenge is to provide a ranked list of Flickr photos for a predefined set of queries. Our approach firstly generates a ranking of images based on a query-independent estimation of their relevance. Only the top results are kept and iteratively re-ranked based on their intra-similarity to introduce diversity.
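
The re-ranking stage can be sketched as a greedy loop that repeatedly selects, among the remaining relevant images, the one least similar to those already chosen (relevance scores and the similarity matrix are assumed precomputed; this is our simplified reading of the approach):

import numpy as np

def diversify(relevance, sim, k):
    # Keep the most relevant image first, then greedily add the image
    # whose maximum similarity to the current selection is smallest.
    remaining = list(np.argsort(-relevance))
    selected = [remaining.pop(0)]
    while remaining and len(selected) < k:
        nxt = min(remaining, key=lambda i: sim[i, selected].max())
        remaining.remove(nxt)
        selected.append(nxt)
    return selected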

}, url = {http://ceur-ws.org/Vol-1436/}, author = {Lidon, Aniol and Bola{\~n}os, Marc and Seidl, Markus and Xavier Gir{\'o}-i-Nieto and Radeva, Petia and Zeppelzauer, Matthias} } @article {aTochon15, title = {On the use of binary partition trees for the tree crown segmentation of tropical rainforest hyperspectral images}, journal = {Remote Sensing of Environment}, volume = {159}, year = {2015}, month = {03/2015}, pages = {318-331}, abstract = {

The segmentation of remotely sensed images acquired over tropical forests is of great interest for numerous ecological applications, such as forest inventories or conservation and management of ecosystems, for which species classification techniques and estimation of the number of individuals are highly valuable inputs. In this paper, we propose a method for hyperspectral image segmentation, based on the binary partition tree (BPT) algorithm, and we apply it to two sites located in Hawaiian and Panamanian tropical rainforests. Different strategies combining spatial and spectral dimensionality reduction are compared prior to the construction of the BPT. Various superpixel generation methods including watershed transformation and mean shift clustering are applied to decrease spatial dimensionality and provide an initial segmentation map. Principal component analysis is performed to reduce the spectral dimensionality and different combinations of principal components are compared. A non-parametric region model based on histograms, combined with the diffusion distance to merge regions, is used to build the BPT. An adapted pruning strategy based on the size discontinuity of the merging regions is proposed and compared with an already existing pruning strategy. Finally, a set of criteria to assess the quality of the tree segmentation is introduced. The proposed method correctly segmented up to 68\% of the tree crowns and produced reasonable patterns of the segmented landscapes.

}, issn = {0034-4257}, author = {G. Tochon and J.B. F{\'e}eret and Valero, S. and R.E. Martin and D.E. Knapp and Salembier, P. and Chanussot, J. and G. Asner} } @conference {cRoldan-Carlos, title = {Visual Information Retrieval in Endoscopic Video Archives}, booktitle = {IEEE/ACM International Workshop on Content-Based Multimedia Indexing - CBMI 2015 }, year = {2015}, month = {06/2015}, address = {Prague, Czech Republic}, abstract = {

In endoscopic procedures, surgeons work with live video streams from the inside of their subjects. A main source of documentation of procedures is still frames from the video, identified and taken during the surgery. However, with growing demands and technical means, the streams are saved to storage servers and the surgeons need to retrieve parts of the videos on demand. In this submission we present a demo application allowing for video retrieval based on visual features and late fusion, which allows surgeons to re-find shots taken during the procedure.


Presented in the Special Session on Medical Multimedia Processing (acceptance rate for special sessions: 55\%).

}, url = {http://arxiv.org/abs/1504.07874}, author = {Roldan-Carlos, Jennifer and Lux, Mathias and Xavier Gir{\'o}-i-Nieto and Pia Mu{\~n}oz-Trallero and Anagnostopoulos, Nektarios} } @mastersthesis {xPana, title = {Visual Saliency Prediction using Deep learning Techniques}, year = {2015}, abstract = {

Advisor: Xavier Gir{\'o}-i-Nieto (UPC)

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Grade: A with honors (9.9/10.0)

A saliency map is a model that predicts eye fixations on a visual scene; in other words, it is a prediction of the salient areas in an image. Saliency prediction has traditionally been addressed with hand-crafted features inspired by neuroscience principles. This work, however, addresses the problem with a completely data-driven approach by training a convolutional network. The recent publication of large datasets for saliency prediction has provided enough data to train a not very deep network architecture which is both fast and accurate. In our system, named JuntingNet, the learning process is formulated as the minimization of a loss function that measures the Euclidean distance between the predicted saliency map and the provided ground truth. JuntingNet won the CVPR Large-scale Scene UNderstanding (LSUN) 2015 challenge on saliency prediction with a superior performance in all considered metrics.
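
The abstract does not detail the architecture, so the following is only a minimal PyTorch sketch of a shallow convolutional saliency regressor trained with the Euclidean (MSE) loss it describes, not JuntingNet itself:

import torch
import torch.nn as nn

# Illustrative shallow saliency regressor (not the actual JuntingNet).
model = nn.Sequential(
    nn.Conv2d(3, 32, 5, padding=2), nn.ReLU(),
    nn.MaxPool2d(2),
    nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(),
    nn.Upsample(scale_factor=2, mode='bilinear', align_corners=False),
    nn.Conv2d(64, 1, 1), nn.Sigmoid(),
)
loss_fn = nn.MSELoss()  # Euclidean distance to the ground-truth map
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

images = torch.rand(4, 3, 96, 128)        # toy batch of images
ground_truth = torch.rand(4, 1, 96, 128)  # toy ground-truth saliency maps
optimizer.zero_grad()
loss = loss_fn(model(images), ground_truth)
loss.backward()
optimizer.step()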


See https://imatge.upc.edu/web/resources/end-end-convolutional-networks-saliency-prediction-software.

}, author = {Pan, Junting}, editor = {Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xRoldan-Carlos, title = {Visual Search for Musical Performances and Endoscopic Videos}, year = {2015}, abstract = {

Advisors: Mathias Lux (Klagenfurt University) and\ Xavier Gir{\'o}-i-Nieto\ (UPC)

Degree: Electronic Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)

Grade: A (9.5/10)

This project explores the potential of LIRE, an existing Content-Based Image Retrieval (CBIR) system, when used to retrieve medical videos. These videos are recordings of the live streams used by surgeons during endoscopic procedures, captured from inside the subject. The growth of such video content stored in servers requires search engines capable of assisting surgeons in their management and retrieval. In our tool, queries are formulated by visual examples, which allow surgeons to re-find shots taken during the procedure. This thesis presents an extension and adaptation of LIRE for video retrieval based on visual features and late fusion. The results are assessed from two perspectives: a quantitative and a qualitative one. While the quantitative one follows the standard practices and metrics for video retrieval, the qualitative assessment has been based on an empirical social study using a semi-interactive web interface. In particular, a thinking-aloud test was applied to analyze whether the user expectations and requirements were fulfilled. Due to the scarcity of surgeons available for the qualitative tests, a second domain was also addressed: videos captured at musical performances. This type of video has also experienced an exponential growth with the advent of affordable multimedia smartphones, available to a large audience. Analogously to the endoscopic videos, searching in a large dataset of such videos is a challenging topic.

}, url = {http://hdl.handle.net/2099.1/26032}, author = {Roldan-Carlos, Jennifer}, editor = {Xavier Gir{\'o}-i-Nieto and Lux, Mathias} } @mastersthesis {xMestre, title = {Visual Summary of Egocentric Photostreams by Representative Keyframes}, year = {2015}, abstract = {

Studies: Bachelor degree in Engineering of Audiovisual Systems at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Grade: A with honors (9.6/10)

This Final Degree Work approaches the problem of visually summarizing sets of images captured by an egocentric camera for lifelogging purposes. First, we group the images (which represent a day of a person{\textquoteright}s life) into distinguishable and significant events. For this purpose, we use visual features extracted with the software Caffe. Second, we explain the design of techniques for extracting representative images through similarity graphs. Finally, we analyze the assessment scores given by different users to whom we presented the visual summaries obtained in this project. We achieved 60\% favorable opinions on the quality of the visual summaries obtained with the techniques developed in this project.

(This thesis report is written in Catalan.)


}, keywords = {computer Vision, egocentric image, image processing, lifeblogging, summarization, wearable camera}, author = {Mestre, Ricard}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cMestre, title = {Visual Summary of Egocentric Photostreams by Representative Keyframes}, booktitle = {IEEE International Workshop on Wearable and Ego-vision Systems for Augmented Experience (WEsAX) 2015}, year = {2015}, month = {07/2015}, address = {Turin, Italy}, abstract = {

Building a visual summary from an egocentric photostream captured by a lifelogging wearable camera is of high interest for different applications (e.g. memory reinforcement). In this paper, we propose a new summarization method based on keyframe selection that uses visual features extracted by means of a convolutional neural network. Our method applies unsupervised clustering to divide the photostream into events, and finally extracts the most relevant keyframe for each event. We assess the results by applying a blind taste test to a group of 20 people who rated the quality of the summaries.
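
A compact sketch of the clustering-plus-keyframe idea, assuming CNN frame features and using k-means as a stand-in for the unsupervised clustering used in the paper:

import numpy as np
from sklearn.cluster import KMeans

def summarize(features, n_events):
    # Cluster frame features into events; the keyframe of each event is
    # the frame whose feature vector lies closest to the cluster centroid.
    km = KMeans(n_clusters=n_events, n_init=10).fit(features)
    keyframes = []
    for c in range(n_events):
        members = np.flatnonzero(km.labels_ == c)
        dists = np.linalg.norm(features[members] - km.cluster_centers_[c], axis=1)
        keyframes.append(int(members[np.argmin(dists)]))
    return sorted(keyframes)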

}, keywords = {egocentric, keyframes, lifelogging, summarization}, url = {http://arxiv.org/abs/1505.01130}, author = {Bola{\~n}os, Marc and Mestre, Ricard and Talavera, Estefan{\'\i}a and Xavier Gir{\'o}-i-Nieto and Radeva, Petia} } @conference {cMartinez14, title = {3D Shape Reconstruction from a Humanoid Generated Video Sequence}, booktitle = {IEEE International Conference on Humanoid Robots}, year = {2014}, address = {Madrid}, abstract = {

This paper presents a strategy for estimating the geometry of an object of interest from a monocular video sequence acquired by a walking humanoid robot. The problem is solved using a space carving algorithm, which relies on both the accurate extraction of the occluding boundaries of the object and the precise estimation of the camera pose for each video frame. For data acquisition, a monocular vision-based control has been developed that drives the trajectory of the robot around an object placed on a small table. Due to the stepping of the humanoid, the recorded sequence is contaminated with artefacts that affect the correct extraction of contours along the video frames. To overcome this issue, a method that assigns a fitness score to each frame is proposed, delivering a subset of camera poses and video frames that produce consistent 3D shape estimations of the objects used for experimental evaluation.

}, author = {Mart{\'\i}nez, P. A. and David Varas and Castel{\'a}n, M and Camacho, M and Marqu{\'e}s, F. and Arechavaleta, G.} } @mastersthesis {xQueralt14, title = {Automatic Human Detection and Tracking for Robust Video Sequence Annotation}, year = {2014}, author = {Ramon Llorca Queralt}, editor = {Morros, J.R. and David Varas} } @mastersthesis {xEscue, title = {Bundling interest points for object classification}, year = {2014}, abstract = {

Advisors: Xavier Giro-i-Nieto and Carles Ventura-Royo


This Bachelor of Science thesis addresses the problem of image classification by combining two popular visual representations: points and regions. Firstly, the study explores bundling interest points into regions. These regions are generated from an initial SLIC partition and a Binary Partition Tree (BPT), considering different scales of resolution in the segmentation. Secondly, it explores modelling visual classes as groups of points extracted from different images. Based on Naive-Bayes Nearest Neighbor (NBNN), we use 1-Nearest Neighbor with SURF descriptors on the 17 Category Flower Dataset, with 1360 images of flowers distributed into 17 classes, 80 images per class. We have verified that grouping interest points of the same class improves the F1-score by 9.2\%. However, bundling interest points into regions using segmentation worsens the F1-score by between 1\% and 7\%, depending on the number of regions in the segmentation.
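
The NBNN-style decision rule can be sketched as follows (illustrative only; descriptor extraction and the bundling itself are assumed done elsewhere): for each class, sum the distances from every query descriptor to its nearest descriptor of that class, and pick the class with the smallest total.

import numpy as np

def nbnn_classify(query_descriptors, class_descriptors):
    # class_descriptors: dict mapping label -> (m, d) array of pooled
    # descriptors of that class; query_descriptors: (n, d) array.
    totals = {}
    for label, descs in class_descriptors.items():
        d2 = ((query_descriptors[:, None, :] - descs[None, :, :]) ** 2).sum(-1)
        totals[label] = d2.min(axis=1).sum()
    return min(totals, key=totals.get)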


Author{\textquoteright}s website: jordisanchez.info

Grade: A (9.3/10)

(This BSc thesis was written in Catalan.)


}, keywords = {Bundling interest points, Digital Images, image classification, Images, Nearest Neighbor, SURF}, url = {http://hdl.handle.net/2099.1/22672}, author = {S{\'a}nchez-Escu{\'e}, Jordi}, editor = {Ventura, C. and Xavier Gir{\'o}-i-Nieto} } @conference {cCarlier, title = {Click{\textquoteright}n{\textquoteright}Cut: Crowdsourced Interactive Segmentation with Object Candidates}, booktitle = {3rd International ACM Workshop on Crowdsourcing for Multimedia (CrowdMM)}, year = {2014}, month = {11/2014}, address = {Orlando, Florida (USA)}, abstract = {

This paper introduces Click{\textquoteright}n{\textquoteright}Cut, a novel web tool for interactive object segmentation addressed to crowdsourcing tasks. Click{\textquoteright}n{\textquoteright}Cut combines bounding boxes and clicks generated by workers to obtain accurate object segmentations. These segmentations are created by combining precomputed object candidates in a light computational fashion that allows an immediate response from the interface. Click{\textquoteright}n{\textquoteright}Cut has been tested with a crowdsourcing campaign to annotate a subset of the Berkeley Segmentation Dataset (BSDS). Results are competitive with the state of the art, especially in time to converge to a high quality segmentation. The data collection campaign included gold standard tests to detect cheaters.
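
A rough sketch of combining worker clicks with precomputed candidates (the scoring rule below is our simplification, not the exact Click{\textquoteright}n{\textquoteright}Cut combination):

def best_candidate(candidates, fg_clicks, bg_clicks):
    # Rank precomputed binary candidate masks by click agreement:
    # +1 per foreground click inside the mask, -1 per background click inside.
    def score(mask):
        fg = sum(mask[y, x] for x, y in fg_clicks)
        bg = sum(mask[y, x] for x, y in bg_clicks)
        return fg - bg
    return max(candidates, key=score)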


}, keywords = {Crowdsourcing, figure-ground segmentation, human computing, object candidates}, doi = {10.1145/2660114.2660125}, url = {http://dx.doi.org/10.1145/2660114.2660125}, author = {Carlier, Axel and Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Marques, Oge and Charvillat, Vincent} } @mastersthesis {xGutierrez14, title = {Comparaci{\'o} d{\textquoteright}algoritmes de classificaci{\'o} de tipus de pla en imatges de futbol}, year = {2014}, abstract = {

The purpose of this project is to analyze and evaluate football image viewpoint classification algorithms and to implement, if possible, a set of upgrades to improve the results of this classification. In particular, we analyze a state-of-the-art publication and compare it to an algorithm created by the Image Processing Group (GPI) of the Polytechnic University of Catalonia. During the analysis of sporting events, it is sometimes interesting to automate the extraction of semantic content. In this context, the algorithms compared in this project are football image viewpoint classification algorithms. To carry out the classification, these algorithms use different descriptors calculated on the images. This project originates from the need to compare the Image Processing Group algorithm to current techniques, in order to assess the obtained results and have a better understanding of the current state of the art. Thus, the state-of-the-art algorithm is implemented, analyzed and improved where possible. Then, it is compared to the one created by the Image Processing Group using an extensive, previously selected database. Finally, the results are presented and analyzed.

}, url = {http://upcommons.upc.edu/bitstream/handle/2099.1/20890/Final_Report.pdf?sequence=4\&isAllowed=y}, author = {Coronado Guti{\'e}rrez, D}, editor = {David Varas and Marqu{\'e}s, F.} } @mastersthesis {xTella, title = {Contextless Object Recognition with Shape-enriched SIFT and Bags of Features}, year = {2014}, abstract = {

Advisors:\ Xavier Gir{\'o}-i-Nieto\ (UPC) and Matthias Zeppelzauer\ (TU Wien)

Degree: Telecommunications Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)

Currently, there are highly competitive results in the field of object recognition based on the aggregation of point-based features. The aggregation process, typically an average or max-pooling of the features, generates a single vector that represents the image or region that contains the object.

The aggregated point-based features typically describe the texture around the points with descriptors such as SIFT. These descriptors present limitations for wired and textureless objects. A possible solution is the addition of shape-based information. Shape descriptors have previously been used to encode shape information and thus recognise those types of objects, but generally an alignment step is required to match every point of one shape to those of another, making the similarity assessment computationally expensive.

We propose to enrich location- and texture-based features with shape-based ones. Two main architectures are explored: on the one hand, enriching the SIFT descriptors with shape information before they are aggregated; on the other hand, creating the standard Bag of Words histogram and concatenating a shape histogram, classifying them as a single vector.

We evaluate the proposed techniques and the novel features on the Caltech-101 dataset.

Results show that shape features increase the final performance. Our extension of the Bag of Words with a shape-based histogram (BoW+S) results in better performance. However, for a high number of shape features, the BoW+S and enriched SIFT architectures tend to converge.

Final grade: A with honors (10/10)

}, keywords = {Bag of Words, Interest Points, object candidates, Object detection, segmentation, shape coding, SIFT, textureless objects, wired objects.}, url = {http://hdl.handle.net/2099.1/22390}, author = {Tella, Marcel}, editor = {Zeppelzauer, Matthias and Xavier Gir{\'o}-i-Nieto} } @article {aPaloua, title = {Depth order estimation for video frames using motion occlusions}, journal = {IET Computer Vision}, volume = {8}, year = {2014}, month = {04/2014}, pages = {152-160}, doi = {10.1049/iet-cvi.2012.0287}, author = {Palou, G. and Salembier, P.} } @article {cBellot Pujalte14, title = {Efficient combination of pairwise feature networks}, journal = {JMLR: Workshop and Conference Proceedings }, volume = {46}, year = {2014}, month = {10/2015}, pages = {77 - 84}, publisher = {JMLR: Workshop and Conference Proceedings}, address = {Nancy, France}, abstract = {

This paper presents a novel method for the reconstruction of neural network connectivity using calcium fluorescence data. We introduce a fast unsupervised method that integrates different networks to reconstruct structural connectivity from neuron activity. Our method improves on the state-of-the-art reconstruction method General Transfer Entropy (GTE). We are able to better eliminate indirect links, therefore improving the quality of the network, via a normalization and ensemble process of GTE and three new informative features. The approach is based on a simple combination of networks, which is remarkably fast. The performance of our approach is benchmarked on simulated time series provided at the connectomics challenge and was also submitted to the public competition.

}, keywords = {connectomes, elimination of indirect links, network reconstruction algorithms}, author = {P. Bellot and Meyer, P. E.} } @conference {cBosch14, title = {An Epipolar-Constrained Prior for Efficient Search in Multi-View Scenarios}, booktitle = {EUSIPCO}, year = {2014}, month = {09/2014}, address = {Lisbon}, abstract = {

In this paper we propose a novel framework for fast exploitation of multi-view cues, applicable to different image processing problems. To bring the proposed framework into practice, an epipolar-constrained prior is presented, on which a random search algorithm is proposed to find good matches among the different views of the same scene. This algorithm includes a generalization of the local coherency of 2D images to multi-view wide-baseline cases. Experimental results show that the geometrical constraint allows a faster initial convergence when finding good matches. We present some applications of the proposed framework to classical image processing problems.

}, keywords = {approximate nearest neighbor, deblurring, epipolar line, Super resolution}, author = {Bosch, I. and Salvador, J. and E. Perez-Pellitero and Ruiz-Hidalgo, J.} } @mastersthesis {xSalvador, title = {Exploiting User Interaction and Object Candidates for Instance Retrieval and Object Segmentation}, year = {2014}, abstract = {

Author: Amaia Salvador-Aguilera

Advisors:\ Xavier Gir{\'o}-i-Nieto\ (UPC) and Kevin McGuinness (Dublin City University)

Degree:\ Master in Computer Vision\ (1 year)


This thesis addresses two of the main challenges nowadays in computer vision: object segmentation and visual instance retrieval. The methodologies proposed to solve both problems are based on the use of object candidates and human computation in the computer vision loop. On the object segmentation side, this work explores how human computation can be useful to achieve better segmentation results, by combining users{\textquoteright} traces with a segmentation algorithm based on object candidates. On the other hand, the instance retrieval problem is also addressed using object candidates to compute local features, involving the user in the retrieval loop by applying relevance feedback strategies.

}, keywords = {computer Vision, human computing, instance search, object candidates, segmentation}, author = {Amaia Salvador}, editor = {Xavier Gir{\'o}-i-Nieto and Kevin McGuinness} } @conference {cPerez-Pellitero14, title = {Fast Super-Resolution via Dense Local Training and Inverse Regressor Search}, booktitle = {Asian Conference in Computer Vision (ACCV)}, year = {2014}, month = {11/2014}, address = {Singapore}, abstract = {

Regression-based Super-Resolution (SR) addresses the upscaling problem by learning a mapping function (i.e. regressor) from the low-resolution to the high-resolution manifold. Under the locally linear assumption, this complex non-linear mapping can be properly modeled by a set of linear regressors distributed across the manifold. In such methods, most of the testing time is spent searching for the right regressor within this trained set. In this paper we propose a novel inverse-search approach for regression-based SR. Instead of performing a search from the image to the dictionary of regressors, the search is done inversely from the regressors{\textquoteright} dictionary to the image patches. We approximate this framework by applying spherical hashing to both image and regressors, which reduces the inverse search into computing a trained function. Additionally, we propose an improved training scheme for SR linear regressors which improves perceived and objective quality. By merging both contributions we improve speed and quality compared to the state-of-the-art.

}, author = {E. Perez-Pellitero and Salvador, J. and Torres-Xirau, I. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @article {aGiro-i-Nieto13, title = {From Global Image Annotation to Interactive Object Segmentation}, journal = {Multimedia Tools and Applications}, volume = {70}, year = {2014}, month = {05/2014}, chapter = {475}, abstract = {

This paper presents a graphical environment for the annotation of still images that works both at the global and local scales. At the global scale, each image can be tagged with positive, negative and neutral labels referring to a semantic class from an ontology. These annotations can be used to train and evaluate an image classifier. A finer annotation at a local scale is also available for the interactive segmentation of objects. This process is formulated as a selection of regions from a precomputed hierarchical partition called a Binary Partition Tree. Three different semi-supervised methods have been presented and evaluated: bounding boxes, scribbles and hierarchical navigation. The implemented Java source code is published under a free software license.

}, keywords = {annotation, Hierarchical, Interaction, Multiscale, segmentation}, doi = {10.1007/s11042-013-1374-3}, author = {Xavier Gir{\'o}-i-Nieto and Martos, Manel and Mohedano, Eva and Jordi Pont-Tuset} } @phdthesis {dBosio14, title = {Hierarchical information representation and efficient classification of gene expression microarray data}, volume = {PhD Thesis}, year = {2014}, month = {06/2014}, pages = {150}, school = {Universitat Politecnica de Catalunya}, address = {Barcelona}, abstract = {

In the field of computational biology, microarrays are used to measure the activity of thousands of genes at once and create a global picture of cellular function. Microarrays allow scientists to analyze the expression of many genes in a single experiment quickly and efficiently. Even if microarrays are a consolidated research technology nowadays, and the trends in high-throughput data analysis are shifting towards new technologies like Next Generation Sequencing (NGS), an optimal method for sample classification has not been found yet.

Microarray classification is a complicated task, not only due to the high dimensionality of the feature set, but also due to an apparent lack of data structure. This characteristic limits the applicability of processing techniques, such as wavelet filtering or other filtering techniques that take advantage of known structural relations. On the other hand, it is well known that genes are not expressed independently of each other: genes have a high interdependence related to the regulating biological processes involved.

This thesis aims to improve the current state of the art in microarray classification and to contribute to understanding how signal processing techniques can be developed and applied to analyze microarray data. Building a classification framework requires exploratory work in which algorithms are constantly tried and adapted to the analyzed data. The algorithms and classification frameworks developed in this thesis tackle the problem with two essential building blocks. The first one deals with the lack of a priori structure by inferring a data-driven structure with unsupervised hierarchical clustering tools. The second key element is a proper feature selection tool to produce a precise classifier as output and to reduce the risk of overfitting.

The main focus of this thesis is binary data classification, a field in which we obtained relevant improvements to the state of the art. The first key element is the data-driven structure, obtained by modifying hierarchical clustering algorithms derived from the Treelets algorithm from the literature. Several alternatives to the original reference algorithm have been tested, changing either the similarity metric used to merge features or the way two features are merged. Moreover, the possibility of including external sources of information from publicly available biological knowledge and ontologies to improve the structure generation has been studied too. Regarding feature selection, two alternative approaches have been studied: the first one is a modification of the IFFS algorithm as a wrapper feature selection, while the second approach is based on ensemble learning. To obtain good results, the IFFS algorithm has been adapted to the data characteristics by introducing new elements to the selection process, like a reliability measure and a scoring system to better select the best feature at each iteration. The second feature selection approach takes advantage of the feature abundance of microarrays to implement a different selection scheme. New algorithms have been studied in this field, adapting state-of-the-art algorithms to the characteristics of microarray data: small sample counts and high feature numbers.

In addition to the binary classification problem, the multiclass case has been addressed too. A new algorithm combining multiple binary classifiers has been evaluated, exploiting the redundancy offered by multiple classifiers to obtain better predictions.

All the algorithms studied throughout this thesis have been evaluated using high quality publicly available data, following established testing protocols from the literature to offer proper benchmarking against the state of the art.

Whenever possible, multiple Monte Carlo simulations have been performed to increase the robustness of the obtained results.\ 

}, url = {http://hdl.handle.net/10803/145902}, author = {Bosio, M.}, editor = {Salembier, P. and Albert Oliveras} } @phdthesis {dPont-Tuset14, title = {Image Segmentation Evaluation and Its Application to Object Detection}, year = {2014}, month = {0/2014}, school = {Universitat Polit{\`e}cnica de Catalunya, BarcelonaTech}, address = {Barcelona}, abstract = {

The first two parts of this Thesis are focused on the study of the supervised evaluation of image segmentation algorithms. Supervised in the sense that the segmentation results are compared to a human-made annotation, known as ground truth, by means of different measures of similarity. The evaluation depends, therefore, on three main points.

First, the image segmentation techniques we evaluate. We review the state of the art in image segmentation, making an explicit difference between those techniques that provide a flat output, that is, a single clustering of the set of pixels into regions; and those that produce a hierarchical segmentation, that is, a tree-like structure that represents regions at different scales from the details to the whole image.

Second, ground-truth databases are of paramount importance in the evaluation. They can be divided into those annotated only at object level, that is, with marked sets of pixels that refer to objects that do not cover the whole image; or those with annotated full partitions, which provide a full clustering of all pixels in an image. Depending on the type of database, we say that the analysis is done from an object perspective or from a partition perspective.

Finally, the similarity measures used to compare the generated results to the ground truth are what will provide us with a quantitative tool to evaluate whether our results are good, and in which way they can be improved. The main contributions of the first parts of the thesis are in the field of the similarity measures.

First of all, from an object perspective, we review the existing measures to compare two object representations and show that some of them are equivalent. In order to evaluate full partitions and hierarchies against an object, one needs to select which of their regions form the object to be assessed. We review and improve these techniques by means of a mathematical model of the problem. This analysis allows us to show that hierarchies can represent objects much better, with far fewer regions, than flat partitions.

From a partition perspective, the literature about evaluation measures is large and entangled. Our first contribution is to review, structure, and deduplicate the measures available. We provide a new measure that improves previous ones in terms of a set of qualitative and quantitative meta-measures. We also extend the measures on flat partitions to cover hierarchical segmentations.

The third part of this Thesis moves from the evaluation of image segmentation to its application to object detection. In particular, we build on some of the conclusions extracted in the first part to generate segmented object candidates. Given a set of hierarchies, we build the pairs and triplets of regions, we learn to combine the set from each hierarchy, and we rank them using low-level and mid-level cues. We conduct an extensive experimental validation that shows that our method outperforms the state of the art in terms of object segmentation quality and object detection accuracy.

}, url = {http://hdl.handle.net/10803/134354}, author = {Jordi Pont-Tuset}, editor = {Marqu{\'e}s, F.} } @conference {cMcGuinness, title = {Insight Centre for Data Analytics (DCU) at TRECVid 2014: Instance Search and Semantic Indexing Tasks}, booktitle = {2014 TRECVID Workshop}, year = {2014}, month = {11/2014}, publisher = {National Institute of Standards and Technology (NIST)}, organization = {National Institute of Standards and Technology (NIST)}, address = {Orlando, Florida (USA)}, abstract = {

Insight-DCU participated in the instance search (INS) and semantic indexing (SIN) tasks in 2014. Two very different approaches were submitted for instance search: one based on features extracted using pre-trained deep convolutional neural networks (CNNs), and another based on local SIFT features, large-vocabulary visual bag-of-words aggregation, inverted index-based lookup, and geometric verification on the top-N retrieved results. Two interactive runs and two automatic runs were submitted; the best interactive run achieved a mAP of 0.135 and the best automatic run 0.12. Our semantic indexing runs were also based on convolutional neural network features, and on Support Vector Machine classifiers with linear and RBF kernels. One run was submitted to the main task, two to the no-annotation task, and one to the progress task. Data for the no-annotation task was gathered from Google Images and ImageNet. The main task run achieved a mAP of 0.086, the best no-annotation runs performed close to the main run with a mAP of 0.080, while the progress run achieved 0.043.
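
The inverted-index lookup of the SIFT-based run can be sketched as a mapping from visual word to the images containing it (a simplified illustration, not the submitted system):

from collections import defaultdict

def build_inverted_index(bows):
    # bows: one dict per image mapping visual word id -> term frequency.
    index = defaultdict(list)
    for image_id, bow in enumerate(bows):
        for word, tf in bow.items():
            index[word].append((image_id, tf))
    return index

def candidate_images(index, query_words):
    # Images sharing at least one visual word with the query.
    return {image_id for w in query_words for image_id, _ in index.get(w, [])}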


}, url = {http://hdl.handle.net/2117/24915}, author = {Kevin McGuinness and Mohedano, Eva and Zhang, ZhenXing and Hu, Feiyan and Albatal, Rami and Gurrin, Cathal and O{\textquoteright}Connor, N. and Smeaton, Alan F. and Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Ventura, C.} } @mastersthesis {xPereira, title = {An investigation of eye gaze tracking utilities in image object recognition}, year = {2014}, abstract = {

Computer vision has been one of the most revolutionary technologies of the last few decades. This project investigates how to improve an image recognition system (image classifier) using a not widely exploited technology: eye gaze tracking. The aim of this project is to explore the benefits that this technology can bring to an image classifier. The experiment set up in this project is to build a dataset with an eye tracking device and, using differently sized crops of each image based on the eye tracking data, see how the performance of an image classifier is affected by these images. The results are interesting: since smaller images have to be processed with this method, the system is more efficient. Regarding the performance, it is very similar to the one obtained without using any eye tracking data, so it is arguable whether it presents an improvement; this opens new directions of investigation for future work.
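
An illustrative helper for the cropping protocol, assuming fixation coordinates from the eye tracker (the function name and the mean-fixation rule are our own assumptions):

import numpy as np

def gaze_crop(image, fixations, size):
    # Crop a size x size window centred on the mean fixation point.
    fx, fy = np.mean(np.asarray(fixations), axis=0).astype(int)
    h, w = image.shape[:2]
    x0 = int(np.clip(fx - size // 2, 0, max(w - size, 0)))
    y0 = int(np.clip(fy - size // 2, 0, max(h - size, 0)))
    return image[y0:y0 + size, x0:x0 + size]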

}, author = {Imedio-Pereira, Sergi}, editor = {Mohedano, Eva and Xavier Gir{\'o}-i-Nieto and Albatal, Rami and O{\textquoteright}Connor, N.} } @mastersthesis {xManchon-Vizuete, title = {Low computational cost algorithms for photo clustering and mail signature detection in the cloud}, year = {2014}, abstract = {

Advisors: Xavier Gir{\'o}-i-Nieto (UPC) and Omar Pera (Pixable)

Degree: Electronic Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)


This Final Degree thesis summarizes the tasks developed during an internship at Pixable Inc. in New York City, together with the tasks related to the MediaEval 2013 evaluation campaign, where I participated with the team of Universitat Politecnica de Catalunya (UPC). The main focus of my work was on the Photofeed service, a photo archive service in the cloud.

The popularisation of the storage of photos on the cloud has opened new opportunities and challenges for the organization and extension of photo collections. In my thesis I have developed a light computational solution for the clustering of web photos based on social events. The proposal combines a first oversegmentation of the photo collections of each user based on temporal cues, as previously proposed in the PhotoTOC algorithm [Platt et al, PACRIM 2003]. In a second stage, the resulting mini-clusters are merged based on contextual metadata such as geolocation, keywords and user IDs.
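
A simplified sketch of the temporal oversegmentation stage, assuming photo timestamps and a gap-threshold rule standing in for the actual PhotoTOC criterion:

import numpy as np

def temporal_oversegmentation(timestamps, factor=2.0):
    # Start a new mini-cluster whenever the gap between consecutive photos
    # exceeds factor times the mean gap (simplified threshold rule).
    t = np.sort(np.asarray(timestamps, dtype=float))
    gaps = np.diff(t)
    labels = np.concatenate([[0], np.cumsum(gaps > factor * gaps.mean())])
    return labels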

Closely related to photo clustering, mail classification was also studied. Additional tasks were developed in this field for the Contactive company. In order to solve the problems that Contactive was facing in mail analysis tasks, I developed methods for automatically identifying signature blocks and reply lines in plain-text email messages. This analysis has many potential applications, such as preprocessing email for text-to-speech systems, anonymization of email corpora, improving automatic content-based mail classifiers, and email threading. The method is based on applying machine learning to a sequential representation of an email message, in which each email is represented as a sequence of lines, and each line is represented as a set of features.

Final grade: A with honors (10/10)

}, author = {Manchon-Vizuete, Daniel}, editor = {Xavier Gir{\'o}-i-Nieto and Pera, Omar} } @conference {cSalembier14, title = {Low-level processing of PolSAR images with binary partition trees}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium (IGARSS), 2014}, year = {2014}, month = {07/2014}, publisher = {IEEE}, organization = {IEEE}, address = {Quebec, Canada}, abstract = {

This paper discusses the interest of Binary Partition Trees (BPTs) and the usefulness of graph cuts for low-level processing of PolSAR images. BPTs group pixels to form homogeneous regions, which are hierarchically structured by inclusion in a tree. They provide multiple resolutions of description and easy access to subsets of regions. Once constructed, BPTs can be used for many applications including filtering, segmentation, classification and object detection. Many processing strategies consist in populating the tree with a specific feature and in applying a graph-cut called pruning. Different graph-cuts are discussed and analyzed in the context of PolSAR images for speckle filtering and segmentation.

}, author = {Salembier, P. and S. Foucher and L{\'o}pez-Mart{\'\i}nez, C.} } @conference {cMorros14, title = {Materials transversals per a l{\textquoteright}aprenentatge actiu de les mat{\`e}ries de processat d{\textquoteright}imatge i v{\'\i}deo}, booktitle = {Congr{\'e}s Internacional de Doc{\`e}ncia Universit{\`a}ria i Innovaci{\'o} (CIDUI)}, year = {2014}, month = {07/2014}, address = {Tarragona, Spain}, abstract = {

This work aims to promote cooperation and coordination between different image/video processing courses taught at the UPC in order to enhance the learning results. The main contributions are a) the creation of a common set of materials: graphical demonstrators, collections of problems, question banks, etc., and b) the design of strategies to use this material in the development of generic and specific skills, with special emphasis on promoting independent learning.

}, keywords = {image/video processing, Matlab demonstrators, question Banks, specific skills, teaching material}, author = {Morros, J.R. and Ver{\'o}nica Vilaplana and Ruiz-Hidalgo, J. and Casas, J. and Gasull, A. and Marqu{\'e}s, F. and M. Pard{\`a}s and Salembier, P.} } @book {bSchreer14, title = {Media Production, Delivery and Interaction for Platform Independent Systems}, volume = {ISBN 978-1-118-60533-2}, year = {2014}, publisher = {Wiley, ISBN 978-1-118-60533-2}, organization = {Wiley, ISBN 978-1-118-60533-2}, issn = {978-1-118-60533-2}, url = {http://eu.wiley.com/WileyCDA/WileyTitle/productCd-1118605330,subjectCd-EEJ0.html}, author = {Schreer, O. and Macq, J. and Niamut, O. and Ruiz-Hidalgo, J. and Shirley, B. and Thallinger, G. and Thomas, G.} } @mastersthesis {xFerrarons-Betrian, title = {Mobile Visual Search at Catchoom}, year = {2014}, abstract = {

Author: Miquel Ferrarons-Betrian

Advisors:\ Xavier Gir{\'o}-i-Nieto\ (UPC) and Tomasz Adamek\ (Catchoom)

Degree: Master in Computer Vision (1 year)

}, keywords = {feature selection, large-scale visual search, Mobile visual search, synthetic views matching, visual word}, author = {Ferrarons-Betrian, Miquel}, editor = {Adamek, Tomasz and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dPalou14, title = {Monocular Depth Estimation in Images and Sequences using Occlusion Cues}, volume = {PhD}, year = {2014}, month = {02/2014}, pages = {250}, abstract = {

When humans observe a scene, they are able to perfectly distinguish the different parts composing it. Moreover, humans can easily reconstruct the spatial position of these parts and conceive a consistent structure. The mechanisms involved in visual perception have been studied since the beginning of neuroscience but, still today, not all the processes composing it are known. In usual situations, humans can make use of three different methods to estimate the scene structure. The first one is the so-called divergence, which makes use of both eyes. When objects lie in front of the observer at a distance of up to a hundred meters, subtle differences in the image formation in each eye can be used to determine depth. When objects are not in the field of view of both eyes, other mechanisms should be used. In these cases, both visual cues and prior learned information can be used to determine depth. Even if these mechanisms are less accurate than divergence, humans can almost always infer the correct depth structure when using them. As an example of visual cues, occlusion, perspective or object size provide a lot of information about the structure of the scene. A priori information depends on each observer, but it is normally used subconsciously by humans to detect commonly known regions such as the sky, the ground or different types of objects.

In recent years, as technology has become able to handle the processing burden of vision systems, many efforts have been devoted to designing automated scene interpretation systems. In this thesis we address the problem of depth estimation using only one point of view and only occlusion depth cues. The objective is to detect the occlusions present in the scene and combine them with a segmentation system so as to generate a relative depth order map for the scene. We explore both static and dynamic situations such as single images, frames inside sequences, or full video sequences. In the case where a full image sequence is available, a system exploiting motion information to recover the depth structure is also designed. Results are promising and competitive with respect to the state-of-the-art literature, but there is still much room for improvement when compared to human depth perception performance.

}, url = {http://hdl.handle.net/10803/144653}, author = {Palou, G.}, editor = {Salembier, P.} } @conference {cAlonso-Gonzalez14, title = {Multidimensional SAR Data Analysis Based on Binary Partition Trees and the Covariance Matrix Geometry}, booktitle = {International Radar Conference 2014}, year = {2014}, month = {10/2014}, publisher = {SEE}, organization = {SEE}, address = {Lille, France}, abstract = {

In this paper, we propose the use of the Binary Partition Tree (BPT) as a region-based and multi-scale image representation for processing multidimensional SAR data, with special emphasis on polarimetric SAR data. We also show that this approach can be extended to other types of remote sensing imaging technologies, such as hyperspectral imagery. The Binary Partition Tree contains a lot of information about the image structure at different detail levels. At the same time, this structure represents a convenient vehicle to exploit both the statistical and the geometric properties of the multidimensional SAR data given by the covariance matrix. The BPT construction process and its exploitation for PolSAR and temporal information estimation are analyzed in this work. In particular, this work focuses on the speckle noise filtering problem and the temporal characterization of image dynamics. Results with real data are presented to illustrate the capabilities of the BPT processing approach, especially its ability to maintain the spatial resolution and the small details of the image.

}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @conference {cArbelaez14, title = {Multiscale Combinatorial Grouping}, booktitle = {Computer Vision and Pattern Recognition (CVPR)}, year = {2014}, abstract = {

We propose a unified approach for bottom-up hierarchical image segmentation and object candidate generation for recognition, called Multiscale Combinatorial Grouping (MCG). For this purpose, we first develop a fast normalized cuts algorithm. We then propose a high-performance hierarchical segmenter that makes effective use of multiscale information. Finally, we propose a grouping strategy that combines our multiscale regions into highly-accurate object candidates by efficiently exploring their combinatorial space. We conduct extensive experiments on both the BSDS500 and on the PASCAL 2012 segmentation datasets, showing that MCG produces state-of-the-art contours, hierarchical regions, and object candidates.


}, author = {Pablo Arbelaez and Jordi Pont-Tuset and Barron, Jon and Marqu{\'e}s, F. and Jitendra Malik} } @conference {cGallego14a, title = {Multiview Foreground Segmentation using 3D Probabilistic Model}, booktitle = {ICIP, IEEE International Conference on Image Processing}, year = {2014}, month = {10/2014}, abstract = {

We propose a complete multi-view foreground segmentation and 3D reconstruction system that defines a 3-dimensional probabilistic model of the foreground object in the three spatial dimensions, thus gathering the information from all the camera views. This 3D model is projected to each one of the views in order to perform the 2D segmentation with the foreground information shared by all the cameras. Then, for each one of the views, a MAP-MRF classification framework is applied between the projected region-based foreground model, the pixel-wise background model and the region-based shadow model defined for each view. The resulting masks are used to compute the next 3-dimensional reconstruction. This system achieves correct results by reducing the false positive and false negative errors in sequences where some camera sensors can present camouflage situations between foreground and background. Moreover, the use of the 3D model opens possibilities for object recognition or human activity understanding.

}, author = {Gallego, J. and M. Pard{\`a}s} } @article {aBarkhuus14, title = {New interaction modes for rich panoramic live video experiences}, journal = {Behaviour \& Information Technology}, volume = {33}, year = {2014}, month = {07/2014}, chapter = {859-869}, abstract = {

The possibilities of panoramic video are based on the capabilities of high-resolution digital video streams and the opportunities afforded by higher bandwidth to broadcast, stream and transfer large content across platforms. With these opportunities also come challenges, such as how to focus on sub-parts of the video stream and interact with the content shown on a large screen. In this paper, we present studies of two different interaction modes with a large-scale panoramic video for live experiences; we focus on interactional challenges and explore whether it is (1) possible to develop new interactional methods for approaching this type of high-resolution content and (2) feasible for users to interact with the content in these new ways. We developed prototypes for two different interaction modes: an individual system on a mobile device, either a tablet or a mobile phone, for interacting with the content on the same device, and a non-touch gesture-based system for the home or small-group interaction. We present pilot studies where we explore the possibilities and challenges of these two interaction modes for panoramic content.

}, keywords = {interaction modes, interactive television, panoramic video, pilot studies}, doi = {10.1080/0144929X.2014.914975}, url = {http://www.tandfonline.com/doi/full/10.1080/0144929X.2014.914975}, author = {Barkhuus, Louise and Zoric, Goranka and Engstr{\"o}m, Arvid and Ruiz-Hidalgo, J. and Verzijp, Nico} } @conference {cMohedano, title = {Object segmentation in images using EEG signals}, booktitle = {ACM Multimedia}, year = {2014}, month = {11/2014}, address = {Orlando, Florida (USA)}, abstract = {

This paper explores the potential of brain-computer interfaces for segmenting objects from images. Our approach is centered on designing an effective method for displaying the image parts to the users such that they generate measurable brain reactions. When an image region, specifically a block of pixels, is displayed, we estimate the probability of the block containing the object of interest using a score based on EEG activity. After several such blocks are displayed, the resulting probability map is binarized and combined with the GrabCut algorithm to segment the image into object and background regions. This study shows that BCI and simple EEG analysis are useful in locating object boundaries in images.
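To make the final step concrete, the following is a minimal sketch, not the original implementation: an EEG-derived per-block probability map is binarized into a GrabCut seed mask and refined with OpenCV. The block size, the threshold and the block_probs structure are illustrative assumptions.

import cv2
import numpy as np

def segment_from_eeg_map(image, block_probs, block_size=32, threshold=0.5):
    # image: HxW BGR uint8; block_probs: {(block_row, block_col): P(object)}
    h, w = image.shape[:2]
    mask = np.full((h, w), cv2.GC_PR_BGD, np.uint8)  # default: probably background
    for (by, bx), p in block_probs.items():
        label = cv2.GC_PR_FGD if p >= threshold else cv2.GC_PR_BGD
        mask[by * block_size:(by + 1) * block_size,
             bx * block_size:(bx + 1) * block_size] = label
    bgd = np.zeros((1, 65), np.float64)  # internal GMM buffers required by OpenCV
    fgd = np.zeros((1, 65), np.float64)
    cv2.grabCut(image, mask, None, bgd, fgd, 5, cv2.GC_INIT_WITH_MASK)
    return np.isin(mask, (cv2.GC_FGD, cv2.GC_PR_FGD)).astype(np.uint8)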

}, keywords = {Brain-computer interfaces, Electroencephalography, GrabCut algorithm, Interactive segmentation, Object segmentation, rapid serial visual presentation}, doi = {10.1145/2647868.2654896}, url = {http://arxiv.org/abs/1408.4363}, author = {Mohedano, Eva and Healy, Graham and Kevin McGuinness and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Smeaton, Alan F.} } @conference {cManchon-Vizuete, title = {Photo Clustering of Social Events by Extending PhotoTOC to a Rich Context}, booktitle = {ICMR 2014 Workshop on Social Events in Web Multimedia (SEWM)}, year = {2014}, month = {04/2014}, publisher = {ACM}, organization = {ACM}, address = {Glasgow, Scotland}, abstract = {

The popularisation of photo storage on the cloud has opened new opportunities and challenges for the organisation and extension of photo collections. This paper presents a light computational solution for the clustering of web photos based on social events. The proposal combines a first over-segmentation of each user{\textquoteright}s photo collection based on temporal cues, as previously proposed in PhotoTOC. In a second stage, the resulting mini-clusters are merged based on contextual metadata such as geolocation, keywords and user IDs. Results indicate that, although temporal cues are very relevant for event clustering, robust solutions should also consider all these additional features.
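As an illustration of the two-stage scheme described above, here is a hedged sketch under assumed data structures (photo dicts with time, user and tags fields); the gap threshold and the merge rule are illustrative, not the tuned values of the paper:

from datetime import timedelta

def temporal_oversegmentation(photos, max_gap=timedelta(minutes=30)):
    # photos sorted by capture time; a large temporal gap starts a new mini-cluster
    clusters, current = [], [photos[0]]
    for prev, photo in zip(photos, photos[1:]):
        if photo["time"] - prev["time"] > max_gap:
            clusters.append(current)
            current = []
        current.append(photo)
    clusters.append(current)
    return clusters

def should_merge(c1, c2):
    # second stage: merge mini-clusters that share contextual metadata
    shared_users = {p["user"] for p in c1} & {p["user"] for p in c2}
    shared_tags = set().union(*(p["tags"] for p in c1)) \
                  & set().union(*(p["tags"] for p in c2))
    return bool(shared_users) or len(shared_tags) >= 2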


}, keywords = {Clustering, Event Detection, Photo Collections}, url = {http://hdl.handle.net/2117/23009}, author = {Manchon-Vizuete, Daniel and Gris-Sarabia, Irene and Xavier Gir{\'o}-i-Nieto} } @article {aAlonso-Gonzalez, title = {PolSAR Time Series Processing with Binary Partition Trees}, journal = {IEEE Transactions on Geoscience and Remote Sensing}, volume = {52}, year = {2014}, month = {06/2014}, pages = {3553 {\textendash} 3567}, abstract = {

This paper deals with the processing of polarimetric synthetic aperture radar (SAR) time series. Different approaches to deal with the temporal dimension of the data are considered, which are derived from different target characterizations in this dimension. These approaches are the basis for defining two different binary partition tree (BPT) structures that are employed for SAR polarimetry (PolSAR) data processing. Once constructed, the BPT is processed by a tree pruning, producing a set of spatiotemporal homogeneous regions, and estimating the polarimetric response within them. It is demonstrated that the proposed technique preserves the PolSAR information in the spatial and the temporal domains without introducing bias or distortion. Additionally, the evolution of the data in the temporal dimension is also analyzed, and techniques to obtain BPT-based scene change maps are defined. Finally, the proposed techniques are employed to process two real RADARSAT-2 data sets.

}, issn = {0196-2892}, doi = {10.1109/TGRS.2013.2273664}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @conference {cPalou14, title = {Precision-Recall-Classification Evaluation Framework: Application to Depth Estimation on Single Images}, booktitle = {European Conference on Computer Vision (ECCV)}, year = {2014}, month = {09/2014}, address = {Zurich}, abstract = {

Many computer vision applications involve algorithms that can be decomposed into two main steps. In a first step, events or objects are detected and, in a subsequent stage, detections are assigned to various classes. Examples of such {\textquoteleft}{\textquoteleft}detection plus classification{\textquoteright}{\textquoteright} problems can be found in human pose classification, object recognition or action classification, among others. In this paper, we focus on a special case: depth ordering on single images. In this problem, the detection step consists of the image segmentation, and the classification stage assigns a depth gradient to each contour or a depth order to each region.

In this paper, we discuss the limitations of the classical Precision-Recall evaluation framework for this kind of problem and define an extended framework called {\textquoteleft}{\textquoteleft}Precision-Recall-Classification{\textquoteright}{\textquoteright} (PRC). Then, we apply this framework to depth ordering problems and design two specific PRC measures to evaluate both the local and the global depth consistencies. We use these measures to precisely evaluate state-of-the-art depth ordering systems for monocular images. Based on this evaluation, we also propose an extension to the method of [Cal13], applying an optimal graph cut on a hierarchical segmentation structure such as ultrametric contour maps. The resulting system is shown to provide better results than state-of-the-art algorithms.

}, author = {Palou, G. and Salembier, P.} } @article {aBonet-Carne14, title = {Quantitative Ultrasound Texture Analysis of Fetal Lungs To Predict Neonatal Respiratory Morbidity}, journal = {Ultrasound in Obstetrics and Gynecology, Wiley}, volume = {44}, year = {2014}, author = {E. Bonet-Carne and M. Palacio and T. Cobo and A. Perez-Moreno and M. Lopez and J. P. Piraquive and J. C. Ramirez and F. Marques and E. Gratacos} } @article {Suau2014, title = {Real-time Fingertip Localization Conditioned on Hand Gesture Classification}, journal = {Image and Vision Computing}, volume = {32}, year = {2014}, month = {05/2014}, pages = {522 - 532}, abstract = {

A method to obtain accurate hand gesture classification and fingertip localization from depth images is proposed. The Oriented Radial Distribution feature is utilized, exploiting its ability to globally describe hand poses, but also to locally detect fingertip positions. Hence, hand gesture and fingertip locations are characterized with a single feature calculation. We propose to divide the difficult problem of locating fingertips into two more tractable problems, by taking advantage of hand gesture as an auxiliary variable. Along with the method we present the ColorTip dataset, a dataset for hand gesture recognition and fingertip classification using depth data. ColorTip contains sequences where actors wear a glove with colored fingertips, allowing automatic annotation. The proposed method is evaluated against recent works on several datasets, achieving promising results in both gesture classification and fingertip localization.

}, keywords = {dataset, fingertip classification, hand gesture recognition, interactivity, range camera}, issn = {0262-8856}, doi = {10.1016/j.imavis.2014.04.015}, url = {http://www.sciencedirect.com/science/article/pii/S0262885614000845}, author = {Suau, X. and Alcoverro, M. and L{\'o}pez-M{\'e}ndez, A. and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cVaras14, title = {Region-based Particle Filter for Video Object Segmentation}, booktitle = {CVPR - Computer Vision and Pattern Recognition}, year = {2014}, publisher = {IEEE}, organization = {IEEE}, address = {Ohio}, abstract = {

We present a video object segmentation approach that extends the particle filter to a region-based image representation. The image partition is considered part of the particle filter measurement, which enriches the available information and leads to a re-formulation of the particle filter. The prediction step uses a co-clustering between the previous image object partition and a partition of the current one, which allows us to tackle the evolution of non-rigid structures. Particles are defined as unions of regions in the current image partition and their propagation is computed through a single co-clustering. The proposed technique is assessed on the SegTrack dataset, leading to satisfactory perceptual results and obtaining very competitive pixel error rates compared with the state-of-the-art methods.

}, keywords = {co-clustering, particle filter, segmentation, tracking}, author = {David Varas and Marqu{\'e}s, F.} } @article {aSalembier14, title = {Remote sensing image processing with graph cut of Binary Partition Trees}, journal = {Advances in computing science, control and communications}, volume = {69}, year = {2014}, month = {04/2014}, pages = {185-196}, issn = {1870-4069}, author = {Salembier, P. and S. Foucher} } @conference {cGallego14, title = {Robust 3D SFS reconstruction based on reliability maps}, booktitle = {ICIP, IEEE International Conference on Image Processing}, year = {2014}, month = {10/2014}, abstract = {

This paper deals with Shape from Silhouette (SfS) volumetric reconstruction in the context of multi-view smart room scenarios. The method that we propose first computes a 2D foreground object segmentation in each one of the views, using region-based models for the foreground and shadow classes, and a pixel-wise model for the background class. Next, we calculate the reliability maps between the foreground and background/shadow classes in each view by computing the Hellinger distance among models. These 2D reliability maps are finally taken into account in the 3D SfS reconstruction algorithm to obtain an enhanced final volumetric reconstruction. The advantage of our system lies in the possibility of obtaining a volumetric representation which automatically defines the optimal tolerance to errors for each one of the voxels of the volume, with a low rate of false positive and false negative errors. The results obtained using our proposal improve on the traditional SfS reconstruction computed with a fixed tolerance for the overall volume.

}, author = {Gallego, J. and M. Pard{\`a}s} } @phdthesis {dAlcoverro14, title = {Stochastic optimization and interactive machine learning for human motion analysis}, year = {2014}, month = {12/2014}, school = {UPC}, address = {Barcelona}, abstract = {

The analysis of human motion from visual data is a central issue in the computer vision research community, as it enables a wide range of applications, and it still remains a challenging problem when dealing with unconstrained scenarios and general conditions. Human motion analysis is used in the entertainment industry for movie or videogame production, and in medical applications for rehabilitation or biomechanical studies. It is also used for human-computer interaction in any kind of environment, and for big data analysis from social networks such as Youtube or Flickr, to mention some of its use cases. In this thesis we have studied human motion analysis techniques with a focus on their application to smart room environments. That is, we have studied methods that support the analysis of people{\textquoteright}s behavior in the room, allowing interaction with computers in a natural manner and, in general, methods that introduce computers into human activity environments in order to enable new kinds of services, but in an unobtrusive manner. The thesis is structured in two parts, where we study the problem of 3D pose estimation from multiple views and the recognition of gestures using range sensors.

First, we propose a generic framework for hierarchically layered particle filtering (HPF), specially suited for motion capture tasks. Human motion capture problems generally involve tracking or optimization of high-dimensional state vectors, where one also has to deal with multi-modal pdfs. HPF overcomes this problem by means of multiple passes through substate space variables. Then, based on the HPF framework, we propose a method to estimate the anthropometry of the subject, which ultimately yields a human body model adjusted to the subject. Moreover, we introduce a new weighting function strategy for approximate partitioning of observations, and a method that employs body part detections to improve particle propagation and weight evaluation, both integrated within the HPF framework.

The second part of this thesis is centered on the detection of gestures, where we have focused on the problem of reducing the annotation and training efforts required to train a specific gesture. In order to reduce these efforts, we propose a solution based on online random forests that allows training in real time, while receiving new data in sequence. The main aspect that makes the solution effective is the proposed method for collecting hard negative examples while training the forests. The method uses the detector trained up to the current frame to test on that frame, and then collects samples based on the response of the detector, such that they will be more relevant for training. In this manner, training is more effective in terms of the number of annotated frames required.

}, url = {http://hdl.handle.net/10803/285337}, author = {Alcoverro, M.}, editor = {M. Pard{\`a}s and Casas, J.} } @conference {cManchon-Vizuetea, title = {UPC at MediaEval 2014 Social Event Detection Task}, booktitle = {MediaEval 2014 Workshop}, year = {2014}, month = {10/2014}, publisher = {CEUR Workshop Proceedings}, organization = {CEUR Workshop Proceedings}, address = {Barcelona}, abstract = {

This document presents the contribution of the UPC team to the Social Event Detection (SED) Subtask 1 at MediaEval 2014. This contribution extends the solution tested in the previous year with a better optimization of the parameters that determine the clustering algorithm, and by introducing an additional pass that considers merging all pairs of mini-clusters generated during the first two passes. Our proposal also addresses the problem of incomplete metadata by generating additional textual tags based on geolocation and natural language processing techniques.


}, url = {http://ceur-ws.org/Vol-1263/}, author = {Manchon-Vizuete, Daniel and Gris-Sarabia, Irene and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xAlmendros-Gutierrez14, title = {Visual instance mining of news videos using a graph-based approach}, year = {2014}, abstract = {

Advisors: Xavier Gir{\'o}-i-Nieto (UPC) and Horst Eidenberger (TU Wien)

Degree: Telecommunications Engineering (5 years) at Telecom BCN-ETSETB (UPC)

The aim of this thesis is to design a tool that performs visual instance search mining for news video summarization, that is, extracting the relevant content of the video in order to be able to recognize the storyline of the news.

Initially, a sampling of the video is required to obtain frames at a desired rate. Then, different relevant contents are detected in each frame, focusing on faces, text and several objects that the user can select. Next, we use a graph-based clustering method in order to recognize them with high accuracy and to select the most representative ones to show in the visual summary. Furthermore, a graphical user interface in Wt was developed to create an online demo to test the application.

During the development of the application we have been testing the tool with the CCMA dataset. We prepared a web-based survey based on four results from this dataset to gather the opinion of the users. We also validated our visual instance mining results by comparing them with the results obtained by applying an algorithm developed at Columbia University for video summarization. We ran the algorithm on a dataset of a few videos on two events: the {\textquoteleft}Boston bombings{\textquoteright} and the {\textquoteleft}search for the Malaysian Airlines flight{\textquoteright}. We carried out another web-based survey in which users could compare our approach with this related work. With these surveys we analyze whether our tool fulfills the requirements we set up.

We can conclude that our system extracts visual instances that show the most relevant content of news videos and can be used to summarize these videos effectively.

Final grade: B (7/10)

}, url = {http://hdl.handle.net/2099.1/22362}, author = {Almendros-Guti{\'e}rrez, David}, editor = {Xavier Gir{\'o}-i-Nieto and Eidenberger, Horst} } @mastersthesis {xRamon13, title = {Algorithms for B wave detection}, year = {2013}, abstract = {

The objective of this Master Thesis was to develop algorithms for B wave detection in ICP. This goal was approached by two different methods that depend essentially on the resolution of the acquired ICP. Both methods were then adapted to work in an ultra-low-power microcontroller. The first method works on ICP recorded at 1 Hz and is based on Lundberg{\textquoteright}s definition of a B wave. An advantage of this algorithm is that it minimizes the number of samples per block to classify. The results obtained after testing it on long ICP records from 27 patients were an accuracy of 89.59\%, a specificity of 89.71\% and a sensitivity of 89.16\%. These results did not change when the code was adapted to the microcontroller. The second method requires ICP obtained with a sampling rate of 100 Hz. It is based on the morphology of the pulse waves present in the ICP, caused by the change of blood volume inside the skull with every heartbeat. A total of 1430 blocks of ICP (864 without B waves and 566 with B waves), each lasting 41 seconds, were used to extract 21 features from each block. Then an MLP classifier and an SVM classifier were tested and compared. The best results were obtained by the SVM classifier, reaching an accuracy of 86.37\%, a specificity of 88.09\% and a sensitivity of 83.74\% when all features were used. After adapting the algorithm to the microcontroller the results were nearly the same.

}, url = {http://hdl.handle.net/2099.1/19034}, author = {Ramon, Eduard} } @conference {cVentura13, title = {Automatic Keyframe Selection based on Mutual Reinforcement Algorithm}, booktitle = {CBMI (Content-Based Multimedia Indexing)}, year = {2013}, month = {09/2013}, address = {Veszprem}, abstract = {

This paper addresses the problem of video summarization through the automatic selection of a single representative keyframe. The proposed solution is based on the mutual reinforcement paradigm, where a keyframe is selected thanks to its highest and most frequent similarity to the rest of the considered frames. Two variations of the algorithm are explored: a first one where only frames within the same video are used (intraclip mode), and a second one where the decision also depends on the previously selected keyframes of related videos (interclip mode). These two algorithms were evaluated by a set of professional documentalists from a broadcaster{\textquoteright}s archive, and results showed that the proposed techniques outperform the semi-manual solution adopted so far in the company.
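As a rough illustration of the intraclip mode, the mutual reinforcement idea can be sketched as a power iteration over a frame-similarity matrix; the convergence settings below are assumptions, not the parameters of the paper:

import numpy as np

def select_keyframe(similarity, iters=50, tol=1e-8):
    # similarity: NxN non-negative matrix of pairwise frame similarities
    n = similarity.shape[0]
    scores = np.full(n, 1.0 / n)
    for _ in range(iters):
        new = similarity @ scores      # frames similar to high-scoring frames gain score
        new /= np.linalg.norm(new)     # normalize to keep the iteration stable
        if np.abs(new - scores).max() < tol:
            scores = new
            break
        scores = new
    return int(np.argmax(scores))      # the most reinforced frame becomes the keyframe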

}, keywords = {mutual reinforcement algorithm, video summarization}, isbn = {978-1-4799-0955-1}, doi = {10.1109/CBMI.2013.6576548}, url = {http://dx.doi.org/10.1109/CBMI.2013.6576548}, author = {Ventura, C. and Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Daniel Giribet and Eusebio Carasusan} } @conference {cPerez-Pellitero13, title = {Bayesian region selection for adaptive dictionary-based Super-Resolution}, booktitle = {British Machine Vision Conference}, year = {2013}, month = {09/2013}, abstract = {

The performance of dictionary-based super-resolution (SR) strongly depends on the contents of the training dataset. Nevertheless, many dictionary-based SR methods randomly select patches from a larger set of training images to build their dictionaries, thus relying on patches being diverse enough. This paper describes an external-dictionary SR algorithm based on adaptively selecting an optimal subset of patches out of the training images. Each training image is divided into sub-image entities, named regions, of such size that texture consistency is preserved. For each input patch to super-resolve, the best-fitting region (with enough high-frequency energy) is found through a Bayesian selection. In order to handle the high number of regions in the training dataset, a local Naive Bayes Nearest Neighbor (NBNN) approach is used. Trained with this adapted subset of patches, sparse coding SR is applied to recover the high-resolution image. Experimental results demonstrate that using our adaptive algorithm produces an improvement in SR performance with respect to non-adaptive training.

}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @article {aAlonso-Gonzalez13a, title = {Bilateral Distance Based Filtering for Polarimetric SAR Data}, journal = {Remote Sensing}, volume = {5}, year = {2013}, month = {10/2013}, pages = {5620-5641}, abstract = {

This paper introduces a non-linear polarimetric SAR data filtering approach able to preserve the edges and small details of the data. It is based on exploiting the data locality in both the spatial and the polarimetric domains, in order to avoid mixing heterogeneous samples of the data. A weighted average is performed over a given window, favoring pixel values that are close in both domains. The filtering technique is based on a modified bilateral filter, which is defined in terms of spatial and polarimetric distances. These distances encapsulate all the knowledge in both domains, allowing an adaptation to the data structure. Finally, the proposed technique is employed to process a real RADARSAT-2 dataset.
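As a rough sketch of this weighting scheme, and not of the distances actually defined in the paper, the following code applies a bilateral-style weighted average to per-pixel covariance matrices, with a stand-in Frobenius distance playing the role of the polarimetric distance; all parameter values are illustrative:

import numpy as np

def bilateral_polsar(cov, half=3, sigma_s=2.0, sigma_p=1.0):
    # cov: HxWx3x3 per-pixel polarimetric covariance matrices (real or complex)
    h, w = cov.shape[:2]
    out = np.empty_like(cov)
    for y in range(h):
        for x in range(w):
            y0, y1 = max(0, y - half), min(h, y + half + 1)
            x0, x1 = max(0, x - half), min(w, x + half + 1)
            yy, xx = np.mgrid[y0:y1, x0:x1]
            d_s2 = (yy - y) ** 2 + (xx - x) ** 2                      # spatial distance
            d_p2 = (np.abs(cov[y0:y1, x0:x1] - cov[y, x]) ** 2).sum((-2, -1))
            wgt = np.exp(-d_s2 / (2 * sigma_s ** 2) - d_p2 / (2 * sigma_p ** 2))
            out[y, x] = (wgt[..., None, None] * cov[y0:y1, x0:x1]).sum((0, 1)) / wgt.sum()
    return out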

}, issn = {2072-4292}, doi = {10.3390/rs5115620}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P. and X. Deng} } @mastersthesis {xMartos13, title = {Content-based Video Summarisation to Object Maps}, year = {2013}, abstract = {

Advisors: Xavier Gir{\'o}-i-Nieto and Horst Eidenberger

School: Vienna University of Technology (Austria)

The amount of digital video content available on the web is constantly increasing. Its handling requires efficient technologies: text search on large databases provides users with a great number of videos, whose content is only accessible through a textual description. Users need a fast and visual way to access relevant video content effectively. Quick visualisation of content using static image summarisation is a challenging problem, but a worthwhile one, because it may solve video navigation problems: users can very rapidly get an idea of the video with no need to browse through it with a sliding bar, as is normally done.

In this work a system for automatic video summarisation is developed. It creates an object map whose segments are extracted from an input video. It enhances video browsing and the management of large video databases by generating a visual index, so that the user can rapidly grasp the most relevant content and access it with a simple action. This requires several technologies that together define a complex information processing chain.

Firstly, shot boundary detection algorithms are required to reduce the temporal redundancy of the video. Secondly, different relevant objects are extracted from each keyframe (faces, cars, etc.). We also describe a workflow to train detection models using multiple open source solutions. Furthermore, faces are a particular and very relevant semantic class, and for this reason we use clustering methods in order to recognise them in an unsupervised recognition process. The image composition of all selected objects and faces is the final stage of the architecture. Composition is defined as the combination of distinct parts to form a whole; therefore, objects have to be rendered in the map in a visually attractive manner.

To validate our approach and assess end-user satisfaction, we conducted a user study in which we compare against requirements collected by analysing the related literature. We analyse redundancy and informativeness, as well as pleasantness.

The results show that our approach effectively creates an image representation for videos and is able to summarise customisable content in an attractive way.



}, url = {http://hdl.handle.net/2099.1/19359}, author = {Martos, Manel}, editor = {Xavier Gir{\'o}-i-Nieto and Eidenberger, Horst} } @mastersthesis {xSalvador13, title = {Crowdsourced Object Segmentation with a Game}, year = {2013}, abstract = {

Co-advised with Axel Carlier (INP Toulouse), Vincent Charvillat (INP Toulouse) and Oge Marques (Florida Atlantic University).


}, author = {Amaia Salvador}, editor = {Xavier Gir{\'o}-i-Nieto and Carlier, Axel and Charvillat, Vincent and Marques, Oge} } @conference {cSalvador13 , title = {Crowdsourced Object Segmentation with a Game}, booktitle = {ACM Workshop on Crowdsourcing for Multimedia (CrowdMM)}, year = {2013}, month = {10/2013}, address = {Barcelona}, abstract = {

We introduce a new algorithm for image segmentation based on crowdsourcing through a game: Ask{\textquoteright}nSeek. The game provides information on the objects of an image in the form of clicks that are either on the object or on the background. These logs are then used to determine the best segmentation for an object among a set of candidates generated by the state-of-the-art CPMC algorithm. We also introduce a simulator that allows the generation of game logs and therefore gives insight into the number of games needed per image to achieve an acceptable segmentation.
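The selection step can be sketched as a simple vote, where each on-object click rewards candidate masks containing it and each background click rewards masks excluding it; the scoring rule below is an illustrative assumption, not the exact criterion of the paper:

import numpy as np

def best_candidate(masks, object_clicks, background_clicks):
    # masks: list of HxW boolean candidate masks; clicks: lists of (row, col)
    def score(mask):
        on = sum(mask[r, c] for r, c in object_clicks)           # object clicks inside
        off = sum(not mask[r, c] for r, c in background_clicks)  # background clicks outside
        return on + off
    return max(masks, key=score)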

}, isbn = {978-1-4503-2396-3}, doi = {http://dx.doi.org/10.1145/2506364.2506367}, url = {http://dx.doi.org/10.1145/2506364.2506367}, author = {Amaia Salvador and Carlier, Axel and Xavier Gir{\'o}-i-Nieto and Marques, Oge and Charvillat, Vincent} } @article {aSuau13, title = {Detecting End-Effectors on 2.5D data using Geometric Deformable Models: Application to Human Pose Estimation}, journal = {Computer Vision and Image Understanding (CVIU)}, volume = {117}, year = {2013}, chapter = {281{\textendash}288}, abstract = {

End-effectors are usually related to the location of limbs, and their reliable detection enables robust body tracking as well as accurate pose estimation. Recent innovation in depth cameras has restated the pose estimation problem. We focus on the information provided by these sensors, for which we borrow the name 2.5D data from the graphics community. In this paper we propose a human pose estimation algorithm based on topological propagation. Geometric Deformable Models are used to carry out such propagation, implemented according to the Narrow Band Level Set approach. A variant of the latter method is proposed, including a density restriction which helps preserve the topological properties of the object under analysis. Principal end-effectors are extracted from a directed graph weighted with geodesic distances, also providing a skeleton-like structure describing the human pose. An evaluation against reference methods is performed with promising results. The proposed solution allows a frame-wise end-effector detection, with no temporal tracking involved, which may be generalized to the tracking of other objects beyond the human body.


}, doi = {10.1016/j.cviu.2012.11.006}, url = {http://www.sciencedirect.com/science/article/pii/S1077314212001907}, author = {Suau, X. and Ruiz-Hidalgo, J. and Casas, J.} } @mastersthesis {xAntoja-Sabin13, title = {El tel{\`e}fon m{\`o}bil com a eina d{\textquoteright}aprenentatge informal}, year = {2013}, abstract = {

Advisor: Xavier Gir{\'o}-i-Nieto

Smartphones are ever more present in our society, and compulsory secondary education is no exception. Quite the contrary: at the behavioural level, they are currently one of the most controversial issues involving adolescent students. At the same time, they are a very powerful technology for accessing information and interacting with others, anytime and anywhere.

The proposal is for students to capture with their mobile phone some element related to what has been covered in class and close to their everyday life. The aim is to link theory with practice, and classroom concepts with daily experience.

The teacher assigns homework related to the class contents, and the student uses the smartphone as the vehicle to carry out the task.

To keep a record of the work done, the electronic portfolio methodology is used. This second tool records everything the student does and supports both assessment and peer assessment.

The report explains the pedagogical foundations of the proposal, how it fits within the regulatory context of compulsory secondary education, practical examples of exercises applying the methodology, how it can be implemented technically, and the results of an experiment conducted with a group of students using the proposed methodology.

}, keywords = {education, internet, portfolio, secondary education, meaningful learning, blog, smartphone, technology, wordpress}, author = {Antoja-Sabin, Joan}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cBosio13a, title = {Ensemble learning and hierarchical data representation for microarray classification}, booktitle = {13th IEEE International Conference on BioInformatics and BioEngineering BIBE}, year = {2013}, month = {11/2013}, publisher = {13th IEEE International Conference on BioInformatics and BioEngineering,}, organization = {13th IEEE International Conference on BioInformatics and BioEngineering,}, address = {Chania, Crete}, abstract = {

Microarray data classification is an open and active research field. The development of more accurate algorithms is of great interest, also because many of the techniques can be applied straightforwardly to the analysis of other kinds of omics data. In this work, an ensemble feature selection algorithm is applied within the binary classification framework from [1], which has already achieved good predictive results. Ensemble feature selection is a rich field of research: ensemble techniques combine individual experts (i.e., classifiers) with a voting scheme in order to improve on the individual results.

In this case, a thinning algorithm is proposed which starts from all the available experts and removes them one by one, focusing on improving the ensemble vote. Two versions of the ensemble thinning algorithm have been tested, and three key elements have been introduced to work with microarray data: the ensemble cohort definition; the non-expert notion, which defines a set of experts excluded from the thinning process; and a rule to break ties in the thinning process. Experiments have been carried out on seven public datasets from the Microarray Quality Control study, MAQC [2]. The studied ensemble technique improves on the state of the art by producing classifiers with significantly better results, and the proposed key elements have proven useful for the prediction performance.

}, keywords = {Microarray classification; metagenes; hierarchical representation; Treelets; feature selection; LDA; ensemble.}, author = {Bosio, M. and Salembier, P. and Albert Oliveras and P. Bellot} } @mastersthesis {xGarcia-delMolino13, title = {Extension of Instance Search Technique by Geometric Coding and Quantization Error Compensation}, year = {2013}, abstract = {

This MSc thesis analyzes two ways of improving video retrieval techniques for the instance search problem. On the one hand, "Pairing Interest Points for a better Signature using Sparse Detector{\textquoteright}s Spatial Information" allows the Bag-of-Words model to keep some spatial information. On the other, "Study of the Hamming Embedding Signature Symmetry in Video Retrieval" provides binary signatures that refine the matching based on visual words, and aims to find the best way of matching, taking into account the existing asymmetries between the image query and the videos.

}, url = {http://hdl.handle.net/2099.1/19159}, author = {Garcia-delMolino, Ana}, editor = {Satoh, Shin{\textquoteright}ichi and Ventura, C. and Xavier Gir{\'o}-i-Nieto} } @conference {cMaceira13, title = {Fusion of colour and depth partitions for depth map coding}, booktitle = {Digital Signal Processing}, year = {2013}, month = {07/2013}, address = {Santorini, Greece}, abstract = {

3D video coding includes the use of multiple color views and depth maps associated with each view. An adequate coding of depth maps should be adapted to their characteristics: smooth regions and sharp edges. In this paper, a segmentation-based technique that exploits the color-depth similarity of 3D video is proposed to improve depth map compression while preserving the main discontinuities. An initial coarse depth map segmentation is used to locate the main discontinuities in depth. The resulting partition is improved by fusing a color partition. We assume that the color image is encoded first and is available when the associated depth map is encoded; therefore the color partition can be segmented in the decoder without introducing any extra cost. A new segmentation criterion inspired by super-pixel techniques is proposed to obtain the color partition. Initial experimental results show compression efficiency similar to HEVC, with great potential for further improvements.

}, keywords = {3DTV, Depth map coding}, doi = {10.1109/ICDSP.2013.6622781}, author = {Maceira, M. and Morros, J.R. and Ruiz-Hidalgo, J.} } @article {aAlcoverro13, title = {Gesture Control Interface for immersive panoramic displays}, journal = {Multimedia Tools and Applications}, year = {2013}, month = {07/2013}, pages = {1-27}, abstract = {

In this paper, we propose a gesture-based interface designed to interact with panoramic scenes. The system combines novel static gestures with a fast hand tracking method. Our proposal is to use static gestures as shortcuts to activate functionalities of the system (i.e. volume up/down, mute, pause, etc.), and hand tracking to freely explore the panoramic video. The overall system is multi-user and incorporates a user identification module based on face recognition, which is able both to recognize returning users and to add new users online. The system exploits depth data, making it robust to challenging illumination conditions. We show through experimental results the performance of every component of the system compared to the state of the art. We also show the results of a usability study performed with several untrained users.

}, issn = {1380-7501}, doi = {10.1007/s11042-013-1605-7}, author = {Alcoverro, M. and Suau, X. and Morros, J.R. and L{\'o}pez-M{\'e}ndez, A. and A. Gil-Moreno and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cKochale13, title = {Gesture controlled interactive rendering in a panoramic scene}, booktitle = {European Interactive TV Conference, EuroITV}, year = {2013}, month = {06/2013}, address = {Como, Italy}, url = {http://upcommons.upc.edu/e-prints/handle/2117/20470}, author = {Kochale, A. and Ruiz-Hidalgo, J. and M. Borsum} } @conference {cZoric13, title = {Gesture Interaction with Rich TV Content in the Social Setting}, booktitle = {Exploring and Enhancing the User Experience for Television, Workshop of ACM SIGCHI Conference on Human Factors in Computing Systems, CHI{\textquoteright}13}, year = {2013}, month = {04/2013}, address = {Paris, France}, abstract = {

The appearance of new immersive TV content has increased the interactive possibilities presented to viewers. Increased interactivity is seen as a valuable feature for viewing richer television content, but new functionalities are limited by what can be done naturally and intuitively using available devices like remote controls. Therefore, new interaction techniques, such as visual gesture control systems, have appeared, aiming to enhance the viewers{\textquoteright} viewing experience. In this work we begin uncovering the potential and challenges of gesture interaction with ultra high definition video for people watching TV together. As a first step we have conducted a study with a group of people interacting with such content using a gesture-based system in the home environment.

}, url = {http://livingroomexperience.wikispaces.com/}, author = {Zoric, Goranka and Engstr{\"o}m, Arvid and Barkhuus, Louise and Ruiz-Hidalgo, J. and Kochale, A.} } @conference {cBosio13, title = {Hierarchical Clustering Combining Numerical and Biological Similarities for Gene Expression Data Classification}, booktitle = {35th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC{\textquoteright}13) }, year = {2013}, month = {07/2013}, publisher = {IEEE EMBS}, organization = {IEEE EMBS}, address = {Osaka, Japan}, keywords = {Biological interpretation, Hierarchical representation, microarray classification}, author = {Bosio, M. and Salembier, P. and P. Bellot and Albert Oliveras} } @conference {cPalou13, title = {Hierarchical Video Representation with Trajectory Binary Partition Tree}, booktitle = {Computer Vision and Pattern Recognition (CVPR)}, year = {2013}, month = {06/2013}, address = {Portland, Oregon}, abstract = {

As an early stage of video processing, we introduce an iterative trajectory merging algorithm that produces a region-based and hierarchical representation of the video sequence, called the Trajectory Binary Partition Tree (BPT). From this representation, many analysis and graph cut techniques can be used to extract partitions or objects that are useful in the context of specific applications.

In order to define trajectories and to create a precise merging algorithm, color and motion cues have to be used. Both types of information are very useful to characterize objects but present strong differences of behavior in the spatial and the temporal dimensions. On the one hand, scenes and objects are rich in their spatial color distributions, but these distributions are rather stable over time. Object motion, on the other hand, presents simple structures and low spatial variability but may change from frame to frame. The proposed algorithm takes into account this key difference and relies on different models and associated metrics to deal with color and motion information. We show that the proposed algorithm outperforms existing hierarchical video segmentation algorithms and provides more stable and precise regions.

}, author = {Palou, G. and Salembier, P.} } @phdthesis {dSuau13, title = {Human body analysis using depth data}, year = {2013}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

Human body analysis is one of the broadest areas within the computer vision field. Researchers have put a strong effort into the human body analysis area, especially over the last decade, due to the technological improvements in both video cameras and processing power. Human body analysis covers topics such as person detection and segmentation, human motion tracking, or action and behavior recognition. Even if human beings perform all these tasks naturally, they constitute a challenging problem from a computer vision point of view. Adverse situations such as viewing perspective, clutter and occlusions, lighting conditions or variability of behavior among persons may turn human body analysis into an arduous task.

In the computer vision field, the evolution of research works is usually tightly related to the technological progress of camera sensors and computer processing power. Traditional human body analysis methods are based on color cameras. Thus, the information is extracted from the raw color data, strongly limiting the proposals. An interesting quality leap was achieved by introducing the \emph{multiview} concept. That is to say, having multiple color cameras recording a single scene at the same time. With multiview approaches, 3D information is available by means of stereo matching algorithms. The fact of having 3D information is a key aspect in human motion analysis, since the human body moves in a three-dimensional space. Thus, problems such as occlusion and clutter may be overcome with 3D information.

The appearance of commercial depth cameras has brought a second leap in the human body analysis field. While traditional multiview approaches required a cumbersome and expensive setup, as well as a fine camera calibration, novel depth cameras directly provide 3D information with a single camera sensor. Furthermore, depth cameras may be rapidly installed in a wide range of situations, enlarging the range of applications with respect to multiview approaches. Moreover, since depth cameras are based on infra-red light, they do not suffer from illumination variations.

In this thesis, we focus on the study of depth data applied to the human body analysis problem. We propose novel ways of describing depth data through specific descriptors, so that they emphasize helpful characteristics of the scene for further body analysis. These descriptors exploit the special 3D structure of depth data to outperform generalist 3D descriptors or color-based ones. We also study the problem of person detection, proposing a highly robust and fast method to detect heads. This method is extended to a hand tracker, which is used throughout the thesis as a helpful tool to enable further research. In the remainder of this dissertation, we focus on the hand analysis problem as a subarea of human body analysis. Given the recent appearance of depth cameras, there is a lack of public datasets; we contribute a dataset for hand gesture recognition and fingertip localization using depth data. This dataset acts as a starting point for two proposals for hand gesture recognition and fingertip localization based on classification techniques. In these methods, we also exploit the above-mentioned descriptor proposals to finely adapt to the nature of depth data.

}, url = {http://hdl.handle.net/10803/134801}, author = {Suau, X.}, editor = {Casas, J. and Ruiz-Hidalgo, J.} } @article {aValero13, title = {Hyperspectral image representation and processing with Binary Partition Trees}, journal = {IEEE Transactions on Image Processing}, volume = {22}, year = {2013}, month = {April 2013}, pages = {1430 - 1443}, abstract = {

The optimal exploitation of the information provided by hyperspectral images requires the development of advanced image-processing tools. This paper proposes the construction and the processing of a new region-based hierarchical hyperspectral image representation relying on the binary partition tree (BPT). This hierarchical region-based representation can be interpreted as a set of hierarchical regions stored in a tree structure. Hence, the BPT succeeds in presenting: 1) the decomposition of the image in terms of coherent regions, and 2) the inclusion relations of the regions in the scene. Based on region-merging techniques, the BPT construction is investigated by studying the hyperspectral region models and the associated similarity metrics. Once the BPT is constructed, the fixed tree structure allows implementing efficient and advanced application-dependent techniques on it. The application-dependent processing of the BPT is generally implemented through a specific pruning of the tree. In this paper, a pruning strategy is proposed and discussed in a classification context. Experimental results on various hyperspectral data sets demonstrate the interest and the good performance of the BPT representation.

}, doi = {10.1109/TIP.2012.2231687}, author = {Valero, S. and Salembier, P. and Chanussot, J.} } @article {aVentura13, title = {Improving retrieval accuracy of Hierarchical Cellular Trees for generic metric spaces}, journal = {Multimedia Tools and Applications}, year = {2013}, abstract = {

Metric Access Methods (MAMs) are indexing techniques which allow working in generic metric spaces. Therefore, MAMs are especially useful for Content-Based Image Retrieval systems based on features which use non-Lp norms as similarity measures. MAMs naturally allow the design of image browsers due to their inherent hierarchical structure. The Hierarchical Cellular Tree (HCT), a MAM-based indexing technique, provides the starting point of our work. In this paper, we describe some limitations detected in the original formulation of the HCT and propose some modifications to both the index building and the search algorithm. First, the covering radius, which is defined as the distance from the representative to the furthest element in a node, may not cover all the elements belonging to the node{\textquoteright}s subtree. Therefore, we propose to redefine the covering radius as the distance from the representative to the furthest element in the node{\textquoteright}s subtree. This new definition is essential to guarantee a correct construction of the HCT. Second, the proposed Progressive Query retrieval scheme can be redesigned to perform the nearest neighbor operation in a more efficient way. We propose a new retrieval scheme which takes advantage of the benefits of the search algorithm used in the index building. Furthermore, while the evaluation of the HCT in the original work was only subjective, we propose an objective evaluation based on two aspects which are crucial in any approximate search algorithm: the retrieval time and the retrieval accuracy. Finally, we illustrate the usefulness of the proposal by presenting some actual applications.
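The corrected covering radius can be sketched as a traversal of the whole subtree rather than of the direct children only; the Node structure and the metric are illustrative assumptions, not the actual HCT data structures:

from dataclasses import dataclass, field

@dataclass
class Node:
    representative: object
    children: list = field(default_factory=list)  # empty for leaf cells

def covering_radius(node, dist):
    # distance from the representative to the furthest element in the subtree
    radius = 0.0
    stack = list(node.children)
    while stack:
        child = stack.pop()
        radius = max(radius, dist(node.representative, child.representative))
        stack.extend(child.children)
    return radius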

}, doi = {10.1007/s11042-013-1686-3}, author = {Ventura, C. and Ver{\'o}nica Vilaplana and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @mastersthesis {xMohedano13, title = {Investigating EEG for Saliency and Segmentation Applications in Image Processing}, year = {2013}, abstract = {

Advisors: Kevin McGuinness, Xavier Gir{\'o}-i-Nieto, Noel O{\textquoteright}Connor

School: Dublin City University (Ireland)

The main objective of this project is to implement a new way to compute saliency maps and to locate objects in an image by using a brain-computer interface. To achieve this, the project is centered on designing the proper way to display the different parts of the images to the users, in such a way that they generate measurable reactions. Once an image window is shown, the objective is to compute a score based on the EEG activity and compare the result with current automatic methods for estimating saliency maps. The aim of this work is also to use the EEG map as a seed for another segmentation algorithm that extracts the object from the background of an image. This study provides evidence that BCIs are useful for finding the location of objects in simple images via straightforward EEG analysis, and it represents a starting point for locating objects in more complex images.

}, keywords = {Brain-computer interfaces (BCI), Eelectroencephalography (EEG), rapid serial visual presentation (RSVP), saliency map, segmentation}, author = {Mohedano, Eva}, editor = {Kevin McGuinness and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @conference {cPont-Tuset13, title = {Measures and Meta-Measures for the Supervised Evaluation of Image Segmentation}, booktitle = {Computer Vision and Pattern Recognition (CVPR)}, year = {2013}, month = {06/2013}, abstract = {

This paper tackles the supervised evaluation of image segmentation algorithms. First, it surveys and structures the measures used to compare the segmentation results with a ground truth database; and proposes a new measure: the precision-recall for objects and parts. To compare the goodness of these measures, it defines three quantitative meta-measures involving six state of the art segmentation methods. The meta-measures consist in assuming some plausible hypotheses about the results and assessing how well each measure reflects these hypotheses. As a conclusion, this paper proposes the precision-recall curves for boundaries and for objects-and-parts as the tool of choice for the supervised evaluation of image segmentation. We make the datasets and code of all the measures publicly available.

[Figure: Examples of the meta-measure principles. How good are the evaluation measures at distinguishing these pairs of partitions?]

}, author = {Jordi Pont-Tuset and Marqu{\'e}s, F.} } @article {aPalou13, title = {Monocular Depth Ordering Using T-junctions and Convexity Occlusion Cues}, journal = {IEEE Transactions on Image Processing}, volume = {22}, year = {2013}, month = {2013}, pages = { 1926 - 1939 }, abstract = {

This paper proposes a system that relates objects in an image using occlusion cues and arranges them according to depth. The system does not rely on a priori knowledge of the scene structure and focuses on detecting special points, such as T-junctions and highly convex contours, to infer the depth relationships between objects in the scene. The system makes extensive use of the binary partition tree as a hierarchical region-based image representation, jointly with a new approach for candidate T-junction estimation. Since some regions may not involve T-junctions, occlusion is also detected by examining convex shapes on region boundaries. Combining T-junctions and convexity leads to a system which relies only on low-level depth cues and does not use semantic information, yet it shows similar or better performance compared with the state of the art while not assuming any type of scene.

}, doi = {10.1109/TIP.2013.2240002}, author = {Palou, G. and Salembier, P.} } @article {aSalvador13, title = {Multi-View Video Representation Based on Fast Monte Carlo Surface Reconstruction}, journal = {IEEE Transactions on Image Processing}, volume = {22}, year = {2013}, month = {09/2013}, pages = {3342 - 3352}, abstract = {

This article provides an alternative solution for the costly representation of multi-view video data, which can be used for both rendering and scene analysis. First, a new, efficient Monte Carlo discrete surface reconstruction method for foreground objects with static background is presented, which outperforms volumetric techniques and is suitable for GPU environments. Some extensions are also presented, which allow speeding up the reconstruction by exploiting multi-resolution and temporal correlation. Then, a fast meshing algorithm is applied, which allows interpolating a continuous surface from the discrete reconstructed points. As shown by the experimental results, the original video frames can be approximated with high accuracy by projecting the reconstructed foreground objects onto the original viewpoints. Furthermore, the reconstructed scene can be easily projected onto any desired virtual viewpoint, simplifying thus the design of Free-Viewpoint Video applications. In our experimental results, we show that our techniques for reconstruction and meshing compare favorably to the state-of-the-art, and we also introduce a rule-of-thumb for effective application of the method with a good quality vs. representation cost trade-off.

Note: ranked 53 among the 100 most downloaded IEEE Xplore articles in July 2013 (Gormish, M., "Top Downloads in IEEE Xplore [Reader{\textquoteright}s Choice]," IEEE Signal Processing Magazine, vol.31, no.1, pp.8,9, Jan. 2014. doi: 10.1109/MSP.2013.2282791)

}, doi = {10.1109/TIP.2013.2264818}, author = {Salvador, J. and Casas, J.} } @conference {cValero13, title = {Object recognition in urban hyperspectral images using binary partition tree representation}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium, IGARSS{\textquoteright}2013}, year = {2013}, publisher = {IEEE}, organization = {IEEE}, address = {Melbourne, Australia}, author = {Valero, S. and Salembier, P. and Chanussot, J.} } @phdthesis {dGallego13, title = {Parametric Region-Based Foreground Segmentation in Planar and Multi-View Sequences}, year = {2013}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

Foreground segmentation in video sequences is an important area of image processing that attracts great interest among the scientific community, since it makes possible the detection of the objects that appear in the sequences under analysis and enables the correct performance of high-level applications which use foreground segmentation as an initial step. This Ph.D. thesis, entitled Parametric Region-Based Foreground Segmentation in Planar and Multi-View Sequences, details the research work carried out within this field. In this investigation, we propose to use parametric probabilistic models at the pixel-wise and region levels in order to model the different classes involved in the classification of the different regions of the image: foreground, background and, in some sequences, shadow. The development is presented in the following chapters as a generalization of the techniques proposed for object segmentation in 2D planar sequences to 3D multi-view environments, where we establish a cooperative relationship between all the sensors that are recording the scene. Hence, different scenarios have been analyzed in this thesis in order to improve the foreground segmentation techniques.

In the first part of this research, we present segmentation methods appropriate for 2D planar scenarios. We start with foreground segmentation in static camera sequences, where a system that combines a pixel-wise background model with region-based foreground and shadow models is proposed in a Bayesian classification framework. The research continues with the application of this method to moving camera scenarios, where the Bayesian framework is developed between the foreground and background classes, both characterized with region-based models, in order to obtain a robust foreground segmentation for this kind of sequence.

The second stage of the research is devoted to applying these 2D techniques to multi-view acquisition setups, where several cameras record the scene at the same time. At the beginning of this part, we propose a foreground segmentation system for sequences recorded by means of color and depth sensors, which combines different probabilistic models created for the background and foreground classes in each one of the views, taking into account the reliability that each sensor presents. The investigation continues by proposing foreground segmentation methods for multi-view smart room scenarios, where we design two systems in which foreground segmentation and 3D reconstruction are combined in order to improve the results of each process. The proposals end with the presentation of a multi-view segmentation system where a foreground probabilistic model is defined in the 3D space to gather all the object information that appears in the views.

The results presented in each one of the proposals show that both the foreground segmentation and the 3D reconstruction can be improved in these scenarios by using parametric probabilistic models for modeling the objects to segment, thus introducing the information of the object into a Bayesian classification framework.

}, url = {http://hdl.handle.net/10803/130813}, author = {Gallego, J.}, editor = {M. Pard{\`a}s} } @conference {cAlonso-Gonzalez13, title = {PolSAR time series processing and analysis based on Binary Partition Trees}, booktitle = {PoLinSAR 2013 Workshop}, year = {2013}, address = {Frascati (Rome), Italy}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @article {aAlonso-Gonzalez13, title = {Processing Multidimensional SAR and Hyperspectral Images With Binary Partition Tree}, journal = {Proceedings of the IEEE}, volume = {101}, year = {2013}, month = {March, 2013}, pages = {723 - 747}, chapter = {723 - 747}, abstract = {

The current increase in both the spatial and spectral resolution of modern remote sensing sensors represents a real opportunity for many practical applications, but it also generates important challenges in terms of image processing. In particular, the spatial correlation between pixels and/or the spectral correlation between the spectral bands of a given pixel cannot be ignored. The traditional pixel-based representation of images does not facilitate the handling of these correlations.

In this paper, we discuss the interest of a particular hierarchical region-based representation of images based on the binary partition tree (BPT). This representation approach is very flexible, as it can be applied to any type of image; here, both optical and radar images are discussed. Moreover, once the image representation has been computed, it can be used for many different applications: filtering, segmentation, and classification are detailed in this paper. In all cases, the advantage of the BPT representation over the classical pixel-based representation is highlighted.

}, doi = {10.1109/JPROC.2012.2205209}, author = {Alonso-Gonz{\'a}lez, A. and Valero, S. and Chanussot, J. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @article {aMolina11 , title = {Real-time user independent hand gesture recognition from time-of-flight camera video using static and dynamic models}, journal = {Machine vision and applications}, volume = {24}, year = {2013}, month = {08/2011}, pages = {187{\textendash}204}, chapter = {187}, abstract = {

The use of hand gestures offers an alternative to commonly used human-computer interfaces, providing a more intuitive way of navigating among menus and multimedia applications. This paper presents a hand gesture recognition system devoted to controlling windows applications. Starting from the images captured by a time-of-flight camera (a camera that produces images with an intensity level inversely proportional to the depth of the objects observed), the system performs hand segmentation as well as a low-level extraction of potentially relevant features related to the morphological representation of the hand silhouette. Classification based on these features discriminates between a set of possible static hand postures; combined with the estimated motion pattern of the hand, this results in the recognition of dynamic hand gestures. The whole system works in real time, allowing practical interaction between user and application.

}, issn = {0932-8092}, doi = {10.1007/s00138-011-0364-6}, url = {http://www.springerlink.com/content/062m51v58073572h/fulltext.pdf}, author = {Molina, J. and Escudero-Vi{\~n}olo, M. and Signorelo, A. and M. Pard{\`a}s and Ferran, C. and Bescos, J. and Marqu{\'e}s, F. and Mart{\'\i}nez, J.} } @article {aGallego13, title = {Region Based Foreground Segmentation Combining Color and Depth Sensors Via Logarithmic Opinion Pool Decision}, journal = {Journal of Visual Communication and Image Representation}, year = {2013}, month = {04/2013}, abstract = {

In this paper we present a novel foreground segmentation system that combines color and depth sensor information to perform a more complete Bayesian segmentation between foreground and background classes. The system combines spatial-color and spatial-depth region-based models for the foreground, as well as color and depth pixel-wise models for the background, in a Logarithmic Opinion Pool decision framework used to correctly combine the likelihoods of each model. A posterior enhancement step based on a trimap analysis is also proposed in order to correct the precision errors that the depth sensor introduces. The results presented in this paper show that our system is robust against color and depth camouflage problems between the foreground object and the background, and also improves the segmentation around the objects{\textquoteright} contours by reducing the false positive detections that appear due to the lack of precision of the depth sensors.
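
As a minimal sketch of such a pooling rule (illustrative only; the variable names, map sizes and weights below are assumptions, not taken from the paper), a Logarithmic Opinion Pool combines per-model likelihoods as a weighted geometric mean, i.e. log p = sum_i w_i log p_i:

    import numpy as np

    def logarithmic_opinion_pool(likelihoods, weights):
        # Weighted geometric mean of the per-model likelihoods, computed in
        # the log domain for numerical stability: log p = sum_i w_i * log p_i.
        eps = 1e-12
        log_pool = sum(w * np.log(p + eps) for p, w in zip(likelihoods, weights))
        return np.exp(log_pool)

    # Hypothetical per-pixel foreground likelihood maps from the two sensors.
    p_color = np.random.rand(480, 640)
    p_depth = np.random.rand(480, 640)
    # The weights can encode the reliability of each sensor and sum to one.
    p_foreground = logarithmic_opinion_pool([p_color, p_depth], [0.6, 0.4])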

}, doi = {http://dx.doi.org/10.1016/j.jvcir.2013.03.019}, url = {http://www.sciencedirect.com/science/article/pii/S104732031300059X}, author = {Gallego, J. and M. Pard{\`a}s} } @inbook {bLeon13, title = {Region-based caption text extraction}, booktitle = {Lecture Notes in Electrical Engineering (Analysis, Retrieval and Delivery of Multimedia Content)}, volume = {158}, year = {2013}, month = {07/2012}, pages = {21-36}, publisher = {Springer}, organization = {Springer}, address = {New York}, abstract = {

This chapter presents a method for caption text detection. The proposed method will be included in a generic indexing system dealing with other semantic concepts which are to be automatically detected as well. To have a coherent detection system, the various object detection algorithms use a common image description, a hierarchical region-based image model. The proposed method takes advantage of texture and geometric features to detect the caption text. Texture features are estimated using wavelet analysis and mainly applied for text candidate spotting. In turn, text characteristics verification relies on geometric features, which are estimated exploiting the region-based image model. Analysis of the region hierarchy provides the final caption text objects. The final step of consistency analysis for output is performed by a binarization algorithm that robustly estimates the thresholds on the caption text area of support.

}, keywords = {Text detection}, isbn = {978-1-4614-3830-4}, doi = {10.1007/978-1-4614-3831-1_2}, author = {Le{\'o}n, M. and Ver{\'o}nica Vilaplana and Gasull, A. and Marqu{\'e}s, F.} } @conference {cVilaplana13, title = {Salient Object Detection on a Hierarchy of Image Partitions}, booktitle = {IEEE Int. Conf. in Image Processing, ICIP 2013}, year = {2013}, month = {09/2013}, address = {Melbourne, Australia}, author = {Ver{\'o}nica Vilaplana and G. Muntaner} } @article {aTochon13, title = {Segmentation hyperspectrales de forets tropicales par arbres de partition binaires}, journal = {Revue fran{\c c}aise de photogramm{\'e}trie et de t{\'e}l{\'e}d{\'e}tection}, volume = {202}, year = {2013}, month = {May 2013}, pages = {55-65}, author = {G. Tochon and J.B. Feret and Valero, S. and R.E. Martin and R. Tupayachi and Chanussot, J. and Salembier, P. and G. Asner} } @conference {cNiamut13, title = {Towards A Format-agnostic Approach for Production, Delivery and Rendering of Immersive Media}, booktitle = {ACM Multimedia Systems}, year = {2013}, month = {3/2013}, address = {Oslo, Norway}, abstract = {

The media industry is currently being pulled in the often-opposing directions of increased realism (high resolution, stereoscopic, large screen) and personalization (selection and control of content, availability on many devices). We investigate the feasibility of an end-to-end format-agnostic approach to support both these trends. In this paper, different aspects of a format-agnostic capture, production, delivery and rendering system are discussed. At the capture stage, the concept of layered scene representation is introduced, including panoramic video and 3D audio capture. At the analysis stage, a virtual director component is discussed that allows for automatic execution of cinematographic principles, using feature tracking and saliency detection. At the delivery stage, resolution-independent audiovisual transport mechanisms for both managed and unmanaged networks are treated. In the rendering stage, a rendering process that includes the manipulation of audiovisual content to match the connected display and loudspeaker properties is introduced. Different parts of the complete system are revisited, demonstrating the requirements and the potential of this advanced concept.

}, author = {Niamut, O. and Kaiser, R. and Kienast, G. and Kochale, A. and Spille, J. and Schreer, O. and Ruiz-Hidalgo, J. and Macq, J. and Shirley, B.} } @conference {cVentura13a, title = {UPC at MediaEval 2013 Hyperlinking Task}, booktitle = {MediaEval 2013 Workshop}, year = {2013}, month = {10/2013}, publisher = {CEUR Workshop Proceedings Vol-1043}, organization = {CEUR Workshop Proceedings Vol-1043}, address = {Barcelona, Catalonia}, abstract = {

These working notes present the contribution of the UPC team to the Hyperlinking sub-task of the Search and Hyperlinking Task at MediaEval 2013. Our contribution explores the potential of a solution based only on visual cues.

In particular, every automatically generated shot is represented by a keyframe. The linking between video segments is based on the visual similarity of the keyframes they contain. Visual similarity is assessed with the intersection of bag of features histograms generated with the SURF descriptor.
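
A minimal sketch of this similarity measure (assuming L1-normalized bag-of-features histograms; the names and values are illustrative, not from the paper):

    import numpy as np

    def histogram_intersection(h1, h2):
        # Intersection of two L1-normalized bag-of-features histograms:
        # 1.0 for identical histograms, 0.0 for disjoint ones.
        return float(np.minimum(h1, h2).sum())

    # Hypothetical SURF bag-of-features histograms for two keyframes.
    h_a = np.array([0.2, 0.5, 0.3])
    h_b = np.array([0.1, 0.6, 0.3])
    similarity = histogram_intersection(h_a, h_b)  # 0.1 + 0.5 + 0.3 = 0.9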

Xavier Gir{\'o}, "UPC at MediaEval Hyperlinking 2013" from Xavi Gir{\'o}

Presentation on MediaEval Search \& Linking task 2013 from Netherlands Institute for Sound and Vision, University of Twente
}, keywords = {Benchmark, Hyperlinking, Video Collections}, url = {http://ceur-ws.org/Vol-1043/mediaeval2013_submission_35.pdf}, author = {Ventura, C. and Tella, Marcel and Xavier Gir{\'o}-i-Nieto} } @conference {cManchon-Vizuete13, title = {UPC at MediaEval 2013 Social Event Detection Task}, booktitle = {MediaEval 2013 Workshop}, year = {2013}, month = {10/2013}, publisher = {CEUR Workshop Proceedings}, organization = {CEUR Workshop Proceedings}, address = {Barcelona, Catalonia}, abstract = {

These working notes present the contribution of the UPC team to the Social Event Detection (SED) task in MediaEval 2013. The proposal extends the previous PhotoTOC work in the domain of shared collections of photographs stored in cloud services. An initial over-segmentation of the photo collection is later refined by merging pairs of similar clusters.

Xavier Gir{\'o}-i-Nieto, "UPC at MediaEval Social Event Detection 2013" from Xavi Gir{\'o}
}, keywords = {Benchmark, Event Detection, Photo Collections}, url = {http://ceur-ws.org/Vol-1043/mediaeval2013_submission_34.pdf}, author = {Manchon-Vizuete, Daniel and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xTort13, title = {Video Clustering Using Camera Motion}, year = {2013}, abstract = {

This document contains the work done at INP Grenoble during the second semester of the academic year 2011-2012 and completed in Barcelona during the first months of 2012-2013. The work presented consists of a camera motion study in different types of video in order to group fragments that have some similarity in their content.

The document explains how the data extracted by the Motion 2D program, provided by the French university, are processed in order to represent them in a simplified form using motion histograms. It also explains how the different distances between histograms are calculated and how their similarity is computed.

Three different distances are used: Manhattan, Euclidean and Bhattacharyya, although the project also describes some other, slightly more complex ones. Different histogram configurations are used, with more or fewer bins to represent the motion.

Every possible combination of number of bins and distance is evaluated using a group of 30 video fragments and the K-Means clustering algorithm. The clustering results are evaluated using the F1-score, a popular measure suitable for both clustering and classification.
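
A minimal sketch of this pipeline (the histogram dimensions, number of clusters and data below are hypothetical, and only the Bhattacharyya variant is shown):

    import numpy as np
    from sklearn.cluster import KMeans

    def bhattacharyya_distance(h1, h2):
        # Distance between two normalized motion histograms.
        bc = np.sum(np.sqrt(h1 * h2))      # Bhattacharyya coefficient
        return -np.log(max(bc, 1e-12))

    # Hypothetical motion histograms: 30 video fragments x 16 bins.
    histograms = np.random.dirichlet(np.ones(16), size=30)
    # Manhattan and Euclidean variants are one-liners via np.linalg.norm.
    labels = KMeans(n_clusters=5, n_init=10).fit_predict(histograms)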

}, url = {http://hdl.handle.net/2099.1/17337}, author = {Tort, Laura}, editor = {Xavier Gir{\'o}-i-Nieto and Rombaut, Mich{\`e}le and Pellerin, Denis} } @mastersthesis {xVentura13, title = {Visual Object Analysis Using Regions and Interest Points}, year = {2013}, abstract = {

This dissertation research explores two of the most-used image models for object detection, 3D reconstruction and visual search: region-based and interest-point image representations, and tries to provide a new image model that takes advantage of the strengths and overcomes the weaknesses of both approaches. More specifically, we focus on the gPb-owt-ucm segmentation algorithm and the SIFT local features, since they are the most established techniques in their respective fields. Furthermore, using an object retrieval benchmark, this dissertation research analyzes three basic questions: (i) the usefulness of an interest point hierarchy based on a contour strength signal, (ii) the influence of context on both interest point location and description, and (iii) the analysis of regions as spatial support for bundling interest points.

}, author = {Ventura, C.}, editor = {Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @mastersthesis {cVentura13b, title = {Visual Object Analysis Using Regions and Interest Points}, journal = {ACM Multimedia}, year = {2013}, month = {10/2013}, abstract = {

This dissertation research will explore region-based and interest-point-based image representations, two of the most-used image models for object detection, image classification and visual search, among other applications. We will analyze the relationship between both representations with the goal of proposing a new hybrid representation that takes advantage of the strengths and overcomes the weaknesses of both approaches. More specifically, we will focus on the gPb-owt-ucm segmentation algorithm and the SIFT local features, since they are the most established techniques in their respective fields. Furthermore, using an object retrieval benchmark, this dissertation research will analyze three basic questions: (i) the usefulness of an interest point hierarchy based on a contour strength signal, (ii) the influence of context on both interest point location and description, and (iii) the analysis of regions as spatial support for bundling interest points.

}, keywords = {hierarchical segmentation, Image Representation, Interest Points, Object Retrieval}, doi = {10.1145/2502081.2502220}, url = {http://dx.doi.org/10.1145/2502081.2502220}, author = {Ventura, C.}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cPalou12, title = {2.1 Depth Estimation of Frames in Image Sequences Using Motion Occlusions}, booktitle = {Computer Vision {\textendash} ECCV 2012. Workshops and Demonstrations}, year = {2012}, publisher = {Springer Berlin Heidelberg}, organization = {Springer Berlin Heidelberg}, isbn = {978-3-642-33884-7}, doi = {10.1007/978-3-642-33885-4_52}, url = {http://dx.doi.org/10.1007/978-3-642-33885-4_52}, author = {Palou, G. and Salembier, P.} } @phdthesis {dLopez-Mendez12, title = {Articulated Models for Human Motion Analysis}, year = {2012}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

Human motion analysis is a broad area of computer vision that has strongly attracted the interest of researchers in the last decades. Motion analysis covers topics such as human motion tracking and estimation, action and behavior recognition, or segmentation of human motion. All these fields are challenging for different reasons, but mostly because of viewing perspectives, clutter and the imprecise semantics of actions and human motion.

The computer vision community has addressed human motion analysis from several perspectives. Earlier approaches often relied on articulated human body models represented in the three-dimensional world. However, due to the traditionally high difficulty and cost of estimating such an articulated structure from video, research has focused on the development of human motion analysis approaches relying on low-level features. Although they obtain impressive results in several tasks, low-level features are typically conditioned by appearance and viewpoint, which makes their application to different scenarios difficult. Nonetheless, the increase in computational power, the massive availability of data and the advent of consumer depth cameras are changing the scenario, and with that change human motion analysis through articulated models can be reconsidered. Analyzing and understanding human motion through 3-dimensional information is still a crucial issue in order to obtain richer models of dynamics and behavior. In that sense, articulated models of the human body offer a compact and view-invariant representation of motion that can be used to leverage motion analysis.

In this dissertation, we present several approaches for motion analysis. In particular, we address the problems of pose inference, action recognition and temporal clustering of human motion. Articulated models are the leitmotiv in all the presented approaches. Firstly, we address pose inference by formulating a layered analysis-by-synthesis framework where models are used to generate hypotheses that are matched against video. Based on the same articulated representation upon which models are built, we propose an action recognition framework. Actions are seen as time series observed through the articulated model and generated by underlying dynamical systems. This hypothesis is used to develop recognition methods based on time-delay embeddings, analysis tools that make no assumptions on the form of the underlying dynamical system. Finally, we propose a method to cluster human motion sequences into distinct behaviors without a priori knowledge of the number of actions in the sequence. Our approach relies on the articulated model representation in order to learn a distance metric from pose data. This metric aims at capturing semantics from labeled data in order to cluster unseen motion sequences into meaningful behaviors. The proposed approaches are evaluated using publicly available datasets in order to objectively measure our contributions.

}, url = {http://hdl.handle.net/10803/112124}, author = {L{\'o}pez-M{\'e}ndez, A.}, editor = {Casas, J.} } @conference {cLopez-Mendez12, title = {Can our TV robustly understand human gestures? Real-Time Gesture Localization in Range Data}, booktitle = {Conference on Visual Media Production}, year = {2012}, month = {12/2012}, publisher = {ACM}, organization = {ACM}, address = {London, UK}, abstract = {

The {\textquoteright}old{\textquoteright} remote falls short of requirements when confronted with digital convergence for living room displays. Enriched options to watch, manage and interact with content on large displays demand improved means of interaction. Concurrently, gesture recognition is increasingly present in human-computer interaction for gaming applications. In this paper we propose a gesture localization framework for interactive display of audio-visual content. The proposed framework works with range data captured from a single consumer depth camera. We focus on still gestures because they are generally user-friendly (users do not have to make complex and tiring movements) and allow formulating the problem in terms of object localization. Our method is based on random forests, which have shown excellent performance on classification and regression tasks. In this work, however, we aim at a specific class of localization problems involving highly unbalanced data: positive examples appear in only a small fraction of space and time. We study the impact of this natural imbalance on random forest learning and we propose a framework to robustly detect gestures on range images in real applications. Our experiments with offline data show the effectiveness of our approach. We also present a real-time application where users can control the TV display with a reduced set of still gestures.

}, isbn = {978-1-4503-1311-7/12/11}, doi = {10.1145/2414688.2414691}, url = {http://www.cvmp-conference.org/2012-Papers}, author = {L{\'o}pez-M{\'e}ndez, A. and Casas, J.} } @conference {cMaceira12, title = {Depth map coding based on an optimal hierarchical region representation}, booktitle = {3DTV Conference}, year = {2012}, month = {10/2012}, publisher = {IEEE}, organization = {IEEE}, address = {Zurich, Switzerland}, abstract = {

Multiview color information used jointly with depth maps is a widespread technique for 3D video. Using this depth information, 3D functionalities such as free viewpoint video can be provided by means of depth-image-based rendering techniques. In this paper, a new technique to encode depth maps is proposed. Based on the usually smooth structure and the sharp edges of depth maps, our proposal segments the depth map into homogeneous regions of arbitrary shape and encodes the contents of these regions using different texture coding strategies. An optimal Lagrangian approach is applied to the hierarchical region representation provided by our segmentation technique. This approach automatically selects the best encoding strategy for each region and the optimal partition to encode the depth map. To avoid the high cost of coding the resulting partition, a prediction is made using the associated decoded color image.
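
A toy illustration of the Lagrangian selection step (the candidate strategies and their distortion/rate numbers are invented for the example):

    # Candidate texture coding strategies for one region: (name, distortion, rate).
    candidates = [("constant", 12.0, 40), ("linear", 7.1, 95), ("dct", 4.5, 210)]
    lmbda = 0.05
    # Pick the strategy minimizing the Lagrangian cost J = D + lambda * R.
    best = min(candidates, key=lambda c: c[1] + lmbda * c[2])
    # J: constant = 14.0, linear = 11.85, dct = 15.0 -> "linear" wins here.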

}, keywords = {3DTV, Depth map coding, depth/texture compression, rate-distortion optimization, Shape-adaptive DCT}, isbn = {978-1-4673-4903-1}, doi = {10.1109/3DTV.2012.6365481}, author = {Maceira, M. and Ruiz-Hidalgo, J. and Morros, J.R.} } @conference {cPalou12b, title = {Depth Ordering on Image Sequences Using Motion Occlusions}, booktitle = {IEEE Int. Conf. in Image Processing, ICIP 2012}, year = {2012}, month = {09/2012}, address = {Orlando, Florida, USA}, author = {Palou, G. and Salembier, P.} } @article {aCuadras12, title = {Distance-based measures of association with applications in relating hyperspectral images}, journal = {Communications in Statistics - Theory and Methods}, volume = {41}, year = {2012}, pages = {2342{\textendash}2355}, abstract = {

We propose a distance-based method to relate two data sets. We define and study some measures of multivariate association based on distances between observations. The proposed approach can be used to deal with general data sets (e.g., observations on continuous, categorical or mixed variables). An application, using Hellinger distance, provides the relationships between two regions of hyperspectral images.

}, issn = {0361-0918}, doi = {10.1080/03610926.2012.654880}, author = {Cuadras, C. and Valero, S. and Cuadras, D. and Salembier, P. and Chanussot, J.} } @article {aGallego12, title = {Enhanced foreground segmentation and tracking combining Bayesian background, shadow and foreground modeling}, journal = {Pattern Recognition Letters}, volume = {33}, year = {2012}, month = {09/2012}, pages = {1558{\textendash}1568}, type = {journal}, chapter = {1558}, abstract = {

In this paper we present a foreground segmentation and tracking system for monocular static camera sequences in indoor scenarios that achieves correct foreground detection even in complicated scenes where the foreground and background colors are similar. The workflow of the system is based on three main steps: an initial foreground detection performs a simple segmentation via Gaussian pixel color modeling and shadow removal. Next, a tracking step uses the foreground segmentation to identify the objects and tracks them using a modified mean shift algorithm. Finally, an enhanced foreground segmentation step is formulated in a Bayesian framework. For this aim, foreground and shadow candidates are used to construct probabilistic foreground and shadow models. The Bayesian framework combines a pixel-wise color background model with spatial-color models for the foreground and shadows. The final classification is performed using the graph-cut algorithm. The tracking step allows a correct updating of the probabilistic models, achieving a foreground segmentation that reduces false negative and false positive detections and obtaining a robust segmentation and tracking of each object in the scene.

}, keywords = {Foreground segmentation, GMM, Objects tracking, Shadow model, Space-color models}, issn = {0167-8655}, doi = {10.1016/j.patrec.2012.05.004}, url = {http://www.sciencedirect.com/science/article/pii/S016786551200164X}, author = {Gallego, J. and M. Pard{\`a}s and Haro, G.} } @article {aAlonso-Gonzalez12, title = {Filtering and Segmentation of Polarimetric SAR Data Based on Binary Partition Trees}, journal = {IEEE transactions on geoscience and remote sensing}, volume = {50}, year = {2012}, pages = {593{\textendash}605}, abstract = {

In this paper, we propose the use of binary partition trees (BPT) to introduce a novel region-based and multi-scale polarimetric SAR (PolSAR) data representation. The BPT structure represents homogeneous regions in the data at different levels of detail. The construction process of the BPT is based, firstly, on a region model able to represent the homogeneous areas and, secondly, on a dissimilarity measure to identify similar areas and define the merging sequence. Depending on the final application, a BPT pruning strategy needs to be introduced. In this paper, we focus on the application of the BPT PolSAR data representation to speckle noise filtering and data segmentation under the Gaussian hypothesis, where the average covariance or coherency matrices are considered as a region model. We introduce and quantitatively analyze different dissimilarity measures. In this case, and with the objective of being sensitive to the complete polarimetric information under the Gaussian hypothesis, dissimilarity measures considering the complete covariance or coherency matrices are employed. For PolSAR speckle filtering, two pruning strategies are detailed and evaluated. The resulting BPT PolSAR speckle filter processes the data according to the complete polarimetric information. As shown, this novel filtering approach is able to achieve very strong filtering while preserving the spatial resolution and the polarimetric information. Finally, the BPT representation structure is employed for high spatial resolution image segmentation applied to coastline detection. The analyses detailed in this work are based on simulated as well as real PolSAR data acquired by the ESAR system of DLR and the RADARSAT-2 system.
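
A compact sketch of the greedy merging that defines such a tree (illustrative only; the region model and dissimilarity are placeholders, and a full BPT would also restrict candidate pairs to spatially adjacent regions):

    import itertools

    def build_bpt(models, dissimilarity, merge):
        # Greedy binary merging over region models: at each step the two
        # most similar active regions are fused into a parent node until a
        # single root remains; the merge order defines the tree.
        active = dict(enumerate(models))       # region id -> region model
        merges, next_id = [], len(models)
        while len(active) > 1:
            i, j = min(itertools.combinations(active, 2),
                       key=lambda p: dissimilarity(active[p[0]], active[p[1]]))
            active[next_id] = merge(active.pop(i), active.pop(j))
            merges.append((i, j, next_id))
            next_id += 1
        return merges

    # Toy example with scalar region models (e.g. mean intensity).
    tree = build_bpt([0.10, 0.12, 0.80, 0.85],
                     dissimilarity=lambda a, b: abs(a - b),
                     merge=lambda a, b: (a + b) / 2)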

}, issn = {0196-2892}, doi = {10.1109/TGRS.2011.2160647}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @inbook {bGallego12, title = {Foreground objects segmentation for moving camera scenarios based on SCGMM}, booktitle = {Computational Intelligence for Multimedia Understanding}, volume = {7252}, number = {Lecture Notes in Computer Science}, year = {2012}, month = {09/2011}, pages = {195-206}, publisher = {Springer}, organization = {Springer}, address = {Berlin Heidelberg}, abstract = {

In this paper we present a new system for segmenting non-rigid objects in moving camera sequences, for indoor and outdoor scenarios, that achieves a correct object segmentation via a global MAP-MRF framework formulation for the foreground and background classification task. Our proposal, suitable for video indexing applications, receives as input an initial segmentation of the object to segment and consists of two region-based parametric probabilistic models that describe the spatial (x,y) and color (r,g,b) domains of the foreground and background classes. Both classes rival each other in modeling the regions that appear within a dynamic region of interest, which includes the foreground object to segment as well as the background regions that surround the object. The results presented in the paper show the correctness of the object segmentation, reducing the false positive and false negative detections originated by the new background regions that appear near the object.

}, keywords = {moving camera sequences, Object segmentation, SCGMM, video indexation}, issn = {978-3-642-32435-2}, doi = {10.1007/978-3-642-32436-9_17}, url = {http://www.springerlink.com/content/r6u266562h586476/}, author = {Gallego, J. and M. Pard{\`a}s and Solano, M.} } @conference {cPalou12a, title = {From local occlusion cues to global depth estimation}, booktitle = {IEEE Int. Conf. on Acoustics Speech and Signal Processing, ICASSP 2012}, year = {2012}, month = {03/2012}, address = {Kyoto, Japan}, author = {Palou, G. and Salembier, P.} } @article {aBosio12, title = {Gene Expression Data Classification Combining Hierarchical Representation and Efficient Feature Selection}, journal = {Journal of Biological Systems}, volume = {20}, year = {2012}, pages = {349-375}, abstract = {

A general framework for microarray data classification is proposed in this paper. It produces precise and reliable classifiers through a two-step approach. First, the original feature set is enhanced by a new set of features called metagenes. These new features are obtained through a hierarchical clustering process on the original data. Two different metagene generation rules have been analyzed, called Treelets clustering and Euclidean clustering. Metagene creation is attractive for several reasons: first, metagenes can improve the classification since they broaden the available feature space and capture the common behavior of similar genes, reducing the residual measurement noise. Furthermore, by analyzing some of the metagenes chosen for classification with gene set enrichment analysis algorithms, it is shown how metagenes can summarize the behavior of functionally related probe sets. Additionally, metagenes can point out still undocumented, highly discriminant probe sets numerically related to other probes endowed with prior biological information, in order to contribute to the knowledge discovery process.

The second step of the framework is feature selection, which applies the Improved Sequential Floating Forward Selection algorithm (IFFS) to properly choose a subset of the available features, composed of genes and metagenes, for classification. Considering the microarray sample scarcity problem, besides the classical error rate, a reliability measure is introduced to improve the feature selection process. Different scoring schemes are studied to choose the best one using both error rate and reliability. The Linear Discriminant Analysis classifier (LDA) has been used throughout this work, due to its good characteristics, but the proposed framework can be used with almost any classifier. The potential of the proposed framework has been evaluated by analyzing all the publicly available datasets offered by the MicroArray Quality Control study, phase II (MAQC). The comparative results show that the proposed framework can compete with a wide variety of state-of-the-art alternatives and that it can obtain the best mean performance if a particular setup is chosen. A Monte Carlo simulation confirmed that the proposed framework obtains stable and repeatable results.
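
A minimal sketch of the metagene idea using plain Euclidean hierarchical clustering (the data, cluster count and averaging rule are assumptions for illustration; the Treelets variant is not shown):

    import numpy as np
    from scipy.cluster.hierarchy import linkage, fcluster

    # Hypothetical expression matrix: 200 genes x 40 samples.
    X = np.random.rand(200, 40)
    # Hierarchically cluster the genes and cut the tree into 20 clusters.
    Z = linkage(X, method="average", metric="euclidean")
    ids = fcluster(Z, t=20, criterion="maxclust")
    # One simple metagene per cluster: the mean profile of its genes.
    metagenes = np.vstack([X[ids == c].mean(axis=0) for c in np.unique(ids)])
    features = np.vstack([X, metagenes])   # enlarged feature set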

}, doi = {10.1142/S0218339012400025}, url = {http://www.worldscientific.com/doi/abs/10.1142/S0218339012400025}, author = {Bosio, M. and P. Bellot and Salembier, P. and Albert Oliveras} } @inbook {bVentura12, title = {Hierarchical Navigation and Visual Search for Video Keyframe Retrieval}, booktitle = {Advances in Multimedia Modeling}, series = {Lecture Notes in Computer Science}, volume = {7131}, year = {2012}, pages = {652-654}, publisher = {Springer Berlin / Heidelberg}, organization = {Springer Berlin / Heidelberg}, abstract = {

This work presents a browser that supports two strategies for video browsing: navigation through visual hierarchies and retrieval of similar images. The input videos are first processed by a keyframe extractor to reduce the temporal redundancy and decrease the number of elements to consider. The generated keyframes are hierarchically clustered with the Hierarchical Cellular Tree (HCT) algorithm, an indexing technique that also allows the creation of data structures suitable for browsing. In the current implementation, different clustering criteria are available, based on four MPEG-7 visual descriptors computed at the global scale. The navigation can directly drive the user to the video timestamps that best match the query, or to a keyframe which is globally similar in visual terms to the query. In the latter case, a visual search engine is also available to find other similar keyframes, likewise based on MPEG-7 visual descriptors.

Winners of the Novice Run at the Video Browser Showdown 2012 at the 18th International Conference on MultiMedia Modeling, Klagenfurt, Austria, January 4-6, 2012.

}, keywords = {hierarchical navigation, image retrieval, video browser}, isbn = {978-3-642-27354-4}, doi = {10.1007/978-3-642-27355-1_67}, url = {http://dx.doi.org/10.1007/978-3-642-27355-1_67}, author = {Ventura, C. and Martos, Manel and Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @inbook {bCalderero12 , title = {Image Analysis and Understanding Based on Information Theoretical Region Merging Approaches for Segmentation and Cooperative Fusion}, booktitle = {Handbook of Research on Computational Intelligence for Engineering, Science, and Business}, year = {2012}, pages = {75-121}, publisher = {IGI Global}, organization = {IGI Global}, chapter = {4}, abstract = {

This chapter addresses the automatic creation of simplified versions of the image, known as image segmentation or partition, preserving the most semantically relevant information of the image at different levels of analysis. From a semantic and practical perspective, image segmentation is a first and key step for image analysis and pattern recognition, since region-based image representations provide a first level of abstraction and a reduction of the number of primitives, leading to a more robust estimation of parameters and descriptors. The proposed solution is based on an important class of hierarchical bottom-up segmentation approaches, known as region merging techniques. These approaches naturally provide a bottom-up hierarchy, more suitable when no a priori information about the image is available, and an excellent compromise between efficiency of computation and representation. The chapter is organized in two parts dealing with the following objectives: (i) to provide an unsupervised solution to the segmentation of generic images; (ii) to design a generic and scalable scheme to automatically fuse hierarchical segmentation results that increases the robustness and accuracy of the final solution.

}, isbn = {978-1-466-62518-1}, doi = {10.4018/978-1-4666-2518-1.ch004}, author = {Calderero, F. and Marqu{\'e}s, F.} } @inbook {bSuau12, title = {INTAIRACT: Joint Hand Gesture and Fingertip Classification for Touchless Interaction}, booktitle = {Computer Vision {\textendash} ECCV 2012}, volume = {7585}, year = {2012}, pages = {602-606}, publisher = {Springer}, organization = {Springer}, chapter = {3}, address = {Heidelberg}, abstract = {

In this demo we present INTAIRACT, an online hand-based touchless interaction system. Interactions are based on easy-to-learn hand gestures that, combined with translations and rotations, render a user-friendly and highly configurable system. The main advantage with respect to existing approaches is that we are able to robustly locate and identify fingertips. Hence, we are able to employ a simple but powerful alphabet of gestures not only by determining the number of visible fingers in a gesture, but also which fingers are being observed. To achieve such a system we propose a novel method that jointly infers hand gestures and fingertip locations using a single depth image from a consumer depth camera. Our approach is based on a novel descriptor for depth data, the Oriented Radial Distribution (ORD) [1]. On the one hand, we exploit the ORD for robust classification of hand gestures by means of efficient k-NN retrieval. On the other hand, maxima of the ORD are used to perform structured inference of fingertip locations. The proposed method outperforms other state-of-the-art approaches both in gesture recognition and fingertip localization. An implementation of the ORD extraction on a GPU yields a real-time demo running at approximately 17 fps on a single laptop.

1. Suau, X., Ruiz-Hidalgo, J., Casas, J.R.: Oriented Radial Distribution on Depth Data: Application to the Detection of End-Effectors. In: ICASSP. (2012)

}, isbn = {978-3-642-33885-4}, issn = {978-3-642-33884-7}, doi = {10.1007/978-3-642-33885-4_62}, author = {Suau, X. and Alcoverro, M. and L{\'o}pez-M{\'e}ndez, A. and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cGiro-i-Nieto12a, title = {Interactive segmentation and tracking of video objects}, booktitle = {Image Analysis for Multimedia Interactive Services (WIAMIS), 2012 13th International Workshop on}, year = {2012}, month = {05/2012}, publisher = {IEEE}, organization = {IEEE}, address = {Dublin, Ireland}, abstract = {

This paper describes a mechanism to interactively segment objects from a sequence of video frames. The extracted object can later be embedded in a different background, associated with local-scale metadata, or used to train an automatic object detector. The workflow requires the interaction of the user at two stages: the temporal segmentation of the frames containing the object and the generation of an object mask to initialize a video tracker. The mask is defined as a combination of regions generated by an image segmentation algorithm. This framework has been integrated in an annotation tool available to the public.

}, keywords = {Image segmentation, Object segmentation, Proposals, semantics, Signal processing algorithms, Video sequences, Visualization}, doi = {10.1109/WIAMIS.2012.6226749}, url = {http://dx.doi.org/10.1109/WIAMIS.2012.6226749}, author = {Xavier Gir{\'o}-i-Nieto and Martos, Manel} } @conference {cLopez-Mendez12, title = {Metric Learning from Poses for Temporal Clustering of Human Motion}, booktitle = {British Machine Vision Conference 2012}, year = {2012}, month = {09/2012}, address = {Guildford, UK}, abstract = {

Temporal clustering of human motion into semantically meaningful behaviors is a challenging task. While unsupervised methods do well to some extent, the obtained clusters often lack a semantic interpretation. In this paper, we propose to learn what makes a sequence of human poses different from others such that it should be annotated as an action. To this end, we formulate the problem as weakly supervised temporal clustering for an unknown number of clusters. Weak supervision is attained by learning a metric from the implicit semantic distances derived from already annotated databases. Such a metric contains some low-level semantic information that can be used to effectively segment a human motion sequence into distinct actions or behaviors. The main advantage of our approach is that metrics can be successfully used across datasets, making our method a compelling alternative to unsupervised methods. Experiments on publicly available mocap datasets show the effectiveness of our approach.

}, doi = {10.5244/C.26.49}, author = {L{\'o}pez-M{\'e}ndez, A. and Gall, J. and Casas, J. and van Gool, L.} } @conference {cBosio12, title = {Microarray classification with hierarchical data representation and novel feature selection criteria}, booktitle = {IEEE 12th International Conference on BioInformatics and BioEngineering}, year = {2012}, month = {11/2012}, address = {Larnaca, Cyprus}, author = {Bosio, M. and P. Bellot and Salembier, P. and Albert Oliveras} } @article {aLopez-Mendez12, title = {Model-Based Recognition of Human Actions by Trajectory Matching in Phase Spaces}, journal = {Image and Vision Computing}, year = {2012}, abstract = {

This paper presents a human action recognition framework based on the theory of nonlinear dynamical systems. The ultimate aim of our method is to recognize actions from multi-view video. We estimate and represent human motion by means of a virtual skeleton model, providing the basis for a view-invariant representation of human actions. Actions are modeled as a set of weighted dynamical systems associated with different model variables. We use time-delay embeddings on the time series resulting from the evolution of the model variables over time to reconstruct phase portraits of appropriate dimensions. These phase portraits characterize the underlying dynamical systems. We propose a distance to compare trajectories within the reconstructed phase portraits. These distances are used to train SVM models for action recognition. Additionally, we propose an efficient method to learn a set of weights reflecting the discriminative power of a given model variable in a given action class. Our approach behaves well on noisy data, even in cases where action sequences last just a few frames. Experiments with marker-based and markerless motion capture data show the effectiveness of the proposed method. To the best of our knowledge, this contribution is the first to apply time-delay embeddings on data obtained from multi-view video.
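
A minimal sketch of the time-delay embedding step (the embedding dimension and delay are illustrative choices, not the paper's settings):

    import numpy as np

    def delay_embed(x, dim, tau):
        # Reconstruct a phase portrait from a scalar time series x:
        # each point is (x[t], x[t + tau], ..., x[t + (dim - 1) * tau]).
        n = len(x) - (dim - 1) * tau
        return np.column_stack([x[i * tau : i * tau + n] for i in range(dim)])

    # Toy "model variable" time series; real input would be e.g. a joint angle.
    x = np.sin(np.linspace(0, 20, 500))
    portrait = delay_embed(x, dim=3, tau=5)   # shape (490, 3)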

}, keywords = {action recognition}, issn = {0262-8856}, doi = {10.1016/j.imavis.2012.06.007}, url = {http://www.sciencedirect.com/science/article/pii/S0262885612000959?v=s5}, author = {L{\'o}pez-M{\'e}ndez, A. and Casas, J.} } @conference {cBosio12a, title = {Multiclass cancer microarray classification algorithm with Pair-Against-All redundancy}, booktitle = {The 2012 IEEE International Workshop on Genomic Signal Processing and Statistics (GENSIPS{\textquoteright}12)}, year = {2012}, month = {12/2012}, address = {Washington, DC, USA}, author = {Bosio, M. and P. Bellot and Salembier, P. and Albert Oliveras} } @conference {cGiro-i-Nieto12b, title = {Multiscale annotation of still images with GAT}, booktitle = {Proceedings of the 1st International Workshop on Visual Interfaces for Ground Truth Collection in Computer Vision Applications}, year = {2012}, publisher = {ACM}, organization = {ACM}, address = {Capri, Italy}, abstract = {

This paper presents GAT, a Graphical Annotation Tool for still images that works both at the global and local scales. This interface has been designed to assist users in the annotation of images with relation to the semantic classes described in an ontology. Positive, negative and neutral labels can be assigned to both whole images and parts of them. The user interface is capable of exploiting segmentation data to assist in the selection of objects. Moreover, the annotation capabilities are complemented with additional functionalities that allow the creation and evaluation of an image classifier. The implemented Java source code is published under a free software license.

}, keywords = {annotation, image, interactive, segmentation, semantics}, isbn = {978-1-4503-1405-3}, doi = {10.1145/2304496.2304497}, url = {http://doi.acm.org/10.1145/2304496.2304497}, author = {Xavier Gir{\'o}-i-Nieto and Martos, Manel} } @article {aCalderero12, title = {Multispectral Cooperative Partition Sequence Fusion for Joint Classification and Hierarchical Segmentation}, journal = {Geoscience and Remote Sensing Letters, IEEE}, volume = {9}, year = {2012}, pages = {1012-1016}, abstract = {

In this letter, a region-based fusion methodology is presented for joint classification and hierarchical segmentation of specific ground cover classes from high-spatial-resolution remote sensing images. Multispectral information is fused at the partition level using nonlinear techniques, which allows the different relevance of the various bands to be fully exploited. A hierarchical segmentation is performed for each individual band, and the ensuing segmentation results are fused in an iterative and cooperative way. At each iteration, a consensus partition is obtained based on information theory and is combined with a specific ground cover classification. Here, the proposed approach is applied to the extraction and segmentation of vegetation areas. The result is a hierarchy of partitions with the most relevant information of the vegetation areas at different levels of resolution. This system has been tested for vegetation analysis in high-spatial-resolution images from the QuickBird and GeoEye satellites.

}, keywords = {GeoEye satellite, geophysical image processing, geophysical techniques, ground cover classification, hierarchical segmentation, high-spatial-resolution remote sensing images, image classification, image fusion, image region analysis, Image segmentation, information fusion, information theory, joint classification, Joints, Merging, multispectral cooperative partition sequence fusion, multispectral images, multispectral information, nonlinear techniques, partition level, QuickBird satellite, region merging, region-based fusion methodology, Remote sensing, Spatial resolution, specific ground cover classes, Vegetation mapping}, issn = {1545-598X}, doi = {10.1109/LGRS.2012.2188776}, author = {Calderero, F. and F. Eugenio and Marcello, J. and Marqu{\'e}s, F.} } @conference {cNavarro12 , title = {Multi-view Body Tracking with a Detector-Driven Hierarchical Particle Filter}, booktitle = {7th International Conference AMDO 2012}, year = {2012}, month = {07/2012}, publisher = {Springer}, organization = {Springer}, address = {Port d{\textquoteright}Andratx, Mallorca}, abstract = {

In this paper we present a novel approach to markerless human motion capture that robustly integrates body part detections in multiple views. The proposed method fuses cues from multiple views to enhance the propagation and observation model of particle filtering methods aiming at human motion capture. We particularize our method to improve arm tracking in the publicly available IXMAS dataset. Our experiments show that the proposed method outperforms other state-of-the-art approaches.

}, doi = {10.1007/978-3-642-31567-1_8}, author = {Navarro, Sergio and L{\'o}pez-M{\'e}ndez, A. and Alcoverro, M. and Casas, J.} } @inbook {bNavarro12, title = {Multi-view Body Tracking with a Detector-Driven Hierarchical Particle Filter}, booktitle = {Lecture Notes in Computer Science: Articulated Motion and Deformable Objects}, series = {Lecture Notes in Computer Science}, volume = {7378}, year = {2012}, pages = {82-91}, publisher = {Springer }, organization = {Springer }, address = {Berlin / Heidelberg}, abstract = {

In this paper we present a novel approach to markerless human motion capture that robustly integrates body part detections in multiple views. The proposed method fuses cues from multiple views to enhance the propagation and observation model of particle filtering methods aiming at human motion capture. We particularize our method to improve arm tracking in the publicly available IXMAS dataset. Our experiments show that the proposed method outperforms other state-of-the-art approaches.

}, isbn = {978-3-642-31566-4}, doi = {10.1007/978-3-642-31567-1_8}, author = {Navarro, Sergio and L{\'o}pez-M{\'e}ndez, A. and Alcoverro, M. and Casas, J.}, editor = {Perales, Francisco and Fisher, Robert and Moeslund, Thomas} } @article {aRuiz-Hidalgo12, title = {Multiview depth coding based on combined color/depth segmentation}, journal = {Journal of visual communication and image representation}, volume = {23}, number = {1}, year = {2012}, pages = {42{\textendash}52}, abstract = {

In this paper a new coding method for multiview depth video is presented. Considering the smooth structure and sharp edges of depth maps, a segmentation-based approach is proposed. This allows the depth contours to be better preserved, thus introducing fewer artifacts in the depth perception of the video. To reduce the cost associated with partition coding, an estimation of the depth partition is built using the decoded color view segmentation. This estimation is refined by sending complementary information about the relevant differences between the color and depth partitions. For coding the depth content of each region, a decomposition into an orthogonal basis is used in this paper, although similar decompositions may also be employed. Experimental results show that the proposed segmentation-based depth coding method outperforms H.264/AVC and H.264/MVC by more than 2 dB at similar bitrates.

}, keywords = {3DTV, Depth map, multiview video coding, virtual view}, issn = {1047-3203}, doi = {10.1016/j.jvcir.2011.08.001}, url = {http://www.sciencedirect.com/science/article/pii/S1047320311001040}, author = {Ruiz-Hidalgo, J. and Morros, J.R. and Aflaki, P. and Calderero, F. and Marqu{\'e}s, F.} } @conference {cSuau12, title = {Oriented radial distribution on depth data: Application to the detection of end-effectors}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing}, year = {2012}, month = {03/2012}, address = {Kyoto, Japan}, abstract = {

End-effectors are considered to be the main topological extremities of a given 3D body. Even though the nature of such a body is not restricted, this paper focuses on the human body case. Detection of human extremities is a key issue in the human motion capture domain, being needed to initialize and update the tracker. Therefore, the effectiveness of human motion capture systems usually depends on the reliability of the obtained end-effectors. The increasing accuracy, low cost and easy installation of depth cameras have opened the door to new strategies to overcome the body pose estimation problem. With the objective of detecting the head, hands and feet of a human body, we propose a new local feature computed from depth data, which gives an idea of its curvature and prominence. The feature is weighted depending on recent detections, also providing a temporal dimension. Based on this feature, end-effector candidate blobs are obtained and classified into head, hands and feet according to three probabilistic descriptors.

}, doi = {10.1109/ICASSP.2012.6288002}, author = {Suau, X. and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cGiro-i-Nieto12, title = {Part-Based Object Retrieval With Binary Partition Trees}, booktitle = {Doctoral Consortium in Computer Vision and Pattern Recognition (CVPR)}, year = {2012}, month = {06/2012}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Providence (RI), USA}, abstract = {

This PhD thesis, {\textquotedblleft}Part-based Object Retrieval with Binary Partition Trees{\textquotedblright}, addresses the problem of visual object retrieval, where a user formulates a query to an image database by providing one or multiple examples of an object of interest. The developed techniques aim both at finding those images in the database that contain the object and at locating the object in the image and segmenting it from the background. Every considered image, both the ones used as queries and the ones contained in the target database, is represented as a Binary Partition Tree (BPT), the hierarchy of regions previously proposed by Salembier and Garrido (2000). This data structure offers multiple opportunities and challenges when applied to the object retrieval problem.

One application of BPTs appears during the formulation of the query, when the user must interactively segment the query object from the background. Firstly, the BPT can assist in adjusting an initial marker, such as a scribble or bounding box, to the object contours. Secondly, the BPT can also define a navigation path for the user to adjust an initial selection to the appropriate scale. The hierarchical structure of the BPT is also exploited to extract a new type of visual words named Hierarchical Bag of Regions (HBoR). Each region defined in the BPT is characterized with a feature vector that combines a soft quantization on a visual codebook with an efficient bottom-up computation through the BPT. These features allow the definition of a novel feature space, the Parts Space, where each object is located according to the parts that compose it.

HBoR features have been applied to two scenarios for object retrieval, both of them solved by considering the decomposition of the objects in parts. In the first scenario, the query is formulated with a single object exemplar which is to be matched with each BPT in the target database. The matching problem is solved in two stages: an initial top-down one that assumes that the hierarchy from the query is respected in the target BPT, and a second bottom-up one that relaxes this condition and considers region merges which are not in the target BPT. The second scenario where HBoR features are applied considers a query composed of several visual objects. In this case, the provided exemplars are considered as a training set to build a model of the query concept. This model is composed of two levels, a first one where each part is modelled and detected separately, and a second one that characterises the combinations of parts that describe the complete object. The analysis process exploits the hierarchical nature of the BPT by using a novel classifier that drives an efficient top-down analysis of the target BPTs.

}, author = {Xavier Gir{\'o}-i-Nieto} } @phdthesis {dGiro-i-Nieto12, title = {Part-Based Object Retrieval With Binary Partition Trees}, volume = {Phd}, year = {2012}, month = {05/2012}, pages = {215}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, address = {Barcelona, Catalonia}, abstract = {

This thesis addresses the problem of visual object retrieval, where a user formulates a query to an image database by providing one or multiple examples of an object of interest. The presented techniques aim both at finding those images in the database that contain the object and at locating the object in the image and segmenting it from the background.

Every considered image, both the ones used as queries and the ones contained in the target database, is represented as a Binary Partition Tree (BPT), the hierarchy of regions previously proposed by Salembier and Garrido (2000). This data structure offers multiple opportunities and challenges when applied to the object retrieval problem.

One application of BPTs appears during the formulation of the query, when the user must interactively segment the query object from the background. Firstly, the BPT can assist in adjusting an initial marker, such as a scribble or bounding box, to the object contours. Secondly, the BPT can also define a navigation path for the user to adjust an initial selection to the appropriate scale.

The hierarchical structure of the BPT is also exploited to extract a new type of visual words named Hierarchical Bag of Regions (HBoR). Each region defined in the BPT is characterized with a feature vector that combines a soft quantization on a visual codebook with an efficient bottom-up computation through the BPT. These features allow the definition of a novel feature space, the Parts Space, where each object is located according to the parts that compose it.

HBoR features have been applied to two scenarios for object retrieval, both of them solved by considering the decomposition of the objects in parts. In the first scenario, the query is formulated with a single object exemplar which is to be matched with each BPT in the target database. The matching problem is solved in two stages: an initial top-down one that assumes that the hierarchy from the query is respected in the target BPT, and a second bottom-up one that relaxes this condition and considers region merges which are not in the target BPT.

The second scenario where HBoR features are applied considers a query composed of several visual objects, such as a person, a bottle or a logo. In this case, the provided exemplars are considered as a training set to build a model of the query concept. This model is composed of two levels, a first one where each part is modelled and detected separately, and a second one that characterises the combinations of parts that describe the complete object. The analysis process exploits the hierarchical nature of the BPT by using a novel classifier that drives an efficient top-down analysis of the target BPTs.

Xavier Gir{\'o}-i-Nieto, "Part-based Object Retrieval with Binary Partition Trees" from Xavi Gir{\'o}
}, url = {http://hdl.handle.net/10803/108909}, author = {Xavier Gir{\'o}-i-Nieto}, editor = {Marqu{\'e}s, F. and Chang, Shih-Fu} } @conference {cIrurueta12, title = {PROMEDS: An adaptive robust fundamental matrix estimation approach}, booktitle = {3DTV Conference}, year = {2012}, month = {09/2012}, publisher = {IEEE}, organization = {IEEE}, address = {Zurich, Switzerland}, abstract = {

Accurate fundamental matrix estimation from computed correspondences is hard to achieve depending on the constraints on computational time and available data (i.e., correspondences and quality scores). Several algorithms exist for this task, such as the 8-point and 7-point algorithms, or robust methods such as RANSAC, MSAC or LMedS. Robust methods are capable of discriminating correspondence outliers, thus obtaining better results. Additionally, some variations of the previous methods have been proposed. For instance, PROSAC is an improvement of RANSAC which takes into account additional information about the quality of the matches to largely reduce the computational cost of the fundamental matrix estimation process. This work proposes a new robust method for fundamental matrix estimation that combines the benefits of the PROSAC and LMedS algorithms, namely improved quality, reduced computational time and fewer parameters to adjust.
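
As a hedged sketch of how PROSAC-style progressive sampling can be combined with LMedS-style scoring (the function names and the simplified pool-growth schedule below are illustrative, not the paper's PROMEDS algorithm, and the usual point normalization of the 8-point method is omitted for brevity):

```python
import numpy as np

def eight_point(x1, x2):
    """Linear 8-point estimate of F from Nx2 point arrays (unnormalized sketch)."""
    A = np.column_stack([x2[:, 0]*x1[:, 0], x2[:, 0]*x1[:, 1], x2[:, 0],
                         x2[:, 1]*x1[:, 0], x2[:, 1]*x1[:, 1], x2[:, 1],
                         x1[:, 0],          x1[:, 1],          np.ones(len(x1))])
    _, _, Vt = np.linalg.svd(A)
    F = Vt[-1].reshape(3, 3)
    U, s, Vt = np.linalg.svd(F)                  # enforce rank 2
    return U @ np.diag([s[0], s[1], 0.0]) @ Vt

def sampson_sq(F, x1, x2):
    """Squared Sampson distance of each correspondence to F."""
    h1 = np.column_stack([x1, np.ones(len(x1))])
    h2 = np.column_stack([x2, np.ones(len(x2))])
    Fx1, Ftx2 = h1 @ F.T, h2 @ F
    num = np.sum(h2 * Fx1, axis=1) ** 2
    den = Fx1[:, 0]**2 + Fx1[:, 1]**2 + Ftx2[:, 0]**2 + Ftx2[:, 1]**2
    return num / den

def promeds_like(x1, x2, quality, n_iters=500):
    """Quality-ordered (PROSAC-like) sampling scored by the median (LMedS-like)."""
    order = np.argsort(-quality)                 # best-quality matches first
    best_F, best_med = None, np.inf
    for it in range(n_iters):
        pool = order[:8 + it * len(order) // n_iters]   # progressively grown pool
        idx = np.random.choice(pool, 8, replace=False)
        F = eight_point(x1[idx], x2[idx])
        med = np.median(sampson_sq(F, x1, x2))   # no inlier threshold to tune
        if med < best_med:
            best_F, best_med = F, med
    return best_F
```

The median-based score is what removes the inlier threshold parameter, while the progressively grown sampling pool is what cuts the number of iterations when the quality scores are informative.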

}, keywords = {Fundamental Matrix, LMedS, PROSAC}, author = {Irurueta, A. and Morros, J.R.} } @article {aSuau12, title = {Real-time head and hand tracking based on 2.5D data}, journal = {IEEE Transactions on Multimedia }, volume = {14}, year = {2012}, month = {06/2012}, pages = {575-585 }, abstract = {

A novel real-time algorithm for head and hand tracking is proposed in this paper. This approach is based on data from a range camera, which is exploited to resolve ambiguities and overlaps. The position of the head is estimated with a depth-based template matching, its robustness being reinforced with an adaptive search zone. Hands are detected in a bounding box attached to the head estimate, so that the user may move freely in the scene. A simple method to decide whether the hands are open or closed is also included in the proposal. Experimental results show high robustness against partial occlusions and fast movements. Accurate hand trajectories may be extracted from the estimated hand positions, and may be used for interactive applications as well as for gesture classification purposes.
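
A minimal sketch of the depth-based template matching idea, assuming a depth map, a head-sized depth template and the previous head estimate are available (exhaustive SSD search; in the paper the search zone is adaptive, which here would amount to shrinking or growing `zone` with the tracker's confidence):

```python
import numpy as np

def match_depth_template(depth, template, center, zone=40):
    """Exhaustive SSD match of a depth template inside a square search
    zone around the previous head position (illustrative sketch)."""
    th, tw = template.shape
    cy, cx = center
    best, best_pos = np.inf, center
    for y in range(max(0, cy - zone), min(depth.shape[0] - th, cy + zone)):
        for x in range(max(0, cx - zone), min(depth.shape[1] - tw, cx + zone)):
            ssd = np.sum((depth[y:y+th, x:x+tw] - template) ** 2)
            if ssd < best:
                best, best_pos = ssd, (y, x)
    return best_pos
```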

}, issn = {1520-9210}, doi = {http://dx.doi.org/10.1109/TMM.2012.2189853}, author = {Suau, X. and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cVaras12a, title = {A Region-Based Particle Filter for Generic Object Tracking and Segmentation}, booktitle = {ICIP - International Conference on Image Processing}, year = {2012}, month = {09/2012}, address = {Orlando}, abstract = {

In this work we present a region-based particle filter for generic object tracking and segmentation. The representation of the object in terms of regions homogeneous in color allows the proposed algorithm to robustly track the object and accurately segment its shape along the sequence. Moreover, this segmentation provides a mechanism to update the target model and allows the tracker to deal with color and shape variations of the object. The performance of the algorithm has been tested using the LabelMe Video public database.

The experiments show satisfactory results in both tracking and segmentation of the object, without a significant increase in computational time thanks to an efficient computation of the image partition.
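
The paper's region-based likelihood is specific to its object model, but the surrounding particle filter machinery follows the standard predict-weight-resample loop; a generic sketch, with the likelihood left as a placeholder for the region-based score, looks like this:

```python
import numpy as np

def particle_filter_step(particles, likelihood, motion_std=5.0):
    """One bootstrap particle filter step. `likelihood(state)` stands in
    for the paper's region-based observation model (placeholder)."""
    # Predict: random-walk motion model
    particles = particles + np.random.normal(0.0, motion_std, particles.shape)
    # Weight: evaluate the observation model at each particle
    weights = np.array([likelihood(p) for p in particles])
    weights /= weights.sum()
    # Resample (systematic) when the effective sample size drops too low
    if 1.0 / np.sum(weights ** 2) < 0.5 * len(particles):
        n = len(particles)
        positions = (np.arange(n) + np.random.rand()) / n
        idx = np.minimum(np.searchsorted(np.cumsum(weights), positions), n - 1)
        particles, weights = particles[idx], np.full(n, 1.0 / n)
    return particles, weights
```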

}, doi = {http://dx.doi.org/10.1109/ICIP.2012.6467114}, author = {David Varas and Marqu{\'e}s, F.} } @conference {cJimenez12, title = {Registration of Multi-Modal Neuroimaging Datasets by Considering the Non-Overlapping Field of View into the NMI Calculation}, booktitle = {IEEE International Symposium on Biomedical Imaging, ISBI 2012}, year = {2012}, address = {Barcelona, Spain}, author = {Jim{\'e}nez, X and Figueiras, F and Marqu{\'e}s, F. and Salembier, P. and Herance, R and Rojas, S and Mill{\'a}n, O and Pareto, D and Domingo Gispert, J} } @inbook {bCarcel12, title = {Rich Internet Application for Semi-automatic Annotation of Semantic Shots on Keyframes}, booktitle = {Computational Intelligence for Multimedia Understanding}, volume = {7242}, number = {Lecture Notes in Computer Science}, year = {2012}, pages = {172-182}, publisher = {Springer-Verlag}, organization = {Springer-Verlag}, address = {Pisa, Italy}, abstract = {

This paper describes a system developed for the semi-automatic annotation of keyframes in a broadcasting company. The tool aims at assisting archivists, who traditionally label every keyframe manually, by suggesting an automatic annotation that they can intuitively edit and validate. The system is valid for any domain as it uses generic MPEG-7 visual descriptors and binary SVM classifiers. The classification engine has been tested on the multiclass problem of semantic shot detection, a type of metadata used in the company to index new content ingested in the system. The detection performance has been tested in two different domains: soccer and parliament. The core engine is accessed by a Rich Internet Application via a web service. The graphical user interface allows editing the suggested labels with an intuitive drag-and-drop mechanism between rows of thumbnails, each row representing a different semantic shot class. The system has been described as complete and easy to use by the professional archivists at the company.

}, keywords = {annotation, classification, MPEG-7 visual descriptors, RIA, semantic shot}, isbn = {978-3-642-32435-2}, doi = {10.1007/978-3-642-32436-9_15}, url = {http://www.springerlink.com/content/x34632125j381045/}, author = {Carcel, Elisabet and Martos, Manel and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cPont-Tuset12, title = {Supervised Assessment of Segmentation Hierarchies}, booktitle = {European Conference on Computer Vision (ECCV)}, year = {2012}, month = {01/2012}, abstract = {

This paper addresses the problem of the supervised assessment of hierarchical region-based image representations. Given the large amount of partitions represented in such structures, the supervised assessment approaches in the literature are based on selecting a reduced set of representative partitions and evaluating their quality. Assessment results, therefore, depend on the partition selection strategy used. Instead, we propose to find the partition in the tree that best matches the ground-truth partition, that is, the upper-bound partition selection.

We show that different partition selection algorithms can lead to different conclusions regarding the quality of the assessed trees and that the upper-bound partition selection provides the following advantages: 1) it does not limit the assessment to a reduced set of partitions, and 2) it better discriminates the random trees from actual ones, which reflects a better qualitative behavior. We model the problem as a Linear Fractional Combinatorial Optimization (LFCO) problem, which makes the upper-bound selection feasible and efficient.
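
The abstract names the problem class without stating the formulation; generically (notation assumed here, not taken from the paper), an LFCO problem maximizes a ratio of linear functions over a combinatorial feasible set:

```latex
\max_{x \in \{0,1\}^n} \;
\frac{\sum_{i=1}^{n} a_i \, x_i}{\sum_{i=1}^{n} b_i \, x_i}
\quad \text{s.t.} \quad x \in \mathcal{X},
```

where, in this setting, each $x_i$ would flag whether region $i$ of the hierarchy belongs to the selected partition, $\mathcal{X}$ would encode the cuts of the tree that form valid partitions, and $a_i$, $b_i$ would collect each region's contribution to the numerator and denominator of the partition quality measure.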

}, doi = {10.1007/978-3-642-33765-9_58}, author = {Jordi Pont-Tuset and Marqu{\'e}s, F.} } @conference {cMohedano12, title = {Teaching Students to Teach Computers}, booktitle = { International Congress on University Teaching and Innovation (CIDUI)}, year = {2012}, month = {07/2012}, abstract = {

This paper presents a Problem-Based Learning activity that introduces machine learning contents in a study plan based on electronics and signal processing subjects. The activity proposes that students follow the same evaluation protocols adopted in scientific challenges, where different research groups test their own machine learning techniques on a common dataset and evaluation metrics. Teams of students adopt the role of a research group to develop their novel solution for classifying images depending on whether they represent a certain semantic concept. Students must implement software solutions that are tested and presented in class to prove their performance and originality. This paper includes a detailed evaluation of the workload and satisfaction generated by the activity, based on questionnaires answered by the students.

}, keywords = {groups, machine learning, problem-based learning}, url = {http://hdl.handle.net/2117/16354}, author = {Mohedano, Eva and Xavier Gir{\'o}-i-Nieto} } @conference {cAlonso-Gonzalez12a, title = {Temporal polsar image series exploitation with binary partition trees}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium, IGARSS 2012}, year = {2012}, address = {Munich, Germany}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @conference {cPont-Tuset12a, title = {Upper-bound assessment of the spatial accuracy of hierarchical region-based image representations}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing}, year = {2012}, month = {03/2012}, pages = {865-868}, abstract = {

Hierarchical region-based image representations are versatile tools for segmentation, filtering, object detection, etc. The evaluation of their spatial accuracy has usually been performed by assessing the final result of an algorithm based on this representation. Given its wide applicability, however, a direct supervised assessment, independent of any application, would be desirable and fair.

A brute-force assessment of all the partitions represented in the hierarchical structure would be a correct approach, but as we prove formally, it is computationally unfeasible. This paper presents an efficient algorithm to find the upper-bound performance of the representation, and we show that the previous approximations in the literature can fail at finding this bound.

}, isbn = {978-1-4673-0044-5}, doi = {10.1109/ICASSP.2012.6288021}, author = {Jordi Pont-Tuset and Marqu{\'e}s, F.} } @conference {cAlonso-Gonzalez12, title = {Variable local weight filtering for polsar data speckle noise reduction}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium, IGARSS{\textquoteright}2012}, year = {2012}, address = {Munich, Germany}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @conference {cSalvador12, title = {Variational Reconstruction and Restoration for Video Super-Resolution}, booktitle = {International Conference on Pattern Recognition (ICPR)}, year = {2012}, month = {11-2012}, address = {Tsukuba, Japan}, abstract = {

This paper presents a variational framework for obtaining super-resolved video sequences, based on the observation that reconstruction-based Super-Resolution (SR) algorithms are limited by two factors: registration exactitude and Point Spread Function (PSF) estimation accuracy. To minimize the impact of the first limiting factor, a small-scale linear inpainting algorithm is proposed to provide smooth SR video frames. To improve the second limiting factor, fast PSF local estimation and total variation-based denoising are proposed. Experimental results reflect the improvements provided by the proposed method when compared to classic SR approaches.
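
Purely to illustrate the total variation-based denoising ingredient (this is a textbook ROF-style gradient descent, not the paper's implementation; all parameter values are illustrative):

```python
import numpy as np

def tv_denoise(img, lam=0.1, step=0.2, n_iters=100, eps=1e-8):
    """Gradient descent on the smoothed ROF energy
    E(u) = 0.5 * ||u - img||^2 + lam * sum |grad u|."""
    u = img.astype(float).copy()
    for _ in range(n_iters):
        ux = np.roll(u, -1, axis=1) - u          # forward differences
        uy = np.roll(u, -1, axis=0) - u
        mag = np.sqrt(ux**2 + uy**2 + eps)       # smoothed gradient magnitude
        px, py = ux / mag, uy / mag
        div = (px - np.roll(px, 1, axis=1)) + (py - np.roll(py, 1, axis=0))
        u -= step * ((u - img) - lam * div)      # descend the energy
    return u
```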

}, url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?tp=\&arnumber=6460315\&contentType=Conference+Publications\&searchField\%3DSearch_All\%26queryText\%3Dvariational+reconstruction}, author = {Salvador, J. and Rivero, D. and Kochale, A. and Ruiz-Hidalgo, J.} } @book {eRoca11, title = {14{\`e} Premi a la Qualitat en la Doc{\`e}ncia Universit{\`a}ria 2011}, year = {2011}, url = {http://cataleg.upc.edu/record=b1395249~S1*cat}, author = {Roca, E. and Marqu{\'e}s, F.} } @article {aButko11, title = {Acoustic event detection based on feature-level fusion of audio and video modalities}, journal = {Eurasip journal on advances in signal processing}, volume = {2011}, year = {2011}, pages = {1{\textendash}11}, abstract = {

Acoustic event detection (AED) aims at determining the identity of sounds and their temporal position in audio signals. When applied to spontaneously generated acoustic events, AED based only on audio information shows a large number of errors, which are mostly due to temporal overlaps. Actually, temporal overlaps accounted for more than 70\% of errors in the real-world interactive seminar recordings used in the CLEAR 2007 evaluations. In this paper, we improve the recognition rate of acoustic events using information from both audio and video modalities. First, the acoustic data are processed to obtain both a set of spectrotemporal features and the 3D localization coordinates of the sound source. Second, a number of features are extracted from video recordings by means of object detection, motion analysis, and multicamera person tracking to represent the visual counterpart of several acoustic events. A feature-level fusion strategy is used, and a parallel structure of binary HMM-based detectors is employed in our work. The experimental results show that information from both the microphone array and video cameras is useful to improve the detection rate of isolated as well as spontaneously generated acoustic events.

}, issn = {1687-6172}, doi = {10.1155/2011/485738}, url = {http://www.hindawi.com/journals/asp/2011/485738/}, author = {Butko, T. and Cristian Canton-Ferrer and Segura, C. and Xavier Gir{\'o}-i-Nieto and Nadeu, C. and Hernando, J. and Casas, J.} } @conference {cNiamut11, title = {Advanced visual rendering, gesture-based interaction and distributed delivery for immersive and interactive media services}, booktitle = {International Broadcasting Convention 2011}, year = {2011}, pages = {1{\textendash}8}, isbn = {0780388747}, url = {http://www.ibc.org/page.cfm/Action=Visitor/VisitorID=2851/PageOption=Seminar_1/libEntryID=15}, author = {Niamut, O. and Kochale, A. and Ruiz-Hidalgo, J. and Macq, J. and Kienast, G.} } @conference {cLopez-Mendez11, title = {Approximate partitioning of observations in hierarchical particle filter body tracking}, booktitle = {2011 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops}, year = {2011}, pages = {19{\textendash}24}, abstract = {

This paper presents a model-based hierarchical particle filtering algorithm to estimate the pose and anthropometric parameters of humans in multi-view environments. Our method incorporates a novel likelihood measurement approach consisting of an approximate partitioning of observations. Provided that a partitioning of the human body model has been defined and associates body parts to state space variables, the proposed method estimates the image regions that are relevant to each body part and thus to the state space variables of interest. The proposed regions are bounding boxes and consequently can be efficiently processed on a GPU. The algorithm is tested on a challenging dataset involving people playing tennis (TennisSense) and also on the well-known HumanEva dataset. The obtained results show the effectiveness of the proposed method.

}, isbn = {978-1-4577-0529-8}, doi = {10.1109/CVPRW.2011.5981712}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=5981712}, author = {L{\'o}pez-M{\'e}ndez, A. and Alcoverro, M. and M. Pard{\`a}s and Casas, J.} } @conference {cValero11a, title = {Arbre de partition binaire: un nouvel outil pour la repr{\'e}sentation hi{\'e}rarchique et l{\textquoteright}analyse des images hyperspectrales}, booktitle = {XXIII{\`e} Colloque sur le Traitement du Signal et des Images, GRETSI 2011}, year = {2011}, pages = {120{\textendash}125}, address = {Bordeaux, France}, url = {http://hdl.handle.net/2117/14621}, author = {Valero, S. and Salembier, P. and Chanussot, J.} } @conference {cAlonso-Gonzalez11a, title = {Binary partition tree as a polarimetric SAR data representation in the space-time domain}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium, IGARSS 2011}, year = {2011}, pages = {3819{\textendash}3822}, address = {Vanouver, Canada}, isbn = {978-1-4577-1005-6}, doi = {10.1109/IGARSS.2011.6050063}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6050063}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @mastersthesis {xRubiano11, title = {B{\'u}squeda Visual con Retroacci{\'o}n de Relevancia Basada en Actualizacion de Pesos}, year = {2011}, abstract = {

This project presents the design and implementation of different Relevance Feedback techniques for image search. These techniques use the user's interaction with the search results to estimate the importance of the various search criteria in the query submitted by the user. The searches are performed through a linear combination of similarity measures of different MPEG-7 visual descriptors. The results of this project have been compared with those previously obtained in the Final Degree Project by Carles Ventura; for this reason, the system has been evaluated with the same reference database, the MPEG-7 Common Color Dataset (CCD).
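
One common weight-update rule for this kind of relevance feedback (an inverse-spread heuristic in the spirit of MARS-style systems; not necessarily the exact rule implemented in this project) can be sketched as:

```python
import numpy as np

def update_weights(distances, relevant_idx, eps=1e-6):
    """distances: (n_images, n_descriptors) per-descriptor distances to the
    query. A descriptor that keeps the user-marked relevant images tightly
    clustered earns a larger weight (inverse-spread heuristic)."""
    spread = distances[relevant_idx].std(axis=0) + eps
    w = 1.0 / spread
    return w / w.sum()

def combined_distance(distances, weights):
    """Linear combination of the per-descriptor similarity distances."""
    return distances @ weights
```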

}, url = {http://hdl.handle.net/2099.1/11792}, author = {Rubiano, Aida}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cSalvador11, title = {A compact 3D representation for multi-view video}, booktitle = {2011 International Conference on 3D Imaging}, year = {2011}, pages = {1{\textendash}8}, abstract = {

This paper presents a methodology for obtaining a 3D reconstruction of a dynamic scene in multi-camera settings. Our target is to derive a compact representation of the 3D scene which is effective and accurate, whatever the number of cameras and even for very-wide baseline settings. Easing real-time 3D scene capture has outstanding applications in 2D and 3D content production, free viewpoint video of natural scenes and interactive video applications.

The method proposed here has several original contributions on how to accelerate the process: it exploits spatial and temporal consistency for speeding up reconstruction, dividing the problem into two parts. First, 3D surfaces are efficiently sampled to obtain a silhouette-consistent set of colored surface points and normals, using a novel algorithm presented in this paper. Then, a fast, greedy meshing algorithm retrieves topologically correct continuous surfaces from the dense sets of oriented points, providing a suitable representation for multi-view video.

Compared to other techniques in the literature, the presented approach is capable of retrieving 3D surfaces of foreground objects in real-time by exploiting the computing capabilities of GPUs. This is feasible due to the parallelized design of the surface sampling algorithm. The reconstructed surfaces can effectively be used for interactive representations.

The presented methodology also offers good scalability to large multi-view video settings.

}, isbn = {978-1-61284-349-0}, doi = {10.1109/IC3D.2011.6584371}, url = {http://www.3dstereomedia.eu}, author = {Salvador, J. and Casas, J.} } @conference {cAlcoverro11a, title = {Connected Operators on 3D data for human body analysis}, booktitle = {2011 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops}, year = {2011}, pages = {9{\textendash}14}, abstract = {

This paper presents a novel method for filtering and extraction of human body features from 3D data, either from multi-view images or range sensors. The proposed algorithm processes the geodesic distances on a 3D surface representing the human body in order to find prominent maxima that correspond to salient points of the body. We introduce a 3D surface graph representation and filtering strategies to enhance robustness to the noise and artifacts present in this kind of data. We conduct several experiments on different datasets involving 2 multi-view setups and 2 range data sensors: Kinect and Mesa SR4000. In all of them, the proposed algorithm shows promising performance for human body analysis with 3D data.
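
A minimal sketch of the core computation described here, assuming the 3D surface is already available as a graph with edge lengths (Dijkstra geodesic distances followed by a local-maximum test; names are illustrative, not the paper's code):

```python
import heapq

def geodesic_distances(adj, source):
    """Dijkstra on a surface graph; adj[v] = [(neighbor, edge_length), ...]."""
    dist = {source: 0.0}
    heap = [(0.0, source)]
    while heap:
        d, v = heapq.heappop(heap)
        if d > dist.get(v, float("inf")):
            continue
        for w, length in adj[v]:
            nd = d + length
            if nd < dist.get(w, float("inf")):
                dist[w] = nd
                heapq.heappush(heap, (nd, w))
    return dist

def prominent_maxima(adj, dist):
    """Vertices geodesically farther from the source than all their
    neighbors: candidate extremities (head, hands, feet) of the surface."""
    return [v for v in dist
            if all(dist[v] >= dist.get(w, 0.0) for w, _ in adj[v])]
```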

}, isbn = {978-1-4577-0529-8}, doi = {10.1109/CVPRW.2011.5981772}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5981772}, author = {Alcoverro, M. and L{\'o}pez-M{\'e}ndez, A. and M. Pard{\`a}s and Casas, J.} } @article {aTenorio11, title = {Correlation between a semiautomated method based on ultrasound texture analysis and standard ultrasound diagnosis using white matter damage in preterm neonates as a model}, journal = {Journal of ultrasound in medicine}, volume = {30}, number = {10}, year = {2011}, pages = {1365{\textendash}1377}, issn = {0278-4297}, url = {http://www.ncbi.nlm.nih.gov/pubmed/21968487}, author = {Tenorio, V. and Bonet, E. and Botet, F. and Marqu{\'e}s, F. and Amat, I. and Gratacos, E.} } @conference {cGiro-i-Nieto11, title = {Diversity ranking for video retrieval from a broadcaster archive}, booktitle = {1st ACM International Conference on Multimedia Retrieval (ICMR {\textquoteright}11)}, year = {2011}, pages = {1{\textendash}8}, abstract = {

Video retrieval through text queries is a very common practice in broadcaster archives. The query keywords are compared to the metadata labels that documentalists have previously associated to the video assets. This paper focuses on a ranking strategy to obtain more relevant keyframes among the top hits of the ranked result lists while, at the same time, keeping a diversity of video assets. Previous solutions based on a random walk over a visual similarity graph have been modified to increase the asset diversity by filtering the edges between keyframes depending on their asset. The random walk algorithm is applied separately for every visual feature to avoid any normalization issue between visual similarity metrics. Finally, this work evaluates performance with two separate metrics: the relevance is measured by the Average Precision and the diversity is assessed by the Average Diversity, a new metric presented in this work.
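
A hedged sketch of the modified random walk (power iteration over a similarity graph whose intra-asset edges have been filtered out; the damping constant and all names are illustrative, and in the paper this is run once per visual feature):

```python
import numpy as np

def diversity_random_walk(sim, asset_ids, damping=0.85, n_iters=50):
    """Rank keyframes by a random walk over a visual-similarity graph.
    sim: (n, n) similarity matrix; asset_ids: integer asset id per keyframe."""
    W = sim.copy()
    W[np.equal.outer(asset_ids, asset_ids)] = 0.0   # keep inter-asset edges only
    row_sums = W.sum(axis=1, keepdims=True)
    P = np.divide(W, row_sums, out=np.zeros_like(W), where=row_sums > 0)
    r = np.full(len(W), 1.0 / len(W))
    for _ in range(n_iters):                        # PageRank-style iteration
        r = (1 - damping) / len(W) + damping * (P.T @ r)
    return np.argsort(-r)                           # most relevant first
```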

}, isbn = {978-1-4503-0336-1}, doi = {10.1145/1991996.1992052}, url = {http://dx.doi.org/10.1145/1991996.1992052}, author = {Xavier Gir{\'o}-i-Nieto and Alfaro, M. and Marqu{\'e}s, F.} } @article {aTello11, title = {Edge enhancement algorithm based on the wavelet transform for automatic edge detection in SAR images}, journal = {IEEE transactions on geoscience and remote sensing}, volume = {49}, number = {1}, year = {2011}, pages = {222{\textendash}235}, issn = {0196-2892}, doi = {10.1109/TGRS.2010.2052814}, url = {http://hdl.handle.net/2117/11057}, author = {Tello, M. and L{\'o}pez-Mart{\'\i}nez, C. and Mallorqui, J.J. and Salembier, P.} } @conference {cBosio11, title = {Feature set enhancement via hierarchical clustering for microarray classification}, booktitle = {IEEE International Workshop on Genomic Signal Processing and Statistics, GENSIPS 2011}, year = {2011}, month = {dec.}, pages = {226 -229}, abstract = {

A new method for gene expression classification is proposed in this paper. In a first step, the original feature set is enriched by including new features, called metagenes, produced via hierarchical clustering. In a second step, a reliable classifier is built from a wrapper feature selection process. The selection relies on two criteria: the classical classification error rate and a new reliability measure. As a result, a classifier is obtained with good predictive ability that uses as few features as possible to reduce the risk of overfitting. This method has been tested on three public cancer datasets: leukemia, lymphoma and colon. The proposed method has obtained promising classification results, and the experiments have confirmed the utility of both the metagenes and the feature ranking criterion to improve the final classifier.

}, keywords = {cancer, classical classification error rate, classifier, colon, feature set enhancement, gene expression classification, hierarchical clustering, leukemia, lymphoma, medical computing, metagenes, microarray classification, overfitting risk reduction, pattern classification, pattern clustering, public cancer datasets, reliability measure, wrapper feature selection process}, doi = {10.1109/GENSiPS.2011.6169486}, author = {Bosio, M. and P. Bellot and Salembier, P. and Albert Oliveras} } @conference {cSchreer11, title = {Format-agnostic approach for production, delivery and rendering of immersive media}, booktitle = {Networked and Electronic Media Summit}, year = {2011}, pages = {{\textendash}}, isbn = {SN}, url = {http://nem-summit.eu/program/}, author = {Schreer, O. and Thomas, G. and Niamut, O. and Macq, J. and Kochale, A. and Batke, J. and Ruiz-Hidalgo, J. and Oldfield, R. and Shirley, B. and Thallinger, G.} } @conference {cBenediktsson11, title = {Hierarchical analysis of remote sensing data: morphological attribute profiles and binary partition trees}, booktitle = {International Symposium on Mathematical Morphology 2011}, year = {2011}, pages = {306{\textendash}319}, address = {Intra, Lake Maggiore, Italy}, isbn = {3642215688}, doi = {10.1007/978-3-642-21569-8_27}, url = {http://hdl.handle.net/2117/14624}, author = {Benediktsson, J. and Bruzzone, L. and Chanussot, J. and Dalla Mura, M. and Salembier, P. and Valero, S.} } @article {aCanton-Ferrer11, title = {Human motion capture using scalable body models}, journal = {Computer vision and image understanding}, volume = {115}, number = {10}, year = {2011}, pages = {1363{\textendash}1374}, abstract = {

This paper presents a general analysis framework towards exploiting the underlying hierarchical and scalable structure of an articulated object for pose estimation and tracking. Scalable human body models are introduced as an ordered set of articulated models fulfilling an inclusive hierarchy. The concept of annealing is applied to derive a generic particle filtering scheme able to perform a sequential filtering over the set of models contained in the scalable human body model. Two annealing loops are employed, the standard likelihood annealing and the newly introduced structural annealing, leading to a robust, progressive and efficient analysis of the input data. The validity of this scheme is tested by performing markerless human motion capture in a multi-camera environment employing the standard HumanEva annotated datasets. Finally, quantitative results are presented and compared with other existing HMC techniques.

}, issn = {1077-3142}, doi = {10.1016/j.cviu.2011.06.001}, url = {http://hdl.handle.net/2117/13393}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @phdthesis {dValero11, title = {Hyperspectral image representation and Processing with Binary Partition Trees}, year = {2011}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

The optimal exploitation of the information provided by hyperspectral images requires the development of advanced image processing tools. Therefore, this PhD thesis proposes the construction and the processing of a new region-based hierarchical hyperspectral image representation: the Binary Partition Tree (BPT). This representation can be interpreted as a set of hierarchical regions stored in a tree structure. Based on region-merging techniques, the construction of the BPT is investigated in this work by studying hyperspectral region models and the associated similarity metrics. Once the BPT is constructed, the fixed tree structure allows implementing efficient and advanced application-dependent techniques on it. The application-dependent processing of the BPT is generally implemented through a specific pruning of the tree. Accordingly, some pruning techniques are proposed and discussed according to different applications. This Ph.D. thesis focuses in particular on segmentation, object detection and classification of hyperspectral imagery. Experimental results on various hyperspectral data sets demonstrate the interest and the good performance of the BPT representation.

}, url = {http://www.gipsa-lab.grenoble-inp.fr/formation/theses-soutenues.php?id_these=572}, author = {Valero, S.}, editor = {Salembier, P.} } @conference {cValero11b, title = {Hyperspectral image segmentation using binary partition trees}, booktitle = {IEEE International Conference on Image Processing, ICIP 2011}, year = {2011}, pages = {1273{\textendash}1276}, address = {Brussels, Belgium}, isbn = {0-7803-5470-2}, doi = {10.1109/ICIP.2011.6115666}, url = {http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=6094293}, author = {Valero, S. and Salembier, P. and Chanussot, J.} } @conference {cBragos11a, title = {Implementation and first results of the Introduction to Engineering course in the ETSETB-UPC new degrees}, booktitle = {II Conferencia Internacional en Fomento e Innovaci{\'o}n con Nuevas Tecnolog{\'\i}as en la Docencia de la Ingenier{\'\i}a}, year = {2011}, pages = {1{\textendash}4}, isbn = {978-1-4577-0559-5}, doi = {10.1109/FINTDI.2011.5945971}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5945971}, author = {Bragos, R. and Pegueroles, J. and Alarc{\'o}n, E. and Camps, A. and Sarda, J. and Consolacion, C. and Mussons, J. and Pons, O. and Albert Oliveras and Garc{\'\i}a, M. and Onrubia, R. and Elisa Sayrol} } @conference {cValero11, title = {Improved binary partition tree construction for hyperspectral images: application to object detection}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium, IGARSS 2011}, year = {2011}, pages = {2515{\textendash}2518}, address = {Vancouver, Canada}, doi = {10.1109/IGARSS.2011.6049723}, url = {http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=6034618}, author = {Valero, S. and Salembier, P. and Chanussot, J. and Cuadres, C.} } @mastersthesis {xTella11, title = {Interactive Image Processing demonstrations for the web}, year = {2011}, abstract = {

This diploma thesis aims to provide a framework for developing web applications for ImagePlus, the C++ software development platform of the Image Processing Group of the Technical University of Catalonia (UPC). These web applications are to demonstrate the functionality of the image processing algorithms to any visitor to the group website. Developers also benefit from this framework because they can easily create Graphical User Interfaces (GUIs) for the processing algorithms.

}, keywords = {c++, demo, image processing, web}, url = {http://hdl.handle.net/2099.1/13540}, author = {Tella, Marcel}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cGallego11, title = {Joint multi-view foreground segmentation and 3D reconstruction with tolerance loop}, booktitle = {IEEE International Conference on Image Processing}, year = {2011}, pages = {997{\textendash}1000}, abstract = {

In this paper we present a novel foreground segmentation and 3D reconstruction system for multi-view scenarios. The system achieves correct 3D object reconstruction even when foreground segmentation presents critical misses in some of the views. We introduce the spatial redundancy of the multi-view data into the foreground segmentation process by combining segmentation and 3D reconstruction in a two-step workflow. First, the segmentation of the objects in each view uses a monocular, region-based foreground segmentation in a MAP-MRF framework for foreground, background and shadow classes. Next, we compute an iterative volume reconstruction in a 3D tolerance loop, obtaining an iteratively enhanced SfS volume. Foreground segmentation is improved by updating the foreground model of each view at each iteration. The results presented in this paper show the improved foreground segmentation and the reduction of errors in the reconstruction of the volume.

}, isbn = {978-1-4577-1304-0}, doi = {10.1109/ICIP.2011.6116731}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=6116731}, author = {Gallego, J. and Salvador, J. and Casas, J. and M. Pard{\`a}s} } @conference {cAlarcon11, title = {Learning to conceive, design, implement and operate circuits and systems}, booktitle = {2011 IEEE International Symposium on Circuits and Systems}, year = {2011}, pages = {1183{\textendash}1186}, isbn = {978-1-4244-9472-9}, doi = {10.1109/ISCAS.2011.5937780}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5937780\&tag=1}, author = {Alarc{\'o}n, E. and Bragos, R. and Elisa Sayrol} } @phdthesis {dObrador11, title = {Media Aesthetics Based Multimedia Storytelling}, year = {2011}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

Since the earliest of times, humans have been interested in recording their life experiences, for future reference and for storytelling purposes. This task of recording experiences {\textendash}i.e., both image and video capture{\textendash} has never before in history been as easy as it is today. This is creating a digital information overload that is becoming a great concern for the people that are trying to preserve their life experiences. As high-resolution digital still and video cameras become increasingly pervasive, unprecedented amounts of multimedia are being downloaded to personal hard drives, and also uploaded to online social networks on a daily basis. The work presented in this dissertation is a contribution in the area of multimedia organization, as well as automatic selection of media for storytelling purposes, which eases the human task of summarizing a collection of images or videos in order to be shared with other people. As opposed to some prior art in this area, we have taken an approach in which neither user generated tags nor comments {\textendash}that describe the photographs, either in their local or on-line repositories{\textendash} are taken into account, and also no user interaction with the algorithms is expected. We take an image analysis approach where both the context images {\textendash}e.g. images from online social networks to which the image stories are going to be uploaded{\textendash}, and the collection images {\textendash}i.e., the collection of images or videos that needs to be summarized into a story{\textendash}, are analyzed using image processing algorithms. This allows us to extract relevant metadata that can be used in the summarization process. Multimedia storytellers usually follow three main steps when preparing their stories: first they choose the main story characters, then the main events to describe, and finally, from these media sub-groups, they choose the media based on their relevance to the story as well as on their aesthetic value. Therefore, one of the main contributions of our work has been the design of computational models {\textendash}both regression based, as well as classification based{\textendash} that correlate well with human perception of the aesthetic value of images and videos. These computational aesthetics models have been integrated into automatic selection algorithms for multimedia storytelling, which are another important contribution of our work. A human centric approach has been used in all experiments where it was feasible, and also in order to assess the final summarization results, i.e., humans are always the final judges of our algorithms, either by inspecting the aesthetic quality of the media, or by inspecting the final story generated by our algorithms. We are aware that a perfect automatically generated story summary is very hard to obtain, given the many subjective factors that play a role in such a creative process; rather, the presented approach should be seen as a first step in the storytelling creative process which removes some of the ground work that would be tedious and time consuming for the user. Overall, the main contributions of this work can be summarized in three: (1) new media aesthetics models for both images and videos that correlate with human perception, (2) new scalable multimedia collection structures that ease the process of media summarization, and finally, (3) new media selection algorithms that are optimized for multimedia storytelling purposes.

}, url = {http://hdl.handle.net/10803/33293}, author = {Obrador, P.}, editor = {Casas, J.} } @article {dPalou11, title = {Monocular Depth Ordering Using Occlusion Cues}, year = {2011}, institution = {Technical University of Catalonia}, type = {masters}, address = {Barcelona}, author = {Palou, G. and Salembier, P.} } @article {aCanton-Ferrer11a, title = {Multi-camera multi-object voxel-based Monte Carlo 3D tracking strategies}, journal = {Eurasip journal on advances in signal processing}, volume = {2011}, number = {114}, year = {2011}, pages = {1{\textendash}15}, abstract = {

This article presents a new approach to the problem of simultaneous tracking of several people in low-resolution sequences from multiple calibrated cameras. Redundancy among cameras is exploited to generate a discrete 3D colored representation of the scene, which is the starting point of the processing chain. We review how the initiation and termination of tracks influences the overall tracker performance, and present a Bayesian approach to efficiently create and destroy tracks. Two Monte Carlo-based schemes adapted to the incoming 3D discrete data are introduced. First, a particle filtering technique is proposed relying on a volume likelihood function taking into account both occupancy and color information. Sparse sampling is presented as an alternative based on a sampling of the surface voxels in order to estimate the centroid of the tracked people. In this case, the likelihood function is based on local neighborhood computations, thus dramatically decreasing the computational load of the algorithm. A discrete 3D re-sampling procedure is introduced to drive these samples along time. Multiple targets are tracked by means of multiple filters, and interaction among them is modeled through a 3D blocking scheme. Tests over the CLEAR-annotated database yield quantitative results showing the effectiveness of the proposed algorithms in indoor scenarios, and a fair comparison with other state-of-the-art algorithms is presented. We also consider the real-time performance of the proposed algorithm.

}, issn = {1687-6172}, doi = {10.1186/1687-6180-2011-114}, url = {http://asp.eurasipjournals.com/content/2011/1/114}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s and Monte, E.} } @inbook {bDigne11, title = {Neighborhood Filters and the Recovery of 3D Information}, booktitle = {Handbook of Mathematical Methods in Imaging}, year = {2011}, pages = {1203-1229}, publisher = {Springer Verlag}, organization = {Springer Verlag}, isbn = {978-0-387-92919-4}, author = {J. Digne and Dimiccoli, M. and Salembier, P. and N. Sabater} } @conference {cPalou11, title = {Occlusion-based depth ordering on monocular images with binary partition tree}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2011}, year = {2011}, pages = {1093{\textendash}1096}, address = {Prague, Czech Republic}, doi = {0.1109/ICASSP.2011.5946598}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5946598\&tag=1}, author = {Palou, G. and Salembier, P.} } @conference {cAlonso-Gonzalez11, title = {PolSAR speckle filtering and segmentation based on binary partition tree representation}, booktitle = {5th International Workshop on Science and Applications of SAR Polarimetry and Polarimetric Interferometry, PolInSAR 2011}, year = {2011}, pages = {1{\textendash}19}, address = {Frascati (Rome), Italy}, url = {http://cataleg.upc.edu/record=b1233548~S1*cat}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @conference {cAlcoverro11, title = {A real-time body tracking system for smart rooms}, booktitle = {ICME - 2011 IEEE International Conference on Multimedia and Expo}, year = {2011}, pages = {1{\textendash}6}, abstract = {

We present a real-time human body tracking system for a single user in a Smart Room scenario. In this paper we propose a novel system that involves a silhouette-based cost function using variable windows, a hierarchical optimization method, parallel implementations of pixel-based algorithms and efficient usage of a low-cost hardware structure. Results in a Smart Room setup are presented.

}, isbn = {978-1-61284-349-0}, doi = {10.1109/ICME.2011.6011847}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=6011847}, author = {Alcoverro, M. and L{\'o}pez-M{\'e}ndez, A. and Casas, J. and M. Pard{\`a}s} } @conference {cSuau11, title = {Real-time head and hand tracking based on 2.5D data}, booktitle = {ICME - 2011 IEEE International Conference on Multimedia and Expo}, year = {2011}, pages = {1{\textendash}6}, abstract = {

A novel real-time algorithm for head and hand tracking is proposed in this paper. This approach is based on 2.5D data from a range camera, which is exploited to resolve ambiguities and overlaps. Experimental results show high robustness against partial occlusions and fast movements. The estimated positions are fairly stable, allowing the extraction of accurate trajectories which may be used for gesture classification purposes.

}, isbn = {975001880X}, doi = {10.1109/ICME.2011.6011869}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6011869\&tag=1}, author = {Suau, X. and Casas, J. and Ruiz-Hidalgo, J.} } @conference {cLopez-Mendez11a, title = {Real-time upper body tracking with online initialization using a range sensor}, booktitle = {2011 IEEE International Conference on Computer VIsion Workshops (ICCV Workshops)}, year = {2011}, pages = {391{\textendash}398}, abstract = {

We present a novel method for upper body pose estimation with online initialization of pose and the anthropometric profile. Our method is based on a Hierarchical Particle Filter that defines its likelihood function with a single view depth map provided by a range sensor. We use Connected Operators on range data to detect hand and head candidates that are used to enrich the Particle Filter{\textquoteright}s proposal distribution, but also to perform an automated initialization of the pose and the anthropometric profile estimation. A GPU based implementation of the likelihood evaluation yields real-time performance. Experimental validation of the proposed algorithm and the real-time implementation are provided, as well as a comparison with the recently released OpenNI tracker for the Kinect sensor.

}, isbn = {978-1-4673-0063-6/11}, doi = {10.1109/ICCVW.2011.6130268}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6130268}, author = {L{\'o}pez-M{\'e}ndez, A. and Alcoverro, M. and M. Pard{\`a}s and Casas, J.} } @mastersthesis {xAlfaro11 , title = {Reordenaci{\'o} i agrupament d{\textquoteright}imatges d{\textquoteright}una cerca de v{\'\i}deo}, year = {2011}, month = {01/2011}, abstract = {

Video retrieval through textual queries is a very common practice in broadcast archives. The query keywords are compared against the metadata that archivists manually annotate on the video assets. Moreover, basic textual searches generate flat result lists in which every result carries the same importance, since the search is limited to a binary evaluation of whether the query word appears among the metadata associated with the content. Such searches also tend to return very similar content, giving the user an ordered result list with little visual diversity. The redundancy in the results wastes space in the graphical user interface (GUI) and often forces the user to interact heavily with the interface until the results relevant to the query are located. The contribution of this project is a reranking and clustering strategy that places more relevant keyframes among the top results while, at the same time, maintaining a diversity of assets. These techniques thus improve the visualization of the images returned by a video search. The overall tool is designed to be integrated into the Digition environment, the audiovisual content manager of the Corporaci{\'o} Catalana de Mitjans Audiovisuals.




}, url = {http://hdl.handle.net/2099.1/11106}, author = {Alfaro, M.}, editor = {Vives, X. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {x11, title = {Rich Internet Application for the Semi-Automatic Annotation of Semantic Shots on Keyframes}, year = {2011}, abstract = {

This thesis describes the graphical user interface developed for semi-automatic keyframe-based semantic shot annotation and the semantic shot classifiers built. The graphical user interface aims to optimize the current indexation process by replacing manual annotation with automatic annotation and validation. The system is based on supervised learning binary classifiers and web services. The graphical user interface provides the necessary tools to fix and validate the automatic detections and to learn from the user feedback in order to retrain the system and improve it. Results of the classifiers evaluation, performed using cross-validation methods, show a good performance in terms of precision and recall. The graphical user interface has been described as complete and easy to use by a professional documentalist at a broadcast company.

}, url = {http://hdl.handle.net/2099.1/13539}, author = {Carcel, Elisabet}, editor = {Xavier Gir{\'o}-i-Nieto and Vives, X.} } @phdthesis {dSalvador11, title = {Surface Reconstruction for Multi-View Video}, year = {2011}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

This thesis introduces a methodology for obtaining an alternative representation of video sequences captured by calibrated multi-camera systems in controlled environments with known scene background. This representation consists in a 3D description of the surfaces of foreground objects, which allows for the recovering of part of the 3D information of the original scene lost in the projection process in each camera.

The choice of the type of representation and the design of the reconstruction techniques are driven by three requirements that appear in smart rooms or recording studios. In these scenarios, video sequences captured by a multi-camera rig are used both for analysis applications and interactive visualization methods. The requirements are: the reconstruction method must be fast in order to be usable in interactive applications, the surface representation must provide a compression of the multi-view data redundancies and this representation must also provide all the relevant information to be used for analysis applications as well as for free-viewpoint video.

Once foreground and background are segregated for each view, the reconstruction process is divided in two stages. The first one obtains a sampling of the foreground surfaces (including orientation and texture), whereas the second provides closed, continuous surfaces from the samples, through interpolation.

The sampling process is interpreted as a search for 3D positions that result in feature matchings between different views. This search process can be driven by different mechanisms: an image-based approach, another one based on the deformation of a surface from frame to frame or a statistical sampling approach where samples are searched around the positions of other detected samples, which is the fastest and easiest to parallelize of the three approaches.

A meshing algorithm is also presented, which allows for the interpolation of surfaces between samples. Starting from an initial triangle connecting three coherently oriented points, the surface is iteratively expanded over the complete set of samples. The proposed method produces a very accurate reconstruction and results in a correct topology. Furthermore, it is fast enough to be used interactively.

The presented methodology for surface reconstruction permits obtaining a fast, compressed and complete representation of foreground elements in multi-view video, as reflected by the experimental results.

}, url = {http://hdl.handle.net/10803/108907}, author = {Salvador, J.}, editor = {Casas, J.} } @mastersthesis {xVentura11, title = {Tools for Image Retrieval in Large Multimedia Databases}, year = {2011}, abstract = {

One of the challenges in the development of an image retrieval system is to achieve an efficient indexing scheme since both developers and users, who are used to make requests in order to find a multimedia element in a large database, can be frustrated due to the long computational time of the search.

Traditional indexing schemes neither fulfil the dynamic indexing requirement, which allows elements to be added to or removed from the structure, nor fit well in high-dimensional feature spaces due to the phenomenon known as {\textquotedblleft}the curse of dimensionality{\textquotedblright}.

After analysing several indexing techniques from the literature, we have decided to implement an indexing scheme called Hierarchical Cellular Tree (HCT), which was designed to bring an effective solution especially for indexing large multimedia databases. The HCT has allowed us to improve the performance of our image retrieval system based on the MPEG-7 visual descriptors. We have also contributed some modifications to the original HCT which have improved its performance. Thus, we have proposed a redefinition of the covering radius which considers not only the elements belonging to the cell, but also all the elements hanging from that cell. Since this consideration implies a much more computationally costly algorithm, we have proposed an approximation by excess for the covering radius value. However, we have also implemented a method which allows the covering radius to be updated to its actual value whenever desired. In addition, the pre-emptive insertion method has been adapted as a searching technique in order to improve the performance of the retrieval scheme called Progressive Query, which was originally proposed to be used over the HCT.
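
The approximation by excess mentioned above can be pictured with the triangle inequality: the distance from a cell's nucleus to any element hanging below it is bounded by the distance to a child's nucleus plus that child's own bound. A hedged sketch (the `cell` fields and the `dist` metric are hypothetical, not the thesis API):

```python
def covering_radius_bound(cell, dist):
    """Upper bound (by excess) on the covering radius of `cell` over all
    elements hanging from it, via the triangle inequality through the
    nuclei of its child cells. Hypothetical fields: nucleus, items, children."""
    r = max((dist(cell.nucleus, x) for x in cell.items), default=0.0)
    for child in cell.children:
        r = max(r, dist(cell.nucleus, child.nucleus)
                   + covering_radius_bound(child, dist))
    return r
```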

Furthermore, the HCT indexing scheme has also been adapted to a server/client architecture by using a messaging system called KSC, which allows the HCT to be kept loaded on a server waiting for the query requests launched by the several clients of the retrieval system. In addition, the tool used to request a search over the indexed database has been given a graphical user interface, named GOS (Graphic Object Searcher), which allows the user to request retrievals in a friendlier way.

}, url = {http://hdl.handle.net/2099.1/13011}, author = {Ventura, C.}, editor = {Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @conference {cPardas11, title = {Work in progress - Cooperative and competitive projects for engaging students in advanced ICT subjects}, booktitle = {41st Annual Frontiers in Education Conference}, year = {2011}, pages = {1{\textendash}3}, isbn = {3-540-76258-2}, doi = {10.1109/FIE.2011.6143032}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6143032\&tag=1}, author = {M. Pard{\`a}s and Bonafonte, A.} } @conference {cHaro10, title = {3D shape from multi-camera views by error projection minimization}, booktitle = {10th Workshop on Image Analysis for Multimedia Interactive Services}, year = {2010}, pages = {250{\textendash}253}, isbn = {978-1-4244-3609-5}, doi = {10.1109/WIAMIS.2009.5031480}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=5031480}, author = {Haro, G. and M. Pard{\`a}s} } @conference {cGiro-i-Nieto10a, title = {BitSearch, the blog before the thesis}, booktitle = {VI International Congress of University Teaching and Innovation}, year = {2010}, address = {Barcelona, Catalonia}, abstract = {

This paper presents BitSearch, a web blog written by a team of thesis students where they share the evolution of their work. The blog is aimed at improving the communication not only between the advisor and the students, but also at motivating the students through the public exposure of the research and development. Basic writing guidelines are provided by the professor in order to guarantee the quality of the posts and provide good metadata for their retrieval, both by author or by a generic text query from a search engine. After one year online, BitSearch has published 176 posts written by 19 students, in addition to the professor{\textquoteright}s own contributions. Statistics show more than 7,000 accesses to its pages from visitors from more than 100 countries all over the globe. The blog tool has helped in improving the guidance of the students{\textquoteright} activity, developing a sense of team work among authors, as well as a more progressive preparation of the final dissertation.

}, keywords = {blog, ~online~learning, ~web-based~tools}, isbn = {978-84-8458-324-0}, url = {http://hdl.handle.net/2117/11371}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cValero10a, title = {Comparison of merging orders and pruning strategies for binary partition tree in hyperspectral data}, booktitle = {IEEE International Conference on Image Processing, ICIP 2010}, year = {2010}, pages = {2565{\textendash}2568}, address = {Hong Kong, China}, doi = {10.1109/ICIP.2010.5652595}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=5652595}, author = {Valero, S. and Salembier, P. and Chanussot, J.} } @inbook {bMarcotegui10a, title = {Compression}, booktitle = {Morphologie Math{\'e}matique II: estimation choix et mise en oeuvre}, year = {2010}, publisher = {Hermes, Lavoisier}, organization = {Hermes, Lavoisier}, edition = {L. Najman and H. Talbot (Eds.)}, isbn = {978-2-7462-2593-0}, url = {http://cataleg.upc.edu/search~S1*cat/X?estimation\%2C+choix\&SORT=D\&searchscope=1}, author = {Marcotegui, B. and Salembier, P.} } @inbook {bMarcotegui10, title = {Compression}, booktitle = {Mathematical morphology from theories to applications}, year = {2010}, pages = {385{\textendash}391}, publisher = {Wiley}, organization = {Wiley}, edition = {L. Najman and H. Talbot (Eds.)}, chapter = {16}, isbn = {978-1848212152}, url = {http://cataleg.upc.edu/record=b1376601~S1*cat}, author = {Marcotegui, B. and Salembier, P.} } @inbook {bWaibel10, title = {Computers in the Human Interaction Loop}, booktitle = {Handbook on Ambient Intelligence and Smart Environments (AISE)}, year = {2010}, pages = {1071{\textendash}1116}, publisher = {Springer}, organization = {Springer}, address = {Boston, MA}, abstract = {

It is a common experience in our modern world, for us humans to be overwhelmed by the complexities of technological artifacts around us, and by the attention they demand. While technology provides wonderful support and helpful assistance, it also causes an increased preoccupation with technology itself and a related fragmentation of attention. But as humans, we would rather attend to a meaningful dialog and interaction with other humans, than to control the operations of machines that serve us. The cause for such complexity and distraction, however, is a natural consequence of the flexibility and choice of functions and features that technology has to offer. Thus flexibility of choice and the availability of desirable functions are in conflict with ease of use and our very ability to enjoy their benefits.

}, isbn = {978-0-387-93807-3}, doi = {10.1007/978-0-387-93808-0_40}, author = {Waibel, A. and Stiefelhagen, R. and Carlson, R. and Casas, J. and Kleindienst, J. and Lamel, L. and Lanz, O. and Mostefa, D. and Omologo, M. and Pianesi, F. and Polymenakos, L. and Potamianos, G. and Soldatos, J. and Sutschet, G. and Terken, J.} } @inbook {bSalembier10, title = {Connected operators based on tree pruning strategies}, booktitle = {Mathematical morphology from theories to applications}, year = {2010}, publisher = {Wiley}, organization = {Wiley}, edition = {L. Najman and H. Talbot (Eds.)}, chapter = {7}, isbn = {978-1848212152}, url = {http://cataleg.upc.edu/record=b1376601~S1*cat}, author = {Salembier, P.} } @conference {cPont-Tuset10, title = {Contour detection using binary partition trees}, booktitle = {IEEE International Conference on Image Processing}, year = {2010}, pages = {1609{\textendash}1612}, isbn = {?}, doi = {10.1109/ICIP.2010.5652339}, url = {http://ieeexplore.ieee.org/search/srchabstract.jsp?tp=\&arnumber=5652339\&queryText\%3DContour+detection+using+binary+partition+trees\%26openedRefinements\%3D*\%26searchField\%3DSearch+All}, author = {Jordi Pont-Tuset and Marqu{\'e}s, F.} } @conference {cGiro-i-Nieto10, title = {Digimatge, a Rich Internet Application for video retrieval from a Multimedia Asset Management system}, booktitle = {11th ACM SIGMM International Conference on Multimedia Information Retrieval}, year = {2010}, abstract = {

This paper describes the integration of two new services aimed at assisting in the retrieval of video content from an existing Multimedia Asset Manager (MAM) of a TV broadcaster archive. The first tool suggests tags after an initial textual query, and the second ranks the keyframes of the retrieved assets according to their visual similarity. Both applications were integrated as web services that are accessed from a Rich Internet Application via REST calls.

}, isbn = {978-1-60558-815-5}, doi = {10.1145/1743384.1743458}, url = {http://upcommons.upc.edu/e-prints/urlFiles?idDrac=2249267}, author = {Xavier Gir{\'o}-i-Nieto and Salla, R. and Vives, X.} } @conference {cGallego10, title = {Enhanced bayesian foreground segmentation using brightness and color distortion region-based model for shadow removal}, booktitle = {2010 IEEE International Conference on Image Processing}, year = {2010}, pages = {3449{\textendash}3452}, isbn = {1522-4880}, doi = {10.1109/ICIP.2010.5653897}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5653897\&tag=1}, author = {Gallego, J. and M. Pard{\`a}s} } @mastersthesis {xmunoz-trallero10, title = {Extensi{\'o} d{\textquoteright}una interf{\'\i}cie de cerca d{\textquoteright}imatges a les consultes amb regions}, year = {2010}, abstract = {

This is a university-industry project that presents a multimodal search interface built by integrating several existing tools into a single global tool, one that combines textual searches with query-by-example, where the query may be based on an image or on a region of that image. The global tool is integrated into the environment of Digition, the audiovisual content manager of the Corporaci{\'o} Catalana de Mitjans Audiovisuals. The main contribution of this project is the creation of a region-based image search interface and its integration into the global tool.


}, url = {http://hdl.handle.net/2099.1/13525}, author = {Pia Mu{\~n}oz-Trallero}, editor = {Xavier Gir{\'o}-i-Nieto and Vives, X.} } @conference {cAlonso-Gonzalez10, title = {Filtering and segmentation of polarimetric SAR images with binary partition trees}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium, IGARSS 2010}, year = {2010}, pages = {4043{\textendash}4046}, address = {Honolulu, USA}, isbn = {978-1-4244-9564-1}, doi = {10.1109/IGARSS.2010.5653466}, url = {http://cataleg.upc.edu/record=b1167223~S1*cat}, author = {Alonso-Gonz{\'a}lez, A. and L{\'o}pez-Mart{\'\i}nez, C. and Salembier, P.} } @conference {cSalvador10b, title = {From silhouettes to 3D points to mesh: towards free viewpoint video}, booktitle = {ACM Workshop on 3D Video Processing (3DVP)}, year = {2010}, pages = {19{\textendash}24}, isbn = {1522-4880}, doi = {10.1145/1877791.1877797}, url = {http://delivery.acm.org/10.1145/1880000/1877797/p19-salvador.pdf?ip=147.83.95.37\&CFID=24773176\&CFTOKEN=26981598\&__acm__=1306838718_5a07df5f539bac704e3bc98a25b9a68c}, author = {Salvador, J. and Suau, X. and Casas, J.} } @article {aGiro-i-Nieto10, title = {GAT, a Graphical Annotation Tool for semantic regions}, journal = {Multimedia tools and applications}, volume = {46}, number = {2/3 (2010)}, year = {2010}, pages = {155{\textendash}174}, abstract = {

This article presents GAT, a Graphical Annotation Tool based on a region-based hierarchical representation of images. The proposed solution uses Partition Trees to navigate through the image segments, which are automatically defined at different spatial scales. Moreover, the system focuses on the navigation through ontologies for a semantic annotation of objects and of the parts that compose them. The tool has been designed under usability criteria to minimize user interaction by trying to predict the future selection of regions and semantic classes. The implementation uses MPEG-7/XML input and output data to allow interoperability with any type of Partition Tree. This tool is publicly available and its source code can be downloaded under a free software license.

}, issn = {1380-7501}, doi = {10.1007/s11042-009-0389-2}, url = {http://www.springerlink.com/content/j78782k762617352/}, author = {Xavier Gir{\'o}-i-Nieto and Camps, N. and Marqu{\'e}s, F.} } @phdthesis {dRolon10, title = {Generalized Lifting for Sparse Image Representation and Coding}, year = {2010}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Rol{\'o}n, J.}, editor = {Salembier, P.} } @article {xCortes10, title = {GOS: b{\'u}squeda visual de im{\'a}genes}, number = {25}, year = {2010}, pages = {36{\textendash}44}, keywords = {i3media}, issn = {1698-7047}, url = {http://upcommons.upc.edu/e-prints/urlFiles?idDrac=2251008}, author = {Cort{\'e}s, S.}, editor = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @mastersthesis {xVentura10, title = {Image-Based Query by Example Using MPEG-7 Visual Descriptors}, year = {2010}, abstract = {

This project presents the design and implementation of a Content-Based Image Retrieval (CBIR) system where queries are formulated by visual examples through a graphical interface. The visual descriptors and similarity measures implemented in this work mainly follow those defined in the MPEG-7 standard although, when necessary, extensions are proposed. Despite the fact that this is an image-based system, all the proposed descriptors have been implemented for both image and region queries, allowing a future upgrade of the system to support region-based queries. In this way, even a contour shape descriptor has been developed, although it is meaningless for a whole image. The system has been assessed on different benchmark databases, namely the MPEG-7 Common Color Dataset and the Corel Dataset. The evaluation has been performed for isolated descriptors as well as for combinations of them. The strategy studied in this work to gather the information obtained from the whole set of computed descriptors is to weight the ranked list of each isolated descriptor.

}, url = {http://upcommons.upc.edu/pfc/handle/2099.1/9453}, author = {Ventura, C.}, editor = {Marqu{\'e}s, F. and Jordi Pont-Tuset} } @article {aGudmundsson10, title = {Improved 3D reconstruction in smart-room environments using ToF imaging}, journal = {Computer vision and image understanding}, volume = {114}, number = {12}, year = {2010}, month = {12/2010}, pages = {1376{\textendash}1384}, abstract = {

This paper presents a general analysis framework towards exploiting the underlying hierarchical and scalable structure of an articulated object for pose estimation and tracking. Scalable human body models are introduced as an ordered set of articulated models fulfilling an inclusive hierarchy. The concept of annealing is applied to derive a generic particle filtering scheme able to perform a sequential filtering over the set of models contained in the scalable human body model. Two annealing loops are employed, the standard likelihood annealing and the newly introduced structural annealing, leading to a robust, progressive and efficient analysis of the input data. The validity of this scheme is tested by performing markerless human motion capture in a multi-camera environment employing the standard HumanEva annotated datasets. Finally, quantitative results are presented and compared with other existing HMC techniques.

}, issn = {1077-3142}, doi = {10.1016/j.cviu.2010.07.011}, url = {http://www.sciencedirect.com/science/article/pii/S1077314210001736}, author = {Gudmundsson, S. and M. Pard{\`a}s and Casas, J. and Sveinsson, J. and Aanaes, H. and Larsen, R.} } @conference {cRolon10, title = {Improved local pdf estimation in the wavelet domain for generalized lifting}, booktitle = {Picture Coding Symposium, PCS 2010}, year = {2010}, address = {Nagoya, Japan}, isbn = {0-7803-3192-3}, url = {http://hdl.handle.net/2117/11675}, author = {Rol{\'o}n, J. and Salembier, P.} } @phdthesis {dCalderero10, title = {Information Theoretical Region Merging Approaches and Fusion of Hierarchical Image Segmentation Results}, year = {2010}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {This PhD thesis addresses the unsupervised hierarchical segmentation of images and the automatic extraction of the image partitions providing the most semantically relevant explanations of the image at different levels of analysis. From a semantic and practical perspective, image segmentation is a first and key step for image analysis and pattern recognition since region-based image representations provide a first level of abstraction and a reduction of the number of primitives, leading to a more robust estimation of parameters and descriptors. The proposal of this dissertation is based on an important class of hierarchical bottom-up segmentation approaches, known as region merging techniques. These approaches naturally provide a bottom-up hierarchy, more suitable when no a priori information about the image is available, and an excellent compromise between efficiency of computation and representation. The first part of the dissertation is devoted to the proposal, under a statistical framework, of a family of unsupervised region merging techniques. These techniques are characterized by general and non-parametric region models, with neither color nor texture homogeneity assumptions, and a set of innovative merging criteria, based on information theory statistical measures. The scale consistency of the partitions is assured through (i) a size regularization term in the merging criteria and a classical merging order, or (ii) using a novel scale-based merging order to avoid the region size homogeneity imposed by the use of a size regularization term. Moreover, a partition significance index is defined to automatically determine the subset of most representative partitions from the created hierarchy. The most significant automatically extracted partitions show the ability to represent the semantic content of the image. Results are promising, outperforming in most indicators both object-oriented and texture state-of-the-art segmentation techniques. The second part of the thesis is focused on the fusion of hierarchical segmentation results, obtained by different segmentation techniques or from different information channels of the same image, with the purpose of increasing the robustness and accuracy of the final solution. In this case, a generic and scalable segmentation scheme based on a cooperative principle, named cooperative region merging, is designed to combine in an unsupervised manner a set of hierarchical region-based representations.
The intuition behind this cooperative approach is to iteratively establish a basic or conservative consensus between the independent techniques that can be used as the starting point from which further consensus may be built, similar to a negotiation process for decision making. In addition to the new fused hierarchy of partitions, the proposed scheme automatically provides a subset of partitions considered most relevant from the fusion viewpoint. Combining hierarchical segmentation results obtained by applying different segmentation techniques to the same color image leads to a global improvement of the accuracy and the stability of the segmentation results. Moreover, the use of the cooperative approach for the fusion of segmentation results from heterogeneous information channels is presented. Application examples demonstrate the high flexibility and potential of the cooperative region merging scheme in a wide range of applications and fusion problems (for instance, in multiview processing and remote sensing). The proposed fusion strategies are able to naturally incorporate a priori available knowledge on the types of information to combine, or on the specificities of the particular fusion problem, and to improve the efficiency and reduce the computational load of the fusion process without compromising the accuracy of the segmentation results.}, author = {Calderero, F.}, editor = {Marqu{\'e}s, F.} } @mastersthesis {x10, title = {Interf{\'\i}cie gr{\`a}fica d{\textquoteright}usuari per a l{\textquoteright}avaluaci{\'o} de classificadors d{\textquoteright}imatges}, year = {2010}, abstract = {

This final-year project was carried out with the goal of creating a graphical interface capable of evaluating different classifiers in a graphical and very intuitive way. The graphical interface had to be implemented within the Graphic Annotation Tool (GAT), a free-software application created by the Image Processing Group of UPC. The project is based on the creation of manual annotations in order to perform training with a supervised learning algorithm. The results of this classification are visualized in the new GAT tab named Classifier. This is made possible by the integration of the training and detection engines, the incorporation of a cross-validation algorithm, and the possibility of choosing among different classifiers. To evaluate the results, precision, recall, false positives and false negatives are computed.

}, keywords = {image classification, pattern recognition}, url = {http://hdl.handle.net/2099.1/13526}, author = {Gimeno, Mireia}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cSalvador10, title = {Joint estimation of shape and motion from silhouettes}, booktitle = {2010 IEEE International Conference on Image Processing}, year = {2010}, pages = {4069{\textendash}4072}, isbn = {3-540-66082-8}, doi = {10.1109/ICIP.2010.5654022}, url = {http://www.cic.unb.br/~mylene/PI_2010_2/ICIP10/pdfs/0004069.pdf}, author = {Salvador, J. and Casas, J.} } @article {aCanton-Ferrer10, title = {Marker-based human motion capture in multi-view sequences}, journal = {Eurasip journal on advances in signal processing}, volume = {2010}, number = {Article ID 105476}, year = {2010}, pages = {1{\textendash}11}, abstract = {

This paper presents a low-cost real-time alternative to available commercial human motion capture systems. First, a set of distinguishable markers is placed on several human body landmarks, and the scene is captured by a number of calibrated and synchronized cameras. In order to establish a physical relation among markers, a human body model is defined. Markers are detected on all camera views and delivered as the input of an annealed particle filter scheme where every particle encodes an instance of the pose of the body model to be estimated. Likelihood evaluation between particles and input data is performed through the robust generalized symmetric epipolar distance, and kinematic constraints are enforced in the propagation step to avoid impossible poses. Tests over the HumanEva annotated data set yield quantitative results showing the effectiveness of the proposed algorithm. Results over sequences involving fast and complex motions are also presented.

}, keywords = {CHIL, PROVEC}, issn = {1687-6172}, doi = {10.1155/2010/105476}, url = {http://www.hindawi.com/journals/asp/2010/105476.html}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @article {cSayrol10, title = {Mixed integration of CDIO skills into telecommunication engineering curricula}, journal = {Elektronika ir elektrotechnika}, volume = {102}, number = {102}, year = {2010}, pages = {127{\textendash}130}, issn = {1392-1215}, url = {http://www.ktu.lt/lt/mokslas/zurnalai/elektros_z/z102/32__ISSN_1392-1215_Mixed\%20Integration\%20of\%20CDIO\%20skills\%20into\%20Telecommunication\%20Engineering\%20Curricula.pdf}, author = {Elisa Sayrol and Bragos, R. and Alarc{\'o}n, E. and Cabrera, M. and Calveras, A. and Comellas, J. and O{\textquoteright}Callaghan, J. and Pegueroles, J. and Pla, E. and Prat, L. and Saez, G. and Sarda, J. and Tallon, C.} } @article {aGudmundsson10a, title = {Model-based hand gesture tracking in ToF image sequences}, journal = {Lecture notes in computer science}, volume = {6169/2010}, year = {2010}, pages = {118{\textendash}127}, issn = {0302-9743}, doi = {10.1007/978-3-642-14061-7_12}, url = {http://www.springerlink.com/content/l1117675tv565twq/}, author = {Gudmundsson, S. and Sveinsson, J. and M. Pard{\`a}s and Aanaes, H. and Larsen, R.} } @conference {cValero10, title = {New hyperspectral data representation using binary partition tree}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium, IGARSS 2010}, year = {2010}, pages = {80{\textendash}83}, address = {Honolulu, USA}, isbn = {978-1-4244-4761-9}, doi = {10.1109/IGARSS.2010.5649780}, url = {http://hdl.handle.net/2117/11005}, author = {Valero, S. and Salembier, P. and Chanussot, J.} } @conference {cVilaplana10, title = {Object detection and segmentation on a hierarchical region-based image representation}, booktitle = {2010 IEEE International Conference on Image Processing}, year = {2010}, pages = {3393{\textendash}3396}, isbn = {0-8194-2103-0}, url = {http://hdl.handle.net/2117/11435}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F. and Le{\'o}n, M. and Gasull, A.} } @conference {cSalvador10a, title = {Photo-consistent surfaces from a sparse set of viewpoints}, booktitle = {2010 IEEE International Conference on Image Processing}, year = {2010}, pages = {4045{\textendash}4048}, isbn = {978-1-4244-5655-6}, doi = {10.1109/ICIP.2010.5649006}, url = {http://www.cic.unb.br/~mylene/PI_2010_2/ICIP10/pdfs/0004045.pdf}, author = {Salvador, J. and Casas, J.} } @conference {cBragos10, title = {Proceso de inversi{\'o}n de competencias gen{\'e}ricas en los nuevos planes de estudios de grado de la ETSETB de acuerdo con el modelo CDIO}, booktitle = {IX Congreso de Tecnolog{\'\i}as Aplicadas a la Ense{\~n}anza de la Electr{\'o}nica}, year = {2010}, pages = {1{\textendash}9}, isbn = {978-84-96737-68-6}, url = {http://upcommons.upc.edu/e-prints/urlFiles?idDrac=4451103}, author = {Bragos, R. and Alarc{\'o}n, E. and Cabrera, M. and Calveras, A. and Comellas, J. and O{\textquoteright}Callaghan, J. and Pegueroles, J. and Prat, L. and Saez, G. and Sarda, J. 
and Elisa Sayrol} } @conference {cCanton-Ferrer10a, title = {Real-time 3D multi-person tracking using Monte Carlo surface sampling}, booktitle = {2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops}, year = {2010}, pages = {40{\textendash}46}, isbn = {978-1-4244-7030-3/10}, doi = {10.1109/CVPRW.2010.5543734}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=5543734}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cCalderero10, title = {Region merging parameter dependency as information diversity to create sparse hierarchies of partitions}, booktitle = {2010 IEEE International Conference on Image Processing}, year = {2010}, pages = {2237{\textendash}2240}, isbn = {1522-4880}, doi = {10.1109/ICIP.2010.5651720}, url = {http://ieeexplore.ieee.org/search/srchabstract.jsp?tp=\&arnumber=5651720\&queryText\%3DRegion+merging+parameter+dependency+as+information+diversity+to+create+sparse+hierarchies+of+partitions\%26openedRefinements\%3D*\%26searchField\%3DSearch+All}, author = {Calderero, F. and Marqu{\'e}s, F.} } @article {aCalderero10, title = {Region merging techniques using information theory statistical measures}, journal = {IEEE transactions on image processing}, volume = {19}, number = {6}, year = {2010}, pages = {1567{\textendash}1586}, abstract = {

The purpose of the current work is to propose, under a statistical framework, a family of unsupervised region merging techniques providing a set of the most relevant region-based explanations of an image at different levels of analysis. These techniques are characterized by general and nonparametric region models, with neither color nor texture homogeneity assumptions, and a set of innovative merging criteria, based on information theory statistical measures. The scale consistency of the partitions is assured through (i) a size regularization term in the merging criteria and a classical merging order, or (ii) using a novel scale-based merging order to avoid the region size homogeneity imposed by the use of a size regularization term. Moreover, a partition significance index is defined to automatically determine the subset of most representative partitions from the created hierarchy. The most significant automatically extracted partitions show the ability to represent the semantic content of the image from a human point of view. Finally, a complete and exhaustive evaluation of the proposed techniques is performed, using not only different databases for the two main addressed problems (object-oriented segmentation of generic images and texture image segmentation), but also specific evaluation features in each case: under- and oversegmentation error, and a large set of region-based, pixel-based and error consistency indicators, respectively. Results are promising, outperforming in most indicators both object-oriented and texture state-of-the-art segmentation techniques.

}, issn = {1057-7149}, doi = {10.1109/TIP.2010.2043008}, url = {http://www.dfmf.uned.es/~daniel/www-imagen-dhp/biblio/region-merging-information.pdf}, author = {Calderero, F. and Marqu{\'e}s, F.} } @conference {cLeon10, title = {Region-based caption text extraction}, booktitle = {11th. International Workshop on Image Analysis for Multimedia Application Services}, year = {2010}, pages = {1{\textendash}4}, url = {http://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=5608542}, author = {Le{\'o}n, M. and Ver{\'o}nica Vilaplana and Gasull, A. and Marqu{\'e}s, F.} } @phdthesis {dVilaplana10, title = {Region-based face detection, segmentation and tracking. framework definition and application to other objects}, year = {2010}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

One of the central problems in computer vision is the automatic recognition of object classes. In particular, the detection of the class of human faces is a problem that generates special interest due to the large number of applications that require face detection as a first step. In this thesis we approach the problem of face detection as a joint detection and segmentation problem, in order to precisely localize faces with pixel accurate masks. Even though this is our primary goal, in finding a solution we have tried to create a general framework as independent as possible of the type of object being searched. For that purpose, the technique relies on a hierarchical region-based image model, the Binary Partition Tree, where objects are obtained by the union of regions in an image partition. In this work, this model is optimized for the face detection and segmentation tasks. Different merging and stopping criteria are proposed and compared through a large set of experiments. In the proposed system the intra-class variability of faces is managed within a learning framework. The face class is characterized using a set of descriptors measured on the tree nodes, and a set of one-class classifiers. The system is formed by two strong classifiers. First, a cascade of binary classifiers simplifies the search space, and afterwards, an ensemble of more complex classifiers performs the final classification of the tree nodes. The system is extensively tested on different face data sets, producing accurate segmentations and proving to be quite robust to variations in scale, position, orientation, lighting conditions and background complexity. We show that the technique proposed for faces can be easily adapted to detect other object classes. Since the construction of the image model does not depend on any object class, different objects can be detected and segmented using the appropriate object model on the same image model. New object models can be easily built by selecting and training a suitable set of descriptors and classifiers. Finally, a tracking mechanism is proposed. It combines the efficiency of the mean-shift algorithm with the use of regions to track and segment faces through a video sequence, where both the face and the camera may move. The method is extended to deal with other deformable objects, using a region-based graph-cut method for the final object segmentation at each frame. Experiments show that both mean-shift based trackers produce accurate segmentations even in difficult scenarios such as those with similar object and background colors and fast camera and object movements.

}, url = {http://hdl.handle.net/10803/33330}, author = {Ver{\'o}nica Vilaplana}, editor = {Marqu{\'e}s, F.} } @mastersthesis {xGirvent10, title = {Servei de v{\'\i}deos a la carta per a l{\textquoteright}iPhone}, year = {2010}, abstract = {

Smartphones are fully integrated into society. Numerous television networks, content providers and web portals develop applications to watch live television or to play, in real time, videos chosen on demand. In this project, a video-on-demand service for the iPhone is developed, which plays videos in real time using the Apple HTTP Live Streaming protocol. This report details how to configure a video server and the development of an iPhone application that plays videos in real time. The server is programmed and configured on Linux, and free software is used to adapt the video to the requirements of HTTP Live Streaming. The video metadata are also stored in a database, which is accessed from the iPhone in order to display them in the interface. The iPhone application allows the user to browse the video catalogue, to view the video metadata, such as the title, the description or a thumbnail of the video, and to play it in real time on the device. It was developed on Mac, programming in Objective-C and using the tools of the iPhone development kit (iPhone SDK).

}, keywords = {http, ios, iphone, streaming, video, web}, url = {http://hdl.handle.net/2099.1/13524}, author = {Bruna Girvent}, editor = {Xavier Gir{\'o}-i-Nieto} } @article {aHaro10, title = {Shape from incomplete silhouettes based on the reprojection error}, journal = {Image and vision computing}, volume = {28}, number = {9}, year = {2010}, pages = {1354{\textendash}1368}, issn = {0262-8856}, doi = {10.1016/j.imavis.2010.01.016}, url = {linkinghub.elsevier.com/retrieve/pii/S0262885610000326}, author = {Haro, G. and M. Pard{\`a}s} } @conference {cAlcoverro10, title = {Skeleton and shape adjustment and tracking in multicamera environments}, booktitle = {6th International Conference AMDO 2010}, year = {2010}, pages = {88{\textendash}97}, isbn = {978-3-642-14060-0}, doi = {10.1007/978-3-642-14061-7_9}, author = {Alcoverro, M. and Casas, J. and M. Pard{\`a}s} } @inbook {bAlcoverro10, title = {Skeleton and shape adjustment and tracking in multicamera environments}, booktitle = {Lecture notes in computer science}, volume = {6169/2010}, year = {2010}, pages = {88{\textendash}97}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

In this paper we present a method for automatic body model adjustment and motion tracking in multicamera environments. We introduce a set of shape deformation parameters based on linear blend skinning that allow a deformation related to the scaling of the distinct bones of the body model skeleton, and a deformation in the radial direction of a bone. The adjustment of a generic body model to a specific subject is achieved by the estimation of those shape deformation parameters. This estimation combines a local optimization method and hierarchical particle filtering, and uses an efficient cost function based on foreground silhouettes, computed on the GPU. The estimation takes anthropometric constraints into account by using a rejection sampling method for the propagation of particles. We propose a hierarchical particle filtering method for motion tracking using the adjusted model. We show accurate model adjustment and tracking for distinct subjects in a 5-camera setup.

}, isbn = {978-3-642-14060-0}, issn = {0302-9743}, doi = {10.1007/978-3-642-14061-7_9}, url = {http://www.springerlink.com/content/9662h2hwq81041g2/fulltext.pdf}, author = {Alcoverro, M. and Casas, J. and M. Pard{\`a}s} } @conference {cCuadras10, title = {Some measures of multivariate association relating two spectral data sets}, booktitle = {19th International Conference on Computational Statistics, COMSTAT 2010}, year = {2010}, address = {Paris, France}, author = {Cuadras, C. and Valero, S. and Salembier, P. and Chanussot, J.} } @conference {cCanton-Ferrer10, title = {Spatio-temporal alignment and hyperspherical radon transform for 3D gait recognition in multi-view environments}, booktitle = {2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition Workshops}, year = {2010}, pages = {116{\textendash}121}, isbn = {978-1-4244-7030-3/10}, doi = {10.1109/CVPRW.2010.5544615}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=5544615}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cSuau10, title = {Surface reconstruction by restricted and oriented propagation}, booktitle = {2010 IEEE International Conference on Image Processing}, year = {2010}, pages = {813{\textendash}816}, isbn = {952-15-1364-0}, doi = {10.1109/ICIP.2010.5652707}, author = {Suau, X. and Casas, J. and Ruiz-Hidalgo, J.} } @conference {cGiro10a, title = {System architecture of a web service for Content-Based Image Retrieval}, booktitle = {ACM International Conference On Image And Video Retrieval 2010}, year = {2010}, pages = {358{\textendash}365}, abstract = {

This paper presents the system architecture of a Content-Based Image Retrieval system implemented as a web service. The proposed solution is composed of two parts: a client running a graphical user interface for query formulation, and a server where the search engine explores an image repository. The separation of the user interface and the search engine follows a Software as a Service (SaaS) model, a type of cloud computing design where a single core system is online and available to authorized clients. The proposed architecture follows the REST software architecture style and the HTTP protocol for communications, two solutions that, combined with metadata coded in RDF, make the proposed system ready for its integration in the Semantic Web. User queries are formulated by visual examples through a graphical interface, and content is remotely accessed also through HTTP communication. Visual descriptors and similarity measures implemented in this work are mostly defined in the MPEG-7 standard, while textual metadata is coded according to the Dublin Core specifications.

}, isbn = {978-1-4503-0117-6}, doi = {10.1145/1816041.1816093}, url = {http://doi.acm.org/10.1145/1816041.1816093}, author = {Xavier Gir{\'o}-i-Nieto and Ventura, C. and Jordi Pont-Tuset and Cort{\'e}s, S. and Marqu{\'e}s, F.} } @mastersthesis {xSancho10, title = {Tweet@TV: Televisi{\'o} social en 140 car{\`a}cters}, year = {2010}, abstract = {

This final-year project focuses on this facet of interactive television services: social television. During its development, an application was built to access a social network in a way that is integrated and synchronized with television consumption. Following the research line of the final-year project of Manel Martos, Adaptaci{\'o} i distribuci{\'o} de continguts web per IPTV, this project was carried out at the company Activa Multim{\`e}dia Digital of the Corporaci{\'o} Catalana de Mitjans Audiovisuals between February and May 2010, within the framework of the CREA-IPTV project.

Awarded the second prize of the Premios Liberalizaci{\'o}n de las Telecomunicaciones 2010 call of the Colegio de Ingenieros T{\'e}cnicos de Telecomunicaci{\'o}n (COITT), Spain.

}, keywords = {interactive, microblogging, television, twitter}, url = {http://hdl.handle.net/2099.1/13523}, author = {Ruiz-Sancho, Cristina}, editor = {Xavier Gir{\'o}-i-Nieto and Cucurella, Eduard} } @mastersthesis {xVaras10, title = {Type of view estimation in football sequences}, year = {2010}, abstract = {

Due to the huge impact of football broadcasts on society, an enormous number of applications can be derived both to analyze the match and to enhance the visual experience of the spectator. These applications require semantic information about the content of the images. In particular, the type of view in a football image contains valuable information about the game. Thus, the type of view must be computed automatically in order to process the large amount of information extracted from each football match. In this work, we propose a robust classification system that estimates the type of view in football images in real time. For each frame of the sequence, a set of descriptors is extracted to characterize a specific part of the scene: the grass field. Combining all these descriptors with a few texture-related ones, a decision tree determines the view shown in that frame. In order to improve the robustness of the algorithm, the redundancy of the temporal domain is exploited. The validity of the proposed algorithm has been tested on a large number of frames from broadcast football sequences in a wide variety of scenarios (stadiums, light conditions, ...). Promising results have been obtained, with 96\% accuracy in the classification of these images.

}, author = {David Varas}, editor = {Marqu{\'e}s, F.} } @conference {cLopez-Mendez10, title = {Virtual view appearance representation for human motion analysis in multi-view environments}, booktitle = {18th European Signal Processing Conference}, year = {2010}, pages = {959{\textendash}963}, isbn = {2076-1465}, url = {http://hdl.handle.net/2117/8747}, author = {L{\'o}pez-M{\'e}ndez, A. and Cristian Canton-Ferrer and Casas, J.} } @inbook {bNickel09, title = {Activity Classification}, booktitle = {Computers in the Human Interaction Loop}, year = {2009}, pages = {107{\textendash}119}, publisher = {Springer}, organization = {Springer}, address = {London}, abstract = {

When a person enters a room, he or she immediately develops a mental concept about {\textquotedblleft}what is going on{\textquotedblright} in the room; for example, people may be working in the room, people may be engaged in a conversation, or the room may be empty. The CHIL services depend on just the same kind of semantic description, which is termed activity in the following. The {\textquotedblleft}Connector{\textquotedblright} or the {\textquotedblleft}Memory Jog{\textquotedblright}, for example, could provide support that is appropriate for the given context if it knew about the current activity at the user{\textquoteright}s place. This kind of higher-level understanding of human interaction processes could then be used, e.g., for rating the user{\textquoteright}s current availability in a certain situation.

}, isbn = {978-1-84882-053-1}, doi = {10.1007/978-1-84882-054-8}, url = {http://www.springerlink.com/content/tj2x827107563616/}, author = {Nickel, K. and M. Pard{\`a}s and Stiefelhagen, R. and Cristian Canton-Ferrer and Landabaso, J. and Casas, J.} } @mastersthesis {xRovira09, title = {Aplicaci{\'o} rica d{\textquoteright}internet per a la consulta amb text i imatge a la Corporaci{\'o} Catalana de Mitjans Audiovisuals}, year = {2009}, abstract = {

Awarded the prize for the best final-year project in Telecommunication Engineering in Telematic Services, granted by Accenture (academic year 2009-2010).

}, url = {http://hdl.handle.net/2099.1/8766}, author = {Salla-Rovira, Ramon}, editor = {Xavier Gir{\'o}-i-Nieto and Vives, X.} } @book {eDutoit09, title = {Applied signal processing: A Matlab-based proof of concept}, year = {2009}, publisher = {Springer}, organization = {Springer}, isbn = {978-0-387-74534-3}, url = {http://cataleg.upc.edu/record=b1353617~S1*cat}, author = {Dutoit, T. and Marqu{\'e}s, F.} } @conference {cCanton-Ferrer09, title = {Audiovisual Event Detection Towards Scene Understanding}, booktitle = {2009 IEEE Computer-Society Conference on Computer Vision and Pattern Recognition Workshops}, year = {2009}, pages = {840{\textendash}847}, abstract = {

Acoustic events produced in meeting environments may contain useful information for perceptually aware interfaces and multimodal behavior analysis. In this paper, a system to detect and recognize these events from a multimodal perspective is presented combining information from multiple cameras and microphones. First, spectral and temporal features are extracted from a single audio channel and spatial localization is achieved by exploiting cross-correlation among microphone arrays. Second, several video cues obtained from multi-person tracking, motion analysis, face recognition, and object detection provide the visual counterpart of the acoustic events to be detected. A multimodal data fusion at score level is carried out using two approaches: weighted mean average and fuzzy integral. Finally, a multimodal database containing a rich variety of acoustic events has been recorded including manual annotations of the data. A set of metrics allow assessing the performance of the presented algorithms. This dataset is made publicly available for research purposes.

}, doi = {10.1109/CVPRW.2009.5204264}, author = {Cristian Canton-Ferrer and Butko, T. and Segura, C. and Xavier Gir{\'o}-i-Nieto and Nadeu, C. and Hernando, J. and Casas, J.} } @conference {cGallego09, title = {Bayesian foreground segmentation and tracking using pixel-wise background model and region-based foreground model}, booktitle = {16th IEEE International Conference on Image Processing}, year = {2009}, pages = {3205{\textendash}3208}, doi = {10.1109/ICIP.2009.5414380}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=5414380}, author = {Gallego, J. and M. Pard{\`a}s and Haro, G.} } @conference {cLeon09, title = {Caption text extraction for indexing purposes using a hierarchical region-based image model}, booktitle = {16th International Conference on Image Processing}, year = {2009}, pages = {1869{\textendash}1872}, isbn = {978-1-4244-5655-0}, doi = {http://dx.doi.org/10.1109/ICIP.2009.5413607}, url = {http://hdl.handle.net/2117/7940}, author = {Le{\'o}n, M. and Ver{\'o}nica Vilaplana and Gasull, A. and Marqu{\'e}s, F.} } @conference {cMarcello09, title = {Cloud motion estimation in seviri image sequences}, booktitle = {2009 IEEE International Geoscience and Remote Sensing Symposium}, year = {2009}, pages = {642{\textendash}645}, isbn = {978-1-4244-3394-0}, doi = {10.1109/IGARSS.2009.5417842}, url = {http://hdl.handle.net/2117/9492}, author = {Marcello, J. and F. Eugenio and Marqu{\'e}s, F.} } @conference {cRuiz-Hidalgo09, title = {Comparison of MPEG-7 descriptors for long term selection of reference frames}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2009}, year = {2009}, pages = {941{\textendash}944}, address = {Taipei, Taiwan}, isbn = {0-8194-2103-0}, doi = {10.1109/ICASSP.2009.4959740}, url = {http://hdl.handle.net/2117/8816}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @inbook {bVilaplana09, title = {Compressi{\'o} d{\textquoteright}imatges}, booktitle = {Codificaci{\'o} del so i de la imatge}, year = {2009}, pages = {1{\textendash}65}, isbn = {9788469148365}, url = {http://ccuc.cbuc.cat:2082/record=b4149618~S23*cat}, author = {Ver{\'o}nica Vilaplana} } @article {aSalembier09, title = {Connected operators: A review of region-based morphological image processing techniques}, journal = {IEEE Signal Processing Magazine}, volume = {6}, year = {2009}, pages = {136{\textendash}157}, issn = {1053-5888}, doi = {10.1109/MSP.2009.934154}, author = {Salembier, P. and Wilkinson, M.H.F.} } @conference {cDimiccoli09, title = {Exploiting T-junctions for depth segregation in single images}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2009}, year = {2009}, address = {Taipei, Taiwan}, author = {Dimiccoli, M. and Salembier, P.} } @conference {cVilaplana09, title = {Face tracking using a region-based mean-shift algorithm with adaptive object and background models}, booktitle = {Workshop on Image Analysis for Multimedia Interactive Services}, year = {2009}, pages = {9{\textendash}12}, doi = {http://dx.doi.org/10.1109/WIAMIS.2009.5031419}, url = {http://hdl.handle.net/2117/7946}, author = {Ver{\'o}nica Vilaplana and David Varas} } @conference {cLopez-Mendez09, title = {Feature-based annealing particle filter for robust body pose estimation}, booktitle = {Fourth International Conference on Computer Vision Theory and Applications}, year = {2009}, pages = {438{\textendash}443}, abstract = {

This paper presents a new annealing method for particle filtering in the context of body pose estimation. The feature-based annealing is inferred from the weighting functions obtained with common image features used for the likelihood approximation. We introduce a complementary weighting function based on the foreground extraction and we balance the different measures through the annealing layers in order to improve the posterior estimate. This technique is applied to estimate the upper body pose of a subject in a realistic multi-view environment. Comparative results between the proposed method and the common annealing strategy are presented to assess the robustness of the algorithm.

}, doi = {10.5220/0001783404380443}, url = {http://visapp.visigrapp.org/Abstracts/2009/VISAPP_2009_Abstracts.htm}, author = {L{\'o}pez-M{\'e}ndez, A. and Casas, J.} } @article {Caselles200916, title = {Flux-gradient and source-term balancing for certain high resolution shock-capturing schemes}, journal = {Computers \& Fluids}, volume = {38}, year = {2009}, month = {0/2009}, pages = {16 - 36}, abstract = {

We present an extension of Marquina{\textquoteright}s flux formula, as introduced in Fedkiw et al. [Fedkiw RP, Merriman B, Donat R, Osher S. The penultimate scheme for systems of conservation laws: finite difference \{ENO\} with Marquina{\textquoteright}s flux splitting. In: Hafez M, editor. Progress in numerical solutions of partial differential equations, Arcachon, France; July 1998], for the shallow water system. We show that the use of two different Jacobians at cell interfaces prevents the scheme from satisfying the exact C-property [Berm{\'u}dez A, V{\'a}zquez ME. Upwind methods for hyperbolic conservation laws with source terms. Comput Fluids 1994;23(8):1049{\textendash}71] while the approximate C-property is satisfied for higher order versions of the scheme. The use of a single Jacobian in Marquina{\textquoteright}s flux splitting formula leads to a numerical scheme satisfying the exact C-property, hence we propose a combined technique that uses Marquina{\textquoteright}s two sided decomposition when the two adjacent states are not close and a single decomposition otherwise. Finally, we propose a special treatment at wet/dry fronts and situations of dry bed generation.

}, issn = {0045-7930}, doi = {10.1016/j.compfluid.2007.07.023}, author = {Caselles, Vicent and Donat, Rosa and Haro, G.} } @conference {cRolon09a, title = {Generalized lifting with adaptive local pdf estimation for image coding}, booktitle = {Picture coding symposium, PCS 2009}, year = {2009}, address = {Chicago, USA}, author = {Rol{\'o}n, J. and Mendon{\c c}a, E. and Salembier, P.} } @conference {cFrias-Velazquez09, title = {Gray-scale erosion algorithm based on image bitwise decomposition: application to focal plane processors}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing 2009}, year = {2009}, pages = {845{\textendash}848}, doi = {10.1109/ICASSP.2009.4959716}, url = {http://hdl.handle.net/2117/9156}, author = {Frias-Velazquez, A. and Morros, J.R.} } @conference {cCalderero09a, title = {Hierarchical fusion of color and depth information at partition level by cooperative region merging}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing 2009}, year = {2009}, pages = {973{\textendash}976}, doi = {10.1109/ICASSP.2009.4959748}, url = {http://hdl.handle.net/2117/9491}, author = {Calderero, F. and Marqu{\'e}s, F.} } @conference {cDimiccoli09a, title = {Hierarchical region-based representation for segmentation and filtering with depth in single images}, booktitle = {IEEE International Conference on Image Processing, ICIP 2009}, year = {2009}, address = {Cairo, Egypt}, author = {Dimiccoli, M. and Salembier, P.} } @conference {cCalderero09, title = {Hierarchical segmentation of vegetation areas in high spatial resolution images by fusion of multispectral information}, booktitle = {2009 IEEE International Geoscience and Remote Sensing Symposium}, year = {2009}, pages = {232{\textendash}235}, isbn = {978-1-4244-3394-0}, doi = {10.1109/IGARSS.2009.5417329}, url = {http://hdl.handle.net/2117/9494}, author = {Calderero, F. and Marqu{\'e}s, F. and Marcello, J. and F. Eugenio} } @conference {cFrias-Velazquez09a, title = {Histogram computation based on image bitwise decomposition}, booktitle = {ICIP 2009}, year = {2009}, isbn = {978-1-4244-5653-6}, doi = {10.1109/ICIP.2009.5413981}, url = {http://hdl.handle.net/2117/9144}, author = {Frias-Velazquez, A. and Morros, J.R.} } @inbook {bMarques09, title = {How are digital images compressed in the web?}, booktitle = {Applied signal processing}, year = {2009}, pages = {265{\textendash}310}, isbn = {978-0-387-74534-3}, url = {http://cataleg.upc.edu/record=b1353617~S1*cat}, author = {Marqu{\'e}s, F. and Menezes, M. and Ruiz-Hidalgo, J.}, editor = {Dutoit, T. and Marqu{\'e}s, F.} } @inbook {bMarques09a, title = {How are digital TV programs compressed to allow broadcasting?}, booktitle = {Applied signal processing}, year = {2009}, pages = {311{\textendash}359}, isbn = {978-0-387-74534-3}, url = {http://cataleg.upc.edu/record=b1353617~S1*cat}, author = {Marqu{\'e}s, F. and Menezes, M. and Ruiz-Hidalgo, J.}, editor = {Dutoit, T. and Marqu{\'e}s, F.} } @inbook {bBach09, title = {How can physicians quantify brain degeneration?}, booktitle = {Applied signal processing}, year = {2009}, pages = {411{\textendash}449}, isbn = {978-0-387-74534-3}, url = {http://cataleg.upc.edu/record=b1353617~S1*cat}, author = {Bach, M. and Thiran, J. and Marqu{\'e}s, F.}, editor = {Dutoit, T. 
and Marqu{\'e}s, F.} } @inbook {bDescamps09, title = {How does digital cinema compress images?}, booktitle = {Applied signal processing}, year = {2009}, pages = {361{\textendash}410}, isbn = {978-0-387-74534-3}, url = {http://cataleg.upc.edu/record=b1353617~S1*cat}, author = {Descamps, A. and De Vleeschouwer, C. and Jacques, L. and Marqu{\'e}s, F.}, editor = {Dutoit, T. and Marqu{\'e}s, F.} } @phdthesis {dCanton-Ferrer09, title = {Human Motion Capture with Scalable Body Models}, year = {2009}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

Capturing and tracking human motion is becoming a hot research topic due to the number of applications that can be addressed using this information, ranging from action recognition to human-computer interfaces and biometrics. This PhD thesis addresses the problem of extracting the pose parameters of a human body in a multi-camera environment relying on Monte Carlo techniques.

Extracting the describing parameters (pose) of an articulated model of the human body from information provided by multiple cameras can be efficiently tackled using the standard Bayesian prediction and update formulation. However, due to the high dimensionality of the pose space, standard techniques based on linear and Gaussian assumptions are not suitable. Instead, Monte Carlo methods based on a sampled representation of the involved likelihood functions offer a promising research direction. In this thesis, we present a number of contributions to this topic based on a coarse-to-fine analysis scheme. The input data to all presented algorithms will be a 3D reconstruction of the scene, described by colored voxels, thus combining the information provided by all camera views into a unified data representation.

In a first stage, subjects are coarsely approximated by an ellipsoid and their centroids are estimated and tracked. A novel approach achieving real-time performance is presented, based on a surface sampling of the objects in the scene: the Sparse Sampling algorithm. In this filtering scheme, an independent tracker is assigned to every target and an exclusion mechanism is defined to avoid interference among targets. Finally, the obtained centroid positions are employed to initialize a specific pose estimation algorithm.

Two pose estimation algorithms are presented based on the seminal principle of the annealed particle filter technique. The first one is a low-cost approach to marker-based human motion capture and the second is a markerless technique relying on likelihood functions computed directly on the 3D voxel representation. In both approaches, kinematic constraints are employed to avoid unfeasible poses. Although these algorithms provide satisfactory results when dealing with accurate input data, they tend to lose track when processing noisy measurements and occluded body parts.

Scalability of the structure of the human body is exploited to define two robust alternatives for analyzing faulty data. In the first case, the Scalable Human Body Model-Annealed Particle Filter is presented as a filtering approach that adds an extra annealing level to the classical annealed particle filter: the body hierarchy annealing loop. In this way, a progressive fitting is performed in a coarse-to-fine manner, yielding both more efficient and more accurate results. Another alternative is presented employing a human body model hierarchy where different limbs are added progressively to the model. This allows detecting those parts that are occluded (for instance, by furniture) and disregarding them in the likelihood evaluation step of the filtering scheme.

Finally, in order to evaluate all the systems proposed in this thesis, a new methodology is presented. Existing methods based on computing the mean and variance of the committed estimation error tend to produce biased figures when a subset of the human body is not tracked properly. We propose two alternative metrics that avoid these situations and therefore allow a fairer comparison among algorithms.

}, author = {Cristian Canton-Ferrer}, editor = {Casas, J. and M. Pard{\`a}s} } @inbook {bCanton-Ferrer09, title = {Image and video processing tools for HCI}, booktitle = {Multimodal signal processing: theory and applications for human-computer interaction}, year = {2009}, pages = {93{\textendash}118}, isbn = {9780123748256}, author = {Cristian Canton-Ferrer and M. Pard{\`a}s and Ver{\'o}nica Vilaplana} } @conference {cButko09, title = {Improving Detection of Acoustic Events Using Audiovisual Data and Feature Level Fusion}, booktitle = {10th Annual Conference of the International Speech Communication Association}, year = {2009}, pages = {1147{\textendash}1150}, isbn = {978-1-61567-692-7}, url = {http://gps-tsc.upc.es/imatge/_Xgiro/research/publications/2009/interspeech.pdf}, author = {Butko, T. and Cristian Canton-Ferrer and Segura, C. and Xavier Gir{\'o}-i-Nieto and Nadeu, C. and Hernando, J. and Casas, J.} } @conference {cSanchez-Riera09, title = {Indoor PTZ camera calibration with concurrent PT axes}, booktitle = {Fourth International Conference on Computer Vision Theory and Applications}, year = {2009}, pages = {45{\textendash}50}, abstract = {

The introduction of active (pan-tilt-zoom or PTZ) cameras in Smart Rooms, in addition to fixed static cameras, makes it possible to improve the resolution in volumetric reconstruction, adding the capability to track smaller objects with higher precision in actual 3D world coordinates. To accomplish this goal, precise camera calibration data should be available for any pan, tilt, and zoom settings of each PTZ camera. The PTZ calibration method proposed in this paper introduces a novel solution to the problem of computing extrinsic and intrinsic parameters for active cameras. We first determine the rotation center of the camera expressed with respect to an arbitrary world coordinate origin. Then, we obtain an equation relating any rotation of the camera with the movement of the principal point to define extrinsic parameters for any value of pan and tilt. Once this position is determined, we compute how intrinsic parameters change as a function of zoom. We validate our method by evaluating the re-projection error and its stability for points inside and outside the calibration set.

}, isbn = {0-8186-7310-9}, doi = {10.5220/0001754900100015}, url = {http://visapp.visigrapp.org/Abstracts/2009/VISAPP_2009_Abstracts.htm}, author = {Sanchez-Riera, J. and Salvador, J. and Casas, J.} } @article {aNeumann09, title = {Integration of audiovisual sensors and technologies in a smart room}, journal = {Personal and ubiquitous computing}, volume = {13}, number = {1}, year = {2009}, pages = {15{\textendash}23}, abstract = {

At the Technical University of Catalonia (UPC), a smart room has been equipped with 85 microphones and 8 cameras. This paper describes the setup of the sensors, gives an overview of the underlying hardware and software infrastructure and indicates possibilities for high- and low-level multi-modal interaction. An example of usage of the information collected from the distributed sensor network is explained in detail: the system supports a group of students that have to solve a lab-assignment-related problem.

}, keywords = {CHIL, PROVEC}, issn = {1617-4909}, doi = {10.1007/s00779-007-0172-1}, url = {http://hdl.handle.net/2117/9468}, author = {Neumann, J. and Casas, J. and Macho, D. and Ruiz-Hidalgo, J.} } @mastersthesis {dCortes09, title = {Interfaz gr{\'a}fica de usuario para la b{\'u}squeda de im{\'a}genes basada en im{\'a}genes}, year = {2009}, month = {07/2009}, type = {BSc}, abstract = {

This final-year project (PFC) responds to the need to create tools for accessing multimedia content, new tools that ease the retrieval of all that stored audiovisual information. The Graphic Object Searcher (GOS) is a graphical interface for searching images stored in large databases, starting from an example image and a set of search criteria established by the user. Carrying out the PFC makes it possible to work in the two technological areas with the greatest current growth: the multimedia sector (audiovisual content management) and information technologies (IT) (computing at the service of communication). These two areas tend to join efforts in a society devoted to the use and consumption of audiovisual content through multiple platforms and devices in every economic and social sector (leisure, education, services, etc.). Any professional in the audiovisual sector must acquire knowledge and experience in both areas to build a solid career.

}, url = {http://hdl.handle.net/2099.1/8588}, author = {Cort{\'e}s, S.}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cCabrera09, title = {LAVICAD: LAboratori VIrtual de Comunicacions Anal{\`o}giques i Digitals}, booktitle = {Jornada d{\textquoteright}Innovaci{\'o} Docent - RIMA (JID-RIMA)}, year = {2009}, month = {02/2009}, publisher = {UPCommons}, organization = {UPCommons}, address = {Barcelona, Catalonia}, abstract = {

Through this grant, the online application LAVICAD (LAboratori VIrtual de COmunicacions Anal{\`o}giques i Digitals) has been extended; it is offered in an integrated form within the e-learning platform COM@WEB. LAVICAD is a tool programmed in Java and Matlab and is formed by a set of simulators of the physical layer of communication systems. All the simulators are presented online and can be used by the students from any computer, without the need to install any special software. During the 2007-2008 academic year, two lines of work were developed, among others. On the one hand, the applet that emulates the physical layer of digital terrestrial television was programmed, as a reference for the teaching of advanced communication systems. On the other hand, work was done on programming new functionalities of the LAVICAD tool that help teachers monitor and assess the continuous work of the students. In particular, the generation of a database was programmed; it contains the information of the users who have connected and the results obtained when running a given simulator. The two lines developed should make it possible, during the current academic year, to consolidate the use of the different simulators in the teaching of the courses involved in the project.

}, url = {http://hdl.handle.net/2099/7235}, author = {Cabrera, M. and Xavier Gir{\'o}-i-Nieto and Rey, F. and Gasull, A. and Casas, J. and Villares, J. and Fernandez, J. and Sala {\'A}lvarez, josep and Espinosa Fricke, Pedro and Fern{\'a}ndez, Carlos Marcos and Cort{\'e}s, S. and Farr{\'e}, Miquel {\`A}ngel} } @conference {cOliveras09, title = {Maximum likelihood factor analysis in malaria cytokines analysis and modelling}, booktitle = {IEEE International Workshop on Genomic Signal Processing and Statistics}, year = {2009}, isbn = {84-605-9799-7}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=5174320\&arnumber=5174336\&count=57\&index=14}, author = {Albert Oliveras} } @inbook {bDimakis09, title = {The Memory Jog Service}, booktitle = {Computers in the Human Interaction Loop}, year = {2009}, pages = {207{\textendash}234}, publisher = {Springer}, organization = {Springer}, address = {London}, abstract = {

The CHIL Memory Jog service focuses on facilitating the collaboration of participants in meetings, lectures, presentations, and other human interactive events occurring in indoor CHIL spaces. It exploits the whole set of perceptual components that have been developed by the CHIL Consortium partners (e.g., person tracking, face identification, audio source localization) along with a wide range of actuating devices such as projectors, displays, targeted audio devices, and speakers.

}, isbn = {978-1-84882-053-1}, doi = {10.1007/978-1-84882-054-8}, url = {http://www.springerlink.com/content/tg220751v5m1h356/}, author = {Dimakis, N. and Soldatos, J. and Polymenakos, L. and Sturm, J. and Neumann, J. and Casas, J.} } @conference {cRolon09, title = {Modeling of contours in wavelet domain for generalized lifting image compression}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2009}, year = {2009}, address = {Taipei, Taiwan}, author = {Rol{\'o}n, J. and Ortega, A. and Salembier, P.} } @phdthesis {dDimiccoli09, title = {Monocular Depth Estimation for Image Segmentation and Filtering}, year = {2009}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Dimiccoli, M.}, editor = {Salembier, P.} } @book {eMarques09, title = {Multimodal signal processing: theory and applications for human-computer interaction}, year = {2009}, isbn = {9780123748256}, editor = {Marqu{\'e}s, F.} } @conference {cSuau09, title = {Multi-resolution illumination compensation for foreground extraction}, booktitle = {16th International Conference on Image Processing}, year = {2009}, pages = {3225{\textendash}3228}, abstract = {

Illumination changes may lead to false foreground (FG) segmentation and tracking results. Most of the existing FG extraction algorithms obtain a background (BG) estimation from temporal statistical parameters. Such algorithms assume a quasi-static BG which changes only slowly. Therefore, fast illumination changes are not taken into account by the BG estimator and are considered as FG. The aim of the proposed algorithm is to reduce illumination effects in video sequences in order to improve foreground segmentation performance.

}, doi = {http://dx.doi.org/10.1109/ICIP.2009.5414358}, url = {http://www.icip2009.org}, author = {Suau, X. and Casas, J. and Ruiz-Hidalgo, J.} } @conference {cCalderero09b, title = {Performance evaluation of probability density estimators for unsupervised information theoretical region merging}, booktitle = {16th International Conference on Image Processing}, year = {2009}, pages = {4397{\textendash}4400}, isbn = {978-1-4244-5655-6}, doi = {10.1109/ICIP.2009.5413621}, url = {http://hdl.handle.net/2117/9510}, author = {Calderero, F. and Marqu{\'e}s, F. and Ortega, A.} } @inbook {bBernardin09, title = {Person Tracking}, booktitle = {Computers in the human interaction loop}, year = {2009}, pages = {11{\textendash}22}, publisher = {Springer}, organization = {Springer}, address = {London}, abstract = {

One of the most basic building blocks for the understanding of human actions and interactions is the accurate detection and tracking of persons in a scene. In constrained scenarios involving at most one subject, or in situations where persons can be confined to a controlled monitoring space or required to wear markers, sensors, or microphones, these tasks can be solved with relative ease. However, when accurate localization and tracking have to be performed in an unobtrusive or discreet fashion, using only distantly placed microphones and cameras, in a variety of natural and uncontrolled scenarios, the challenges posed are much greater. The problems faced by video analysis are those of poor or uneven illumination, low resolution, clutter or occlusion, unclean backgrounds, and multiple moving and uncooperative users that are not always easily distinguishable.

}, isbn = {978-1-84882-053-1}, doi = {10.1007/978-1-84882-054-8}, url = {http://www.springerlink.com/content/tj2x827107563616/}, author = {Bernardin, K. and Stiefelhagen, R. and Pnevmatikakis, A. and Lanz, O. and Brutti, A. and Casas, J. and Potamianos, G.} } @inbook {bVilaplana09a, title = {Processament puntual}, booktitle = {Codificaci{\'o} del so i de la imatge}, year = {2009}, pages = {5{\textendash}65}, isbn = {9788469148365}, url = {http://ccuc.cbuc.cat:2082/record=b4149618~S23*cat}, author = {Ver{\'o}nica Vilaplana} } @conference {cSalembier09, title = {Study on nonlocal morphological operators}, booktitle = {IEEE International Conference on Image Processing, ICIP 2009}, year = {2009}, address = {Cairo, Egypt}, author = {Salembier, P.} } @conference {cCanton-Ferrer09a, title = {Towards a low cost multi-camera marker based human motion capture system}, booktitle = {16th International Conference on Image Processing}, year = {2009}, pages = {2581{\textendash}2584}, isbn = {978-1-4244-5655-6}, doi = {10.1109/ICIP.2009.5413915}, url = {http://dx.doi.org/10.1109/ICIP.2009.5413915}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @article {aChang09, title = {Trajectory tree as an object-oriented hierarchical representation for video}, journal = {IEEE transactions on circuits and systems for video technology}, volume = {19}, number = {4}, year = {2009}, pages = {547{\textendash}560}, issn = {1051-8215}, author = {Chang, C. and M. Pard{\`a}s and Marqu{\'e}s, F.} } @conference {cAlcoverro09, title = {Visual hull reconstruction algorithms comparison: towards robustness to silhouette errors}, booktitle = {International Conference on Computer Vision Theory and Applications 2009}, year = {2009}, pages = {464{\textendash}469}, isbn = {978-989-8111-69-2}, url = {http://upcommons.upc.edu/e-prints/urlFiles?idDrac=2642455}, author = {Alcoverro, M. and M. Pard{\`a}s} } @conference {cCanton-Ferrer09b, title = {Voxel based annealed particle filtering for markerless 3D articulated motion capture}, booktitle = {3DTV Conference: The True Vision - Capture, Transmission and Display of 3D Video, 2009}, year = {2009}, pages = {1{\textendash}4}, doi = {10.1109/3DTV.2009.5069645}, url = {http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=5069609\&isYear=2009}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @article {aCorrea08, title = {3D posture estimation using geodesic distance maps}, journal = {Multimedia tools and applications}, volume = {38}, number = {3}, year = {2008}, pages = {365{\textendash}384}, issn = {1380-7501}, author = {Correa, P. and Marqu{\'e}s, F. and Marichal, X. and Macq, B.} } @article {aCanton-Ferrer08, title = {Audiovisual head orientation estimation with particle filtering in multisensor scenarios}, journal = {EURASIP Journal on Advances in Signal Processing}, volume = {2008}, year = {2008}, month = {06/2007}, pages = {1{\textendash}13}, chapter = {276846}, abstract = {

This article presents a multimodal approach to head pose estimation of individuals in environments equipped with multiple cameras and microphones, such as SmartRooms or automatic video conferencing. Determining an individual{\textquoteright}s head orientation is the basis for many forms of more sophisticated interactions between humans and technical devices and can also be used for automatic sensor selection (camera, microphone) in communications or video surveillance systems. The use of particle filters as a unified framework for the estimation of the head orientation for both monomodal and multimodal cases is proposed. In video, we estimate head orientation from color information by exploiting spatial redundancy among cameras. Audio information is processed to estimate the direction of the voice produced by a speaker making use of the directivity characteristics of the head radiation pattern. Furthermore, two different particle filter multimodal information fusion schemes for combining the audio and video streams are analyzed in terms of accuracy and robustness. In the first one, fusion is performed at a decision level by combining each monomodal head pose estimation, while the second one uses a joint estimation system combining information at data level. Experimental results conducted over the CLEAR 2006 evaluation database are reported, and the comparison of the proposed multimodal head pose estimation algorithms with the reference monomodal approaches proves the effectiveness of the proposed approach.

}, issn = {1687-6172}, doi = {10.1155/2008/276846}, url = {http://asp.eurasipjournals.com/content/2008/1/276846}, author = {Cristian Canton-Ferrer and Segura, C. and Casas, J. and M. Pard{\`a}s and Hernando, J.} } @article {aVilaplana08, title = {Binary partition trees for object detection}, journal = {IEEE transactions on image processing}, volume = {17}, number = {11}, year = {2008}, pages = {1{\textendash}16}, abstract = {

This paper discusses the use of binary partition trees (BPTs) for object detection. BPTs are hierarchical region-based representations of images. They define a reduced set of regions that covers the image support and that spans various levels of resolution. They are attractive for object detection as they tremendously reduce the search space. In this paper, several issues related to the use of BPT for object detection are studied. Concerning the tree construction, we analyze the compromise between computational complexity reduction and accuracy. This will lead us to define two parts in the BPT: one providing accuracy and one representing the search space for the object detection task. Then we analyze and objectively compare various similarity measures for the tree construction. We conclude that different similarity criteria should be used for the part providing accuracy in the BPT and for the part defining the search space and specific criteria are proposed for each case. Then we discuss the object detection strategy based on BPT. The notion of node extension is proposed and discussed. Finally, several object detection examples illustrating the generality of the approach and its efficiency are reported.

}, issn = {1057-7149}, doi = {10.1109/TIP.2008.2002841}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F. and Salembier, P.} } @conference {cSalembier08, title = {Connected operators based on region-trees}, booktitle = {IEEE International Conference on Image Processing, ICIP 2008}, year = {2008}, address = {San Diego, USA}, author = {Salembier, P.} } @article {aTurkan08, title = {Edge Projections for Eye Localization}, journal = {Optical engineering}, volume = {47}, number = {4}, year = {2008}, pages = {1{\textendash}6}, issn = {0091-3286}, doi = {10.1117/1.2902437}, url = {http://spiedigitallibrary.org/oe/resource/1/opegar/v47/i4/p047007_s1}, author = {Turkan, M. and M. Pard{\`a}s and Cetin, E.} } @phdthesis {dIregui08, title = {Efficient strategies for navigation through very large JPEG2000 image}, year = {2008}, school = {Universit{\'e} Catholique de Louvain (UCL)}, type = {phd}, url = {http://hdl.handle.net/2078.1/19637 }, author = {Iregui, M.}, editor = {Marqu{\'e}s, F. and Macq, B.} } @article {aOliveras08, title = {Elevated basal hepcidin levels in the liver may inhibit the development of malaria infection: Another piece towards solving the malaria puzzle?}, journal = {Medical hypotheses}, volume = {70}, number = {3}, year = {2008}, pages = {630{\textendash}634}, issn = {0306-9877}, doi = {10.1016/j.mehy.2007.07.021}, url = {http://www.sciencedirect.com/science/article/B6WN2-4PK8B8Y-5/2/f5ccc9584cc2e8c2095731c9be9d4a31}, author = {Albert Oliveras and Espel-Masferrer, E.} } @inbook {bCanton-Ferrer08a, title = {Exploiting Structural Hierarchy in Articulated Objects Towards Robust Motion Capture}, booktitle = {Articulated Motion and Deformable Objects}, volume = {5098}, year = {2008}, pages = {82{\textendash}91}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

This paper presents a general analysis framework towards exploiting the underlying hierarchical and scalable structure of an articulated object for pose estimation and tracking. The Scalable Human Body Model (SHBM) is presented as a set of human body models ordered following a hierarchy criterion. The concept of annealing is applied to derive a generic particle filtering scheme able to perform a sequential filtering over the models contained in the SHBM, leading to a structural annealing process. This scheme is applied to perform human motion capture in a multi-camera environment. Finally, the effectiveness of the proposed system is assessed by comparing its performance with the standard and annealed particle filtering approaches over an annotated database.

}, isbn = {978-3-540-70516-1}, doi = {10.1007/978-3-540-70517-8_9}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cCanton-Ferrer08, title = {Exploiting Structural Hierarchy in Articulated Objects Towards Robust Motion Capture}, booktitle = {V Conference on Articulated Motion and Deformable Objects (AMDO)}, year = {2008}, pages = {82{\textendash}91}, isbn = {0302-9743}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @inbook {bVilaplana08, title = {Face and speech interaction}, booktitle = {Multimodal user interfaces: from signals to interaction}, year = {2008}, pages = {85{\textendash}118}, isbn = {978-3-540-78344-2}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F. and Gurban, M. and Thiran, J.} } @conference {cButko08, title = {Fusion of Audio and Video Modalities for Detection of Acoustic Events}, booktitle = {Interspeech 2008, 9th Annual Conference of the International Speech Communication Association}, year = {2008}, month = {09/2008}, pages = {123{\textendash}126}, publisher = {ISCA}, organization = {ISCA}, address = {Brisbane, Australia}, isbn = {978-1-61567-378-0}, url = {http://www.isca-speech.org/archive/interspeech_2008/i08_0123.html}, author = {Butko, T. and Temko, A. and Nadeu, C. and Cristian Canton-Ferrer} } @conference {cCalderero08a, title = {General region merging approaches based on information theory statistical measures}, booktitle = {IEEE International Conference on Image Processing}, year = {2008}, pages = {3016{\textendash}3019}, isbn = {1-4244-1764-3}, author = {Calderero, F. and Marqu{\'e}s, F.} } @conference {cCalderero08, title = {General Region Merging Based on First Order Markov Information Theory Statistical Measures}, booktitle = {16th European Signal Processing Conference}, year = {2008}, author = {Calderero, F. and Marqu{\'e}s, F.} } @inbook {bCanton-Ferrer08, title = {Head Orientation Estimation Using Particle Filtering in Multiview Scenarios}, booktitle = {Multimodal Technologies for Perception of Humans}, volume = {4625}, year = {2008}, pages = {317{\textendash}327}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

This paper presents a novel approach to the problem of determining head pose estimation and face 3D orientation of several people in low resolution sequences from multiple calibrated cameras. Spatial redundancy is exploited and the head in the scene is approximated by an ellipsoid. Skin patches from each detected head are located in each camera view. Data fusion is performed by back-projecting skin patches from single images onto the estimated 3D head model, thus providing a synthetic reconstruction of the head appearance. A particle filter is employed to perform the estimation of the head pan angle of the person under study. A likelihood function based on the face appearance is introduced. Experimental results proving the effectiveness of the proposed algorithm are provided for the SmartRoom scenario of the CLEAR Evaluation 2007 Head Orientation dataset.

}, isbn = {978-3-540-68584-5}, doi = {10.1007/978-3-540-68585-2_30}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cRolon08, title = {Image compression with generalized lifting and partial knowledge of the signal pdf}, booktitle = {IEEE International Conference on Image Processing, ICIP 2008}, year = {2008}, pages = {250{\textendash}254}, address = {San Diego, USA}, isbn = {88-86179-83-9}, author = {Rol{\'o}n, J. and Salembier, P. and Alameda, X.} } @conference {cButko08a, title = {Inclusion of video information for detection of acoustic events using the fuzzy integral}, booktitle = {Machine Learning for Multimodal Interaction: 5th International Workshop}, year = {2008}, pages = {74{\textendash}85}, isbn = {978-3540858522}, author = {Butko, T. and Temko, A. and Nadeu, C. and Cristian Canton-Ferrer} } @conference {cCabrera08, title = {Lavicad: laboratorio virtual de comunicaciones anal{\'o}gicas y digitales}, booktitle = {XXIII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {2008}, month = {09/2008}, pages = {1{\textendash}4}, address = {Madrid, Spain}, abstract = {

The presented experience consists of the {\textquotedblleft}design of{\textquotedblright} and {\textquotedblleft}experimentation with{\textquotedblright} a virtual laboratory of analog and digital communications: LAVICAD. It has proven to be a useful tool to verify the performance of different communication systems and signal processing techniques, topics typically covered in undergraduate courses of the telecommunications engineering curriculum. The communication systems have been designed and implemented as freely accessible Java applets and can be run on the e-learning platform comweb.upc.edu. The different communication systems present different levels of user interactivity, and when students execute a system integrated in a comweb course, the obtained results can be supervised by the professor as an evaluation and assessment tool. From a pedagogical point of view, the main advantage of using a virtual laboratory is that it can facilitate the learning of certain matters, acting as a connection between the model of knowledge based on concepts and theories and its practical understanding and experimentation.


}, isbn = {978-84-612-6291-5}, author = {Cabrera, M. and Xavier Gir{\'o}-i-Nieto and Rey, F.} } @conference {cDimiccoli08, title = {Monocular Depth by Nonlinear Diffusion}, booktitle = {IEEE Sixth Indian Conference on Computer Vision, Graphics \& Image Processing, ICVGIP 2008}, year = {2008}, pages = {140{\textendash}148}, address = {Bhubaneswar, India}, author = {Dimiccoli, M. and Morel, J. and Salembier, P.} } @article {aMarcello08, title = {Motion estimation techniques to automatically track oceanographic thermal structures in multi-sensor image sequences}, journal = {IEEE transactions on geoscience and remote sensing}, volume = {46}, number = {9}, year = {2008}, pages = {2743{\textendash}2762}, issn = {0196-2892}, author = {Marcello, J. and F. Eugenio and Marqu{\'e}s, F. and Hernandez-Guerra, A. and Gasull, A.} } @article {aSalah08, title = {Multimodal identification and localization of users in a smart environment}, journal = {Journal on Multimodal user interfaces}, volume = {2}, number = {2}, year = {2008}, pages = {75{\textendash}91}, issn = {1783-7677}, doi = {10.1007/s12193-008-0008-y}, url = {http://hdl.handle.net/2117/6326}, author = {Salah, A. and Morros, J.R. and Luque, J. and Segura, C. and Hernando, J. and Ambekar, O. and Schouten, B. and Pauwels, E.} } @conference {cCanton-Ferrer08b, title = {Multimodal Real-Time Focus of Attention Estimation in SmartRooms}, booktitle = {CVPR 2008 Workshop on Human Communicative Behavior Analysis}, year = {2008}, pages = {1{\textendash}4}, isbn = {978-1-4244-2340}, author = {Cristian Canton-Ferrer and Segura, C. and M. Pard{\`a}s and Casas, J. and Hernando, J.} } @inbook {bCanton-Ferrer08b, title = {Multi-Person Tracking Strategies Based on Voxel Analysis}, booktitle = {Multimodal Technologies for Perception of Humans}, volume = {4625}, year = {2008}, pages = {91{\textendash}103}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

This paper presents two approaches to the problem of simultaneous tracking of several people in low resolution sequences from multiple calibrated cameras. Spatial redundancy is exploited to generate a discrete 3D binary representation of the foreground objects in the scene. Color information obtained from a zenithal camera view is added to this 3D information. The first tracking approach implements heuristic association rules between blobs labelled according to spatiotemporal connectivity criteria. Association rules are based on a cost function which considers their placement and color histogram. In the second approach, a particle filtering scheme adapted to the incoming 3D discrete data is proposed. A volume likelihood function and a discrete 3D re-sampling procedure are introduced to evaluate and drive particles. Multiple targets are tracked by means of multiple particle filters and interaction among them is modeled through a 3D blocking scheme. Evaluation over the CLEAR 2007 database yields quantitative results assessing the performance of the proposed algorithm for indoor scenarios.

}, isbn = {978-3-540-68584-5}, doi = {10.1007/978-3-540-68585-2_7}, url = {http://www.springerlink.com/content/k08w4000844220gr/}, author = {Cristian Canton-Ferrer and Salvador, J. and Casas, J. and M. Pard{\`a}s} } @conference {cHaro08, title = {On the non-uniform complexity of brain connectivity}, booktitle = {5th IEEE International Symposium on Biomedical Imaging (ISBI 2008)}, year = {2008}, month = {05/2008}, publisher = {IEEE}, organization = {IEEE}, address = {Paris}, abstract = {

A stratification and manifold learning approach for analyzing High Angular Resolution Diffusion Imaging (HARDI) data is introduced in this paper. HARDI data provides high-dimensional signals measuring the complex microstructure of biological tissues, such as the cerebral white matter. We show that these high-dimensional spaces may be understood as unions of manifolds of varying dimensions/complexity and densities. With such analysis, we use clustering to characterize the structural complexity of the white matter. We briefly present the underlying framework and numerical experiments illustrating this original and promising approach.

}, keywords = {biodiffusion, biological tissues, biology computing, brain, brain connectivity, cellular biophysics, cerebral white matter, Clustering methods, complex microstructure, density, Density measurement, Diffusion tensor imaging, Geometry, high-angular resolution diffusion imaging, high-dimensional spaces, High-resolution imaging, Image analysis, Image resolution, Magnetic resonance imaging, manifold learning approach, Microstructure, molecular biophysics, nonuniform complexity, Point processes, Poisson processes, Signal resolution, stratification learning, Switches, Tensile stress, Unsupervised learning}, doi = {10.1109/ISBI.2008.4541139}, author = {Haro, G. and Lenglet, C. and Sapiro, Guillermo and Thompson, P.} } @inbook {bSalembier08, title = {Operateur connexe et arbre des coupes}, booktitle = {Morphologie math{\'e}matique 1: approches deterministes}, year = {2008}, pages = {151{\textendash}172}, publisher = {Hermes, Lavoisier}, organization = {Hermes, Lavoisier}, edition = {L. Najman and H. Talbot (Eds.)}, isbn = {978-2-7462-1841-3}, author = {Salembier, P.} } @conference {cCanton-Ferrer08a, title = {Particle Filtering and Sparse Sampling for Multi-Person 3D Tracking}, booktitle = {IEEE International Conference on Image Processing}, year = {2008}, pages = {2644{\textendash}2647}, isbn = {1-4244-1764-3}, doi = {10.1109/ICIP.2008.4712337}, author = {Cristian Canton-Ferrer and Sblendido, R. and Casas, J. and M. Pard{\`a}s} } @conference {cVilaplana08, title = {Region-based mean shift tracking: Application to face tracking}, booktitle = {IEEE International Conference on Image Processing}, year = {2008}, pages = {2712{\textendash}2715}, isbn = {978-1-4577-0538-0}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @conference {cGallego08, title = {Segmentation and Tracking of Static and Moving Objects in Video Surveillance Scenarios}, booktitle = {IEEE International Conference on Image Processing}, year = {2008}, pages = {2716{\textendash}2719}, isbn = {1-4244-1764-3}, author = {Gallego, J. and M. Pard{\`a}s} } @article {aLandabaso08a, title = {Shape from inconsistent silhouette}, journal = {Computer vision and image understanding}, volume = {112}, number = {2}, year = {2008}, month = {11/2008}, pages = {210{\textendash}224}, abstract = {

Shape from silhouette (SfS) is the general term used to refer to the techniques that obtain a volume estimate from a set of binary images. In a first step, a number of images are taken from different positions around the scene of interest. Later, each image is segmented to produce binary masks, also called silhouettes, to delimit the objects of interest. Finally, the volume estimate is obtained as the maximal one which yields the silhouettes. The set of silhouettes is usually considered to be consistent, which means that there exists at least one volume which completely explains them. However, silhouettes are normally inconsistent due to inaccurate calibration or erroneous silhouette extraction techniques. In spite of that, SfS techniques reconstruct only that part of the volume which projects consistently in all the silhouettes, leaving the rest unreconstructed. In this paper, we extend the idea of SfS to be used with sets of inconsistent silhouettes. We propose a fast technique for estimating that part of the volume which projects inconsistently and propose a criterion for classifying it by minimizing the probability of misclassification, taking into account the 2D error detection probabilities of the silhouettes. A number of theoretical and empirical results are given, showing that the proposed method reduces the reconstruction error.

}, issn = {1077-3142}, doi = {10.1016/j.cviu.2008.02.006}, author = {Landabaso, J. and M. Pard{\`a}s and Casas, J.} } @conference {cLandabaso08, title = {Shape from Inconsistent Silhouette for Free Viewpoint Video}, booktitle = {IEEE International Conference on Image Processing}, year = {2008}, pages = {213{\textendash}216}, isbn = {1-4244-1764-3}, author = {Landabaso, J. and Lizcano, L. and M. Pard{\`a}s} } @conference {cSalvador08, title = {Shape from Probability Maps with Image-Adapted Voxelization}, booktitle = {ECCV Workshop on Multi-camera and Multi-modal Sensor Fusion Algorithms and Applications (M2SFA2)}, year = {2008}, pages = {1{\textendash}12}, isbn = {0-7803-9076-8}, author = {Salvador, J. and Casas, J.} } @conference {cGiro-i-Nieto08, title = {System architecture for indexing regions in keyframes}, booktitle = {3rd International Conference on Semantic and digital Media Technologies}, year = {2008}, month = {12/2008}, pages = {1{\textendash}2}, address = {Koblenz, Germany}, abstract = {

This paper describes the design of an indexing system for a video database. The system uses region-based manual annotations of keyframes to create models to automatically annotate new keyframes, also at the region level. The presented architecture includes user interfaces for training and querying the system, internal databases to manage ingested content and modelled semantic classes, as well as communication interfaces to allow the system interconnection. The scheme is designed to work as a plug-in to an external Multimedia Asset Management (MAM) system.

}, keywords = {architecture, Indexing, keyframe, MAM, region}, url = {http://hdl.handle.net/2117/10893}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cGudmundsson08, title = {TOF Imaging in Smart Room Environments towards Improved People Tracking}, booktitle = {Computer Vision and Pattern Recognition: Workshop on Time of Flight based Computer Vision (CVPR/TOF-CV)}, year = {2008}, pages = {1{\textendash}6}, author = {Gudmundsson, S. and M. Pard{\`a}s and Casas, J. and Aanaes, H. and Larsen, R.} } @article {aCraene08, title = {Unbiased group-wise alignment by iterative central tendency estimations}, journal = {Mathematical modeling of natural phenomena}, volume = {3}, number = {6}, year = {2008}, pages = {2{\textendash}32}, issn = {0973-5348}, url = {http://hdl.handle.net/2117/8140}, author = {de Craene, M. and Macq, B. and Marqu{\'e}s, F. and Salembier, P. and Warfield, S.} } @phdthesis {dLandabaso08, title = {A Unified Framework for Consistent 2D/3D Foreground Object Detection}, year = {2008}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Landabaso, J.}, editor = {M. Pard{\`a}s} } @article {aLandabaso08, title = {A Unified Framework for Consistent 2D/3D Foreground Object Detection}, journal = {IEEE transactions on circuits and systems for video technology}, volume = {18}, number = {8}, year = {2008}, pages = {1040{\textendash}1051}, issn = {1051-8215}, author = {Landabaso, J. and M. Pard{\`a}s} } @conference {cLuque06a, title = {Audio, Video and Multimodal Person Identification in a Smart Room}, booktitle = {CLEAR{\textquoteright}06 Evaluation Campaign and Workshop - Classification of Events, Activities and Relationships}, year = {2007}, pages = {258{\textendash}269}, isbn = {978-3-540-69567-7}, author = {Luque, J. and Morros, J.R. and Garde, A. and Anguita, J. and Farr{\'u}s, M. and Macho, D. and Marqu{\'e}s, F. and Mart{\'\i}nez, C. and Ver{\'o}nica Vilaplana and Hernando, J.} } @article {aCorrea07, title = {Bayesian approach for morphology based 2D human motion capture}, journal = {IEEE transactions on multimedia}, volume = {9}, number = {4}, year = {2007}, pages = {754{\textendash}765}, issn = {1520-9210}, author = {Correa, P. and Czyz, J. and Marqu{\'e}s, F. and Umeda, T. and Marichal, X. and Macq, B.} } @conference {cVilaplana07b, title = {On building a hierarchical region-based representation for generic image analysis}, booktitle = {IEEE International Conference on Image Processing}, year = {2007}, isbn = {978-1-61284-467-1}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @article {aMostefa07, title = {The CHIL Audiovisual Corpus for Lecture and Meeting Analysis inside Smart Rooms}, journal = {Language resources and evaluation}, volume = {41}, number = {3}, year = {2007}, month = {01/2008}, pages = {389{\textendash}407}, abstract = {

The analysis of lectures and meetings inside smart rooms has recently attracted much interest in the literature, being the focus of international projects and technology evaluations. A key enabler for progress in this area is the availability of appropriate multimodal and multi-sensory corpora, annotated with rich human activity information during lectures and meetings. This paper is devoted to exactly such a corpus, developed in the framework of the European project CHIL, {\textquotedblleft}Computers in the Human Interaction Loop{\textquotedblright}. The resulting data set has the potential to drastically advance the state of the art, by providing numerous synchronized audio and video streams of real lectures and meetings, captured in multiple recording sites over the past 4 years. It particularly overcomes typical shortcomings of other existing databases that may contain limited sensory or monomodal data, exhibit constrained human behavior and interaction patterns, or lack data variability. The CHIL corpus is accompanied by rich manual annotations of both its audio and visual modalities. These provide a detailed multi-channel verbatim orthographic transcription that includes speaker turns and identities, acoustic condition information, and named entities, as well as video labels in multiple camera views that provide multi-person 3D head and 2D facial feature location information. Over the past 3 years, the corpus has been crucial to the evaluation of a multitude of audiovisual perception technologies for human activity analysis in lecture and meeting scenarios, demonstrating its utility during internal evaluations of the CHIL consortium, as well as at the recent international CLEAR and Rich Transcription evaluations. The CHIL corpus is publicly available to the research community.

}, issn = {1574-020X}, doi = {10.1007/s10579-007-9054-4}, author = {Mostefa, D. and Moreau, N. and Choukri, K. and Potamianos, G. and Chu, S. and Tyagi, A. and Casas, J. and Turmo, J. and Cristoforetti, L. and Tobia, F. and Pnevmatikakis, A. and Mylonakis, V. and Talantzis, F. and Burger, S. and Stiefelhagen, R. and Bernardin, K. and Rochet, C.} } @conference {cGiro-i-Nieto07, title = {Composite object detection in video sequences: Applications to controlled environments}, booktitle = {8th International Workshop on Image Analysis for Multimedia Interactive Services}, year = {2007}, month = {06/2007}, publisher = {IEEE}, organization = {IEEE}, address = {Santorini, Greece}, abstract = {

This paper presents a set of techniques for the detection of composite objects in video recordings of a controlled environment. Firstly, a selective region-based analysis is performed by tuning the algorithm to the perceptual characteristics of the object in the environment. Secondly, the controlled perceptual and semantic variabilities of the object are addressed by the detection analysis thanks to a frame by frame update of the object models, and by allowing multiple models for a single object. The proposed techniques are illustrated in the detection of laptops from a zenithal view in a smart room.

}, keywords = {image region analysis, image sequence analysis, Object detection, object recognition, pattern recognition}, isbn = {0-7695-2818-X}, doi = {10.1109/WIAMIS.2007.31}, url = {http://dx.doi.org/10.1109/WIAMIS.2007.31}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @inbook {bCasas07, title = {Context Awareness triggered by Multiple Perceptual Analyzers}, booktitle = {Emerging Artificial Intelligence Applications in Computer Engineering}, volume = {160}, year = {2007}, pages = {371{\textendash}383}, publisher = {IOS Press}, organization = {IOS Press}, address = {Amsterdam}, abstract = {

A multitude of technologies from computer vision, acoustic signal analysis and natural language processing are used to implement multi-modal perceptual components. The output of this analysis is used to gain context awareness {\textendash} a necessity when designing a computer-based service that interacts reactively and proactively with humans. This article describes the integration process and our experience in implementing one such information service, the {\textquotedblleft}Memory Jog{\textquotedblright}, in a particular scenario where the computer system supports a group of journalists in their daily work.

}, isbn = {978-1-58603-780-2}, url = {http://www.booksonline.iospress.nl/Content/View.aspx?piid=6976}, author = {Casas, J. and Neumann, J.} } @conference {cAsteriadis07, title = {Detection of facial characteristics based on edge information}, booktitle = {International Conference on Computer Vision Theory and Applications, VISAPP 2007}, year = {2007}, pages = {247{\textendash}250}, isbn = {978-972-8865-74-0}, author = {Asteriadis, S. and Nikolaidis, N. and Pitas, I. and M. Pard{\`a}s} } @book {eAguilar07, title = {Diccionari de Telecomunicacions}, year = {2007}, isbn = {978-84-412-1459-0}, author = {Aguilar, M. and Alcober, J. and Altes, J. and Aragones, X. and Artigas, D. and Bardes, D. and Barlabe, A. and Bragos, R. and Calderer, J. and Cardama, A. and Casademont, J. and Casals, L. and Comer{\'o}n, A. and Cotrina, J. and Cruz, L. and Dios, V. and Duxans, H. and Esparza, O. and Esquerra, I. and Garcia, D. and Garcias, P. and Gomez, C. and Gorricho, J. and Guinjoan, F. and Hesselbach, X. and Liria, A. and Lopez, J. and Madrenas, J. and Madue{\~n}o, M. and Mestre, F. and Monte, E. and Morros, J.R. and Mu{\~n}oz, J. and Pallar{\'e}s, E. and Pons, J. and Recolons, J. and Rincon, D. and Riu, P. and Pradell, L. and Pascual-Iserte, A. and Prat, L. and Rey, F. and Villares, J.} } @conference {cMorros07, title = {Event recognition for meaningful Human-Computer interaction in smart environments}, booktitle = {Proceedings of the eNTERFACE{\textquoteright}07 Workshop on Multimodal Interfaces}, year = {2007}, month = {08/2007}, abstract = {

The aim of this project is to monitor a room for the purposes of analysing the interactions and identities of a small set of individuals. We work with multiple uncalibrated sensors that observe a single environment and generate multimodal data streams. These streams are processed with the help of a generic client-server middleware called SmartFlow. Modules for visual motion detection, visual face tracking, visual face identification, visual opportunistic sensing, audio-based localization, and audio-based identification are implemented and integrated under SmartFlow to work in coordination across different platforms.

}, keywords = {Audio systems, Image motion analysis, Image sensors, pattern recognition}, isbn = { 978-2874631054}, author = {Morros, J.R. and Albert Ali Salah and Ben Schouten and Carlos Segura Perales and Jordi Luque Serrano and Onkar Ambekar and Ceren Kayalar and Cem Keskin and Lale Akarun} } @conference {cVilaplana07a, title = {Face detection and segmentation on a hierarchical image representation}, booktitle = {15th European Signal Processing Conference}, year = {2007}, pages = {1955{\textendash}1959}, isbn = {0-7803-7403-7}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @conference {cPetras07, title = {Flexible test-bed for unusual behavior detection}, booktitle = {6th ACM International Conference on Image and Video Retrieval}, year = {2007}, pages = {105{\textendash}108}, isbn = {0 646 34170 7}, author = {Petras, I. and Beleznai, C. and Dedeoglu, Y. and M. Pard{\`a}s and Kov{\'a}cs, L. and Szl{\'a}vik, Z. and Havasi, L. and Szir{\'a}nyi, T. and T{\"o}reyin, B. and G{\"u}d{\"u}kbay, U. and Cetin, E. and Cristian Canton-Ferrer} } @conference {cRolon07, title = {Generalized Lifting For Sparse Image Representation and Coding}, booktitle = {Picture Coding Symposium, PCS 2007}, year = {2007}, pages = {234{\textendash}238}, address = {Lisbon, Portugal}, isbn = {88-86179-83-9}, author = {Rol{\'o}n, J. and Salembier, P.} } @article {aSole07a, title = {Generalized lifting prediction optimization applied to lossless image compression}, journal = {IEEE signal processing letters}, volume = {14}, number = {10}, year = {2007}, pages = {695{\textendash}698}, issn = {1070-9908}, author = {Sol{\'e}, J. and Salembier, P.} } @conference {cDimiccoli07a, title = {Geometrical Filtering Scheme with Connected Operators and Image Inpainting}, booktitle = {SPIE Visual Communcations and Image Processing 2007}, year = {2007}, pages = {1{\textendash}14}, address = {San Jose, CA, USA}, author = {Dimiccoli, M. and Salembier, P.} } @conference {cCanton-Ferrer07a, title = {Head Orientation Estimation using Particle Filtering in Multiview Scenarios}, booktitle = {CLEAR{\textquoteright}07 Second International Evaluation Workshop on Classification of Events, Activities and Relationships}, year = {2007}, pages = {1{\textendash}11}, isbn = {978-3-540-68584-5}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @inbook {bCanton-Ferrer07, title = {Head Pose Detection based on Fusion of Multiple Viewpoint Information}, booktitle = {Multimodal Technologies for Perception of Humans}, volume = {4122}, year = {2007}, pages = {305{\textendash}310}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

This paper presents a novel approach to the problem of determining head pose estimation and face 3D orientation of several people in low resolution sequences from multiple calibrated cameras. Spatial redundancy is exploited and the head in the scene is detected and geometrically approximated by an ellipsoid. Skin patches from each detected head are located in each camera view. Data fusion is performed by back-projecting skin patches from single images onto the estimated 3D head model, thus providing a synthetic reconstruction of the head appearance. Finally, these data are processed in a pattern analysis framework thus giving an estimation of face orientation. Tracking over time is performed by Kalman filtering. Results of the proposed algorithm are provided in the SmartRoom scenario of the CLEAR Evaluation.

}, isbn = {978-3-540-69567-7}, doi = {10.1007/978-3-540-69568-4_28}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cDorea07, title = {Hierarchical partition-based representations for image sequences using trajectory merging criteria}, booktitle = {2007 International Conference on Acoustics, Speech and Signal Processing}, year = {2007}, author = {Dorea, C. and M. Pard{\`a}s and Marqu{\'e}s, F.} } @phdthesis {dChang07, title = {Hierarchical Partition-Based Representations of Motion-Coherent Regions For Video Object Segmentation}, year = {2007}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Chang, C.}, editor = {M. Pard{\`a}s and Marqu{\'e}s, F.} } @conference {cDorea07a, title = {A hierarchical trajectory-based representation for video}, booktitle = {Fith International Workshop on Content-Based Multimedia Indexing}, year = {2007}, pages = {275{\textendash}282}, isbn = {1-4244-1011-8}, author = {Dorea, C. and M. Pard{\`a}s and Marqu{\'e}s, F.} } @conference {cTurkan07, title = {Human eye localization using edge projections}, booktitle = {International Conference on Computer Vision Theory and Applications, VISAPP 2007}, year = {2007}, pages = {410{\textendash}415}, isbn = {-}, author = {Turkan, M. and M. Pard{\`a}s and Cetin, E.} } @conference {cRuiz-Hidalgo07, title = {Long term selection of reference frame sub-blocks using MPEG-7 indexing metadata}, booktitle = {International Conference on Acoustics, Speech and Signal Processing, ICASSP 2007}, year = {2007}, month = {04/2007}, pages = {669{\textendash}672}, address = {Honolulu, Hawaii}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @conference {cSegura07, title = {Multimodal Head Orientation towards Attention Tracking in Smart Rooms}, booktitle = {2007 International Conference on Acoustics, Speech and Signal Processing}, year = {2007}, author = {Segura, C. and Abad, A. and Casas, J. and Hernando, J.} } @conference {cLopez-Mendez07, title = {Multi-person 3D Tracking with Particle Filters on Voxels}, booktitle = {2007 International Conference on Acoustics, Speech and Signal Processing}, year = {2007}, author = {L{\'o}pez-M{\'e}ndez, A. and Cristian Canton-Ferrer and Casas, J.} } @conference {cCanton-Ferrer07, title = {Multi-Person Tracking Strategies Based on Voxel Analysis}, booktitle = {CLEAR{\textquoteright}07 Second International Evaluation Workshop on Classification of Events, Activities and Relationships}, year = {2007}, pages = {1{\textendash}12}, isbn = {978-3-540-68584-5}, author = {Cristian Canton-Ferrer and Salvador, J. and Casas, J. and M. Pard{\`a}s} } @conference {cCalderero07, title = {Multiple view region matching using as a Lagrangian optimization problem}, booktitle = {2007 International Conference on Acoustics, Speech and Signal Processing}, year = {2007}, author = {Calderero, F. and Marqu{\'e}s, F. and Ortega, A.} } @conference {cDimiccoli07, title = {Perceptual filtering with connected operators and image inpainting}, booktitle = {Proc. of International Symposium on Mathematical Morphology, ISMM 2007}, year = {2007}, pages = {227{\textendash}238}, address = {Rio de Janeiro, Brazil}, isbn = {978-85-17-00032-4}, author = {Dimiccoli, M. and Salembier, P.} } @article {aDescampe07, title = {Prefetching and caching strategies for remote and interactive browsing of JPEG2000 images}, journal = {IEEE transactions on image processing}, volume = {16}, number = {5}, year = {2007}, pages = {1339{\textendash}1354}, issn = {1057-7149}, author = {Descampe, A. 
and Vleeschouwer, D. and Iregui, M. and Macq, B. and Marqu{\'e}s, F.} } @conference {cCabrera07, title = {Proyecto de Innovaci{\'o}n Docente COM@WEB (COMunicaciones en la WEB)}, booktitle = {VIII Simposio Nacional de Tecnolog{\'\i}as de la Informaci{\'o}n y las Comunicaciones en la Educaci{\'o}n. SINTICE{\textquoteright}07}, year = {2007}, month = {09/2007}, pages = {113{\textendash}120}, abstract = {

COMalaWEB stands for Comunicaciones en la Web: an e-learning platform that offers university students different online teaching resources with the aim of supporting their study, mainly in the areas of signal processing and communications.

The platform is interactive and has been designed to promote e-learning experimentation and the acquisition of study habits based on the use of new technologies.

In the short term, the system will be able to generate automatic study itineraries for each student, based on the performance obtained in self-assessment tests and on information packaged as metadata and integrated in a database of learning objects.

In the medium and long term, the goal of the COMalaWEB project is to become a meeting point on the web for students, teachers, and professionals related to the telecommunications field and university-level teaching.

One of the courses integrated in COMalaWEB is LaViCAD, a freely accessible virtual laboratory of analog and digital communications based on the simulation of different communication systems, which can be used both in face-to-face teaching and in distance education.

The content of the project is currently available on the platform: http://comweb.upc.edu/

}, isbn = {978-84-9732-597-4}, author = {Cabrera, M. and Closas, P. and Alfredo, L. and Xavier Gir{\'o}-i-Nieto and Rey, F.} } @article {aSole07, title = {Quadratic Interpolation and Linear Lifting Design}, journal = {Eurasip Journal on Applied Signal Processing}, volume = {1}, number = {1}, year = {2007}, pages = {1{\textendash}9}, issn = {1110-8657}, author = {Sol{\'e}, J. and Salembier, P.} } @conference {cGiro-i-Nieto07a, title = {Region-based annotation tool using partition trees}, booktitle = {International Conference on Semantic and Digital Media Technologies}, year = {2007}, month = {12/2007}, pages = {3{\textendash}4}, address = {Genova, Italy}, abstract = {

This paper presents an annotation tool for the manual and region-based annotation of still images. The selection of regions is achieved by navigating through a Partition Tree, a data structure that offers a multiscale representation of the image. The user interface provides a framework for the annotation of both atomic and composite semantic classes and generates an MPEG-7 compliant XML file.

}, url = {http://hdl.handle.net/2117/10890}, author = {Xavier Gir{\'o}-i-Nieto and Camps, N. and Marqu{\'e}s, F.} } @conference {cVilaplana07, title = {Region-based hierarchical representation for object detection}, booktitle = {Fith International Workshop on Content-Based Multimedia Indexing}, year = {2007}, pages = {157{\textendash}164}, isbn = {0-7803-6727-8}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @mastersthesis {xGil-Moreno07, title = {Sistema de gesti{\'o} de v{\'\i}deo off-line per una smart-room}, year = {2007}, keywords = {qt, recording, smartroom}, author = {A. Gil-Moreno}, editor = {Casas, J.} } @conference {cHaro07, title = {Stratification Learning: Detecting Mixed Density and Dimensionality in High Dimensional Point Clouds}, booktitle = {Neural Information Processing Systems NIPS}, year = {2007}, month = {12/2007}, publisher = {NIPS}, organization = {NIPS}, address = {Montreal}, abstract = {

The study of point cloud data sampled from a stratification, a collection of manifolds with possibly different dimensions, is pursued in this paper. We present a technique for simultaneously soft clustering and estimating the mixed dimensionality and density of such structures. The framework is based on a maximum likelihood estimation of a Poisson mixture model. The presentation of the approach is completed with artificial and real examples demonstrating the importance of extending manifold learning to stratification learning.

}, author = {Haro, G. and Randall, Gregory and Sapiro, Guillermo} } @article {aSalembier07, title = {Structure description tools}, journal = {Journal of the American Society for Information Science}, volume = {58}, number = {9}, year = {2007}, pages = {1329{\textendash}1337}, issn = {0002-8231}, author = {Salembier, P. and Benitez, A.B} } @inbook {bAbad07, title = {UPC Audio, Video and Multimodal Person Tracking Systems in the CLEAR Evaluation Campaign}, booktitle = {Lecture notes in computer science}, volume = {4122}, year = {2007}, pages = {93{\textendash}104}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

Reliable measures of person positions are needed for computational perception of human activities taking place in a smart-room environment. In this work, we present the Person Tracking systems developed at UPC for audio, video and audio-video modalities in the context of the EU-funded CHIL project research activities. The aim of the designed systems, and particularly of the new contributions proposed, is to perform robustly in both single- and multi-person localization tasks, independently of the environmental conditions. Besides the technology description, experimental results conducted for the CLEAR evaluation workshop are also reported.

}, isbn = {978-3-540-69567-7}, doi = {10.1007/978-3-540-69568-4_6}, author = {Abad, A. and Cristian Canton-Ferrer and Segura, C. and Landabaso, J. and Macho, D. and Casas, J. and Hernando, J. and M. Pard{\`a}s and Nadeu, C.} } @conference {cCanton-Ferrer06, title = {3D Human Action Recognition In Multiple View Scenarios}, booktitle = {2ones Jornades UPC de Investigaci{\'o}n en Autom{\'a}tica, Visi{\'o}n y Rob{\'o}tica}, year = {2006}, pages = {1{\textendash}5}, isbn = {84-7653-885-5}, author = {Cristian Canton-Ferrer and Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s and Sargin, M. and Tekalp, M.} } @conference {cSole06a, title = {Adaptive Quadratic Interpolation Methods for Lifting Steps Construction}, booktitle = {IEEE International Symposium on Signal Processing and Information Technology, , ISSPIT 2006}, year = {2006}, pages = {1{\textendash}8}, address = {Vancouver, Canada}, isbn = {1-4244-1437-7}, author = {Sol{\'e}, J. and Salembier, P.} } @inbook {bLuque06, title = {Audio, Video and Multimodal Person Identification in a Smart Room}, booktitle = {Lecture notes in computer science - Multimodal Technologies for Perception of Humans}, volume = {4122}, year = {2006}, pages = {258{\textendash}269}, issn = {0302-9743}, doi = {10.1007/978-3-540-69568-4_23}, author = {Luque, J. and Morros, J.R. and Garde, A. and Anguita, J. and Farr{\'u}s, M. and Macho, D. and Marqu{\'e}s, F. and Mart{\'\i}nez, C. and Ver{\'o}nica Vilaplana and Hernando, J.} } @conference {cFerran06, title = {BPT Enhancement based on Syntactic and Semantic criteria}, booktitle = {1st International Conference on Semantic and Digital Media Technologies}, year = {2006}, pages = {184{\textendash}198}, isbn = {3-540-49335-2}, author = {Ferran, C. and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Casas, J.} } @inbook {bFerran06, title = {BPT Enhancement based on Syntactic and Semantic criteria}, booktitle = {Semantic Multimedia}, volume = {4306}, year = {2006}, pages = {184{\textendash}198}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

This paper presents two enhancements for the creation and analysis of Binary Partition Trees (BPTs). Firstly, the classic creation of BPT based on colour is expanded to include syntactic criteria derived from human perception. Secondly, a method to include semantic information in the BPT analysis is shown thanks to the definition of the BPT Semantic Neighborhood and the introduction of Semantic Trees. Both techniques aim at bridging the semantic gap between signal and semantics following a bottom-up and a top-down approach, respectively.

}, isbn = {978-3-540-49335-8}, doi = {10.1007/11930334_15}, url = {http://www.springerlink.com/content/u7201mw06545w057/}, author = {Ferran, C. and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Casas, J.} } @conference {cSole06, title = {A Common Formulation for Interpolation, Prediction, and Update Lifting Design,}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2006}, year = {2006}, pages = {1{\textendash}4}, address = {Toulouse, France}, isbn = {978-83-921340-2-2}, author = {Sol{\'e}, J. and Salembier, P.} } @conference {cLandabaso06, title = {Cooperative background modelling using multiple cameras towards human detection in smart-room}, booktitle = {14th European Signal Processing Conference}, year = {2006}, pages = {1{\textendash}5}, author = {Landabaso, J. and M. Pard{\`a}s} } @phdthesis {dMarcello06, title = {Desarrollo de t{\'e}cnicas de procesado de im{\'a}genes, multitemporales y multisensoriales, de teledetecci{\'o}n para la detecci{\'o}n y seguimiento de estructuras oceanogr{\'a}ficas}, year = {2006}, school = {Universidad de Las Palmas de Gran Canaria (ULPGC)}, type = {phd}, author = {Marcello, J.}, editor = {Marqu{\'e}s, F. and F. Eugenio} } @phdthesis {dCorrea06, title = {Dual morphology-based and Bayesian approach for markerless human motion capture in natural interaction environments}, year = {2006}, school = {Universit{\'e} Catholique de Louvain (UCL)}, type = {phd}, url = {http://hdl.handle.net/2078.1/5033}, author = {Correa, P.}, editor = {Marqu{\'e}s, F. and Macq, B.} } @conference {cGiro-i-Nieto06, title = {The edition of the Wikipedia as an academic activity}, booktitle = {4rt. Congr{\'e}s Internacional de Doc{\`e}ncia Unversit{\`a}ria i Innovaci{\'o}}, year = {2006}, month = {07/2006}, pages = {{\textendash}}, address = {Barcelona, Catalonia}, abstract = {

This paper presents a learning activity around Wikipedia, a free online encyclopaedia written by its users. Students are asked to write and review entries related to the course topics in the collaborative environment provided by the wiki tools. This paper proposes a seventeen-step methodology for this task in the framework of an academic course organized by topics. The activity has been successfully introduced in two different schools of the Technical University of Catalonia. In its first edition, 81 new articles were added by 64 students from the EUETIT; in the second experience, 60 articles were created and 14 reviewed by 43 students from the ETSETB.


}, keywords = {wikipedia, ~cooperative, ~online}, isbn = {84-8458-240-X}, url = {http://hdl.handle.net/2117/13157}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Casas, J.} } @conference {cVilaplana06, title = {Face Recognition Using Groups of Images in Smart Room Scenarios}, booktitle = {International Conference on Image Processing}, year = {2006}, pages = {2071{\textendash}2074}, author = {Ver{\'o}nica Vilaplana and Mart{\'\i}nez, C. and Javier, C. and Marqu{\'e}s, F.} } @article {aLandabaso06, title = {Foreground Regions Extraction and Characterization Towards Real-Time Object Tracking}, journal = {Lecture notes in computer science}, volume = {3869}, year = {2006}, pages = {241{\textendash}249}, issn = {0302-9743}, url = {http://www.springerlink.com}, author = {Landabaso, J. and M. Pard{\`a}s} } @inbook {bGiro-i-Nieto06, title = {From partition trees to semantic trees}, booktitle = {Multimedia Content Representation, Classification and Security}, volume = {4105/2006}, number = {4105}, year = {2006}, pages = {306{\textendash}313}, abstract = {

This paper proposes a solution to bridge the gap between semantic and visual information, formulated as a structural pattern recognition problem. Instances of semantic classes expressed by Description Graphs are detected on a region-based representation of visual data expressed with a Binary Partition Tree. The detection process builds instances of Semantic Trees on top of the Binary Partition Tree using an encyclopedia of models organised as a hierarchy. At the leaves of the Semantic Tree, classes are defined by perceptual models containing a list of low-level descriptors. The proposed solution is assessed in different environments to show its flexibility.

}, issn = {0302-9743}, doi = {10.1007/11848035_41}, url = {http://www.springerlink.com/content/j148713624k48u3r/}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cGiro-i-Nieto06a, title = {From Partition Trees to Semantic Trees}, booktitle = {International Workshop on Multimedia Content Representation, Classification and Security}, year = {2006}, pages = {306{\textendash}313}, isbn = {3-540-39392-7}, doi = {10.1007/11848035}, url = {http://dx.doi.org/10.1007/11848035_41}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cGiro-i-Nieto06b, title = {From Partition Trees to Semantic Trees}, booktitle = {2ndas Jornades UPC de Investigaci{\'o}n en Autom{\'a}tica, Visi{\'o}n y Rob{\'o}tica}, year = {2006}, pages = {187{\textendash}194}, isbn = {84-7653-885-5}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cDorea06, title = {Generation of long-term color and motion coherent partitions}, booktitle = {International Conference on Image Processing}, year = {2006}, pages = {581{\textendash}584}, author = {Dorea, C. and M. Pard{\`a}s and Marqu{\'e}s, F.} } @conference {cCanton-Ferrer06b, title = {Head Pose Detection based on Fusion of Multiple Viewpoint Information}, booktitle = {CLEAR{\textquoteright}06 Evaluation Campaign and Workshop - Classification of Events, Activities and Relationships}, year = {2006}, pages = {305{\textendash}310}, isbn = {978-3-540-69567-7}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cCanton-Ferrer06a, title = {Human Model and Motion Based 3D Action Recognition in Multiple View Scenarios}, booktitle = {14th European Signal Processing Conference}, year = {2006}, pages = {1{\textendash}1}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cSalvador06, title = {Image-adapted voxelization in multicamera settings}, booktitle = {International Workshop on Multimedia Signal Processing}, year = {2006}, pages = {161{\textendash}165}, isbn = {0-7803-3258-X}, author = {Salvador, J. and Casas, J.} } @conference {cCasas06, title = {Image-based Multi-view Scene Analysis using {\textquoteright}Conexels{\textquoteright}}, booktitle = {HCSNet Workshop on the Use of Vision in Human-Computer Interaction (VisHCI 2006)}, year = {2006}, pages = {203{\textendash}212}, abstract = {

Multi-camera environments allow constructing volumetric models of the scene to improve the analysis performance of computer vision algorithms (e.g. disambiguating occlusion). When representing volumetric results of image-based multi-camera analysis, a direct approach is to scan the 3D space with regular voxels. Regular voxelization is good at high spatial resolutions for applications such as volume visualization and rendering of synthetic scenes generated by geometric models, or to represent data resulting from direct 3D data capture (e.g. MRI). However, regular voxelization shows a number of drawbacks for visual scene analysis, where direct measurements on 3D voxels are not usually available. In this case, voxel values are computed as a result of the analysis of {\textquoteright}projected{\textquoteright} image data.

In this paper, we first provide some statistics to show how voxels project to {\textquoteright}unbalanced{\textquoteright} sets of image data in common multi-view analysis settings. Then, we propose a 3D geometry for multi-view scene analysis providing a better balance in terms of the number of pixels used to analyse each elementary volumetric unit. The proposed geometry is non-regular in 3D space, but becomes regular once projected onto the camera images, adapting the sampling to the images. The aim is to better exploit multi-view image data by balancing its usage across multiple cameras instead of focusing on regular sampling of 3D space, from which we do not have direct measurements. An efficient recursive algorithm using the proposed geometry is outlined. Experimental results reflect better balance and higher accuracy for multi-view analysis than regular voxelization with equivalent restrictions.

}, keywords = {epipolar geometry, Multi-view analysis, volume voxelization}, isbn = {1-920-68238-4}, url = {http://crpit.com/Vol56.html}, author = {Casas, J. and Salvador, J.} } @inbook {bNeumann06, title = {Multimodal Integration of Sensor Network}, booktitle = {Artificial Intelligence Applications and Innovations}, volume = {204}, year = {2006}, pages = {312{\textendash}323}, publisher = {Springer}, organization = {Springer}, address = {Boston}, abstract = {

At the Universitat Polit{\`e}cnica de Catalunya (UPC), a Smart Room has been equipped with 85 microphones and 8 cameras. This paper describes the setup of the sensors, gives an overview of the underlying hardware and software infrastructure and indicates possibilities for high- and low-level multi-modal interaction. An example of the use of the information collected from the distributed sensor network is explained in detail: the system supports a group of students that have to solve a problem related to a lab assignment.

}, isbn = {978-0-387-34223-8}, doi = {10.1007/0-387-34224-9_36}, author = {Neumann, J. and Casas, J. and Macho, D. and Ruiz-Hidalgo, J.} } @conference {cNeumann06, title = {Multimodal Integration of Sensor Network}, booktitle = {Proceedings of 3rd IFIP Conference on Artificial Intelligence Applications \& Innovations}, year = {2006}, month = {09/2006}, publisher = {Springer}, organization = {Springer}, address = {Athens, Greece}, isbn = {978-0-387-34223}, author = {Neumann, J. and Casas, J. and Macho, D. and Ruiz-Hidalgo, J.} } @conference {cLuque06, title = {Multimodal Person Identification in a Smart Room}, booktitle = {IV Jornadas en Tecnolog{\'\i}a del Habla}, year = {2006}, pages = {327{\textendash}331}, isbn = {84-96214-82-6}, author = {Luque, J. and Morros, J.R. and Anguita, J. and Farr{\'u}s, M. and Macho, D. and Marqu{\'e}s, F. and Mart{\'\i}nez, C. and Ver{\'o}nica Vilaplana and Hernando, J.} } @phdthesis {dSole06, title = {Optimization and Generalization of Lifting Schemes: Application to Lossless Image Compression}, year = {2006}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

This Ph.D. dissertation addresses multi-resolution image decomposition, a key issue in signal processing that in recent years has contributed to the emergence of the JPEG2000 image compression standard. JPEG2000 incorporates many interesting features, mainly due to the discrete wavelet transform stage and to the EBCOT entropy coder. Wavelet analysis performs multi-resolution decompositions that decorrelate the signal and separate information into useful frequency bands, allowing flexible post-coding. In JPEG2000, the decomposition is computed through the lifting scheme, the so-called second-generation wavelets. This fact has focused the community{\textquoteright}s interest on this tool. Many works have recently been proposed in which lifting is modified, improved, or included in a complete image coding algorithm. This dissertation follows the same research line. Lifting is analyzed, proposals are made within the scheme, and their possibilities are explored. Image compression is the main objective, and it is principally assessed by coding the transformed signal with the EBCOT and SPIHT coders. Starting from this context, the work diverges into two distinct paths: the linear one and the nonlinear one. The linear lifting filter construction is based on the idea of quadratic interpolation and the underlying linear restriction due to the wavelet transform coefficients. The result is a flexible framework that allows the creation of new transforms using different criteria and that may adapt to the image statistics. The nonlinear part is founded on the adaptive lifting scheme, which is extensively analyzed; as a consequence, a generalization of lifting is proposed. The discrete version of the generalized lifting is developed, leading to filters that achieve good compression results, especially for biomedical and remote sensing images.

}, url = {http://hdl.handle.net/10803/6897}, author = {Sol{\'e}, J.} } @conference {cDescampe06, title = {Pre-fetching strategies for remote and interactive browsing of JPEG2000 images}, booktitle = {International Conference on Image Processing}, year = {2006}, pages = {3203{\textendash}3206}, author = {Descampe, A. and De Vleeschouwer, C. and Iregui, M. and Macq, B. and Marqu{\'e}s, F.} } @inbook {bCanton-Ferrer06, title = {Projective Kalman Filter: Multiocular Tracking of 3D Locations Towards Scene Understanding}, booktitle = {Machine Learning for Multimodal Interaction}, volume = {3869}, year = {2006}, pages = {250{\textendash}261}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

This paper presents a novel approach to the problem of estimating and tracking 3D locations of multiple targets in a scene using measurements gathered from multiple calibrated cameras. Estimation and tracking are jointly achieved by a newly conceived computational process, the Projective Kalman filter (PKF), allowing the problem to be treated in a single, unified framework. The projective nature of the observed data and the information redundancy among views are exploited by the PKF in order to overcome occlusions and spatial ambiguity. To demonstrate the effectiveness of the proposed algorithm, the authors present tracking results of people in a SmartRoom scenario and compare these results with existing methods.

}, isbn = {978-3-540-32549-9}, doi = {10.1007/11677482_22}, author = {Cristian Canton-Ferrer and Casas, J. and Tekalp, M. and M. Pard{\`a}s} } @conference {cLandabaso06a, title = {Reconstruction of 3D shapes considering inconsistent 2D silhouettes}, booktitle = {International Conference on Image Processing}, year = {2006}, pages = {1{\textendash}4}, author = {Landabaso, J. and M. Pard{\`a}s and Casas, J.} } @conference {cLopez06, title = {Rotating convection: Eckhaus-Benjamin-Feir instability}, booktitle = {59th Annual Meeting of the APS Division of Fluid Dynamics}, year = {2006}, pages = {2004{\textendash}2004}, author = {Lopez, J. and Mercader, M. and Marqu{\'e}s, F. and Batiste, O.} } @conference {cGiro-i-Nieto06c, title = {Solucions de programari lliure en un projecte d{\textquoteright}adaptaci{\'o} de dades a XML}, booktitle = {V Jornades de Programari Lliure de la UPC}, year = {2006}, author = {Xavier Gir{\'o}-i-Nieto and Camps, N.} } @phdthesis {dRuiz-Hidalgo06, title = {On the Synergy between indexing and compression representations for video sequences}, year = {2006}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Ruiz-Hidalgo, J.}, editor = {Salembier, P.} } @conference {cAbad06, title = {UPC Audio, Video and Multimodal Person Tracking Systems in the CLEAR Evaluation Campaign}, booktitle = {CLEAR{\textquoteright}06 Evaluation Campaign and Workshop - Classification of Events, Activities and Relationships}, year = {2006}, pages = {93{\textendash}104}, isbn = {978-3-540-69567-7}, author = {Abad, A. and Cristian Canton-Ferrer and Segura, C. and Landabaso, J. and Macho, D. and Casas, J. and Hernando, J. and M. Pard{\`a}s and Nadeu, C.} } @article {aRuiz-Hidalgo06, title = {On the use of indexing metadata to improve the efficiency of video compression}, journal = {IEEE transactions on circuits and systems for video technology}, volume = {16}, number = {3}, year = {2006}, pages = {410{\textendash}419}, abstract = {

In recent years, video indexing and video compression have been considered two separate functionalities. However, multimedia content is growing at such a rate that multimedia services will need to consider both the compression and the indexing aspects of the content in order to manage this audio-visual content efficiently. Therefore, it is interesting to study the synergy between the representations used for compression and indexing and, in particular, to find new schemes that exploit indexing/compression information in order to increase the efficiency of video compression/indexing capabilities. The principal contribution of this paper is to study and develop new techniques in which the compression efficiency of video codecs is improved by the use of indexing metadata, where indexing metadata refers to information that has been generated to support indexing capabilities.

}, keywords = {H.264, Indexing Metadata, MPEG-7, Video Coding}, issn = {1051-8215}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @conference {cSalvador06a, title = {Voxelitzaci{\'o} Adaptada a les Imatges en Entorns Multic{\`a}mera}, booktitle = {2ones Jornades UPC de Investigaci{\'o}n en Autom{\'a}tica, Visi{\'o}n y Rob{\'o}tica}, year = {2006}, pages = {1{\textendash}6}, isbn = {978-3-540-26042-4}, author = {Salvador, J. and Casas, J.} } @conference {cSole05, title = {Adaptive Generalized Prediction for Lifting Schemes}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2005}, year = {2005}, pages = {205{\textendash}208}, address = {Philadelphia, USA}, isbn = {1-4244-1011-8}, author = {Sol{\'e}, J. and Salembier, P.} } @inbook {bGiro-i-Nieto05, title = {Automatic extraction and analysis of visual objects information}, booktitle = {Multimedia content and the semantic web}, year = {2005}, pages = {203{\textendash}221}, publisher = {Wiley}, organization = {Wiley}, chapter = {7}, isbn = {978-0-470-85753-3}, doi = {10.1002/0470012617.ch7}, url = {http://eu.wiley.com/WileyCDA/WileyTitle/productCd-0470857536.html}, author = {Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Marqu{\'e}s, F. and Salembier, P.} } @article {aMarcello05, title = {Automatic tool for the precise detection of upwelling and filaments in remote sensing imagery}, journal = {IEEE transactions on geoscience and remote sensing}, volume = {43}, number = {7}, year = {2005}, pages = {1605{\textendash}1616}, issn = {0196-2892}, author = {Marcello, J. and Marqu{\'e}s, F. and F. Eugenio} } @conference {cAnton05, title = {Collaborative Network Space: Infrastructure and Learning Application}, booktitle = {IEEE Region 8 EUROCON 2005 Conference: Computer as a tool.}, year = {2005}, pages = {803{\textendash}806}, isbn = {1-4244-0050-3}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=01630054}, author = {Anton, M. and Vall-Llosera, M. and Jordi Torres and Romeu, J. and Jofre, L. and Sole, F. and Marqu{\'e}s, F. and Sabate, F. and Berenguer, J.} } @conference {cGiro-i-Nieto05, title = {Detection of Semantic Objects using Description Graphs}, booktitle = {IEEE International Conference on Image Processing}, year = {2005}, address = {Genova, Italy}, abstract = {

This paper presents a technique to detect instances of classes (objects) according to their semantic definition in the form of a description graph. Classes are defined as combinations of instances of lower level semantic classes and allow the definition of a semantic tree that organizes classes in semantic levels. At the bottom level of the semantic tree, classes are defined by a perceptual model containing a list of low-level descriptors. The proposed detection algorithm follows a bottom-up/top-down approach, building semantic trees on a region-based representation of the media. The flexibility of the approach is assessed on different examples of planar objects, such as frontal faces, groups of islands, flags and traffic signs.

}, keywords = {Detection algorithms, Explosions, Face detection, Image databases, Indexing, MPEG 7 Standard, Object detection, Testing, Traffic control, Tree graphs}, isbn = {0-7803-9135-7}, doi = {10.1109/ICIP.2005.1529972}, url = {http://dx.doi.org/10.1109/ICIP.2005.1529972}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cSayrol05, title = {Development of a platform offering video copyright protection and security against illegal distribution}, booktitle = {Security, Steganography, and Watermarking of Multimedia Contents}, year = {2005}, pages = {76{\textendash}83}, isbn = {9963-607-06-3}, author = {Elisa Sayrol and Soriano, M. and Fernandez, M. and Casanelles, J. and Tom{\`a}s, J.} } @conference {cMarques05, title = {El concepto NetCampus}, booktitle = {3es Jornadas de la C{\'a}tedra Telef{\'o}nica-UPC}, year = {2005}, pages = {15{\textendash}20}, url = {https://catedratelefonica.upc.edu/documents/llibres/docs/el_espacio_innovador_y_la_red.pdf}, author = {Marqu{\'e}s, F. and Jofre, L. and Sole, F. and Sabate, F. and Berenguer, J. and Romeu, J. and Jordi Torres} } @book {eJofre05, title = {El "Espacio Innovador" y la red}, year = {2005}, url = {http://www.upc.edu/web/CatedraTelefonicaUPC}, author = {Jofre, L. and Sole, F. and Sabate, F. and Berenguer, J. and Marqu{\'e}s, F. and Romeu, J. and Jordi Torres} } @conference {cLandabaso05a, title = {Foreground Regions Extraction and Characterization Towards Real-Time Object Tracking}, booktitle = {2nd Joint Workshop on Multimodal Interaction and Related Machine Learning Algorithms}, year = {2005}, pages = {241{\textendash}249}, author = {Landabaso, J. and M. Pard{\`a}s} } @conference {cGarcia05, title = {Functionalities for mapping 2D images and 3D world objects in a Multicamera Environment}, booktitle = {6th International Workshop on Image Analysis for Multimedia Interactive Services}, year = {2005}, abstract = {

We present four functionalities intended to improve the ability of image detection and tracking algorithms to understand a scene in a multicamera system. The redundancy of several available projections of any 3D object onto different cameras might ease video analysis tasks. When some prior information about the 3D object or any of its projections is known, geometric constraints can help to restrict search areas in the images under analysis. The functionalities presented also tackle the problem of selecting the best camera at any time, or computing projected areas of 3D objects in images.

}, author = {Garcia, O. and Casas, J.} } @conference {cCanton-Ferrer05a, title = {Fusion of multiple viewpoint information towards 3D face robust orientation detection}, booktitle = {IEEE International Conference on Image Processing}, year = {2005}, isbn = {0-7803-9135-7}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cLandabaso05, title = {Hierarchical Representation of Scenes using Activity Information}, booktitle = {2005 IEEE International Conference on Acoustics, Speech, and Signal Processing}, year = {2005}, pages = {677{\textendash}680}, isbn = {0780388755}, author = {Landabaso, J. and M. Pard{\`a}s} } @article {jFigueras05, title = {Las Mancomunidades en Espa{\~n}a}, journal = {Bolet{\'\i}n de la Asociaci{\'o}n de Ge{\'o}grafos Espa{\~n}oles}, number = {39}, year = {2005}, pages = {151{\textendash}176}, issn = {0212-9426}, author = {Figueras, P. and Haas, C. and Capdevila, C. and Ver{\'o}nica Vilaplana} } @conference {cChang05, title = {A Motion-based Binary Partition Tree approach to Video Object Segmentation}, booktitle = {IEEE International Conference on Image Processing}, year = {2005}, pages = {430{\textendash}433}, isbn = {0-7803-9135-7}, author = {Chang, C. and M. Pard{\`a}s and Marqu{\'e}s, F.} } @inbook {bSoriano05, title = {Multimedia Copyright Protection Platform Demonstrator}, booktitle = {Lecture notes in computer science}, volume = {3477}, year = {2005}, pages = {76{\textendash}83}, issn = {0302-9743}, url = {http://www.springerlink.com/(n0yw1g55rs45pg55g5zcinrm)/app/home/contribution.asp?referrer=parent\&backto=issue,32,34;journal,197,2212;linkingpublicationresults,1:105633,1}, author = {Soriano, M. and Fernandez, M. and Elisa Sayrol and Tom{\`a}s, J. and Casanelles, J. and Pegueroles, J. and Juan Hern{\'a}ndez Serrano} } @conference {cSoriano05, title = {Multimedia copyright protection platform demonstrator}, booktitle = {Third International Conference on Trust Management (iTrust{\textquoteright}05)}, year = {2005}, pages = {411{\textendash}414}, doi = {10.1007/11429760_32}, url = {http://dx.doi.org/10.1007/11429760_32}, author = {Soriano, M. and Fernandez, M. and Elisa Sayrol and Buliart, J. and Casanelles, J. and Pegueroles, J. and Juan Hern{\'a}ndez Serrano} } @article {aCasas05, title = {Mutual feedback scheme for face detection and tracking aimed at density estimation in demonstrations}, journal = {IEE Proceedings-Vision, Image and Signal Processing}, volume = {152}, number = {3}, year = {2005}, month = {05/2005}, pages = {334{\textendash}346}, abstract = {

Crowded video sequences, like those of demonstrations, offer an interesting challenge for object detection and tracking owing to their complexity: they are taken outdoors, often under different illumination conditions, and show faces not in frontal view, with perspective effects, complex backgrounds, etc. Tracking of individuals becomes a difficult task due to the high number of occlusions. The paper proposes a mutual feedback spatiotemporal detection strategy to tackle these problems. The system improves its efficiency thanks to a cooperative approach between spatial detection and temporal tracking. Spatial detection is based on skin-colour classification and shape analysis by morphological tools. Temporal tracking is based on the analysis of the optical flow. The mutual feedback scheme benefits both spatial detection and temporal tracking. In order to deal with multiple occlusions, a graph-based tracking technique, which takes advantage of neighbourhood consistency, has been introduced.

}, issn = {1350-245X}, doi = {10.1049/ip-vis:20045071}, url = {http://dx.doi.org/10.1049/ip-vis:20045071}, author = {Casas, J. and Puig-Sitjes, A. and Puig-Folch, P.} } @conference {cFerran05, title = {Object representation using colour, shape and structure criteria in a Binary Partition Tree}, booktitle = {IEEE International Conference on Image Processing}, year = {2005}, isbn = {0-7803-9135-7}, author = {Ferran, C. and Casas, J.} } @conference {cCanton-Ferrer05b, title = {Projective Kalman Filter: Multiocular Tracking of 3D Locations Towards Scene Understanding}, booktitle = {2nd Joint Workshop on Multimodal Interaction and Related Machine Learning Algorithms}, year = {2005}, author = {Cristian Canton-Ferrer and Casas, J. and Tekalp, M. and M. Pard{\`a}s} } @conference {cVilaplana05, title = {Region-based extraction and analysis of visual objects information}, booktitle = {Fourth International Workshop on Content-Based Multimedia Indexing, CBMI 2005}, year = {2005}, address = {Riga, Latvia}, abstract = {

In this paper, we propose a strategy to detect objects from still images that relies on combining two types of models: a perceptual and a structural model. The algorithms that are proposed for both types of models make use of a region-based description of the image relying on a Binary Partition Tree. Perceptual models link the low-level signal description with semantic classes of limited variability. Structural models represent the common structure of all instances by decomposing the semantic object into simpler objects and by defining the relations between them using a Description Graph.

}, isbn = {0-7803-6293-4}, author = {Ver{\'o}nica Vilaplana and Xavier Gir{\'o}-i-Nieto and Salembier, P. and Marqu{\'e}s, F.} } @conference {cQun05, title = {Shadow removal with blob-based Morphological Reconstruction for Error Correction}, booktitle = {2005 IEEE International Conference on Acoustics, Speech, and Signal Processing}, year = {2005}, pages = {729{\textendash}732}, author = {Xu, Li-Qun and Landabaso, J. and M. Pard{\`a}s} } @conference {cCorrea05, title = {Silhouette-based Probabilistic 2D Human Motion Estimation for Real-Time Applications}, booktitle = {IEEE International Conference on Image Processing}, year = {2005}, pages = {836{\textendash}839}, isbn = {0-7803-9135-7}, author = {Correa, P. and Czyz, J. and Umeda, T. and Marqu{\'e}s, F. and Marichal, X. and Macq, B.} } @conference {cVilaplana05a, title = {Support Vector Data Description Based on PCA Features for Face Detection}, booktitle = {13th European Signal Processing Conference}, year = {2005}, isbn = {0-7803-6300-0}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @inbook {bCanton-Ferrer05, title = {Towards a Bayesian Approach to Robust Finding Correspondences in Multiple View Geometry Environments}, booktitle = {Computational Science {\textendash} ICCS 2005}, volume = {3515}, year = {2005}, pages = {281{\textendash}289}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

This paper presents a new Bayesian approach to the problem of finding correspondences of moving objects in a multiple calibrated camera environment. Moving objects are detected and segmented in multiple cameras using a background learning technique. A Point Based Feature (PBF) of each foreground region is extracted, in our case, the top. This feature will be the support for establishing the correspondences. A reliable, efficient and fast computable distance, the symmetric epipolar distance, is proposed to measure the closeness of sets of points belonging to different views. Finally, matching the features from different cameras originating from the same object is achieved by selecting the most likely PBF in each view under a Bayesian framework. Results are provided showing the effectiveness of the proposed algorithm even in cases of severe occlusions or incorrectly segmented foreground regions.

}, isbn = {978-3-540-26043-1}, doi = {10.1007/11428848_35}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cCanton-Ferrer05, title = {Towards a Bayesian approach to robust finding correspondences in multiple view geometry environments}, booktitle = {Workshop on Computer Graphics and Geometric Modelling. International Conference on Computational Science}, year = {2005}, pages = {281{\textendash}289}, isbn = {3-540-26043-9}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cSole04, title = {Adaptive Discrete Generalized Lifting for Lossless Compression}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2004}, year = {2004}, pages = {1{\textendash}5}, address = {Montreal, Canada}, isbn = {1-4244-1764-3}, author = {Sol{\'e}, J. and Salembier, P.} } @conference {cEugenio04, title = {An automated multisensor satellite imagery registration technique based on the optimization of contour features}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium 2004}, year = {2004}, pages = {1410{\textendash}1413}, author = {F. Eugenio and Marcello, J. and Marqu{\'e}s, F.} } @conference {cFerran04, title = {Binary-Partition Tree creation using a quasi-inclusion criterion}, booktitle = {8th International Conference on Information Visualization (IV04)}, year = {2004}, pages = {259{\textendash}264}, isbn = {0-7695-2177-0}, author = {Ferran, C. and Casas, J.} } @conference {cBroquetas04, title = {Definici{\'o} d{\textquoteright}un Master Internacional de Recerca: la proposta del Departament de Teoria del Senyal i Comunicacions}, booktitle = {Jornada de reflexi{\'o}n y trabajo sobre el modelo docente de la UPC en el Espacio Europeo de Educaci{\'o}n Superior (EEES)}, year = {2004}, pages = {1{\textendash}3}, author = {Broquetas, A. and Hernando, J. and Marqu{\'e}s, F. and Romeu, J.} } @book {eJofre04, title = {El {\textquoteright}Profesional Innovador{\textquoteright} y la red}, year = {2004}, url = {http://catedratelefonica.upc.edu/documents/llibres/docs/jornada_2004_catedra_telf_upc.pdf}, author = {Jofre, L. and Sole, F. and Sabate, F. and Marqu{\'e}s, F. and Romeu, J. and Jordi Torres} } @conference {cLobo04, title = {Enhanced audio data hiding synchronization using non-linear filter}, booktitle = {International Conference on Acoustics, Speech, and Signal Processing}, year = {2004}, pages = {885{\textendash}888}, author = {Lobo, A. and Bas, P. and Marqu{\'e}s, F.} } @inbook {bPardas04, title = {The InterFace Software Platform for Interactive Virtual Characters}, booktitle = {Mpeg-4 facial animation: the standard, implementation and applications}, year = {2004}, pages = {169{\textendash}183}, isbn = {0-470-84465-5}, author = {M. Pard{\`a}s and Pandzic, I. and Cannella, M. and Davoine, F. and Forchheimer, R. and Lavagetto, F. and Marriott, A. and Malassiotis, S.} } @article {xGiro-i-Nieto04, title = {La converg{\`e}ncia de la TV cap al PC}, year = {2004}, month = {03/2004}, institution = {Diari Avui}, type = {Newspaper}, address = {Barcelona, Catalonia}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cCabrera04, title = {LaViCAD: LABORATORIO VIRTUAL DE COMUNICACIONES ANAL{\'O}GICAS Y DIGITALES}, booktitle = {4rt. Congr{\'e}s Internacional de Doc{\`e}ncia Universit{\`a}ria i Innovaci{\'o}}, year = {2004}, pages = {1{\textendash}20}, isbn = {84-8458-240-X}, author = {Cabrera, M. and Fernandez, J. and Berzosa, C. and Francisco, V.
and Gasull, A.} } @conference {cGiro-i-Nieto04, title = {The Moonlight project: bringing light to our satellite}, booktitle = {Proceedings of the 4th International Conference on Solar Power from Space SPS{\textquoteright}04 The 4th International Conference on Solar Power SPS{\textquoteright}04 together with the 5th International Conference on Wireless Power transmissions WPT 5}, year = {2004}, pages = {99{\textendash}100}, isbn = {92-9092-878-6}, author = {Xavier Gir{\'o}-i-Nieto and Aragon, M. and Prats, X. and Acero, L.} } @conference {cSalerno04, title = {Object recognition based on Binary Partition Trees}, booktitle = {IEEE International Conference on Image Processing}, year = {2004}, pages = {929{\textendash}932}, author = {Salerno, O. and M. Pard{\`a}s and Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @phdthesis {dMorros04, title = {Optimization of Segmentation Based Video Sequence Coding Techniques: Application to Content Based Functionalities}, year = {2004}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

This work addresses the problem of video compression with content-based functionalities in the framework of segmentation-based video coding systems. Two major problems are considered. The first one is related to coding optimality in segmentation-based coding systems. Regarding this subject, the feasibility of a rate-distortion approach for a complete region-based coding system is shown. The second one is how to address content-based functionalities in the coding system proposed as a solution to the first problem. Optimality, as defined in the framework of rate-distortion theory, deals with obtaining a representation of the video sequence that leads to a minimum distortion of the coded signal for a given bit budget. In the case of segmentation-based coding systems, this means obtaining an {\textquoteright}optimal{\textquoteright} partition together with the best coding technique for each region of this partition, so that the result is optimal in an operational rate-distortion sense. The problem is formalized for independent, non-scalable coding. An algorithm to solve this problem is provided as well. This algorithm is applied to a specific segmentation-based coding system, the so-called SESAME. In SESAME, each frame is segmented into a set of regions that are coded independently. Segmentation involves both spatial and motion homogeneity criteria. To exploit temporal redundancy, a prediction for both the partition and the texture of the current frame is created by using motion information. The time evolution of each region is defined along the sequence (time tracking). The results are optimal (or near-optimal) for the given framework in a rate-distortion sense. The definition of the coding strategy involves a global optimization of the partition as well as of the coding technique/quality level for each region. Later, the investigation is extended to the problem of video coding optimization in the framework of a scalable video coding system that can address content-based functionalities. The focus is set on the various types of content-based scalability and object tracking. The generality of the problem has also been extended by including the spatial and temporal dependencies between frames and scalability layers in the optimization schema. In this case, the solution implies finding the optimal partition and set of quantizers for both the base and the enhancement layers. Due to the coding dependencies of the enhancement layer with respect to the base layer, the partition and the set of quantizers of the enhancement layer depend on the decisions made on the base layer. Also, a solution for the independent optimization problem (i.e., without taking into account dependencies between different frames or scalability layers) has been proposed to reduce the computational complexity. These solutions are used to extend the SESAME coding system. The extended coding system, named XSESAME, supports different types of scalability (PSNR, spatial and temporal) as well as content-based functionalities, such as content-based scalability and object tracking. Two different operating modes for region selection in the enhancement layer are presented: one (supervised) aimed at providing content-based functionalities at the enhancement layer, and the other (unsupervised) aimed at coding efficiency, without content-based functionalities. Integration of object tracking into the segmentation-based coding system is also investigated. In the general case, tracking is a very complex problem.
If this capability has to be integrated into a coding system, additional problems arise due to conflicting requirements between coding efficiency and tracking accuracy. This is solved by using a double partition approach, where pure spatial criteria are used to re-segment the partition used for coding. The projection of the re-segmented partition results in more precise adaptation to object contours. A merging step is performed a posteriori to eliminate the excess of regions created by the re-segmentation.

}, url = {http://hdl.handle.net/10803/6888}, author = {Morros, J.R.}, editor = {Marqu{\'e}s, F.} } @conference {cMarcello04, title = {Precise upwelling and filaments automatic extraction from multisensorial imagery}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium 2004}, year = {2004}, pages = {2018{\textendash}2021}, author = {Marcello, J. and F. Eugenio and Marqu{\'e}s, F.} } @conference {cSole04a, title = {Prediction design for discrete generalized lifting}, booktitle = {Conference on Advanced Concepts for Intelligent Vision Systems, ACIVS 2004}, year = {2004}, pages = {78{\textendash}82}, address = {Brussels, Belgium}, isbn = {978-1-4244-3610-1}, author = {Sol{\'e}, J. and Salembier, P.} } @conference {cDorea04, title = {A region-based algorithm for image segmentation and parametric motion estimation}, booktitle = {Image Analysis for Multimedia Interactive Services}, year = {2004}, isbn = {972-98115-7-1}, author = {Dorea, C. and M. Pard{\`a}s and Marqu{\'e}s, F.} } @article {aLandabaso04, title = {Robust Tracking and Object Classification Towards Automated Video Surveillance}, journal = {Lecture notes in computer science}, volume = {3212}, year = {2004}, pages = {463{\textendash}470}, issn = {0302-9743}, author = {Landabaso, J. and Xu, Li-Qun and M. Pard{\`a}s} } @conference {cJose-Luis04, title = {Robust Tracking and Object Classification Towards Automated Video Surveillance}, booktitle = {International Conference on Image Analysis and Recognition}, year = {2004}, pages = {463{\textendash}470}, isbn = {3540232230}, author = {Landabaso, J. and M. Pard{\`a}s and Xu, Li-Qun} } @conference {cCasas04, title = {Spatial-temporal Video Analysis for Improved Pedestrian Detection: Application to Density Estimation and Tracking in Demonstrations}, booktitle = {Image Analysis for Multimedia Interactive Services}, year = {2004}, pages = {334{\textendash}346}, isbn = {972-98115-7-1}, author = {Casas, J. and Puig-Sitjes, A. and Puig-Folch, P.} } @conference {cJofre04, title = {Una Enginyeria per a la Societat del Coneixement}, booktitle = {II Congr{\'e}s d{\textquoteright}Enginyeria en Llengua Catalana}, year = {2004}, url = {http://www.eicc.cat/celc/formacio.htm}, author = {Jofre, L. and Sole, F. and Sabate, F. and Marqu{\'e}s, F. and Romeu, J. and Jordi Torres} } @conference {cCasas03, title = {Advanced content-based semantic scene analysis and information retrieval: the Schema Project}, booktitle = {4th European Workshop on Image Analysis for Multimedia Interactive Services}, year = {2003}, pages = {43{\textendash}45}, author = {Casas, J. and Kompatsiaris, I. and Strintzis, M.} } @phdthesis {dLlach03, title = {Analysis of Video Sequences for Content Description. Table of Contents \& Index Creation and Scene Classification}, year = {2003}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Llach, J.}, editor = {Salembier, P.} } @article {aEugenio03, title = {Automatic satellite image georeferencing using a contour matching approach}, journal = {IEEE transactions on geoscience and remote sensing}, volume = {41}, number = {12}, year = {2003}, pages = {2869{\textendash}2880}, issn = {0196-2892}, author = {F. Eugenio and Marqu{\'e}s, F.} } @conference {cEugenio03a, title = {Automatic structures detection and spatial registration using multisensor satellite imagery}, booktitle = {Proceedings of the International Geoscience and Remote Sensing Symposium}, year = {2003}, pages = {1038{\textendash}1040}, author = {F. Eugenio and Rovaris, E.
and Marcello, J. and Marqu{\'e}s, F.} } @conference {cGiro-i-Nieto03, title = {Detection of Semantic Entities using Description Graphs}, booktitle = {4th European Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS)}, year = {2003}, month = {04/2003}, address = {London, England}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cLandabaso03, title = {HMM recognition of expressions in unrestrained video intervals}, booktitle = {International conference on Acoustics, Speech, and Signal Processing}, year = {2003}, pages = {197{\textendash}200}, isbn = {0-7803-7664-1}, author = {Landabaso, J. and M. Pard{\`a}s and Bonafonte, A.} } @conference {cEugenio03, title = {Marine coastal dynamic study using an automatic structure detection and spatial registration tool}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium (IGARSS-03)}, year = {2003}, pages = {1{\textendash}3}, isbn = {0-7803-7930-6}, author = {F. Eugenio and Marcello, J. and Marqu{\'e}s, F.} } @conference {cRuiz-Hidalgo03, title = {Metadata-based coding tools for hybrid video codecs}, booktitle = {Picture Coding Symposium, PCS 2003}, year = {2003}, month = {04/2003}, pages = {473{\textendash}477}, address = {Saint-Malo, France}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @conference {c03, title = {The Moon Orbital Mirror}, booktitle = {54th International Astronautical Congress (IAC)}, year = {2003}, month = {10/2003}, address = {Bremen, Germany}, abstract = {

The current state of space exploration suggests that one of the next steps will be the establishment of a permanent base on the Moon. This paper describes the problems related to the lunar night, explains the benefits that the Moon Orbital Mirror would offer, and gives a possible design for the satellite.

}, url = {http://www.zarm.uni-bremen.de/iaf2003/abstracts/data/pdf/IAC-03-IAA.1.1.02.PDF}, author = {Acero, Llu{\'\i}s and {\`A}. Arag{\'o}n and Xavier Gir{\'o}-i-Nieto and Prats, Xavier} } @conference {cVentosa03, title = {Perceptual mask estimation from watermarked images}, booktitle = {SPIE Electronic Imaging 2003}, year = {2003}, pages = {10{\textendash}17}, isbn = {?}, author = {Ventosa, S. and Elisa Sayrol and Vidal, J.} } @conference {cO{\textquoteright}Connor03, title = {Region and object segmentation algorithms in the QIMERA segmentation platform}, booktitle = {Third International Workshop on Content-Based Multimedia Indexing}, year = {2003}, pages = {95{\textendash}103}, isbn = {978-84-612-2373-2}, author = {O{\textquoteright}Connor, N. and Sav, S. and Adamek, T. and Mezaris, V. and Kompatsiaris, I. and Lui, T. and Izquierdo, E. and Ferran, C. and Casas, J.} } @inbook {cGiro-i-Nieto03a, title = {Semantic Entity Detection Using Description Graphs}, booktitle = {Digital Media Processing for Multimedia Interactive Services}, year = {2003}, pages = {39{\textendash}42}, publisher = {World Scientific Publishing Co.}, organization = {World Scientific Publishing Co.}, address = {Singapore}, abstract = {

This paper presents a technique for the detection of Semantic Entities (SEs) in multimedia content. A definition of an SE in terms of lower-level SEs and their Relations (Rs) is proposed using Description Graphs (DGs). By analyzing the a/v information, an instance DG is built to be compared with a model DG of the SE. As a result, a confidence value is computed to express how well the SE is represented in the content. Examples of the use of this approach are presented in two different applications: detection of frontal faces and recognition of clusters of islands.

}, isbn = {981-238-355-7}, url = {http://books.google.es/books?id=vVvJINURimIC\&printsec=frontcover\&hl=ca\&source=gbs_ge_summary_r\&cad=0$\#$v=onepage\&q\&f=false}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @article {aComas03, title = {Unbalanced multiple description video coding based on a rate-distortion optimization}, journal = {Eurasip Journal on Applied Signal Processing}, volume = {2003}, number = {1}, year = {2003}, pages = {81{\textendash}90}, issn = {1110-8657}, author = {Comas, D. and Singh, R. and Ortega, A. and Marqu{\'e}s, F.} } @conference {cAvrithis03, title = {Unified access to heterogeneous audiovisual archives}, booktitle = {International Conference on Knowledge Management}, year = {2003}, pages = {1{\textendash}2}, address = {Graz, Austria}, author = {Avrithis, Y. and Stamou, G. and Wallace, M. and Marqu{\'e}s, F. and Salembier, P. and Xavier Gir{\'o}-i-Nieto and Haas, W. and Vallant, H. and Zufferey, M.} } @article {aAvrithis03, title = {Unified Access to Heterogeneous Audiovisual Archives}, journal = {Journal of universal computer science}, volume = {9}, number = {6}, year = {2003}, pages = {510{\textendash}519}, abstract = {

In this paper, an integrated information system is presented that offers enhanced search and retrieval capabilities to users of heterogeneous digital audiovisual (a/v) archives. This innovative system exploits the advances in handling a/v content and related metadata, as introduced by MPEG-4 and worked out by MPEG-7, to offer advanced services characterized by the tri-fold semantic phrasing of the request (query), unified handling and personalized response. The proposed system targets the intelligent extraction of semantic information from a/v and text-related data, taking into account the nature of the queries that users may issue and the context determined by user profiles. It also provides a personalization process of the response in order to provide end users with the desired information. From a technical point of view, the FAETHON system plays the role of an intermediate access server residing between the end users and multiple heterogeneous audiovisual archives organized according to the new MPEG standards.

}, issn = {0948-6968}, doi = {10.3217/jucs-009-06-0510}, url = {http://www.jucs.org/jucs_9_6}, author = {Avrithis, Y. and Stamou, G. and Wallace, M. and Marqu{\'e}s, F. and Salembier, P. and Xavier Gir{\'o}-i-Nieto and Haas, W. and Vallant, H. and Zufferey, M.} } @article {aSchelkens03, title = {Wavelet coding of volumetric medical datasets}, journal = {IEEE Transactions on Medical Imaging}, volume = {22}, year = {2003}, month = {03/2003}, pages = {441{\textendash}458}, abstract = {

Several techniques based on the three-dimensional (3-D) discrete cosine transform (DCT) have been proposed for volumetric data coding. These techniques fail to provide lossless coding coupled with quality and resolution scalability, which is a significant drawback for medical applications. This paper gives an overview of several state-of-the-art 3-D wavelet coders that do meet these requirements and proposes new compression methods exploiting the quadtree and block-based coding concepts, layered zero-coding principles, and context-based arithmetic coding. Additionally, a new 3-D DCT-based coding scheme is designed and used for benchmarking. The proposed wavelet-based coding algorithms produce embedded data streams that can be decoded up to the lossless level and support the desired set of functionality constraints. Moreover, objective and subjective quality evaluation on various medical volumetric datasets shows that the proposed algorithms provide competitive lossy and lossless compression results when compared with the state-of-the-art.

}, keywords = {Algorithms, Computer-Assisted, Data Compression, discrete cosine transforms, embedded coding, embedded data streams, functionality constraints, image coding, Image Enhancement, Image Interpretation, Imaging, JPEG2000, layered zero coding, lossless compression, medical diagnostic imaging, medical image compression, medical image processing, Numerical Analysis, objective quality evaluation, progressive image transmission, quadtree coding, reviews, Signal Processing, subjective quality evaluation, Three-Dimensional, volumetric coding, wavelet transforms}, issn = {0278-0062}, doi = {10.1109/TMI.2003.809582}, author = {Schelkens, P. and Munteanu, A. and Barbarien, J. and Galca, M. and Xavier Gir{\'o}-i-Nieto and Cornelis, J.} } @conference {cMarcelo02, title = {Automatic feature extraction from multisensorial oceanographic imagery}, booktitle = {International Geoscience and Remote Sensing Symposium, 2002. IGARSS {\textquoteright}02. 2002 IEEE}, year = {2002}, pages = {4{\textendash}8}, isbn = {0-7803-7536-0}, author = {Marcello, J. and Marqu{\'e}s, F. and F. Eugenio} } @conference {cVidal02, title = {Color Image Watermarking Using Channel-State Knowledge}, booktitle = {SPIE Electronic Imaging 2002}, year = {2002}, pages = {214{\textendash}221}, isbn = {0-8194-4415-4}, author = {Vidal, J. and Maribel, M. and Elisa Sayrol} } @conference {cSalembier02a, title = {Connected Operators Based on Reconstruction Process for Size and Motion Simplification}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2002}, year = {2002}, month = {05/2002}, address = {Orlando, USA}, author = {Salembier, P. and Ruiz-Hidalgo, J.} } @conference {cEugenio02, title = {A contour-based approach to automatic and accurate registration of multitemporal and multisensor satellite imagery}, booktitle = {International Geoscience and Remote Sensing Symposium, 2002. IGARSS {\textquoteright}02. 2002 IEEE}, year = {2002}, pages = {1{\textendash}3}, isbn = {0-7803-7536-0}, author = {F. Eugenio and Marqu{\'e}s, F. and Marcello, J.} } @inbook {bSalembier02b, title = {Description of a Single Multimedia Document}, booktitle = {Introduction to the mpeg-7: multimedia content description interface}, year = {2002}, pages = {111{\textendash}138}, publisher = {Wiley}, organization = {Wiley}, edition = {B. S. Manjunath, P. Salembier, T. Sikora (Eds.)}, chapter = {8}, isbn = {0471486787}, author = {Benitez, A.B and Martinez, J.M and Rising, H and Salembier, P.} } @conference {cPardas02, title = {Emotion recognition based on MPEG-4 facial animation parameters}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing}, year = {2002}, pages = {3624{\textendash}3627}, author = {M. Pard{\`a}s and Bonafonte, A. and Landabaso, J.} } @article {aMarques02, title = {Face segmentation and tracking based on connected operators and partition projection}, journal = {Pattern recognition}, volume = {35}, number = {3}, year = {2002}, pages = {601{\textendash}614}, issn = {0031-3203}, author = {Marqu{\'e}s, F. and Ver{\'o}nica Vilaplana} } @article {aPardas02, title = {Facial animation parameters extraction and expression recognition using Hidden Markov Models}, journal = {Signal processing: image communication}, number = {17}, year = {2002}, pages = {675{\textendash}688}, issn = {0923-5965}, author = {M. 
Pard{\`a}s and Bonafonte, A.} } @conference {cMarques02a, title = {Facial Feature Segmentation from Frontal View Images}, booktitle = {11th European Signal Processing Conference (EUSIPCO 2002)}, year = {2002}, pages = {33{\textendash}36}, author = {Marqu{\'e}s, F. and Sobrevals, C.} } @conference {cSalembier02, title = {On Filters by Reconstruction for Size and Motion Simplification}, booktitle = {Int. Symposium on Mathematical Morphology, ISMM 2002}, year = {2002}, month = {04/2002}, pages = {425{\textendash}434}, address = {Sydney, Australia}, author = {Salembier, P. and Ruiz-Hidalgo, J.} } @conference {cGarrido02, title = {A framework for the retrieval of multiple regions using Binary Partition Trees and low level descriptors}, booktitle = {11th European Signal Processing Conference, EUSIPCO 2002}, year = {2002}, pages = {512{\textendash}516}, address = {Toulouse, France}, author = {Garrido, L. and Salembier, P.} } @conference {cVallverdu02, title = {Graphical study of signals and systems}, booktitle = {14th annual World Conference on Educational Multimedia, Hypermedia \& Telecommunications}, year = {2002}, isbn = {0-8186-7919-0}, author = {Vallverdu, F. and Elisa Sayrol and Gasull, A. and Salavedra, J. and Moreno, A.} } @phdthesis {dGarrido02, title = {Hierarchical Region Based Processing of Images and Video Sequences: Application to Filtering, Segmentation and Information Retrieval}, year = {2002}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

This work discusses the usefulness of hierarchical region-based representations for image and video processing. Region-based representations offer a way to perform a first level of abstraction and reduce the number of elements to process with respect to the classical pixel-based representation. In this work the two representations that have been demonstrated to be useful for region-based processing are reviewed, namely region adjacency graphs and trees, and it is discussed why tree-based representations are better suited for our purpose. In fact, trees allow representing the image in a hierarchical way, and efficient and complex processing techniques can be applied on them. Two major issues are discussed in this work: how the hierarchical representation may be created from a given image, and how the tree may be manipulated or processed. Two tree-based representations have been developed: the Max-Tree and the Binary Partition Tree. The Max-Tree structures in a compact way the connected components that arise from all possible level sets of a gray-level image. It is suitable for the implementation of anti-extensive connected operators, ranging from classical ones (for instance, the area filter) to new ones (such as the motion filter developed in this work). The Binary Partition Tree structures the set of regions that are obtained during the execution of a region merging algorithm. Developed to overcome some of the drawbacks of the Max-Tree {\textendash} in particular the lack of flexibility in the tree creation and the lack of self-duality of the tree representation {\textendash} it has been demonstrated to be a representation useful for a rather large range of applications, as is shown in this work. Processing strategies are focused on pruning techniques. Pruning techniques remove some of the branches of the tree based on an analysis algorithm applied to the nodes of the tree. Pruning techniques applied to the Max-Tree lead to anti-extensive operators, whereas self-dual operators are obtained on the Binary Partition Tree if the tree is created in a self-dual manner. The pruning techniques that have been developed in this work are directed at the following applications: filtering, segmentation and content-based image retrieval. The filtering (in the context of connected operators) and segmentation applications are based on the same principle: the nodes of the tree are analyzed according to a fixed criterion, and the decision to remove or preserve a node usually relies on a threshold applied to the measured criterion. Pruning is then performed according to this decision. As a result, the image associated with the pruned tree represents a filtered or segmented version of the original image according to the selected criterion. Some of the criteria that are discussed in this work are based, for instance, on area, motion, marker \& propagation, or a rate-distortion strategy. The problem of the lack of robustness of classical decision approaches for non-increasing criteria is discussed and solved by means of an optimization strategy based on the Viterbi algorithm. Content-based image retrieval is the third application we have focused on in this work. Hierarchical region-based representations are particularly well suited for this purpose since they allow the image, and thus its regions, to be represented and described at different scales of resolution.
In this work we focus on an image retrieval system which supports low-level queries based on visual descriptors and spatial relationships. For that purpose, region descriptors are attached to the nodes of the tree. Two types of queries are discussed: the single-region query, in which the query is made up of one region, and the multiple-region query, in which the query is made up of a set of regions. In the former, visual descriptors are used to perform the retrieval, whereas in the latter both visual descriptors and spatial relationships are used. Moreover, a relevance feedback approach is presented to avoid the need to manually set the weights associated with each descriptor. An important aspect that has been taken into account throughout this work is the efficient implementation of the algorithms that have been developed for both the creation and the processing of the tree. In the case of tree creation, efficiency is obtained mainly through the use of hierarchical queues, whereas in the processing step efficiency is achieved by analysis algorithms based on recursive strategies.

}, url = {http://hdl.handle.net/10803/6878}, author = {Garrido, L.}, editor = {Salembier, P.} } @book {eSalembier02, title = {Introduction to the mpeg-7: multimedia content description interface}, year = {2002}, publisher = {Wiley}, organization = {Wiley}, isbn = {0471486787}, author = {Manjunath, B.S. and Salembier, P. and Sikora, T.} } @conference {cGiro-i-Nieto02, title = {MPEG-7 Descriptors for Earth Observation Satellites}, booktitle = {International Astronautical Congress}, year = {2002}, month = {09/2002}, pages = {1{\textendash}4}, publisher = {International Astronautical Federation}, organization = {International Astronautical Federation}, address = {Houston, Texas (USA)}, abstract = {

The amount of digital multimedia information has experienced spectacular growth in recent years thanks to advances in digital systems for image, video and audio acquisition. As a response to the need to organize all this information, ISO/IEC has developed a new standard for multimedia content description called MPEG-7. Among other topics, MPEG-7 defines a set of multimedia descriptors that can be automatically generated using signal processing techniques. Earth Observation Satellites generate large quantities of images stored in enormous databases that can take advantage of the new standard. Automatic indexing of these images using MPEG-7 metadata can improve their content management as well as simplify interaction between independent databases. This paper gives an overall description of the MPEG-7 standard, focusing on the low-level Visual Descriptors. These descriptors can be grouped into four categories: color, texture, shape and motion. Visual Color Descriptors represent the color distribution of an image in terms of a specified color space. Visual Texture Descriptors define the visual pattern of an image according to its homogeneities and non-homogeneities. Visual Shape Descriptors describe the shape of 2D and 3D objects while remaining invariant to scaling, rotation and translation. Motion Descriptors give the essential characteristics of object and camera motions.

These descriptors can be used individually or in combination to index and retrieve satellite images of the Earth from a database. For example, oceans and glaciers can be discerned based on their Color Descriptors, cities and deserts based on their Texture Descriptors, island images can be grouped using the Shape Descriptors, and cyclone trajectories can be studied and compared using the Motion Descriptors.
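To make the retrieval idea concrete, here is a minimal sketch in which a normalized joint RGB histogram stands in for the normative MPEG-7 color descriptors and images are ranked by L1 histogram distance; all names and parameters are illustrative assumptions.

    import numpy as np

    def color_descriptor(image, bins=8):
        # Normalized joint RGB histogram as a toy color descriptor
        # (not the normative MPEG-7 ScalableColor/ColorStructure).
        h, _ = np.histogramdd(image.reshape(-1, 3).astype(float),
                              bins=(bins,) * 3, range=[(0, 256)] * 3)
        return (h / h.sum()).ravel()

    def rank_by_similarity(query_desc, database_descs):
        # L1 distance between histograms; smaller means more similar.
        dists = [np.abs(query_desc - d).sum() for d in database_descs]
        return np.argsort(dists)

    # usage: rank a small synthetic database against the first image
    imgs = [np.random.randint(0, 256, (64, 64, 3)) for _ in range(4)]
    db = [color_descriptor(im) for im in imgs]
    print(rank_by_similarity(db[0], db))   # image 0 ranks first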

}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Marcello, J. and F. Eugenio} } @conference {cMarques02, title = {Object matching based on partition information}, booktitle = {IEEE International Conference on Image Processing}, year = {2002}, pages = {829{\textendash}832}, isbn = {0-7803-7623-4}, doi = {10.1109/icip.2002.1040079}, author = {Marqu{\'e}s, F. and M. Pard{\`a}s and Morros, J.R.} } @article {aMech02, title = {Objective evaluation criteria for 2-D shape estimation results of moving objects}, journal = {Eurasip Journal on Applied Signal Processing}, volume = {2002}, number = {4}, year = {2002}, pages = {401{\textendash}409}, issn = {1110-8657}, author = {Mech, R. and Marqu{\'e}s, F.} } @conference {cGasull02, title = {Oil Spills Detection in SAR Images using Mathematical Morphology}, booktitle = {11th European Signal Processing Conference (EUSIPCO 2002)}, year = {2002}, pages = {25{\textendash}28}, author = {Gasull, A. and F{\'a}bregas, F.X. and Jim{\'e}nez, J. and Marqu{\'e}s, F. and Moreno, V. and Herrero, M.} } @inbook {bSalembier02a, title = {Overview of MPEG-7 multimedia description schemes and schema tools}, booktitle = {Introduction to the mpeg-7: multimedia content description interface}, year = {2002}, pages = {83{\textendash}94}, publisher = {Wiley}, organization = {Wiley}, edition = {B. S. Manjunath, P. Salembier, T. Sikora (Eds.)}, chapter = {6}, isbn = {0471486787}, author = {Salembier, P. and Smith, J.} } @article {aSalembier02a, title = {Overview of the MPEG-7 Standard and of Future Challenges for Visual Information Analysis}, journal = {EURASIP Journal on Applied Signal Processing}, volume = {4}, number = {1}, year = {2002}, pages = {1{\textendash}11}, issn = {0165-1684}, author = {Salembier, P.} } @book {eSayrol02, title = {Senyals i sistemes anal{\`o}gics: una introducci{\'o} pr{\`a}ctica}, year = {2002}, isbn = {84-8301-610-9}, url = {http://www.edicionsupc.es}, author = {Elisa Sayrol and Gasull, A. and Moreno, A. and Salavedra, J. and Vallverdu, F.} } @inbook {bSalembier02, title = {Systems Architecture}, booktitle = {Introduction to the mpeg-7: multimedia content description interface}, year = {2002}, pages = {33{\textendash}42}, publisher = {Wiley}, organization = {Wiley}, edition = {B. S. Manjunath, P. Salembier, T. Sikora (Eds.)}, chapter = {3}, isbn = {0471486787}, author = {Avaro, O. and Salembier, P.} } @article {aSalembier02, title = {Visual Segment Tree Creation for MPEG-7 Description Schemes}, journal = {Pattern recognition}, volume = {35}, number = {1}, year = {2002}, pages = {563{\textendash}579}, issn = {0031-3203}, author = {Salembier, P. and Llach, J. and Garrido, L.} } @conference {cSayrol01, title = {Color Initialization for Lip Tracking}, booktitle = {International Conference on Augmented, Virtual Environments and 3D Imaging}, year = {2001}, pages = {351{\textendash}354}, author = {Elisa Sayrol and Fischi, O. and M. Pard{\`a}s} } @conference {cPineda01, title = {Estudio de campos de golf mediante t{\'e}cnicas de segmentaci{\'o}n}, booktitle = {IX Congreso Nacional de Teledetecci{\'o}n}, year = {2001}, address = {Lleida, Spain}, author = {Pineda, N and Jorge, J and Garrido, L. and Salembier, P.} } @conference {cPardas01, title = {Facial Animation Parameters extraction and Expression detection using HMM}, booktitle = {International Conference on Augmented, Virtual Environments and 3D Imaging}, year = {2001}, pages = {120{\textendash}123}, isbn = {1-4244-1764-3}, author = {M.
Pard{\`a}s and Bonafonte, A.} } @conference {cPardas01a, title = {Facial Parameter Extraction System based on Active Contours}, booktitle = {IEEE International Conference on Image Processing}, year = {2001}, pages = {1058{\textendash}1061}, author = {M. Pard{\`a}s and Marcos, L.} } @conference {cSayrol01a, title = {Graphical Study of Signals and Systems}, booktitle = {International Conference on Acoustics, Speech and Signal Processing ICASSP{\textquoteright}01}, year = {2001}, isbn = {0-7803-1775-0}, author = {Elisa Sayrol and Gasull, A. and Moreno, A. and Salavedra, J. and Vallverdu, F.} } @conference {cVilaplana01, title = {Joint detection and segmentation of human faces in color images}, booktitle = {International Conference on Augmented, Virtual Environments and 3D Imaging}, year = {2001}, pages = {347{\textendash}350}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @article {aRuiz-Hidalgo01, title = {Morphological tools for robust key-region extraction and video shot modeling}, journal = {Lecture notes in computer science}, year = {2001}, pages = {407{\textendash}416}, abstract = {

In recent years, the use of multimedia content has experienced exponential growth. In this context, new image/video sequence representations are becoming a necessity for many applications. This paper deals with the structuring of video shots in terms of various foreground key-regions and a background mosaic. Each key-region represents a different foreground object that appears throughout the entire sequence, in a similar manner as the mosaic image represents the background information of the complete sequence. We focus on the interest of morphological tools such as connected operators or watersheds to perform the shot analysis and the computation of the key-regions and the mosaic. It is shown that morphological tools are particularly attractive for improving the robustness of the various steps of the algorithm.

}, issn = {0302-9743}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @article {aPardas01, title = {Motion estimation based tracking of active contours}, journal = {Pattern recognition letters}, volume = {22}, year = {2001}, pages = {1447{\textendash}1456}, issn = {0167-8655}, author = {M. Pard{\`a}s and Elisa Sayrol} } @article {aSalembier01, title = {MPEG-7 Description Schemes}, journal = {IEEE transactions on circuits and systems for video technology}, volume = {11}, number = {6}, year = {2001}, pages = {748{\textendash}759}, issn = {1051-8215}, author = {Salembier, P. and Smith, J.} } @article {aAvaro01, title = {MPEG-7 Systems: overview}, journal = {IEEE transactions on circuits and systems for video technology}, volume = {11}, number = {6}, year = {2001}, pages = {760{\textendash}764}, issn = {1051-8215}, author = {Avaro, O. and Salembier, P.} } @conference {cSalembier01b, title = {MPEG-7: What{\textquoteright}s next for visual content representation and analysis?}, booktitle = {International Workshop on Very Low Bit Rate Video Coding 2001, VLBV 2001}, year = {2001}, address = {Athens, Greece}, author = {Salembier, P.} } @conference {cBescos01, title = {An open integrated system for video indexing}, booktitle = {International Workshop on Content-Based Multimedia Indexing}, year = {2001}, pages = {322{\textendash}333}, author = {Bescos, J. and Torres, L. and Menendez, J. and Vil{\`a}, J. and Cisneros, G. and Marqu{\'e}s, F. and Cabrera, J. and Mart{\'\i}nez, J.} } @conference {cSalembier01a, title = {An overview of MPEG-7 Multimedia Description Schemes and of future visual information analysis challenges for content-based indexing}, booktitle = {International Workshop on Content-Based Multimedia Indexing, CBMI 2001}, year = {2001}, address = {Brescia, Italy}, author = {Salembier, P.} } @conference {cSalembier01, title = {An overview of the MPEG-7 standard and of future challenges for visual information analysis}, booktitle = {Workshop on Image Analysis for Multimedia Services, WIAMIS 2001}, year = {2001}, address = {Tampere, Finland}, author = {Salembier, P.} } @conference {cEugenio01, title = {Pixel and sub-pixel accuracy in satellite image georeferencing using an automatic contour matching approach}, booktitle = {IEEE International Conference on Image Processing}, year = {2001}, isbn = {0-7803-6727-8}, author = {F. Eugenio and Marqu{\'e}s, F. and Marcello, J.} } @article {aEugenio01, title = {A real-time automatic acquisition, processing and distribution system for AVHRR and SeaWIFS imagery}, journal = {IEEE geoscience electronics society newsletter}, volume = {-}, number = {Issue 20}, year = {2001}, pages = {10{\textendash}15}, issn = {0161-7869}, author = {F. Eugenio and Marcello, J. and Marqu{\'e}s, F. and Hernandez-Guerra, A. and Rovaris, E.} } @article {aVilaplana01, title = {A region-based approach to face segmentation and tracking in video sequences}, journal = {Latin american applied research}, volume = {31}, number = {2}, year = {2001}, pages = {99{\textendash}106}, issn = {0327-0793}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @conference {cRuiz-Hidalgo01, title = {Robust segmentation and representation of foreground key-regions in video sequences}, booktitle = {International Conference on Acoustics, Speech and Signal Processing ICASSP{\textquoteright}01}, year = {2001}, month = {05/2001}, pages = {1565{\textendash}1568}, address = {Salt Lake City, USA}, author = {Ruiz-Hidalgo, J. 
and Salembier, P.} } @book {eSayrol01, title = {Senyals i sistemes anal{\`o}gics: una introducci{\'o} pr{\`a}ctica}, year = {2001}, isbn = {84-8301-510-2}, url = {www.edicionsupc.es}, author = {Elisa Sayrol and Gasull, A. and Moreno, A. and Salavedra, J. and Vallverdu, F.} } @conference {cSchelkens00, title = {3D compression of medical data based on cube-splitting and embedded block coding}, booktitle = {ProRISC/IEEE Workshop}, year = {2000}, month = {12/2000}, address = {Veldhoven, The Netherlands}, author = {Schelkens, P. and Xavier Gir{\'o}-i-Nieto and Barbarien, J. and Cornelis, J.} } @conference {cGasull00, title = {619 - un ejemplo de dise{\~n}o del laboratorio para asignaturas con cr{\'e}ditos te{\'o}ricos y pr{\'a}cticos}, booktitle = {VIII Congreso Universitario de Innovaci{\'o}n Educativa en las Ense{\~n}anzas T{\'e}cnicas}, year = {2000}, pages = {233{\textendash}240}, isbn = {84-7585-380-3}, author = {Gasull, A. and Moreno, A. and Salavedra, J. and Elisa Sayrol and Vallverdu, F.} } @conference {cEugenio00a, title = {Accurate and automatic NOAA-AVHRR image navigation using a global contour matching approach}, booktitle = {International Geoscience and remote Sensing Symposium}, year = {2000}, pages = {639{\textendash}642}, isbn = {0-7803-6362-0}, author = {F. Eugenio and Marqu{\'e}s, F. and G{\'o}mez, L. and Suarez, E. and Rovaris, E.} } @article {aSalembier00, title = {Binary partition tree as an efficient representation for image processing, segmentation and information retrieval}, journal = {IEEE transactions on image processing}, volume = {9}, number = {4}, year = {2000}, pages = {561{\textendash}576}, abstract = {

This paper discusses the value of binary partition trees as a region-oriented image representation. Binary partition trees concentrate in a compact and structured representation a set of meaningful regions that can be extracted from an image. They offer a multiscale representation of the image and define a translation-invariant 2-connectivity rule among regions. As shown in this paper, this representation can be used for a large number of processing goals such as filtering, segmentation, information retrieval and visual browsing. Furthermore, the processing of the tree representation leads to very efficient algorithms. Finally, for some applications, it may be interesting to compute the binary partition tree once and store it for subsequent use in various applications. In this context, the paper shows that the number of bits necessary to encode a binary partition tree remains moderate.

}, issn = {1057-7149}, doi = {10.1109/83.841934}, author = {Salembier, P. and Garrido, L.} } @inbook {bMarques00, title = {CODING-ORIENTED SEGMENTATION OF VIDEO SEQUENCES}, booktitle = {Video coding: the second generation approach}, year = {2000}, pages = {79{\textendash}124}, isbn = {079239680}, author = {Marqu{\'e}s, F.} } @conference {cSalembier00a, title = {Connected operators based on region-tree pruning}, booktitle = {Mathematical Morphology and its application to signal processing, ISMM 2000}, year = {2000}, pages = {169{\textendash}184}, address = {Palo Alto, USA}, isbn = {0-7803-8555-1}, author = {Salembier, P. and Garrido, L.} } @conference {cSalembier00c, title = {Connected operators based on region-tree pruning strategies}, booktitle = {15th IAPR International Conference on Pattern Recognition, ICPR 2000}, year = {2000}, pages = {371{\textendash}374}, address = {Barcelona, Spain}, isbn = {?}, author = {Salembier, P. and Garrido, L.} } @conference {cEugenio00, title = {A contour matching approach for accurate NOAA-AVHRR image navigation}, booktitle = {10th European Signal Processing Conference (EUSIPCO 2000)}, year = {2000}, isbn = {952-15-0447-1}, author = {F. Eugenio and Marqu{\'e}s, F. and Suarez, E. and Rovaris, E.} } @article {aNunes00, title = {A contour-based approach to binary shape coding using a multiple grid chain code}, journal = {Signal processing: image communication}, volume = {15}, number = {7-8}, year = {2000}, pages = {585{\textendash}599}, issn = {0923-5965}, author = {Nunes, P. and Marqu{\'e}s, F. and Pereira, F. and Gasull, A.} } @conference {cSalembier00, title = {Creation of visual segment trees for MPEG-7 Description Schemes}, booktitle = {International Symposium on Image / video Communications over Fixed and Mobile Networks, ISIVC{\textquoteright}2000}, year = {2000}, pages = {127{\textendash}134}, address = {Rabat, Morocco}, isbn = {?}, author = {Salembier, P. and Llach, J. and Garrido, L.} } @conference {cVidal00, title = {Data Hiding in Color Images using Perceptual Models}, booktitle = {COST 254 Intelligent Processing and Facilities for Communications Terminals}, year = {2000}, pages = {21{\textendash}25}, author = {Vidal, J. and Elisa Sayrol and Maribel, M.} } @article {aSalembier00a, title = {Description Schemes for Video Programs, Users and Devices}, journal = {Signal processing: image communication}, volume = {16}, number = {1}, year = {2000}, pages = {211{\textendash}234}, issn = {0923-5965}, author = {Salembier, P. and Richard, Q. and O{\textquoteright}Connor, N. and Correia, P. and Sezan, I and van Beek, P} } @conference {cBroquetas00, title = {Detecci{\'o}n de vertidos de petr{\'o}leo en el mar a partir de im{\'a}genes SAR}, booktitle = {XV Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {2000}, pages = {327{\textendash}328}, isbn = {84-600-9597-5}, author = {Broquetas, A. and Toronjo, C. and Gasull, A. and F{\'a}bregas, F.X. and Moreno, V. and Herrero, M.} } @conference {cPardas00, title = {Extraction and tracking of the eyelids}, booktitle = {International Conference on Acoustics, Speech and Signal Processing}, year = {2000}, author = {M.
Pard{\`a}s} } @book {eMarques00, title = {Introduction to interpretation of graphic images}, year = {2000}, isbn = {0-8194-2380-7}, editor = {Marqu{\'e}s, F.} } @conference {cMarques00, title = {A morphological approach for segmentation and tracking of human faces}, booktitle = {International Symposium on Image/Video Communications over Fixed and Mobile Networks}, year = {2000}, pages = {38{\textendash}43}, author = {Marqu{\'e}s, F. and Ver{\'o}nica Vilaplana} } @conference {cMarques00a, title = {A morphological approach for segmentation and tracking of human faces}, booktitle = {15th IAPR International Conference on Pattern Recognition}, year = {2000}, isbn = {0-7695-0750-6}, author = {Marqu{\'e}s, F. and Ver{\'o}nica Vilaplana} } @conference {cPardas00a, title = {A new approach to active contours for tracking}, booktitle = {IEEE International Conference on Image Processing}, year = {2000}, isbn = {0-7803-7403-7}, author = {M. Pard{\`a}s and Elisa Sayrol} } @conference {cMarcotegui00, title = {Partition-based image representation as basis for user-assisted segmentation}, booktitle = {IEEE International Conference on Image Processing}, year = {2000}, isbn = {0-7803-6300-0}, author = {Marcotegui, B. and Marqu{\'e}s, F. and Wollborn, M.} } @inbook {bSalembier00, title = {Region-based filtering of images and video sequences: a morphological viewpoint}, booktitle = {Nonlinear Image Processing}, year = {2000}, publisher = {Academic Press}, organization = {Academic Press}, edition = {S. Mitra and G. Sicuranza (Eds.)}, chapter = {9}, isbn = {0125004516}, author = {Salembier, P.} } @conference {cMaziere00, title = {Segmentation and tracking of video objects for content-based video indexing}, booktitle = {IEEE International Conference on Multimedia and Expo, ICME{\textquoteright}2000}, year = {2000}, pages = {305{\textendash}309}, address = {New York City, NY, USA}, author = {Maziere, M. and Chassaing, F. and Garrido, L. and Salembier, P.} } @conference {cSayrol00, title = {Una Aproximaci{\'o}n experimental a las Se{\~n}ales y Sistemas Anal{\'o}gicos}, booktitle = {XV Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {2000}, pages = {329{\textendash}330}, isbn = {978-972-8865-74-0}, author = {Elisa Sayrol and Gasull, A. and Moreno, A. and Salavedra, J. and Vallverdu, F.} } @conference {cSalembier00b, title = {Visual Segment Tree Creation for MPEG-7 Description Schemes}, booktitle = {IEEE International Conference on Multimedia and Expo, ICME{\textquoteright}2000 }, year = {2000}, pages = {56{\textendash}61}, address = {New York City, NY, USA}, author = {Salembier, P. and Llach, J. and Garrido, L.} } @mastersthesis {xGiro-i-Nieto00, title = {Volumetric Data Compression based on Cube-Splitting and Embedded Block Coding by Optimized Truncation}, year = {2000}, abstract = {

Many medical data acquisition devices and multispectral imaging techniques produce three-dimensional image data. These images must be stored on devices with limited space or transmitted through limited-bandwidth channels. Compression techniques are an extremely valuable tool to reduce these expensive resource requirements.

However, most compression techniques have been developed for the more popular two-dimensional images. The classical approach to 3D compression splits the volumetric image into slices and applies a two-dimensional coding technique to each slice. This is clearly inefficient, because 2D techniques only exploit the image correlation along the X and Y axes. In volumetric images a new Z axis appears, whose correlation must also be exploited to achieve the best results.

The basis for all current image and video compression standards is DCT-based coding. In these techniques, the computation is based on splitting the image into NxN blocks and transforming each block from the spatial domain into the DCT domain. Typical examples are first-generation coders, like JPEG, which produce a single, non-structured bit-stream. This technique could easily be adapted to three dimensions by splitting the volume into NxNxN blocks and applying a 3D DCT. However, one encounters two problems. First, the DCT transform is lossy, and medical practice cannot tolerate any distortion that could lead to a faulty diagnosis. Secondly, contemporary transmission techniques make use of concepts like rate, quality and resolution scalability, features that are not fully supported by DCT techniques.

Coders using a wavelet transform as a front-end are good candidates to overcome these problems. They scan the bit-planes one by one to generate a structured bit-stream. This bit-stream can be truncated to give more or less quality or resolution, and such coders are classified as second-generation coders. A typical example of 3D wavelet coding is octave zero-tree based coding [Bil99, Xio99, Kim99, Kim00, Sch00a], which currently tends to deliver the best compression performance. However, it is difficult to control the bit-stream structure since it depends on the coder{\textquoteright}s data flow.

The new image compression standard JPEG2000 uses a third-generation technique, called EBCOT, incorporating an abstract interface that enables reordering of the generated code packages. In this way a fully controllable bit-stream structure is achieved. For example, the bit-stream can be organized so that resolution or quality scalability is supported. The current verification model (VM7.0) of JPEG2000, however, does not include three-dimensional coding. The only support given for multidimensional and/or multispectral images is the possibility to execute a wavelet transform along the component axis. Unfortunately, the code supporting this feature was still buggy at the time this document was written.

Adapting this third-generation coding technique to a three-dimensional environment was the aim of this thesis. The input volume is transformed into the wavelet domain with the 3D wavelet front-end described and implemented by Schelkens et al. [Sch00a] and Barbarien [Joeri{\textquoteright}s thesis]. It is then coded by a hybrid technique combining Cube-Splitting with a JPEG2000 EBCOT module modified to support the third dimension. The Cube-Splitting module codes large zero-volumes very efficiently, while the EBCOT coder is responsible for coding the (sub)volumes containing significant samples. Hence, the implemented coder is called CS-EBCOT.
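A minimal sketch of the Cube-Splitting significance pass described above: on a given bit-plane, a cube with no significant coefficients is coded with a single bit, otherwise it is recursively split into octants. The power-of-two cube shape and the function name are simplifying assumptions, not the actual CS-EBCOT implementation.

    import numpy as np

    def cube_split(coeffs, threshold, bits):
        """Recursive significance coding of one bit-plane: an all-
        insignificant cube costs a single 0 bit; otherwise a 1 bit
        is emitted and the cube is split into eight octants."""
        if (np.abs(coeffs) < threshold).all():
            bits.append(0)
            return
        bits.append(1)
        if coeffs.size == 1:
            return
        d = coeffs.shape[0] // 2   # assumes a 2^n x 2^n x 2^n cube
        for z in (0, d):
            for y in (0, d):
                for x in (0, d):
                    cube_split(coeffs[z:z + d, y:y + d, x:x + d],
                               threshold, bits)

    # usage: a mostly-zero 8x8x8 wavelet cube is coded in few bits
    vol = np.zeros((8, 8, 8)); vol[0, 0, 0] = 100
    bits = []
    cube_split(vol, threshold=64, bits=bits)
    print(len(bits), "bits")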

}, keywords = {coding, volumetric coding}, author = {Xavier Gir{\'o}-i-Nieto} } @article {pSalembier99a, title = {Analysis of Video Sequence. Method for Defining the Structure of a Video Sequence. Part II}, number = {994026615-}, year = {1999}, type = {Invention patent}, author = {Llach, J. and Salembier, P.} } @article {pSalembier99, title = {Analysis of Video Sequence. Method for Defining the Structure of a Video Sequence. Part I}, number = {994025948-}, year = {1999}, type = {Invention patent}, author = {Llach, J. and Salembier, P.} } @conference {cLlach99, title = {Analysis of video sequences : table of contents and index creation}, booktitle = {International Workshop on Very Low Bitrate Video, VLBV{\textquoteright}99}, year = {1999}, pages = {51{\textendash}54}, address = {Kobe, Japan}, author = {Llach, J. and Salembier, P.} } @conference {cPardas99, title = {Automatic Face Analysis for Model Calibration}, booktitle = {International Workshop on Synthetic and natural hybrid coding and three dimensional imaging}, year = {1999}, pages = {12{\textendash}15}, isbn = {0-7695-0750-6}, author = {M. Pard{\`a}s} } @conference {cSalembier99, title = {The DICEMAN description schemes for still images and video sequences}, booktitle = {Workshop on Image Analysis for Multimedia Application Services, WIAMIS{\textquoteright}99}, year = {1999}, pages = {25{\textendash}34}, address = {Berlin, Germany}, isbn = {84-7653-885-5}, author = {Salembier, P. and O{\textquoteright}Connor, N. and Correa, P. and Ward, L} } @conference {cGasull99, title = {Editor gr{\'a}fico de figuras MATLAB}, booktitle = {III Congreso de Usuarios de MATLAB}, year = {1999}, pages = {219{\textendash}227}, isbn = {84-699-1358-1}, author = {Gasull, A. and Elisa Sayrol and Moreno, A. and Vallverdu, F. and Salavedra, J. and Albert Oliveras} } @conference {cTorres99, title = {Face location and recognition for video indexing in the Hypermedia project}, booktitle = {European Conference on Multimedia Applications, Services and Techniques}, year = {1999}, pages = {364{\textendash}377}, author = {Torres, L. and Marqu{\'e}s, F. and Lorente, L. and Ver{\'o}nica Vilaplana} } @inbook {aTorres99, title = {Face Location and Recognition for Video Indexing in the Hypermedia Project}, booktitle = {Lecture Notes in Computer Science}, volume = {1629}, year = {1999}, pages = {364{\textendash}377}, doi = {10.1007/3-540-48757-3_25}, author = {Torres, L. and Marqu{\'e}s, F. and Lorente, L. and Ver{\'o}nica Vilaplana} } @conference {cSalembier99a, title = {Hierarchical visual description schemes for still images and video sequences}, booktitle = {1999 IEEE International Conference on Image Processing, ICIP 1999}, year = {1999}, address = {Kobe, Japan}, author = {Salembier, P. and O{\textquoteright}Connor, N. and Correia, P. and Pereira, F.} } @conference {cMarques99a, title = {Human face segmentation and tracking using connected components and partition projection}, booktitle = {1999 IEEE International Conference on Image Processing}, year = {1999}, isbn = {0-7803-5470-2}, author = {Marqu{\'e}s, F. and Ver{\'o}nica Vilaplana and Buxes, A.} } @conference {cMarques99, title = {A morphological approach for human face segmentation and tracking}, booktitle = {Workshop on Image Analysis for Multimedia Services}, year = {1999}, pages = {41{\textendash}44}, author = {Marqu{\'e}s, F.
and Ver{\'o}nica Vilaplana and Buxes, A.} } @conference {cVidal99, title = {Non-Noticeable Information Embedding in Color Images : Marking and Detection}, booktitle = {33rd ANNUAL - 1999 International Carnahan Conference on Security Technology}, year = {1999}, pages = {293{\textendash}297}, isbn = {0-7803-5247-5}, author = {Vidal, J. and Elisa Sayrol and Cabanillas, S. and Santamaria, S.} } @conference {cSayrol99, title = {Optimum Watermark Detection in Color Images}, booktitle = {1999 IEEE International Conference on Image Processing}, year = {1999}, pages = {1{\textendash}5}, author = {Elisa Sayrol and Vidal, J. and Cabanillas, S. and Santamaria, S.} } @article {pMarques99, title = {Partition Coding Method and Device}, number = {994004364}, year = {1999}, month = {02/1999}, type = {Invention patent}, address = {International}, issn = {994004364}, author = {Marqu{\'e}s, F.} } @article {pMarques99a, title = {Partition Decoding Method and Device}, number = {994017614}, year = {1999}, month = {07/1999}, type = {Invention patent}, address = {International}, issn = {994017614}, author = {Marqu{\'e}s, F. and Gomila, C. and Gasull, A.} } @conference {cMorros99, title = {A proposal for dependent optimization in scalable region-based coding systems}, booktitle = {1999 IEEE International Conference on Image Processing}, year = {1999}, pages = {295{\textendash}299}, abstract = {

We address in this paper the problem of optimal coding in the framework of region-based video coding systems, with a special stress on content-based functionalities. We present a coding system that can provide scaled layers (using PSNR or temporal content-based scalability) such that each one has an optimal partition with optimal bit allocation among the resulting regions. This coding system is based on a dependent optimization algorithm that can provide joint optimality for a group of layers or a group of frames.
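The per-region bit allocation mentioned above can be illustrated with the textbook Lagrangian procedure: every region picks the coding option minimizing D + lambda*R, and lambda is bisected until the rate budget is met. This generic sketch does not reproduce the paper's dependent optimization across layers and frames; all names are illustrative.

    def allocate(regions, budget, iters=40):
        """regions: per-region lists of (rate, distortion) options.
        Returns one option index per region, chosen by minimizing
        D + lam * R, with lam bisected to meet the rate budget."""
        def pick(lam):
            choice = [min(range(len(opts)),
                          key=lambda i: opts[i][1] + lam * opts[i][0])
                      for opts in regions]
            rate = sum(regions[r][c][0] for r, c in enumerate(choice))
            return choice, rate

        lo, hi = 0.0, 1e6
        for _ in range(iters):
            lam = 0.5 * (lo + hi)
            choice, rate = pick(lam)
            if rate > budget:
                lo = lam    # over budget: penalize rate more strongly
            else:
                hi = lam    # feasible: try a smaller rate penalty
        return pick(hi)[0]

    # usage: two regions, options given as (rate, distortion) pairs
    regions = [[(10, 50.0), (20, 20.0), (40, 5.0)],
               [(5, 30.0), (15, 10.0)]]
    print(allocate(regions, budget=35))   # -> [1, 1], rate 35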

}, isbn = {978-1-59593-733-9}, doi = {10.1109/ICIP.1999.819598}, author = {Morros, J.R. and Marqu{\'e}s, F.} } @conference {cVilaplana99, title = {A region-based approach to face segmentation and tracking in video sequences}, booktitle = {VIII Reuni{\'o}n de Trabajo en Procesamiento de la Informaci{\'o}n y Control}, year = {1999}, pages = {345{\textendash}350}, isbn = {0-8186-8821-1}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @article {aSalembier99, title = {Region-based representations of image and video : segmentation tools for multimedia services}, journal = {IEEE transactions on circuits and systems for video technology}, volume = {9}, number = {8}, year = {1999}, pages = {1147{\textendash}1169}, issn = {1051-8215}, author = {Salembier, P. and Marqu{\'e}s, F.} } @article {xRuiz-Hidalgo99, title = {The representation of images using scale trees}, year = {1999}, institution = {University of East Anglia}, type = {Master by Research}, abstract = {

This thesis presents a new tree structure that codes the grey-scale information of an image. Based on a scale-space processor called the sieve, a scale tree represents the image in a hierarchical manner in which nodes of the tree describe features of the image at specific scales.

This representation can be used to perform different image processing operations. Filtering, segmentation or motion detection can be accomplished by parsing the tree using different attributes associated with the nodes.

}, author = {Ruiz-Hidalgo, J.} } @conference {cGarrido99, title = {Representing and retrieving regions using binary partition trees}, booktitle = {1999 IEEE International Conference on Image Processing, ICIP 1999}, year = {1999}, address = {Kobe, Japan}, isbn = {0-7803-5470-2}, author = {Garrido, L. and Salembier, P. and Casas, J.} } @conference {cSayrol99a, title = {Simulaci{\'o}n digital de se{\~n}ales y sistemas anal{\'o}gicos}, booktitle = {III Congreso de Usuarios de MATLAB}, year = {1999}, pages = {67{\textendash}76}, isbn = {84-699-1358-1}, author = {Sayrol E. and Gasull, A. and Moreno, A. and Vallverdu, F. and Salavedra, J. and Albert Oliveras} } @conference {cRuiz-Hidalgo99, title = {Towards stereo from scale-trees}, booktitle = {7th International Conference on Image Processing and its Applications}, year = {1999}, pages = {52{\textendash}56}, author = {Moravec, K. and Ruiz-Hidalgo, J. and Harvey, R. and Bangham, J.} } @inbook {bMarques99, title = {Video compression standards}, booktitle = {Electronic imaging technology}, year = {1999}, pages = {31{\textendash}64}, publisher = {SPIE Optical Engineering Press}, organization = {SPIE Optical Engineering Press}, edition = {Edward R. Dougherty (Ed.)}, isbn = {0819430374}, author = {Marqu{\'e}s, F. and Salembier, P.} } @conference {cMarcotegui99, title = {A video generation tool allowing friendly user interaction}, booktitle = {1999 IEEE International Conference on Image Processing}, year = {1999}, isbn = {0-7803-5470-2}, author = {Marcotegui, B. and Correia, P. and Marqu{\'e}s, F. and Mech, R. and Rosa, R. and Wollborn, M. and Zanoguera, F.} } @article {aSalembier98, title = {Antiextensive connected operators for image and sequence processing}, journal = {IEEE transactions on image processing}, volume = {7}, number = {4}, year = {1998}, pages = {555{\textendash}570}, abstract = {

This paper deals with a class of morphological operators called connected operators. These operators filter the signal by merging its flat zones. As a result, they do not create any new contours and are very attractive for filtering tasks where the contour information has to be preserved. This paper shows that connected operators work implicitly on a structured representation of the image made of flat zones. The max-tree is proposed as a suitable and efficient structure to deal with the processing steps involved in antiextensive connected operators. A formal definition of the various processing steps involved in the operator is proposed and, as a result, several lines of generalization are developed. First, the notion of connectivity and its definition are analyzed. Several modifications of the traditional approach are presented. They lead to connected operators that are able to deal with texture. They also allow the definition of connected operators with less leakage than the classical ones. Second, a set of simplification criteria are proposed and discussed. They lead to simplicity-, entropy-, and motion-oriented operators. The problem of using a nonincreasing criterion is analyzed. Its solution is formulated as an optimization problem that can be very efficiently solved by a Viterbi (1979) algorithm. Finally, several implementation issues are discussed showing that these operators can be very efficiently implemented.
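As a concrete illustration of an anti-extensive connected operator of the kind analyzed in the paper, the sketch below computes a grayscale area opening directly from its level-set definition. This naive formulation is for clarity only; it is not the efficient max-tree implementation proposed by the authors.

    import numpy as np
    from scipy import ndimage

    def area_opening(f, area):
        """Naive grayscale area opening: out(x) is the highest
        threshold t such that x belongs to a connected component
        of {f >= t} with at least `area` pixels. Anti-extensive
        and connected: flat zones merge, no new contours appear."""
        out = np.full(f.shape, f.min(), dtype=f.dtype)
        for t in np.unique(f):
            labels, n = ndimage.label(f >= t)
            sizes = ndimage.sum(f >= t, labels, index=np.arange(1, n + 1))
            big = np.flatnonzero(sizes >= area) + 1   # surviving labels
            out[np.isin(labels, big)] = t
        return out

    # usage: the 1-pixel bright detail is removed, the larger kept
    img = np.zeros((5, 5), dtype=int)
    img[1, 1] = 9                    # small bright component
    img[3:5, 2:5] = 7                # 6-pixel bright component
    print(area_opening(img, area=4))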

}, issn = {1057-7149}, doi = {10.1109/83.663500}, url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=663500}, author = {Salembier, P. and Albert Oliveras and Garrido, L.} } @conference {cVidal98, title = {Autentificacion digital de imagen, marcado y detecci{\'o}n {\'o}ptimos}, booktitle = {XIII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1998}, pages = {105{\textendash}106}, isbn = {84-89654-12-3}, author = {Vidal, J. and Elisa Sayrol and Lagunas, M.} } @conference {cSalembier98, title = {Auto-dual connected operators based on iterative merging algorithms}, booktitle = {International Symposium on Mathematical Morphology and its applications to image and signal processing, ISMM 1998}, year = {1998}, pages = {183{\textendash}190}, address = {Amsterdam, The Netherlands}, isbn = {1522-4880}, author = {Salembier, P. and Garrido, L. and Garc{\'\i}a, D.} } @conference {cSalembier98b, title = {Binary partition tree as an efficient representation for filtering, segmentation and information retrieval}, booktitle = {IEEE International Conference on Image Processing, ICIP 1998}, year = {1998}, address = {Chicago (IL), USA}, author = {Salembier, P. and Garrido, L.} } @inbook {bTorres98, title = {Codificaci{\'o}n de imagen}, booktitle = {Reconocimiento de formas y an{\'a}lisis de im{\'a}genes}, year = {1998}, pages = {101{\textendash}135}, publisher = {AERFAI}, organization = {AERFAI}, edition = {A. Sanfeliu (Ed.)}, isbn = {84-922529-4-4}, author = {Torres, L. and Salembier, P.} } @conference {cSalembier98a, title = {Connected operators for sprite creation and layered representation of image sequences}, booktitle = {9th European Signal Processing Conference, EUSIPCO 1998}, year = {1998}, pages = {2105{\textendash}2108}, address = {Rhodes, Greece}, isbn = {978-1-4503-0159-6}, author = {Salembier, P. and Pujol, O. and Garrido, L.} } @article {aGarrido98, title = {Extensive Operators in Partition Lattices for Image Sequence Analysis}, journal = {Signal processing}, volume = {66}, number = {2}, year = {1998}, pages = {157{\textendash}180}, issn = {0165-1684}, author = {Garrido, L. and Salembier, P. and Garc{\'\i}a, D.} } @conference {cVilaplana98, title = {Face segmentation using connected operators}, booktitle = {Mathematical Morphology and its applications to image and signal processing}, year = {1998}, pages = {207{\textendash}214}, isbn = {0 7923 5133 9}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @conference {cVidal98a, title = {Optimum Watermark Detection and Embedding in Digital Images}, booktitle = {1998 IEEE Second Workshop on Multimedia Signal Processing}, year = {1998}, pages = {285{\textendash}290}, isbn = {0-7803-4919-9}, author = {Vidal, J. and Elisa Sayrol} } @article {aMarques98, title = {Prediction of image partitions using fourier descriptors : application to segmentation-based coding schemes}, journal = {IEEE transactions on image processing}, volume = {7}, number = {4}, year = {1998}, pages = {529{\textendash}542}, issn = {1057-7149}, author = {Marqu{\'e}s, F. and Llorens, B. and Gasull, A.} } @conference {cGarrido98, title = {Region-based analysis of video sequences with a general merging algorithm}, booktitle = {9th European Signal Processing Conference, EUSIPCO 1998}, year = {1998}, pages = {1693{\textendash}1696}, address = {Rhodes, Greece}, isbn = {960-7620-06-4}, author = {Garrido, L.
and Salembier, P.} } @conference {cVilaplana98, title = {Region-based segmentation and tracking of human faces}, booktitle = {9th European Signal Processing Conference, EUSIPCO 1998}, year = {1998}, pages = {311{\textendash}314}, address = {Rhodes, Greece}, isbn = {960-7620-06-4}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F. and Salembier, P. and Garrido, L.} } @conference {cRuiz-Hidalgo98a, title = {Robust morphological scale-space trees}, booktitle = {Noblesse Workshop on Non-Linear Model Based Image Analysis}, year = {1998}, month = {07/1998}, pages = {133{\textendash}139}, author = {Ruiz-Hidalgo, J. and Bangham, J. and Harvey, R.} } @conference {cRuiz-Hidalgo98, title = {The segmentation of images via scale-space trees}, booktitle = {British Machine Vision Conference}, year = {1998}, month = {09/1998}, pages = {33{\textendash}43}, address = {Southampton, UK}, abstract = {

A useful representation of an image would be an object tree in which nodes represent objects, or parts of objects, and which includes at least one node that, together with its children, represents each object: a grandmother node. It is shown that scale-trees, obtained from greyscale images, approximate such a tree. It is then shown how they may be modified using other attributes to more closely become object trees. The result is a data structure that provides {\textquotedblleft}handles{\textquotedblright} for every element of the image that can be used for manipulating the image. This segmentation has potential for object recognition.

}, author = {Bangham, J. and Ruiz-Hidalgo, J. and Harvey, R. and Cawley, G.} } @conference {cPardas98, title = {Video Object Segmentation introducing depth and motion information}, booktitle = {IEEE International Conference on Image Processing}, year = {1998}, isbn = {-}, author = {M. Pard{\`a}s} } @article {aMarcotegui97a, title = {Allowing content-based functionalities in segmentation-based coding schemes}, journal = {Annales des t{\'e}lecommunications. Annals of telecommunications}, volume = {52}, number = {7-8}, year = {1997}, pages = {398{\textendash}407}, issn = {0003-4347}, author = {Marcotegui, B. and Marqu{\'e}s, F. and Meyer, F.} } @conference {cGarrido97a, title = {Anti-extensive Connected Operators with Application to Image Sequences}, booktitle = {VII Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1997}, pages = {151{\textendash}156}, address = {Barcelona, Spain}, isbn = {84-922529-0-1}, author = {Garrido, L. and Salembier, P. and Albert Oliveras} } @phdthesis {dOliveras97, title = {Contribuci{\'o} a l{\textquoteright}an{\`a}lisi morfol{\`o}gica d{\textquoteright}imatges amb operadors connexes}, year = {1997}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Albert Oliveras}, editor = {Salembier, P.} } @article {aMarques97, title = {General requirements for coding oriented segmentation of video sequences}, journal = {Annales des t{\'e}lecommunications. Annals of telecommunications}, volume = {52}, number = {7-8}, year = {1997}, pages = {359{\textendash}366}, issn = {0003-4347}, author = {Marqu{\'e}s, F. and Meyer, F. and M. Pard{\`a}s and Salembier, P.} } @conference {cGarrido97b, title = {A hierarchical technique for image sequence analysis}, booktitle = {Workshop on Image Analysis for Multimedia Application Services, WIAMIS{\textquoteright}97}, year = {1997}, pages = {13{\textendash}20}, address = {Louvain-la-Neuve, Belgium}, author = {Garrido, L. and Marqu{\'e}s, F. and M. Pard{\`a}s and Salembier, P. and Ver{\'o}nica Vilaplana} } @article {pMarques97, title = {Image Segmentation and Object Tracking Method and Corresponding System}, number = {974025587}, year = {1997}, month = {10/1997}, type = {Invention patent}, address = {International}, issn = {974025587}, author = {Marqu{\'e}s, F. and Molina, C.} } @conference {cSalembier97, title = {Image sequence analysis and merging algorithms}, booktitle = {International Workshop on Very Low Bit-rate Video, VLBV{\textquoteright}97}, year = {1997}, pages = {1{\textendash}8}, address = {Link{\"o}ping, Sweden}, isbn = {0-7803-9752-5}, author = {Salembier, P. and Garrido, L. and Garc{\'\i}a, D.} } @article {aMeyer97, title = {Morphological tools for segmentation: connected operators and watersheds}, journal = {Annales des t{\'e}lecommunications. Annals of telecommunications}, volume = {52}, number = {7-8}, year = {1997}, pages = {366{\textendash}379}, issn = {0003-4347}, author = {Meyer, F. and Albert Oliveras and Salembier, P. and Vachier, C.} } @conference {cGarrido97, title = {Motion analysis of image sequences using connected operators}, booktitle = {SPIE Visual Communications and Image Processing, VCIP{\textquoteright}97}, year = {1997}, pages = {546{\textendash}557}, address = {San Jose, CA, USA}, isbn = {0-8194-2435-8}, author = {Garrido, L. 
and Albert Oliveras and Salembier, P.} } @article {aPardas97, title = {Object-based image coding}, journal = {Vistas in astronomy}, volume = {41}, number = {3}, year = {1997}, pages = {455{\textendash}461}, issn = {0083-6656}, author = {M. Pard{\`a}s} } @article {aCasas97, title = {A region-based subband coding scheme}, journal = {Signal Processing: Image Communication}, volume = {10}, number = {1-2}, year = {1997}, month = {10/1997}, pages = {173{\textendash}200}, abstract = {

This paper describes a region-based subband coding scheme intended for efficient representation of the visual information contained in image regions of arbitrary shape. QMF filters are separately applied inside each region for the analysis and synthesis stages, using a signal-adaptive symmetric extension technique at region borders. The frequency coefficients corresponding to each region are identified over the various subbands of the decomposition, so that the coding steps {\textemdash} namely, bit allocation, quantization and entropy coding {\textemdash} can be performed independently for each region. Region-based subband coding exploits the possible homogeneity of the region contents by distributing the available bitrate not only in the frequency domain but also in the spatial domain, i.e. among the considered regions. The number of bits assigned to the subbands is optimized region by region for the whole image, by means of a rate-distortion optimization algorithm. Improved compression efficiency is obtained thanks to the local adaptivity of the bit allocation to the spectral contents of the different regions. This compensates for the overhead data spent in the coding of contour information. As the subband coefficients obtained for each region are coded as separate data units, the content-based functionalities required for the future MPEG4 video coding standard can be readily handled. For instance, content-based scalability is possible by simply imposing user-defined constraints on the bit assignment in some regions.
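The border handling can be illustrated in one dimension: the samples of a region are symmetrically extended before the analysis filters are applied, so that no samples outside the region are read. The 2-tap Haar pair below is a stand-in for the QMF bank actually used in the paper; names and details are illustrative.

    import numpy as np

    def region_haar_analysis(samples):
        """One-level Haar analysis of a region's 1-D samples, with
        a symmetric extension at the region border when the region
        length is odd (no samples outside the region are read)."""
        x = np.asarray(samples, dtype=float)
        if x.size % 2:
            x = np.append(x, x[-1])   # symmetric (mirror) extension
        lo = (x[0::2] + x[1::2]) / np.sqrt(2.0)   # approximation band
        hi = (x[0::2] - x[1::2]) / np.sqrt(2.0)   # detail band
        return lo, hi

    # usage: a 5-sample region is extended to 6 samples, then filtered
    lo, hi = region_haar_analysis([4, 4, 5, 7, 7])
    print(lo, hi)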

}, keywords = {SCHEMA}, issn = {0923-5965}, doi = {10.1016/S0923-5965(97)00024-6}, author = {Casas, J. and Torres, L.} } @conference {cPardas97, title = {Relative depth estimation and segmentation in monocular sequences}, booktitle = {1997 PICTURE CODING SYMPOSIUM}, year = {1997}, pages = {367{\textendash}372}, isbn = {0 7923 7862 8}, author = {M. Pard{\`a}s} } @conference {cSalembier97a, title = {Robust motion estimation using connected operators}, booktitle = {IEEE International Conference on Image Processing, ICIP{\textquoteright}97}, year = {1997}, pages = {77{\textendash}80}, address = {Santa Barbara, USA}, isbn = {1522-4880}, author = {Salembier, P. and Sanson, H.} } @conference {cMorros97, title = {Scalable segmentation-based coding of video sequences addressing content-based functionalities}, booktitle = {IEEE International Conference on Image Processing}, year = {1997}, month = {10/1997}, pages = {1-4}, publisher = {IEEE}, organization = {IEEE}, address = {Santa Barbara, USA}, abstract = {

In this paper, we address video scalability in the framework of a region-based coding system, allowing content-based functionalities. The proposed algorithm can construct scaled layers from a video sequence, each one with either fixed bit-rate or fixed quality, allowing content-based manipulation. Two modes of operation have been defined: a supervised mode, which allows the user to select the objects to be coded in the enhancement layer, and an unsupervised mode, where this selection is done by the algorithm itself.

}, doi = {10.1109/ICIP.1997.638658}, author = {Morros, J.R. and Marqu{\'e}s, F.} } @article {aMarcotegui97, title = {Segmentation of video sequences and rate control}, journal = {Annales des t{\'e}lecommunications. Annals of telecommunications}, volume = {52}, number = {7-8}, year = {1997}, pages = {380{\textendash}388}, issn = {0003-4347}, author = {Marcotegui, B. and Marqu{\'e}s, F. and Morros, J.R. and M. Pard{\`a}s and Salembier, P.} } @article {aPardas97a, title = {Segmentation of video sequences for partition tree generation}, journal = {Annales des t{\'e}lecommunications. Annals of telecommunications}, volume = {52}, number = {7-8}, year = {1997}, pages = {389{\textendash}396}, issn = {0003-4347}, author = {M. Pard{\`a}s and Salembier, P.} } @article {aSalembier97, title = {Segmentation-based video coding system allowing the manipulation of objects}, journal = {IEEE transactions on circuits and systems for video technology}, volume = {7}, number = {1}, year = {1997}, pages = {60{\textendash}74}, issn = {1051-8215}, doi = {10.1109/76.554418}, author = {Salembier, P. and Marqu{\'e}s, F. and M. Pard{\`a}s and Morros, J.R. and Corset, I. and Jeannin, S. and Bouchard, L. and Meyer, F. and Marcotegui, B.} } @conference {cOliveras97, title = {Stereo image analysis using connected operators}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 1997}, year = {1997}, pages = {3169{\textendash}3172}, address = {Munich, Germany}, isbn = {978-989-8109-05-7}, author = {Albert Oliveras and Garrido, L. and Salembier, P.} } @article {aTorres97, title = {Stochastic vector quantization of images}, journal = {Signal Processing}, volume = {62}, number = {3}, year = {1997}, month = {11/1997}, pages = {291{\textendash}301}, abstract = {

One of the most important steps in the vector quantization of images is the design of the codebook. The codebook is generally designed using the LBG algorithm, which is in essence a clustering algorithm that uses a large training set of empirical data statistically representative of the image to be quantized. The LBG algorithm, although quite effective for practical applications, is computationally very expensive, and the resulting codebook has to be recalculated each time the type of image to be encoded changes. An alternative approach to the generation of the codebook, called stochastic vector quantization, is presented in this paper. Stochastic vector quantization (SVQ) is based on the generation of the codebook according to a model defined beforehand for the image to be encoded. The well-known AR model has been used to model the image in the current implementations of the technique, and has shown good performance in the overall scheme. To show the merit of the technique in different contexts, stochastic vector quantization is discussed and applied to both pixel-based and segmentation-based image coding schemes.
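A minimal sketch of the stochastic codebook idea: instead of clustering empirical training data with LBG, codewords are drawn directly from a source model, here a first-order AR process. The AR(1) choice and all parameter names are illustrative assumptions rather than the paper's exact formulation.

    import numpy as np

    rng = np.random.default_rng(0)

    def svq_codebook(rho, sigma, dim, size):
        """Draw `size` codewords of length `dim` from an AR(1)
        model x[n] = rho * x[n-1] + e[n], e ~ N(0, sigma^2),
        instead of training on empirical data as LBG does."""
        e = rng.normal(0.0, sigma, (size, dim))
        cw = np.empty((size, dim))
        cw[:, 0] = e[:, 0]
        for n in range(1, dim):
            cw[:, n] = rho * cw[:, n - 1] + e[:, n]
        return cw

    def quantize(vectors, codebook):
        # nearest-codeword assignment under squared Euclidean distance
        d2 = ((vectors[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
        return d2.argmin(axis=1)

    # usage: quantize 4-sample blocks of a correlated source
    codebook = svq_codebook(rho=0.95, sigma=1.0, dim=4, size=256)
    blocks = svq_codebook(rho=0.95, sigma=1.0, dim=4, size=10)  # stand-in data
    print(quantize(blocks, codebook))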

}, issn = {0165-1684}, doi = {10.1016/S0165-1684(97)00130-8}, author = {Torres, L. and Casas, J. and Arias, E.} } @article {pPardas97, title = {Video coding method and corresponding coding and decoding systems}, number = {9693276.5-}, year = {1997}, type = {Invention patent}, author = {M. Pard{\`a}s and Salembier, P. and Ayuso, X. and Mart{\'\i}, E.} } @conference {cSalembier96d, title = {Active mesh coding and rate distortion theory}, booktitle = {IEEE International Conference on Image Processing, ICIP 1996}, year = {1996}, address = {Lausanne, Switzerland}, isbn = {0-7803-7043-0}, author = {Salembier, P. and Mart{\'\i}, E. and M. Pard{\`a}s} } @inbook {bSalembier96, title = {Coding of partition sequences}, booktitle = {Video coding: the second generation approach}, year = {1996}, pages = {125{\textendash}170}, publisher = {Kluwer}, organization = {Kluwer}, edition = {L. Torres and M. Kunt (Eds.)}, isbn = {0 7923 9680 4}, author = {Salembier, P. and Marqu{\'e}s, F. and Gasull, A.} } @inbook {bMarques96, title = {Coding-oriented segmentation of video sequences}, booktitle = {Video coding: the second generation approach}, year = {1996}, pages = {79{\textendash}124}, publisher = {Kluwer}, organization = {Kluwer}, edition = {L. Torres and M. Kunt (Eds.)}, isbn = {0 7923 9680 4}, author = {Marqu{\'e}s, F. and M. Pard{\`a}s and Salembier, P.} } @conference {cCasas96, title = {A feature-based subband coding schene}, booktitle = {THE 1996 IEEE INTERNATIONAL CONFERENCE ON ACOUSTICS, SPEECH \& SIGNAL PROCESSING}, year = {1996}, pages = {2357{\textendash}2360}, isbn = {0-7803-3192-3}, author = {Casas, J. and Torres, L.} } @article {pOliveras96, title = {Filtering Method and Corresponding Filtering System}, number = {96402925.0-}, year = {1996}, type = {Invention patent}, author = {Albert Oliveras and Salembier, P. and Garrido, L.} } @conference {cOliveras96, title = {Generalized connected operators}, booktitle = {SPIE Visual Communication and Image Processing, VCIP{\textquoteright}96}, year = {1996}, pages = {2727{\textendash}2771}, address = {Orlando, Florida, USA}, author = {Albert Oliveras and Salembier, P.} } @phdthesis {dCasas96, title = {Image compression based on perceptual coding techniques}, volume = {PhD}, year = {1996}, month = {03/1996}, pages = {178}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, address = {download link}, abstract = {

This thesis studies image and video sequence coding methods from the point of view of the way the human visual system perceives and understands visual information. The relevance of such a study is due, on the one hand, to the important role that visual signals have in our civilization and, on the other hand, to the problem of representing the large amount of data that image and video processing systems have to deal with. Three different approaches have been investigated for the coding of image textures in an advanced compression scheme relying on aspects of visual perception. The first approach is based on image transitions and the interpolation of smooth areas from such transitions. The second one considers the extraction, selection and coding of meaningful image details. Finally, the third approach studies the efficient representation of homogeneous fine textures that give a natural appearance to the reconstructed images at high compression levels. In order to apply these techniques to still image and video coding, a three-component model of the image, matching the perceptual properties of human vision, is put forward. The coding approaches under research have led to the design of three new image analysis and coding techniques, developed using non-linear tools from the framework of Mathematical Morphology: a "morphological" image interpolation method aimed at the problem of scattered data interpolation; an empirical subjective criterion for the ranking and selection of image details according to visual perception; and the application of a conventional image coding technique, subband coding, to the coding of arbitrarily shaped image regions (region-based subband coding). These are new texture coding techniques in the field of object-oriented and Second Generation image and video coding schemes. Furthermore, the model of the image that has been investigated follows the line of the latest proposals in the framework of MPEG4, the forthcoming coding standard for low bit-rate visual communications, which considers the possibility of content-based manipulation and coding of visual information.

The PhD thesis report can be downloaded from http://hdl.handle.net/10803/6920

}, url = {http://hdl.handle.net/10803/6920}, author = {Casas, J.}, editor = {Torres, L.} } @conference {cCasas96b, title = {Morphological Interpolation for Image Coding}, booktitle = {ICAOS {\textquoteright}96: 12th International Conference on Analysis and Optimization of Systems Images, Wavelets and PDEs }, year = {1996}, month = {06/1996}, pages = {295{\textendash}304}, address = {Paris}, author = {Casas, J.} } @inbook {bCasas96, title = {Morphological Interpolation for Image Coding}, booktitle = {Images, Wavelets and PDEs}, volume = {219}, number = {Lecture Notes in Control and Information Sciences}, year = {1996}, pages = {295{\textendash}304}, publisher = {Springer}, organization = {Springer}, address = {Berlin / Heidelberg}, abstract = {

The aim of this paper is to present a new interpolation technique intended for spatial interpolation from sparse data sets. The proposed implementation, based on non-linear morphological operators, outperforms linear interpolation by means of diffusion processes, performing iterative space-variant filtering on the initial image. Morphological interpolation is applied to sketch-based image coding. We put forward a perceptually motivated two-component image model that strongly relies on morphological operators. The watershed is used to detect strong edge features in the first component of the model. The smooth areas of the image are recovered from the extracted edge information by morphological interpolation. The residual component, containing fine textures, is separately coded by a subband coding scheme.
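A crude sketch of the interpolation idea under a strong simplification: unknown pixels are filled by iterating the mean of a grayscale dilation and an erosion while the known sparse samples are clamped back after every pass. This stand-in illustrates the iterative, space-variant character of the method; it is not the exact operator proposed in the paper.

    import numpy as np
    from scipy import ndimage

    def morph_interpolate(values, known, n_iter=100):
        """Fill the unknown pixels of `values` (a 2-D array) from
        the samples flagged in the boolean mask `known` by
        iterating the mean of a grayscale dilation and erosion,
        re-imposing the known samples after each pass."""
        f = np.where(known, values, values[known].mean())
        for _ in range(n_iter):
            dil = ndimage.grey_dilation(f, size=(3, 3))
            ero = ndimage.grey_erosion(f, size=(3, 3))
            f = 0.5 * (dil + ero)       # smooth, space-variant step
            f[known] = values[known]    # clamp the sparse data
        return f

    # usage: interpolate a full ramp image from a sparse sample
    rng = np.random.default_rng(1)
    img = np.add.outer(np.linspace(0, 1, 32), np.linspace(0, 1, 32))
    mask = rng.random(img.shape) < 0.05
    print(morph_interpolate(img, mask).shape)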

}, isbn = {978-3-540-76076-4}, doi = {10.1007/3-540-76076-8_142}, author = {Casas, J.} } @article {aSalembier96, title = {Morphological operators for image and video compression}, journal = {IEEE transactions on image processing}, volume = {5}, number = {6}, year = {1996}, month = {06/1996}, pages = {881{\textendash}898}, abstract = {

This paper deals with the use of some morphological tools for image and video coding. Mathematical morphology can be considered as a shape-oriented approach to signal processing, and some of its features make it very useful for compression. Rather than describing a coding algorithm, the purpose of this paper is to describe some morphological tools that have proved attractive for compression. Four sets of morphological transformations are presented: connected operators, the region-growing version of the watershed, the geodesic skeleton, and a morphological interpolation technique. The authors discuss their implementation, and show how they can be used for image and video segmentation, contour coding, and texture coding.
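As one concrete example among the tools listed above, a region-growing watershed can be computed on the morphological gradient of the image, starting from a set of markers. The sketch below uses SciPy's IFT-based watershed as a readily available stand-in; the marker choice and all names are illustrative.

    import numpy as np
    from scipy import ndimage

    def watershed_segment(image, marker_mask):
        """Region-growing watershed on the morphological gradient:
        regions grow from the labeled markers and stop where the
        fronts of different markers meet (the contour positions)."""
        grad = ndimage.morphological_gradient(image.astype(float), size=(3, 3))
        grad = (255 * grad / (grad.max() + 1e-9)).astype(np.uint8)
        markers, _ = ndimage.label(marker_mask)
        return ndimage.watershed_ift(grad, markers)

    # usage: two flat zones separated by a step edge, one marker each
    img = np.zeros((8, 8)); img[:, 4:] = 10
    marks = np.zeros((8, 8), dtype=bool)
    marks[4, 1] = marks[4, 6] = True
    print(watershed_segment(img, marks))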

}, issn = {1057-7149}, doi = {10.1109/83.503906}, author = {Salembier, P. and Brigger, P. and Casas, J. and M. Pard{\`a}s} } @conference {cSalembier96e, title = {Morphological operators for very low bit rate video coding}, booktitle = {IEEE International Conference on Image Processing, ICIP 1996}, year = {1996}, pages = {659{\textendash}662}, address = {Lausanne, Switzerland}, isbn = {0-8194-5654-3}, author = {Salembier, P. and Meyer, F. and Brigger, P. and Bouchard, L.} } @conference {cSalembier96, title = {Motion compensated partition coding}, booktitle = {SPIE Visual Communication and Image Processing, VCIP{\textquoteright}96}, year = {1996}, pages = {403{\textendash}415}, address = {Orlando, Florida, USA}, isbn = {84-699-1359-1}, author = {Salembier, P.} } @conference {cSalembier96b, title = {Motion connected operators for image sequences}, booktitle = {VIII European Signal Processing Conference, EUSIPCO{\textquoteright}96}, year = {1996}, pages = {1083{\textendash}1086}, address = {Trieste, Italy}, isbn = {84-600-9597-5}, author = {Salembier, P. and Albert Oliveras and Garrido, L.} } @article {aSayrol96, title = {Motion estimation using higher-order statistics}, journal = {IEEE transactions on image processing}, volume = {5}, number = {6}, year = {1996}, pages = {1077{\textendash}1084}, issn = {1057-7149}, author = {Elisa Sayrol and Gasull, A. and R. Fonollosa, Javier} } @conference {cMarques96a, title = {Partition coding using multigrid chain code and motion compensation}, booktitle = {IEEE INTERNATIONAL CONFERENCE ON IMAGE PROCESSING}, year = {1996}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cPardas96, title = {Partition tree for a segmentation-based video coding system}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 1996}, year = {1996}, pages = {1982{\textendash}1985}, address = {Atlanta (GA), USA}, isbn = {-}, author = {M. Pard{\`a}s and Salembier, P. and Marqu{\'e}s, F. and Morros, J.R.} } @conference {cOliveras96b, title = {Practical extensions of connected operators}, booktitle = {International Symposium on Mathematical Morphology, ISMM 1996}, year = {1996}, pages = {97{\textendash}110}, address = {Atlanta (GA), USA}, author = {Salembier, P. and Albert Oliveras} } @conference {cTorres96, title = {Prediction error image coding using a modified stochastic vector quantization scheme}, booktitle = {IEEE International Conference on Image Processing}, year = {1996}, pages = {451{\textendash}454}, author = {Torres, L. and Casas, J.} } @conference {cMarques96, title = {A segmentation-based coding system allowing manipulation of objects}, booktitle = {IEEE International Conference on Image Processing, ICIP 1996}, year = {1996}, pages = {145{\textendash}175}, address = {Lausanne, Switzerland}, abstract = {

We present a coding scheme that achieves, for each image in the sequence, the best segmentation in terms of rate-distortion theory. It is obtained from a set of initial regions and a set of available coding techniques. The segmentation combines spatial and motion criteria: it selects, in each area of the image, the most adequate criterion for defining a partition, in order to obtain the best compromise between cost and quality. In addition, the proposed scheme is very suitable for addressing content-based functionalities.

}, isbn = {0-7803-3258-X}, doi = {10.1109/ICIP.1996.560741}, author = {Marqu{\'e}s, F. and Salembier, P. and M. Pard{\`a}s and Morros, J.R. and Corset, I. and Jeannin, S. and Marcotegui, B. and Meyer, F.} } @conference {cMarques96b, title = {A segmentation-based coding system allowing manipulation of objects (sesame)}, booktitle = {IEEE INTERNATIONAL CONFERENCE ON IMAGE PROCESSING}, year = {1996}, author = {Marqu{\'e}s, F.} } @conference {cSalembier96c, title = {Segmentation-based video coding: temporal links and rate control}, booktitle = {VIII European Signal Processing Conference, EUSIPCO{\textquoteright}96}, year = {1996}, pages = {455{\textendash}458}, address = {Trieste, Italy}, author = {Salembier, P. and Marqu{\'e}s, F. and M. Pard{\`a}s} } @article {pSalembier96a, title = {Segmented Picture Coding Method and System and Corresponding Decoding Method and System}, number = {964009161-}, year = {1996}, type = {Invention patent}, author = {Salembier, P. and Marqu{\'e}s, F. and Corset, I. and Bouchard, L. and Jeannin, S. and M. Pard{\`a}s and Morros, J.R. and Meyer, F. and Marcotegui, B.} } @conference {cCasas96a, title = {Strong edge features for image coding}, booktitle = {International Symposium on Mathematical Morphology and its Applications to Image and Signal Processing III}, year = {1996}, pages = {443{\textendash}450}, isbn = {0-7923-9733-9}, author = {Casas, J. and Torres, L.} } @inbook {bCasas96c, title = {Strong edge features for image coding}, booktitle = {Mathematical Morphology and its Applications to Image and Signal Processing}, volume = {5}, year = {1996}, pages = {443{\textendash}450}, publisher = {Springer}, organization = {Springer}, address = {Boston}, abstract = {

A two-component model is proposed for perceptual image coding. For the first component of the model, the watershed operator is used to detect strong edge features. An efficient morphological interpolation algorithm then reconstructs the smooth areas of the image from the extracted edge information, also known as sketch data. The residual component, containing fine textures, is coded separately by a subband coding scheme. The morphological operators involved in the coding of the primary component perform very efficiently compared to conventional techniques, such as the LoG operator used for edge extraction or the diffusion filters applied iteratively for the interpolation of smooth areas in previously reported sketch-based coding schemes.
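
The structure of the two-component model can be sketched in a few lines (our illustration: a Gaussian low-pass stands in for the edge-driven morphological interpolation of the primary component):

\begin{verbatim}
# Two-component decomposition: primary (smooth, sketch-driven) + residual
# (fine textures). Gaussian smoothing is a stand-in for the paper's
# morphological interpolation from watershed edges; the residual would be
# handed to the subband coder.
import numpy as np
from scipy.ndimage import gaussian_filter

image = np.random.rand(128, 128)           # stand-in image
primary = gaussian_filter(image, sigma=3)  # smooth component
residual = image - primary                 # fine textures, coded separately
assert np.allclose(primary + residual, image)  # decomposition is exact
\end{verbatim}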

}, isbn = {0-7923-9733-9}, doi = {10.1007/978-1-4613-0469-2_52}, author = {Casas, J. and Torres, L.}, editor = {Schafer, R. W. and Maragos, P. and Butt, M. A.} } @conference {cSalembier96a, title = {Very low rate video coding using active triangular mesh}, booktitle = {IEEE International Conference on Acoustics, Speech \& Signal Processing, ICASSP 1996}, year = {1996}, pages = {97{\textendash}110}, address = {Atlanta (GA), USA}, isbn = {84-699-1358-1}, author = {Salembier, P. and Ayuso, X.} } @conference {cMorros96, title = {Video sequence segmentation based on rate-distortion theory}, booktitle = {SPIE Visual Communication and Image Processing, VCIP{\textquoteright}96}, year = {1996}, month = {02/1996}, publisher = {Proc. SPIE 2727, 1185}, organization = {Proc. SPIE 2727, 1185}, address = {Orlando, Florida, USA}, abstract = {
This paper describes a coding-oriented segmentation technique for video coding schemes, using an optimization strategy to address the problem of bit allocation. The optimization is based on rate-distortion theory. Our purpose is to define a method to obtain an {\textquoteright}optimal{\textquoteright} partition, together with the best coding technique for each region of this partition, so that the result is optimal in a rate-distortion sense.
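
In code, the joint choice of partition and coding technique reduces, for a fixed region set and multiplier, to a per-region minimization of the Lagrangian cost; the technique names and cost numbers below are hypothetical:

\begin{verbatim}
# Rate-distortion driven technique selection per region. `regions` maps a
# region to (rate in bits, distortion) pairs for each candidate technique;
# all values are illustrative.
LAMBDA = 0.1  # trade-off between distortion and rate

regions = {
    "region_0": {"polynomial": (120, 4.0), "motion_comp": (40, 6.5)},
    "region_1": {"polynomial": (300, 2.1), "motion_comp": (90, 2.3)},
}

def best_technique(options, lam):
    # Minimize the Lagrangian cost D + lambda * R over techniques.
    return min(options, key=lambda t: options[t][1] + lam * options[t][0])

choice = {r: best_technique(opts, LAMBDA) for r, opts in regions.items()}
\end{verbatim}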

}, author = {Morros, J.R. and Marqu{\'e}s, F. and M. Pard{\`a}s and Salembier, P.} } @conference {cMarques95, title = {Coding of image partitions by morphological skeleton using overlapping structuring elements}, booktitle = {IEEE WORKSHOP ON NONLINEAR SIGNAL AND IMAGE PROCESSING}, year = {1995}, pages = {250{\textendash}253}, author = {Marqu{\'e}s, F. and Fioravanti, S. and Brigger, P.} } @conference {cTorres95a, title = {An efficient technique of texture representation in segmentation-based image coding schemes}, booktitle = {IEEE INTERNATIONAL CONFERENCE ON IMAGE PROCESSING.}, year = {1995}, pages = {588{\textendash}591}, author = {Torres, L. and Casas, J. and Campins, J.} } @conference {cSayrol95a, title = {Estimation of motion parameters using HOS}, booktitle = {IEEE SIGNAL PROCESSING/ATHOS WORKSHOP ON HIGHER-ORDER STATISTICS.}, year = {1995}, pages = {262{\textendash}265}, isbn = {1522-4880}, author = {Elisa Sayrol and Gasull, A. and R. Fonollosa, Javier} } @article {aSalembier95a, title = {Flat zones filtering, connected operators and filters by reconstruction}, journal = {IEEE transactions on image processing}, volume = {3}, number = {8}, year = {1995}, pages = {1153{\textendash}1160}, issn = {1057-7149}, author = {Salembier, P. and Serra, J.} } @conference {cSayrol95, title = {Fourth-order statistics cost functions: applications to time delay estimation and image motion estimation}, booktitle = {VI SPANISH SYMPOSIUM ON PATTERN RECOGNITION AND IMAGE ANALYSIS}, year = {1995}, pages = {543{\textendash}548}, isbn = {978-1-4244-9564-1}, author = {Elisa Sayrol and Gasull, A. and R. Fonollosa, Javier} } @article {aSayrol95, title = {Image Restoration using the W-Slice Method}, journal = {IEEE transactions on image processing}, volume = {4}, number = {4}, year = {1995}, pages = {1174{\textendash}1181}, abstract = {

We propose the use of higher-order statistics (HOS)-based methods to address the problem of image restoration. The restoration strategy is based on the fact that the phase information of the original image and its HOS are not distorted by some types of blurring. The difficulties associated with combining 2-D signals and their HOS are reduced by means of the Radon transform. Two methods that apply the weight-slice algorithm over the projections are developed. Simulation results illustrate the performance of the proposed methods.
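
The property being exploited can be made explicit with the bispectrum (a standard HOS identity, restated here in our notation rather than quoted from the paper). For a blurred 1-D projection $y = x * h$,

\[
B_y(\omega_1,\omega_2) = B_x(\omega_1,\omega_2)\, H(\omega_1)\, H(\omega_2)\, H^{*}(\omega_1+\omega_2),
\]

so for blurs with a real, non-negative frequency response (e.g., Gaussian blur) the transfer factor is real and positive, and the bispectrum phase of $y$ equals that of $x$. The Radon transform is what reduces the 2-D image to the 1-D projections on which this identity is applied.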

}, issn = {1057-7149}, author = {Elisa Sayrol and Gasull, A. and Nikias, C.} } @conference {cMarques95a, title = {Interpolation and extrapolation of image partitions using Fourier descriptors: application to segmentation-based coding schemes}, booktitle = {IEEE INTERNATIONAL CONFERENCE ON IMAGE PROCESSING{\textquoteright}95.}, year = {1995}, pages = {584{\textendash}587}, author = {Marqu{\'e}s, F. and Llorens, B. and Gasull, A.} } @article {pSalembier95, title = {Method of Coding an Image Sequence}, number = {954018131-}, year = {1995}, type = {Invention patent}, author = {Salembier, P.} } @article {pSalembier95a, title = {Method of Coding an Image Sequence and Corresponding Decoding Method}, number = {954020202-}, year = {1995}, type = {Invention patent}, author = {Salembier, P.} } @conference {cCasas95, title = {Morphological interpolation for texture coding}, booktitle = {IEEE International Conference on Image Processing, ICIP 1995}, year = {1995}, pages = {903{\textendash}907}, address = {Washington DC, USA}, author = {Casas, J. and Salembier, P. and Torres, L.} } @conference {cTorres95, title = {New approaches to texture coding in segmentation and feature-based image coding schemes}, booktitle = {International Conference on Digital Signal Processing}, year = {1995}, pages = {12{\textendash}17}, isbn = {84-600-7766-7}, author = {Torres, L. and Casas, J.} } @article {aSalembier95, title = {Region-based video coding using mathematical morphology}, journal = {Proceedings of the IEEE}, volume = {83}, number = {6}, year = {1995}, pages = {843{\textendash}857}, issn = {0018-9219}, author = {Salembier, P. and Torres, L. and Meyer, F. and Gu, C.} } @phdthesis {dPardas95, title = {Segmentaci{\'o}n Morfol{\'o}gica de Secuencias de Im{\'a}genes: Aplicaci{\'o}n a la Codificaci{\'o}n}, year = {1995}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {M. Pard{\`a}s}, editor = {Salembier, P.} } @conference {cSalembier95 , title = {Texture coding using morphological interpolation}, booktitle = {IEEE workshop on Nonlinear Signal and Image Processing, NSIP 1995}, year = {1995}, address = {Halkidiki, Greece}, author = {Salembier, P. and Ru{\'e}, R.} } @article {aPardas94, title = {3d morphological segmentation and motion estimation for image sequences}, journal = {Signal processing}, volume = {38}, number = {2}, year = {1994}, pages = {31{\textendash}43}, issn = {0165-1684}, author = {M. Pard{\`a}s and Salembier, P.} } @inbook {bMarques94a, title = {AUTOMATIC QUANTIFICATION OF SPINE PARAMETERS FROM X-RAY IMAGES BY MEANS OF MORPHOLOGICAL TOOLS.}, booktitle = {Mathematical morphology and its applications to image processing}, year = {1994}, pages = {330{\textendash}340}, isbn = {0-7923-3093-5}, author = {Marqu{\'e}s, F.} } @article {aCasas94, title = {Coding of details in very low bit-rate video systems}, journal = {IEEE transactions on circuits and systems for video technology}, volume = {4}, number = {3}, year = {1994}, month = {06/1994}, pages = {317{\textendash}327}, abstract = {

In this paper, the importance of including small image features at the initial levels of a progressive second generation video coding scheme is demonstrated. It is shown that a number of meaningful small features called details should be coded, even at very low bit-rates, in order to match their perceptual significance to the human visual system. We propose a method for extracting, perceptually selecting, and coding visual details in a video sequence using morphological techniques. Its application in the framework of a multiresolution segmentation-based coding algorithm yields better results than pure segmentation techniques at higher compression ratios, provided the selection step satisfies the main subjective requirements. Details are extracted and coded separately from the region structure and included in the reconstructed images at a later stage. Considering the local background of a given detail for its perceptual selection breaks the concept of {\textquotedblleft}partition{\textquotedblright} in the segmentation scheme. Since details are treated not as adjacent regions but as isolated features spread over the image, {\textquotedblleft}detail coding{\textquotedblright} can be seen as one step towards so-called feature-based video coding techniques.
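
A standard morphological way to isolate such details (our illustrative stand-in, not necessarily the paper's exact operator) is the top-hat transform, which retains structures smaller than the structuring element:

\begin{verbatim}
# Detail extraction via top-hat transforms: structures that cannot contain
# the structuring element survive. The image is a random stand-in.
import numpy as np
from skimage.morphology import white_tophat, black_tophat, disk

image = np.random.randint(0, 256, (128, 128)).astype(np.uint8)
se = disk(3)  # "details" are structures smaller than this element

bright_details = white_tophat(image, se)
dark_details = black_tophat(image, se)
# A perceptual selection stage would then keep only the details whose
# contrast against the local background exceeds a visibility threshold.
\end{verbatim}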

}, keywords = {CHIL}, issn = {1051-8215}, doi = {10.1109/76.305876}, url = {http://hdl.handle.net/2117/97643}, author = {Casas, J. and Torres, L.} } @conference {cCasas94a, title = {Coding of significant features in very low bit-rate video systems}, booktitle = {SPIE{\textquoteright}S Visual Communications{\textquoteright}94}, year = {1994}, pages = {73{\textendash}85}, author = {Casas, J. and Torres, L.} } @conference {cCasas94c, title = {Feature-based video coding using Mathematical Morphology}, booktitle = {VII European Signal Processing Conference (EUSIPCO)}, year = {1994}, publisher = {EURASIP}, organization = {EURASIP}, address = {Edinburgh}, abstract = {

This paper puts forward a new approach to second generation image coding. The concept of "image feature" is introduced in order to deal with those "objects" that cannot be properly described as regions within a segmentation framework. Visual features such as open contours or texture details are extracted from the original images using morphological operators. As mathematical morphology deals with the shapes and structures of images, the resulting features are very close to what would be obtained from a visual perception point of view. If these features are efficiently coded with suitable techniques, such a coding model can reach higher compression ratios than purely segmentation-based techniques. Numerical results are given at the end of the paper to demonstrate the effectiveness of the feature-based coding scheme in a very low bit-rate video coding application.

}, author = {Casas, J. and Torres, L.} } @conference {cMarques94a, title = {Hierarchical image sequence model for segmentation: application to region-based sequence coding}, booktitle = {VISUAL COMMUNICATION AND IMAGE PROCESSING}, year = {1994}, pages = {554{\textendash}563}, author = {Marqu{\'e}s, F. and Gasull, A. and Vera, V.} } @article {aSalembier94, title = {Hierarchical morphological segmentation for image sequence coding}, journal = {IEEE transactions on image processing}, volume = {3}, number = {5}, year = {1994}, pages = {639{\textendash}651}, issn = {1057-7149}, author = {Salembier, P. and M. Pard{\`a}s} } @phdthesis {dSayrol94, title = {Higher-order statistics applications in image sequence processing}, year = {1994}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, url = {http://hdl.handle.net/10803/6950}, author = {Elisa Sayrol}, editor = {Gasull, A.} } @conference {cMarques94c, title = {Interleaved segmentation and motion estimation by means of morphological tools}, booktitle = {Workshop on Image Analysis and Synthesis in Image Coding}, year = {1994}, address = {Berlin, Germany}, author = {Marqu{\'e}s, F. and Bouchard, L. and Corset, I. and Jeannin, S. and Morros, J.R. and M. Pard{\`a}s and Salembier, P. and Torres, L.} } @conference {cPardas94b, title = {Joint region and motion estimation with morphological tools}, booktitle = {International Symposium on Mathematical Morphology, ISMM 1994}, year = {1994}, address = {Fontainebleau, France}, author = {M. Pard{\`a}s and Salembier, P.} } @conference {cCasas94b, title = {Morphological filter for lossless image subsampling}, booktitle = {IEEE International Conference on Image Processing{\textquoteright}94}, year = {1994}, pages = {903{\textendash}907}, isbn = {0-8186-6950-0}, author = {Casas, J. and Torres, L.} } @article {aSalembier94a, title = {Morphological multiscale segmentation for image coding}, journal = {Signal processing}, volume = {38}, number = {3}, year = {1994}, pages = {359{\textendash}386}, issn = {0165-1684}, author = {Salembier, P.} } @inbook {bCasas94, title = {Morphological scheme for morphometric analysis of epidermal biopsy images}, booktitle = {Mathematical Morphology and its Applications to Image Processing}, volume = {2}, number = {Computational Imaging and Vision}, year = {1994}, pages = {325{\textendash}331}, publisher = {Springer}, organization = {Springer}, address = {Dordrecht}, abstract = {

This paper addresses the problem of morphometric analysis of microscope images from cutaneous biopsy samples. A morphological scheme is applied for the automatic measurement of histologic parameters of the epidermis. It consists of an unsupervised segmentation approach that is strongly based on an {\textquoteright}a priori{\textquoteright} model of the images. The watershed algorithm has proven to be a very powerful tool for the introduction of such {\textquoteright}a priori{\textquoteright} information, because the segmentation process can be conveniently guided by strategic markers towards the detection of the desired structures. This permits the automatic measurement of objective parameters that are highly correlated with the evolution of certain skin diseases.
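
The mechanism by which the {\textquoteright}a priori{\textquoteright} information enters is the marker-controlled watershed: region growing is restricted to labelled seeds, so only the expected structures are delineated. A minimal sketch with scikit-image follows (the image and seed positions are stand-ins; the paper derives its markers from a model of the epidermis):

\begin{verbatim}
# Marker-controlled watershed: regions grow from strategic seeds over the
# gradient surface, so the result detects only the structures the markers
# encode. Image and seed positions are stand-ins.
import numpy as np
from skimage.filters import sobel
from skimage.segmentation import watershed

image = np.random.rand(128, 128)   # stand-in grayscale biopsy image
gradient = sobel(image)            # relief to be flooded

markers = np.zeros(image.shape, dtype=np.int32)
markers[10, 10] = 1                # seed for one expected structure
markers[64, 64] = 2                # seed for another

labels = watershed(gradient, markers)  # one region per marker label
\end{verbatim}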

}, doi = {10.1007/978-94-011-1040-2_42}, author = {Casas, J. and Esteban, P. and Moreno, A. and Carrera, M.}, editor = {Serra, J. and Soille, P.} } @conference {cCasas94d, title = {Morphological scheme for morphometric analysis of epidermal biopsy images}, booktitle = {International Symposium on Mathematical Morphology and its Applications to Image Processing II}, year = {1994}, address = {Fontainebleau}, isbn = {0-7923-3093-5}, author = {Casas, J. and Esteban, P. and Moreno, A. and Carrera, M.} } @conference {cPardas94a, title = {Motion region overlapping for segmentation-based video coding}, booktitle = {International Conference on Image Processing, ICIP{\textquoteright}94}, year = {1994}, pages = {428{\textendash}431}, address = {Austin, Texas}, isbn = {0-7803-5470-2}, author = {M. Pard{\`a}s and Salembier, P. and Gonzalez, B.} } @conference {cGimeno94, title = {A new approach to texture coding using stochastic vector quantization}, booktitle = {IEEE International Conference on Image Processing{\textquoteright}94}, year = {1994}, pages = {119{\textendash}123}, abstract = {

A new method for texture coding which combines 2-D linear prediction and stochastic vector quantization is presented in this paper. To encode a texture, a linear predictor is computed first. Next, a codebook following the prediction-error model is generated, and the prediction error is encoded with VQ, using an algorithm which takes into account the pixels surrounding the block being encoded. In the decoder, the error image is decoded first and then filtered as a whole, using the prediction filter. Hence, correlation between pixels is not lost from one block to another, and a good reproduction quality can be achieved.
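
The two stages of the method can be sketched as follows (our simplification: the codebook here is trained with k-means on the actual error blocks, whereas the paper generates it stochastically from the error model and conditions the encoding on surrounding pixels):

\begin{verbatim}
# Stage 1: fit a causal 2-D linear predictor (west, north, north-west
# neighbours) by least squares. Stage 2: vector-quantize 4x4 blocks of the
# prediction error with a small codebook. Sizes are illustrative.
import numpy as np
from scipy.cluster.vq import kmeans2

texture = np.random.rand(64, 64)                 # stand-in texture patch
target = texture[1:, 1:].ravel()
A = np.column_stack([texture[1:, :-1].ravel(),   # west neighbour
                     texture[:-1, 1:].ravel(),   # north neighbour
                     texture[:-1, :-1].ravel()]) # north-west neighbour
coef, *_ = np.linalg.lstsq(A, target, rcond=None)

error = np.zeros_like(texture)
error[1:, 1:] = (target - A @ coef).reshape(63, 63)

blocks = (error[:60, :60].reshape(15, 4, 15, 4)  # 4x4 blocks as vectors
          .swapaxes(1, 2).reshape(-1, 16))
codebook, index = kmeans2(blocks, 8, minit="++") # 8-word codebook + indices
\end{verbatim}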

}, isbn = {0-8186-6950-0}, doi = {10.1109/ICIP.1994.413287}, author = {Gimeno, D. and Torres, L. and Casas, J.} } @conference {cSerra94 , title = {Op{\'e}rateurs connexes et pyramides}, booktitle = {In Proc. of RFIA}, year = {1994}, address = {Paris, France}, author = {Serra, J. and Salembier, P.} } @conference {cMarques94b, title = {Recursive image sequence segmentation by hierarchical models}, booktitle = {12th IAPR International Conference on Pattern Recognition}, year = {1994}, pages = {523{\textendash}525}, isbn = {0-8186-6265-4}, author = {Marqu{\'e}s, F. and Vera, V. and Gasull, A.} } @conference {cCasas94, title = {Residual image coding using mathematical morphology}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing}, year = {1994}, pages = {597{\textendash}600}, isbn = {0-7803-1775-0}, author = {Casas, J. and Torres, L.} } @conference {cTorres94, title = {Segmentation based coding of textures using stochastic vector quantization}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing}, year = {1994}, pages = {553{\textendash}556}, abstract = {

In second generation image compression techniques, the image to be compressed is first segmented: the pixels are divided into mutually exclusive spatial regions based on some criteria. After segmentation, the image consists of regions separated by contours, and information describing the shapes and interiors of the regions is coded. The interiors of the regions are usually encoded using polynomials. The objective of this paper is to encode the interiors of the regions with stochastic vector quantization techniques. If the segmentation process has been well defined and the obtained regions are homogeneous, it is possible to design a specific codebook suited to the statistics of each region. The approach is to design the codebook according to a previously defined model for the regions of the image found in the segmentation process. When this approach is combined with efficient contour coding techniques, good visual results at high compression rates are obtained.
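
The distinctive point is that each region's codebook is generated from a statistical model of that region rather than trained on the data itself, so the decoder can regenerate it from a few transmitted parameters. A minimal sketch under a purely illustrative Gaussian region model:

\begin{verbatim}
# Stochastic VQ sketch: fit a model to the region, draw the codebook from
# the model, transmit only model parameters plus codeword indices.
import numpy as np

rng = np.random.default_rng(0)
region_blocks = rng.normal(5.0, 2.0, size=(200, 16))  # stand-in 4x4 blocks

mu, sigma = region_blocks.mean(), region_blocks.std() # region model
codebook = rng.normal(mu, sigma, size=(16, 16))       # 16 stochastic words

# Encoding: index of the nearest codeword for each block.
dist = ((region_blocks[:, None, :] - codebook[None, :, :]) ** 2).sum(-1)
indices = dist.argmin(axis=1)
\end{verbatim}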

}, isbn = {84-600-7766-7}, doi = {10.1109/ICASSP.1994.389380}, author = {Torres, L. and Casas, J. and Diego, D.} } @conference {cSalembier94 , title = {Self-referred texture coding for segmentation-based codec}, booktitle = {SPIE Visual Communication and Image Processing, VCIP{\textquoteright}94}, year = {1994}, address = {Chicago (IL), USA}, author = {Salembier, P. and Lopez, F} } @conference {cPardas94, title = {Time-recursive segmentation of image sequences}, booktitle = {European Signal Processing Conference, EUSIPCO-94}, year = {1994}, pages = {18{\textendash}21}, address = {Edinburgh, UK}, author = {M. Pard{\`a}s and Salembier, P.} } @conference {cMarques94, title = {Top-down 3d image sequence segmentation technique controlled by morphological}, booktitle = {EUSIPCO-94.}, year = {1994}, pages = {415{\textendash}418}, author = {Marqu{\'e}s, F. and Vera, V. and Gasull, A.} } @inbook {bMarques94, title = {UNSUPERVISED HIERARCHICAL IMAGE SEGMENTATION USING COMPOUND RANDOM FIELDS}, booktitle = {A Robust Method for Computing 2D Projective invariants of 3D Object Vertices}, year = {1994}, pages = {125{\textendash}136}, isbn = {981-02-1872-9}, author = {Marqu{\'e}s, F.} } @conference { cSalembier94, title = {Very low bit rate video coding using morphological segmentation and contour/texture motion compensation}, booktitle = {12th International Conference on Pattern Recognition, ICPR 1994}, year = {1994}, address = {Jerusalem, Israel}, author = {Salembier, P. and Gu, C. and M. Pard{\`a}s and Kunt, M} } @conference {cPardas93a, title = {3D morphological segmentation and motion estimation for image sequences}, booktitle = {International Symposium on Mathematical Morphology and its applications to image and signal processing, ISMM 1993}, year = {1993}, pages = {58{\textendash}63}, address = {Barcelona, Spain}, isbn = {0-8186-8821-1}, author = {M. Pard{\`a}s and Salembier, P.} } @conference {cPardas93, title = {3D morphological segmentation or image sequence processing}, booktitle = {IEEE Winter Workshop on Nonlinear Signal Processing}, year = {1993}, pages = {31{\textendash}36}, address = {Tampere, Finland}, isbn = {960-7620-06-4}, author = {M. Pard{\`a}s and Salembier, P. and Torres, L.} } @conference {cSalembier93a, title = {Application of mathematical morphology to picture coding}, booktitle = {General Assembly of URSI, 1993}, year = {1993}, address = {Kyoto, Japan}, author = {Salembier, P.} } @conference {cGasull93, title = {Character recognition and document analysis by morphological techniq}, booktitle = {MATHEMATICAL MORPHOLOGY}, year = {1993}, pages = {198{\textendash}203}, author = {Gasull, A. and Corbera, L. and Marqu{\'e}s, F.} } @conference {cMontolio93, title = {Character recognition and document analysis by morphological techniques}, booktitle = {WORKSHOP ON MATHEMATICAL MORPHOLOGY AND ITS APPLICATIONS TO SIGNAL PROCESSING}, year = {1993}, pages = {198{\textendash}203}, author = {Montolio, P. and Gasull, A. and Corbera, L. and Marqu{\'e}s, F.} } @conference {cCasas93, title = {Codificacion run-length de imagenes de detalle}, booktitle = {VIII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1993}, pages = {349{\textendash}399}, author = {Casas, J.} } @conference {cSerra93, title = {Connected operators and pyramids}, booktitle = {SPIE Image Algebra and Mathematical Morphology}, year = {1993}, pages = {65{\textendash}76}, address = {San Diego (CA), USA}, isbn = {1522-4880}, author = {Serra, J. 
and Salembier, P.} } @conference {cMarques93d, title = {Cuantificacion de lesiones de columna vertebral a partir de imagenes}, booktitle = {VIII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1993}, pages = {433{\textendash}437}, author = {Marqu{\'e}s, F.} } @conference {cMarques93e, title = {Cuantificador vectorial con clasificador difuso para la codificacion}, booktitle = {VIII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1993}, pages = {369{\textendash}373}, author = {Marqu{\'e}s, F.} } @conference {cSalembier93c, title = {Edge versus contrast estimation of morphological filters}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 1993}, year = {1993}, address = {Mineapolis (MI), USA}, author = {Salembier, P. and Serra, J. and Bangham, J.} } @conference {cCasas93a, title = {Efficient coding of residual images}, booktitle = {SPIE Visual Communications {\textquoteright}93}, year = {1993}, month = {10/1993}, pages = {694{\textendash}705}, publisher = {SPIE}, organization = {SPIE}, address = {Cambridge, MA}, abstract = {

In progressive image coding, the cost in bits of the successive residual images being encoded does not always correspond to the subjective importance of those components. The original idea of this paper arose from the need to increase the efficiency of coding the last Laplacian levels of linear decompositions for pyramidal coding. The same principle has been applied to a non-linear image decomposition in which a segmentation-based progressive scheme is used for coding purposes. The {\textquoteleft}post-it{\textquoteright} method for extracting details based on mathematical morphology, proposed by Meyer, has been modified in order to improve the efficiency of the subsequent coding, and a suitable technique for coding the extracted details, derived from an extension of run-length coding, is then applied. In both the linear and non-linear cases, the results of this {\textquoteleft}detail coding{\textquoteright} method are compared against the conventional progressive coding technique, i.e., pyramidal coding or pure segmentation-based contour/texture coding.
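
Since detail images are mostly flat with a few isolated non-zero features, run-length coding is the natural fit; the paper's technique is an extension of plain RLE, which for reference looks like this (generic sketch, not the modified variant):

\begin{verbatim}
# Plain run-length encoding of a 1-D scan of a detail image: long zero
# runs collapse to single (value, length) pairs.
import numpy as np

def rle_encode(seq):
    runs, count = [], 1
    for prev, cur in zip(seq, seq[1:]):
        if cur == prev:
            count += 1
        else:
            runs.append((prev, count))
            count = 1
    runs.append((seq[-1], count))
    return runs

detail = np.zeros(64, dtype=int)
detail[10:13] = 7                    # one isolated detail
assert rle_encode(detail.tolist()) == [(0, 10), (7, 3), (0, 51)]
\end{verbatim}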

}, doi = {10.1117/12.157987}, author = {Casas, J. and Torres, L. and Jare{\~n}o, M.} } @conference {cSayrol93a, title = {Image analysis using higher-order statistics and the Radon transform}, booktitle = {IEEE SIGNAL PROCESSING WORKSHOP ON HIGH-ORDER-STATISTICS}, year = {1993}, pages = {76{\textendash}80}, author = {Elisa Sayrol and Nikias, C. and Gasull, A.} } @conference {cSayrol93, title = {Image analysis using higher-order statistics and the Radon transform}, booktitle = {WORKSHOP ON HIGHER-ORDER STATISTICS}, year = {1993}, author = {Elisa Sayrol and Gasull, A.} } @book {eSalembier93, title = {Mathematical morphology and its applications to signal processing}, year = {1993}, publisher = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, organization = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, address = {Barcelona}, isbn = {8476532717}, author = {Serra, J. and Salembier, P.} } @conference {cSalembier93b, title = {Morphological approach to segmentation-based image coding}, booktitle = {Picture Coding Symposium, PCS 1993}, year = {1993}, pages = {121{\textendash}122}, address = {Lausanne, Switzerland}, isbn = {0-7803-1238-4}, author = {Salembier, P.} } @conference {cMarques93c, title = {Morphological scheme for myelinated nerve fiber morphometric analysis}, booktitle = {International Symposium on Mathematical Morphology and its applications to image and signal processing, ISMM 1993}, year = {1993}, pages = {122{\textendash}126}, address = {Barcelona, Spain}, author = {Marqu{\'e}s, F. and Salembier, P. and Navarro, X. and Sa{\~n}udo, J.} } @conference {cSalembier93f, title = {Morphological segmentation-based coding of image sequences}, booktitle = {IEEE European Conference on Circuits Theory and Design}, year = {1993}, pages = {1245{\textendash}1250}, address = {Davos, Switzerland}, isbn = {0-7803-5470-2}, author = {Salembier, P. and Torres, L. and M. Pard{\`a}s and Marqu{\'e}s, F. and HIERRO, P. and Gasull, A.} } @conference {cSalembier93d, title = {Multi-criterion segmentation for image coding}, booktitle = {International Symposium on Mathematical Morphology and its applications to image and signal processing, ISMM 1993}, year = {1993}, pages = {40{\textendash}45}, address = {Barcelona, Spain}, author = {Salembier, P.} } @conference {cSalembier93e, title = {Object-based image coding with morphological segmentation and efficient contour coding}, booktitle = {IEEE International Conference on Signal Processing, ICSP{\textquoteright}93}, year = {1993}, address = {Beijing, China}, author = {Salembier, P. and Marqu{\'e}s, F.} } @conference {cMarques93, title = {Shape and location coding for contour images}, booktitle = {PROC. OF THE 1993 PICTURE CODING SYMPOSIUM}, year = {1993}, pages = {61{\textendash}63}, author = {Marqu{\'e}s, F. and Gasull, A.} } @book {eTorres93, title = {Sistemas Anal{\'o}gicos y Digitales de Televisi{\'o}n}, year = {1993}, publisher = {Edicions UPC}, organization = {Edicions UPC}, address = {Barcelona}, isbn = {84-7653-288-1}, author = {Torres, L. and Lleida, E. and Casas, J.} } @conference {cMarques93b, title = {Stochastic image model for segmentation: application to image coding}, booktitle = {SCANDINAVIAN CONFERENCE ON IMAGE ANALYSIS}, year = {1993}, pages = {265{\textendash}272}, author = {Marqu{\'e}s, F.
and Gasull, A.} } @conference {cSalembier93, title = {Unsupervised morphological segmentation for images}, booktitle = {IEEE Winter Workshop on Nonlinear Signal Processing}, year = {1993}, pages = {21{\textendash}26}, address = {Tampere, Finland}, author = {Salembier, P.} } @conference {cMarques93a, title = {Unsupervised segmentation controlled by morphological contrast ext}, booktitle = {ICASSP}, year = {1993}, pages = {517{\textendash}520}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cSalembier92, title = {Adaptation of grey level structuring elements for morphological filters with application to shape detection}, booktitle = {VI European Signal Processing Conference, EUSIPCO{\textquoteright}92}, year = {1992}, pages = {1137{\textendash}1140}, address = {Brussels, Belgium}, isbn = {0-7803-8485-7}, author = {Salembier, P.} } @article {aSalembier92b, title = {Adaptive rank order based filters}, journal = {Signal processing}, volume = {27}, number = {1}, year = {1992}, pages = {1{\textendash}25}, issn = {0165-1684}, author = {Salembier, P.} } @inbook {bGasull92, title = {Analysis and optimization of the K-Means algorithm for remote sensing applications}, booktitle = {Pattern recognition and image analysis}, year = {1992}, pages = {0{\textendash}0}, isbn = {9810208812}, url = {http://cataleg.upc.edu/search~S1*cat/?searchtype=i\&searcharg=+9810208812\&searchscope=1\&SORT=D\&extended=0\&SUBMIT=Cerca\&searchlimits=\&searchorigarg=t+Proceedings+of++Workshop+on+Network+Robot+Systems+2009}, author = {Gasull, A. and Monte, E. and Torres, L. and Montolio, P. and Marqu{\'e}s, F.} } @conference {cMarques92, title = {Classified vector quantization with fuzzy theory}, booktitle = {INTERNATIONAL CONFERENCE ON FUZZY SYSTEMS}, year = {1992}, pages = {237{\textendash}244}, isbn = {0-7803-3192-3}, author = {Marqu{\'e}s, F.} } @conference {cGasull92c, title = {Coagulation time detection by means of a real-time image processing}, booktitle = {14th Annual International Conference of the IEEE Engineering in Medicine and Biology Society}, year = {1992}, pages = {1948{\textendash}1949}, author = {Gasull, A. and Vallverdu, F. and Marqu{\'e}s, F.} } @conference {cMarques92b, title = {Codificacion de imagenes: un metodo de segunda generacion}, booktitle = {VII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1992}, pages = {144{\textendash}148}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cMarques92d, title = {Codificacion de imagenes:un metodo de segunda generacion}, booktitle = {VII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1992}, pages = {144{\textendash}148}, author = {Marqu{\'e}s, F. and Gasull, A.} } @article {aTorres92, title = {Compresi{\'o}n de audio e imagen para sistemas multimedia}, journal = {Bit numerical mathematics}, number = {76}, year = {1992}, pages = {50{\textendash}58}, issn = {0006-3835}, author = {Torres, L. and Moreno, A. and Masgrau, E. and Gasull, A.} } @conference {cPardas92, title = {Connectivity filters for image sequences}, booktitle = {IMAGE ALGEBRA AND MORPHOLOGICAL IMAGE PROCESSING. SPIE}, year = {1992}, pages = {318{\textendash}329}, isbn = {0 7923 5133 9}, author = {M. Pard{\`a}s and Torres, L.} } @conference {cBonafonte92a, title = {Efficient integration of coarticulation and lexical information ...}, booktitle = {INTERNATIONAL CONFERENCE ON SPOKEN LANGUAGE PROCESSING}, year = {1992}, pages = {45{\textendash}48}, author = {Bonafonte, A. and Mari{\~n}o, J. and M. 
Pard{\`a}s} } @conference {cCasas92a, title = {Fuzzy classification of Remote Sensing images: a pseudocolor representation of fuzzy partitions}, booktitle = {SPIE Neural and Stochastic Methods in Image and Signal Processing}, year = {1992}, month = {07/1992}, publisher = {SPIE}, organization = {SPIE}, address = {San Diego, CA}, doi = {10.1117/12.130844}, author = {Casas, J. and Hillion, A. and Roux, C. and Torres, L. and Gasull, A.} } @conference {cSalembier92b, title = {Morphological detection based on size and contrast criteria}, booktitle = {14th Annual International Conference of the IEEE Engineering in Medicine and Biology Society}, year = {1992}, pages = {1930{\textendash}1931}, address = {Paris, France}, isbn = {?}, author = {Salembier, P. and Gasull, A. and Marqu{\'e}s, F. and Elisa Sayrol} } @conference {cSalembier92a, title = {Morphological multiscale image segmentation}, booktitle = {SPIE Visual Communication and Image Processing, VCIP{\textquoteright}92}, year = {1992}, pages = {620{\textendash}631}, address = {Boston, USA}, author = {Salembier, P. and Serra, J.} } @phdthesis {dMarques92, title = {Multiresolution image segmentation based on compound random fields: Application to image coding}, year = {1992}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, url = {http://hdl.handle.net/10803/6910}, author = {Marqu{\'e}s, F.}, editor = {Gasull, A.} } @conference {cBonafonte92, title = {N-best hypotheses: an approach taking into account coarticulation}, booktitle = {EUROPEAN SIGNAL PROCESSING CONFERENCE}, year = {1992}, pages = {379{\textendash}382}, author = {Bonafonte, A. and M. Pard{\`a}s and Mari{\~n}o, J.} } @conference {cGasull92, title = {Non-linear techniques for image interpolation}, booktitle = {VI European Signal Processing Conference}, year = {1992}, pages = {1473{\textendash}1476}, isbn = {0 444 89587 6}, author = {Gasull, A. and Marqu{\'e}s, F. and Torres, L.} } @conference {cMarques92c, title = {Segmentacion de imagenes multiespectrales con tecnicas piramidales}, booktitle = {VII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1992}, pages = {372{\textendash}376}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cMarques92a, title = {Segmentacion no supervisada de imagenes mediante campos aleatorios}, booktitle = {SIMPOSIUM NACIONAL DE RECONOZIMIENTO DE FORMAS Y ANALISIS IMAGENES}, year = {1992}, pages = {55{\textendash}62}, isbn = {3-8007-2300-X}, author = {Marqu{\'e}s, F. and Gasull, A.} } @article {aSalembier92a, title = {Size-sensitive multiresolution decomposition of images with rank..}, journal = {Signal processing}, volume = {27}, number = {2}, year = {1992}, pages = {205{\textendash}241}, issn = {0165-1684}, author = {Salembier, P.} } @article {aSalembier92, title = {Structuring element adaptation for morphological filters}, journal = {Journal of visual communication and image representation}, volume = {3}, number = {2}, year = {1992}, pages = {115{\textendash}136}, issn = {1047-3203}, author = {Salembier, P.} } @conference {cGasull92b, title = {T{\'e}cnicas de preprocesado para la segmentaci{\'o}n de im{\'a}genes}, booktitle = {VII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1992}, pages = {367{\textendash}371}, author = {Gasull, A. and Marqu{\'e}s, F. and Torres, L.} } @conference {cGasull92a, title = {T{\'e}cnicas de preprocesado para la segmentaci{\'o}n de im{\'a}genes}, booktitle = {U.R.S.I.
92}, year = {1992}, pages = {367{\textendash}371}, isbn = {84-600-8219-9}, author = {Gasull, A. and Marqu{\'e}s, F. and Montolio, P. and Torres, L.} } @conference {cCasas92, title = {Una interpretaci{\'o}n colorim{\'e}trica en clasificaciones fuzzy de im{\'a}genes de teledetecci{\'o}n}, booktitle = {V Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1992}, pages = {281{\textendash}287}, author = {Casas, J. and Torres, L. and Gasull, A.} } @conference {cSalembier91, title = {Adaptive morphological multiresolution decomposition}, booktitle = {SPIE Image Algebra and Mathematical Morphology}, year = {1991}, pages = {620{\textendash}631}, address = {San Diego, USA}, author = {Salembier, P. and Jaquenoud, L.} } @conference {cGasull91, title = {Analisis de no estacionariedades en la interpolacion de imagenes}, booktitle = {VI Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1991}, pages = {985{\textendash}989}, isbn = {84-600-7766-7}, author = {Gasull, A. and Marqu{\'e}s, F. and Torres, L.} } @conference {cMarques91, title = {Coding-oriented segmentation based on G-M random}, booktitle = {International Conference on Acoustics, Speech and Signal Processing 1991}, year = {1991}, pages = {2749{\textendash}2752}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cCasas91, title = {Dise{\~n}o de filtros de imagen con funciones de transferencia}, booktitle = {VI Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1991}, pages = {965{\textendash}969}, isbn = {84-600-7766-7}, author = {Casas, J. and Torres, L.} } @conference {cMarques91b, title = {Obtencion de un esqueleto morfologico sin puntos redundantes}, booktitle = {VI Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1991}, pages = {980{\textendash}984}, author = {Marqu{\'e}s, F. and Gasull, A. and Torres, L.} } @conference {cMarques91a, title = {Segmentacion de imagenes mediante modelos de gibbs-markov}, booktitle = {VI Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1991}, pages = {975{\textendash}979}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cOliveras91, title = {Tou - Brazo Robot Asistencial: Control Verbal}, booktitle = {2{\textordmasculine} Congreso de la Asociaci{\'o}n Espa{\~n}ola de Rob{\'o}tica}, year = {1991}, pages = {93{\textendash}100}, isbn = {0-7803-7043-0}, author = {Albert Oliveras and Fuertes, J. and Vill{\`a}, R.} } @conference {cLleida91, title = {Two level c.s.r. using demisyllable..}, booktitle = {EUROPEAN CONF. ON SPEECH COMMUN. AND TECHN. EUROSPEECH{\textquoteright}91}, year = {1991}, pages = {1199{\textendash}1202}, isbn = {10184074}, author = {Lleida, E. and Albert Oliveras and Nadeu, C. and Mari{\~n}o, J.} } @conference {cGasull90a, title = {Analisis y optimizacion del algoritmo k-means aplicado a teledeteccion en ima}, booktitle = {IV Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1990}, pages = {17{\textendash}23}, author = {Gasull, A. and Torres, L.} } @article {aSanz90, title = {Aspectos Cl{\'\i}nicos-toxicol{\'o}gicos en los trabajadores de una industria productora de cromatos}, journal = {Annual review of pharmacology and toxicology}, volume = {7}, year = {1990}, pages = {1:13{\textendash}1:20}, issn = {0362-1642}, author = {Sanz, P. and Ribas, B. and Cobo, E. and Gadea, E. and Marqu{\'e}s, F. and Sol{\'e}, E.
and Corbella, J.} } @conference {cGasull90, title = {Contour extraction and image preprocessing of echocardiographic images using r}, booktitle = {Latvian Signal Processing International Conference}, year = {1990}, pages = {26{\textendash}30}, author = {Gasull, A. and Torres, L.} } @conference {cGasull90b, title = {Eleccion de componentes principales para la clasific. no supervisada de image}, booktitle = {IV Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1990}, pages = {9{\textendash}16}, author = {Gasull, A. and Torres, L.} } @inbook {bMarques90, title = {NEW GENERATION METHODS FOR THE HIGH-COMPRESSION CODING OF DIGITAL SEQUENCES}, booktitle = {From pixels to features ii}, year = {1990}, pages = {363{\textendash}375}, isbn = {0444890033}, author = {Marqu{\'e}s, F.} } @conference {cNadeu90, title = {Simbad: a tool for speech analysis and synthesis}, booktitle = {IASTED INT.CONF.SIGNAL PROC.\&DIG.FILT.}, year = {1990}, isbn = {1522-4880}, author = {Nadeu, C. and Albert Oliveras and Mari{\~n}o, J.} } @conference {cTorres90, title = {Temporal automatic edge detection of echocardiographic images}, booktitle = {ICASSP}, year = {1990}, pages = {2149{\textendash}2152}, isbn = {078030033}, author = {Torres, L. and Gasull, A.} } @article {pSalembier89, title = {Appareil de D{\'e}codage de Signaux Modul{\'e}s en Fr{\'e}quence}, number = {8900812-}, year = {1989}, type = {Invention patent}, author = {Salembier, P. and Lamnabhi, M.} } @conference {cGasull89a, title = {Automated left ventricular contour extraction and volume calculation from echocardiographic images}, booktitle = {Ultrasonics International 89}, year = {1989}, pages = {1{\textendash}4}, isbn = {SN.}, author = {Gasull, A. and Alquezar, R. and Torres, L. and Sallent, S. and Marqu{\'e}s, F. and Vidal, J. and Sangr{\'a}, E.} } @conference {cGasull89, title = {Automatic left ventricular contour for volume calculation}, booktitle = {Ultrasonics International 89}, year = {1989}, pages = {123{\textendash}126}, isbn = {SN.}, author = {Gasull, A. and Vazquez, G.} } @conference {cVazquez89, title = {Constant variance transversal filtering for adaptive channel equalization}, booktitle = {INTERNATIONAL CONFERENCE ON SONAR SIGNAL PROCESSING}, year = {1989}, pages = {212{\textendash}215}, isbn = {0-7923-9733-9}, author = {Vazquez, G. and Gasull, A. and Sanchez, J. and Lagunas, M.} } @article {pSalembier88a, title = {Appareil Muni d{\textquoteright}un Dispositif de R{\'e}stitution de la Composante Continue Am{\'e}lior{\'e}}, number = {8814685-}, year = {1988}, type = {Invention patent}, author = {Salembier, P. and Hayet, P.} } @conference {cSallent88, title = {Codificaci{\'o}n piramidal generalizada}, booktitle = {III Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1988}, pages = {1{\textendash}2}, isbn = {142440469X/1520-6149}, author = {Sallent, S. and Torres, L. and Gasull, A.} } @conference {cGasull88, title = {Detecci{\'o}n autom{\'a}tica de contornos en im{\'a}genes ecogr{\'a}ficas}, booktitle = {III Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1988}, pages = {1{\textendash}4}, isbn = {?}, author = {Gasull, A. and Marqu{\'e}s, F. and Sallent, S. and Torres, L. and Vidal, J.} } @article {pSalembier88, title = {Dispositif de R{\'e}ception de Signaux Num{\'e}riques Cod{\'e}s et Modul{\'e}s en Fr{\'e}quences}, number = {8808919-}, year = {1988}, type = {Invention patent}, author = {Salembier, P.
and Lamnabhi, M.} } @article {pSalembier87, title = {Dispositif d{\textquoteright}Am{\'e}lioration du D{\'e}codage de Signaux Num{\'e}riques lors de Transmission en Modulation de Fr{\'e}quence}, number = {8710580-}, year = {1987}, type = {Invention patent}, author = {Salembier, P. and Lamnabhi, M.} }