@article {aGene-Molad, title = {AmodalAppleSize_RGB-D dataset: RGB-D images of apple trees annotated with modal and amodal segmentation masks for fruit detection, visibility and size estimation}, journal = {Data in Brief}, volume = {52}, year = {2024}, month = {02/2024}, abstract = {

The present dataset comprises a collection of RGB-D apple tree images that can be used to train and test computer vision-based fruit detection and sizing methods. The dataset encompasses two distinct sub-sets obtained from a Fuji and an Elstar apple orchard. The Fuji apple orchard sub-set consists of 3925 RGB-D images containing a total of 15335 apples annotated with both modal and amodal apple segmentation masks. Modal masks denote the visible portions of the apples, whereas amodal masks encompass both visible and occluded apple regions. Notably, this dataset is the first public resource to incorporate on-tree fruit amodal masks. This pioneering inclusion addresses a critical gap in existing datasets, enabling the development of robust automatic fruit sizing methods and accurate fruit visibility estimation, particularly in the presence of partial occlusions. Besides the fruit segmentation masks, the dataset also includes the fruit size (calliper) ground truth for each annotated apple. The second sub-set comprises 2731 RGB-D images capturing five Elstar apple trees at four distinct growth stages. This sub-set includes mean diameter information for each tree at every growth stage and serves as a valuable resource for evaluating fruit sizing methods trained with the first sub-set. The present data were employed in the research papers titled {\textquotedblleft}Looking behind occlusions: a study on amodal segmentation for robust on-tree apple fruit size estimation{\textquotedblright} [1] and {\textquotedblleft}Simultaneous fruit detection and size estimation using multitask deep neural networks{\textquotedblright} [2].

}, keywords = {Agricultural robotics, amodal segmentation, depth image, Fruit measurement, Fruit visibility, Instance Segmentation, modal segmentation, Yield prediction}, doi = {https://doi.org/10.1016/j.dib.2023.110000}, author = {Gen{\'e}-Mola, Jordi and Ferrer-Ferrer, M. and Hemming, J. and Dalfsen, P. and Hoog, D. and Sanz-Cortiella, R. and Rosell-Polo, Joan R. and Morros, J.R. and Ver{\'o}nica Vilaplana and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @article {ade-Mas-Gimenez23, title = {Gradient-Based Metrics for the Evaluation of Image Defogging}, journal = {World Electric Vehicle Journal}, volume = {14}, year = {2023}, month = {09/2023}, chapter = {254}, abstract = {

Fog, haze, or smoke are standard atmospheric phenomena that dramatically compromise the overall visibility of any scene, critically affecting features such as the illumination, contrast, and contour detection of objects. The decrease in visibility compromises the performance of computer vision algorithms such as pattern recognition and segmentation, some of which are very relevant to decision-making in the field of autonomous vehicles. Several dehazing methods have been proposed that either need to estimate fog parameters through physical models or are statistically based. However, physical parameters greatly depend on the scene conditions, and statistically based methods require large datasets of natural foggy images together with the original images without fog, i.e., the ground truth, for evaluation. Obtaining proper fog-less ground truth images for pixel-to-pixel evaluation is costly and time-consuming, and this fact hinders progress in the field. This paper aims to tackle this issue by proposing gradient-based metrics for image defogging evaluation that do not require a ground truth image without fog or a physical model. A comparison of the proposed metrics with metrics already used in the NTIRE 2018 defogging challenge as well as several state-of-the-art defogging evaluation metrics is performed to prove their effectiveness in a general setting, showing results comparable to conventional metrics and an improvement in the no-reference scenario. A Matlab implementation of the proposed metrics has been developed and open-sourced in a public GitHub repository.
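
As a rough illustration of a no-reference, gradient-based visibility measure of the kind discussed above, the sketch below compares the mean Sobel gradient magnitude before and after defogging. The function names and the simple ratio are illustrative assumptions, not the metrics defined in the paper.

```python
# Illustrative sketch only: a no-reference, gradient-based contrast measure.
# The actual metrics proposed in the paper are defined differently; function
# names and the ratio below are assumptions for demonstration purposes.
import numpy as np
from scipy import ndimage

def mean_gradient_magnitude(gray):
    """Mean Sobel gradient magnitude of a grayscale image in [0, 1]."""
    gx = ndimage.sobel(gray, axis=1)
    gy = ndimage.sobel(gray, axis=0)
    return float(np.mean(np.hypot(gx, gy)))

def defogging_gain(foggy_gray, defogged_gray):
    """Ratio > 1 indicates that defogging restored image gradients (contrast)."""
    eps = 1e-8
    return mean_gradient_magnitude(defogged_gray) / (mean_gradient_magnitude(foggy_gray) + eps)
```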

}, doi = {10.3390/wevj14090254}, url = {https://www.mdpi.com/2032-6653/14/9/254}, author = {Gerard de-Mas-Gim{\'e}nez and Pablo Garc{\'\i}a-G{\'o}mez and Casas, J. and S. Royo} } @phdthesis {dMosella-Montoro23, title = {Graph Convolutional Neural Networks for 3D Data Analysis}, volume = {Phd}, year = {2023}, school = {Universitat Polit{\`e}cnica de Catalunya}, address = {Barcelona}, abstract = {

Deep Learning allows the extraction of complex features directly from raw input data, eliminating the need for hand-crafted features from the classical Machine Learning pipeline. This new paradigm brought a performance boost across several domains, including computer vision, natural language processing and audio processing. However, there are still challenges when dealing with unorganized structures. This thesis addresses this challenge using Graph Convolutional Neural Networks, a new set of techniques capable of managing graph structures that can be used for processing 3D data.

The first part of the thesis focuses on the Graph Analysis task, in which we study the capabilities of Graph Convolutional Neural Networks to capture the intrinsic geometric information of 3D data. We propose the Attention Graph Convolution layer, which learns to infer the kernel used during the convolution, taking into account the particularities of each neighbourhood of the graph. We explore two variants of the Attention Graph Convolution layer, one that follows a residual approach and another that allows the convolution to combine different neighbourhood domains. Furthermore, we propose a set of 3D pooling layers that mimic the behaviour of the pooling layers found in common 2D Convolutional Neural Network architectures. Finally, we present a 2D-3D Fusion block capable of merging the 3D geometric information obtained from a Graph Convolutional Neural Network with the texture information obtained by a 2D Convolutional Neural Network. We evaluate the presented contributions on the RGB-D Scene Classification task.

The second part of this thesis focuses on the Node Analysis task, which consists of extracting features on a node level, taking into account the neighbourhood structure. We present the Multi-Aggregator Graph Convolution layer that uses a multiple aggregator approach to better generalize for unseen topologies and learn better local representations. In addition, it reduces the memory footprint with respect to the Attention Graph Convolution layer. Finally, we analyze the capabilities of our proposed Graph Convolution layers to deal with heterogeneous graphs where the nodes of the graph may belong to different modalities. We evaluate the presented contributions with the Computer Graphics process of skinning a character mesh. Specifically, we propose a Two-Stream Graph Neural Network capable of predicting the skinning weights of a 3D character.

}, url = {http://hdl.handle.net/10803/689400}, author = {Mosella-Montoro, Albert}, editor = {Ruiz-Hidalgo, J.} } @article {aGene-Mola23, title = {Looking behind occlusions: A study on amodal segmentation for robust on-tree apple fruit size estimation}, journal = {Computers and Electronics in Agriculture}, volume = {209}, year = {2023}, month = {04/2023}, abstract = {

The detection and sizing of fruits with computer vision methods is of interest because it provides relevant information to improve the management of orchard farming. However, the presence of partially occluded fruits limits the performance of existing methods, making reliable fruit sizing a challenging task. While previous fruit segmentation works limit segmentation to the visible region of fruits (known as modal segmentation), in this work we propose an amodal segmentation algorithm to predict the complete shape of the fruit, including its visible and occluded regions. To do so, an end-to-end convolutional neural network (CNN) for simultaneous modal and amodal instance segmentation was implemented. The predicted amodal masks were used to estimate the fruit diameters in pixels. Modal masks were used to identify the visible region and measure the distance between the apples and the camera using the depth image. Finally, the fruit diameters in millimetres (mm) were computed by applying the pinhole camera model. The method was developed with a Fuji apple dataset consisting of 3925 RGB-D images acquired at different growth stages with a total of 15,335 annotated apples, and was subsequently tested in a case study to measure the diameter of Elstar apples at different growth stages. Fruit detection results showed an F1-score of 0.86, and the fruit diameter results reported a mean absolute error (MAE) of 4.5 mm and R2 = 0.80 irrespective of fruit visibility. Besides the diameter estimation, modal and amodal masks were used to automatically determine the percentage of visibility of measured apples. This feature was used as a confidence value, improving the diameter estimation to MAE = 2.93 mm and R2 = 0.91 when limiting the size estimation to fruits detected with a visibility higher than 60\%. The main advantages of the present methodology are its robustness for measuring partially occluded fruits and the capability to determine the visibility percentage. The main limitation is that depth images were generated by means of photogrammetry methods, which limits the efficiency of data acquisition. To overcome this limitation, future works should consider the use of commercial RGB-D sensors. The code and the dataset used to evaluate the method have been made publicly available at https://github.com/GRAP-UdL-AT/Amodal_Fruit_Sizing.
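
The conversion from a pixel diameter to millimetres via the pinhole camera model can be sketched as follows. This is a minimal illustration assuming a known focal length in pixels and a per-fruit depth, not the exact implementation released in the repository above.

```python
# Minimal sketch of fruit sizing with the pinhole camera model (not the
# released implementation): diameter_mm = diameter_px * depth_mm / focal_px.
def fruit_diameter_mm(diameter_px, depth_mm, focal_px):
    """Convert a diameter measured in pixels to millimetres.

    diameter_px: apple diameter estimated from the amodal mask (pixels)
    depth_mm:    camera-to-fruit distance from the depth image (millimetres)
    focal_px:    camera focal length expressed in pixels
    """
    return diameter_px * depth_mm / focal_px

# Example: a 120 px diameter at 900 mm depth with a 1400 px focal length
# gives roughly 77 mm.
print(fruit_diameter_mm(120, 900, 1400))
```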

}, keywords = {deep learning, Fruit detection, Fruit measurement, Fruit visibility, Precision agriculture, Yield estimation}, issn = {ISSN 0168-1699}, doi = {https://doi.org/10.1016/j.compag.2023.107854}, url = {https://authors.elsevier.com/sd/article/S0168-1699(23)00242-9}, author = {Gen{\'e}-Mola, Jordi and Ferrer-Ferrer, M. and Gregorio, Eduard and Blok, P. M. and Hemming, J. and Morros, J.R. and Rosell-Polo, Joan R. and Ver{\'o}nica Vilaplana and Ruiz-Hidalgo, J.} } @article {aPlasencia23, title = {A Preliminary Study of Deep Learning Sensor Fusion for Pedestrian Detection}, journal = {Sensors}, volume = {23}, year = {2023}, month = {04/2023}, chapter = {4167}, abstract = {

Most pedestrian detection methods focus on bounding boxes based on fusing RGB with lidar. These methods do not relate to how the human eye perceives objects in the real world. Furthermore, lidar and vision can have difficulty detecting pedestrians in scattered environments, and radar can be used to overcome this problem. Therefore, the motivation of this work is to explore, as a preliminary step, the feasibility of fusing lidar, radar, and RGB for pedestrian detection, with potential application to autonomous driving, using a fully connected convolutional neural network architecture for multimodal sensors. The core of the network is based on SegNet, a pixel-wise semantic segmentation network. In this context, lidar and radar were incorporated by transforming them from 3D pointclouds into 2D gray images with 16-bit depths, and RGB images were incorporated with three channels. The proposed architecture uses a single SegNet for each sensor reading, and the outputs are then applied to a fully connected neural network to fuse the three sensor modalities. Afterwards, an up-sampling network is applied to recover the fused data. Additionally, a custom dataset of 60 images was proposed for training the architecture, with an additional 10 for evaluation and 10 for testing, giving a total of 80 images. The experiment results show a training mean pixel accuracy of 99.7\% and a training mean intersection over union of 99.5\%. Also, the testing mean IoU was 94.4\%, and the testing pixel accuracy was 96.2\%. These metric results have successfully demonstrated the effectiveness of using semantic segmentation for pedestrian detection with the three sensor modalities. Despite some overfitting in the model during experimentation, it performed well in detecting people in test mode. It is therefore worth emphasizing that the focus of this work is to show that this method is feasible, as it works regardless of the size of the dataset; nevertheless, a bigger dataset would be necessary to achieve more appropriate training. This method gives the advantage of detecting pedestrians as the human eye does, thereby resulting in less ambiguity. Additionally, this work has also proposed an extrinsic calibration matrix method for sensor alignment between radar and lidar based on singular value decomposition.
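
A rough sketch of how a 3D point cloud could be rasterized into a 16-bit, single-channel range image suitable as a SegNet-style input is given below. The projection parameters and depth scaling are illustrative assumptions, not the calibration used in the paper.

```python
# Illustrative sketch: rasterize lidar/radar points into a 16-bit range image.
# Camera intrinsics and the depth scale are placeholder assumptions.
import numpy as np

def points_to_depth_image(points_xyz, fx, fy, cx, cy, height, width, scale=1000.0):
    """Project 3D points (x right, y down, z forward, metres) onto a uint16 image."""
    img = np.zeros((height, width), dtype=np.uint16)
    pts = points_xyz[points_xyz[:, 2] > 0]          # keep points in front of the sensor
    u = np.round(fx * pts[:, 0] / pts[:, 2] + cx).astype(int)
    v = np.round(fy * pts[:, 1] / pts[:, 2] + cy).astype(int)
    ok = (u >= 0) & (u < width) & (v >= 0) & (v < height)
    depth = np.clip(pts[ok, 2] * scale, 0, 65535).astype(np.uint16)  # metres -> mm
    img[v[ok], u[ok]] = depth
    return img
```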

}, keywords = {autonomous driving, Convolutional Neural Networks, sensor calibration, sensor fusion}, doi = {10.3390/s23084167}, url = {https://www.mdpi.com/1424-8220/23/8/4167}, author = {Alfredo Ch{\'a}vez Plasencia and Pablo Garc{\'\i}a-G{\'o}mez and Eduardo Bernal P{\'e}rez and Gerard de-Mas-Gim{\'e}nez and Casas, J. and S. Royo} } @article {aFerrer-Ferrer, title = {Simultaneous Fruit Detection and Size Estimation Using Multitask Deep Neural Networks }, journal = {Biosystems Engineering}, volume = {233}, year = {2023}, month = {09/2023}, pages = {63-75}, abstract = {

The measurement of fruit size is of great interest to estimate the yield and predict the harvest resources in advance. This work proposes a novel technique for in-field apple detection and measurement based on Deep Neural Networks. The proposed framework was trained with RGB-D data and consists of an end-to-end multitask Deep Neural Network architecture specifically designed to perform the following tasks: 1) detection and segmentation of each fruit from its surroundings; 2) estimation of the diameter of each detected fruit. The methodology was tested with a total of 15335 annotated apples at different growth stages, with diameters varying from 27 mm to 95 mm. Fruit detection results reported an F1-score for apple detection of 0.88 and a mean absolute error of diameter estimation of 5.64 mm. These are state-of-the-art results with the additional advantages of: a) using an end-to-end multitask trainable network; b) an efficient and fast inference speed; and c) being based on RGB-D data which can be acquired with affordable depth cameras. In contrast, the main disadvantage is the need to annotate a large amount of data with fruit masks and diameter ground truth to train the model. Finally, a fruit visibility analysis showed an improvement in the prediction when limiting the measurement to apples above 65\% of visibility (mean absolute error of 5.09 mm). This suggests that future works should develop a method for automatically identifying the most visible apples and discard the prediction of highly occluded fruits.

}, keywords = {deep learning, Fruit measurement, Fruit visibility, Precision agriculture, Yield estimation}, doi = {https://doi.org/10.1016/j.biosystemseng.2023.07.010}, author = {Ferrer-Ferrer, M. and Ruiz-Hidalgo, J. and Gregorio, Eduard and Ver{\'o}nica Vilaplana and Morros, J.R. and Gen{\'e}-Mola, Jordi} } @conference {cCaselles, title = {SIRA: Relightable Avatars from a Single Image}, booktitle = {Winter Conference on Applications of Computer Vision (WACV)}, year = {2023}, abstract = {

Recovering the geometry of a human head from a single image, while factorizing the materials and illumination is a severely ill-posed problem that requires prior information to be solved. Methods based on 3D Morphable Models (3DMM), and their combination with differentiable renderers, have shown promising results. However, the expressiveness of 3DMMs is limited, and they typically yield over-smoothed and identity-agnostic 3D shapes limited to the face region. Highly accurate full head reconstructions have recently been obtained with neural fields that parameterize the geometry using multilayer perceptrons. The versatility of these representations has also proved effective for disentangling geometry, materials and lighting. However, these methods require several tens of input images. In this paper, we introduce SIRA, a method which, from a single image, reconstructs human head avatars with high fidelity geometry and factorized lights and surface materials. Our key ingredients are two data-driven statistical models based on neural fields that resolve the ambiguities of single-view 3D surface reconstruction and appearance factorization. Experiments show that SIRA obtains state of the art results in 3D head reconstruction while at the same time it successfully disentangles the global illumination, and the diffuse and specular albedos. Furthermore, our reconstructions are amenable to physically-based appearance editing and head model relighting.

}, author = {Caselles, Pol and Ramon, Eduard and Garcia, Jaime and Xavier Gir{\'o}-i-Nieto and Moreno, Francesc and Triginer, Gil} } @conference {cHurtado23, title = {Study of Manifold Geometry using Multiscale Non-Negative Kernel Graphs}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, year = {2023}, month = {06/2023}, address = {Rhodes Island, Greece}, isbn = {978-1-7281-6327-7}, doi = {https://doi.org/10.1109/ICASSP49357.2023.10095956}, author = {Hurtado, C. and Shekkizhar, S. and Ruiz-Hidalgo, J. and Ortega, A.} } @conference {cBonet22, title = {Channel Redundancy and Overlap in Convolutional Neural Networks with Channel-Wise NNK Graphs}, booktitle = {International Conference on Acoustics, Speech and Signal Processing}, year = {2022}, month = {05/2022}, abstract = {

Feature spaces in the deep layers of convolutional neural networks (CNNs) are often very high-dimensional and difficult to interpret. However, convolutional layers consist of multiple channels that are activated by different types of inputs, which suggests that more insights may be gained by studying the channels and how they relate to each other. In this paper, we first theoretically analyze channel-wise non-negative kernel (CW-NNK) regression graphs, which allow us to quantify the overlap between channels and, indirectly, the intrinsic dimension of the data representation manifold. We find that redundancy between channels is significant and varies with the layer depth and the level of regularization during training. Additionally, we observe that there is a correlation between channel overlap in the last convolutional layer and generalization performance. Our experimental results demonstrate that these techniques can lead to a better understanding of deep representations.

}, author = {Bonet, D. and Ortega, A. and Ruiz-Hidalgo, J. and Shekkizhar, S.} } @article {aTemprana-Salvador22, title = {DigiPatICS: Digital Pathology Transformation of the Catalan Health Institute Network of 8 Hospitals - Planification, Implementation and Preliminary Results}, journal = {Diagnostics}, volume = {12}, year = {2022}, month = {03/2022}, chapter = {852}, abstract = {

Complete digital pathology transformation for primary histopathological diagnosis is a challenging yet rewarding endeavor. Its advantages are clear with more efficient workflows, but there are many technical and functional difficulties to be faced. The Catalan Health Institute (ICS) has started its DigiPatICS project, aiming to deploy digital pathology in an integrative, holistic, and comprehensive way within a network of 8 hospitals, over 168 pathologists, and over 1 million slides each year. We describe the bidding process and the careful planning that was required, followed by swift implementation in stages. The purpose of the DigiPatICS project is to increase patient safety and quality of care, improving diagnosis and the efficiency of processes in the pathological anatomy departments of the ICS through process improvement, digital pathology, and artificial intelligence tools.

}, keywords = {artificial intelligence, computational pathology, deep learning, digital pathology, implementation, LIS, primary diagnosis, telepathology, workflow}, doi = {10.3390/diagnostics12040852}, url = {https://www.mdpi.com/2075-4418/12/4/852}, author = {Jordi Temprana-Salvador and Pau L{\'o}pez-Garc{\'\i}a and Josep Castellv{\'\i} Vives and Llu{\'\i}s de Haro and Eudald Ballesta and Matias Rojas Abusleme and Miquel Arrufat and Ferran Marques and Casas, J. and Carlos Gallego and Laura Pons and Jos{\'e} Luis Mate and Pedro Luis Fern{\'a}ndez and Eugeni L{\'o}pez-Bonet and Ramon Bosch and Salom{\'e} Mart{\'\i}nez and Santiago Ram{\'o}n y Cajal and Xavier Matias-Guiu} } @phdthesis {dRamon22, title = {Few-shot 3D Reconstruction of Body Parts with Deep Neural Networks}, volume = {Excellent}, year = {2022}, month = {09/2022}, type = {Industrial}, abstract = {

In this thesis, we study the problem of reconstructing objects from a concrete category in 3D when few images are available as input, i.e. fewer than 10. We apply our findings to digitizing human body parts such as heads and torsos for medical applications. The first part of the thesis explores systems that rely on 3D Morphable Models. When approaching a concrete task, training such systems requires expensive manual hyper-parameter tuning of both the architecture and the loss, which is time-consuming. We focus on designing novel losses without hyperparameters and modular architectures that allow training models without tuning effort. We also aim at providing a fine alignment between the 3D space and the image space by estimating camera poses with a low re-projection error, which further improves the texturing process in 3D modelling applications or the rendering process in augmented reality applications. Our findings lead to systems that are very stable and that naturally scale to different scenes.

While 3D Morphable Models are fast and robust, they are still very limited in terms of accuracy and expressiveness, which might be prohibitive for applications that require high fidelity. A promising alternative to 3D Morphable Models are implicit functions, which in combination with differentiable rendering techniques have shown impressive results at reconstructing 3D surfaces. However, the latter require large sets of images at test time to obtain satisfactory results. In the second part of the thesis, we propose to use a probabilistic model that represents a distribution of implicit surfaces in combination with a differentiable renderer to reduce the number of images required at test time. The resulting 3D reconstruction system is highly accurate and allows reconstructing a wide variety of human head shapes when only three images are available.

}, author = {Ramon, Eduard}, editor = {Moreno, Francesc and Garcia, Jaime and Xavier Gir{\'o}-i-Nieto} } @article {pTerradas22, title = {A method, system and computer programs to automatically transform an image}, journal = {European Patent Office}, volume = {21382176}, year = {2022}, abstract = {

The present invention is directed, in general, to a method and a system to automatically transform an image using neural networks. More specifically, the invention relates to a controllable image generation through an image representation and several conditions using a conditional Neural Network.

The method comprises receiving, by a processing unit, at least one image and processing the received image to obtain an image representation thereof (i.e. an intermediate representation of the initial image that captures high-level features and low-level properties of the image and that is structured in a way understandable to a conditional Neural Network, such as a deep generative Neural Network). The method also includes receiving, by an encoding unit, one or more references (e.g. other images, text, labels, combinations thereof, or even other data describing how the received image should be transformed) and encoding the received references into one or more features, which are then provided to a conditional Neural Network as conditions. In addition, the method further applies the conditional Neural Network to transform the obtained image representation into a resulting conditioned image based on said conditions.

}, issn = {21382176.2}, url = {https://patentscope.wipo.int/search/es/detail.jsf?docId=EP373278976}, author = {Terradas, R. and Pau Domingo and Grau, M. and Alarc{\'o}n, E. and Ruiz-Hidalgo, J.} } @conference {cGarcia-Gomez22, title = {Multimodal Imaging System Based on Solid-State LiDAR for Advanced Perception Applications}, booktitle = {10th International Symposium on Optronics in Defence \& Security}, year = {2022}, month = {06/2022}, publisher = {3AF OPTRO2022}, organization = {3AF OPTRO2022}, address = {Versailles, France}, url = {https://www.3af-optro.com/}, author = {Pablo Garc{\'\i}a-G{\'o}mez and S. Royo and Noel Rodrigo and Casas, J. and Jordi Riu} } @conference {cPinazo22, title = {Perception in the era of Autonomous Vehicles}, booktitle = {Photonics 4 Smart Cities, SCEWC 2022}, year = {2022}, month = {11/2022}, publisher = {Photonics21}, organization = {Photonics21}, address = {Barcelona}, url = {https://www.fotonica21.org/photonics-4-smart-cities}, author = {Jorge Pinazo and Adolfo Ler{\'\i}n and Francesc Xavier de Gibert and {\'A}lvaro Moliner and Daniel Sevilla and Antonio Jurado and Iv{\'a}n R{\'\i}os and Rodrigo Jerez and Jaime Santiago and {\'A}lvaro Linuesa and Antonio Cano and Federico Dios and Adolfo Comer{\'o}n and Casas, J. and Jos{\'e} Antonio L{\'a}zaro} } @conference {cBartusiak, title = {Predicting Dog Phenotypes from Genotypes}, booktitle = {44th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC{\textquoteright}22)}, year = {2022}, abstract = {

In this paper, we analyze dog genotypes (positions of DNA sequences that often vary between different dogs) in order to predict the corresponding phenotypes (unique characteristics that result from different genetic code). More specifically, given chromosome data from a dog, we aim to predict its breed category, height, and weight. We explore a variety of linear and non-linear classification and regression techniques to accomplish these three tasks. We show that linear methods generally outperform or match non-linear methods for breed classification; however, the reverse is true for height and weight regression. We also evaluate the performance of all of these methods based on the number of input features used in the analysis. We conduct experiments using different fractions of the full genomic sequences and demonstrate that phenotypes can be predicted with as few as 0.5\% of the input features available for our analysis, and dog breeds can be classified with 50\% balanced accuracy with as few as 0.02\% of the features.
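
A minimal scikit-learn sketch of the kind of linear baseline described above (logistic regression on a subset of genotype features for breed classification) follows; the synthetic data stands in for real SNPs and the setup is not the paper's exact experimental protocol.

```python
# Minimal sketch of a linear baseline for breed classification from genotype
# features, using synthetic data in place of real SNPs. Not the paper's exact
# experimental setup.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

rng = np.random.default_rng(0)
X = rng.integers(0, 3, size=(600, 200)).astype(float)  # 600 dogs, 200 SNPs coded 0/1/2
y = rng.integers(0, 5, size=600)                        # 5 synthetic breed labels

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print(balanced_accuracy_score(y_te, clf.predict(X_te)))
```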

}, doi = {https://doi.org/10.1101/2022.04.13.488108 }, author = {Bartusiak, Emily and Barrab{\'e}s, M{\'\i}riam and Aigerim Rymbe and J{\'u}lia Gimbernat and Cayetana L{\'o}pez and Lorenzo Barberis and Mas-Montserrat, Daniel and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @conference {cMosella-Montoro22, title = {SkinningNet: Two-Stream Graph Convolutional Neural Network for Skinning Prediction of Synthetic Characters}, booktitle = {IEEE / CVF Computer Vision and Pattern Recognition Conference (CVPR)}, year = {2022}, month = {06/2022}, address = {New Orleans, USA}, abstract = {

This work presents SkinningNet, an end-to-end Two-Stream Graph Neural Network architecture that computes skinning weights from an input mesh and its associated skeleton, without making any assumptions on shape class and structure of the provided mesh. Whereas previous methods pre-compute handcrafted features that relate the mesh and the skeleton or assume a fixed topology of the skeleton, the proposed method extracts this information in an end-to-end learnable fashion by jointly learning the best relationship between mesh vertices and skeleton joints. The proposed method exploits the benefits of the novel Multi-Aggregator Graph Convolution that combines the results of different aggregators during the summarizing step of the Message-Passing scheme, helping the operation to generalize for unseen topologies. Experimental results demonstrate the effectiveness of the contributions of our novel architecture, with SkinningNet outperforming current state-of-the-art alternatives.

https://imatge-upc.github.io/skinningnet/

}, url = {https://imatge-upc.github.io/skinningnet/}, author = {Mosella-Montoro, Albert and Ruiz-Hidalgo, J.} } @article {aMosella-Montoro21, title = {2D{\textendash}3D Geometric Fusion network using Multi-Neighbourhood Graph Convolution for RGB-D indoor scene classification}, journal = {Information Fusion}, volume = {76}, year = {2021}, month = {12/2021}, chapter = {46-54}, abstract = {

Multi-modal fusion has been proven to enhance the performance of scene classification tasks. This paper presents a 2D-3D Fusion stage that combines 3D Geometric Features with 2D Texture Features obtained by 2D Convolutional Neural Networks. To get a robust 3D Geometric embedding, a network that uses two novel layers is proposed. The first layer, Multi-Neighbourhood Graph Convolution, aims to learn a more robust geometric descriptor of the scene by combining two different neighbourhoods: one in the Euclidean space and the other in the Feature space. The second proposed layer, Nearest Voxel Pooling, improves the performance of the well-known Voxel Pooling. Experimental results, using the NYU-Depth-V2 and SUN RGB-D datasets, show that the proposed method outperforms the current state of the art in the RGB-D indoor scene classification task.
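
A toy sketch of the idea of combining a Euclidean-space neighbourhood with a feature-space neighbourhood is shown below. It only builds the two k-nearest-neighbour index sets; it is not the published Multi-Neighbourhood Graph Convolution layer.

```python
# Toy sketch: build the two neighbourhoods (Euclidean space and feature space)
# that a multi-neighbourhood graph layer could combine. This is only an
# illustration of the neighbourhood construction, not the published layer.
import numpy as np

def knn_indices(vectors, k):
    """Indices of the k nearest neighbours of each row (excluding itself)."""
    d = np.linalg.norm(vectors[:, None, :] - vectors[None, :, :], axis=-1)
    np.fill_diagonal(d, np.inf)
    return np.argsort(d, axis=1)[:, :k]

xyz = np.random.rand(100, 3)              # point coordinates
feats = np.random.rand(100, 32)           # per-point feature vectors
euclidean_nbrs = knn_indices(xyz, k=8)    # geometric neighbourhood
feature_nbrs = knn_indices(feats, k=8)    # feature-space neighbourhood
```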

https://imatge-upc.github.io/munegc/

}, doi = {10.1016/j.inffus.2021.05.002}, url = {https://imatge-upc.github.io/munegc/}, author = {Mosella-Montoro, Albert and Ruiz-Hidalgo, J.} } @conference {cBonet21, title = {Channel-Wise Early Stopping without a Validation Set via NNK Polytope Interpolation}, booktitle = {Asia Pacific Signal and Information Processing Association Annual Summit, APSIPA}, year = {2021}, month = {12/2021}, address = {Tokyo, Japan}, abstract = {

State-of-the-art neural network architectures continue to scale in size and deliver impressive generalization results, although this comes at the expense of limited interpretability. In particular, a key challenge is to determine when to stop training the model, as this has a significant impact on generalization. Convolutional neural networks (ConvNets) comprise high-dimensional feature spaces formed by the aggregation of multiple channels, where analyzing intermediate data representations and the model{\textquoteright}s evolution can be challenging owing to the curse of dimensionality. We present channel-wise DeepNNK (CW-DeepNNK), a novel channel-wise generalization estimate based on non-negative kernel regression (NNK) graphs with which we perform local polytope interpolation on low-dimensional channels. This method leads to instance-based interpretability of both the learned data representations and the relationship between channels. Motivated by our observations, we use CW-DeepNNK to propose a novel early stopping criterion that (i) does not require a validation set, (ii) is based on a task performance metric, and (iii) allows stopping to be reached at different points for each channel. Our experiments demonstrate that our proposed method has advantages as compared to the standard criterion based on validation set performance.

}, url = {https://arxiv.org/abs/2107.12972}, author = {Bonet, D. and Ortega, A. and Ruiz-Hidalgo, J. and Shekkizhar, S.} } @conference {cRamonb, title = {H3D-Net: Few-Shot High-Fidelity 3D Head Reconstruction}, booktitle = {International Conference on Computer Vision (ICCV)}, year = {2021}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Virtual}, abstract = {

Recent learning approaches that implicitly represent surface geometry using coordinate-based neural representations have shown impressive results in the problem of multi-view 3D reconstruction. The effectiveness of these techniques is, however, subject to the availability of a large number (several tens) of input views of the scene, and computationally demanding optimizations. In this paper, we tackle these limitations for the specific problem of few-shot full 3D head reconstruction, by endowing coordinate-based representations with a probabilistic shape prior that enables faster convergence and better generalization when using few input images (down to three). First, we learn a shape model of 3D heads from thousands of incomplete raw scans using implicit representations. At test time, we jointly overfit two coordinate-based neural networks to the scene, one modeling the geometry and another estimating the surface radiance, using implicit differentiable rendering. We devise a two-stage optimization strategy in which the learned prior is used to initialize and constrain the geometry during an initial optimization phase. Then, the prior is unfrozen and fine-tuned to the scene. By doing this, we achieve high-fidelity head reconstructions, including hair and shoulders, with a high level of detail that consistently outperforms both state-of-the-art 3D Morphable Model methods in the few-shot scenario and non-parametric methods when large sets of views are available.

}, author = {Ramon, Eduard and Triginer, Gil and Escur, Janna and Pumarola, Albert and Garcia, Jaime and Xavier Gir{\'o}-i-Nieto and Moreno, Francesc} } @unpublished {xBonet21, title = {Improved Neural Network Generalization using Channel-Wise NNK Graph Constructions}, year = {2021}, publisher = {Final Year Project, UPC}, abstract = {

State-of-the-art neural network architectures continue to scale in size and deliver impressive results on unseen data points at the expense of poor interpretability. In the deep layers of these models we often encounter very high dimensional feature spaces, where constructing graphs from intermediate data representations can lead to the well-known curse of dimensionality. We propose a channel-wise graph construction method that works on lower dimensional subspaces and provides a new channel-based perspective that leads to better interpretability of the data and relationship between channels. In addition, we introduce a novel generalization estimate based on the proposed graph construction method with which we perform local polytope interpolation. We show its potential to replace the standard generalization estimate based on validation set performance to perform progressive channel-wise early stopping without requiring a validation set.

}, author = {Bonet, David}, editor = {Ortega, Antonio and Ruiz-Hidalgo, J. and Shekkizhar, Sarath} } @conference {cGarcia-Gomez21, title = {Multimodal solid-state LiDAR for advanced perception applications}, booktitle = {OPTOEL}, year = {2021}, month = {06/2021}, abstract = {

Perception of the environment is an essential requirement for the fields of autonomous vehicles and robotics. Consequently, LiDAR imaging sensors have become crucial sensors for such applications due to their 3D geometry sensing capability. However, autonomous systems require large amounts of data to make reliable decisions, so many different sensors are often combined. In this context, we present a multimodal imaging system based on a solid-state LiDAR combined with three other imaging sensors that provides multimodal information with low parallax fusion error.

}, keywords = {artificial intelligence, autonomous navigation, computer Vision, enhanced perception, robotics, sensor fusion, solid-state LiDAR}, author = {Pablo Garc{\'\i}a-G{\'o}mez and Noel Rodrigo and Jordi Riu and Casas, J. and S. Royo} } @conference {cGirbau21, title = {Multiple Object Tracking with Mixture Density Networks for Trajectory Estimation}, booktitle = {CVPR 2021 Robust Video Scene Understanding: Tracking and Video Segmentation (RVSU) Workshop}, year = {2021}, abstract = {

Multiple object tracking faces several challenges that may be alleviated with trajectory information. Knowing the posterior locations of an object helps disambiguating and solving situations such as occlusions, re-identification, and identity switching. In this work, we show that trajectory estimation can become a key factor for tracking, and present TrajE, a trajectory estimator based on recurrent mixture density networks, as a generic module that can be added to existing object trackers. To provide several trajectory hypotheses, our method uses beam search. Also, relying on the same estimated trajectory, we propose to reconstruct a track after an occlusion occurs. We integrate TrajE into two state of the art tracking algorithms, CenterTrack [63] and Tracktor [3]. Their respective performances in the MOTChallenge 2017 test set are boosted 6.3 and 0.3 points in MOTA score, and 1.8 and 3.1 in IDF1, setting a new state of the art for the CenterTrack+TrajE configuration.
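
A minimal sketch of drawing several next-position hypotheses from a 2D Gaussian mixture, the kind of distribution a recurrent mixture density network outputs, is given below. The mixture parameters are made up for illustration, and TrajE's beam search and occlusion reconstruction are not reproduced.

```python
# Minimal sketch: sample several next-position hypotheses from a 2D Gaussian
# mixture, as a mixture density network would parameterize. The mixture
# parameters below are illustrative; beam search is omitted.
import numpy as np

rng = np.random.default_rng(0)
weights = np.array([0.6, 0.3, 0.1])                      # mixture weights (sum to 1)
means = np.array([[2.0, 0.5], [1.5, 1.0], [0.0, 0.0]])   # component means (dx, dy)
sigmas = np.array([0.2, 0.4, 0.8])                       # isotropic std devs

def sample_offsets(n):
    comp = rng.choice(len(weights), size=n, p=weights)
    return means[comp] + rng.normal(scale=sigmas[comp, None], size=(n, 2))

hypotheses = sample_offsets(5)   # five candidate displacements for the next frame
print(hypotheses)
```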

}, url = {https://arxiv.org/abs/2106.10950}, author = {Girbau, A. and Xavier Gir{\'o}-i-Nieto and Rius, Ignasi and Marqu{\'e}s, F.} } @conference {cMayoral21b, title = {Prediction of amyloid pathology in cognitively unimpaired individuals using structural MRI}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2021}, month = {07/2021}, author = {Irene Cumplido-Mayoral and Silvia Ingala and Luigi Lorenzini and Alle Meije Wink and Sven Haller and Jose Luis Molinuevo and Robin Wolz and Alessandro Palombit and Adam J Schwarz and Ga{\"e}l Chetelat and Pierre Payoux and Pablo Martinez-Lage and Giovanni Frisoni and Nick C Fox and Craig W Ritchie and Joanna M Wardlaw and Adam Waldman and Frederik Barkhof and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @conference {cManas, title = {Seasonal Contrast: Unsupervised Pre-Training from Uncurated Remote Sensing Data}, booktitle = {International Conference in Computer Vision (ICCV)}, year = {2021}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Virtual}, abstract = {

Remote sensing and automatic earth monitoring are key to solve global-scale challenges such as disaster prevention, land use monitoring, or tackling climate change. Although there exist vast amounts of remote sensing data, most of it remains unlabeled and thus inaccessible for supervised learning algorithms. Transfer learning approaches can reduce the data requirements of deep learning algorithms. However, most of these methods are pre-trained on ImageNet and their generalization to remote sensing imagery is not guaranteed due to the domain gap. In this work, we propose Seasonal Contrast (SeCo), an effective pipeline to leverage unlabeled data for in-domain pre-training of remote sensing representations. The SeCo pipeline is composed of two parts. First, a principled procedure to gather large-scale, unlabeled and uncurated remote sensing datasets containing images from multiple Earth locations at different timestamps. Second, a self-supervised algorithm that takes advantage of time and position invariance to learn transferable representations for remote sensing applications. We empirically show that models trained with SeCo achieve better performance than their ImageNet pre-trained counterparts and state-of-the-art self-supervised learning methods on multiple downstream tasks. The datasets and models in SeCo will be made public to facilitate transfer learning and enable rapid progress in remote sensing applications.
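
A compact numpy sketch of the generic InfoNCE-style contrastive objective that self-supervised pipelines of this kind typically optimize is given below. It is a standard formulation, not the exact SeCo loss or its seasonal/positional positive selection.

```python
# Generic InfoNCE-style contrastive loss sketch (numpy). This is a standard
# formulation used by many self-supervised methods, not the exact SeCo loss.
import numpy as np

def info_nce(z1, z2, temperature=0.1):
    """z1, z2: L2-normalized embeddings of two views, shape (batch, dim)."""
    logits = z1 @ z2.T / temperature             # pairwise similarities
    logits -= logits.max(axis=1, keepdims=True)  # numerical stability
    log_prob = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return float(-np.mean(np.diag(log_prob)))    # matching pairs are the positives

z1 = np.random.randn(8, 128); z1 /= np.linalg.norm(z1, axis=1, keepdims=True)
z2 = np.random.randn(8, 128); z2 /= np.linalg.norm(z2, axis=1, keepdims=True)
print(info_nce(z1, z2))
```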

}, url = {https://arxiv.org/abs/2103.16607}, author = {Ma{\~n}as,Oscar and Lacoste, Alexandre and Xavier Gir{\'o}-i-Nieto and Vazquez, David and Rodr{\'\i}guez, Pau} } @phdthesis {dGirbau21, title = {Sports broadcasting and multiple object tracking with deep learning methods}, volume = {PhD}, year = {2021}, month = {03/2021}, type = {Industrial}, abstract = {

Less than a decade ago, deep learning techniques started to dominate many different fields, revolutionizing the possibilities of artificial intelligence. Seeing their potential, industrial sectors started to invest in applying such technologies as key components of the company strategy. This thesis has been developed in an industrial context, at AutomaticTV. The main focus along this period has been the transfer of knowledge and know-how between academia and industry, the development of tools to exploit this knowledge, the exploration of new techniques for future challenges, and, from an academic research perspective, contributions to the multiple object tracking problem.

The first part of the thesis is devoted to the introduction of deep learning technologies to AutomaticTV, a company dedicated to automatic sports analysis and broadcasting, and the development of tools and tasks that surround the application.

The second part of this thesis introduces the contributions to the multiple object tracking challenge. We present TrajE, a trajectory estimator based on mixture density networks and beam search, used to boost the performance of existing multiple object trackers, and introduce an occlusion reconstruction step using the estimated trajectory information. By adding TrajE to an existing multiple object tracker, we boost its performance by 6.3 and 1.8 points in MOTA and IDF1 scores respectively, becoming the new state of the art in the MOTChallenge dataset.

}, author = {Girbau, A.}, editor = {Rius, Ignasi and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @mastersthesis {xEscur, title = {Attention-based multi-view 3D reconstruction models}, year = {2020}, abstract = {

Attention mechanisms have been successfully used in multiple tasks in the fields of Computer Vision and Natural Language Processing, but have never been applied to 3D reconstruction problems. In this work, we explore the potential of attention in a multi-view 3D face reconstruction pipeline. On one hand, we use spatial attention when extracting the features of the input images, taking advantage of the interpretability it provides us. This allows us to validate the proper behaviour of the model. On the other hand, we want to make this multi-view setup invariant to the order of the input images{\textquoteright} views. To do so, instead of concatenating the features of the different views, we use part of the Transformer architecture as a symmetric merging function, which is based on a multi-head self-attention mechanism, showing an improvement in performance.

}, author = {Escur, Janna}, editor = {Ramon, Eduard and Xavier Gir{\'o}-i-Nieto} } @conference {xCaros19, title = {Automatic Reminiscence Therapy for Dementia}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)}, year = {2020}, month = {06/2020}, publisher = {ACM}, organization = {ACM}, address = {Dublin, Ireland}, abstract = {

With people living longer than ever, the number of cases of dementia such as Alzheimer{\textquoteright}s disease increases steadily. It affects more than 46 million people worldwide, and it is estimated that by 2050 more than 100 million will be affected. While there are no effective treatments for these terminal diseases, therapies such as reminiscence, which stimulate memories from the past, are recommended. Currently, reminiscence therapy takes place in care homes and is guided by a therapist or a carer. In this work, we present an AI-based solution to automate reminiscence therapy, consisting of a dialogue system that uses photos as input to generate questions. We ran a usability case study with patients diagnosed with mild cognitive impairment, which shows that they found the system very entertaining and challenging. Overall, this paper presents how reminiscence therapy can be automated by using machine learning and deployed to smartphones and laptops, making the therapy more accessible to every person affected by dementia. (demo paper)

}, doi = {https://doi.org/10.1145/3372278.3391927}, url = {https://arxiv.org/abs/1910.11949}, author = {Caros, Mariona and Garolera, Maite and Radeva, Petia and Xavier Gir{\'o}-i-Nieto} } @conference {cFernandezf, title = {Enhancing Online Knowledge Graph Population with Semantic Knowledge}, booktitle = {19th International Semantic Web Conference (ISWC)}, year = {2020}, month = {11/2020}, address = {Virtual}, abstract = {

Knowledge Graphs (KG) are becoming essential to organize, represent and store the world{\textquoteright}s knowledge, but they still rely heavily on humanly-curated structured data. Information Extraction (IE) tasks, like disambiguating entities and relations from unstructured text, are key to automate KG population. However, Natural Language Processing (NLP) methods alone cannot guarantee the validity of the facts extracted and may introduce erroneous information into the KG. This work presents an end-to-end system that combines Semantic Knowledge and Validation techniques with NLP methods to provide KG population of novel facts from clustered news events. The contributions of this paper are two-fold: first, we present a novel method for including entity-type knowledge into a Relation Extraction model, improving F1-score over the baseline with the TACRED and TypeRE datasets. Second, we increase the precision by adding data validation on top of the Relation Extraction method. These two contributions are combined in an industrial pipeline for automatic KG population over aggregated news, demonstrating increased data validity when performing online learning from unstructured web data. Finally, the TypeRE and AggregatedNewsRE datasets built to benchmark these results are also published to foster future research in this field.

}, keywords = {Data Validation, Knowledge Graph, Relation Extraction}, author = {Fern{\`a}ndez, D{\`e}lia and Rimmek, Joan Marco and Espadaler, Joan and Garolera, Blai and Barja, Adri{\`a} and Codina, Marc and Sastre, Marc and Xavier Gir{\'o}-i-Nieto and Riveiro, Juan Carlos and Bou-Balust, Elisenda} } @article {aGene-Molac, title = {Fruit detection and 3D location using instance segmentation neural networks and structure-from-motion photogrammetry}, journal = {Computers and Electronics in Agriculture}, volume = {169}, year = {2020}, month = {02/2020}, abstract = {

The development of remote fruit detection systems able to identify and 3D locate fruits provides opportunities to improve the efficiency of agriculture management. Most current fruit detection systems are based on 2D image analysis. Although the use of 3D sensors is emerging, precise 3D fruit location is still a pending issue. This work presents a new methodology for fruit detection and 3D location consisting of: (1) 2D fruit detection and segmentation using the Mask R-CNN instance segmentation neural network; (2) 3D point cloud generation of detected apples using structure-from-motion (SfM) photogrammetry; (3) projection of 2D image detections onto 3D space; (4) false positive removal using a trained support vector machine. This methodology was tested on 11 Fuji apple trees containing a total of 1455 apples. Results showed that, by combining instance segmentation with SfM, the system performance increased from an F1-score of 0.816 (2D fruit detection) to 0.881 (3D fruit detection and location) with respect to the total amount of fruits. The main advantages of this methodology are the reduced number of false positives and the higher detection rate, while the main disadvantage is the high processing time required for SfM, which makes it presently unsuitable for real-time work. From these results, it can be concluded that the combination of instance segmentation and SfM provides high-performance fruit detection with high 3D data precision. The dataset has been made publicly available, and an interactive visualization of fruit detection results is accessible at http://www.grap.udl.cat/documents/photogrammetry_fruit_detection.html
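
The association between 2D detections and the SfM point cloud relies on standard camera projection. A minimal sketch of projecting 3D points with a known 3x4 camera matrix and testing whether they fall inside a detection's bounding box is given below; the variable names and the box test are illustrative, not the paper's exact mask-based procedure.

```python
# Illustrative sketch: project SfM 3D points into an image with a known 3x4
# camera projection matrix P and keep the points falling inside a detected
# apple's bounding box. The box test stands in for the paper's mask-based step.
import numpy as np

def project_points(P, points_xyz):
    """Project Nx3 world points with a 3x4 projection matrix; returns Nx2 pixels."""
    homog = np.hstack([points_xyz, np.ones((points_xyz.shape[0], 1))])
    uvw = homog @ P.T
    return uvw[:, :2] / uvw[:, 2:3]

def points_in_box(pixels, box):
    """box = (umin, vmin, umax, vmax); returns a boolean mask over the points."""
    umin, vmin, umax, vmax = box
    return (pixels[:, 0] >= umin) & (pixels[:, 0] <= umax) & \
           (pixels[:, 1] >= vmin) & (pixels[:, 1] <= vmax)
```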

}, keywords = {Fruit detection, Fruit location, Mask R-CNN, Structure-from-motion, Terrestrial remote sensing}, issn = {ISSN: 0168-1699}, doi = {https://doi.org/10.1016/j.compag.2019.105165}, url = {https://doi.org/10.1016/j.compag.2019.105165}, author = {Gen{\'e}-Mola, Jordi and Sanz, Ricardo and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Gregorio, Eduard} } @article {aRey-Arena20, title = {FuCiTNet: Improving the generalization of deep learning networks by the fusion of learned class-inherent transformations}, journal = {Information Fusion}, volume = {63}, year = {2020}, month = {10/2020}, chapter = {188}, abstract = {

It is widely known that very small datasets produce overfitting in Deep Neural Networks (DNNs), i.e., the network becomes highly biased to the data it has been trained on. This issue is often alleviated using transfer learning, regularization techniques and/or data augmentation. This work presents a new approach, independent of but complementary to the previously mentioned techniques, for improving the generalization of DNNs on very small datasets in which the involved classes share many visual features. The proposed model, called FuCiTNet (Fusion Class inherent Transformations Network), inspired by GANs, creates as many generators as classes in the problem. Each generator, k, learns the transformations that bring the input image into the k-class domain. We introduce a classification loss in the generators to drive the learning of specific k-class transformations. Our experiments demonstrate that the proposed transformations improve the generalization of the classification model in three diverse datasets.

}, doi = {10.1016/j.inffus.2020.06.015}, author = {Rey-Arena, M. and Guirado, E. and Tabik, S. and Ruiz-Hidalgo, J.} } @article {aGene-Mola20, title = {Fuji-SfM dataset: A collection of annotated images and point clouds for Fuji apple detection and location using structure-from-motion photogrammetry}, volume = {Data in Brief}, year = {2020}, month = {06/2020}, keywords = {Fruit detection, Mask R-CNN, Photogrammetry, Structure-from-motion, Terrestrial remote sensing, Yield mapping, Yield prediction}, doi = {https://doi.org/10.1016/j.dib.2020.105591}, author = {Gen{\'e}-Mola, Jordi and Sanz, Ricardo and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Gregorio, Eduard} } @article {aGarcia-Gomez20, title = {Geometric Model and Calibration Method for a Solid-State LiDAR}, journal = {Sensors}, volume = {20}, year = {2020}, month = {05/2020}, pages = {2898}, abstract = {

This paper presents a novel calibration method for solid-state LiDAR devices based on a geometrical description of their scanning system, which has variable angular resolution. Determining this distortion across the entire Field-of-View of the system yields accurate and precise measurements, which enables the device to be combined with other sensors. On the one hand, the geometrical model is formulated using the well-known Snell{\textquoteright}s law and the intrinsic optical assembly of the system, whereas on the other hand the proposed method describes the scanned scenario with an intuitive camera-like approach relating pixel locations with scanning directions. Simulations and experimental results show that the model fits real devices and that the calibration procedure accurately maps their variable resolution, so undistorted representations of the observed scenario can be provided. Thus, the calibration method proposed in this work is applicable and valid for existing scanning systems, improving their precision and accuracy by an order of magnitude.
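
A minimal sketch of the camera-like idea of mapping a pixel location to a scanning direction through pinhole-style intrinsics is given below. The published model additionally accounts for the optical assembly via Snell's law, which is not reproduced here; fx, fy, cx, cy are assumed parameters.

```python
# Minimal sketch of a camera-like pixel-to-direction mapping. The published
# model additionally corrects for the optical assembly (Snell's law), which
# is omitted here; fx, fy, cx, cy are assumed pinhole-style intrinsics.
import numpy as np

def pixel_to_direction(u, v, fx, fy, cx, cy):
    """Unit 3D direction associated with pixel (u, v)."""
    d = np.array([(u - cx) / fx, (v - cy) / fy, 1.0])
    return d / np.linalg.norm(d)

print(pixel_to_direction(320, 240, fx=500.0, fy=500.0, cx=320.0, cy=240.0))
```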

}, keywords = {solid-state LiDAR; LiDAR calibration; distortion correction; FOV mapping}, issn = {1424-8220}, doi = {10.3390/s20102898}, url = {https://www.mdpi.com/1424-8220/20/10/2898}, author = {Pablo Garc{\'\i}a-G{\'o}mez and S. Royo and Noel Rodrigo and Casas, J.} } @phdthesis {dPujol-Miro20, title = {Learning to extract features for 2D-3D multimodal registration}, year = {2020}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

The ability to capture depth information from a scene has greatly increased in recent years. 3D sensors, traditionally high-cost and low-resolution sensors, are being democratized, and 3D scans of indoor and outdoor scenes are becoming more and more common.

However, there is still a great data gap between the amount of captures being performed with 2D and 3D sensors. Although 3D sensors provide more information about the scene, 2D sensors are still more accessible and widely used. This trade-off between availability and information across sensors brings us to a multimodal scenario of mixed 2D and 3D data.

This thesis explores the fundamental block of this multimodal scenario: the registration between a single 2D image and a single unorganized point cloud. An unorganized 3D point cloud is the basic representation of a 3D capture. In this representation, the surveyed points are represented only by their real-world coordinates and, optionally, by their colour information. This simplistic representation brings multiple challenges to the registration, since most state-of-the-art works leverage the existence of metadata about the scene or prior knowledge.

Two different techniques are explored to perform the registration: a keypoint-based technique and an edge-based technique. The keypoint-based technique estimates the transformation by means of correspondences detected using Deep Learning, whilst the edge-based technique refines a transformation using a multimodal edge detection to establish anchor points to perform the estimation.

An extensive evaluation of the proposed methodologies is performed. Although further research is needed to achieve adequate performance, the obtained results show the potential of using deep learning techniques to learn 2D and 3D similarities. The results also show the good performance of the proposed 2D-3D iterative refinement, which reaches the level of the state of the art in 3D-3D registration.

}, url = {http://hdl.handle.net/2117/330132}, author = {A. Pujol-Mir{\'o}}, editor = {Casas, J. and Ruiz-Hidalgo, J.} } @mastersthesis {xManas, title = {Self-Supervised Visual Representation Learning for Remote Sensing}, year = {2020}, abstract = {

With the creation of large-scale annotated datasets such as the ImageNet, fully-supervised machine learning methods have become the standard for solving computer vision tasks. These methods require large amounts of labeled data, which are usually obtained with crowdsourcing tools or social media tags. However, these approaches do not scale for specialized domains, such as medical or satellite imaging, where annotations must be provided by experts at a prohibitive cost. Recently, self-supervised learning has emerged as an alternative for obtaining transferable visual representations from unlabeled data. Models based on these representations match the performance of fully-supervised models while only requiring a small fraction of the annotations. In this work, we aim to explore the application of self-supervised learning methods in the remote sensing domain. We propose a contrastive approach for learning visual representations by exploiting the multi-spectral information of satellite images. These representations serve as a good starting point for a variety of downstream tasks that involve remote sensing imagery, accelerating convergence with fewer labeled examples.

Best thesis award 2020 (tied with four other works)

}, author = {Ma{\~n}as,Oscar}, editor = {Rodr{\'\i}guez, Pau and Xavier Gir{\'o}-i-Nieto} } @conference {cCombaliac, title = {BCN20000: Dermoscopic Lesions in the Wild}, booktitle = {International Skin Imaging Collaboration (ISIC) Challenge on Dermoscopic Skin Lesion Analysis 2019}, year = {2019}, month = {10/2019}, abstract = {

This article summarizes the BCN20000 dataset, composed of 19424 dermoscopic images of skin lesions captured from 2010 to 2016 in the facilities of the Hospital Cl{\'\i}nic in Barcelona. With this dataset, we aim to study the problem of unconstrained classification of dermoscopic images of skin cancer, including lesions found in hard-to-diagnose locations (nails and mucosa), large lesions which do not fit in the aperture of the dermoscopy device, and hypo-pigmented lesions. The BCN20000 will be provided to the participants of the ISIC Challenge 2019 \cite{ISIC2019}, where they will be asked to train algorithms to classify dermoscopic images of skin cancer automatically.

}, author = {Marc Combalia and Noel C. F. Codella and Veronica Rotemberg and Brian Helba and Ver{\'o}nica Vilaplana and Ofer Reiter and Cristina Carrera and Alicia Barreiro and Allan C. Halpern and Susana Puig and Josep Malvehy} } @article {aPujol-Miro19, title = {Correspondence matching in unorganized 3D point clouds using Convolutional Neural Networks}, journal = {Image and Vision Computing}, volume = {83-84}, year = {2019}, month = {04/2019}, chapter = {51-60}, abstract = {

This document presents a novel method based on Convolutional Neural Networks (CNN) to obtain correspondence matchings between sets of keypoints of several unorganized 3D point cloud captures, independently of the sensor used. The proposed technique extends a state-of-the-art method for correspondence matching in standard 2D images to sets of unorganized 3D point clouds. The strategy consists of projecting the 3D neighborhood of the keypoint onto an RGBD patch and classifying patch pairs using CNNs. In the objective evaluation, the proposed CNN-based 3D point matching outperforms existing 3D feature descriptors, especially when intensity or color data are available.

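A toy sketch of the projection step described above, under the assumption of a PCA-based local frame: the 3D neighbourhood of a keypoint is rasterised into a small depth/intensity patch that a 2D CNN can classify; names, sizes and the random data are illustrative only:

import numpy as np

def neighborhood_patch(points, intensity, keypoint, k=200, size=32):
    d = np.linalg.norm(points - keypoint, axis=1)
    idx = np.argsort(d)[:k]                               # k nearest neighbours of the keypoint
    nb, nb_int, radius = points[idx] - keypoint, intensity[idx], d[idx].max()
    # local frame from PCA: two main axes span the patch, the third axis acts as depth
    _, _, vt = np.linalg.svd(nb - nb.mean(axis=0), full_matrices=False)
    uv, depth = nb @ vt[:2].T, nb @ vt[2]
    patch = np.zeros((size, size, 2))                     # channels: depth, intensity
    ij = np.clip(((uv / radius + 1) / 2 * (size - 1)).astype(int), 0, size - 1)
    patch[ij[:, 1], ij[:, 0], 0] = depth
    patch[ij[:, 1], ij[:, 0], 1] = nb_int
    return patch

pts, inten = np.random.rand(5000, 3), np.random.rand(5000)
print(neighborhood_patch(pts, inten, pts[0]).shape)       # (32, 32, 2)
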
}, doi = {https://doi.org/10.1016/j.imavis.2019.02.013}, author = {A. Pujol-Mir{\'o} and Casas, J. and Ruiz-Hidalgo, J.} } @article {aRoisman18, title = {Differential expression of long non-coding RNAs related to proliferation and histological diversity in follicular lymphomas}, journal = {British Journal of Haematology}, volume = {184}, year = {2019}, month = {Feb 2019}, pages = {373-383}, issn = {ISSN:1365-2141}, doi = {DOI: 10.1111/bjh.15656}, author = {A. Roisman and A. Navarro and G. Clot and G. Castellano and B. Gonzalez-Farre and P. P{\'e}rez-Galan and A. Esteve and M. Dabad and S. Heath and M. Gut and Bosio, M. and P. Bellot and Salembier, P. and Albert Oliveras and I. Slavusky and L. Magnano and H. Horn and A. Rosenwald and G. Ott and M. Aymerich and A. L{\'o}pez-Guillermo and P. Jares and J.I. Mart{\'\i}n-Subero and E. Campo and L. Hern{\'a}ndez} } @article {aGene-Molab, title = {Fruit Detection in an Apple Orchard Using a Mobile Terrestrial Laser Scanner}, journal = {Biosystems Engineering}, volume = {187}, year = {2019}, month = {09/2019}, chapter = {171}, abstract = {

The development of reliable fruit detection and localization systems provides an opportunity to improve crop value and management by limiting fruit spoilage and optimizing harvesting practices. Most proposed systems for fruit detection are based on RGB cameras and are thus affected by intrinsic constraints, such as variable lighting conditions. This work presents a new technique that uses a mobile terrestrial laser scanner (MTLS) to detect and localise Fuji apples. An experimental test focused on Fuji apple trees (Malus domestica Borkh. cv. Fuji) was carried out. A 3D point cloud of the scene was generated using an MTLS composed of a Velodyne VLP-16 LiDAR sensor synchronized with an RTK-GNSS satellite navigation receiver. A reflectance analysis of tree elements was performed, obtaining mean apparent reflectance values of 28.9\%, 29.1\%, and 44.3\% for leaves, branches and trunks, and apples, respectively. These results suggest that the apparent reflectance parameter (at a 905 nm wavelength) can be useful for detecting apples. For that purpose, a four-step fruit detection algorithm was developed. By applying this algorithm, a localization success of 87.5\%, an identification success of 82.4\%, and an F1-score of 0.858 were obtained with respect to the total number of fruits. These detection rates are similar to those obtained by RGB-based systems, but with the additional advantage of providing direct 3D fruit location information, which is not affected by sunlight variations. From the experimental results, it can be concluded that LiDAR-based technology and, particularly, its reflectance information, has potential for remote apple detection and 3D location.

}, issn = {1537-5110}, doi = {10.1016/j.biosystemseng.2019.08.017}, url = {https://authors.elsevier.com/c/1Zmc45Tbkk9EHW}, author = {Gen{\'e}-Mola, Jordi and Gregorio, Eduard and Guevara, Javier and Auat Cheein, Fernando and Sanz, Ricardo and Escol{\`a}, Alexandre and Llorens Calveras, Jordi and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R.} } @mastersthesis {xCaros, title = {A Generative Dialogue System for Reminiscence Therapy}, year = {2019}, abstract = {

With people living longer than ever, the number of cases of neurodegenerative diseases such as Alzheimer{\textquoteright}s or cognitive impairment increases steadily. In Spain it affects more than 1.2 million patients, and it is estimated that by 2050 more than 100 million people will be affected. While there are no effective treatments for this terminal disease, therapies such as reminiscence, which stimulate memories of the patient{\textquoteright}s past, are recommended, as they encourage communication and produce mental and emotional benefits for the patient. Currently, reminiscence therapy takes place in hospitals or residences, where the therapists are located. Since people who receive this therapy are old and may have mobility difficulties, we present an AI solution to guide older adults through reminiscence sessions using their laptop or smartphone.

Our solution consists of a generative dialogue system composed of two deep learning architectures that handle image and text content. An encoder-decoder with attention is trained to generate questions from photos provided by the user; it is composed of a pretrained Convolutional Neural Network to encode the picture and a Long Short-Term Memory network to decode the image features and generate the question. The second architecture is a sequence-to-sequence model that provides feedback to engage the user in the conversation.

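A simplified sketch, not the thesis implementation, of such an encoder-decoder with attention: a small stand-in CNN encodes the photo into a feature grid and an LSTM decoder with additive attention emits one question token per step; the vocabulary, dimensions and encoder are placeholder assumptions:

import torch
import torch.nn as nn
import torch.nn.functional as F

class QuestionGenerator(nn.Module):
    def __init__(self, vocab_size=1000, feat_dim=128, hid_dim=256, emb_dim=64):
        super().__init__()
        self.encoder = nn.Sequential(                     # stand-in for a pretrained CNN
            nn.Conv2d(3, 64, 3, stride=2, padding=1), nn.ReLU(),
            nn.Conv2d(64, feat_dim, 3, stride=2, padding=1), nn.ReLU())
        self.embed = nn.Embedding(vocab_size, emb_dim)
        self.att_feat = nn.Linear(feat_dim, hid_dim)
        self.att_hid = nn.Linear(hid_dim, hid_dim)
        self.att_score = nn.Linear(hid_dim, 1)
        self.lstm = nn.LSTMCell(emb_dim + feat_dim, hid_dim)
        self.out = nn.Linear(hid_dim, vocab_size)

    def forward(self, image, tokens):
        feats = self.encoder(image)                       # (B, C, H', W')
        B, C, H, W = feats.shape
        feats = feats.view(B, C, H * W).transpose(1, 2)   # (B, L, C) spatial features
        h = feats.new_zeros(B, self.lstm.hidden_size)
        c = feats.new_zeros(B, self.lstm.hidden_size)
        logits = []
        for t in range(tokens.size(1)):
            # additive attention over spatial locations
            scores = self.att_score(torch.tanh(
                self.att_feat(feats) + self.att_hid(h).unsqueeze(1)))       # (B, L, 1)
            context = (F.softmax(scores, dim=1) * feats).sum(dim=1)         # (B, C)
            h, c = self.lstm(torch.cat([self.embed(tokens[:, t]), context], dim=1), (h, c))
            logits.append(self.out(h))
        return torch.stack(logits, dim=1)                 # (B, T, vocab)

model = QuestionGenerator()
img = torch.randn(2, 3, 64, 64)
toks = torch.randint(0, 1000, (2, 8))
print(model(img, toks).shape)                             # torch.Size([2, 8, 1000])
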
Our experiments show that the best performance is obtained by training the dialogue model with the Persona dataset and fine-tuning it with the Cornell Movie-Dialogues dataset. Finally, we integrate Telegram as the interface for the user to interact with Elisabot, our trained conversational agent.

}, author = {Caros, Mariona}, editor = {Radeva, Petia and Xavier Gir{\'o}-i-Nieto} } @conference {cRamon, title = {Hyperparameter-Free Losses for Model-Based Monocular Reconstruction}, booktitle = {ICCV 2019 Workshop on Geometry Meets Deep Learning}, year = {2019}, month = {11/2019}, publisher = {IEEE / Computer Vision Foundation}, organization = {IEEE / Computer Vision Foundation}, address = {Seoul, South Corea}, abstract = {

This work proposes novel hyperparameter-free losses for single view 3D reconstruction with morphable models (3DMM). We dispense with the hyperparameters used in other works by exploiting geometry, so that the shape of the object and the camera pose are jointly optimized in a single-term expression. This simplification reduces the optimization time and its complexity. Moreover, we propose a novel implicit regularization technique based on random virtual projections that does not require additional 2D or 3D annotations. Our experiments suggest that minimizing a shape reprojection error together with the proposed implicit regularization is especially suitable for applications that require precise alignment between geometry and image spaces, such as augmented reality. We evaluate our losses on a large scale dataset with 3D ground truth and publish our implementations to facilitate reproducibility and public benchmarking in this field.

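An illustrative sketch of a single-term objective of this kind, assuming a toy morphable model and a scaled-orthographic camera: the only quantity minimised is the reprojection error, with shape coefficients and camera pose packed into one parameter vector:

import numpy as np

rng = np.random.default_rng(1)
mean_shape = rng.normal(size=(50, 3))                     # 50 toy model vertices
basis = rng.normal(size=(5, 50, 3))                       # 5 toy shape components

def rodrigues(w):
    """Rotation matrix from an axis-angle vector."""
    theta = np.linalg.norm(w)
    if theta < 1e-9:
        return np.eye(3)
    k = w / theta
    K = np.array([[0, -k[2], k[1]], [k[2], 0, -k[0]], [-k[1], k[0], 0]])
    return np.eye(3) + np.sin(theta) * K + (1 - np.cos(theta)) * K @ K

def reprojection_loss(params, observed_2d):
    alpha, w, t, s = params[:5], params[5:8], params[8:10], params[10]
    shape = mean_shape + np.tensordot(alpha, basis, axes=1)          # (50, 3)
    proj = s * (shape @ rodrigues(w).T)[:, :2] + t                    # scaled orthographic camera
    return np.mean(np.sum((proj - observed_2d) ** 2, axis=1))

params = np.concatenate([np.zeros(5), np.zeros(3), np.zeros(2), [1.0]])
obs = mean_shape[:, :2]                                   # toy "detected" 2D landmarks
print(reprojection_loss(params, obs))                     # ~0 for the ground-truth parameters
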
}, author = {Ramon, Eduard and Ruiz, Guillermo and Batard, Thomas and Xavier Gir{\'o}-i-Nieto} } @conference {cSalvadorf, title = {Inverse Cooking: Recipe Generation from Food Images}, booktitle = {CVPR}, year = {2019}, month = {06/2019}, publisher = {OpenCVF / IEEE}, organization = {OpenCVF / IEEE}, address = {Long Beach, CA, USA}, abstract = {

People enjoy food photography because they appreciate food. Behind each meal there is a story described in a complex recipe and, unfortunately, by simply looking at a food image we do not have access to its preparation process. Therefore, in this paper we introduce an inverse cooking system that recreates cooking recipes given food images. Our system predicts ingredients as sets by means of a novel architecture, modeling their dependencies without imposing any order, and then generates cooking instructions by attending to both image and its inferred ingredients simultaneously. We extensively evaluate the whole system on the large-scale Recipe1M dataset and show that (1) we improve performance w.r.t. previous baselines for ingredient prediction; (2) we are able to obtain high quality recipes by leveraging both image and ingredients; (3) our system is able to produce more compelling recipes than retrieval-based approaches according to human judgment.

}, url = {http://openaccess.thecvf.com/content_CVPR_2019/html/Salvador_Inverse_Cooking_Recipe_Generation_From_Food_Images_CVPR_2019_paper.html}, author = {Amaia Salvador and Drozdzal, Michal and Xavier Gir{\'o}-i-Nieto and Romero, Adriana} } @article {aGene-Molaa, title = {KFuji RGB-DS database: Fuji apple multi-modal images for fruit detection with color, depth and range-corrected IR data}, journal = {Data in Brief}, year = {2019}, month = {07/2019}, abstract = {

This article contains data related to the research article entitled {\textquotedblleft}Multi-modal Deep Learning for Fruit Detection Using RGB-D Cameras and their Radiometric Capabilities{\textquotedblright} [1]. The development of reliable fruit detection and localization systems is essential for future sustainable agronomic management of high-value crops. RGB-D sensors have shown potential for fruit detection and localization since they provide 3D information together with color data. However, the lack of substantial datasets is a barrier to exploiting the use of these sensors. This article presents the KFuji RGB-DS database, which is composed of 967 multi-modal images of Fuji apples on trees captured using a Microsoft Kinect v2 (Microsoft, Redmond, WA, USA). Each image contains information from 3 different modalities: color (RGB), depth (D) and range-corrected IR intensity (S). Ground truth fruit locations were manually annotated, labeling a total of 12,839 apples across the whole dataset. The dataset is publicly available at http://www.grap.udl.cat/publicacions/datasets.html.

}, keywords = {Depth cameras; RGB-D, Fruit detection, Fruit reflectance, Fuji apple, Multi-modal dataset}, doi = {10.1016/j.dib.2019.104289}, author = {Gen{\'e}-Mola, Jordi and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @article {aGene-Mola, title = {Multi-modal Deep Learning for Fuji Apple Detection Using RGB-D Cameras and their Radiometric Capabilities}, journal = {Computers and Electronics in Agriculture}, volume = {162}, year = {2019}, month = {07/2019}, chapter = {689-698}, abstract = {

Fruit detection and localization will be essential for future agronomic management of fruit crops, with applications in yield prediction, yield mapping and automated harvesting. RGB-D cameras are promising sensors for fruit detection given that they provide geometrical information with color data. Some of these sensors work on the principle of time-of-flight (ToF) and, besides color and depth, provide the backscatter signal intensity. However, this radiometric capability has not been exploited for fruit detection applications. This work presents the KFuji RGB-DS database, composed of 967 multi-modal images containing a total of 12,839 Fuji apples. Compilation of the database allowed a study of the usefulness of fusing RGB-D and radiometric information obtained with Kinect v2 for fruit detection. To do so, the signal intensity was range corrected to overcome signal attenuation, obtaining an image that was proportional to the reflectance of the scene. A registration between RGB, depth and intensity images was then carried out. The Faster R-CNN model was adapted for use with five-channel input images: color (RGB), depth (D) and range-corrected intensity signal (S). Results show an improvement of 4.46\% in F1-score when adding depth and range-corrected intensity channels, obtaining an F1-score of 0.898 and an AP of 94.8\% when all channels are used. From our experimental results, it can be concluded that the radiometric capabilities of ToF sensors give valuable information for fruit detection.

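A hedged sketch of how the five-channel input could be assembled, assuming (only for illustration) that the raw ToF intensity decays roughly with the squared range; the correction model, constants and data below are not the paper's calibration:

import numpy as np

def five_channel_input(rgb, depth, intensity, ref_range=1.0):
    # compensate attenuation so that intensity becomes roughly proportional to reflectance
    s = intensity * (depth / ref_range) ** 2
    s = s / (s.max() + 1e-9)                              # normalise to [0, 1]
    return np.dstack([rgb, depth[..., None], s[..., None]])   # (H, W, 5) input tensor

h, w = 120, 160
rgb = np.random.rand(h, w, 3)
depth = np.random.uniform(0.5, 4.0, size=(h, w))
intensity = np.random.rand(h, w)
print(five_channel_input(rgb, depth, intensity).shape)    # (120, 160, 5)
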
}, keywords = {Agricultural robotics, Convolutional Neural Networks, Fruit detection, Fruit reflectance, Multi-modal faster R-CNN, RGB-D}, doi = {10.1016/j.compag.2019.05.016}, author = {Gen{\'e}-Mola, Jordi and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @conference {cRamona, title = {Multi-View 3D Face Reconstruction in the Wild using Siamese Networks}, booktitle = {ICCV 2019 Workshop on 3D Face Alignment in the Wild Challenge Workshop (3DFAW)}, year = {2019}, month = {11/2019}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Seoul, South Corea}, abstract = {

In this work, we present a novel learning-based approach to reconstruct 3D faces from a single or multiple images. Our method uses a simple yet powerful architecture based on siamese neural networks that helps to extract relevant features from each view while keeping the models small. Instead of minimizing multiple objectives, we propose to simultaneously learn the 3D shape and the individual camera poses by using a single-term loss based on the reprojection error, which generalizes from one to multiple views. This allows the whole scene to be globally optimized without having to tune any hyperparameters, and achieves the low reprojection errors that are important for further texture generation. Finally, we train our model on a large-scale dataset with more than 6,000 facial scans. We report competitive results in the 3DFAW 2019 challenge, showing the effectiveness of our method.

}, author = {Ramon, Eduard and Escur, Janna and Xavier Gir{\'o}-i-Nieto} } @unpublished {cRamon19, title = {Plug-and-Train Loss for Model-Based Single View 3D Reconstruction}, journal = {BMVA Technical Meeting: 3D vision with Deep Learning}, year = {2019}, month = {02/2019}, publisher = {UPC}, address = {London, UK}, abstract = {

Obtaining 3D geometry from images is a problem well studied by the computer vision community. In the concrete case of a single image, a considerable amount of prior knowledge is often required to obtain plausible reconstructions. Recently, deep neural networks in combination with 3D morphable models (3DMM) have been used in order to address the lack of scene information, leading to more accurate results. Nevertheless, the losses employed during the training process are usually a linear combination of terms where the coefficients, also called hyperparameters, must be carefully tuned for each dataset to obtain satisfactory results. In this work we propose a hyperparameter-free loss that exploits the geometry of the problem for learning 3D reconstruction from a single image. The proposed formulation is not dataset dependent, is robust against very large camera poses and jointly optimizes the shape of the object and the camera pose.

}, author = {Ramon, Eduard and Villar, Jordi and Ruiz, Guillermo and Batard, Thomas and Xavier Gir{\'o}-i-Nieto} } @conference {cPareto, title = {Prediction of a second clinical event in CIS patients by combining lesion and brain features}, booktitle = {Congress of the European Comitee for Treatment and Research in Multiple Sclerosis (ECTRIMS 2019)}, year = {2019}, month = {09/2019}, abstract = {

Prediction of a second clinical event in clinically isolated syndrome (CIS) patients, leading to clinically definite multiple sclerosis (CDMS), is still a matter of investigation. The aim of the current study was to predict conversion to CDMS after a first attack, by means of a machine-learning approach and MRI-derived brain features. For that purpose, lesion-based features (volume and distribution within brain lobes) as well as cortical thickness (CT) and deep grey matter volumes (DGMV) were used. The final goal was to determine which features were more frequently found in the classification between converters and non-converters.

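A hedged sketch of this kind of feature-based classification with synthetic data: lesion and brain features per patient, a random-forest classifier, cross-validated accuracy and feature importances to inspect which features separate converters from non-converters; feature names and values are placeholders, not study data:

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
n = 120
features = np.column_stack([
    rng.normal(2.0, 1.0, n),     # lesion volume (toy)
    rng.normal(0.3, 0.1, n),     # frontal lesion fraction (toy)
    rng.normal(2.5, 0.2, n),     # mean cortical thickness (toy)
    rng.normal(60., 5.0, n),     # deep grey matter volume (toy)
])
converted = rng.integers(0, 2, n)                         # toy CDMS-conversion labels

clf = RandomForestClassifier(n_estimators=200, random_state=0)
print("CV accuracy:", cross_val_score(clf, features, converted, cv=5).mean())
clf.fit(features, converted)
print("feature importances:", clf.feature_importances_)
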
}, author = {Deborah Pareto and Pau Vidal and M. Alberich and Carlos Lopez and C. Auger and M. Tintor{\'e} and X. Montalban and J. Sastre-Garriga and Ver{\'o}nica Vilaplana and Alex Rovira} } @conference {cRuiz-Hidalgo19, title = {Residual Attention Graph Convolutional Network for Geometric 3D Scene Classification}, booktitle = {IEEE Conference on Computer Vision Workshop (ICCVW)}, year = {2019}, month = {11/2019}, publisher = {IEEE}, organization = {IEEE}, address = {Seoul, Korea}, abstract = {

Geometric 3D scene classification is a very challenging task. Current methodologies extract the geometric information using only the depth channel provided by an RGBD sensor. These kinds of methodologies introduce possible errors due to missing local geometric context in the depth channel. This work proposes a novel Residual Attention Graph Convolutional Network that exploits the intrinsic geometric context inside a 3D space without using any kind of point features, allowing the use of organized or unorganized 3D data. Experiments are done on the NYU Depth v1 and SUN-RGBD datasets to study the different configurations and to demonstrate the effectiveness of the proposed method. Experimental results show that the proposed method outperforms the current state of the art in geometric 3D scene classification tasks.

}, doi = {10.1109/ICCVW.2019.00507}, url = {https://imatge-upc.github.io/ragc/}, author = {Mosella-Montoro, Albert and Ruiz-Hidalgo, J.} } @conference {cGarcia-Gomez19, title = {Self-registered lidar and polarimetric images in real-time: application to detection of small objects at sea}, booktitle = {7th Workshop on Active Imaging}, year = {2019}, month = {11/2019}, publisher = {French-German Research Institute of Saint-Louis (ISL)}, organization = {French-German Research Institute of Saint-Louis (ISL)}, address = {Saint-Louis, France}, author = {Pablo Garc{\'\i}a-G{\'o}mez and Jordi Riu and Casas, J. and S. Royo} } @article {aKuijf, title = {Standardized Assessment of Automatic Segmentation of White Matter Hyperintensities; Results of the WMH Segmentation Challenge}, journal = {IEEE Transactions on Medical Imaging}, year = {2019}, month = {03/2019}, abstract = {

Quantification of white matter hyperintensities (WMH) of presumed vascular origin is of key importance in many neurological research studies. Advanced measurements are obtained from manual segmentations on brain MR images, which is a laborious procedure. Automatic WMH segmentation methods exist, but a standardized comparison of such methods is lacking. We organized a scientific challenge, in which developers could evaluate their method on a standardized multi-center/-scanner image dataset, giving an objective comparison: the WMH Segmentation Challenge (http://wmh.isi.uu.nl/). Sixty T1+FLAIR images from three MR scanners were released with manual WMH segmentations. A secret test set of 110 images from five MR scanners was used for evaluation. Methods had to be containerized and submitted to the challenge organizers. Five evaluation metrics were used to rank the methods: (1) Dice Similarity Coefficient, (2) modified Hausdorff distance (95th percentile), (3) absolute percentage volume difference, (4) sensitivity for detecting individual lesions, and (5) F1-score for individual lesions. Additionally, methods were ranked on their inter-scanner robustness. Twenty participants submitted their method for evaluation. This paper provides a detailed analysis of the results. In brief, there is a cluster of four methods that rank significantly better than the other methods. There is one clear winner, which also has the best inter-scanner robustness. The challenge remains open for future submissions and provides a public platform for method evaluation.

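A small sketch of two of the five ranking metrics listed above (Dice similarity coefficient and absolute percentage volume difference), computed from binary masks; the arrays are random stand-ins for real segmentations:

import numpy as np

def dice(pred, gt):
    inter = np.logical_and(pred, gt).sum()
    return 2.0 * inter / (pred.sum() + gt.sum() + 1e-9)

def abs_volume_difference(pred, gt):
    return 100.0 * abs(pred.sum() - gt.sum()) / (gt.sum() + 1e-9)

gt = np.random.rand(64, 64, 32) > 0.95                    # toy ground-truth WMH mask
pred = np.logical_xor(gt, np.random.rand(64, 64, 32) > 0.995)   # slightly perturbed copy
print(f"Dice={dice(pred, gt):.3f}  AVD={abs_volume_difference(pred, gt):.1f}%")
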
}, keywords = {brain, Evaluation and performance, Magnetic resonance imaging (MRI), segmentation}, issn = {0278-0062}, doi = {10.1109/TMI.2019.2905770}, author = {Hugo Kuijf and Matthijs Biesbroek and Jeroen de Bresser and Rutger Heinen and Simon Andermatt and Mariana Bento and Matt Berseth and Mikhail Belyaev and Jorge Cardoso and Adri{\`a} Casamitjana and Louis Collins and Mahsa Dadar and Achileas Georgiou and Mohsen Ghafoorian and Dakai Jin and April Khademi and Jesse Knight and Hongwei Li and Xavier Llado and Miguel Luna and Qaiser Mahmood and Richard McKinley and Alireza Mehrtash and Sebastien Ourselin and Bo-yong Park and Hyunkin Park and Sang Hyun Park and Simon Pezold and Elodie Puybareau and Leticia Rittner and Carole Sudre and Sergi Valverde and Ver{\'o}nica Vilaplana and Rolan Wiest and Yongchao Xu and Ziyue Xu and Guodong Zeng and Jianguo Zhang and Guoyan Zheng and Christoper Chen and Wiesje van der Flier and Frederik Barkhof and Max Viergever and Geert Jan Biessels} } @conference {cGene-Mola19, title = {Uso de redes neuronales convolucionales para la detecci{\'o}n remota de frutos con c{\'a}maras RGB-D}, booktitle = {Congreso Ib{\'e}rico de Agroingenier{\'\i}a}, year = {2019}, month = {09/2019}, publisher = { Universidad de Zaragoza (UZA)}, organization = { Universidad de Zaragoza (UZA)}, address = {Huesca}, abstract = {

Remote fruit detection will be an indispensable tool for the optimized and sustainable agronomic management of future fruit plantations, with applications in yield forecasting, harvesting robotization and production mapping. This work proposes the use of RGB-D depth cameras for fruit detection and subsequent 3D localization. The equipment used for data acquisition consists of a self-propelled ground platform fitted with two Microsoft Kinect v2 sensors and an RTK-GNSS positioning system. With this equipment, 3 rows of Fuji apple trees in a commercial orchard were scanned. The acquired dataset comprises 110 captures containing a total of 12,838 Fuji apples. Fruit detection was performed on the RGB data (colour images provided by the sensor). To this end, the Faster R-CNN object detection convolutional neural network was implemented and trained; it is composed of two modules: a region proposal network and a classification network. Both modules share the first convolutional layers, following the VGG-16 model pre-trained on the ImageNet database. Test results show a detection rate of 91.4\% of the fruits with 15.9\% false positives (F1-score = 0.876). A qualitative evaluation of the detections shows that the false positives correspond to image regions whose pattern is very similar to that of an apple, where even the human eye finds it difficult to decide whether there is an apple or not. On the other hand, the undetected apples correspond to those that were almost entirely occluded by other vegetative organs (leaves or branches) or to apples cut off by the image borders. From the experimental results it is concluded that the Kinect v2 sensor has great potential for fruit detection and 3D localization. The main limitation of the system is that the performance of the depth sensor is degraded under high-illumination conditions.

}, keywords = {C{\'a}maras de profundidad, Detecci{\'o}n de frutos, Redes neuronales convolucionales, RGB-D, Rob{\'o}tica agr{\'\i}cola}, doi = {https://doi.org/10.26754/c_agroing.2019.com.3325}, author = {Gen{\'e}-Mola, Jordi and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @conference {cDuartea, title = {Wav2Pix: Speech-conditioned Face Generation using Generative Adversarial Networks}, booktitle = {ICASSP}, year = {2019}, month = {05/2019}, publisher = {IEEE}, organization = {IEEE}, address = {Brighton, UK}, abstract = {

Speech is a rich biometric signal that contains information about the identity, gender and emotional state of the speaker. In this work, we explore its potential to generate face images of a speaker by conditioning a Generative Adversarial Network (GAN) with raw speech input. We propose a deep neural network that is trained from scratch in an end-to-end fashion, generating a face directly from the raw speech waveform without any additional identity information (e.g. a reference image or one-hot encoding). Our model is trained in a self-supervised fashion by exploiting the audio and visual signals naturally aligned in videos. For the purpose of training from video data, we present a novel dataset collected for this work, with high-quality videos of ten youtubers with notable expressiveness in both the speech and visual signals.

}, doi = {10.1109/ICASSP.2019.8682970}, url = {http://hdl.handle.net/2117/167073}, author = {Amanda Duarte and Rold{\'a}n, Francisco and Tubau, Miquel and Escur, Janna and Pascual-deLaPuente, Santiago and Amaia Salvador and Mohedano, Eva and McGuinness, Kevin and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xRoca, title = {Block-based Speech-to-Speech Translation}, year = {2018}, abstract = {

This bachelor{\textquoteright}s thesis explores different ways of building a block-based Speech Translation system with the aim of generating huge amounts of parallel speech data. The first goal is to research and manage to run suitable tools to implement each of the three blocks that make up the Speech Translation system: Speech Recognition, Translation and Speech Synthesis. We experiment with some open-source toolkits and manage to train a speech recognition system and a neural machine translation system, which we then test to evaluate their performance. As an alternative, we use the cloud computing solutions provided by Google Cloud to implement the three sequential blocks and successfully build the overall system. Finally, we make a comparative study of the in-house software development versus the cloud computing implementation.

}, author = {Roca, Sandra}, editor = {Amanda Duarte and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xRosello18, title = {Brain lesion segmentation using Convolutional Neuronal Networks}, year = {2018}, abstract = {

Convolutional neural networks (CNN) are powerful tools for learning representations from images. They are being used in a large range of applications and are the state of the art in many computer vision tasks. In this work, we study the brain tumor segmentation problem using CNNs and the publicly available BraTS dataset. One of the key factors for this task is the choice of training scheme, since it should deal with memory constraints and should alleviate the high imbalance between healthy and lesion tissue in the brain.

Thus, the purpose of this project is to propose a comparison between several training schemes and to extensively analyze and evaluate them in terms of the Dice score. We evaluate dense training against patch sampling and, in particular, fixed-rule against adaptive sampling schemes. Furthermore, variants and modifications of the existing training schemes have been proposed in order to enhance their performance. Finally, several loss functions for each training scheme have been analyzed.

}, author = {Clara Bon{\'\i}n Rosell{\'o}}, editor = {Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana} } @unpublished {xRamon, title = {Deep Learning algorithms for 3D Reconstruction and Simulation of Aesthetic Procedures}, year = {2018}, month = {07/2018}, type = {Phd Thesis Proposal}, abstract = {

3D technology is key for a wide range of industries. Medicine, construction, cinema and many other disciplines can nowadays digitize the world we perceive using 3D reconstruction algorithms, create new objects by means of 3D printers, or analyze the world using 3D detection and segmentation techniques.

These tools are transforming how research and industry problems are addressed. Specifically, in the field of aesthetic surgery, fluent communication between doctors and patients is crucial in order to maximize the success of the operations and the satisfaction of the patients. A new trend in the industry is to incorporate 3D technology during the consultation, with the goal of improving that communication. By reconstructing the body of the patient and simulating aesthetic procedures on it, he or she can see how a specific modification would look when applied to his or her body. It is also beneficial for physicians, since they can measure the effectiveness of the applied treatments and convert more consultations into operations due to increased patient confidence.

This thesis proposal defines the research directions to follow during an industrial doctorate to be developed at Crisalix Labs, in collaboration with the Image Processing Group at the Universitat Polit{\`e}cnica de Catalunya. Industrial doctorates are supported by the Government of Catalonia to promote the transfer of knowledge from universities to local industry as an element of innovation and technical excellence.

}, author = {Ramon, Eduard}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cGene-Mola18, title = {Fruit Detection Using Mobile Terrestrial Laser Scanning}, booktitle = {AgEng 2018,}, year = {2018}, month = {07/2018}, address = {Wageningen (Netherlands)}, abstract = {

The development of reliable fruit detection and localization systems is essential for future sustainable agronomic management of high-value crops. To date, most proposed systems for fruit detection and characterization are based on RGB cameras and are thus affected by intrinsic constraints, such as variable lighting conditions and camera calibration. This work presents a new technique that uses a mobile terrestrial laser scanner to detect and localize fruits regardless of the prevailing lighting conditions and without the need for a previous calibration. An experimental test focused on two Fuji apple trees (containing 139 and 145 apples each) was carried out. A 3D point cloud of this scene was generated using a Velodyne VLP-16 LiDAR sensor synchronized with an RTK-GNSS receiver. A reflectivity analysis of tree elements was performed, obtaining mean reflectivity values of 28.9\%, 29.1\%, and 44.3\% for leaves, trunks, and fruits, respectively. These results suggest that the reflectivity parameter can be useful to localize fruits in the tree. From this knowledge, a three-step fruit detection algorithm has been developed: 1) reflectivity thresholding to remove most of the leaves and trunks from the original point cloud; 2) statistical outlier removal to reduce noise; 3) connected components clustering using a density-based algorithm. By applying this algorithm to our dataset, a localization success of 85\%, a detachment success of 78.8\%, and a false detection rate of 15.2\% were obtained. These detection rates are similar to those obtained by current RGB-based systems, but with the additional advantage of providing direct 3D fruit location information (global coordinates), which is not affected by sunlight variations. It can be concluded that LiDAR technology and, particularly, its reflectivity information, might have potential use in fruit detection. Future work should include the application of this fruit detection technique to a wider range of crop types.

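A compact sketch of the three-step pipeline described above (reflectivity thresholding, statistical outlier removal, density-based clustering) applied to a synthetic point cloud; thresholds and parameters are illustrative only:

import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN

def detect_fruits(xyz, reflectivity, refl_thr=0.40, k=8, sor_std=1.0, eps=0.05):
    # 1) keep only points whose reflectivity suggests "apple"
    pts = xyz[reflectivity > refl_thr]
    # 2) statistical outlier removal: drop points whose mean k-NN distance is large
    d, _ = NearestNeighbors(n_neighbors=k + 1).fit(pts).kneighbors(pts)
    mean_d = d[:, 1:].mean(axis=1)
    pts = pts[mean_d < mean_d.mean() + sor_std * mean_d.std()]
    # 3) connected-components clustering with a density-based algorithm
    labels = DBSCAN(eps=eps, min_samples=10).fit_predict(pts)
    return [pts[labels == c].mean(axis=0) for c in set(labels) if c != -1]

xyz = np.random.rand(20000, 3)                            # toy point cloud
refl = np.random.rand(20000)                              # toy per-point reflectivity
print(len(detect_fruits(xyz, refl)), "candidate fruit locations")
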
}, author = {Gen{\'e}-Mola, Jordi and Gregorio, Eduard and Guevara, Javier and Auat, Fernando and Escol{\`a}, Alexandre and Morros, J.R. and Rosell-Polo, Joan R.} } @conference {cBakas19, title = {Identifying the best machine learning algorithms for brain tumor segmentation, progression assessment, and overall survival prediction in the BRATS challenge}, booktitle = {MICCAI - Multimodal Brain Tumor Segmentation Challenge}, year = {2018}, month = {12/2018}, abstract = {

Gliomas are the most common primary brain malignancies, with different degrees of aggressiveness, variable prognosis and various heterogeneous histologic sub-regions, i.e., peritumoral edematous/invaded tissue, necrotic core, active and non-enhancing core. This intrinsic heterogeneity is also portrayed in their radio-phenotype, as their sub-regions are depicted by varying intensity profiles disseminated across multi-parametric magnetic resonance imaging (mpMRI) scans, reflecting varying biological properties. Their heterogeneous shape, extent, and location are some of the factors that make these tumors difficult to resect, and in some cases inoperable. The amount of resected tumor is a factor also considered in longitudinal scans, when evaluating the apparent tumor for potential diagnosis of progression. Furthermore, there is mounting evidence that accurate segmentation of the various tumor sub-regions can offer the basis for quantitative image analysis towards prediction of patient overall survival. This study assesses the state-of-the-art machine learning (ML) methods used for brain tumor image analysis in mpMRI scans, during the last seven instances of the International Brain Tumor Segmentation (BraTS) challenge, i.e., 2012-2018. Specifically, we focus on i) evaluating segmentations of the various glioma sub-regions in pre-operative mpMRI scans, ii) assessing potential tumor progression by virtue of longitudinal growth of tumor sub-regions, beyond use of the RECIST/RANO criteria, and iii) predicting the overall survival from pre-operative mpMRI scans of patients that underwent gross total resection. Finally, we investigate the challenge of identifying the best ML algorithms for each of these tasks, considering that apart from being diverse on each instance of the challenge, the multi-institutional mpMRI BraTS dataset has also been a continuously evolving/growing dataset.

}, url = {https://arxiv.org/pdf/1811.02629.pdf}, author = {Spyridon Bakas and Mauricio Reyes and Andras Jakab and Stefan Bauer and Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and et. Al.} } @article {aDimiccoli18, title = {Introduction to the special issue: Egocentric Vision and Lifelogging}, journal = {Journal of Visual Communication and Image Representation}, year = {2018}, month = {06/2018}, abstract = {

}, doi = {10.1016/j.jvcir.2018.06.010}, url = {https://www.sciencedirect.com/science/article/pii/S1047320318301354}, author = {Dimiccoli, M. and Gurrin, Cathal and Crandall, David and Xavier Gir{\'o}-i-Nieto and Radeva, Petia} } @inbook {bGorriz18, title = {Leishmaniasis Parasite Segmentation and Classification Using Deep Learning}, booktitle = { Articulated Motion and Deformable Objects}, volume = {10945}, number = {Lecture Notes in Computer Science}, year = {2018}, pages = {53-62}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {

Leishmaniasis is considered a neglected disease that causes thousands of deaths annually in some tropical and subtropical countries. There are various techniques to diagnose leishmaniasis, of which manual microscopy is considered to be the gold standard. There is a need for the development of automatic techniques that are able to detect parasites in a robust and unsupervised manner. In this paper we present a procedure for automating the detection process based on a deep learning approach. We train a U-Net model that successfully segments Leishmania parasites and classifies them into promastigotes, amastigotes and adhered parasites.

}, issn = {978-3-319-94544-6}, doi = {10.1007/978-3-319-94544-6}, author = {G{\'o}rriz, Marc and Albert Aparicio and Berta Ravent{\'o}s and Ver{\'o}nica Vilaplana and Elisa Sayrol and Daniel L{\'o}pez-Codina} } @conference {cGorrizb, title = {Leishmaniasis Parasite Segmentation and Classification Using Deep Learning}, booktitle = {International Conference on Articulated Motion and Deformable Objects}, year = {2018}, address = {Palma, Spain}, abstract = {

Leishmaniasis is considered a neglected disease that causes thousands of deaths annually in some tropical and subtropical countries. There are various techniques to diagnose leishmaniasis, of which manual microscopy is considered to be the gold standard. There is a need for the development of automatic techniques that are able to detect parasites in a robust and unsupervised manner. In this paper we present a procedure for automating the detection process based on a deep learning approach. We train a U-Net model that successfully segments Leishmania parasites and classifies them into promastigotes, amastigotes and adhered parasites.

}, author = {G{\'o}rriz, Marc and Albert Aparicio and Berta Ravent{\'o}s and Daniel L{\'o}pez-Codina and Ver{\'o}nica Vilaplana and Elisa Sayrol} } @article {pPerez-Pellitero17, title = {Method for upscaling an image and apparatus for upscaling an image}, number = {US 20170132759 A1}, year = {2018}, month = {05/2017}, address = {US 20170132759 A1}, abstract = {

Image super-resolution (SR) generally enhances the resolution of images. One of SR{\textquoteright}s main challenges is discovering mappings between low-resolution (LR) and high-resolution (HR) image patches. The invention learns patch upscaling projection matrices from a training set of images. Input images are divided into overlapping patches, which are normalized and transformed to a defined orientation. Different transformations can be recognized and dealt with by using a simple 2D projection. The transformed patches are clustered, and cluster-specific upscaling projection matrices and the corresponding cluster centroids determined during training are applied to obtain upscaled patches. The upscaled patches are assembled into an upscaled image.

https://worldwide.espacenet.com/publicationDetails/biblio?II=2\&ND=3\&adjacent=true\&locale=en_EP\&FT=D\&date=20170511\&CC=US\&NR=2017132759A1\&KC=A1

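An illustrative sketch (not the patented implementation) of the inference step described above: each normalised low-resolution patch is assigned to the nearest cluster centroid and upscaled with that cluster's projection matrix; centroids and matrices are random stand-ins for trained ones:

import numpy as np

rng = np.random.default_rng(0)
n_clusters, lr_dim, hr_dim = 16, 25, 100                  # 5x5 LR patches -> 10x10 HR patches
centroids = rng.normal(size=(n_clusters, lr_dim))         # learned cluster centroids (toy)
projections = rng.normal(size=(n_clusters, hr_dim, lr_dim))   # per-cluster projection matrices (toy)

def upscale_patch(lr_patch):
    v = lr_patch.ravel()
    v = v / (np.linalg.norm(v) + 1e-9)                    # normalisation step
    cluster = np.argmin(np.linalg.norm(centroids - v, axis=1))
    return (projections[cluster] @ v).reshape(10, 10)     # apply that cluster's matrix

print(upscale_patch(rng.normal(size=(5, 5))).shape)       # (10, 10)
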
}, issn = {US 20170132759 A1}, url = {https://register.epo.org/ipfwretrieve?apn=US.201615341080.A\&lng=en}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @conference {cCaminal18, title = {SLAM-based 3D outdoor reconstructions from LIDAR data}, booktitle = {IC3D}, year = {2018}, month = {12/2018}, publisher = {IEEE}, organization = {IEEE}, address = {Brussels, Belgium}, abstract = {

The use of depth (RGBD) cameras to reconstruct large outdoor environments is not feasible due to lighting conditions and their low depth range; LIDAR sensors can be used instead. Most state-of-the-art SLAM methods are devoted to indoor environments and depth (RGBD) cameras. We have adapted two SLAM systems to work with LIDAR data. Quantitative evaluations have been performed with LIDAR and RGBD data, allowing the systems to be compared. Results show that the best method with LIDAR is RTAB-Map, by a clear margin. Additionally, RTAB-Map has been used to create 3D reconstructions with and without the use of a visible color camera. This proves the potential of LIDAR sensors for the reconstruction of outdoor environments for immersion or audiovisual production applications.

}, keywords = {3D imaging, Lidar cameras, mapping, point-cloud processing, SLAM, time-of-flight}, url = {http://www.stereopsia.com/international-conference-3d-immersion-ic3d}, author = {I. Caminal and Casas, J. and S. Royo} } @mastersthesis {xRoldana, title = {Speech-conditioned Face Generation with Deep Adversarial Networks}, year = {2018}, abstract = {

Image synthesis has been a trending task for the AI community in recent years. Many works have shown the potential of Generative Adversarial Networks (GANs) to deal with tasks such as text- or audio-to-image synthesis. In particular, recent advances in deep learning using audio have inspired many works involving both visual and auditory information. In this work we propose a face synthesis method using audio and/or language representations as inputs. Furthermore, a dataset relating speech utterances with a face and an identity has been built, suitable for tasks other than face synthesis such as speaker recognition or voice conversion.

}, author = {Rold{\'a}n, Francisco}, editor = {Pascual-deLaPuente, Santiago and Amaia Salvador and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @article {aMaceiraa, title = {3D hierarchical optimization for multi-view depth map coding}, journal = {Multimedia Tools and Applications}, year = {2017}, month = {12/2017}, abstract = {

Full version available at: http://rdcu.be/zHtU

Depth data has seen widespread use since the popularization of high-resolution 3D sensors. In multi-view sequences, depth information is used to supplement the color data of each view. This article proposes a joint encoding of multiple depth maps with a unique representation. Color and depth images of each view are segmented independently and combined in an optimal rate-distortion fashion. The resulting partitions are projected to a reference view, where a coherent hierarchy for the multiple views is built. A rate-distortion optimization is applied to obtain the final segmentation by choosing nodes of the hierarchy. The consistent segmentation is used to robustly encode the depth maps of multiple views, obtaining competitive results with respect to the HEVC coding standard.

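A toy sketch of the Rate-Distortion pruning used to choose the final coding partition from a hierarchy of regions: a node is coded as a single region only if its Lagrangian cost D + lambda*R beats the best cost of coding its children separately; the tree and costs below are made up for illustration:

class Region:
    def __init__(self, dist, rate, children=()):
        self.dist, self.rate, self.children = dist, rate, children

def best_partition(node, lam):
    """Return (cost, list of regions chosen as coding leaves)."""
    own_cost = node.dist + lam * node.rate
    if not node.children:
        return own_cost, [node]
    child_cost, child_leaves = 0.0, []
    for c in node.children:
        cost, leaves = best_partition(c, lam)
        child_cost += cost
        child_leaves += leaves
    return (own_cost, [node]) if own_cost <= child_cost else (child_cost, child_leaves)

root = Region(dist=10.0, rate=2.0, children=(
    Region(dist=2.0, rate=3.0), Region(dist=1.0, rate=4.0)))
cost, leaves = best_partition(root, lam=1.0)
print(cost, len(leaves))     # splitting (cost 10.0) beats coding the root alone (12.0)
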
}, keywords = {3D representation, Depth coding, Multiview coding, segmentation based coding}, doi = {10.1007/s11042-017-5409-z}, url = {http://rdcu.be/zHtU}, author = {Maceira, M. and David Varas and Morros, J.R. and Ruiz-Hidalgo, J. and Marqu{\'e}s, F.} } @conference {cvan Sabben17, title = {Collaborative voting of 3D features for robust gesture estimation}, booktitle = {International Conference on Acoustics, Speech and Signal Processing}, year = {2017}, month = {03/2017}, address = {New Orleans, USA}, abstract = {

Human body analysis is of special interest because it enables a wide range of interactive applications. In this paper we present a gesture estimator that discriminates body poses in depth images. A novel collaborative method is proposed to learn 3D features of the human body and, later, to estimate specific gestures. The collaborative estimation framework is inspired by decision forests, where each selected point (anchor point) contributes to the estimation by casting votes. The main idea is to detect a body part by accumulating the inference of other trained body parts. The collaborative voting encodes the global context of the human pose, while the 3D features represent local appearance. The body parts contributing to the detection are interpreted as a voting process. Experimental results for different 3D features prove the validity of the proposed algorithm.

}, author = {van Sabben, D. and Ruiz-Hidalgo, J. and Suau, X. and Casas, J.} } @inbook {bBellot17, title = {Efficient Combination of Pairwise Feature Networks}, booktitle = {Neural Connectomics Challenge}, year = {2017}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, chapter = {7}, issn = {978-3-319-53069-7}, doi = {10.1007/978-3-319-53070-3}, author = {P Bellot and Patrick Meyer}, editor = {Battaglia, D and Guyon, I. and Lemaire, V. and Orlandi, J. and Ray, B. and Soriano, J.} } @conference {cRomero-Lopeza, title = {The Impact of Segmentation on the Accuracy and Sensitivity of a Melanoma Classifier based on Skin Lesion Images}, booktitle = {Annual Meeting of the Society of Imaging Informatics in Medicine (SIIM)}, year = {2017}, month = {06/2017}, publisher = {Society of Imaging Informatics for Medicine}, organization = {Society of Imaging Informatics for Medicine}, address = {Pittsburgh, PA, USA}, abstract = {

The accuracy and sensitivity of a deep learning based 2-class classifier for early melanoma detection from dermoscopic skin lesion images increase when the classifier is trained with segmented inputs (i.e., images containing only the lesions as binary masks, without the surrounding context) instead of entire images.

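A small sketch of the data preparation behind this finding: the classifier input keeps only the lesion pixels, with everything outside the (here synthetic) binary mask zeroed out so that no surrounding skin context is seen:

import numpy as np

def segmented_input(image, lesion_mask):
    """Keep only lesion pixels; the background is set to zero before classification."""
    return image * lesion_mask[..., None]

img = np.random.rand(224, 224, 3)                         # toy dermoscopic image
mask = np.zeros((224, 224), dtype=float)
mask[60:160, 70:170] = 1.0                                # toy binary lesion mask
print(segmented_input(img, mask).shape)                   # (224, 224, 3)
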
}, url = {http://hdl.handle.net/2117/105582}, author = {Romero-Lopez, Adria and Burdick, Jack and Xavier Gir{\'o}-i-Nieto and Marques, Oge} } @conference {cGurrina, title = {LTA 2017: The Second Workshop on Lifelogging Tools and Applications}, booktitle = {ACM Multimedia}, year = {2017}, month = {10/2017}, publisher = {ACM}, organization = {ACM}, address = {Mountain View, California USA}, abstract = {

The organisation of personal data is receiving increasing research attention due to the challenges we face in gathering, enriching, searching, and visualising such data. Given the increasing ease with which personal data is being gathered by individuals, the concept of a lifelog digital library of rich multimedia and sensory content for every individual is fast becoming a reality. The LTA 2017 workshop aims to bring together academics and practitioners to discuss approaches to lifelog data analytics and applications, and to debate the opportunities and challenges for researchers in this new and challenging area.

}, doi = {10.1145/3123266.3132050}, author = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto and Radeva, Petia and Dimiccoli, M. and Dang-Nguyen, Duc-Tien and Joho, H.} } @phdthesis {dPerez-Pellitero17, title = {Manifold Learning for Super Resolution}, year = {2017}, school = {Leibniz Universit{\"a}t Hannover}, address = {Hannover}, abstract = {

The development pace of high-resolution displays has been so fast in recent years that many images acquired with low-end capture devices are already outdated or will be shortly. Super Resolution is central to matching the resolution of already existing image content to that of current and future high-resolution displays and applications. This dissertation is focused on learning how to upscale images from the statistics of natural images. We build on a sparsity model that uses learned coupled low- and high-resolution dictionaries in order to upscale images.

Firstly, we study how to adaptively build coupled dictionaries so that their content is semantically related with the input image. We do so by using a Bayesian selection stage which finds the best-fitted texture regions from the training dataset for each input image. The resulting adapted subset of patches is compressed into a coupled dictionary via sparse coding techniques.

We then shift from l1 to a more efficient l2 regularization, as introduced by Timofte et al. Instead of using their patch-to-dictionary decomposition, we propose a fully collaborative neighbor embedding approach. In this novel scheme, for each atom in the dictionary we create a densely populated neighborhood from an extensive training set of raw patches (i.e. in the order of hundreds of thousands). This generates more accurate regression functions.

We additionally propose using sublinear search structures such as spherical hashing and trees to speed up the nearest neighbor search involved in regression-based Super Resolution. We study the positive impact of antipodally invariant metrics for linear regression frameworks, and we propose two efficient solutions: (a) the Half Hypersphere Confinement, which enables antipodal invariance within the Euclidean space, and (b) the bimodal tree, whose split functions are designed to be antipodally invariant and which we use in the context of a Bayesian Super Resolution forest.

In our last contribution, we extend antipodal invariance by also taking into consideration the dihedral group of transforms (i.e. rotations and reflections). We study them as a group of symmetries within the high-dimensional manifold. We obtain the respective set of mirror-symmetry axes by means of a frequency analysis, and we use them to collapse the redundant variability, resulting in a reduced manifold span which, in turn, greatly improves quality performance and reduces the dictionary sizes.

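A brief sketch of an antipodally invariant similarity of the kind discussed above, in which a patch x and its negated version -x are treated as the same point on the manifold; the dictionary atoms are random and purely illustrative:

import numpy as np

def antipodal_nearest(query, atoms):
    """Index of the dictionary atom closest to `query` up to sign flips."""
    q = query / np.linalg.norm(query)
    a = atoms / np.linalg.norm(atoms, axis=1, keepdims=True)
    return int(np.argmax(np.abs(a @ q)))                  # |cosine| ignores the sign

atoms = np.random.randn(1024, 64)                         # toy dictionary
x = atoms[10] * -3.0                                      # anti-parallel to atom 10
print(antipodal_nearest(x, atoms))                        # -> 10
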
}, author = {E. Perez-Pellitero}, editor = {Rosenhahn, B. and Ruiz-Hidalgo, J.} } @phdthesis {dMaceira17, title = {Multi-view depth coding based on a region representation combining color and depth information}, year = {2017}, month = {06/2017}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {PhD}, abstract = {

Depth map data is used to supplement the color data in multi-view sequences. As depth maps present distinct characteristics from natural color images, new coding techniques are required to represent their smooth regions and sharp edges. In this thesis, segmentation-based coding techniques are proposed to encode depth maps by exploiting the redundancy between color and depth information. The methods developed combine partitions obtained from color and depth images to find efficient representations. The color image is assumed to be available before the depth map coding process, therefore a color partition can be obtained at the decoder without introducing any coding cost.

Two hierarchical image segmentation algorithms are proposed to generate color and depth partitions for coding applications. The color segmentation obtains a super-pixel representation using color information, spatial distribution and shape complexity. The depth segmentation uses a 3D planar model for each region to extract the structure of the scene. Color and depth partitions are combined in depth map coding methods to find the final coding partition.

Different methods for texture representation have been explored in this thesis. Initial approaches used 2D coding methods, while a 3D representation has been proposed to represent depth maps from multiple views with a unique segmentation. This 3D representation is used to segment depth maps in single-view and multi-view configurations. Final coding partitions are obtained with a rate-distortion optimization over a hierarchy of regions. The proposed segmentation-based coding techniques obtain competitive results with respect to the HEVC coding standard.

}, url = {http://hdl.handle.net/2117/110439}, author = {Maceira, M.}, editor = {Ruiz-Hidalgo, J. and Morros, J.R.} } @conference {cPujol-Miro17, title = {Registration of Images to Unorganized 3D Point Clouds Using Contour Cues}, booktitle = {The 25th European Signal Processing Conference (EUSIPCO 2017)}, year = {2017}, month = {08/2017}, publisher = {Eurasip}, organization = {Eurasip}, address = {Kos island, Greece}, abstract = {

Low-resolution commercial 3D sensors contribute to computer vision tasks even better when the analysis is carried out in combination with higher-resolution image data. This requires registration of 2D images to unorganized 3D point clouds. In this paper we present a framework for 2D-3D data fusion that directly obtains the camera pose of a 2D color image in relation to a 3D point cloud. It includes a novel multiscale intensity feature detection algorithm and a modified ICP procedure based on point-to-line distances. The framework is generic for several data types (such as CAD designs or LiDAR data without photometric information), and results show that performance is comparable to the state of the art, while avoiding manual markers or specific patterns on the data.

}, keywords = {Cameras, Feature extraction, Iterative closest point algorithm, Sensors, Signal processing algorithms, Three-dimensional displays}, doi = {10.23919/EUSIPCO.2017.8081173}, url = {https://www.eusipco2017.org/}, author = {A. Pujol-Mir{\'o} and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cBazazian16, title = {Segmentation-based Multi-Scale Edge Extraction to Measure the Persistence of Features in Unorganized Point Clouds}, booktitle = {International Conference on Computer Vision Theory and Applications}, year = {2017}, month = {02/2017}, address = {Porto, Portugal}, abstract = {

Edge extraction has attracted a lot of attention in computer vision. The accuracy of extracting edges in point clouds can be a significant asset for a variety of engineering scenarios. To address this, we propose a segmentation-based multi-scale edge extraction technique.

In this approach, different regions of a point cloud are segmented by a global analysis according to the geodesic distance. Afterwards, a multi-scale operator is defined according to local neighborhoods. Then, by applying this operator at multiple scales of the point cloud, the persistence of features is determined. We illustrate the proposed method by computing a feature weight that measures the likelihood of a point to be an edge, and then detect the edge points based on that value at both global and local scales. Moreover, we evaluate our method quantitatively and qualitatively. Experimental results show that the proposed approach achieves superior accuracy. Furthermore, we demonstrate the robustness of our approach on noisier real-world datasets.

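A condensed sketch of the per-point feature weight idea, assuming the weight is the surface variation of the local covariance (smallest eigenvalue over the eigenvalue sum) evaluated at several neighbourhood radii; the synthetic creased surface and radii are illustrative:

import numpy as np

def surface_variation(points, center, radius):
    """Smallest covariance eigenvalue over the eigenvalue sum of the local neighbourhood."""
    nb = points[np.linalg.norm(points - center, axis=1) < radius]
    if len(nb) < 4:
        return 0.0
    eig = np.linalg.eigvalsh(np.cov(nb.T))                # ascending eigenvalues
    return eig[0] / (eig.sum() + 1e-12)

# synthetic surface with a crease along x = 0.5: z = 0.5 * |x - 0.5|
pts = np.random.rand(4000, 3)
pts[:, 2] = 0.5 * np.abs(pts[:, 0] - 0.5)
crease, flat = np.array([0.5, 0.5, 0.0]), np.array([0.2, 0.5, 0.15])
# a point is kept as a persistent edge if its weight stays high at every scale
for r in (0.05, 0.1, 0.2):
    print(r, surface_variation(pts, crease, r) > surface_variation(pts, flat, r))
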
}, doi = {10.5220/0006092503170325}, url = {http://www.scitepress.org/DigitalLibrary/Link.aspx?doi=10.5220\%2f0006092503170325}, author = {D. Bazazian and Casas, J. and Ruiz-Hidalgo, J.} } @conference {xLidona, title = {Semantic Summarization of Egocentric Photo Stream Events}, booktitle = {ACM Multimedia 2017 Workshop on Lifelogging Tools and Applications}, year = {2017}, month = {10/2017}, publisher = {ACM}, organization = {ACM}, address = {Mountain View, CA, USA}, abstract = {

With the rapid increase in the number of users of wearable cameras in recent years and in the amount of data they produce, there is a strong need for automatic retrieval and summarization techniques. This work addresses the problem of automatically summarizing egocentric photo streams captured through a wearable camera by taking an image retrieval perspective. After removing non-informative images with a new CNN-based filter, images are ranked by relevance to ensure semantic diversity and finally re-ranked by a novelty criterion to reduce redundancy. To assess the results, a new evaluation metric is proposed which takes into account the non-uniqueness of the solution. Experimental results on a database of 7,110 images from 6 different subjects, evaluated by experts, gave 95.74\% expert satisfaction and a Mean Opinion Score of 4.57 out of 5.0.

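A compact sketch of the rank-then-re-rank step described above: images already filtered as informative are ordered by a relevance score and then greedily re-ranked so that each selected image is also novel with respect to those already chosen; scores and embeddings are random placeholders:

import numpy as np

def rerank(embeddings, relevance, k=5, trade_off=0.7):
    emb = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    selected, candidates = [], list(range(len(emb)))
    while candidates and len(selected) < k:
        def score(i):
            # redundancy: highest similarity to anything already in the summary
            redundancy = max(emb[i] @ emb[j] for j in selected) if selected else 0.0
            return trade_off * relevance[i] - (1 - trade_off) * redundancy
        best = max(candidates, key=score)
        selected.append(best)
        candidates.remove(best)
    return selected

emb = np.random.randn(50, 128)                            # toy image embeddings
rel = np.random.rand(50)                                  # toy relevance scores
print(rerank(emb, rel))                                   # indices of a diverse, relevant summary
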
}, doi = {10.1145/3133202.3133204}, url = {https://dl.acm.org/citation.cfm?id=3133204}, author = {Lidon, Aniol and Bola{\~n}os, Marc and Dimiccoli, M. and Radeva, Petia and Garolera, Maite and Xavier Gir{\'o}-i-Nieto} } @conference {cRomero-Lopez, title = {Skin Lesion Classification from Dermoscopic Images using Deep Learning}, booktitle = {The 13th IASTED International Conference on Biomedical Engineering (BioMed 2017)}, year = {2017}, month = {02/2017}, address = {Innsbruck Austria}, abstract = {

The recent emergence of deep learning methods for medical image analysis has enabled the development of intelligent medical imaging-based diagnosis systems that can assist the human expert in making better decisions about a patient{\textquoteright}s health. In this paper we focus on the problem of skin lesion classification, particularly early melanoma detection, and present a deep-learning based approach to solve the problem of classifying a dermoscopic image containing a skin lesion as malignant or benign. The proposed solution is built around the VGGNet convolutional neural network architecture and uses the transfer learning paradigm. Experimental results are encouraging: on the ISIC Archive dataset, the proposed method achieves a sensitivity value of 78.66\%, which is significantly higher than the current state of the art on that dataset.

}, keywords = {Convolutional Neural Networks, deep learning, machine learning, Medical Decision Support Systems, Medical Image Analysis, Skin Lesions}, url = {http://upcommons.upc.edu/handle/2117/103386}, author = {Romero-Lopez, Adria and Burdick, Jack and Xavier Gir{\'o}-i-Nieto and Marques, Oge} } @mastersthesis {xRomero-Lopez, title = {Skin Lesion Detection from Dermoscopic Images using Convolutional Neural Networks}, year = {2017}, abstract = {

Advisors: Oge Marques (Florida Atlantic University) and Xavier Giro-i-Nieto (UPC)

The recent emergence of machine learning and deep learning methods for medical image analysis has enabled the development of intelligent medical imaging-based diagnosis systems that can assist physicians in making better decisions about a patient{\textquoteright}s health. In particular, skin imaging is a field where these new methods can be applied with a high rate of success.

This thesis focuses on the problem of automatic skin lesion detection, particularly on melanoma detection, by applying semantic segmentation and classification to dermoscopic images using a deep learning based approach. For the first problem, a U-Net convolutional neural network architecture is applied for an accurate extraction of the lesion region. For the second problem, the current model performs a binary classification (benign versus malignant) that can be used for early melanoma detection. The model is general enough to be extended to multi-class skin lesion classification. The proposed solution is built around the VGG-Net ConvNet architecture and uses the transfer learning paradigm. Finally, this work performs a comparative evaluation of classification alone (using the entire image) against a combination of the two approaches (segmentation followed by classification) in order to assess which of them achieves better classification results.

[Source code]

}, author = {Romero-Lopez, Adria}, editor = {Xavier Gir{\'o}-i-Nieto and Marques, Oge} } @conference {cLuque17, title = {Spatio-Temporal Road Detection from Aerial Imagery using CNNs}, booktitle = {International Conference on Computer Vision Theory and Applications}, year = {2017}, month = {2/2017}, address = {Porto, Portugal}, abstract = {

The main goal of this paper is to detect roads in aerial imagery recorded by drones. To achieve this, we propose a modification of SegNet, a deep fully convolutional neural network for image segmentation. In order to train this neural network, we have put together a database containing videos of roads from the point of view of a small commercial drone. Additionally, we have developed an image annotation tool based on the watershed technique in order to perform a semi-automatic labeling of the videos in this database. The experimental results using our modified version of SegNet show a significant improvement in the performance of the neural network when using aerial imagery, obtaining over 90\% accuracy.

}, doi = {10.5220/0006128904930500}, author = {Luque, B. and Morros, J.R. and Ruiz-Hidalgo, J.} } @mastersthesis {xRoldan, title = {Visual Question Answering 2.0}, year = {2017}, abstract = {

This bachelor{\textquoteright}s thesis explores different deep learning techniques to solve the Visual Question-Answering (VQA) task, whose aim is to answer questions about images. We study different Convolutional Neural Networks (CNN) to extract the visual representation from images: Kernelized-CNN (KCNN), VGG-16 and Residual Networks (ResNet). We also analyze the impact of using pre-computed word embeddings trained on large datasets (GloVe embeddings). Moreover, we examine different techniques for joining representations from different modalities. This work has been submitted to the second edition of the Visual Question Answering Challenge, and obtained an accuracy of 43.48\%.

[Project page]

}, author = {Rold{\'a}n, Francisco}, editor = {Xavier Gir{\'o}-i-Nieto and Masuda-Mora, Issey and Pascual-deLaPuente, Santiago} } @conference {cFernandeza, title = {ViTS: Video Tagging System from Massive Web Multimedia Collections}, booktitle = {ICCV 2017 Workshop on Web-scale Vision and Social Media }, year = {2017}, month = {10/2017}, address = {Venice, Italy}, abstract = {

The popularization of multimedia content on the Web has raised the need to automatically understand, index and retrieve it. In this paper we present ViTS, an automatic Video Tagging System which learns from videos, their web context and comments shared on social networks. ViTS analyses massive multimedia collections by Internet crawling, and maintains a knowledge base that is updated in real time with no need for human supervision. As a result, each video is indexed with a rich set of labels and linked with other related contents. ViTS is an industrial product under exploitation with a vocabulary of over 2.5M concepts, capable of indexing more than 150k videos per month. We compare the quality and completeness of our tags with respect to the ones in the YouTube-8M dataset, and we show how ViTS enhances the semantic annotation of the videos with a larger number of labels (10.04 tags/video), with an accuracy of 80.87\%.

}, author = {Fern{\`a}ndez, D{\`e}lia and David Varas and Espadaler, Joan and Ferreira, Jordi and Woodward, Alejandro and Rodr{\'\i}guez, David and Xavier Gir{\'o}-i-Nieto and Riveiro, Juan Carlos and Bou, Elisenda} } @article {aPerez-Pellitero16, title = {Antipodally Invariant Metrics For Fast Regression-Based Super-Resolution}, journal = {IEEE Transactions on Image Processing}, volume = {25}, year = {2016}, month = {06/2016}, pages = {2468}, chapter = {2456}, abstract = {

Dictionary-based Super-Resolution algorithms usually select dictionary atoms based on distance or similarity metrics. Although the optimal selection of nearest neighbors is of central importance for such methods, the impact of using proper metrics for Super-Resolution (SR) has been overlooked in the literature, mainly due to the vast usage of Euclidean distance. In this paper we present a very fast regression-based algorithm which builds on densely populated anchored neighborhoods and sublinear search structures. We perform a study of the nature of the features commonly used for SR, observing that those features usually lie on the unit hypersphere, where every point has a diametrically opposite one, i.e. its antipode, with the same modulus and angle but opposite direction. Even though we validate the benefits of using antipodally invariant metrics, most of the binary splits use Euclidean distance, which does not handle antipodes optimally. In order to benefit from both worlds, we propose a simple yet effective Antipodally Invariant Transform (AIT) that can be easily included in the Euclidean distance calculation. We modify the original Spherical Hashing algorithm with this metric in our Antipodally Invariant Spherical Hashing scheme, obtaining the same performance as a pure antipodally invariant metric. We round up our contributions with a novel feature transform that obtains a better coarse approximation of the input image thanks to Iterative Back Projection. The performance of our method, which we name Antipodally Invariant Super-Resolution (AIS), improves quality (PSNR) and is faster than any other state-of-the-art method.
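As a quick illustration of what antipodal invariance means for features on the unit hypersphere (a generic toy example, not the paper's AIT or hashing scheme):

```python
# Sketch only: a point x and its antipode -x are treated as equivalent,
# unlike with plain Euclidean distance.
import numpy as np

def euclidean(x, y):
    return np.linalg.norm(x - y)

def antipodal_distance(x, y):
    # Take the closer of y and its antipode -y.
    return min(np.linalg.norm(x - y), np.linalg.norm(x + y))

x = np.array([1.0, 0.0])
y = -x  # antipode of x
print(euclidean(x, y))           # 2.0 (maximal distance on the unit circle)
print(antipodal_distance(x, y))  # 0.0 (x and -x considered identical)
```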

}, doi = {10.1109/TIP.2016.2549362}, url = {http://perezpellitero.github.io/project_websites/ais_sr.html}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @conference {cPoignant16, title = {The CAMOMILE Collaborative Annotation Platform for Multi-modal, Multi-lingual and Multi-media Documents}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, year = {2016}, month = {05/2016}, address = {Portoro{\v z} (Slovenia)}, abstract = {

In this paper, we describe the organization and the implementation of the CAMOMILE collaborative annotation framework for multimodal, multimedia, multilingual (3M) data. Given the versatile nature of the analysis which can be performed on 3M data, the structure of the server was kept intentionally simple in order to preserve its genericity, relying on standard Web technologies. Layers of annotations, defined as data associated to a media fragment from the corpus, are stored in a database and can be managed through standard interfaces with authentication. Interfaces tailored specifically to the needed task can then be developed in an agile way, relying on simple but reliable services for the management of the centralized annotations. We then present our implementation of an active learning scenario for person annotation in video, relying on the CAMOMILE server; during a dry run experiment, the manual annotation of 716 speech segments was thus propagated to 3504 labeled tracks. The code of the CAMOMILE framework is distributed in open source.

}, keywords = {active learning, Annotation tool, collaborative annotation, multimedia, person annotation}, isbn = {978-2-9517408-9-1}, url = {http://www.lrec-conf.org/proceedings/lrec2016/pdf/456_Paper.pdf}, author = {Johann Poignant and Mateusz Budnik and Herv{\'e} Bredin and Claude Barras and Mickael Stefas and Pierrick Bruneau and Gilles Adda and Laurent Besacier and Hazim Ekenel and Gil Francopoulo and Javier Hernando and Joseph Mariani and Morros, J.R. and Georges Qu{\'e}not and Sophie Rosset and Thomas Tamisier} } @article {aMaceira, title = {Depth map compression via 3D region-based representation}, journal = {Multimedia Tools and Applications}, year = {2016}, month = {07/2016}, abstract = {


In 3D video, view synthesis is used to create new virtual views between encoded camera views. Errors in the coding of the depth maps introduce geometry inconsistencies in synthesized views. In this paper, a new 3D plane representation of the scene is presented which improves the performance of current standard video codecs in the view synthesis domain. Two image segmentation algorithms are proposed for generating a color and a depth segmentation. Using both partitions, depth maps are segmented into regions without sharp discontinuities without having to explicitly signal all depth edges. The resulting regions are represented using a planar model in the 3D world scene. This 3D representation allows an efficient encoding while preserving the 3D characteristics of the scene. The 3D planes open up the possibility of coding multiview images with a unique representation.

}, keywords = {3D representation, Data Compression, Depth map coding, Image segmentation}, doi = {10.1007/s11042-016-3727-1}, url = {http://rdcu.be/nqyE}, author = {Maceira, M. and Morros, J.R. and Ruiz-Hidalgo, J.} } @conference {cPerez-Pellitero16, title = {Half Hypersphere Confinement for Piecewise Linear Regression}, booktitle = {IEEE Winter Conference on Applications of Computer Vision}, year = {2016}, month = {03/2016}, address = {Lake Placid, NY, USA}, abstract = {


In this paper we study the characteristics of the metrics best suited for piecewise regression algorithms, in which comparisons are usually made between normalized vectors that lie on the unit hypersphere. Even though Euclidean distance has been widely used for this purpose, it is suboptimal since it does not handle antipodal points (i.e. diametrically opposite points) properly. Therefore, we propose the usage of antipodally invariant metrics and introduce the Half Hypersphere Confinement (HHC), a fast alternative to Multidimensional Scaling (MDS) that allows mapping antipodally invariant distances in the Euclidean space with very little approximation error. Our method, which we name HHC Regression (HHCR), applied to Super-Resolution (SR) improves in quality (PSNR) and is faster than any other state-of-the-art method. Additionally, under an application-agnostic interpretation of our regression framework, we also test our algorithm for denoising and depth upscaling with promising results.

}, doi = {10.1109/WACV.2016.7477651}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @conference {cCasanova16, title = {Interactive Registration Method for 3D data Fusion}, booktitle = {IC3D}, year = {2016}, month = {12/2016}, publisher = {IEEE}, organization = {IEEE}, address = {Li{\`e}ge, Belgium}, abstract = {

Commercial depth sensors represent an opportunity for automation of certain 3D production and analysis tasks. One way to overcome some of their inherent limitations is by capturing the same scene with several depth sensors and merging their data, i.e. by performing 3D data fusion, which requires the registration of point clouds from different sensors. We propose a new interactive, fast and user-friendly method for depth sensor registration. We replace the traditional checkerboard pattern used to extract key points in the scene by a finger detector. This provides a main advantage: the method is easier to use and does not require external objects, while the elapsed time and the registration error are similar to those obtained through the classical method.

We test the proposed approach with an interactive hand tracking application, improved to use more than a single sensor, and we show an increase in detection area of more than 70\%.

}, url = {http://www.3dstereomedia.eu/ic3d}, author = {A. Casanova and A. Pujol-Mir{\'o} and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cdeOliveira-Barra, title = {LEMoRe: A Lifelog Engine for Moments Retrieval at the NTCIR-Lifelog LSAT Task}, booktitle = {The 12th NTCIR Conference, Evaluation of Information Access Technologies}, year = {2016}, month = {06/2016}, publisher = {National Institute of Informatics (NII)}, organization = {National Institute of Informatics (NII)}, address = {Tokyo, Japan}, abstract = {

Semantic image retrieval from large amounts of egocentric visual data requires leveraging powerful techniques for filling in the semantic gap. This paper introduces LEMoRe, a Lifelog Engine for Moments Retrieval, developed in the context of the Lifelog Semantic Access Task (LSAT) of the NTCIR-12 challenge, and discusses its performance variation on different trials. LEMoRe integrates classical image descriptors with high-level semantic concepts extracted by Convolutional Neural Networks (CNN), powered by a graphic user interface that uses natural language processing. Although this is just a first attempt towards interactive image retrieval from large egocentric datasets and there is large room for improvement of the system components and the user interface, the structure of the system itself and the way the single components cooperate are very promising.

}, url = {http://research.nii.ac.jp/ntcir/workshop/OnlineProceedings12/NTCIR/toc_ntcir.html$\#$Lifelog}, author = {Gabriel de Oliveira-Barra and Xavier Gir{\'o}-i-Nieto and Cartas-Ayala, Alejandro and Radeva, Petia} } @conference {cGurrin, title = {LTA 2016 - The First Workshop on Lifelogging Tools and Applications}, booktitle = {ACM Multimedia}, year = {2016}, month = {10/2016}, publisher = {ACM}, organization = {ACM}, address = {Amsterdam, The Netherlands}, abstract = {

The organisation of personal data is receiving increasing research attention due to the challenges that are faced in gathering, enriching, searching and visualising this data. Given the increasing quantities of personal data being gathered by individuals, the concept of a lifelog digital library of rich multimedia and sensory content for every individual is fast becoming a reality. The LTA2016 lifelogging workshop at ACM MM 2016 aims to bring together academics and practitioners to discuss approaches to lifelog data analytics and their applications, and to debate the opportunities and challenges for researchers in this new and challenging area.

[Workshop web page]

[Workshop proceedings]

[UPCommons]

}, keywords = {lifelogging, Personal digital archives, Personal information management}, doi = {http://dx.doi.org/10.1145/2964284.2980534}, url = {http://lta2016.computing.dcu.ie/}, author = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto and Radeva, Petia and Dimiccoli, M. and Johansen, H. and Joho, H. and Singh, Vivek K} } @conference {cPerez-Pellitero, title = {PSyCo: Manifold Span Reduction for Super Resolution}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition}, year = {2016}, month = {06/2016}, address = {Las Vegas, Nevada, USA}, abstract = {

The main challenge in Super Resolution (SR) is to discover the mapping between the low- and high-resolution manifolds of image patches, a complex ill-posed problem which has recently been addressed through piecewise linear regression with promising results. In this paper we present a novel regression-based SR algorithm that benefits from an extended knowledge of the structure of both manifolds. We propose a transform that collapses the 16 variations induced by the dihedral group of transforms (i.e. rotations, vertical and horizontal reflections) and antipodality (i.e. diametrically opposed points on the unit sphere) into a single primitive. The key idea of our transform is to study the different dihedral elements as a group of symmetries within the high-dimensional manifold. We obtain the respective set of mirror-symmetry axes by means of a frequency analysis of the dihedral elements, and we use them to collapse the redundant variability through a modified symmetry distance. The experimental validation of our algorithm shows the effectiveness of our approach, which obtains competitive quality with a dictionary of as few as 32 atoms (reducing other methods{\textquoteright} dictionaries by at least a factor of 32) and further pushes the state of the art with a 1024-atom dictionary.

}, url = {http://perezpellitero.github.io/}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @mastersthesis {xReyesa, title = {Time-sensitive Egocentric Image Retrieval for Finding Objects in Lifelogs}, year = {2016}, abstract = {

Advisors: Eva Mohedano (Insight DCU), Kevin McGuinness (Insight DCU) and\ Xavier Gir{\'o}-i-Nieto\ (UPC)

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Grade: A (9.4/10.0)

This work explores diverse practices for conducting an object search over large amounts of egocentric images taking into account their temporal information. The application of this technology is to identify where personal belongings were lost or forgotten. We develop a pipeline-structured system. Firstly, the images of the day being scanned are sorted based on their probability of depicting the forgotten object. This stage is solved by applying an existing visual search engine based on deep learning features. Secondly, a learned threshold selects the top-ranked images as candidates to contain the object. Finally, the images are re-ranked based on temporal and diversity criteria. Furthermore, we build a validation environment for assessing the system{\textquoteright}s performance, aiming to find the optimal configuration of its parameters. Due to the lack of related works to compare with, this thesis proposes a novel evaluation framework and metric to assess the problem.

}, author = {Reyes, Cristian}, editor = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @article {xReyes, title = {Where did I leave my phone ?}, year = {2016}, month = {06/2016}, institution = {4th Workshop on Egocentric (First-Person) Vision, CVPR 2016}, type = {Extended abstract}, address = {Las Vegas, NV, USA}, abstract = {

The interest of users in having their lives digitally recorded has grown in the last years thanks to the advances in wearable sensors. Wearable cameras are among the most informative ones, but they generate large amounts of images that require automatic analysis to build useful applications upon them. In this work we explore the potential of these devices to find the last appearance of personal objects among the more than 2,000 images that are generated every day. This application could help in developing personal assistants capable of helping users when they do not remember where they left their personal objects. We adapt a previous work on instance search to the specific domain of egocentric vision.

Extended abstract presented as poster in the 4th Workshop on Egocentric (First-Person) Vision,\ CVPR 2016.

}, author = {Reyes, Cristian and Mohedano, Eva and McGuinness, Kevin and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto} } @conference {cReyes, title = {Where is my Phone? Personal Object Retrieval from Egocentric Images}, booktitle = {Lifelogging Tools and Applications Workshop in ACM Multimedia}, year = {2016}, month = {10/2016}, publisher = {ACM}, organization = {ACM}, address = {Amsterdam, The Netherlands}, abstract = {

This work presents a retrieval pipeline and evaluation scheme for the problem of finding the last appearance of personal objects in a large dataset of images captured from a wearable camera. Each personal object is modelled by a small set of images that define a query for a visual search engine. The retrieved results are re-ranked considering the temporal timestamps of the images to increase the relevance of the later detections. Finally, a temporal interleaving of the results is introduced for robustness against false detections. The Mean Reciprocal Rank is proposed as a metric to evaluate this problem. This application could help in developing personal assistants capable of helping users when they do not remember where they left their personal belongings.
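For reference, the Mean Reciprocal Rank metric mentioned above can be computed as in the following generic sketch (variable names are illustrative):

```python
# Sketch only: standard Mean Reciprocal Rank over a set of queries.
def mean_reciprocal_rank(ranked_lists, relevant_items):
    """ranked_lists[q] is the ranking for query q; relevant_items[q] its ground truth."""
    rr = []
    for ranking, target in zip(ranked_lists, relevant_items):
        try:
            rr.append(1.0 / (ranking.index(target) + 1))
        except ValueError:
            rr.append(0.0)  # relevant item not retrieved at all
    return sum(rr) / len(rr)

# Toy usage: both targets appear at rank 2, so MRR = (1/2 + 1/2) / 2 = 0.5
print(mean_reciprocal_rank([["a", "b", "c"], ["x", "y"]], ["b", "y"]))
```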

}, doi = {http://dx.doi.org/10.1145/2983576.2983582}, url = {http://arxiv.org/abs/1608.08139}, author = {Reyes, Cristian and Mohedano, Eva and McGuinness, Kevin and Noel E. O{\textquoteright}Connor and Xavier Gir{\'o}-i-Nieto} } @conference {cPerez-Pellitero15, title = {Accelerating Super-Resolution for 4K Upscaling}, booktitle = {IEEE International Conference on Consumer Electronics}, year = {2015}, month = {01/2015}, address = {Las Vegas, NV, USA}, abstract = {

This paper presents a fast Super-Resolution (SR) algorithm based on selective patch processing. Motivated by the observation that some regions of images are smooth and unfocused and can be properly upscaled with fast interpolation methods, we locally estimate the probability of performing a degradation-free upscaling. Our proposed framework explores the usage of supervised machine learning techniques and tackles the problem using binary boosted tree classifiers. The applied upscaler is chosen based on the obtained probabilities: (1) a fast upscaler (e.g. bicubic interpolation) for those regions which are smooth, or (2) a linear regression SR algorithm for those which are ill-posed. The proposed strategy accelerates SR by only processing the regions which benefit from it, thus not compromising quality. Furthermore, all the algorithms composing the pipeline are naturally parallelizable and further speed-ups could be obtained.
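A minimal sketch of the selective patch-processing idea, in which a per-patch classifier score routes each patch to either a cheap interpolation or the costlier SR regression; all callables and the 0.5 threshold below are placeholders, not the paper's actual components:

```python
# Sketch only: route each patch to a fast or a slow upscaler based on a classifier score.
def upscale_selectively(patches, classifier, bicubic_upscale, sr_regression):
    out = []
    for p in patches:
        prob_ill_posed = classifier(p)      # e.g. boosted-tree score in [0, 1]
        if prob_ill_posed < 0.5:
            out.append(bicubic_upscale(p))  # smooth/unfocused region: fast path
        else:
            out.append(sr_regression(p))    # detailed region: full SR path
    return out

# Toy usage with dummy callables standing in for the real components
patches = [0.1, 0.9, 0.4]
print(upscale_selectively(patches,
                          classifier=lambda p: p,
                          bicubic_upscale=lambda p: ("bicubic", p),
                          sr_regression=lambda p: ("regression", p)))
```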

}, doi = {10.1109/ICCE.2015.7066429}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @conference {cRoldan-Carlosa, title = {Event Video Retrieval using Global and Local Descriptors in Visual Domain}, booktitle = {IEEE/ACM International Workshop on Content-Based Multimedia Indexing - CBMI 2015 }, year = {2015}, month = {06/2015}, abstract = {

With the advent of affordable multimedia smart phones, it has become common for people to take videos when they are at events. The larger the event, the larger the amount of videos taken there and, also, the more videos get shared online. Searching in this mass of videos is a challenging topic. In this paper we present and discuss a prototype software for searching in such videos. We focus only on visual information, and we report on experiments based on a research data set. With a small study we show that our prototype achieves promising results by identifying the same scene in different videos taken from different angles, solely based on content-based image retrieval.

}, url = {https://upcommons.upc.edu/handle/2117/76553}, author = {Roldan-Carlos, Jennifer and Lux, Mathias and Xavier Gir{\'o}-i-Nieto and Pia Mu{\~n}oz-Trallero and Anagnostopoulos, Nektarios} } @conference {cBazazian15, title = {Fast and Robust Edge Extraction in Unorganized Point Clouds}, booktitle = {International Conference on Digital Image Computing: Techniques and Applications}, year = {2015}, month = {11/2015}, publisher = {DICTA/IEEE}, organization = {DICTA/IEEE}, address = {Adelaide, Australia}, abstract = {

Edges provide important visual information about scene surfaces. The need for fast and robust feature extraction from 3D data is nowadays fostered by the widespread availability of cheap commercial depth sensors and multi-camera setups. This article investigates the challenge of detecting edges in surfaces represented by unorganized point clouds. Generally, edge recognition requires the extraction of geometric features such as normal vectors and curvatures. Since the normals alone do not provide enough information about the geometry of the cloud, further analysis of the extracted normals is needed for edge extraction, such as a clustering method. Edge extraction through these techniques consists of several steps with parameters which depend on the density and the scale of the point cloud. In this paper we propose a fast and precise method to detect sharp edge features by analysing the eigenvalues of the covariance matrix defined by each point{\textquoteright}s k-nearest neighbors. Moreover, we evaluate the proposed method for sharp edge extraction quantitatively and qualitatively using several dihedral angles and well-known examples of unorganized point clouds. Furthermore, we demonstrate the robustness of our approach on noisier real-world datasets.
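The covariance-eigenvalue analysis can be sketched generically as below (a standard surface-variation score in the spirit of the paper; the neighborhood size and threshold are illustrative, and SciPy is assumed to be available):

```python
# Sketch only: covariance-eigenvalue edge scoring for an unorganized point cloud.
import numpy as np
from scipy.spatial import cKDTree

def edge_scores(points, k=16):
    tree = cKDTree(points)
    _, idx = tree.query(points, k=k)             # k-nearest neighbors per point
    scores = np.empty(len(points))
    for i, nbrs in enumerate(idx):
        cov = np.cov(points[nbrs].T)              # 3x3 covariance of the neighborhood
        w = np.sort(np.linalg.eigvalsh(cov))      # ascending eigenvalues
        scores[i] = w[0] / max(w.sum(), 1e-12)    # high near sharp edges and corners
    return scores

# Toy usage on a random cloud; the threshold is purely illustrative
pts = np.random.rand(1000, 3)
edge_mask = edge_scores(pts) > 0.05
```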

}, doi = {10.1109/DICTA.2015.7371262}, url = {http://dictaconference.org/dicta2015/}, author = {D. Bazazian and Casas, J. and Ruiz-Hidalgo, J.} } @mastersthesis {xRamos-Caballero15, title = {Keyframe-based Video Summarization Designer}, year = {2015}, abstract = {

Advisors: Xavier Gir{\'o}-i-Nieto (UPC) and Horst Eidenberger (TU Wien)

Studies: Bachelor Degree in Audiovisual Systems Engineering at Telecom BCN-ETSETB from the Technical University of Catalonia (UPC)

Grade: B (8.7/10)

This Final Degree Work extends two previous projects: it improves the video keyframe extraction module of one of them, Designer Master, by integrating the algorithms that were developed in the other, Object Maps.

First, the proposed solution is explained: a shot detection method in which the input video is sampled uniformly, a cumulative pixel-to-pixel difference is then computed, and a classifier decides which frames are keyframes.
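A minimal sketch of that pipeline (uniform sampling, cumulative pixel-to-pixel difference, and a simple threshold standing in for the classifier; parameter values are illustrative):

```python
# Sketch only: keyframe selection by accumulated frame-to-frame pixel difference.
import numpy as np

def select_keyframes(frames, step=5, threshold=15.0):
    """frames: list of grayscale images as 2-D numpy arrays of equal size."""
    sampled = frames[::step]                    # uniform sampling
    keyframes, cumulative = [0], 0.0
    for i in range(1, len(sampled)):
        diff = np.abs(sampled[i].astype(float) - sampled[i - 1].astype(float))
        cumulative += diff.mean()               # accumulate pixel-to-pixel change
        if cumulative > threshold:              # "classifier": enough change observed
            keyframes.append(i)
            cumulative = 0.0
    return [i * step for i in keyframes]        # indices in the original video

# Toy usage on random frames
video = [np.random.randint(0, 256, (120, 160)) for _ in range(200)]
print(select_keyframes(video))
```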

Finally, to validate our approach we conducted a user study in which both applications were compared. Users were asked to complete a survey about the different summaries created by means of the original application and the one developed in this project. The results were analyzed and showed that the new keyframe extraction module slightly improves the application performance and the quality of the generated summaries.

}, author = {Ramos-Caballero, Carlos}, editor = {Eidenberger, Horst and Xavier Gir{\'o}-i-Nieto} } @article {aBonet-Carne14a, title = {Quantitative Ultrasound Texture Analysis of Fetal Lungs To Predict Neonatal Respiratory Morbidity}, journal = {Ultrasound in Obstetrics and Gynecology, Wiley}, volume = {45}, year = {2015}, pages = {427{\textendash}433}, author = {E. Bonet-Carne and M. Palacio and T. Cobo and A. Perez-Moreno and M. Lopez and J. P. Piraquive and J. C. Ramirez and F. Marques and E. Gratacos} } @conference {cMaceira15, title = {Region-based depth map coding using a 3D scene representation}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing}, year = {2015}, month = {04/2015}, address = {Brisbane, Australia}, abstract = {

In 3D video, view synthesis is used to create new virtual views between encoded camera views. Errors in the coding of the depth maps introduce geometry inconsistencies in synthesized views. In this paper, a 3D plane representation of the scene is presented which improves the performance of current standard video codecs in the view synthesis domain. Depth maps are segmented into regions without sharp edges, and each region is represented with a plane in the 3D world scene coordinates. This 3D representation allows an efficient encoding while preserving the 3D characteristics of the scene. Experimental results are provided, obtaining bitrate gains of 10 to 40\% compared to HEVC.
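The planar model for each region can be illustrated with a generic least-squares plane fit (z = a*x + b*y + c); this is only an illustration of the kind of per-region representation described above, not the paper's rate-distortion machinery:

```python
# Sketch only: fit a plane z = a*x + b*y + c to the depth samples of one region.
import numpy as np

def fit_plane(xs, ys, zs):
    A = np.column_stack([xs, ys, np.ones_like(xs)])
    coeffs, *_ = np.linalg.lstsq(A, zs, rcond=None)   # [a, b, c]
    return coeffs

def plane_depth(coeffs, xs, ys):
    a, b, c = coeffs
    return a * xs + b * ys + c

# Toy region: noisy samples of the plane z = 0.1x - 0.2y + 50
xs, ys = np.meshgrid(np.arange(20), np.arange(20))
xs, ys = xs.ravel().astype(float), ys.ravel().astype(float)
zs = 0.1 * xs - 0.2 * ys + 50 + np.random.randn(xs.size) * 0.1
coeffs = fit_plane(xs, ys, zs)
print(coeffs, np.abs(plane_depth(coeffs, xs, ys) - zs).mean())
```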

}, author = {Maceira, M. and Morros, J.R. and Ruiz-Hidalgo, J.} } @mastersthesis {xLidon, title = {Semantic and Diverse Summarization of Egocentric Photo Events}, year = {2015}, abstract = {

Student: Aniol Lidon

Advisors: Petia Radeva (UB) and Xavier Gir{\'o}-i-Nieto (UPC)

Program: Master in Computer Vision

Grade: A (9.8/10.0)

This project generates visual summaries of events depicted in egocentric photos taken with a wearable camera. These summaries are addressed to mild-dementia patients in order to exercise their memory on a daily basis. The main contribution is an iterative approach that guarantees the semantic diversity of the summary and a novel soft metric to assess subjective results. Medical experts validated the proposed solution with a Mean Opinion Score of 4.6 out of 5.0. The flexibility and quality of the solution were also tested in the 2015 Retrieving Diverse Social Images Task of the international scientific benchmark MediaEval.

}, author = {Lidon, Aniol}, editor = {Radeva, Petia and Xavier Gir{\'o}-i-Nieto} } @conference {cLidon, title = {UPC-UB-STP @ MediaEval 2015 Diversity Task: Iterative Reranking of Relevant Images}, booktitle = {MediaEval 2015 Workshop}, year = {2015}, month = {09/2015}, abstract = {

This paper presents the results of the UPC-UB-STP team in the 2015 MediaEval Retrieving Diverse Images Task. The goal of the challenge is to provide a ranked list of Flickr photos for a predefined set of queries. Our approach first generates a ranking of images based on a query-independent estimation of their relevance. Only the top results are kept and iteratively re-ranked based on their intra-similarity to introduce diversity.

}, url = {http://ceur-ws.org/Vol-1436/}, author = {Lidon, Aniol and Bola{\~n}os, Marc and Seidl, Markus and Xavier Gir{\'o}-i-Nieto and Radeva, Petia and Zeppelzauer, Matthias} } @conference {cRoldan-Carlos, title = {Visual Information Retrieval in Endoscopic Video Archives}, booktitle = {IEEE/ACM International Workshop on Content-Based Multimedia Indexing - CBMI 2015 }, year = {2015}, month = {06/2015}, address = {Prague, Czech Republic}, abstract = {

In endoscopic procedures, surgeons work with live video streams from the inside of their subjects. A main source of documentation of procedures is still frames from the video, identified and taken during the surgery. However, with growing demands and technical means, the streams are saved to storage servers and the surgeons need to retrieve parts of the videos on demand. In this submission we present a demo application for video retrieval based on visual features and late fusion, which allows surgeons to re-find shots taken during the procedure.

[Paper on arXiv]

[CBMI 2015 Conference website]

Presented in the Special Session on Medical Multimedia Processing (acceptance rate for special sessions = 55\%)

}, url = {http://arxiv.org/abs/1504.07874}, author = {Roldan-Carlos, Jennifer and Lux, Mathias and Xavier Gir{\'o}-i-Nieto and Pia Mu{\~n}oz-Trallero and Anagnostopoulos, Nektarios} } @mastersthesis {xRoldan-Carlos, title = {Visual Search for Musical Performances and Endoscopic Videos}, year = {2015}, abstract = {

Advisors: Mathias Lux (Klagenfurt University) and\ Xavier Gir{\'o}-i-Nieto\ (UPC)

Degree: Electronic Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)

Grade: A (9.5/10)

This project explores the potential of LIRE, an existing Content-Based Image Retrieval (CBIR) system, when used to retrieve medical videos. These videos are recordings of the live streams used by surgeons during endoscopic procedures, captured from inside the subject. The growth of such video content stored in servers requires search engines capable of assisting surgeons in their management and retrieval. In our tool, queries are formulated by visual examples, which allow surgeons to re-find shots taken during the procedure. This thesis presents an extension and adaptation of LIRE for video retrieval based on visual features and late fusion. The results are assessed from two perspectives: a quantitative and a qualitative one. While the quantitative one follows the standard practices and metrics for video retrieval, the qualitative assessment has been based on an empirical social study using a semi-interactive web interface. In particular, a thinking-aloud test was applied to analyze whether the user expectations and requirements were fulfilled. Due to the scarcity of surgeons available for the qualitative tests, a second domain was also addressed: videos captured at musical performances. This type of video has also experienced an exponential growth with the advent of affordable multimedia smart phones, available to a large audience. Analogously to the endoscopic videos, searching in a large data set of such videos is a challenging topic.

}, url = {http://hdl.handle.net/2099.1/26032}, author = {Roldan-Carlos, Jennifer}, editor = {Xavier Gir{\'o}-i-Nieto and Lux, Mathias} } @conference {cMestre, title = {Visual Summary of Egocentric Photostreams by Representative Keyframes}, booktitle = {IEEE International Workshop on Wearable and Ego-vision Systems for Augmented Experience (WEsAX) 2015}, year = {2015}, month = {07/2015}, address = {Turin, Italy}, abstract = {

Building a visual summary from an egocentric photostream captured by a lifelogging wearable camera is of high interest for different applications (e.g. memory reinforcement). In this paper, we propose a new summarization method based on keyframe selection that uses visual features extracted by means of a convolutional neural network. Our method applies unsupervised clustering to divide the photostream into events, and finally extracts the most relevant keyframe for each event. We assess the results by applying a blind taste test to a group of 20 people who evaluated the quality of the summaries.
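A minimal sketch of the clustering-then-keyframe idea, using k-means as a stand-in for the unsupervised clustering and assuming the CNN features are already extracted (scikit-learn is assumed to be available):

```python
# Sketch only: cluster image features into events and pick one keyframe per event.
import numpy as np
from sklearn.cluster import KMeans

def summarize(features, n_events=8):
    """features: (n_images, d) CNN descriptors in chronological order."""
    labels = KMeans(n_clusters=n_events, n_init=10).fit_predict(features)
    keyframes = []
    for e in range(n_events):
        members = np.where(labels == e)[0]
        centroid = features[members].mean(axis=0)
        # keyframe = image closest to the event centroid
        dists = np.linalg.norm(features[members] - centroid, axis=1)
        keyframes.append(int(members[np.argmin(dists)]))
    return sorted(keyframes)

# Toy usage on random descriptors
feats = np.random.rand(500, 256)
print(summarize(feats))
```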

}, keywords = {egocentric, keyframes, lifelogging, summarization}, url = {http://arxiv.org/abs/1505.01130}, author = {Bola{\~n}os, Marc and Mestre, Ricard and Talavera, Estefan{\'\i}a and Xavier Gir{\'o}-i-Nieto and Radeva, Petia} } @conference {cBosch14, title = {An Epipolar-Constrained Prior for Efficient Search in Multi-View Scenarios}, booktitle = {EUSIPCO}, year = {2014}, month = {09/2014}, address = {Lisbon}, abstract = {

In this paper we propose a novel framework for fast exploitation of multi-view cues with applicability in different image processing problems. In order to bring our proposed framework into practice, an epipolar-constrained prior is presented, onto which a random search algorithm is proposed to find good matches among the different views of the same scene. This algorithm includes a generalization of the local coherency in 2D images for multi-view wide-baseline cases. Experimental results show that the geometrical constraint allows a faster initial convergence when finding good matches. We present some applications of the proposed framework on classical image processing problems.

}, keywords = {approximate nearest neighbor, deblurring, epipolar line, Super resolution}, author = {Bosch, I. and Salvador, J. and E. Perez-Pellitero and Ruiz-Hidalgo, J.} } @conference {cPerez-Pellitero14, title = {Fast Super-Resolution via Dense Local Training and Inverse Regressor Search}, booktitle = {Asian Conference in Computer Vision (ACCV)}, year = {2014}, month = {11/2014}, address = {Singapore}, abstract = {

Regression-based Super-Resolution (SR) addresses the upscaling problem by learning a mapping function (i.e. regressor) from the low-resolution to the high-resolution manifold. Under the locally linear assumption, this complex non-linear mapping can be properly modeled by a set of linear regressors distributed across the manifold. In such methods, most of the testing time is spent searching for the right regressor within this trained set. In this paper we propose a novel inverse-search approach for regression-based SR. Instead of performing a search from the image to the dictionary of regressors, the search is done inversely from the regressors{\textquoteright} dictionary to the image patches. We approximate this framework by applying spherical hashing to both image and regressors, which reduces the inverse search into computing a trained function. Additionally, we propose an improved training scheme for SR linear regressors which improves perceived and objective quality. By merging both contributions we improve speed and quality compared to the state-of-the-art.

}, author = {E. Perez-Pellitero and Salvador, J. and Torres-Xirau, I. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @conference {cMorros14, title = {Materials transversals per a l{\textquoteright}aprenentatge actiu de les mat{\`e}ries de processat d{\textquoteright}imatge i v{\'\i}deo}, booktitle = {Congr{\'e}s Internacional de Doc{\`e}ncia Universit{\`a}ria i Innovaci{\'o} (CIDUI)}, year = {2014}, month = {07/2014}, address = {Tarragona, Spain}, abstract = {

This work aims to promote the cooperation and coordination between different image/video processing courses taught at the UPC in order to enhance the learning results. The main contributions are a) the creation of a common set of materials: graphical demonstrators, collections of problems, question banks, etc., and b) the design of strategies to use this material in the development of generic and specific skills, with special emphasis on promoting independent learning.

}, keywords = {image/video processing, Matlab demonstrators, question Banks, specific skills, teaching material}, author = {Morros, J.R. and Ver{\'o}nica Vilaplana and Ruiz-Hidalgo, J. and Casas, J. and Gasull, A. and Marqu{\'e}s, F. and M. Pard{\`a}s and Salembier, P.} } @book {bSchreer14, title = {Media Production, Delivery and Interaction for Platform Independent Systems}, volume = {ISBN 978-1-118-60533-2}, year = {2014}, publisher = {Wiley, ISBN 978-1-118-60533-2}, organization = {Wiley, ISBN 978-1-118-60533-2}, issn = {978-1-118-60533-2}, url = {http://eu.wiley.com/WileyCDA/WileyTitle/productCd-1118605330,subjectCd-EEJ0.html}, author = {Schreer, O. and Macq, J. and Niamut, O. and Ruiz-Hidalgo, J. and Shirley, B. and Thallinger, G. and Thomas, G.} } @article {aBarkhuus14, title = {New interaction modes for rich panoramic live video experiences}, journal = {Behaviour \& Information Technology}, volume = {33}, year = {2014}, month = {07/2014}, chapter = {859-869}, abstract = {

The possibilities of panoramic video are based on the capabilities of high-resolution digital video streams and the opportunities higher bandwidth offers to broadcast, stream and transfer large content across platforms. With these opportunities also come challenges such as how to focus on sub-parts of the video stream and interact with the content shown on a large screen. In this paper, we present studies of two different interaction modes with a large-scale panoramic video for live experiences; we focus on interactional challenges and explore whether it is (1) possible to develop new interactional methods/ways of approaching this type of high-resolution content and (2) feasible for users to interact with the content in these new ways. We developed prototypes for two different interaction modes: an individual system on a mobile device, either a tablet or a mobile phone, for interacting with the content on the same device, and a non-touch, gesture-based system for the home or small-group interaction. We present pilot studies in which we explore the possibilities and challenges of these two interaction modes for panoramic content.

}, keywords = {interaction modes, interactive television, panoramic video, pilot studies}, doi = {10.1080/0144929X.2014.914975}, url = {http://www.tandfonline.com/doi/full/10.1080/0144929X.2014.914975}, author = {Barkhuus, Louise and Zoric, Goranka and Engstr{\"o}m, Arvid and Ruiz-Hidalgo, J. and Verzijp, Nico} } @article {aBonet-Carne14, title = {Quantitative Ultrasound Texture Analysis of Fetal Lungs To Predict Neonatal Respiratory Morbidity}, journal = {Ultrasound in Obstetrics and Gynecology, Wiley}, volume = {44}, year = {2014}, author = {E. Bonet-Carne and M. Palacio and T. Cobo and A. Perez-Moreno and M. Lopez and J. P. Piraquive and J. C. Ramirez and F. Marques and E. Gratacos} } @article {Suau2014, title = {Real-time Fingertip Localization Conditioned on Hand Gesture Classification}, journal = {Image and Vision Computing}, volume = {32}, year = {2014}, month = {05/2014}, pages = {522 - 532}, abstract = {

A method to obtain accurate hand gesture classification and fingertip localization from depth images is proposed. The Oriented Radial Distribution feature is utilized, exploiting its ability to globally describe hand poses, but also to locally detect fingertip positions. Hence, hand gesture and fingertip locations are characterized with a single feature calculation. We propose to divide the difficult problem of locating fingertips into two more tractable problems, by taking advantage of hand gesture as an auxiliary variable. Along with the method we present ColorTip, a dataset for hand gesture recognition and fingertip classification using depth data. ColorTip contains sequences where actors wear a glove with colored fingertips, allowing automatic annotation. The proposed method is evaluated against recent works on several datasets, achieving promising results in both gesture classification and fingertip localization.
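A generic sketch of the two-stage idea, conditioning fingertip localization on the predicted gesture; the classifier, scorer and gesture-to-finger-count mapping below are placeholders, not the ORD-based pipeline itself:

```python
# Sketch only: stage 1 classifies the gesture, stage 2 locates as many fingertips
# as that gesture implies, from a per-pixel fingertip likelihood map.
import numpy as np

def detect(depth_patch, gesture_classifier, fingertip_scorer, fingers_per_gesture):
    gesture = gesture_classifier(depth_patch)           # stage 1: gesture label
    n = fingers_per_gesture[gesture]                     # expected number of fingertips
    scores = fingertip_scorer(depth_patch)               # per-pixel fingertip likelihood
    flat = np.argsort(scores, axis=None)[::-1][:n]       # stage 2: top-n locations
    return gesture, np.column_stack(np.unravel_index(flat, scores.shape))

# Toy usage with dummy components
gesture, tips = detect(np.random.rand(64, 64),
                       gesture_classifier=lambda p: "open_hand",
                       fingertip_scorer=lambda p: p,
                       fingers_per_gesture={"open_hand": 5, "fist": 0})
print(gesture, tips)
```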

}, keywords = {dataset, fingertip classification, hand gesture recognition, interactivity, range camera}, issn = {0262-8856}, doi = {10.1016/j.imavis.2014.04.015}, url = {http://www.sciencedirect.com/science/article/pii/S0262885614000845}, author = {Suau, X. and Alcoverro, M. and L{\'o}pez-M{\'e}ndez, A. and Ruiz-Hidalgo, J. and Casas, J.} } @mastersthesis {xRamon13, title = {Algorithms for B wave detection}, year = {2013}, abstract = {

The objective of this Master Thesis was to develop algorithms for B wave detection in intracranial pressure (ICP) signals. This goal was approached with two different methods that depend basically on the resolution of the acquired ICP. Both methods were then adapted to work on an ultra-low-power microcontroller. The first method works on ICP recorded at 1 Hz and is based on Lundberg{\textquoteright}s definition of a B wave. An advantage of this algorithm is that it reduces to a minimum the number of samples per block to classify. The results obtained after testing it on long ICP records from 27 patients were an accuracy of 89.59\%, a specificity of 89.71\% and a sensitivity of 89.16\%. These results did not change when the code was adapted to the microcontroller. The second method requires ICP obtained with a sampling rate of 100 Hz. It is based on the morphology of the pulse waves present in the ICP, caused by the change of blood volume inside the skull with every heartbeat. A total of 1430 blocks of ICP (864 without B waves and 566 with B waves), each with a duration of 41 seconds, were used to extract 21 features from each one. Then an MLP classifier and an SVM classifier were tested and compared. The best results were obtained by the SVM classifier, reaching an accuracy of 86.37\%, a specificity of 88.09\% and a sensitivity of 83.74\% when all features were used. After adapting the algorithm to the microcontroller the results were nearly the same.

}, url = {http://hdl.handle.net/2099.1/19034}, author = {Ramon, Eduard} } @conference {cPerez-Pellitero13, title = {Bayesian region selection for adaptive dictionary-based Super-Resolution}, booktitle = {British Machine Vision Conference}, year = {2013}, month = {09/2013}, abstract = {

The performance of dictionary-based super-resolution (SR) strongly depends on the contents of the training dataset. Nevertheless, many dictionary-based SR methods randomly select patches from a larger set of training images to build their dictionaries, thus relying on patches being diverse enough. This paper describes an external-dictionary SR algorithm based on adaptively selecting an optimal subset of patches out of the training images. Each training image is divided into sub-image entities, named regions, of such size that texture consistency is preserved. For each input patch to super-resolve, the best-fitting region (with enough high-frequency energy) is found through a Bayesian selection. In order to handle the high number of regions in the training dataset, a local Naive Bayes Nearest Neighbor (NBNN) approach is used. Trained with this adapted subset of patches, sparse coding SR is applied to recover the high-resolution image. Experimental results demonstrate that using our adaptive algorithm produces an improvement in SR performance with respect to non-adaptive training.

}, author = {E. Perez-Pellitero and Salvador, J. and Ruiz-Hidalgo, J. and Rosenhahn, B.} } @article {aSuau13, title = {Detecting End-Effectors on 2.5D data using Geometric Deformable Models: Application to Human Pose Estimation}, journal = {Computer Vision and Image Understanding (CVIU)}, volume = {117}, year = {2013}, chapter = {281{\textendash}288}, abstract = {

End-effectors are usually related to the location of limbs, and their reliable detection enables robust body tracking as well as accurate pose estimation. Recent innovation in depth cameras has re-stated the pose estimation problem. We focus on the information provided by these sensors, for which we borrow the name 2.5D data from the Graphics community. In this paper we propose a human pose estimation algorithm based on topological propagation. Geometric Deformable Models are used to carry out such propagation, implemented according to the Narrow Band Level Set approach. A variant of the latter method is proposed, including a density restriction which helps preserving the topological properties of the object under analysis. Principal end-effectors are extracted from a directed graph weighted with geodesic distances, also providing a skeletal-like structure describing human pose. An evaluation against reference methods is performed with promising results. The proposed solution allows a frame-wise end-effector detection, with no temporal tracking involved, which may be generalized to the tracking of other objects beyond human body.


}, doi = {10.1016/j.cviu.2012.11.006}, url = {http://www.sciencedirect.com/science/article/pii/S1077314212001907}, author = {Suau, X. and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cMaceira13, title = {Fusion of colour and depth partitions for depth map coding}, booktitle = {Digital Signal Processing}, year = {2013}, month = {07/2013}, address = {Santorini, Greece}, abstract = {

3D video coding includes the use of multiple color views and depth maps associated with each view. An adequate coding of depth maps should be adapted to their characteristics: smooth regions and sharp edges. In this paper a segmentation-based technique that exploits the color-depth similarity of 3D video is proposed to improve depth map compression while preserving the main discontinuities. An initial coarse depth map segmentation is used to locate the main discontinuities in depth. The resulting partition is improved by fusing a color partition. We assume that the color image is encoded first and available when the associated depth map is encoded, therefore the color partition can be segmented in the decoder without introducing any extra cost. A new segmentation criterion inspired by super-pixel techniques is proposed to obtain the color partition. Initial experimental results show similar compression efficiency to HEVC, with a big potential for further improvements.

}, keywords = {3DTV, Depth map coding}, doi = {10.1109/ICDSP.2013.6622781}, author = {Maceira, M. and Morros, J.R. and Ruiz-Hidalgo, J.} } @article {aAlcoverro13, title = {Gesture Control Interface for immersive panoramic displays}, journal = {Multimedia Tools and Applications}, year = {2013}, month = {07/2013}, pages = {1-27}, abstract = {

In this paper, we propose a gesture-based interface designed to interact with panoramic scenes. The system combines novel static gestures with a fast hand tracking method. Our proposal is to use static gestures as shortcuts to activate functionalities of the system (i.e. volume up/down, mute, pause, etc.), and hand tracking to freely explore the panoramic video. The overall system is multi-user, and incorporates a user identification module based on face recognition, which is able both to recognize returning users and to add new users online. The system exploits depth data, making it robust to challenging illumination conditions. We show through experimental results the performance of every component of the system compared to the state of the art. We also show the results of a usability study performed with several untrained users.

}, issn = {1380-7501}, doi = {10.1007/s11042-013-1605-7}, author = {Alcoverro, M. and Suau, X. and Morros, J.R. and L{\'o}pez-M{\'e}ndez, A. and A. Gil-Moreno and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cKochale13, title = {Gesture controlled interactive rendering in a panoramic scene}, booktitle = {European Interactive TV Conference, EuroITV}, year = {2013}, month = {06/2013}, address = {Como, Italy}, url = {http://upcommons.upc.edu/e-prints/handle/2117/20470}, author = {Kochale, A. and Ruiz-Hidalgo, J. and M. Borsum} } @conference {cZoric13, title = {Gesture Interaction with Rich TV Content in the Social Setting}, booktitle = {Exploring and Enhancing the User Experience for Television, Workshop of ACM SIGCHI Conference on Human Factors in Computing Systems, CHI{\textquoteright}13}, year = {2013}, month = {04/2013}, address = {Paris, France}, abstract = {

The appearance of new immersive TV content has increased the interactive possibilities presented to the viewers. Increased interactivity is seen as a valuable feature in viewing richer television content, but new functionalities are limited by what can be done naturally and intuitively using available devices like remote controls. Therefore, new interaction techniques, such as visual gestures control systems, have appeared aiming to enhance the viewers{\textquoteright} viewing experience. In this work we begin uncovering the potential and challenges of gesture interaction with ultra high definition video for people watching TV together. As a first step we have done a study with a group of people interacting with such content using a gesture-based system in the home environment.

}, url = {http://livingroomexperience.wikispaces.com/}, author = {Zoric, Goranka and Engstr{\"o}m, Arvid and Barkhuus, Louise and Ruiz-Hidalgo, J. and Kochale, A.} } @phdthesis {dSuau13, title = {Human body analysis using depth data}, year = {2013}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {

Human body analysis is one of the broadest areas within the computer vision field. Researchers have put a strong effort into the human body analysis area, especially over the last decade, due to the technological improvements in both video cameras and processing power. Human body analysis covers topics such as person detection and segmentation, human motion tracking or action and behavior recognition. Even if human beings perform all these tasks naturally, they constitute a challenging problem from a computer vision point of view. Adverse situations such as viewing perspective, clutter and occlusions, lighting conditions or variability of behavior amongst persons may turn human body analysis into an arduous task.

In the computer vision field, the evolution of research works is usually tightly related to the technological progress of camera sensors and computer processing power. Traditional human body analysis methods are based on color cameras. Thus, the information is extracted from the raw color data, strongly limiting the proposals. An interesting quality leap was achieved by introducing the \emph{multiview} concept. That is to say, having multiple color cameras recording a single scene at the same time. With multiview approaches, 3D information is available by means of stereo matching algorithms. The fact of having 3D information is a key aspect in human motion analysis, since the human body moves in a three-dimensional space. Thus, problems such as occlusion and clutter may be overcome with 3D information.

The appearance of commercial depth cameras has brought a second leap in the human body analysis field. While traditional multiview approaches required a cumbersome and expensive setup, as well as a fine camera calibration, novel depth cameras directly provide 3D information with a single camera sensor. Furthermore, depth cameras may be rapidly installed in a wide range of situations, enlarging the range of applications with respect to multiview approaches. Moreover, since depth cameras are based on infra-red light, they do not suffer from illumination variations.

In this thesis, we focus on the study of depth data applied to the human body analysis problem. We propose novel ways of describing depth data through specific descriptors, so that they emphasize helpful characteristics of the scene for further body analysis. These descriptors exploit the special 3D structure of depth data to outperform generalist 3D descriptors or color based ones. We also study the problem of person detection, proposing a highly robust and fast method to detect heads. Such method is extended to a hand tracker, which is used throughout the thesis as a helpful tool to enable further research. In the remainder of this dissertation, we focus on the hand analysis problem as a subarea of human body analysis. Given the recent appearance of depth cameras, there is a lack of public datasets. We contribute with a dataset for hand gesture recognition and fingertip localization using depth data. This dataset acts as a starting point of two proposals for hand gesture recognition and fingertip localization based on classification techniques. In these methods, we also exploit the above mentioned descriptor proposals to finely adapt to the nature of depth data.

}, url = {http://hdl.handle.net/10803/134801}, author = {Suau, X.}, editor = {Casas, J. and Ruiz-Hidalgo, J.} } @conference {cNiamut13, title = {Towards A Format-agnostic Approach for Production, Delivery and Rendering of Immersive Media}, booktitle = {ACM Multimedia Systems}, year = {2013}, month = {3/2013}, address = {Oslo, Norway}, abstract = {

The media industry is currently being pulled in the often-opposing directions of increased realism (high resolution, stereoscopic, large screen) and personalization (selection and control of content, availability on many devices). We investigate the feasibility of an end-to-end format-agnostic approach to support both these trends. In this paper, different aspects of a format- agnostic capture, production, delivery and rendering system are discussed. At the capture stage, the concept of layered scene representation is introduced, including panoramic video and 3D audio capture. At the analysis stage, a virtual director component is discussed that allows for automatic execution of cinematographic principles, using feature tracking and saliency detection. At the delivery stage, resolution-independent audiovisual transport mechanisms for both managed and unmanaged networks are treated. In the rendering stage, a rendering process that includes the manipulation of audiovisual content to match the connected display and loudspeaker properties is introduced. Different parts of the complete system are revisited demonstrating the requirements and the potential of this advanced concept.

}, author = {Niamut, O. and Kaiser, R. and Kienast, G. and Kochale, A. and Spille, J. and Schreer, O. and Ruiz-Hidalgo, J. and Macq, J. and Shirley, B.} } @conference {cMaceira12, title = {Depth map coding based on a optimal hierarchical region representation}, booktitle = {3DTV Conference}, year = {2012}, month = {10/2012}, publisher = {IEEE}, organization = {IEEE}, address = {Zurich, Switzerland}, abstract = {

Multiview color information used jointly with depth maps is a widespread technique for 3D video. Using this depth information, 3D functionalities such as free viewpoint video can be provided by means of depth-image-based rendering techniques. In this paper, a new technique to encode depth maps is proposed. Based on the usually smooth structure and the sharp edges of depth maps, our proposal segments the depth map into homogeneous regions of arbitrary shape and encodes the contents of these regions using different texture coding strategies. An optimal Lagrangian approach is applied to the hierarchical region representation provided by our segmentation technique. This approach automatically selects the best encoding strategy for each region and the optimal partition to encode the depth map. To avoid the high cost of coding the resulting partition, a prediction is made using the associated decoded color image.

}, keywords = {3DTV, Depth map coding, depth/texture compresion, rate-distortion optimization, Shape-adaptive DCT}, isbn = {978-1-4673-4903-1}, doi = {10.1109/3DTV.2012.6365481}, author = {Maceira, M. and Ruiz-Hidalgo, J. and Morros, J.R.} } @inbook {bSuau12, title = {INTAIRACT: Joint Hand Gesture and Fingertip Classification for Touchless Interaction}, booktitle = {Computer Vision {\textendash} ECCV 2012}, volume = {7585}, year = {2012}, pages = {602-606}, publisher = {Springer}, organization = {Springer}, chapter = {3}, address = {Heidelberg}, abstract = {

In this demo we present INTAIRACT, an online hand-based touchless interaction system. Interactions are based on easy-to-learn hand gestures that, combined with translations and rotations, render a user-friendly and highly configurable system. The main advantage with respect to existing approaches is that we are able to robustly locate and identify fingertips. Hence, we are able to employ a simple but powerful alphabet of gestures, not only by determining the number of visible fingers in a gesture but also which fingers are being observed. To achieve such a system, we propose a novel method that jointly infers hand gestures and fingertip locations using a single depth image from a consumer depth camera. Our approach is based on a novel descriptor for depth data, the Oriented Radial Distribution (ORD) [1]. On the one hand, we exploit the ORD for robust classification of hand gestures by means of efficient k-NN retrieval. On the other hand, maxima of the ORD are used to perform structured inference of fingertip locations. The proposed method outperforms other state-of-the-art approaches both in gesture recognition and fingertip localization. An implementation of the ORD extraction on a GPU yields a real-time demo running at approximately 17 fps on a single laptop.

1. Suau, X., Ruiz-Hidalgo, J., Casas, J.R.: Oriented Radial Distribution on Depth Data: Application to the Detection of End-Effectors. In: ICASSP. (2012)
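A minimal sketch of the gesture classification stage only, not of the INTAIRACT implementation: assuming ORD-like descriptor vectors have already been extracted (the descriptor extraction itself and all names below are hypothetical placeholders), gesture recognition by k-NN retrieval reduces to a nearest-neighbour majority vote.

    import numpy as np

    def knn_classify(query_desc, train_descs, train_labels, k=5):
        # Classify one gesture descriptor by majority vote among its k nearest neighbours.
        # query_desc  : (D,) descriptor of the observed hand (e.g. an ORD-based vector)
        # train_descs : (N, D) descriptors of labelled training gestures
        # train_labels: (N,) gesture labels of the training set
        dists = np.linalg.norm(train_descs - query_desc, axis=1)   # Euclidean distances
        nearest = np.argsort(dists)[:k]                             # k closest training samples
        labels, counts = np.unique(train_labels[nearest], return_counts=True)
        return labels[np.argmax(counts)]                            # majority vote

    # Toy usage with random vectors standing in for real descriptors
    rng = np.random.default_rng(0)
    train = rng.normal(size=(100, 32))
    labels = rng.integers(0, 5, size=100)
    print(knn_classify(rng.normal(size=32), train, labels, k=7))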

}, isbn = {978-3-642-33885-4}, issn = {978-3-642-33884-7}, doi = {10.1007/978-3-642-33885-4_62}, author = {Suau, X. and Alcoverro, M. and L{\'o}pez-M{\'e}ndez, A. and Ruiz-Hidalgo, J. and Casas, J.} } @article {aRuiz-Hidalgo12, title = {Multiview depth coding based on combined color/depth segmentation}, journal = {Journal of visual communication and image representation}, volume = {23}, number = {1}, year = {2012}, pages = {42{\textendash}52}, abstract = {

In this paper, a new coding method for multiview depth video is presented. Considering the smooth structure and sharp edges of depth maps, a segmentation-based approach is proposed. This allows better preservation of the depth contours, thus introducing fewer artifacts in the depth perception of the video. To reduce the cost associated with partition coding, an estimation of the depth partition is built using the decoded color view segmentation. This estimation is refined by sending some complementary information about the relevant differences between color and depth partitions. For coding the depth content of each region, a decomposition into an orthogonal basis is used in this paper, although similar decompositions may also be employed. Experimental results show that the proposed segmentation-based depth coding method outperforms H.264/AVC and H.264/MVC by more than 2 dB at similar bitrates.
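For illustration only (the specific basis is not detailed in this abstract), coding the depth content of a region $R$ with an orthogonal basis $\{\phi_k\}$ amounts to approximating $d(x,y) \approx \sum_{k=1}^{K} c_k\,\phi_k(x,y)$ for $(x,y)\in R$, where orthogonality gives $c_k = \langle d,\phi_k\rangle$, and transmitting only the quantized coefficients; the number of retained coefficients $K$ per region would be governed by the same rate-distortion trade-off that selects the partition.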

}, keywords = {3DTV, Depth map, multiview video coding, virtual view}, issn = {1047-3203}, doi = {10.1016/j.jvcir.2011.08.001}, url = {http://www.sciencedirect.com/science/article/pii/S1047320311001040}, author = {Ruiz-Hidalgo, J. and Morros, J.R. and Aflaki, P. and Calderero, F. and Marqu{\'e}s, F.} } @conference {cSuau12, title = {Oriented radial distribution on depth data: Application to the detection of end-effectors}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing}, year = {2012}, month = {03/2012}, address = {Kyoto, Japan}, abstract = {

End-effectors are considered to be the main topological extremities of a given 3D body. Although the nature of such a body is not restricted, this paper focuses on the human body case. Detection of human extremities is a key issue in the human motion capture domain, being needed to initialize and update the tracker. Therefore, the effectiveness of human motion capture systems usually depends on the reliability of the obtained end-effectors. The increasing accuracy, low cost and easy installation of depth cameras have opened the door to new strategies to overcome the body pose estimation problem. With the objective of detecting the head, hands and feet of a human body, we propose a new local feature computed from depth data, which captures local curvature and prominence. This feature is weighted according to recent detections, also providing a temporal dimension. Based on this feature, end-effector candidate blobs are obtained and classified into head, hands and feet according to three probabilistic descriptors.

}, doi = {10.1109/ICASSP.2012.6288002}, author = {Suau, X. and Ruiz-Hidalgo, J. and Casas, J.} } @article {aSuau12, title = {Real-time head and hand tracking based on 2.5D data}, journal = {IEEE Transactions on Multimedia }, volume = {14}, year = {2012}, month = {06/2012}, pages = {575-585 }, abstract = {

A novel real-time algorithm for head and hand tracking is proposed in this paper. This approach is based on data from a range camera, which is exploited to resolve ambiguities and overlaps. The position of the head is estimated with a depth-based template matching, its robustness being reinforced with an adaptive search zone. Hands are detected in a bounding box attached to the head estimate, so that the user may move freely in the scene. A simple method to decide whether the hands are open or closed is also included in the proposal. Experimental results show high robustness against partial occlusions and fast movements. Accurate hand trajectories may be extracted from the estimated hand positions, and may be used for interactive applications as well as for gesture classification purposes.
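A minimal sketch, under stated assumptions, of how depth-based template matching inside a search zone can be implemented; the function and variable names are hypothetical, and the adaptive resizing of the search zone described above is not reproduced here (a fixed radius is used instead).

    import numpy as np

    def match_head_template(depth, template, prev_pos, search_radius=40):
        # Find the best placement of a depth template (sum of squared differences)
        # inside a square search zone centred on the previous head estimate.
        th, tw = template.shape
        cy, cx = prev_pos
        y0, x0 = max(cy - search_radius, 0), max(cx - search_radius, 0)
        y1 = min(cy + search_radius, depth.shape[0] - th)
        x1 = min(cx + search_radius, depth.shape[1] - tw)
        best_cost, best_pos = np.inf, prev_pos
        for y in range(y0, y1 + 1):
            for x in range(x0, x1 + 1):
                patch = depth[y:y + th, x:x + tw].astype(np.float32)
                ssd = np.sum((patch - template) ** 2)
                if ssd < best_cost:
                    best_cost, best_pos = ssd, (y, x)
        return best_pos   # top-left corner of the best-matching window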

}, issn = {1520-9210}, doi = {http://dx.doi.org/10.1109/TMM.2012.2189853}, author = {Suau, X. and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cJimenez12, title = {Registration of Multi-Modal Neuroimaging Datasets by Considering the Non-Overlapping Field of View into the NMI Calculation}, booktitle = {IEEE International Symposium on Biomedical Imaging, ISBI 2012}, year = {2012}, address = {Barcelona, Spain}, author = {Jim{\'e}nez, X and Figueiras, F and Marqu{\'e}s, F. and Salembier, P. and Herance, R and Rojas, S and Mill{\'a}n, O and Pareto, D and Domingo Gispert, J} } @conference {cSalvador12, title = {Variational Reconstruction and Restoration for Video Super-Resolution}, booktitle = {International Conference on Pattern Recognition (ICPR)}, year = {2012}, month = {11-2012}, address = {Tsukuba, Japan}, abstract = {

This paper presents a variational framework for obtaining super-resolved video sequences, based on the observation that reconstruction-based Super-Resolution (SR) algorithms are limited by two factors: registration accuracy and Point Spread Function (PSF) estimation accuracy. To minimize the impact of the first limiting factor, a small-scale linear inpainting algorithm is proposed to provide smooth SR video frames. To address the second limiting factor, fast local PSF estimation and total variation-based denoising are proposed. Experimental results reflect the improvements provided by the proposed method when compared to classic SR approaches.
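As a hedged illustration of the total-variation step only (the precise functional used in the paper is not given in this abstract), TV-based denoising of a frame $f$ is commonly posed as $\min_u \int_\Omega |\nabla u|\,dx + \tfrac{\lambda}{2}\int_\Omega (u-f)^2\,dx$, where the first term suppresses noise while preserving edges and the parameter $\lambda$ controls fidelity to the observed frame.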

}, url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?tp=\&arnumber=6460315\&contentType=Conference+Publications\&searchField\%3DSearch_All\%26queryText\%3Dvariational+reconstruction}, author = {Salvador, J. and Rivero, D. and Kochale, A. and Ruiz-Hidalgo, J.} } @book {eRoca11, title = {14{\`e} Premi a la Qualitat en la Doc{\`e}ncia Universit{\`a}ria 2011}, year = {2011}, url = {http://cataleg.upc.edu/record=b1395249~S1*cat}, author = {Roca, E. and Marqu{\'e}s, F.} } @conference {cNiamut11, title = {Advanced visual rendering, gesture-based interaction and distributed delivery for immersive and interactive media services}, booktitle = {International Broadcasting Convention 2011}, year = {2011}, pages = {1{\textendash}8}, isbn = {0780388747}, url = {http://www.ibc.org/page.cfm/Action=Visitor/VisitorID=2851/PageOption=Seminar_1/libEntryID=15}, author = {Niamut, O. and Kochale, A. and Ruiz-Hidalgo, J. and Macq, J. and Kienast, G.} } @mastersthesis {xRubiano11, title = {B{\'u}squeda Visual con Retroacci{\'o}n de Relevancia Basada en Actualizacion de Pesos}, year = {2011}, abstract = {

This project presents the design and implementation of different relevance feedback techniques for image search. These techniques use the user's interaction with the search results to estimate the importance of the various search criteria in the query submitted by the user. The searches are performed through a linear combination of similarity measures computed from different MPEG-7 visual descriptors. The results of this project have been compared with those previously obtained in the Final Degree Project by Carles Ventura; for this reason, the system has been evaluated on the reference database MPEG-7 Common Color Dataset (CCD).
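As an illustrative formulation (the exact weighting scheme is not specified in this abstract), the combined similarity between a query $q$ and a database image $I$ can be written as $S(q,I) = \sum_d w_d\, s_d(q,I)$, where $s_d$ is the similarity measure associated with MPEG-7 visual descriptor $d$ and the weights $w_d$ are re-estimated after each round of user feedback, so that descriptors that rank the images marked as relevant higher receive larger weights.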

}, url = {http://hdl.handle.net/2099.1/11792}, author = {Rubiano, Aida}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cSchreer11, title = {Format-agnostic approach for production, delivery and rendering of immersive media}, booktitle = {Networked and Electronic Media Summit}, year = {2011}, pages = {{\textendash}}, isbn = {SN}, url = {http://nem-summit.eu/program/}, author = {Schreer, O. and Thomas, G. and Niamut, O. and Macq, J. and Kochale, A. and Batke, J. and Ruiz-Hidalgo, J. and Oldfield, R. and Shirley, B. and Thallinger, G.} } @conference {cSuau11, title = {Real-time head and hand tracking based on 2.5D data}, booktitle = {ICME - 2011 IEEE International Conference on Multimedia and Expo}, year = {2011}, pages = {1{\textendash}6}, abstract = {

A novel real-time algorithm for head and hand tracking is proposed in this paper. This approach is based on 2.5D data from a range camera, which is exploited to resolve ambiguities and overlaps. Experimental results show high robustness against partial occlusions and fast movements. The estimated positions are fairly stable, allowing the extraction of accurate trajectories which may be used for gesture classification purposes.

}, isbn = {975001880X}, doi = {10.1109/ICME.2011.6011869}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6011869\&tag=1}, author = {Suau, X. and Casas, J. and Ruiz-Hidalgo, J.} } @phdthesis {dRolon10, title = {Generalized Lifting for Sparse Image Representation and Coding}, year = {2010}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Rol{\'o}n, J.}, editor = {Salembier, P.} } @conference {cRolon10, title = {Improved local pdf estimation in the wavelet domain for generalized lifting}, booktitle = {Picture Coding Symposium, PCS 2010}, year = {2010}, address = {Nagoya, Japan}, isbn = {0-7803-3192-3}, url = {http://hdl.handle.net/2117/11675}, author = {Rol{\'o}n, J. and Salembier, P.} } @conference {cSuau10, title = {Surface reconstruction by restricted and oriented propagation}, booktitle = {2010 IEEE International Conference on Image Processing}, year = {2010}, pages = {813{\textendash}816}, isbn = {952-15-1364-0}, doi = {10.1109/ICIP.2010.5652707}, author = {Suau, X. and Casas, J. and Ruiz-Hidalgo, J.} } @mastersthesis {xSancho10, title = {Tweet@TV: Televisi{\'o} social en 140 car{\`a}cters}, year = {2010}, abstract = {

This Final Degree Project focuses on one facet of interactive television services: social television. During its development, an application was built to access a social network in a form integrated and synchronized with television consumption. Following the research line of Manel Martos{\textquoteright}s Final Degree Project, {\textquotedblleft}Adaptaci{\'o} i distribuci{\'o} de continguts web per IPTV{\textquotedblright}, this project was carried out at the company Activa Multim{\`e}dia Digital of the Corporaci{\'o} Catalana de Mitjans Audiovisuals between February and May 2010, within the framework of the CREA-IPTV project.

Awarded second prize in the 2010 edition of the Premios Liberalizaci{\'o}n de las Telecomunicaciones of the Colegio de Ingenieros T{\'e}cnicos de Telecomunicaci{\'o}n (COITT), Spain.

}, keywords = {interactive, microblogging, television, twitter}, url = {http://hdl.handle.net/2099.1/13523}, author = {Ruiz-Sancho, Cristina}, editor = {Xavier Gir{\'o}-i-Nieto and Cucurella, Eduard} } @conference {cRuiz-Hidalgo09, title = {Comparison of MPEG-7 descriptors for long term selection of reference frames}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2009}, year = {2009}, pages = {941{\textendash}944}, address = {Taipei, Taiwan}, isbn = {0-8194-2103-0}, doi = {10.1109/ICASSP.2009.4959740}, url = {http://hdl.handle.net/2117/8816}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @conference {cRolon09a, title = {Generalized lifting with adaptive local pdf estimation for image coding}, booktitle = {Picture coding symposium, PCS 2009}, year = {2009}, address = {Chicago, USA}, author = {Rol{\'o}n, J. and Mendon{\c c}a, E. and Salembier, P.} } @inbook {bMarques09, title = {How are digital images compressed in the web?}, booktitle = {Applied signal processing}, year = {2009}, pages = {265{\textendash}310}, isbn = {978-0-387-74534-3}, url = {http://cataleg.upc.edu/record=b1353617~S1*cat}, author = {Marqu{\'e}s, F. and Menezes, M. and Ruiz-Hidalgo, J.}, editor = {Dutoit, T. and Marqu{\'e}s, F.} } @inbook {bMarques09a, title = {How are digital TV programs compressed to allow broadcasting?}, booktitle = {Applied signal processing}, year = {2009}, pages = {311{\textendash}359}, isbn = {978-0-387-74534-3}, url = {http://cataleg.upc.edu/record=b1353617~S1*cat}, author = {Marqu{\'e}s, F. and Menezes, M. and Ruiz-Hidalgo, J.}, editor = {Dutoit, T. and Marqu{\'e}s, F.} } @article {aNeumann09, title = {Integration of audiovisual sensors and technologies in a smart room}, journal = {Personal and ubiquitous computing}, volume = {13}, number = {1}, year = {2009}, pages = {15{\textendash}23}, abstract = {

At the Technical University of Catalonia (UPC), a smart room has been equipped with 85 microphones and 8 cameras. This paper describes the setup of the sensors, gives an overview of the underlying hardware and software infrastructure and indicates possibilities for high- and low-level multi-modal interaction. An example of usage of the information collected from the distributed sensor network is explained in detail: the system supports a group of students that have to solve a problem related to a lab assignment.

}, keywords = {CHIL, PROVEC}, issn = {1617-4909}, doi = {10.1007/s00779-007-0172-1}, url = {http://hdl.handle.net/2117/9468}, author = {Neumann, J. and Casas, J. and Macho, D. and Ruiz-Hidalgo, J.} } @conference {cCabrera09, title = {LAVICAD: LAboratori VIrtual de Comunicacions Anal{\`o}giques i Digitals}, booktitle = {Jornada d{\textquoteright}Innovaci{\'o} Docent - RIMA (JID-RIMA)}, year = {2009}, month = {02/2009}, publisher = {UPCommons}, organization = {UPCommons}, address = {Barcelona, Catalonia}, abstract = {

Through this grant, the online application LAVICAD (LAboratori VIrtual de COmunicacions Anal{\`o}giques i Digitals) has been extended; it is offered in integrated form within the COM@WEB e-learning platform. LAVICAD is a tool programmed in Java and Matlab and consists of a set of simulators of the physical layer of communication systems. All simulators are available online and can be used by students from any computer without installing any special software. During the 2007-2008 academic year, two lines of work, among others, were developed. On the one hand, an applet emulating the physical layer of digital terrestrial television was programmed, as a reference for teaching advanced communication systems. On the other hand, new functionalities of the LAVICAD tool were programmed to help teachers monitor and assess the students{\textquoteright} continuous work. In particular, a database was implemented that stores information about the users who have logged in and the results they obtained when running a given simulator. The two lines of work should make it possible, during the current academic year, to consolidate the use of the different simulators in the teaching of the courses involved in the project.

}, url = {http://hdl.handle.net/2099/7235}, author = {Cabrera, M. and Xavier Gir{\'o}-i-Nieto and Rey, F. and Gasull, A. and Casas, J. and Villares, J. and Fernandez, J. and Sala {\'A}lvarez, josep and Espinosa Fricke, Pedro and Fern{\'a}ndez, Carlos Marcos and Cort{\'e}s, S. and Farr{\'e}, Miquel {\`A}ngel} } @conference {cRolon09, title = {Modeling of contours in wavelet domain for generalized lifting image compression}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2009}, year = {2009}, address = {Taipei, Taiwan}, author = {Rol{\'o}n, J. and Ortega, A. and Salembier, P.} } @conference {cSuau09, title = {Multi-resolution illumination compensation for foreground extraction}, booktitle = {16th International Conference on Image Processing}, year = {2009}, pages = {3225{\textendash}3228}, abstract = {

Illumination changes may lead to false foreground (FG) segmentation and tracking results. Most existing FG extraction algorithms obtain a background (BG) estimation from temporal statistical parameters. Such algorithms assume a quasi-static BG that changes only slowly. Therefore, fast illumination changes are not taken into account by the BG estimator and are classified as FG. The aim of the proposed algorithm is to reduce illumination effects in video sequences in order to improve foreground segmentation performance.

}, doi = {http://dx.doi.org/10.1109/ICIP.2009.5414358}, url = {http://www.icip2009.org}, author = {Suau, X. and Casas, J. and Ruiz-Hidalgo, J.} } @conference {cRolon08, title = {Image compression with generalized lifting and partial knowledge of the signal pdf}, booktitle = {IEEE International Conference on Image Processing, ICIP 2008}, year = {2008}, pages = {250{\textendash}254}, address = {San Diego, USA}, isbn = {88-86179-83-9}, author = {Rol{\'o}n, J. and Salembier, P. and Alameda, X.} } @conference {cCabrera08, title = {Lavicad: laboratorio virtual de comunicaciones anal{\'o}gicas y digitales}, booktitle = {XXIII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {2008}, month = {09/2008}, pages = {1{\textendash}4}, address = {Madrid, Spain}, abstract = {

The presented experience consists of the design of, and experimentation with, a virtual laboratory of analog and digital communications: LAVICAD. It has proven to be a useful tool to verify the performance of different communication systems and signal processing techniques, topics typically covered in undergraduate courses of the telecommunications engineering curriculum. The communication systems have been designed and implemented as freely accessible Java applets that can be run on the e-learning platform comweb.upc.edu. The systems offer different levels of user interactivity, and when students execute a system integrated in a comweb course, the obtained results can be reviewed by the professor as an evaluation and assessment tool. From a pedagogical point of view, the main advantage of using a virtual laboratory is that it facilitates the learning of certain subjects, acting as a bridge between knowledge based on concepts and theories and its practical understanding and experimentation.


}, isbn = {978-84-612-6291-5}, author = {Cabrera, M. and Xavier Gir{\'o}-i-Nieto and Rey, F.} } @article {aMostefa07, title = {The CHIL Audiovisual Corpus for Lecture and Meeting Analysis inside Smart Rooms}, journal = {Language resources and evaluation}, volume = {41}, number = {3}, year = {2007}, month = {01/2008}, pages = {389{\textendash}407}, abstract = {

The analysis of lectures and meetings inside smart rooms has recently attracted much interest in the literature, being the focus of international projects and technology evaluations. A key enabler for progress in this area is the availability of appropriate multimodal and multi-sensory corpora, annotated with rich human activity information during lectures and meetings. This paper is devoted to exactly such a corpus, developed in the framework of the European project CHIL, {\textquotedblleft}Computers in the Human Interaction Loop{\textquotedblright}. The resulting data set has the potential to drastically advance the state-of-the-art, by providing numerous synchronized audio and video streams of real lectures and meetings, captured in multiple recording sites over the past 4 years. It particularly overcomes typical shortcomings of other existing databases that may contain limited sensory or monomodal data, exhibit constrained human behavior and interaction patterns, or lack data variability. The CHIL corpus is accompanied by rich manual annotations of both its audio and visual modalities. These provide a detailed multi-channel verbatim orthographic transcription that includes speaker turns and identities, acoustic condition information, and named entities, as well as video labels in multiple camera views that provide multi-person 3D head and 2D facial feature location information. Over the past 3 years, the corpus has been crucial to the evaluation of a multitude of audiovisual perception technologies for human activity analysis in lecture and meeting scenarios, demonstrating its utility during internal evaluations of the CHIL consortium, as well as at the recent international CLEAR and Rich Transcription evaluations. The CHIL corpus is publicly available to the research community

}, issn = {1574-020X}, doi = {10.1007/s10579-007-9054-4}, author = {Mostefa, D. and Moreau, N. and Choukri, K. and Potamianos, G. and Chu, S. and Tyagi, A. and Casas, J. and Turmo, J. and Cristoforetti, L. and Tobia, F. and Pnevmatikakis, A. and Mylonakis, V. and Talantzis, F. and Burger, S. and Stiefelhagen, R. and Bernardin, K. and Rochet, C.} } @book {eAguilar07, title = {Diccionari de Telecomunicacions}, year = {2007}, isbn = {978-84-412-1459-0}, author = {Aguilar, M. and Alcober, J. and Altes, J. and Aragones, X. and Artigas, D. and Bardes, D. and Barlabe, A. and Bragos, R. and Calderer, J. and Cardama, A. and Casademont, J. and Casals, L. and Comer{\'o}n, A. and Cotrina, J. and Cruz, L. and Dios, V. and Duxans, H. and Esparza, O. and Esquerra, I. and Garcia, D. and Garcias, P. and Gomez, C. and Gorricho, J. and Guinjoan, F. and Hesselbach, X. and Liria, A. and Lopez, J. and Madrenas, J. and Madue{\~n}o, M. and Mestre, F. and Monte, E. and Morros, J.R. and Mu{\~n}oz, J. and Pallar{\'e}s, E. and Pons, J. and Recolons, J. and Rincon, D. and Riu, P. and Pradell, L. and Pascual-Iserte, A. and Prat, L. and Rey, F. and Villares, J.} } @conference {cRolon07, title = {Generalized Lifting For Sparse Image Representation and Coding}, booktitle = {Picture Coding Symposium, PCS 2007}, year = {2007}, pages = {234{\textendash}238}, address = {Lisbon, Portugal}, isbn = {88-86179-83-9}, author = {Rol{\'o}n, J. and Salembier, P.} } @conference {cRuiz-Hidalgo07, title = {Long term selection of reference frame sub-blocks using MPEG-7 indexing metadata}, booktitle = {International Conference on Acoustics, Speech and Signal Processing, ICASSP 2007}, year = {2007}, month = {04/2007}, pages = {669{\textendash}672}, address = {Honolulu, Hawaii}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @conference {cCabrera07, title = {Proyecto de Innovaci{\'o}n Docente COM@WEB (COMunicaciones en la WEB)}, booktitle = {VIII Simposio Nacional de Tecnolog{\'\i}as de la Informaci{\'o}n y las Comunicaciones en la Educaci{\'o}n. SINTICE{\textquoteright}07}, year = {2007}, month = {09/2007}, pages = {113{\textendash}120}, abstract = {

COMalaWEB stands for Comunicaciones en la Web (Communications on the Web) and is an e-learning platform that offers university students a range of online teaching resources, with the aim of facilitating study, mainly in the areas of signal processing and communications.

The platform is interactive and has been designed to promote e-learning experimentation and the acquisition of study habits based on the use of new technologies.

In the short term, the system will be able to generate automatic study itineraries for each student, based on the performance obtained in self-assessment tests and on information packaged as metadata and integrated into a database of learning objects.

The medium- and long-term objective of the COMalaWEB project is to become a meeting point on the web for students, teachers and professionals related to the field of telecommunications and university-level teaching.

One of the courses integrated into COMalaWEB is LaViCAD, a virtual laboratory of analog and digital communications, freely distributed and accessible, based on the simulation of different communication systems that can be used both in face-to-face teaching and in distance education.

The contents of the project can currently be consulted on the platform: http://comweb.upc.edu/

}, isbn = {978-84-9732-597-4}, author = {Cabrera, M. and Closas, P. and Alfredo, L. and Xavier Gir{\'o}-i-Nieto and Rey, F.} } @conference {cHaro07, title = {Stratification Learning: Detecting Mixed Density and Dimensionality in High Dimensional Point Clouds}, booktitle = {Neural Information Processing Systems NIPS}, year = {2007}, month = {12/2007}, publisher = {NIPS}, organization = {NIPS}, address = {Montreal}, abstract = {

The study of point cloud data sampled from a stratification, a collection of manifolds with possibly different dimensions, is pursued in this paper. We present a technique for simultaneously soft clustering and estimating the mixed dimensionality and density of such structures. The framework is based on a maximum likelihood estimation of a Poisson mixture model. The presentation of the approach is completed with artificial and real examples demonstrating the importance of extending manifold learning to stratification learning.
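As a sketch of the underlying idea (the notation here is generic and not taken from the paper), each point $x_i$ is assumed to be generated by one of $J$ classes characterized by a local dimension $m_j$ and density $\rho_j$, and the parameters together with the soft assignments are obtained by maximizing a mixture log-likelihood of the form $\sum_i \log \sum_{j=1}^{J} \pi_j\, p(x_i \mid m_j, \rho_j)$, typically via an EM-style alternation between membership updates and parameter updates.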

}, author = {Haro, G. and Randall, Gregory and Sapiro, Guillermo} } @conference {cNeumann06, title = {Multimodal Integration of Sensor Network}, booktitle = {Proceedings of 3rd IFIP Conference on Artificial Intelligence Applications \& Innovations}, year = {2006}, month = {09/2006}, publisher = {Springer}, organization = {Springer}, address = {Athens, Greece}, isbn = {978-0-387-34223}, author = {Neumann, J. and Casas, J. and Macho, D. and Ruiz-Hidalgo, J.} } @inbook {bNeumann06, title = {Multimodal Integration of Sensor Network}, booktitle = {Artificial Intelligence Applications and Innovations}, volume = {204}, year = {2006}, pages = {312{\textendash}323}, publisher = {Springer}, organization = {Springer}, address = {Boston}, abstract = {

At the Universitat Polit{\`e}cnica de Catalunya (UPC), a Smart Room has been equipped with 85 microphones and 8 cameras. This paper describes the setup of the sensors, gives an overview of the underlying hardware and software infrastructure and indicates possibilities for high- and low-level multi-modal interaction. An example of usage of the information collected from the distributed sensor network is explained in detail: the system supports a group of students that have to solve a problem related to a lab assignment.

}, isbn = {978-0-387-34223-8}, doi = {10.1007/0-387-34224-9_36}, author = {Neumann, J. and Casas, J. and Macho, D. and Ruiz-Hidalgo, J.} } @phdthesis {dRuiz-Hidalgo06, title = {On the Synergy between indexing and compression representations for video sequences}, year = {2006}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Ruiz-Hidalgo, J.}, editor = {Salembier, P.} } @article {aRuiz-Hidalgo06, title = {On the use of indexing metadata to improve the efficiency of video compression}, journal = {IEEE transactions on circuits and systems for video technology}, volume = {16}, number = {3}, year = {2006}, pages = {410{\textendash}419}, abstract = {

In recent years, video indexing and video compression have been considered two separate functionalities. However, multimedia content is growing at such a rate that multimedia services will need to consider both the compression and the indexing aspects of the content in order to manage it efficiently. It is therefore interesting to study the synergy between the representations used for compression and indexing and, in particular, to find new schemes that exploit indexing/compression information in order to increase the efficiency of video compression/indexing capabilities. The principal contribution of this paper is to study and develop new techniques where the compression efficiency of video codecs is improved by the use of indexing metadata, where indexing metadata refers to information that has been generated to support indexing capabilities.

}, keywords = {H.264, Indexing Metadata, MPEG-7, Video Coding}, issn = {1051-8215}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @conference {cAnton05, title = {Collaborative Network Space: Infrastructure and Learning Application}, booktitle = {IEEE Region 8 EUROCON 2005 Conference: Computer as a tool.}, year = {2005}, pages = {803{\textendash}806}, isbn = {1-4244-0050-3}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=01630054}, author = {Anton, M. and Vall-Llosera, M. and Jordi Torres and Romeu, J. and Jofre, L. and Sole, F. and Marqu{\'e}s, F. and Sabate, F. and Berenguer, J.} } @conference {cMarques05, title = {El concepto NetCampus}, booktitle = {3es Jornadas de la C{\'a}tedra Telef{\'o}nica-UPC}, year = {2005}, pages = {15{\textendash}20}, url = {https://catedratelefonica.upc.edu/documents/llibres/docs/el_espacio_innovador_y_la_red.pdf}, author = {Marqu{\'e}s, F. and Jofre, L. and Sole, F. and Sabate, F. and Berenguer, J. and Romeu, J. and Jordi Torres} } @book {eJofre05, title = {El "Espacio Innovador" y la red}, year = {2005}, url = {http://www.upc.edu/web/CatedraTelefonicaUPC}, author = {Jofre, L. and Sole, F. and Sabate, F. and Berenguer, J. and Marqu{\'e}s, F. and Romeu, J. and Jordi Torres} } @conference {cBroquetas04, title = {Definici{\'o} d{\textquoteright}un Master Internacional de Recerca: la proposta del Departament de Teoria del Senyal i Comunicacions}, booktitle = {Jornada de reflexi{\'o}n y trabajo sobre el modelo docente de la UPC en el Espacio Europeo de ecuaci{\'o}n Superior (EEES)}, year = {2004}, pages = {1{\textendash}3}, author = {Broquetas, A. and Hernando, J. and Marqu{\'e}s, F. and Romeu, J.} } @book {eJofre04, title = {El {\textquoteright}Profesional Innovador{\textquoteright} y la red}, year = {2004}, url = {http://catedratelefonica.upc.edu/documents/llibres/docs/jornada_2004_catedra_telf_upc.pdf}, author = {Jofre, L. and Sole, F. and Sabate, F. and Marqu{\'e}s, F. and Romeu, J. and Jordi Torres} } @conference {cJofre04, title = {Una Enginyeria per a la Societat del Coneixement}, booktitle = {II Congr{\'e}s d{\textquoteright}Enginyeria en Llengua Catalana}, year = {2004}, url = {http://www.eicc.cat/celc/formacio.htm}, author = {Jofre, L. and Sole, F. and Sabate, F. and Marqu{\'e}s, F. and Romeu, J. and Jordi Torres} } @conference {cEugenio03a, title = {Automatic structures detection and spatial registration using multisensor satellite imagery}, booktitle = {Proceedings of the International Geoscience and Remote Sensing Symposium,}, year = {2003}, pages = {1038{\textendash}1040}, author = {F. Eugenio and Rovaris, E. and Marcello, J. and Marqu{\'e}s, F.} } @conference {cRuiz-Hidalgo03, title = {Metadata-based coding tools for hybrid video codecs}, booktitle = {Picture Coding Symposium, PCS 2003}, year = {2003}, month = {04/2003}, pages = {473{\textendash}477}, address = {Saint-Malo, France}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @conference {cSalembier02a, title = {Connected Operators Based on Reconstruction Process for Size and Motion Simplification}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 2002}, year = {2002}, month = {05/2002}, address = {Orlando, USA}, author = {Salembier, P. and Ruiz-Hidalgo, J.} } @inbook {bSalembier02b, title = {Description of a Single Multimedia Document}, booktitle = {Introduction to the mpeg-7: multimedia content description interface}, year = {2002}, pages = {111{\textendash}138}, publisher = {Wiley}, organization = {Wiley}, edition = {B. S. 
Manjunath, P. Salembier, T. Sikora (Eds.)}, chapter = {8}, isbn = {0471486787}, author = {Benitez, A.B and Martinez, J.M and Rising, H and Salembier, P.} } @conference {cSalembier02, title = {On Filters by Reconstruction for Size and Motion Simplification,}, booktitle = {Int. Symposium on Mathematical Morphology, ISMM 2002}, year = {2002}, month = {04/2002}, pages = {425{\textendash}434}, address = {Sydney, Australia}, author = {Salembier, P. and Ruiz-Hidalgo, J.} } @article {aRuiz-Hidalgo01, title = {Morphological tools for robust key-region extraction and video shot modeling}, journal = {Lecture notes in computer science}, year = {2001}, pages = {407{\textendash}416}, abstract = {

In recent years, the use of multimedia content has experienced exponential growth. In this context, new image/video sequence representations are becoming a necessity for many applications. This paper deals with the structuring of video shots in terms of various foreground key-regions and a background mosaic. Each key-region represents a different foreground object that appears throughout the sequence, in the same way that the mosaic image represents the background information of the complete sequence. We focus on the interest of morphological tools such as connected operators or watersheds to perform the shot analysis and the computation of the key-regions and the mosaic. It will be shown that morphological tools are particularly attractive for improving the robustness of the various steps of the algorithm.

}, issn = {0302-9743}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @article {aEugenio01, title = {A real-time automatic acquisition, processing and distribution system for AVHRR and SeaWIFS imagery}, journal = {IEEE geoscience electronics society newsletter}, volume = {-}, number = {Issue 20}, year = {2001}, pages = {10{\textendash}15}, issn = {0161-7869}, author = {F. Eugenio and Marcello, J. and Marqu{\'e}s, F. and Hernandez-Guerra, A. and Rovaris, E.} } @conference {cRuiz-Hidalgo01, title = {Robust segmentation and representation of foreground key-regions in video sequences}, booktitle = {International Conference on Acoustics, Speech and Signal Processing ICASSP{\textquoteright}01}, year = {2001}, month = {05/2001}, pages = {1565{\textendash}1568}, address = {Salt Lake City, USA}, author = {Ruiz-Hidalgo, J. and Salembier, P.} } @conference {cEugenio00a, title = {Accurate and automatic NOAA-AVHRR image navigation using a global contour matching approach}, booktitle = {International Geoscience and remote Sensing Symposium}, year = {2000}, pages = {639{\textendash}642}, isbn = {0-7803-6362-0}, author = {F. Eugenio and Marqu{\'e}s, F. and G{\'o}mez, L. and Suarez, E. and Rovaris, E.} } @conference {cEugenio00, title = {A contour matching approach for accurate NOAA-AVHRR image navigation}, booktitle = {10th European Signal Processing Conference (EUSIPCO 2000)}, year = {2000}, isbn = {952-15-0447-1}, author = {F. Eugenio and Marqu{\'e}s, F. and Suarez, E. and Rovaris, E.} } @article {aSalembier00a, title = {Description Schemes for Video Programs, Users and Devices}, journal = {Signal processing: image communication}, volume = {16}, number = {1}, year = {2000}, pages = {211{\textendash}234}, issn = {0923-5965}, author = {Salembier, P. and Richard, Q. and O{\textquoteright}Connor, N. and Correia, P. and Sezan, I and van Beek, P} } @article {xRuiz-Hidalgo99, title = {The representation of images using scale trees}, year = {1999}, institution = {University of East Anglia}, type = {Master by Research}, abstract = {

This thesis presents a new tree structure that codes the greyscale information of an image. Based on a scale-space processor called the sieve, a scale tree represents the image in a hierarchical manner in which nodes of the tree describe features of the image at specific scales.

This representation can be used to perform different image processing operations. Filtering, segmentation or motion detection can be accomplished by parsing the tree using different attributes associated with the nodes.

}, author = {Ruiz-Hidalgo, J.} } @conference {cRuiz-Hidalgo99, title = {Towards stereo from scale-trees}, booktitle = {7th International Conference on Image Processing and its Applications}, year = {1999}, pages = {52{\textendash}56}, author = {Moravec, K. and Ruiz-Hidalgo, J. and Harvey, R. and Bangham, J.} } @conference {cMarcotegui99, title = {A video generation tool allowing friendly user interaction}, booktitle = {1999 IEEE INternational Conference on Image Processing}, year = {1999}, isbn = {0-7803-5470-2}, author = {Marcotegui, B. and Correia, P. and Marqu{\'e}s, F. and Mech, R. and Rosa, R. and Wollborn, M. and Zanoguera, F.} } @conference {cRuiz-Hidalgo98a, title = {Robust morphological scale-space trees}, booktitle = {Noblesse Workshop on Non-Linear Model Based Image Analysis}, year = {1998}, month = {07/1998}, pages = {133{\textendash}139}, author = {Ruiz-Hidalgo, J. and Bangham, J. and Harvey, R.} } @conference {cRuiz-Hidalgo98, title = {The segmentation of images via scale-space trees}, booktitle = {British Machine Vision Conference}, year = {1998}, month = {09/1998}, pages = {33{\textendash}43}, address = {Southampton, UK}, abstract = {

A useful representation of an image would be an object tree in which nodes represent objects, or parts of objects, and which includes at least one node that, together with its children, represents each object: a grandmother node. It is shown that scale-trees, obtained from greyscale images, approximate such a tree. It is then shown how they may be modified using other attributes so that they more closely resemble object trees. The result is a data structure that provides {\textquotedblleft}handles{\textquotedblright} for every element of the image that can be used for manipulating the image. This segmentation has potential for object recognition.

}, author = {Bangham, J. and Ruiz-Hidalgo, J. and Harvey, R. and Cawley, G.} } @article {aSayrol96, title = {Motion estimation using higher-order statistics}, journal = {IEEE transactions on image processing}, volume = {5}, number = {6}, year = {1996}, pages = {1077{\textendash}1084}, issn = {1057-7149}, author = {Elisa Sayrol and Gasull, A. and R. Fonollosa, Javier} } @conference {cSayrol95a, title = {Estimation of motion parameters using hos}, booktitle = {IEEE SIGNAL PROCESSING-ATHOD WORKSHOP ON HIGHER-ORDER STATISTICS.}, year = {1995}, pages = {262{\textendash}265}, isbn = {1522-4880}, author = {Elisa Sayrol and Gasull, A. and R. Fonollosa, Javier} } @conference {cSayrol95, title = {Fourth-order statistics cost functions: applications to time delay estimation and image motion estimation}, booktitle = {VI SPANISH SYMPOSIUM ON PATTERN RECOGNITION AND IMAGE ANALYSIS}, year = {1995}, pages = {543{\textendash}548}, isbn = {978-1-4244-9564-1}, author = {Elisa Sayrol and Gasull, A. and R. Fonollosa, Javier} } @conference {cSalembier95 , title = {Texture coding using morphological interpolation}, booktitle = {IEEE workshop on Nonlinear Signal and Image Processing, NSIP 1995}, year = {1995}, address = {Halkidiki, Greece}, author = {Salembier, P. and Ru{\'e}, R} } @conference {cCasas92a, title = {Fuzzy classification of Remote Sensing images: a pseudocolor representation of fuzzy partitions}, booktitle = {SPIE Neural and Stochastic Methods in Image and Signal Processing}, year = {1992}, month = {07/1992}, publisher = {SPIE}, organization = {SPIE}, address = {San Diego, CA}, doi = {10.1117/12.130844}, author = {Casas, J. and Hillion, A. and Roux, C. and Torres, L. and Gasull, A.} } @article {aSanz90, title = {Aspectos Cl{\'\i}nicos-toxicol{\'o}gicos en los trabajadores de una industria productora de cromatos}, journal = {Annual review of pharmacology and toxicology}, volume = {7}, year = {1990}, pages = {1:13{\textendash}1:20}, issn = {0362-1642}, author = {Sanz, P. and Ribas, B. and Cobo, E. and Gadea, E. and Marqu{\'e}s, F. and Sol{\'e}, E. and Corbella, J.} }