@article {aGeleta, title = {Deep Variational Autoencoders for Population Genetics}, year = {Submitted}, abstract = {
Motivation Modern biobanks provide numerous high-resolution genomic sequences of diverse populations. These datasets enable a better understanding of genotype-phenotype interactions with genome-wide association studies (GWAS) and power new personalized precision medicine with polygenic risk scores (PRS). To account for diverse and admixed populations, new algorithmic tools are needed to properly capture the genetic composition of populations. Here we explore deep learning techniques, namely variational autoencoders (VAEs), to process genomic data from a population perspective. We hope this work will encourage the adoption of deep neural networks in the population genetics community.
Results In this paper, we show the power of VAEs for a variety of tasks relating to the interpretation, classification, simulation, and compression of genomic data, using several worldwide whole genome datasets from both humans and canids, and evaluate the performance of the proposed applications with and without ancestry conditioning. The unsupervised setting of autoencoders allows for the detection and learning of granular population structure and the inference of informative latent factors. The learned latent spaces of VAEs are able to capture and represent differentiated Gaussian-like clusters of samples with similar genetic composition at a fine scale from single nucleotide polymorphisms (SNPs), enabling applications in dimensionality reduction, data simulation, and imputation. Individual genotype sequences can then be decomposed into latent representations and reconstruction errors (residuals), which provide a sparse representation useful for lossless compression. We show that different population groups have differentiated compression ratios and classification accuracies. Additionally, we analyze the entropy of the SNP data, its effect on compression across populations, its relation to historical migrations, and we show how to introduce autoencoders into existing compression pipelines.
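A minimal sketch of the latent-plus-residual decomposition described above, assuming binary haploid SNP inputs (a toy PyTorch model; all names and sizes are illustrative, not the paper's implementation):

import torch
import torch.nn as nn

class SNPVAE(nn.Module):
    def __init__(self, n_snps, latent_dim=32):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(n_snps, 256), nn.ReLU())
        self.mu = nn.Linear(256, latent_dim)
        self.logvar = nn.Linear(256, latent_dim)
        self.dec = nn.Sequential(nn.Linear(latent_dim, 256), nn.ReLU(),
                                 nn.Linear(256, n_snps))  # outputs logits

    def forward(self, x):
        h = self.enc(x)
        mu, logvar = self.mu(h), self.logvar(h)
        z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()  # reparameterization trick
        return self.dec(z), mu, logvar

vae = SNPVAE(n_snps=1000)
x = torch.randint(0, 2, (4, 1000)).float()   # toy haploid genotypes
logits, mu, logvar = vae(x)
recon = nn.functional.binary_cross_entropy_with_logits(logits, x, reduction='sum')
kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
loss = recon + kl                            # standard ELBO objective
residual = x != (logits > 0).float()         # sparse errors: latent + residual gives lossless storage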
The present dataset comprises a collection of RGB-D apple tree images that can be used to train and test computer vision-based fruit detection and sizing methods. This dataset encompasses two distinct sets of data obtained from a Fuji and an Elstar apple orchard. The Fuji apple orchard sub-set consists of 3925 RGB-D images containing a total of 15335 apples annotated with both modal and amodal apple segmentation masks. Modal masks denote the visible portions of the apples, whereas amodal masks encompass both visible and occluded apple regions. Notably, this dataset is the first public resource to incorporate on-tree fruit amodal masks. This pioneering inclusion addresses a critical gap in existing datasets, enabling the development of robust automatic fruit sizing methods and accurate fruit visibility estimation, particularly in the presence of partial occlusions. Besides the fruit segmentation masks, the dataset also includes the fruit size (calliper) ground truth for each annotated apple. The second sub-set comprises 2731 RGB-D images capturing five Elstar apple trees at four distinct growth stages. This sub-set includes mean diameter information for each tree at every growth stage and serves as a valuable resource for evaluating fruit sizing methods trained with the first sub-set. The present data was employed in the research papers titled {\textquotedblleft}Looking behind occlusions: a study on amodal segmentation for robust on-tree apple fruit size estimation{\textquotedblright} [1] and {\textquotedblleft}Simultaneous fruit detection and size estimation using multitask deep neural networks{\textquotedblright} [2].
}, keywords = {Agricultural robotics, amodal segmentation, depth image, Fruit measurement, Fruit visibility, Instance Segmentation, modal segmentation, Yield prediction}, doi = {https://doi.org/10.1016/j.dib.2023.110000}, author = {Gen{\'e}-Mola, Jordi and Ferrer-Ferrer, M. and Hemming, J. and Dalfsen, P. and Hoog, D. and Sanz-Cortiella, R. and Rosell-Polo, Joan R. and Morros, J.R. and Ver{\'o}nica Vilaplana and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @conference {cBonet23, title = {HyperFast: Instant Classification for Tabular Data}, booktitle = {38th Annual AAAI Conference on Artificial Intelligence (AAAI)}, year = {2024}, month = {12/2024}, abstract = {Training deep learning models and performing hyperparameter tuning can be computationally demanding and time-consuming. Meanwhile, traditional machine learning methods like gradient-boosting algorithms remain the preferred choice for most tabular data applications, while neural network alternatives require extensive hyperparameter tuning or work only in toy datasets under limited settings. In this paper, we introduce HyperFast, a meta-trained hypernetwork designed for instant classification of tabular data in a single forward pass. HyperFast generates a task-specific neural network tailored to an unseen dataset that can be directly used for classification inference, removing the need for training a model. We report extensive experiments with OpenML and genomic data, comparing HyperFast to competing tabular data neural networks, traditional ML methods, AutoML systems, and boosting machines. HyperFast shows highly competitive results, while being significantly faster. Additionally, our approach demonstrates robust adaptability across a variety of classification tasks with little to no fine-tuning, positioning HyperFast as a strong solution for numerous applications and rapid model deployment. HyperFast introduces a promising paradigm for fast classification, with the potential to substantially decrease the computational burden of deep learning.
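To make the idea concrete, here is a toy hypernetwork sketch in the spirit of the abstract (not the actual HyperFast architecture; every name and dimension is illustrative): a meta-network maps a labeled support set to the weights of a task-specific linear classifier, so prediction takes a single forward pass with no gradient-based training.

import torch
import torch.nn as nn

class TinyHyperNet(nn.Module):
    def __init__(self, n_features, n_classes, hidden=128):
        super().__init__()
        out = n_classes * n_features + n_classes          # W and b of the target classifier
        self.meta = nn.Sequential(nn.Linear(n_features + n_classes, hidden),
                                  nn.ReLU(), nn.Linear(hidden, out))
        self.n_features, self.n_classes = n_features, n_classes

    def forward(self, x_sup, y_sup, x_query):
        y_onehot = nn.functional.one_hot(y_sup, self.n_classes).float()
        summary = torch.cat([x_sup, y_onehot], dim=1).mean(dim=0)   # permutation-invariant set summary
        params = self.meta(summary)                                 # generate task-specific weights
        W = params[: self.n_classes * self.n_features].view(self.n_classes, -1)
        b = params[self.n_classes * self.n_features:]
        return x_query @ W.T + b                                    # logits, single forward pass

net = TinyHyperNet(n_features=20, n_classes=3)
x_sup, y_sup = torch.randn(50, 20), torch.randint(0, 3, (50,))
logits = net(x_sup, y_sup, torch.randn(5, 20))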
}, author = {Bonet, David and Mas-Montserrat, Daniel and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @conference {cBarrabes, title = {Adversarial Learning for Feature Shift Detection and Correction}, booktitle = {Neural Information Processing Systems (NeurIPS)}, year = {2023}, month = {12/2023}, address = {New Orleans, USA}, abstract = {TL;DR: We introduce a framework inspired by adversarial learning to detect and correct the features that originate a distribution shift between datasets.
Data shift is a phenomenon present in many real-world applications, and while there are multiple methods that try to detect shifts, the task of localizing and correcting the features that originate such shifts has not been studied in depth. Feature shifts can occur in many datasets, including multi-sensor data, where some sensors are malfunctioning, and tabular and structured data, including biomedical, financial, and survey data, where faulty standardization and data processing pipelines can lead to erroneous features. In this work, we explore using the principles of adversarial learning, where the information from several discriminators trained to distinguish between two distributions is used both to detect the corrupted features and to fix them in order to remove the distribution shift between datasets. We show that mainstream supervised classifiers, such as random forests or gradient boosting trees, combined with simple iterative heuristics, can localize and correct feature shifts, outperforming current statistical and neural network-based techniques.
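A minimal sketch of the discriminator idea on synthetic data (simplified relative to the paper's framework): a classifier trained to tell the two datasets apart detects the shift, and its feature importances localize it.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X_src = rng.normal(size=(500, 10))
X_tgt = rng.normal(size=(500, 10))
X_tgt[:, 3] += 2.0                       # inject a mean shift in feature 3

X = np.vstack([X_src, X_tgt])
y = np.array([0] * 500 + [1] * 500)      # domain labels: which dataset a row came from
clf = RandomForestClassifier(n_estimators=200, random_state=0)
acc = cross_val_score(clf, X, y, cv=5).mean()
print(f"domain accuracy: {acc:.2f} (0.5 would mean no detectable shift)")

clf.fit(X, y)
suspects = np.argsort(clf.feature_importances_)[::-1]
print("most suspicious features:", suspects[:3])   # feature 3 should rank first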
Brain-age can be inferred from structural neuroimaging and compared to chronological age (brain-age delta) as a marker of biological brain aging. Accelerated aging has been found in neurodegenerative disorders like Alzheimer{\textquoteright}s disease (AD), but its validation against markers of neurodegeneration and AD is lacking. Here, imaging-derived measures from the UK Biobank dataset (N=22,661) were used to predict brain-age in 2,314 cognitively unimpaired (CU) individuals at higher risk of AD and in patients with mild cognitive impairment (MCI) from four independent cohorts with available biomarker data: ALFA+, ADNI, EPAD and OASIS. Brain-age delta was associated with abnormal amyloid-β, more advanced stages (AT) of AD pathology and APOE-ε4 status. Brain-age delta was positively associated with plasma neurofilament light, a marker of neurodegeneration, and sex differences in the brain effects of this marker were found. These results validate brain-age delta as a non-invasive marker of biological brain aging related to markers of AD and neurodegeneration.
}, issn = {2050-084X}, doi = {https://doi.org/10.7554/eLife.81067}, author = {Irene Cumplido-Mayoral and Marina Garc{\'\i}a-Prat and Gregory Operto and Carles Falcon and Mahnaz Shekari and Raffaele Cacciaglia and Marta Mila-Aloma and Luigi Lorenzini and Carolina Minguillon and Jose Luis Molinuevo and Marc Suarez-Calvet and Ver{\'o}nica Vilaplana and Juan Domingo Gispert} } @conference {cCumplido-Mayoral23a, title = {Brain-age mediates the association between modifiable risk factors and cognitive decline early in the AD continuum}, booktitle = {Alzheimer{\textquoteright}s Association International Conference (AAIC)}, year = {2023}, month = {07/2023}, address = {Amsterdam, Netherlands}, author = {Irene Cumplido-Mayoral and Anna Brugulat-Serrat and Gonzalo S{\'a}nchez-Benavides and Armand Gonz{\'a}lez-Escalante and Federica Anastasi and Marta Mila-Aloma and Carles Falcon and Mahnaz Shekari and Raffaele Cacciaglia and Carolina Minguillon and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @conference {cCumplido-Mayoral23, title = {Brain-age prediction and its associations with glial and synaptic CSF markers}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2023}, month = {07/2023}, address = {Amsterdam, Netherlands}, author = {Irene Cumplido-Mayoral and Marta Mila-Aloma and Carles Falcon and Raffaele Cacciaglia and Carolina Minguillon and Karine Fauria and Jose Luis Molinuevo and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @article {ade-Mas-Gimenez23, title = {Gradient-Based Metrics for the Evaluation of Image Defogging}, journal = {World Electric Vehicle Journal}, volume = {14}, year = {2023}, month = {09/2023}, chapter = {254}, abstract = {Fog, haze, or smoke are standard atmospheric phenomena that dramatically compromise the overall visibility of any scene, critically affecting features such as the illumination, contrast, and contour detection of objects. The decrease in visibility compromises the performance of computer vision algorithms such as pattern recognition and segmentation, some of which are very relevant to decision-making in the field of autonomous vehicles. Several dehazing methods have been proposed that either need to estimate fog parameters through physical models or are statistically based. However, physical parameters greatly depend on the scene conditions, and statistically based methods require large datasets of natural foggy images together with the original images without fog, i.e., the ground truth, for evaluation. Obtaining proper fog-less ground truth images for pixel-to-pixel evaluation is costly and time-consuming, and this fact hinders progress in the field. This paper aims to tackle this issue by proposing gradient-based metrics for image defogging evaluation that do not require a ground truth image without fog or a physical model. A comparison of the proposed metrics with metrics already used in the NTIRE 2018 defogging challenge, as well as with several state-of-the-art defogging evaluation metrics, is performed to prove their effectiveness in a general situation, showing comparable results to conventional metrics and an improvement in the no-reference setting. A Matlab implementation of the proposed metrics has been developed and it is open-sourced in a public GitHub repository.
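The core intuition behind such metrics can be sketched in a few lines (Python here for brevity, though the paper's implementation is in Matlab; this is an illustrative gradient-ratio score, not the paper's exact formulation): fog suppresses local contrast, so the ratio of mean gradient magnitude after versus before defogging gives a ground-truth-free score.

import numpy as np
from scipy import ndimage

def mean_gradient_magnitude(img):
    gx = ndimage.sobel(img.astype(float), axis=1)
    gy = ndimage.sobel(img.astype(float), axis=0)
    return np.hypot(gx, gy).mean()

def defogging_gain(foggy, defogged):
    return mean_gradient_magnitude(defogged) / mean_gradient_magnitude(foggy)

foggy = np.random.rand(64, 64) * 0.3 + 0.5     # toy low-contrast image
defogged = (foggy - 0.5) * 3.0 + 0.5           # toy contrast stretch
print(f"gradient gain: {defogging_gain(foggy, defogged):.2f}")  # > 1 means more recovered detail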
}, doi = {10.3390/wevj14090254}, url = {https://www.mdpi.com/2032-6653/14/9/254}, author = {Gerard de-Mas-Gim{\'e}nez and Pablo Garc{\'\i}a-G{\'o}mez and Casas, J. and S. Royo} } @phdthesis {dFernandez23, title = {Knowledge graph population from news streams}, volume = {Doctorate}, year = {2023}, month = {10/2023}, school = {Universitat Polit{\`e}cnica de Catalunya}, type = {Industrial}, address = {Barcelona, Catalonia}, abstract = {Media producers publish large amounts of multimedia content online - text, audio, images and video. As the online media market grows, the management and delivery of contents becomes a challenge. Semantic and linking technologies can be used to organize and exploit these contents through the use of knowledge graphs. This industrial doctorate dissertation addresses the problem of constructing knowledge resources and integrating them into a system used by media producers to manage and explore their contents. For that purpose, knowledge graphs and their maintenance through Information Extraction (IE) from news streams are studied. This thesis presents solutions for multimedia understanding and knowledge extraction from online news, and their exploitation in real product applications, and it is structured in three parts.
The first part consists of the construction of IE tools to be used for knowledge graph population. For that, we built a holistic Entity Linking (EL) system capable of combining multimodal data inputs to extract a set of semantic entities that describe news content. The EL system is followed by a Relation Extraction (RE) model that predicts relations between pairs of entities with a novel method based on entity-type knowledge. The final system is capable of extracting triples describing the contents of a news article.
The second part focuses on the automatic construction of a news event knowledge graph. We present an online multilingual system for event detection and comprehension from media feeds, called VLX-Stories. The system retrieves articles from news sites, aggregates them into events (event detection), and summarizes them by extracting semantic labels of their most relevant entities (event representation) in order to answer the four Ws of journalism: who, what, when and where. This part of the thesis deals with the problems of Topic Detection and Tracking (TDT), topic modeling and event representation.
The third part of the thesis builds on top of the models developed in the two previous parts to populate a knowledge graph from aggregated news.
The system is completed with an emerging entity detection module, which detects mentions of novel people appearing on the news and creates new knowledge graph entities from them. Finally, data validation and triple classification tools are added to increase the quality of the knowledge graph population.
This dissertation addresses many general knowledge graph and information extraction problems, like knowledge dynamicity, self-learning, and quality assessment. Moreover, as an industrial work, we provide solutions that were deployed in production and verify our methods with real customers.
} } @article {aGene-Mola23, title = {Looking behind occlusions: a study on amodal segmentation for robust on-tree apple fruit size estimation}, journal = {Computers and Electronics in Agriculture}, year = {2023}, abstract = {The detection and sizing of fruits with computer vision methods is of interest because it provides relevant information to improve the management of orchard farming. However, the presence of partially occluded fruits limits the performance of existing methods, making reliable fruit sizing a challenging task. While previous fruit segmentation works limit segmentation to the visible region of fruits (known as modal segmentation), in this work we propose an amodal segmentation algorithm to predict the complete shape of each fruit, including its visible and occluded regions. To do so, an end-to-end convolutional neural network (CNN) for simultaneous modal and amodal instance segmentation was implemented. The predicted amodal masks were used to estimate the fruit diameters in pixels. Modal masks were used to identify the visible region and measure the distance between the apples and the camera using the depth image. Finally, the fruit diameters in millimetres (mm) were computed by applying the pinhole camera model. The method was developed with a Fuji apple dataset consisting of 3925 RGB-D images acquired at different growth stages with a total of 15,335 annotated apples, and was subsequently tested in a case study to measure the diameter of Elstar apples at different growth stages. Fruit detection results showed an F1-score of 0.86 and the fruit diameter results reported a mean absolute error (MAE) of 4.5\ mm and R2\ =\ 0.80 irrespective of fruit visibility. Besides the diameter estimation, modal and amodal masks were used to automatically determine the percentage of visibility of measured apples. This feature was used as a confidence value, improving the diameter estimation to MAE\ =\ 2.93\ mm and R2\ =\ 0.91 when limiting the size estimation to fruits detected with a visibility higher than 60\%. The main advantages of the present methodology are its robustness for measuring partially occluded fruits and the capability to determine the visibility percentage. The main limitation is that depth images were generated by means of photogrammetry methods, which limits the efficiency of data acquisition. To overcome this limitation, future works should consider the use of commercial RGB-D sensors. The code and the dataset used to evaluate the method have been made publicly available at\ https://github.com/GRAP-UdL-AT/Amodal_Fruit_Sizing.
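The pinhole-model step described above reduces to one line of arithmetic; the numbers below are illustrative, not taken from the paper:

def diameter_mm(diameter_px, depth_mm, focal_px):
    # pinhole camera model: world size = image size * depth / focal length
    return diameter_px * depth_mm / focal_px

print(diameter_mm(diameter_px=40, depth_mm=1500, focal_px=800))  # -> 75.0 mm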
}, keywords = {deep learning, Fruit detection, Fruit measurement, Fruit visibility, Precision agriculture, Yield estimation}, issn = {ISSN 0168-1699}, doi = {https://doi.org/10.1016/j.compag.2023.107854}, url = {https://authors.elsevier.com/sd/article/S0168-1699(23)00242-9}, author = {Gen{\'e}-Mola, Jordi and Ferrer-Ferrer, M. and Gregorio, Eduard and Blok, P. M. and Hemming, J. and Morros, J.R. and Rosell-Polo, Joan R. and Ver{\'o}nica Vilaplana and Ruiz-Hidalgo, J.} } @article {aDominguez, title = {Neural ADMIXTURE: rapid population clustering with autoencoders}, journal = {Nature Computational Science}, year = {2023}, month = {07/2023}, abstract = {Characterizing the genetic substructure of large cohorts has become increasingly important as genetic association and prediction studies are extended to massive, increasingly diverse, biobanks. ADMIXTURE and STRUCTURE are widely used unsupervised clustering algorithms for characterizing such ancestral genetic structure. These methods decompose individual genomes into fractional cluster assignments with each cluster representing a vector of DNA marker frequencies. The assignments, and clusters, provide an interpretable representation for geneticists to describe population substructure at the sample level. However, with the rapidly increasing size of population biobanks and the growing numbers of variants genotyped (or sequenced) per sample, such traditional methods become computationally intractable. Furthermore, multiple runs with different hyperparameters are required to properly depict the population clustering using these traditional methods, increasing the computational burden. This can lead to days of compute. In this work we present Neural ADMIXTURE, a neural network autoencoder that follows the same modeling assumptions as ADMIXTURE, providing similar (or better) clustering, while reducing the compute time by orders of magnitude. Indeed, the equivalent of one month of continuous compute can be reduced to hours. In addition, Neural ADMIXTURE can include multiple outputs, providing the equivalent results as running the original ADMIXTURE algorithm many times with different numbers of clusters. Our models can also be stored, allowing later cluster assignment to be performed with a linear computational time.
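A conceptual sketch of the ADMIXTURE-style autoencoder (heavily simplified; names and sizes are illustrative, not the released model): the bottleneck holds per-sample cluster fractions Q, constrained to the simplex by a softmax, and the decoder weights play the role of per-cluster allele frequencies F.

import torch
import torch.nn as nn

class AdmixtureAE(nn.Module):
    def __init__(self, n_snps, k_clusters):
        super().__init__()
        self.encoder = nn.Linear(n_snps, k_clusters)
        self.F = nn.Parameter(torch.rand(k_clusters, n_snps))  # per-cluster frequencies (logits)

    def forward(self, x):
        Q = torch.softmax(self.encoder(x), dim=1)   # ancestry fractions, rows sum to 1
        return Q @ torch.sigmoid(self.F), Q         # reconstruction = mixture of cluster frequencies

model = AdmixtureAE(n_snps=1000, k_clusters=7)
x = torch.randint(0, 2, (8, 1000)).float()          # toy genotype batch
x_hat, Q = model(x)
loss = nn.functional.binary_cross_entropy(x_hat, x)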
Most pedestrian detection methods focus on bounding boxes based on fusing RGB with lidar. These methods do not relate to how the human eye perceives objects in the real world. Furthermore, lidar and vision can have difficulty detecting pedestrians in scattered environments, and radar can be used to overcome this problem. The motivation of this work is therefore to explore, as a preliminary step, the feasibility of fusing lidar, radar, and RGB for pedestrian detection, with potential use in autonomous driving, using a fully connected convolutional neural network architecture for multimodal sensors. The core of the network is based on SegNet, a pixel-wise semantic segmentation network. In this context, lidar and radar were incorporated by transforming them from 3D pointclouds into 2D gray images with 16-bit depths, and RGB images were incorporated with three channels. The proposed architecture uses a single SegNet for each sensor reading, and the outputs are then applied to a fully connected neural network to fuse the three sensor modalities. Afterwards, an up-sampling network is applied to recover the fused data. Additionally, a custom dataset of 60 images was proposed for training the architecture, with an additional 10 for evaluation and 10 for testing, giving a total of 80 images. The experimental results show a training mean pixel accuracy of 99.7\% and a training mean intersection over union of 99.5\%. Also, the testing mean IoU was 94.4\%, and the testing pixel accuracy was 96.2\%. These metric results have successfully demonstrated the effectiveness of using semantic segmentation for pedestrian detection under the three sensor modalities. Despite some overfitting in the model during experimentation, it performed well in detecting people in test mode. Therefore, it is worth emphasizing that the focus of this work is to show that this method is feasible to use, as it works regardless of the size of the dataset, although a bigger dataset would be necessary to achieve more appropriate training. This method gives the advantage of detecting pedestrians as the human eye does, thereby resulting in less ambiguity. Additionally, this work has also proposed an extrinsic calibration matrix method for sensor alignment between radar and lidar based on singular value decomposition.
}, keywords = {autonomous driving, Convolutional Neural Networks, sensor calibration, sensor fusion}, doi = {10.3390/s23084167}, url = {https://www.mdpi.com/1424-8220/23/8/4167}, author = {Alfredo Ch{\'a}vez Plasencia and Pablo Garc{\'\i}a-G{\'o}mez and Eduardo Bernal P{\'e}rez and Gerard de-Mas-Gim{\'e}nez and Casas, J. and S. Royo} } @conference {cTarresa, title = {Sign Language Translation from Instructional Videos}, booktitle = {CVPR 2023 Women in Computer Vision Workshop}, year = {2023}, month = {04/2023}, publisher = {Computer Vision Foundation / IEEE}, organization = {Computer Vision Foundation / IEEE}, address = {Vancouver, Canada}, abstract = {The advances in automatic sign language translation (SLT) to spoken languages have been mostly benchmarked with datasets of limited size and restricted domains. Our work advances the state of the art by providing the first baseline results on How2Sign, a large and broad dataset. We train a Transformer over I3D video features, using the reduced BLEU as a reference metric for validation, instead of the widely used BLEU score. We report a result of 8.03 on the BLEU score, and publish the first open-source implementation of its kind to promote further advances.
} } @article {aFerrer-Ferrer23, title = {Simultaneous fruit detection and size estimation using multitask deep neural networks}, journal = {Biosystems Engineering}, year = {2023}, abstract = {The measurement of fruit size is of great interest to estimate the yield and predict the harvest resources in advance. This work proposes a novel technique for in-field apple detection and measurement based on Deep Neural Networks. The proposed framework was trained with RGB-D data and consists of an end-to-end multitask Deep Neural Network architecture specifically designed to perform the following tasks: 1) detection and segmentation of each fruit from its surroundings; 2) estimation of the diameter of each detected fruit. The methodology was tested with a total of 15335 annotated apples at different growth stages, with diameters varying from 27 mm to 95 mm. Fruit detection results reported an F1-score for apple detection of 0.88 and a mean absolute error of diameter estimation of 5.64 mm. These are state-of-the-art results with the additional advantages of: a) using an end-to-end multitask trainable network; b) an efficient and fast inference speed; and c) being based on RGB-D data which can be acquired with affordable depth cameras. In contrast, the main disadvantage is the need to annotate a large amount of data with fruit masks and diameter ground truth to train the model. Finally, a fruit visibility analysis showed an improvement in the prediction when limiting the measurement to apples above 65\% of visibility (mean absolute error of 5.09 mm). This suggests that future works should develop a method for automatically identifying the most visible apples and discard the prediction of highly occluded fruits.
}, keywords = {deep learning, Fruit measurement, Fruit visibility, Precision agriculture, Yield estimation}, doi = {https://doi.org/10.1016/j.biosystemseng.2023.07.010}, author = {Ferrer-Ferrer, M. and Ruiz-Hidalgo, J. and Gregorio, Eduard and Ver{\'o}nica Vilaplana and Morros, J.R. and Gen{\'e}-Mola, Jordi} } @conference {cCaselles, title = {SIRA: Relightable Avatars from a Single Image}, booktitle = {Winter Conference on Applications of Computer Vision (WACV)}, year = {2023}, abstract = {Recovering the geometry of a human head from a single image, while factorizing the materials and illumination is a severely ill-posed problem that requires prior information to be solved. Methods based on 3D Morphable Models (3DMM), and their combination with differentiable renderers, have shown promising results. However, the expressiveness of 3DMMs is limited, and they typically yield over-smoothed and identity-agnostic 3D shapes limited to the face region. Highly accurate full head reconstructions have recently been obtained with neural fields that parameterize the geometry using multilayer perceptrons. The versatility of these representations has also proved effective for disentangling geometry, materials and lighting. However, these methods require several tens of input images. In this paper, we introduce SIRA, a method which, from a single image, reconstructs human head avatars with high fidelity geometry and factorized lights and surface materials. Our key ingredients are two data-driven statistical models based on neural fields that resolve the ambiguities of single-view 3D surface reconstruction and appearance factorization. Experiments show that SIRA obtains state of the art results in 3D head reconstruction while at the same time it successfully disentangles the global illumination, and the diffuse and specular albedos. Furthermore, our reconstructions are amenable to physically-based appearance editing and head model relighting.
}, author = {Caselles, Pol and Ramon, Eduard and Garcia, Jaime and Xavier Gir{\'o}-i-Nieto and Moreno, Francesc and Triginer, Gil} } @article {xPerera22, title = {Ancestry-conditioned Generative Models for Genotyping}, year = {2022}, abstract = {Local ancestry inference (LAI) identifies the ancestry of each segment of an individual{\textquoteright}s genome, and it is a critical step in the analysis of human genomes, with applications ranging from pharmacogenomics and personalized medicine to improved detection of genetic associations.
New LAI techniques are appearing at a fast pace in both industry and academic research, and large datasets of human genomic sequences from the ancestries of interest are required to train those methods. Usually, such datasets are protected by privacy regulations, are proprietary, or are accessible only under restrictions due to their nature. An interesting way to overcome those difficulties is to generate data samples that are similar enough to real sequences from the ancestries of interest. A generalized model can be openly shared because it contains no real individual information.
Thus, we present a class-conditional Generative Adversarial Model and a Conditional Generative Moment-Matching Network intended to generate new realistic genotypes of a desired ancestry. In addition, we present a privacy mechanism that extracts features from the real data and uses them to generate new realistic genotypes.
Signed languages are complete and natural languages used as the first or preferred mode of communication by millions of people worldwide. However, they unfortunately continue to be marginalized languages. Designing, building, and evaluating models that work on sign languages presents compelling research challenges and requires interdisciplinary and collaborative efforts. The recent advances in Machine Learning (ML) and Artificial Intelligence (AI) have the power to enable better accessibility to sign language users and narrow down the existing communication barrier between the Deaf community and non-sign language users. However, recent AI-powered technologies still do not account for sign language in their pipelines. This is mainly because sign languages are visual languages that use manual and non-manual features to convey information and do not have a standard written form. Thus, the goal of this thesis is to contribute to the development of new technologies that account for sign language by creating large-scale multimodal resources suitable for training modern data-hungry machine learning models, and by developing automatic systems for computer vision tasks that aim at a better visual understanding of sign languages. In Part I, we introduce the How2Sign dataset, a large-scale collection of multimodal and multiview sign language videos in American Sign Language. In Part II, we contribute to the development of technologies that account for sign languages by presenting in Chapter 4 a framework called Spot-Align, based on sign spotting methods, to automatically annotate sign instances in continuous sign language. We further present the benefits of this framework and establish a baseline for the sign language recognition task on the How2Sign dataset. In addition, in Chapter 5 we benefit from the different annotations and modalities of How2Sign to explore sign language video retrieval by learning cross-modal embeddings. Later, in Chapter 6, we explore sign language video generation by applying Generative Adversarial Networks to the sign language domain and assess if and how well sign language users can understand automatically generated sign language videos by proposing an evaluation protocol based on How2Sign topics and English translation.
Complete digital pathology transformation for primary histopathological diagnosis is a challenging yet rewarding endeavor. Its advantages are clear with more efficient workflows, but there are many technical and functional difficulties to be faced. The Catalan Health Institute (ICS) has started its DigiPatICS project, aiming to deploy digital pathology in an integrative, holistic, and comprehensive way within a network of 8 hospitals, over 168 pathologists, and over 1 million slides each year. We describe the bidding process and the careful planning that was required, followed by swift implementation in stages. The purpose of the DigiPatICS project is to increase patient safety and quality of care, improving diagnosis and the efficiency of processes in the pathological anatomy departments of the ICS through process improvement, digital pathology, and artificial intelligence tools.
}, keywords = {artificial intelligence, computational pathology, deep learning, digital pathology, implementation, LIS, primary diagnosis, telepathology, workflow}, doi = {10.3390/diagnostics12040852}, url = {https://www.mdpi.com/2075-4418/12/4/852}, author = {Jordi Temprana-Salvador and Pau L{\'o}pez-Garc{\'\i}a and Josep Castellv{\'\i} Vives and Llu{\'\i}s de Haro and Eudald Ballesta and Matias Rojas Abusleme and Miquel Arrufat and Ferran Marques and Casas, J. and Carlos Gallego and Laura Pons and Jos{\'e} Luis Mate and Pedro Luis Fern{\'a}ndez and Eugeni L{\'o}pez-Bonet and Ramon Bosch and Salom{\'e} Mart{\'\i}nez and Santiago Ram{\'o}n y Cajal and Xavier Matias-Guiu} } @mastersthesis {xMohamed22, title = {Exploring Visual Representations for Sign Language Translation}, year = {2022}, abstract = {The Sign Language Translation (SLT) task has been addressed with multiple approaches in recent years. In this work we investigate the impact of using different types of visual sign language representations for SLT. For this investigation we use the state of the art in SLT, the Sign Language Transformers model. We compare the translation performance obtained with two types of body pose estimation models as skeleton extractors and with 2D CNN features trained on the test dataset. The latter perform best, and I3D features outperform the pose estimation-based ones.
}, author = {Maram A. Mohamed}, editor = {Laia Tarr{\'e}s and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dRamon22, title = {Few-shot 3D Reconstruction of Body Parts with Deep Neural Networks}, volume = {Excellent}, year = {2022}, month = {09/2022}, type = {Industrial}, abstract = {In this thesis, we study the problem of reconstructing objects from a concrete category in 3D when few images are available as input, i.e. less than 10. We apply our findings to digitalizing human body parts such as heads and torsos for medical applications. The first part of the thesis explores systems that rely on 3D Morphable Models. When approaching a concrete task, training such systems requires expensive manual hyper-parameter tuning of both the architecture and the loss, which is time-consuming. We focus on designing novel losses without hyperparameters and modular architectures that allow training models without tuning effort. We also aim at providing a fine alignment between the 3D space and the image space by estimating camera poses with a low re-projection error, which further improves the texturing process in 3D modelling applications and the rendering process in augmented reality applications. Our findings lead to systems that are very stable and that naturally scale to different scenes.
While 3D Morphable Models are fast and robust, they are still very limited in terms of accuracy and expressiveness, which might be prohibitive for applications that require high fidelity. A promising alternative to 3D Morphable Models are implicit functions, which in combination with differentiable rendering techniques have shown impressive results at reconstructing 3D surfaces. However, the latter require large sets of images at test time to obtain satisfactory results. In the second part of the thesis, we propose to use a probabilistic model that represents a distribution of implicit surfaces in combination with a differentiable renderer to reduce the number of images required at test time. The resulting 3D reconstruction system is highly accurate and allows reconstructing a wide variety of human head shapes when only 3 images are available.
The generation of synthetic genomic sequences using neural networks has potential to overcome privacy and data sharing restrictions and to mitigate potential bias within datasets due to under-representation of some population groups. However, there is no consensus on which architectures, training procedures, and evaluation metrics should be used when simulating single nucleotide polymorphism (SNP) sequences with neural networks. In this paper, we explore the use of Generative Moment Matching Networks (GMMNs) for SNP simulation, present some architectural and procedural changes to properly train the networks, and introduce an evaluation scheme to qualitatively and quantitatively assess the quality of the simulated sequences.
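A minimal sketch of a GMMN training loop under a Gaussian kernel (toy sizes and stand-in data; not the paper's architecture, kernel schedule, or evaluation scheme): the generator is trained to minimize the maximum mean discrepancy (MMD) between generated and real samples.

import torch

def mmd(x, y, bandwidth=1.0):
    def k(a, b):
        return torch.exp(-torch.cdist(a, b).pow(2) / (2 * bandwidth ** 2))
    return k(x, x).mean() + k(y, y).mean() - 2 * k(x, y).mean()

gen = torch.nn.Sequential(torch.nn.Linear(16, 64), torch.nn.ReLU(),
                          torch.nn.Linear(64, 100), torch.nn.Sigmoid())
opt = torch.optim.Adam(gen.parameters(), lr=1e-3)
real = torch.randint(0, 2, (128, 100)).float()   # stand-in for SNP sequences
for _ in range(100):
    fake = gen(torch.randn(128, 16))             # soft genotypes in (0, 1)
    loss = mmd(fake, real)                       # match moments instead of using a discriminator
    opt.zero_grad(); loss.backward(); opt.step()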
}, author = {Mas-Montserrat, Daniel and Perera, Maria and Barrab{\'e}s, M{\'\i}riam and Geleta, Margarita and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @mastersthesis {xDomenech, title = {Hiding Images in their Spoken Narratives}, year = {2022}, abstract = {Steganography is the technique of hiding secret data within an ordinary, non-secret, file or message in order to avoid its detection. Throughout our work, we study the case where the hidden secret data is an image and the non-secret data or cover signal is an audio signal. To this end, we use a recently proposed residual architecture operating on top of short-time discrete cosine transform (STDCT) audio spectrograms. In our work, we evaluate the above-mentioned residual steganography architecture with the Localized Narratives dataset, explore the feasibility of using short-time Fourier transform (STFT) audio spectrograms instead of STDCTs to improve the efficiency of the system, investigate the use of permuted hidden signals with the objective of spreading the audio corruption over the revealed images, apply averaged audio windows to improve quality, and test the system under real-world distortions.
}, author = {Teresa Domenech}, editor = {McGuinness, Kevin and Pons, Jordi and Xavier Gir{\'o}-i-Nieto} } @conference {cSchurholt22, title = {Hyper-Representations as Generative Models: Sampling Unseen Neural Network Weights}, booktitle = {NeurIPS 2022 - Neural Information Processing Systems}, year = {2022}, abstract = {Learning representations of neural network weights given a model zoo is an emerging and challenging area with many potential applications from model inspection, to neural architecture search or knowledge distillation. Recently, an autoencoder trained on a model zoo was able to learn a hyper-representation, which captures intrinsic and extrinsic properties of the models in the zoo. In this work, we extend hyper-representations for generative use to sample new model weights as pre-training. We propose layer-wise loss normalization, which we demonstrate is key to generating high-performing models, and a sampling method based on the empirical density of hyper-representations. The models generated using our methods are diverse, performant and capable of outperforming conventional baselines for transfer learning. Our results indicate the potential of knowledge aggregation from model zoos to new models via hyper-representations, thereby paving the way for novel research directions.
Introduction to Research, BSc Data Science and Engineering, Autumn 2021:
Predicting Dog Phenotypes from Genotypes
In this paper, we analyze dog genotypes {\textendash} positions of DNA sequences that often vary between different dogs {\textendash} in order to predict the corresponding phenotypes {\textendash} unique characteristics that result from different genetic code. More specifically, given chromosome data from a dog, we aim to predict its breed category, height, and weight. We explore a variety of linear and non-linear classification and regression techniques to accomplish these three tasks. We also investigate the use of a neural network (both in linear and non-linear modes) for breed classification and compare its performance to traditional statistical methods. We show that linear methods generally outperform or match the performance of non-linear methods for breed classification. However, the reverse case is true for height and weight regression. We also evaluate the results of all of these methods based on the number of input features used in the analysis and demonstrate that phenotypes can be predicted with as few as 0.5\% of the input features, and dog breeds can be classified with 50\% balanced accuracy with as few as 0.02\% of the full genomic sequences for our analysis.
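A toy version of the linear vs. non-linear comparison on synthetic SNP-like data (illustrative only; this is not the dog dataset or the study's exact pipeline):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.integers(0, 2, (300, 1000)).astype(float)   # SNP-like binary features
y = (X[:, :20].sum(axis=1) > 10).astype(int)        # additive signal, stand-in for a breed label

for name, clf in [("linear", LogisticRegression(max_iter=1000)),
                  ("non-linear", RandomForestClassifier(n_estimators=100))]:
    acc = cross_val_score(clf, X, y, cv=3, scoring="balanced_accuracy").mean()
    print(name, f"{acc:.2f}")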
MergeGenome: A Python-based Toolkit for Merging VCF files
A challenge of genomic studies is the lack of easy-to-access and properly formatted datasets. When having access to more than one dataset, it is often desirable to combine them; however, there is a lack of tools to duly merge genomic datasets without losing all non-matching features. To fill this gap, we present the MergeGenome toolkit, designed to integrate DNA sequences from two files in variant call format (VCF) while targeting data quality. MergeGenome is a robust pipeline of comprehensive steps to standardize nomenclature, remove ambiguities, correct flips, eliminate mismatches, select important features, and filter likely erroneous features (the latter with machine learning). MergeGenome is Python-based and relies on pre-existing software for manipulation and imputation of VCF data. We evaluate the result of merging two datasets of dog DNA sequences of dissimilar lengths and observe that genotype imputation with Beagle v5.1 usually fails for low-frequency alleles. As an alternative, we explore several multi-label machine learning classifiers. Although K-Nearest Neighbors achieves competitive results, none of the methods tried outperforms Beagle v5.1.
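One harmonization step of the kind the toolkit performs can be sketched as follows (a toy illustration, not MergeGenome's API): align variants shared by two datasets and correct strand flips by matching complemented REF/ALT pairs, discarding unresolvable mismatches.

COMP = str.maketrans("ACGT", "TGCA")

def harmonize(v1, v2):
    """v1, v2: dicts mapping (chrom, pos) -> (ref, alt). Returns merged, flipped, dropped keys."""
    merged, flipped, dropped = [], [], []
    for key in v1.keys() & v2.keys():
        ref1, alt1 = v1[key]
        ref2, alt2 = v2[key]
        if (ref1, alt1) == (ref2, alt2):
            merged.append(key)
        elif (ref1.translate(COMP), alt1.translate(COMP)) == (ref2, alt2):
            flipped.append(key)          # strand flip: complement and keep
        else:
            dropped.append(key)          # ambiguous mismatch: filter out
    return merged, flipped, dropped

m, f, d = harmonize({(1, 100): ("A", "G")}, {(1, 100): ("T", "C")})
print(m, f, d)   # -> [] [(1, 100)] []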
The present invention is directed, in general, to a method and a system to automatically transform an image using neural networks. More specifically, the invention relates to a controllable image generation through an image representation and several conditions using a conditional Neural Network.
The method comprises receiving, by a processing unit, at least one image and processing the received image to obtain an image representation thereof (i.e. an intermediate representation of the initial image that captures high level features and low level properties of the image and that is structured in an understandable way for a conditional Neural Network such as a deep generative Neural Network). The method also includes receiving, by an encoding unit, one or more references (e.g. other images, text, labels, combinations thereof, or even other data describing how the received image should be transformed) and encoding the received one or more references into one or more features, the latter being further provided to a conditional Neural Network as a condition(s). In addition, the method further applies the conditional Neural Network to transform the obtained image representation into a resulting conditioned image based on said condition(s).
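An illustrative sketch of the conditioning scheme the claim describes (not the patented system; the FiLM-style scale-and-shift below is one concrete choice among many): features encoded from a reference modulate the image representation before decoding.

import torch
import torch.nn as nn

class CondTransform(nn.Module):
    def __init__(self, img_dim=64, ref_dim=16):
        super().__init__()
        self.encode_img = nn.Linear(img_dim, 128)      # image -> intermediate representation
        self.encode_ref = nn.Linear(ref_dim, 2 * 128)  # reference -> condition features
        self.decode = nn.Linear(128, img_dim)

    def forward(self, img, ref):
        h = self.encode_img(img)
        scale, shift = self.encode_ref(ref).chunk(2, dim=-1)
        return self.decode(h * scale + shift)          # representation transformed by the condition

out = CondTransform()(torch.randn(1, 64), torch.randn(1, 16))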
}, issn = {21382176.2}, url = {https://patentscope.wipo.int/search/es/detail.jsf?docId=EP373278976}, author = {Terradas, R. and Pau Domingo and Grau, M. and Alarc{\'o}n, E. and Ruiz-Hidalgo, J.} } @conference {cSchurholt22a, title = {Model Zoos: A Dataset of Diverse Populations of Neural Network Models}, booktitle = {NeurIPS}, year = {2022}, month = {09/2022}, abstract = {In the last years, neural networks (NN) have evolved from laboratory environments to the state-of-the-art for many real-world problems. It was shown that NN models (i.e., their weights and biases) evolve on unique trajectories in weight space during training. Following, a population of such neural network models (referred to as a model zoo) would form structures in weight space. We think that the geometry, curvature and smoothness of these structures contain information about the state of training and can reveal latent properties of individual models. With such model zoos, one could investigate novel approaches for (i) model analysis, (ii) discovering unknown learning dynamics, (iii) learning rich representations of such populations, or (iv) exploiting the model zoos for generative modelling of NN weights and biases. Unfortunately, the lack of standardized model zoos and available benchmarks significantly increases the friction for further research about populations of NNs. With this work, we publish a novel dataset of model zoos containing systematically generated and diverse populations of NN models for further research. In total the proposed model zoo dataset is based on eight image datasets, consists of 27 model zoos trained with varying hyperparameter combinations and includes 50,360 unique NN models as well as their sparsified twins, resulting in over 3,844,360 collected model states. In addition to the model zoo data, we provide an in-depth analysis of the zoos and provide benchmarks for multiple downstream tasks. The dataset can be found at www.modelzoos.cc.
}, author = {Sch{\"u}rholt, Konstantin and Taskiran, Diyar and Knyazev, Boris and Xavier Gir{\'o}-i-Nieto and Borth, Damian} } @article {xBudria22, title = {Multimodal 3D Hand Pose Enhancement for Sign Language}, year = {2022}, abstract = {The application of recent deep learning breakthroughs to the domain of sign language has yielded very promising results. However, sign language processing systems depend on large amounts of labeled high-quality data to work properly. Current hand pose estimation methods are often unreliable and do not always produce estimations with enough quality. To mitigate this issue, we explore the applicability of the novel Body2Hands method for the obtainment of high-quality hand pose estimations.
Steganography comprises the mechanics of hiding data in a host media that may be publicly available. While previous works focused on unimodal setups (e.g., hiding images in images, or hiding audio in audio), PixInWav targets the multimodal case of hiding images in audio. To this end, we propose a novel residual architecture operating on top of short-time discrete cosine transform (STDCT) audio spectrograms. Among our results, we find that the residual audio steganography setup we propose allows independent encoding of the hidden image from the host audio without compromising quality. Accordingly, while previous works require both host and hidden signals to hide a signal, PixInWav can encode images offline --- which can be later hidden, in a residual fashion, into any audio signal. Finally, we test our scheme in a lab setting to transmit images over airwaves from a loudspeaker to a microphone, verifying our theoretical insights and obtaining promising results.
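The residual embedding idea can be sketched in a transform domain (a noiseless toy with stand-in signals; not PixInWav's network): the hidden signal is encoded independently of the host and added as a small residual to the host's spectrum.

import numpy as np
from scipy.fft import dct, idct

rng = np.random.default_rng(0)
host = rng.normal(size=1024)                  # stand-in audio frame
secret = rng.normal(size=1024)                # stand-in encoded image signal

alpha = 0.05                                  # embedding strength
container = idct(dct(host, norm='ortho') + alpha * secret, norm='ortho')

# naive reveal, assuming the host spectrum is known or well estimated
revealed = (dct(container, norm='ortho') - dct(host, norm='ortho')) / alpha
print(np.allclose(revealed, secret))          # -> True in this noiseless toy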
Deep Learning Barcelona Symposium 2022
Presentation from the early stages of the project (January 2021):
In this paper, we analyze dog genotypes - positions of DNA sequences that often vary between different dogs - in order to predict the corresponding phenotypes - unique characteristics that result from different genetic code. More specifically, given chromosome data from a dog, we aim to predict its breed category, height, and weight. We explore a variety of linear and non-linear classification and regression techniques to accomplish these three tasks. We show that linear methods generally outperform or match non-linear methods for breed classification. However, the reverse case is true for height and weight regression. We also evaluate the performance of all of these methods based on the number of input features used in the analysis. We conduct experiments using different fractions of the full genomic sequences and demonstrate that phenotypes can be predicted with as few as 0.5\% of the input features available for our analysis, and dog breeds can be classified with 50\% balanced accuracy with as few as 0.02\% of the features.
Deep learning (DL) models have provided state-of-the-art performance in various medical imaging benchmarking challenges, including the Brain Tumor Segmentation (BraTS) challenges. However, the task of focal pathology multi-compartment segmentation (e.g., tumor and lesion sub-regions) is particularly challenging, and potential errors hinder translating DL models into clinical workflows. Quantifying the reliability of DL model predictions in the form of uncertainties could enable clinical review of the most uncertain regions, thereby building trust and paving the way toward clinical translation. Several uncertainty estimation methods have recently been introduced for DL medical image segmentation tasks. Developing scores to evaluate and compare the performance of uncertainty measures will assist the end-user in making more informed decisions. In this study, we explore and evaluate a score developed during the BraTS 2019 and BraTS 2020 task on uncertainty quantification (QU-BraTS) and designed to assess and rank uncertainty estimates for brain tumor multi-compartment segmentation. This score (1) rewards uncertainty estimates that produce high confidence in correct assertions and those that assign low confidence levels at incorrect assertions, and (2) penalizes uncertainty measures that lead to a higher percentage of under-confident correct assertions. We further benchmark the segmentation uncertainties generated by 14 independent participating teams of QU-BraTS 2020, all of which also participated in the main BraTS segmentation task. Overall, our findings confirm the importance and complementary value that uncertainty estimates provide to segmentation algorithms, highlighting the need for uncertainty quantification in medical image analyses. Finally, in favor of transparency and reproducibility, our evaluation code is made publicly available at:\ this https URL.
}, url = {https://www.melba-journal.org/papers/2022:026.html}, author = {Raghav Mehta and Angelos Filos and Ujjwal Baid and Laura Mora and Ver{\'o}nica Vilaplana and Christos Davatzikos and Bjoern Menze and Spyridon Bakas and Yarin Gal and Tal Arbel} } @article {cBellver-Bueno20, title = {RefVOS: A Closer Look at Referring Expressions for Video Object Segmentation}, journal = {Multimedia Tools and Applications}, year = {2022}, month = {07/2022}, abstract = {The task of video object segmentation with referring expressions (language-guided VOS) is to, given a linguistic phrase and a video, generate binary masks for the object to which the phrase refers. Our work argues that existing benchmarks used for this task are mainly composed of trivial cases, in which referents can be identified with simple phrases. Our analysis relies on a new categorization of the phrases in the DAVIS-2017 and Actor-Action datasets into trivial and non-trivial REs, with the non-trivial REs annotated with seven RE semantic categories. We leverage this data to analyze the results of RefVOS, a novel neural network that obtains competitive results for the task of language-guided image segmentation and state of the art results for language-guided VOS. Our study indicates that the major challenges for the task are related to understanding motion and static actions.
Motivation
Local ancestry inference (LAI) is the high resolution prediction of ancestry labels along a DNA sequence. LAI is important in the study of human history and migrations, and it is beginning to play a role in precision medicine applications including ancestry-adjusted genome-wide association studies (GWASs) and polygenic risk scores (PRSs). Existing LAI models do not generalize well between species, chromosomes or even ancestry groups, requiring re-training for each different setting. Furthermore, such methods can lack interpretability, which is an important element in each of these applications.
Results
We present SALAI-Net, a portable statistical LAI method that can be applied on any set of species and ancestries (species-agnostic), requiring only haplotype data and no other biological parameters. Inspired by identity by descent methods, SALAI-Net estimates population labels for each segment of DNA by performing a reference matching approach, which leads to an interpretable and fast technique. We benchmark our models on whole-genome data of humans and we test these models{\textquoteright} ability to generalize to dog breeds when trained on human data. SALAI-Net outperforms previous methods in terms of balanced accuracy, while generalizing between different settings, species and datasets. Moreover, it is up to two orders of magnitude faster and uses considerably less RAM than competing methods.
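A toy sketch of window-wise reference matching in the spirit of the method (heavily simplified; names, window size and matching rule are illustrative): label each window of a query haplotype with the ancestry of its best-matching reference haplotype.

import numpy as np

def window_lai(query, refs, labels, window=50):
    """query: (n_snps,), refs: (n_refs, n_snps), labels: (n_refs,)."""
    out = []
    for s in range(0, query.shape[0], window):
        matches = (refs[:, s:s + window] == query[s:s + window]).sum(axis=1)
        out.append(labels[matches.argmax()])   # ancestry of the closest reference segment
    return out

rng = np.random.default_rng(0)
refs = rng.integers(0, 2, (20, 200))
labels = np.array([0] * 10 + [1] * 10)
query = refs[3].copy()                          # descends from population 0
print(window_lai(query, refs, labels))          # -> mostly zeros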
Introduction to Research, BSc Data Science and Engineering, Autumn 2021:
The end goal of Sign Language Translation is to either produce spoken sentences from sign videos or generate sign videos from their corresponding written transcriptions. This task has been addressed with multiple approaches in recent years, and it has been shown that taking advantage of sign gloss representations substantially improves the model{\textquoteright}s performance. Therefore, in this work we replicate the state-of-the-art Transformer-based approach for the task and evaluate it on the multimodal American Sign Language How2Sign dataset. Furthermore, we provide baseline recognition and translation results that represent a starting point for further research on the topic. In addition, we provide a new sentence-based alignment for the How2Sign videos, as their previous alignment was with speech, which we have used to tackle the Sign Language Translation task properly.
Systems that can efficiently search collections of sign language videos have been highlighted as a useful application of sign language technology. However, the problem of searching videos beyond individual keywords has received limited attention in the literature. To address this gap, in this work we introduce the task of sign language retrieval with free-form textual queries: given a written query (e.g., a sentence) and a large collection of sign language videos, the objective is to find the signing video in the collection that best matches the written query. We propose to tackle this task by learning cross-modal embeddings on the recently introduced large-scale How2Sign dataset of American Sign Language (ASL). We identify that a key bottleneck in the performance of the system is the quality of the sign video embedding which suffers from a scarcity of labeled training data. We, therefore, propose SPOT-ALIGN, a framework for interleaving iterative rounds of sign spotting and feature alignment to expand the scope and scale of available training data. We validate the effectiveness of SPOT-ALIGN for learning a robust sign video embedding through improvements in both sign recognition and the proposed video retrieval task.
Sign Language Translation is an open problem whose goal is to generate written sentences from sign videos. In recent years, research in this field has mainly addressed the Sign Language Recognition task, which consists in understanding the input signs and transcribing them into sequences of annotations. Current studies show that taking advantage of the latter task helps to learn meaningful representations and can be seen as an intermediate step towards the end goal of translation.
In this work, we present a method to generate automatic pseudo-glosses from written sentences, which can work as a replacement for real glosses. This addresses the issue of gloss collection, as glosses need to be manually annotated, which is extremely costly.
Furthermore, we introduce a new implementation, built on Fairseq, of the Transformer-based approach introduced by Camgoz et al., which is jointly trained to solve the recognition and translation tasks. We also provide new baseline results for both implementations: first, on the Phoenix dataset, we present results that outperform the ones reported by Camgoz et al., and, second, on the How2Sign dataset, we present the first results on the translation task. These results can serve as a baseline for future research in the field.
Biological networks have gained considerable attention within the Deep Learning community because of the promising framework of Graph Neural Networks (GNN), neural models that operate on complex networks. In the context of neuroimaging, GNNs have successfully been employed for functional MRI processing, but their application to ROI-level structural MRI (sMRI) remains mostly unexplored. In this work we analyze the implementation of these geometric models with sMRI by building graphs of ROIs (ROI graphs) using tools from the Graph Signal Processing literature and evaluate their performance in a downstream supervised task, age prediction. We first make a qualitative and quantitative comparison of the resulting networks obtained with common graph topology learning strategies. In a second stage, we train GNN-based models for brain age prediction. Since the order of every ROI graph is exactly the same and each vertex is an entity by itself (a ROI), we evaluate whether including ROI information during message-passing or global pooling operations is beneficial, and compare the performance of GNNs against a Fully-Connected Neural Network baseline. The results show that ROI-level information is needed during the global pooling operation in order to achieve competitive results. However, no relevant improvement has been detected when it is incorporated during message passing. These models achieve a MAE of 4.27 on hold-out test data, a performance very similar to the baseline, suggesting that the inductive bias included with the obtained graph connectivity is relevant and useful for reducing the dimensionality of the problem.
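A minimal plain-PyTorch sketch of the design question studied above (illustrative, not the paper's architecture): one A·X·W message-passing step over a fixed ROI graph, with ROI identity injected at the global pooling stage before regressing age.

import torch
import torch.nn as nn

class ROIGNN(nn.Module):
    def __init__(self, n_rois, in_dim, hid=32):
        super().__init__()
        self.W = nn.Linear(in_dim, hid)
        self.roi_emb = nn.Embedding(n_rois, hid)    # vertex (ROI) identity
        self.head = nn.Linear(hid, 1)

    def forward(self, A, X):                         # A: (R, R) adjacency, X: (B, R, F) features
        H = torch.relu(A @ self.W(X))                # one message-passing step
        H = H + self.roi_emb.weight                  # ROI information added at pooling time
        return self.head(H.mean(dim=1)).squeeze(-1)  # global mean pool -> predicted age

A = torch.eye(10)                                    # toy ROI connectivity
model = ROIGNN(n_rois=10, in_dim=4)
age_pred = model(A, torch.randn(2, 10, 4))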
}, author = {Oscar Pina and Irene Cumplido-Mayoral and Raffaele Cacciaglia and Jos{\'e} Mar{\'\i}a Gonz{\'a}lez-de-Ech{\'a}varri and Juan D. Gispert and Ver{\'o}nica Vilaplana} } @conference {cTarres22, title = {Tackling Low-Resourced Sign Language Translation: UPC at WMT-SLT 22}, booktitle = {EMNLP 2022 Seventh Conference on Machine Translation (WMT22)}, year = {2022}, month = {10/2022}, abstract = {This paper describes the system developed at the Universitat Polit{\`e}cnica de Catalunya for the Workshop on Machine Translation 2022 Sign Language Translation Task, in particular, for the sign-to-text direction. We use a Transformer model implemented with the Fairseq modeling toolkit. We have experimented with the vocabulary size, data augmentation techniques and pretraining the model with the PHOENIX-14T dataset. Our system obtains a 0.50 BLEU score for the test set, improving the organizers{\textquoteright} baseline by 0.38 BLEU. We note the poor results of both the baseline and our system, and thus the limited reliability of our findings.
}, author = {Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @mastersthesis {xBudria, title = {Topic Detection from Sign Language Videos}, year = {2022}, abstract = {Significant progress has been made recently on challenging tasks in automatic sign language understanding, such as sign language recognition, translation and production. However, most works have focused on datasets with relatively few samples, short recordings and limited vocabulary and signing space. Moreover, they have neglected the less complex task of sign language video classification, whose analogue in spoken language, namely text classification, has been widely addressed. For this reason, in this work, we introduce the novel task of sign language topic detection. We base our experiments on How2Sign, a large-scale video dataset spanning multiple semantic domains. The contributions of this thesis are twofold. First, we present the first study of sign language topic detection in continuous sign language videos, providing baseline models for this task. Second, we perform a comparison between different visual features and deep learning architectures that are commonly employed in the sign language understanding literature. We implement our modelling pipelines in Fairseq, a PyTorch library that is extensively used in the spoken language community. Modular, extensible code for running our experiments is provided alongside this thesis.
}, author = {{\'A}lvaro Budria}, editor = {Laia Tarr{\'e}s and Xavier Gir{\'o}-i-Nieto} } @conference {cBudria, title = {Topic Detection in Continuous Sign Language Videos}, booktitle = {Accessibility, Vision, and Autonomy Meet (AVA) CVPR Workshop}, year = {2022}, month = {06/2022}, abstract = {Significant progress has been made recently on challenging tasks in automatic sign language understanding, such as sign language recognition, translation and production. However, these works have focused on datasets with relatively few samples, short recordings and limited vocabulary and signing space. In this work, we introduce the novel task of sign language topic detection. We base our experiments on How2Sign, a large-scale video dataset spanning multiple semantic domains. We provide strong baselines for the task of topic detection, and present a comparison between different visual features commonly used in the domain of sign language.
}, author = {{\'A}lvaro Budria and Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Moreno, Francesc and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @article {xGiro-i-Nieto22, title = {Towards Sign Language Translation and Production}, year = {2022}, abstract = {Machine translation and computer vision have greatly benefited from the advances in deep learning. A large and diverse amount of textual and visual data has been used to train neural networks, whether in a supervised or self-supervised manner. Nevertheless, the convergence of the two fields in sign language translation and production still poses multiple open challenges, like the low video resources, limitations in hand pose estimation, or 3D spatial grounding from poses.
This paper aims at improving the quality of a dataset that contains multiple sequences of 3D poses extracted from American Sign Language videos. Each pose consists of 147 points with three coordinates each. We propose an algorithm able to correct missing points as well as to add some constraints such as the length of the bones. To prove the quality of the algorithm{\textquoteright}s outcome, we evaluate the task of lifting 2D to 3D poses with a deep learning model trained on raw data, and another one trained with the preprocessed data.
Deep neural networks have revolutionized the data analytics scene by improving results in several diverse benchmarks with the same recipe: learning feature representations from data. These achievements have raised interest across multiple scientific fields, especially in those where large amounts of data and computation are available. This change of paradigm in data analytics has several ethical and economic implications that are driving large investments, political debates and resounding press coverage under the generic label of artificial intelligence (AI). This talk will present the fundamentals of deep learning through the classic example of image classification, and point at how the same principle has been adopted for several other tasks. Finally, some of the forthcoming potentials and risks of AI will be pointed out.
In this study we show the power of variational autoencoders (VAEs) for a variety of tasks relating to the interpretation and compression of genomic data. The unsupervised setting allows for the detection and learning of granular population structure and the inference of new informative latent factors, opening up an avenue for applications in dimensionality reduction, data simulation, population classification, imputation, and lossless genomic data compression. The latent spaces of VAEs are able to capture and represent clearly differentiated Gaussian-like clusters of similar genetic composition on a fine scale with a relatively small number of SNPs as input. Furthermore, sequences can be decomposed into latent representations and reconstruction errors (residuals), yielding a sparse representation that provides a means for efficient lossless compression.
Identifying genetic clusters can be important when performing genome-wide association studies and provides an alternative to self-reported ethnic labels, which are culturally constructed and vary according to the location and individual. A variety of unsupervised dimensionality reduction methods have been explored in the past for such applications, including PCA, MDS, t-SNE, and UMAP. Our proposed VAE can represent the population structure as a Gaussian-distributed continuous multi-dimensional representation and as classification probabilities providing flexible and interpretable population descriptors.
We train our VAE method with several worldwide whole genome datasets from both humans and canids and evaluate the performance of the different proposed applications with networks with and without ancestry conditioning. Our experiments show that different population groups have significantly differentiated compression ratios and classification accuracies. Additionally, we analyze the entropy of the SNP data, noting its effect on compression across populations and connect these patterns to historical migrations and ancestral relationships.
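To make the compression scheme concrete, here is a minimal sketch (our illustration, with encoder/decoder standing in for the trained VAE halves) of lossless coding of a binary SNP vector as a latent code plus the sparse positions where the reconstruction errs.

    import numpy as np

    def compress(snps, encoder, decoder):
        # snps: (n_snps,) uint8 vector of 0/1 genotypes.
        z = encoder(snps)                            # latent representation
        recon = (decoder(z) > 0.5).astype(np.uint8)  # hard reconstruction
        residual = snps ^ recon                      # 1s only where the VAE erred
        return z, np.flatnonzero(residual)           # latent + sparse error positions

    def decompress(z, error_positions, decoder):
        recon = (decoder(z) > 0.5).astype(np.uint8)
        recon[error_positions] ^= 1                  # flip mispredicted sites back
        return recon

The better the VAE reconstructs a given population, the sparser its residual, which is one way to read the differentiated compression ratios reported above.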
Video from the related BSc thesis at UPC Data Science Engineering (2021).
Pre-training Reinforcement Learning (RL) agents in a task-agnostic manner has shown promising results. However, previous works still struggle to learn and discover meaningful skills in high-dimensional state-spaces. We approach the problem by leveraging unsupervised skill discovery and self-supervised learning of state representations. In our work, we learn a compact latent representation by making use of variational or contrastive techniques. We demonstrate that both allow learning a set of basic navigation skills by maximizing an information theoretic objective. We assess our method in Minecraft 3D maps with different complexities. Our results show that representations and conditioned policies learned from pixels are enough for toy examples, but do not scale to realistic and complex maps. We also explore alternative rewards and input observations to overcome these limitations.
Deep Neural Networks have been used to tackle a wide variety of tasks achieving great performance. However, there is still a lack of knowledge of how the training of these models converges and how their weights relate to their properties. In this thesis we investigate the structure of the weight space and try to disentangle its properties. Attention mechanisms are introduced to capture relations among neurons{\textquoteright} weights that help in weight reconstruction, hyper-parameter classification and accuracy prediction. Our approach further has the potential to work with variable input sizes, allowing for different network widths, depths or even architecture types.
}, author = {Caselles, Pol}, editor = {Sch{\"u}rholt, Konstantin and Borth, Damian and Xavier Gir{\'o}-i-Nieto} } @conference {cTarres, title = {GAN-based Image Colourisation with Feature Reconstruction Loss}, booktitle = {CVPR 2021 Women in Computer Vision Workshop}, year = {2021}, month = {06/2021}, address = {Virtual}, abstract = {Image colourisation is the task of adding plausible colour to grayscale images. This transformation requires obtaining a three-dimensional colour-valued mapping from a real-valued grayscale image, which leads to an underdetermined problem because the grayscale semantics and texture provide cues for multiple possible colour mappings. The goal of image colourisation is not to recover the ground truth colour, but to produce a colourisation that is perceived as natural by a human observer. Our work takes as a baseline a scheme based on an end-to-end trainable convolutional neural network (CNN) trained with a smooth L1 loss to predict the $ab$ channels of a colour image given the $L$ channel. We introduce an extra perceptual reconstruction loss during training to improve the capabilities of the adversarial model that we adopt as a baseline.
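A hedged sketch of the training objective described above (the loss weights and the frozen feature extractor are our illustrative assumptions): smooth L1 on the predicted ab channels, a feature (perceptual) reconstruction term, and an adversarial term.

    import torch
    import torch.nn.functional as F

    def generator_loss(ab_pred, ab_true, feats, disc_logits,
                       w_pix=1.0, w_feat=0.1, w_adv=0.01):
        # feats: frozen pretrained network used for perceptual features.
        pix = F.smooth_l1_loss(ab_pred, ab_true)
        perc = F.mse_loss(feats(ab_pred), feats(ab_true))   # feature reconstruction
        adv = F.binary_cross_entropy_with_logits(
            disc_logits, torch.ones_like(disc_logits))      # fool the discriminator
        return w_pix * pix + w_feat * perc + w_adv * adv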
}, author = {Laia Tarr{\'e}s and G{\'o}rriz, Marc and Xavier Gir{\'o}-i-Nieto and Mrak, Marta} } @mastersthesis {xTarres21, title = {GAN-based Image Colourisation with Feature Reconstruction Loss}, year = {2021}, abstract = {Automatic image colourisation is a complex and ambiguous task due to having multiple correct solutions. Previous approaches have resulted in desaturated results unless relying on significant user interaction. In this thesis we study the state of the art for colourisation and we propose an automatic colourisation approach based on generative adversarial networks that incorporates a feature reconstruction loss during training. The generative network is framed in an adversarial model that learns how to colourise by incorporating a perceptual understanding of colour. Qualitative and quantitative results show the capacity of the proposed method to colourise images in a realistic way, boosting the colourfulness and perceptual realism of previous GAN-based methodologies. We also study and propose a second approach that incorporates segmentation information in the GAN framework and obtain quantitative and qualitative results.
}, author = {Laia Tarr{\'e}s}, editor = {Mrak, Marta and Xavier Gir{\'o}-i-Nieto} } @conference {cRamonb, title = {H3D-Net: Few-Shot High-Fidelity 3D Head Reconstruction}, booktitle = {International Conference on Computer Vision (ICCV)}, year = {2021}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Virtual}, abstract = {Recent learning approaches that implicitly represent surface geometry using coordinate-based neural representations have shown impressive results in the problem of multi-view 3D reconstruction. The effectiveness of these techniques is, however, subject to the availability of a large number (several tens) of input views of the scene, and computationally demanding optimizations. In this paper, we tackle these limitations for the specific problem of few-shot full 3D head reconstruction, by endowing coordinate-based representations with a probabilistic shape prior that enables faster convergence and better generalization when using few input images (down to three). First, we learn a shape model of 3D heads from thousands of incomplete raw scans using implicit representations. At test time, we jointly overfit two coordinate-based neural networks to the scene, one modeling the geometry and another estimating the surface radiance, using implicit differentiable rendering. We devise a two-stage optimization strategy in which the learned prior is used to initialize and constrain the geometry during an initial optimization phase. Then, the prior is unfrozen and fine-tuned to the scene. By doing this, we achieve high-fidelity head reconstructions, including hair and shoulders, and with a high level of detail that consistently outperforms both state-of-the-art 3D Morphable Models methods in the few-shot scenario, and non-parametric methods when large sets of views are available.
Sign Language is the primary means of communication for the majority of the Deaf community. One of the factors that has hindered the progress in the areas of automatic sign language recognition, generation, and translation is the absence of large annotated datasets, especially continuous sign language datasets, i.e. datasets that are annotated and segmented at the sentence or utterance level. Towards this end, in this work we introduce How2Sign, a work-in-progress dataset collection. How2Sign consists of a parallel corpus of 80 hours of sign language videos (collected with multi-view RGB and depth sensor data) with corresponding speech transcriptions and gloss annotations. In addition, a three-hour subset was further recorded in a geodesic dome setup using hundreds of cameras and sensors, which enables detailed 3D reconstruction and pose estimation and paves the way for vision systems to understand the 3D geometry of sign language.
Image and video segmentation are central tasks within the computer vision field. Nevertheless, deep learning solutions for segmentation typically rely on pixel-level annotations, which are very costly to collect. Likewise, some segmentation systems require human interaction at inference time, which involves effort for the end-user. In this thesis, we look into diverse supervision scenarios for image and video object segmentation. We discern between supervision when learning the model, i.e., which type of annotations are used during training, and supervision at inference, namely which kind of human input is required when running the system. We target models that require low forms of supervision.
In the first part of the thesis we present a novel recurrent architecture for video object segmentation that is end-to-end trainable in a fully-supervised setup, and that does not require any post-processing step, i.e., the output of the model directly solves the addressed task. The second part of the thesis aims at lowering the annotation cost, in terms of labeling time, needed to train image segmentation models. We explore semi-supervised pipelines and show results when a very limited budget is available. The third part of the dissertation attempts to alleviate the supervision required by semi-automatic systems at inference time. Particularly, we focus on semi-supervised video object segmentation, which typically requires generating a binary mask for each instance to be tracked. In contrast, we present a model for language-guided video object segmentation, which identifies the object to segment with a natural language expression. We study current benchmarks, propose a novel categorization of referring expressions for video, and identify the main challenges posed by the video task.
Evaluation committee: Zeynep Akata (University of T{\"u}bingen), Francesc Moreno-Noguer (UPC IRI-CSIC) and Yannis Kalantidis (Naver Labs Europe).
These slides review the research of our lab since 2016 on applied deep learning, starting from our participation in the TRECVID Instance Search 2014, moving into video analysis with CNN+RNN architectures, and our current efforts in sign language translation and production.
Perception of the environment is an essential requirement for the fields of autonomous vehicles and robotics. Consequently, LiDAR imaging sensors have become crucial for such applications due to their 3D geometry sensing capability. However, autonomous systems demand high amounts of data to make reliable decisions, so many different sensors are often combined. In this context, we present a multimodal imaging system based on a solid-state LiDAR combined with three other imaging sensors that provides multimodal information with low parallax fusion error.
}, keywords = {artificial intelligence, autonomous navigation, computer Vision, enhanced perception, robotics, sensor fusion, solid-state LiDAR}, author = {Pablo Garc{\'\i}a-G{\'o}mez and Noel Rodrigo and Jordi Riu and Casas, J. and S. Royo} } @conference {cGirbau21, title = {Multiple Object Tracking with Mixture Density Networks for Trajectory Estimation}, booktitle = {CVPR 2021 Robust Video Scene Understanding: Tracking and Video Segmentation (RVSU) Workshop}, year = {2021}, abstract = {Multiple object tracking faces several challenges that may be alleviated with trajectory information. Knowing the posterior locations of an object helps disambiguate and solve situations such as occlusions, re-identification, and identity switching. In this work, we show that trajectory estimation can become a key factor for tracking, and present TrajE, a trajectory estimator based on recurrent mixture density networks, as a generic module that can be added to existing object trackers. To provide several trajectory hypotheses, our method uses beam search. Also, relying on the same estimated trajectory, we propose to reconstruct a track after an occlusion occurs. We integrate TrajE into two state of the art tracking algorithms, CenterTrack [63] and Tracktor [3]. Their respective performances in the MOTChallenge 2017 test set are boosted 6.3 and 0.3 points in MOTA score, and 1.8 and 3.1 in IDF1, setting a new state of the art for the CenterTrack+TrajE configuration.
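As an illustration of a recurrent mixture density head for trajectories (a sketch under our own assumptions, not TrajE's code): the network outputs mixture weights, means and log-deviations for the next 2D position, and is trained with the mixture negative log-likelihood.

    import math
    import torch

    def mdn_nll(pi_logits, mu, log_sigma, xy):
        # pi_logits: (B, K); mu, log_sigma: (B, K, 2); xy: (B, 2) target position.
        log_pi = torch.log_softmax(pi_logits, dim=-1)
        var = (2 * log_sigma).exp()
        # Log-density of xy under each diagonal-covariance Gaussian component.
        log_prob = (-0.5 * (xy.unsqueeze(1) - mu) ** 2 / var
                    - log_sigma - 0.5 * math.log(2 * math.pi)).sum(-1)
        return -torch.logsumexp(log_pi + log_prob, dim=-1).mean()

Beam search over samples from such a mixture is what yields the multiple trajectory hypotheses mentioned above.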
Characterizing the genetic substructure of large cohorts has become increasingly important as genetic association and prediction studies are extended to massive, increasingly diverse, biobanks. ADMIXTURE and STRUCTURE are widely used unsupervised clustering algorithms for characterizing such ancestral genetic structure. These methods decompose individual genomes into fractional cluster assignments with each cluster representing a vector of DNA marker frequencies. The assignments, and clusters, provide an interpretable representation for geneticists to describe population substructure at the sample level. However, with the rapidly increasing size of population biobanks and the growing numbers of variants genotyped (or sequenced) per sample, such traditional methods become computationally intractable. Furthermore, multiple runs with different hyperparameters are required to properly depict the population clustering using these traditional methods, increasing the computational burden. This can lead to days of compute. In this work we present Neural ADMIXTURE, a neural network autoencoder that follows the same modeling assumptions as ADMIXTURE, providing similar (or better) clustering, while reducing the compute time by orders of magnitude. In addition, this network can include multiple outputs, providing results equivalent to running the original ADMIXTURE algorithm many times with different numbers of clusters. These models can also be stored, allowing later cluster assignment to be performed with linear computational time.
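A minimal sketch of the modeling assumption (our illustration): the bottleneck outputs per-sample cluster fractions Q on the simplex, and the decoder weights act as per-cluster allele frequencies P, so the reconstruction is the expected genotype Q P.

    import torch
    import torch.nn as nn

    class AdmixtureAE(nn.Module):
        def __init__(self, n_snps, k):
            super().__init__()
            self.encoder = nn.Sequential(nn.Linear(n_snps, 64), nn.ReLU(),
                                         nn.Linear(64, k))
            self.P = nn.Parameter(torch.rand(k, n_snps))  # cluster allele frequencies

        def forward(self, x):
            q = torch.softmax(self.encoder(x), dim=-1)    # fractional assignments on the simplex
            return q, q @ self.P.clamp(0, 1)              # expected genotype reconstruction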
}, author = {Dominguez, Albert and Mas-Montserrat, Daniel and Bustamante, Carlos and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @article {xGiro-i-Nieto21a, title = {Object Detection with Deep Learning}, year = {2021}, abstract = {Object detection in computer vision is the task of localizing and categorizing object instances in still images. This talk reviews the main approaches for solving the task with deep neural networks, following a historical perspective that starts from the image classification task.
Multiple object tracking is a broadly used task in multiple applications, from bioengineering to security. In this paper we propose a variation of RVOS by adding the center estimation of detected instances, by means of a second head in the decoder which is assigned the task of detecting the corresponding object{\textquoteright}s bounding box arithmetic center. We have trained the model using three variants of the cross-entropy loss, which has been adapted to tackle the class imbalance caused by the fact that the center of an object is represented by only one pixel of the image, and have obtained some promising results.
}, author = {Escobar, Miquel}, editor = {Girbau, A. and Ventura, C. and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cPodlipnik21, title = {Personalized medicine in melanoma patients aided by artificial intelligence}, booktitle = {Clinical Translation of Medical Image Computing and Computer Assisted Interventions (CLINICCAI) Workshop at MICCAI}, year = {2021}, month = {09/2021}, abstract = {The 8th Edition of the American Joint Committee on Cancer (AJCC) staging system [1] is the current standard for classifying patients into prognostic and treatment groups. This classification is used to predict the evolution of the patient, and therefore the treatment actions provided to the individual. However, patients at the same stage behave differently, indicating that the current classification system is often insufficient to provide a customized prognosis for each patient [2]. It is, therefore, necessary to improve patient classification into prognostic groups. Furthermore, patients{\textquoteright} systemic and surgical treatments often involve significant toxicities and morbidities that impact their quality of life (i.e., sentinel node biopsy is not needed for 80\% of the melanoma patients, and 50\% of patients do not benefit from adjuvant treatment) [3]. Therefore, melanoma patients should benefit from a more precise risk estimation.
We create a survival dataset for melanoma risk estimation and train survival XGBoost algorithms [4] to predict the mortality, relapse, and metastasis risk. We compare their performance to the AJCC 2018 risk stratification system. Furthermore, we train classifiers to predict the risk of a positive lymph node biopsy and distant metastasis on melanoma patients and compare the performance of the proposed system to the clinical practice.
}, author = {Sebastian Podlipnik and Carlos Hernandez and Anil Kiroglu and Sergio Garc{\'\i}a and Joan Ficapal and Julio Burgos and Neus Calbet and Susana Puig and Josep Malvehy and Ver{\'o}nica Vilaplana and Marc Combalia} } @conference {cNieto, title = {PiCoEDL: Discovery and Learning of Minecraft Navigation Goals from Pixels and Coordinates}, booktitle = {CVPR 2021 Embodied AI Workshop}, year = {2021}, month = {06/2021}, abstract = {Defining a reward function in Reinforcement Learning (RL) is not always possible or can be very costly. For this reason, there is a great interest in training agents in a task-agnostic manner making use of intrinsic motivations and unsupervised techniques. Due to the complexity of learning useful behaviours in pixel-based domains, the results obtained in RL are still far from the remarkable results obtained in domains such as computer vision and natural language processing. We hypothesize that RL agents will also benefit from unsupervised pre-training with no extrinsic rewards, analogously to how humans mostly learn, especially in the early stages of life. Our main contribution is the deployment of the Explore, Discover and Learn (EDL) paradigm for unsupervised learning to the pixel space. In particular, our work focuses on the MineRL environment, where the observation of the agent is represented by: (a) its spatial coordinates in the Minecraft virtual world, and (b) an image from an egocentric viewpoint.
}, author = {Nieto, Juan Jos{\'e} and Creus, Roger and Xavier Gir{\'o}-i-Nieto} } @conference {cCreus, title = {PixelEDL: Unsupervised Skill Discovery and Learning from Pixels}, booktitle = {CVPR 2021 Embodied AI Workshop}, year = {2021}, month = {06/2021}, abstract = {We tackle embodied visual navigation in a task-agnostic set-up by putting the focus on the unsupervised discovery of skills (or options) that provide a good coverage of states. Our approach intersects with empowerment: we address the reward-free skill discovery and learning tasks to discover {\textquotedblleft}what{\textquotedblright} can be done in an environment and {\textquotedblleft}how{\textquotedblright}. For this reason, we adopt the existing Explore, Discover and Learn (EDL) paradigm, tested only in toy example mazes, and extend it to pixel-based state representations available for embodied AI agents.
}, author = {Creus, Roger and Nieto, Juan Jos{\'e} and Xavier Gir{\'o}-i-Nieto} } @conference {cMayoral21b, title = {Prediction of amyloid pathology in cognitively unimpaired individuals using structural MRI}, booktitle = {Alzheimer{\textquoteright}s Association International Conference}, year = {2021}, month = {07/2021}, author = {Irene Cumplido-Mayoral and Silvia Ingala and Luigi Lorenzini and Alle Meije Wink and Sven Haller and Jose Luis Molinuevo and Robin Wolz and Alessandro Palombit and Adam J Schwarz and Ga{\"e}l Chetelat and Pierre Payoux and Pablo Martinez-Lage and Giovanni Frisoni and Nick C Fox and Craig W Ritchie and Joanna M Wardlaw and Adam Waldman and Frederik Barkhof and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @article {aPuig-Sitjes21, title = {Real-time detection of overloads on the plasma-facing components of Wendelstein 7-X}, journal = {Applied sciences (Basel)}, volume = {11}, year = {2021}, month = {12/2021}, chapter = {1}, issn = {2076-3417}, doi = {10.3390/app112411969}, url = {http://hdl.handle.net/2117/361558}, author = {Puig-Sitjes, A. and Jakubowski, M. and Naujoks, D. and Gao, Y. and Drewelow, P. and Niemann, H. and Felinger, J. and Casas, J. and Salembier, P. and Clemente, R.} } @conference {cManas, title = {Seasonal Contrast: Unsupervised Pre-Training from Uncurated Remote Sensing Data}, booktitle = {International Conference in Computer Vision (ICCV)}, year = {2021}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Virtual}, abstract = {Remote sensing and automatic earth monitoring are key to solve global-scale challenges such as disaster prevention, land use monitoring, or tackling climate change. Although there exist vast amounts of remote sensing data, most of it remains unlabeled and thus inaccessible for supervised learning algorithms. Transfer learning approaches can reduce the data requirements of deep learning algorithms. However, most of these methods are pre-trained on ImageNet and their generalization to remote sensing imagery is not guaranteed due to the domain gap. In this work, we propose Seasonal Contrast (SeCo), an effective pipeline to leverage unlabeled data for in-domain pre-training of remote sensing representations. The SeCo pipeline is composed of two parts. First, a principled procedure to gather large-scale, unlabeled and uncurated remote sensing datasets containing images from multiple Earth locations at different timestamps. Second, a self-supervised algorithm that takes advantage of time and position invariance to learn transferable representations for remote sensing applications. We empirically show that models trained with SeCo achieve better performance than their ImageNet pre-trained counterparts and state-of-the-art self-supervised learning methods on multiple downstream tasks. The datasets and models in SeCo will be made public to facilitate transfer learning and enable rapid progress in remote sensing applications.
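The self-supervised part of such a pipeline can be pictured with a standard InfoNCE-style contrastive loss, sketched here under our own assumptions: embeddings of the same location at different seasons act as positives against the rest of the batch.

    import torch
    import torch.nn.functional as F

    def info_nce(anchor, positive, temperature=0.07):
        # anchor, positive: (B, d) embeddings of the same locations at two timestamps.
        a = F.normalize(anchor, dim=-1)
        p = F.normalize(positive, dim=-1)
        logits = a @ p.t() / temperature        # (B, B) similarity matrix
        targets = torch.arange(a.size(0))       # matching pairs lie on the diagonal
        return F.cross_entropy(logits, targets)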
Machine translation and computer vision have greatly benefited from the advances in deep learning. Large and diverse amounts of textual and visual data have been used to train neural networks, whether in a supervised or self-supervised manner. Nevertheless, the convergence of the two fields in sign language translation and production still poses multiple open challenges, like the low video resources, limitations in hand pose estimation, or 3D spatial grounding from poses. This talk will present these challenges and the How2Sign dataset recorded at CMU in collaboration with UPC, BSC, Gallaudet University and Facebook.
In steady-state fusion devices like Wendelstein 7-X (W7-X), the active control of heat loads is mandatory to attain long-plasma operation. An intelligent feedback control system that mitigates the risk of overheating is required to avoid a premature plasma termination by the safety system. To keep the plasma within the safe operational limits of the plasma facing components, the feedback control system must be informed of the ongoing thermal events and their evolution in time. Then it can take effective countermeasures to prevent the thermal events from reaching a critical point. These countermeasures may include reducing the heating power, changing the strike-line position or inducing detachment. With reaction times of the order of a hundred milliseconds, a fully automated real-time image analysis algorithm is required.
In this work, we present a spatio-temporal algorithm to detect, classify and track the thermal events observed by the thermography diagnostic on the plasma facing components of W7-X. The system detects and distinguishes between strike-lines and isolated hot spots, as well as leading edges. The segmentation of the strike-line is especially challenging at W7-X. Since W7-X is a 3-dimensional helically-shaped stellarator equipped with 10 island divertors, the strike-lines have a complex heat load distribution with a high dynamic range. The use of morphological tools and, in particular, of the Max-tree transform allows us to segment the thermal events in a hierarchical way, preserving the inclusion relationship between different events, like hot spots and leading edges embedded in the strike-line structure. The thermal events are segmented for each frame and tracked over time in order to forecast their temporal evolution and to evaluate their risk. To this end, a spatio-temporal graph is built and spatio-temporal connected components are used to track the thermal events across the sequence frames. The spatio-temporal components in the graph are used to label the events in the sequence, preserving temporal coherence and minimizing discontinuities, solving splits and merges. Spatio-temporal descriptors are then generated for each event to assess their risk.
The algorithm was tested offline on the infrared data acquired during the last operation phase OP1.2 and the results are presented here. Further work will follow to accelerate the code with GPUs to reach real-time processing and be ready to protect the water-cooled plasma facing components in the forthcoming operation phase OP2.
Local Ancestry Inference (LAI) is the high-resolution prediction of ancestry (African, European, ...) across a DNA sequence. LAI is becoming increasingly important in DNA sequence analysis for the study of human ancestry and migrations. It is also necessary for polygenic risk score research (prediction of traits and disease risk). Most current LAI models are built for a specific species, set of ancestries and chromosomes, hence a new model needs to be trained from scratch for every slightly different setting. This creates a big barrier for research and industry to shift across different LAI scenarios. In this thesis we present SALAI-Net, the first statistical method for LAI with a reference panel that can be used on any set of species and ancestries (species-agnostic). Loter is the state of the art in species-agnostic models with a reference panel, and is based on a dynamic programming algorithm. However, it is slow and does not perform very well in small reference panel settings. Our model is based on a novel hand-engineered template matching block followed by a convolutional smoothing filter optimized to minimize cross-entropy loss on a training dataset. The right choice of DNA sequence encoding, similarity features and architecture is what makes our model able to generalize well to unseen ancestries, species, and different chromosomes. We benchmark our models on whole genome data of humans and test the ability to generalize to dog species when trained on human data. Our models outperform the state-of-the-art method by a large margin in terms of accuracy, tested in different settings and datasets. Moreover, our method is up to two orders of magnitude faster. Our model also shows close to no generalization gap when switching between species.
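To make the two-stage design concrete, here is a hedged sketch (our simplification; the window size, smoothing width and the max over the panel are illustrative choices, not SALAI-Net's exact architecture): per-window matching scores against each ancestry in the reference panel, followed by a 1D convolutional smoother.

    import torch
    import torch.nn.functional as F

    def lai_predict(query, panel, labels, n_anc, window=500, smooth_k=75):
        # query: (L,) 0/1 SNPs; panel: (R, L) reference haplotypes; labels: (R,).
        match = (panel == query).float()                       # (R, L) agreement
        win = match.unfold(1, window, window).mean(-1)         # (R, W) window scores
        scores = torch.stack([win[labels == a].max(0).values   # best panel match
                              for a in range(n_anc)])          # (A, W)
        kernel = torch.ones(n_anc, 1, smooth_k) / smooth_k
        smooth = F.conv1d(scores.unsqueeze(0), kernel,
                          padding=smooth_k // 2, groups=n_anc)[0]
        return smooth.argmax(0)                                # ancestry per window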
}, author = {Oriol, Benet}, editor = {Mas-Montserrat, Daniel and Ioannidis, Alexander G. and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dGirbau21, title = {Sports broadcasting and multiple object tracking with deep learning methods}, volume = {PhD}, year = {2021}, month = {03/2021}, type = {Industrial}, abstract = {Less than a decade ago, deep learning techniques started to dominate many different fields, revolutionizing the possibilities of artificial intelligence. Seeing their potential, industrial sectors started to invest in applying such technologies as key components of the company strategy. This thesis has been developed in an industrial context, in AutomaticTV. The main focus along this period has been the transfer of knowledge and know-how between academia and industry, the development of tools to exploit this knowledge, the exploration of new techniques for future challenges, and, from an academic research perspective, contributions to the multiple object tracking problem.
The first part of the thesis is devoted to the introduction of deep learning technologies to AutomaticTV, a company dedicated to automatic sports analysis and broadcasting, and the development of tools and tasks that surround the application.
The second part of this thesis introduces the contributions to the multiple object tracking challenge. We present TrajE, a trajectory estimator based on mixture density networks and beam search, used to boost the performance of existing multiple object trackers, and introduce an occlusion reconstruction step using the estimated trajectory information. By adding TrajE to an existing multiple object tracker, we boost its performance by 6.3 and 1.8 points in MOTA and IDF1 scores respectively, becoming the new state of the art in the MOTChallenge dataset.
}, author = {Girbau, A.}, editor = {Rius, Ignasi and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cKazakos, title = {SynthRef: Generation of Synthetic Referring Expressions for Object Segmentation}, booktitle = {NAACL Visually Grounded Interaction and Language (ViGIL) Workshop}, year = {2021}, month = {06/2021}, address = {Virtual}, abstract = {Recent advances in deep learning have brought significant progress in visual grounding tasks such as language-guided video object segmentation. However, collecting large datasets for these tasks is expensive in terms of annotation time, which represents a bottleneck. To this end, in our work we propose a novel method, namely SynthRef, for generating synthetic referring expressions for an image (or video frame), and we also present and disseminate the first large-scale dataset with synthetic referring expressions for video object segmentation. Our experiments demonstrate that by training with our synthetic dataset one can improve the ability of a model to generalize across different datasets, without any additional annotation cost. Moreover, our formulation allows its application to any object detection or segmentation dataset.
This degree thesis is part of a project from the Image Group at UPC that focuses on sign language translation using deep learning technologies. This thesis builds on top of an existing database called How2Sign, which contains more than 83 hours of sign language translation videos. This database has some textual annotations aligned to a front RGB camera. The same scenes are also captured by a side RGB camera and a front RGB-D camera. These three cameras are not synchronized, so it is necessary to align the segments annotated on the front RGB camera with the other cameras. This thesis explores a solution based on the cross-correlation operator. Our work processes the coordinates of the joints of the subject appearing in the videos, rather than operating on pixels as in image or video processing. The first part of this thesis investigates the properties of the cross-correlation function by locating short video segments within a long recording based on automatically extracted 2D human poses. The experiments studied the impact of adding noise. The second part applies cross-correlation to try to align two videos of the same scene recorded with different cameras from different points of view.
In this study we show the power of variational autoencoders (VAEs) for a variety of tasks relating to the interpretation and compression of genomic data. The unsupervised setting allows for the detection and learning of granular population structure and the inference of new informative latent factors, opening up an avenue for applications in dimensionality reduction, data simulation, population classification, imputation, and lossless genomic data compression. The latent spaces of VAEs are able to capture and represent clearly differentiated Gaussian-like clusters of similar genetic composition on a fine scale with a relatively small number of Single Nucleotide Polymorphisms (SNPs) as input. Furthermore, sequences can be decomposed into latent representations and reconstruction errors (residuals), yielding a sparse representation that provides a means for efficient lossless compression.
Identifying genetic clusters can be important when performing genome-wide association studies and provides an alternative to self-reported ethnic labels, which are culturally constructed and vary according to the location and individual. A variety of unsupervised dimensionality reduction methods have been explored in the past for such applications, including PCA, MDS, t-SNE, and UMAP. Our proposed VAE can represent the population structure as a Gaussian-distributed continuous multi-dimensional representation and as classification probabilities providing flexible and interpretable population descriptors.
We train our VAE method with several worldwide whole genome datasets from both humans and canids, and evaluate the performance of the different proposed applications with networks with and without ancestry conditioning. Our experiments show that different population groups have significantly differentiated compression ratios and classification accuracies. Additionally, we analyze the entropy of the SNP data, noting its effect on compression across populations and connect these patterns to historical migrations and ancestral relationships.
This work focuses on the self-acquisition of the fundamental task-agnostic knowledge available within an environment. The aim is to discover and learn baseline representations and behaviours that can later be useful for solving embodied visual navigation downstream tasks. Specifically, the presented approach extends the idea of the "Explore, Discover and Learn" (EDL) paradigm to the pixel domain. This way, this work is centered on the representations and behaviours that can be learnt by an agent that only integrates an image capture sensor. Both the agents and the environment used in this work run on the Habitat AI simulator, developed by Facebook AI, which renders photorealistic 3D views of apartment interiors.
Pre-training Reinforcement Learning agents in a task-agnostic manner has shown promising results. However, previous works still struggle in learning and discovering meaningful skills in high-dimensional state-spaces, such as pixel-spaces. We approach the problem by leveraging unsupervised skill discovery and self-supervised learning of state representations. In our work, we learn a compact latent representation by making use of variational and contrastive techniques. We demonstrate that both enable RL agents to learn a set of basic navigation skills by maximizing an information theoretic objective. We assess our method in Minecraft 3D pixel maps with different complexities. Our results show that representations and conditioned policies learned from pixels are enough for toy examples, but do not scale to realistic and complex maps. To overcome these limitations, we explore alternative input observations such as the relative position of the agent along with the raw pixels.
}, author = {Nieto, Juan Jos{\'e} and Creus, Roger and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xGraneroa, title = {2D to 3D body pose estimation for sign language with Deep Learning}, year = {2020}, abstract = {This project aims at tackling the challenge of using 3D poses for Sign Language translation or animation by transforming 2D pose datasets into 3D ones. The goal is, using a 3D dataset of American Sign Language, to train a deep neural network that predicts the depth coordinates of the skeleton keypoints from 2D coordinates. Specifically, we explore a Long Short-Term Memory network, an architecture broadly used for sequence-to-sequence tasks. The conclusions of this report are that, although some of the results are good enough to be used for actual 3D SL annotation, the majority lack the precision to do so and vary too much with the dataset split. It is also concluded that the solutions approached here could be improved by adding some regularization methods, more powerful hardware to run better experiments, and new input features such as keypoint visibility.
}, author = {P{\'e}rez-Granero, Pol}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xEscur, title = {Attention-based multi-view 3D reconstruction models}, year = {2020}, abstract = {The attention mechanism has been successfully used in multiple tasks in the fields of Computer Vision and Natural Language Processing, but has never been applied to 3D reconstruction problems. In this work, we explore the potential of attention in a multi-view 3D face reconstruction pipeline. On one hand, we use spatial attention when extracting the features of the input images, taking advantage of the interpretability it provides us. This allows us to validate the proper behaviour of the model. On the other hand, we want to make this multi-view setup invariant to the order of the input views. To do so, instead of concatenating the features of the different views, we use part of the Transformer architecture as a symmetric merging function, which is based on a multi-head self-attention mechanism, showing an improvement in performance.
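The order-invariance argument can be sketched as follows (our illustration, with assumed names): self-attention without positional encodings is permutation-equivariant, so mean-pooling the attended per-view features yields a code that does not depend on the order of the views.

    import torch
    import torch.nn as nn

    class SymmetricMerge(nn.Module):
        def __init__(self, dim, heads=4):
            super().__init__()
            # dim is assumed divisible by heads.
            self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

        def forward(self, views):              # views: (B, n_views, dim)
            fused, _ = self.attn(views, views, views)
            return fused.mean(dim=1)           # (B, dim), invariant to view order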
}, author = {Escur, Janna}, editor = {Ramon, Eduard and Xavier Gir{\'o}-i-Nieto} } @conference {xCaros19, title = {Automatic Reminiscence Therapy for Dementia}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)}, year = {2020}, month = {06/2020}, publisher = {ACM}, organization = {ACM}, address = {Dublin, Ireland}, abstract = {With people living longer than ever, the number of cases with dementia such as Alzheimer{\textquoteright}s disease increases steadily. It affects more than 46 million people worldwide, and it is estimated that in 2050 more than 100 million will be affected. While there are no effective treatments for these terminal diseases, therapies such as reminiscence, which stimulates memories from the past, are recommended. Currently, reminiscence therapy takes place in care homes and is guided by a therapist or a carer. In this work, we present an AI-based solution to automate the reminiscence therapy, which consists of a dialogue system that uses photos as input to generate questions. We run a usability case study with patients diagnosed with mild cognitive impairment that shows they found the system very entertaining and challenging. Overall, this paper presents how reminiscence therapy can be automated by using machine learning, and deployed to smartphones and laptops, making the therapy more accessible to every person affected by dementia. (demo paper)
Recent works have addressed the generation of human poses represented by 2D/3D coordinates of human joints for sign language. We use state-of-the-art Deep Learning models for motion transfer and evaluate them on How2Sign, an American Sign Language dataset, to generate videos of signers performing sign language given a 2D pose skeleton. We evaluate the generated videos quantitatively and qualitatively, showing that the current models are not enough to generate adequate videos for Sign Language due to the lack of detail in the hands.
Video object segmentation can be understood as a sequence-to-sequence task that can benefit from curriculum learning strategies for better and faster training of deep neural networks. This work explores different schedule sampling and frame skipping variations to significantly improve the performance of a recurrent architecture. Our results on the car class of the KITTI-MOTS challenge indicate that, surprisingly, an inverse schedule sampling is a better option than a classic forward one, and that a progressive skipping of frames during training is beneficial, but only when training with the ground truth masks instead of the predicted ones.
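The forward/inverse distinction boils down to how the probability of feeding the ground-truth mask (rather than the model's own prediction) evolves over training; a minimal sketch with an assumed linear schedule:

    import random

    def feed_ground_truth(step, total_steps, inverse=True):
        # Classic (forward) schedule sampling decays the ground-truth probability;
        # the inverse schedule grows it, which worked better in these experiments.
        p = step / total_steps if inverse else 1.0 - step / total_steps
        return random.random() < p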
Video object segmentation (VOS) is a computer vision task that aims at determining the pixels of an object of interest along a video sequence. This thesis explores different curriculum learning strategies for a deep neural network trained to solve this task.
Curriculum learning defines a methodology where the training data are not randomly presented to the model; instead, they are organized in a meaningful way. Simple concepts are first presented and gradually become more complex. Four different curriculum strategies are explored: schedule sampling, frame skipping, the effect of temporal and spatial recurrence variations, and loss penalization by the object{\textquoteright}s area.
This work focuses on the RVOS neural architecture, a recurrent architecture originally tested on the DAVIS and YouTube-VOS datasets for one-shot video object segmentation, over the cars class of the KITTI-MOTS dataset. Even though this architecture is a fast solution for the VOS task, the model struggles with the KITTI-MOTS dataset, whose videos are more crowded and challenging.
For the schedule sampling curriculum, both the classic and inverse implementations are evaluated. Results show that inverse schedule sampling strategies improve the model{\textquoteright}s performance more than the classic forward approach. The different frame skipping schemes are also beneficial, but only when training with the ground truth masks instead of the predicted ones. Lastly, the curriculums that vary the temporal and spatial recurrence or penalize the loss by the object{\textquoteright}s area have both shown poor performance.
These results show how greatly curriculum learning strategies affect the performance of recurrent neural networks. Moreover, the results on the inverse schedule sampling and frame skipping strategies invite further exploration of these schemes to exploit their benefits.
Deep learning has revolutionized the field of artificial intelligence in the past decade. Although the development of these techniques spans over several years, the recent advent of deep learning is explained by an increased availability of data and compute that have unlocked the potential of deep neural networks. They have become ubiquitous in domains such as natural language processing, computer vision, speech processing, and control, where enough training data is available. Recent years have seen continuous progress driven by ever-growing neural networks that benefited from large amounts of data and computing power. This thesis is motivated by the observation that scale is one of the key factors driving progress in deep learning research, and aims at devising deep learning methods that scale gracefully with the available data and compute. We narrow down this scope into two main research directions. The first of them is concerned with designing hardware-aware methods which can make the most of the computing resources in current high performance computing facilities. We then study bottlenecks preventing existing methods from scaling up as more data becomes available, providing solutions that contribute towards enabling training of more complex models. This dissertation studies the aforementioned research questions for two different learning paradigms, each with its own algorithmic and computational characteristics. The first part of this thesis studies the paradigm where the model needs to learn from a collection of examples, extracting as much information as possible from the given data. The second part is concerned with training agents that learn by interacting with a simulated environment, which introduces unique challenges such as efficient exploration and simulation.
Deep neural networks have achieved outstanding results in various applications such as vision, language, audio, speech, or reinforcement learning. These powerful function approximators typically require large amounts of data to be trained, which poses a challenge in the usual case where little labeled data is available. In recent years, multiple solutions have been proposed to alleviate this problem, based on the concept of self-supervised learning, which can be understood as a specific case of unsupervised learning. This talk will cover its basic principles and provide examples in the field of multimedia.
Knowledge Graphs (KG) are becoming essential to organize, represent and store the world{\textquoteright}s knowledge, but they still rely heavily on humanly-curated structured data. Information Extraction (IE) tasks, like disambiguating entities and relations from unstructured text, are key to automate KG population. However, Natural Language Processing (NLP) methods alone can not guarantee the validity of the facts extracted and may introduce erroneous information into the KG. This work presents an end-to-end system that combines Semantic Knowledge and Validation techniques with NLP methods to provide KG population of novel facts from clustered news events. The contributions of this paper are two-fold: First, we present a novel method for including entity-type knowledge into a Relation Extraction model, improving F1-Score over the baseline with the TACRED and TypeRE datasets. Second, we increase the precision by adding data validation on top of the Relation Extraction method. These two contributions are combined in an industrial pipeline for automatic KG population over aggregated news, demonstrating increased data validity when performing online learning from unstructured web data. Finally, the TypeRE and AggregatedNewsRE datasets built to benchmark these results are also published to foster future research in this field.
}, keywords = {Data Validation, Knowledge Graph, Relation Extraction}, author = {Fern{\`a}ndez, D{\`e}lia and Rimmek, Joan Marco and Espadaler, Joan and Garolera, Blai and Barja, Adri{\`a} and Codina, Marc and Sastre, Marc and Xavier Gir{\'o}-i-Nieto and Riveiro, Juan Carlos and Bou-Balust, Elisenda} } @conference {cCamposb, title = {Explore, Discover and Learn: Unsupervised Discovery of State-Covering Skills}, booktitle = {International Conference on Machine Learning (ICML) 2020}, year = {2020}, month = {07/2020}, abstract = {Acquiring abilities in the absence of a task-oriented reward function is at the frontier of reinforcement learning research. This problem has been studied through the lens of empowerment, which draws a connection between option discovery and information theory. Information-theoretic skill discovery methods have garnered much interest from the community, but little research has been conducted in understanding their limitations. Through theoretical analysis and empirical evidence, we show that existing algorithms suffer from a common limitation -- they discover options that provide a poor coverage of the state space. In light of this, we propose {\textquoteright}Explore, Discover and Learn{\textquoteright} (EDL), an alternative approach to information-theoretic skill discovery. Crucially, EDL optimizes the same information-theoretic objective derived from the empowerment literature, but addresses the optimization problem using different machinery. We perform an extensive evaluation of skill discovery methods on controlled environments and show that EDL offers significant advantages, such as overcoming the coverage problem, reducing the dependence of learned skills on the initial state, and allowing the user to define a prior over which behaviors should be learned.
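For reference, the objective shared by these skill discovery methods is the mutual information between states S and latent skills Z; a standard variational lower bound (our rendering of the formulation common in the empowerment literature, with q a learned decoder) is:

    I(S;Z) = H(S) - H(S \mid Z) \;\ge\; H(S) + \mathbb{E}_{z \sim p(z),\, s \sim \pi_z}\left[ \log q(s \mid z) \right]

so that, once the skill prior p(z) is fixed, maximizing the expected log q(s|z) can serve as an intrinsic reward for the skill-conditioned policy.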
The development of remote fruit detection systems able to identify and 3D locate fruits provides opportunities to improve the efficiency of agriculture management. Most of the current fruit detection systems are based on 2D image analysis. Although the use of 3D sensors is emerging, precise 3D fruit location is still a pending issue. This work presents a new methodology for fruit detection and 3D location consisting of: (1) 2D fruit detection and segmentation using Mask R-CNN instance segmentation neural network; (2) 3D point cloud generation of detected apples using structure-from-motion (SfM) photogrammetry; (3) projection of 2D image detections onto 3D space; (4) false positives removal using a trained support vector machine. This methodology was tested on 11 Fuji apple trees containing a total of 1455 apples. Results showed that, by combining instance segmentation with SfM the system performance increased from an F1-score of 0.816 (2D fruit detection) to 0.881 (3D fruit detection and location) with respect to the total amount of fruits. The main advantages of this methodology are the reduced number of false positives and the higher detection rate, while the main disadvantage is the high processing time required for SfM, which makes it presently unsuitable for real-time work. From these results, it can be concluded that the combination of instance segmentation and SfM provides high performance fruit detection with high 3D data precision. The dataset has been made publicly available and an interactive visualization of fruit detection results is accessible at http://www.grap.udl.cat/documents/photogrammetry_fruit_detection.html
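Step (3) of the methodology, projecting 2D detections onto the 3D point cloud, can be sketched with a standard pinhole model (our illustration; function and variable names are assumptions, not the authors' code):

    import numpy as np

    def points_in_detection(points, K, R, t, box):
        # points: (N, 3) world coordinates; K: (3, 3) intrinsics;
        # R, t: camera extrinsics; box: (xmin, ymin, xmax, ymax) detection.
        cam = points @ R.T + t                  # world -> camera frame
        uvw = cam @ K.T                         # pinhole projection
        uv = uvw[:, :2] / uvw[:, 2:3]           # perspective division
        xmin, ymin, xmax, ymax = box
        inside = ((uv[:, 0] >= xmin) & (uv[:, 0] <= xmax) &
                  (uv[:, 1] >= ymin) & (uv[:, 1] <= ymax) & (cam[:, 2] > 0))
        return points[inside]                   # 3D points backing the detection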
}, keywords = {Fruit detection, Fruit location, Mask R-CNN, Structure-from-motion, Terrestrial remote sensing}, issn = {ISSN: 0168-1699}, doi = {https://doi.org/10.1016/j.compag.2019.105165}, url = {https://doi.org/10.1016/j.compag.2019.105165}, author = {Gen{\'e}-Mola, Jordi and Sanz, Ricardo and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Gregorio, Eduard} } @article {aRey-Arena20, title = {FuCiTNet: Improving the generalization of deep learning networks by the fusion of learned class-inherent transformations}, journal = {Information Fusion}, volume = {63}, year = {2020}, month = {10/2020}, chapter = {188}, abstract = {It is widely known that very small datasets produce overfitting in Deep Neural Networks (DNNs), i.e., the network becomes highly biased to the data it has been trained on. This issue is often alleviated using transfer learning, regularization techniques and/or data augmentation. This work presents a new approach, independent but complementary to the previously mentioned techniques, for improving the generalization of DNNs on very small datasets in which the involved classes share many visual features. The proposed model, called FuCiTNet (Fusion Class inherent Transformations Network), inspired by GANs, creates as many generators as classes in the problem. Each generator, k, learns the transformations that bring the input image into the k-class domain. We introduce a classification loss in the generators to drive the learning of specific k-class transformations. Our experiments demonstrate that the proposed transformations improve the generalization of the classification model in three diverse datasets.
}, doi = {10.1016/j.inffus.2020.06.015}, author = {Rey-Arena, M. and Guirado, E. and Tabik, S. and Ruiz-Hidalgo, J.} } @article {aGene-Mola20, title = {Fuji-SfM dataset: A collection of annotated images and point clouds for Fuji apple detection and location using structure-from-motion photogrammetry}, journal = {Data in Brief}, year = {2020}, month = {06/2020}, keywords = {Fruit detection, Mask R-CNN, Photogrammetry, Structure-from-motion, Terrestrial remote sensing, Yield mapping, Yield prediction}, doi = {https://doi.org/10.1016/j.dib.2020.105591}, author = {Gen{\'e}-Mola, Jordi and Sanz, Ricardo and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Gregorio, Eduard} } @mastersthesis {xKazakos20, title = {Generation of Synthetic Referring Expressions for Object Segmentation in Videos}, year = {2020}, abstract = {Integrating computer vision with natural language processing has achieved significant progress over the last years owing to the continuous evolution of deep learning. A novel vision and language task, which is tackled in the present Master thesis, is referring video object segmentation, in which a language query defines which instance to segment from a video sequence. One of the biggest challenges for this task is the lack of relatively large annotated datasets since a tremendous amount of time and human effort is required for annotation. Moreover, existing datasets suffer from poor quality annotations in the sense that approximately one out of ten language expressions fails to uniquely describe the target object.
The purpose of the present Master thesis is to address these challenges by proposing a novel method for generating synthetic referring expressions for an image (video frame). This method produces synthetic referring expressions by using only the ground-truth annotations of the objects as well as their attributes, which are detected by a state-of-the-art object detection deep neural network. One of the advantages of the proposed method is that its formulation allows its application to any object detection or segmentation dataset.
By using the proposed method, the first large-scale dataset with synthetic referring expressions for video object segmentation is created, based on an existing large benchmark dataset for video instance segmentation. A statistical analysis and comparison of the created synthetic dataset with existing ones is also provided in the present Master thesis.
The conducted experiments on three different datasets used for referring video object segmentation prove the efficiency of the generated synthetic data. More specifically, the obtained results demonstrate that by pre-training a deep neural network with the proposed synthetic dataset one can improve the ability of the network to generalize across different datasets, without any additional annotation cost.
}, author = {Kazakos, Ioannis}, editor = {Xavier Gir{\'o}-i-Nieto} } @article {aGarcia-Gomez20, title = {Geometric Model and Calibration Method for a Solid-State LiDAR}, journal = {Sensors}, volume = {20}, year = {2020}, month = {05/2020}, pages = {2898}, abstract = {This paper presents a novel calibration method for solid-state LiDAR devices based on a geometrical description of their scanning system, which has variable angular resolution. Determining this distortion across the entire Field-of-View of the system yields accurate and precise measurements which enable it to be combined with other sensors. On the one hand, the geometrical model is formulated using the well-known Snell{\textquoteright}s law and the intrinsic optical assembly of the system, whereas on the other hand the proposed method describes the scanned scenario with an intuitive camera-like approach relating pixel locations with scanning directions. Simulations and experimental results show that the model fits with real devices and the calibration procedure accurately maps their variant resolution so undistorted representations of the observed scenario can be provided. Thus, the calibration method proposed in this work is applicable and valid for existing scanning systems, improving their precision and accuracy by an order of magnitude.
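A toy sketch of the camera-like back-projection idea, mapping a pixel location to a scanning direction with a pinhole-style intrinsic matrix (the intrinsics are hypothetical, and the paper's full model additionally accounts for Snell's law and the optical assembly, which this simplified version omits):

import numpy as np

def pixel_to_direction(K, u, v):
    """Map a pixel location to a unit scanning direction using a
    pinhole-style intrinsic matrix K (refraction effects omitted)."""
    ray = np.linalg.inv(K) @ np.array([u, v, 1.0])
    return ray / np.linalg.norm(ray)

K = np.array([[500.0, 0.0, 320.0],   # hypothetical focal length / principal point
              [0.0, 500.0, 240.0],
              [0.0, 0.0, 1.0]])
direction = pixel_to_direction(K, 100, 80)
range_m = 12.3                        # measured range for that pixel
point_xyz = range_m * direction       # undistorted 3D point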
}, keywords = {solid-state LiDAR; LiDAR calibration; distortion correction; FOV mapping}, issn = {1424-8220}, doi = {10.3390/s20102898}, url = {https://www.mdpi.com/1424-8220/20/10/2898}, author = {Pablo Garc{\'\i}a-G{\'o}mez and S. Royo and Noel Rodrigo and Casas, J.} } @article {9103248, title = {Grounded Sequence to Sequence Transduction}, journal = {IEEE Journal of Selected Topics in Signal Processing}, volume = {14}, year = {2020}, month = {05/2020}, pages = {577-591}, abstract = {Speech recognition and machine translation have made major progress over the past decades, providing practical systems to map one language sequence to another. Although multiple modalities such as sound and video are becoming increasingly available, the state-of-the-art systems are inherently unimodal, in the sense that they take a single modality {\textemdash} either speech or text {\textemdash} as input. Evidence from human learning suggests that additional modalities can provide disambiguating signals crucial for many language tasks. In this article, we describe the How2 dataset, a large, open-domain collection of videos with transcriptions and their translations. We then show how this single dataset can be used to develop systems for a variety of language tasks and present a number of models meant as starting points. Across tasks, we find that building multimodal architectures that perform better than their unimodal counterpart remains a challenge. This leaves plenty of room for the exploration of more advanced solutions that fully exploit the multimodal nature of the How2 dataset, and the general direction of multimodal learning with other datasets as well.
}, author = {L. Specia and L. Barrault and O. Caglayan and Amanda Duarte and D. Elliott and S. Gella and N. Holzenberger and C. Lala and S. J. Lee and J. Libovicky and P. Madhyastha and F. Metze and K. Mulligan and A. Ostapenko and S. Palaskar and R. Sanabria and J. Wang and R. Arora} } @article {xGiro-i-Nieto20b, title = {Image and Video Object Segmentation with Low Supervision}, year = {2020}, abstract = {Invited talk at VI-Lab, University of Bristol (November 3rd, 2020 - 2pm)
Image and video segmentation are central tasks within the computer vision field. Nevertheless, deep learning solutions for segmentation typically rely on pixel-level annotations, which are very costly to collect. Likewise, some segmentation systems require human interaction at inference time, which involves some effort for the end-user. In this talk, we look into diverse supervision scenarios for image and video object segmentation. We discern between supervision when learning the model, i.e., which type of annotations are used during training, and supervision at inference, namely which kind of human input is required when running the system. Our targets are models that require low forms of supervision.
\
In the first part of the talk we present recurrent architectures for image and video object segmentation that are end-to-end trainable in a fully-supervised setup, and that do not require any post-processing step, i.e., the output of the model directly solves the addressed task. The second part of the talk aims at lowering the annotation cost, in terms of labelling time, needed to train image segmentation models. We explore semi-supervised pipelines and show results when a very limited budget is available. The third part of the session attempts to alleviate the supervision required by semi-automatic systems at inference time. Particularly, we focus on semi-supervised video object segmentation, which typically requires generating a binary mask for each instance to be tracked. In contrast, we present a model for language-guided video object segmentation, which identifies the object to segment with a natural language expression. We study current benchmarks, propose a novel categorization of referring expressions for video, and propose a method to generate synthetic referring expressions.
\
Image segmentation is a classic computer vision task that aims at labeling pixels with semantic classes. These slides provide an overview of the basic approaches applied from the deep learning field to tackle this challenge and present the basic subtasks (semantic, instance and panoptic segmentation) and related datasets.
Presented at the International Summer School on Deep Learning (ISSonDL) 2020, held online and organized by the University of Gdansk (Poland) between 30th August and 2nd September.
\
Sign language recognition and translation has been an active research field in recent years, with most approaches using deep neural networks to extract information from sign language data. This work investigates the mostly disregarded approach of using human keypoint estimation from image and video data with OpenPose in combination with a transformer network architecture. Firstly, it was shown that it is possible to recognize individual signs (4.5\% word error rate (WER)). Continuous sign language recognition, though, was more error-prone (77.3\% WER), and sign language translation was not possible using the proposed methods, which might be due to the low accuracy scores of human keypoint estimation by OpenPose and the accompanying loss of information, or to insufficient capacity of the used transformer model. Results may improve with the use of datasets containing higher repetition rates of individual signs or focusing more precisely on keypoint extraction of hands.
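A minimal PyTorch sketch of the investigated approach, classifying an isolated sign from a sequence of OpenPose-style keypoints; the keypoint count, model dimensions and classifier head are illustrative assumptions, not the thesis configuration:

import torch
import torch.nn as nn

class KeypointSignClassifier(nn.Module):
    def __init__(self, n_keypoints=137, d_model=256, n_classes=100):
        super().__init__()
        self.embed = nn.Linear(n_keypoints * 2, d_model)  # (x, y) per keypoint
        layer = nn.TransformerEncoderLayer(d_model, nhead=8, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=4)
        self.head = nn.Linear(d_model, n_classes)

    def forward(self, kp_seq):              # (batch, frames, n_keypoints*2)
        h = self.encoder(self.embed(kp_seq))
        return self.head(h.mean(dim=1))     # pool over time, classify the sign

logits = KeypointSignClassifier()(torch.randn(2, 60, 137 * 2))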
}, doi = {10.18419/opus-11197}, url = {https://elib.uni-stuttgart.de/handle/11682/11214}, author = {Muschik, Peter}, editor = {A. Duarte and Xavier Gir{\'o}-i-Nieto} } @article {aBellver, title = {Mask-guided sample selection for Semi-Supervised Instance Segmentation}, journal = {Multimedia Tools and Applications}, year = {2020}, month = {07/2020}, abstract = {Image segmentation methods are usually trained with pixel-level annotations, which require significant human effort to collect. The most common solution to address this constraint is to implement weakly-supervised pipelines trained with lower forms of supervision, such as bounding boxes or scribbles. Another option is semi-supervised methods, which leverage a large amount of unlabeled data and a limited number of strongly-labeled samples. In this second setup, samples to be strongly-annotated can be selected randomly or with an active learning mechanism that chooses the ones that will maximize the model performance. In this work, we propose a sample selection approach to decide which samples to annotate for semi-supervised instance segmentation. Our method consists of first predicting pseudo-masks for the unlabeled pool of samples, together with a score predicting the quality of the mask. This score is an estimate of the Intersection Over Union (IoU) of the segment with the ground truth mask. We study which samples are better to annotate given the quality score, and show how our approach outperforms a random selection, leading to improved performance for semi-supervised instance segmentation with low annotation budgets.
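A sketch of the selection rule in NumPy; the quality scores stand in for the paper's learned IoU estimator, and annotating the lowest-quality masks is one possible criterion of the kind studied in such setups, not necessarily the paper's final choice:

import numpy as np

def select_for_annotation(quality_scores, budget):
    """Given a predicted mask-quality (IoU estimate) per unlabeled sample,
    pick which samples to strongly annotate under a fixed budget.
    Here: annotate the lowest-quality masks, where pseudo-labels are
    least trustworthy."""
    order = np.argsort(quality_scores)        # ascending predicted IoU
    return order[:budget]

scores = np.random.rand(1000)                 # hypothetical IoU estimates
to_annotate = select_for_annotation(scores, budget=50)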
\
NeAT is a modular, flexible and user-friendly neuroimaging analysis toolbox for modeling linear and nonlinear effects, overcoming the limitations of standard neuroimaging methods, which are solely based on linear models. NeAT provides a wide range of statistical and machine learning nonlinear methods for model estimation, several metrics based on curve fitting and complexity for model inference, and a graphical user interface (GUI) for visualization of results. We illustrate its usefulness on two study cases where nonlinear effects have been previously established. Firstly, we study the nonlinear effects of Alzheimer{\textquoteright}s disease on brain morphology (volume and cortical thickness). Secondly, we analyze the effect of the apolipoprotein APOE-ε4 genotype on brain aging and its interaction with age. NeAT is fully documented and publicly distributed at https://imatge-upc.github.io/neat-tool/.
}, keywords = {Alzheimer{\textquoteright}s disease, APOE, GAM, GLM, inference, neuroimaging, nonlinear, SVR}, doi = {10.1007/s12021-020-09456-w}, url = {https://link.springer.com/article/10.1007/s12021-020-09456-w}, author = {Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and Santi Puch and Asier Aduriz and Carlos Lopez and G. Operto and R. Cacciaglia and C. Falcon and J.L. Molinuevo and Juan D. Gispert} } @conference {cGiro-i-Nieto, title = {One Perceptron to Rule Them All: Language, Vision, Audio and Speech (tutorial)}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR) 2020}, year = {2020}, month = {06/2020}, publisher = {ACM}, organization = {ACM}, address = {Dublin, Ireland}, abstract = {Deep neural networks have boosted the convergence of multimedia data analytics in a unified framework shared by practitioners in natural language, vision and speech. Image captioning, lip reading or video sonorization are some of the first applications of a new and exciting field of research exploiting the generalization properties of deep neural representation. This tutorial will firstly review the basic neural architectures to encode and decode vision, text and audio, to later review those models that have successfully translated information across modalities.
\
PixInPix is a steganographic system for hiding images within other images. The designed system is able to create, from a cover image and a message, a new steganographic image. This new stego-image looks as similar as possible to the cover but has the message hidden in it. Our approach adopts the U-net architecture and combines two reconstruction losses to provide a simple yet effective approach, tested on low-resolution images from MNIST, CIFAR and ImageNet.
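A condensed PyTorch sketch of the two-loss training signal; the tiny conv layers below stand in for the actual U-net hiding/revealing pair, and all shapes are illustrative:

import torch
import torch.nn as nn

hide_net = nn.Conv2d(6, 3, 3, padding=1)     # stand-in for the hiding U-net
reveal_net = nn.Conv2d(3, 3, 3, padding=1)   # stand-in for the revealing net
mse = nn.MSELoss()

cover = torch.rand(8, 3, 32, 32)
message = torch.rand(8, 3, 32, 32)

stego = hide_net(torch.cat([cover, message], dim=1))
recovered = reveal_net(stego)
# Two reconstruction losses: the stego image should look like the cover,
# and the hidden message should be recoverable from the stego image.
loss = mse(stego, cover) + mse(recovered, message)
loss.backward()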
}, author = {Punt{\'\i}, Cristina}, editor = {McGuinness, Kevin and Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} } @article {aCasamitjanac, title = {Projection to Latent Spaces disentangles pathological effects on brain morphology in the asymptomatic phase of Alzheimer{\textquoteright}s disease}, journal = {Frontiers in Neurology, section Applied Neuroimaging}, volume = {11}, year = {2020}, month = {07/2020}, chapter = {648}, abstract = {The Alzheimer{\textquoteright}s disease (AD) continuum is defined as a cascade of several neuropathological processes that can be measured using biomarkers such as cerebrospinal fluid (CSF) levels of Aβ, p-tau and t-tau. In parallel, brain anatomy can be characterized through imaging techniques such as magnetic resonance imaging (MRI). In this work, we relate both sets of measurements seeking associations between biomarkers and brain structure that can be indicative of AD progression. The goal is to uncover underlying multivariate effects of AD pathology on regional brain morphological information. For this purpose, we use the projection to latent structures (PLS) method. Using PLS, we find a low-dimensional latent space that best describes the covariance between both sets of measurements on the same subjects. Possible confounder effects (age and sex) on brain morphology are included in the model and regressed out using an orthogonal PLS model. We look for statistically significant correlations between brain morphology and CSF biomarkers that explain part of the volumetric variance at each region-of-interest (ROI). Furthermore, we use a clustering technique to discover a small set of CSF-related patterns describing the AD continuum. We apply this technique to the study of subjects in the whole AD continuum from the preclinical asymptomatic stages all through to the symptomatic groups. Subsequent analyses involve splitting the course of the disease into diagnostic categories: cognitively unimpaired subjects (CU), mildly cognitively impaired subjects (MCI) and dementia subjects (AD-dementia), where all symptoms are due to AD.
}, keywords = {Alzheimer{\textquoteright}s disease, brain morphology, CSF biomarkers, Latent model, PLS, preclinical AD}, author = {Adri{\`a} Casamitjana and Paula Petrone and Jose Luis Molinuevo and Juan D. Gispert and Ver{\'o}nica Vilaplana} } @mastersthesis {xManas, title = {Self-Supervised Visual Representation Learning for Remote Sensing}, year = {2020}, abstract = {With the creation of large-scale annotated datasets such as the ImageNet, fully-supervised machine learning methods have become the standard for solving computer vision tasks. These methods require large amounts of labeled data, which are usually obtained with crowdsourcing tools or social media tags. However, these approaches do not scale for specialized domains, such as medical or satellite imaging, where annotations must be provided by experts at a prohibitive cost. Recently, self-supervised learning has emerged as an alternative for obtaining transferable visual representations from unlabeled data. Models based on these representations match the performance of fully-supervised models while only requiring a small fraction of the annotations. In this work, we aim to explore the application of self-supervised learning methods in the remote sensing domain. We propose a contrastive approach for learning visual representations by exploiting the multi-spectral information of satellite images. These representations serve as a good starting point for a variety of downstream tasks that involve remote sensing imagery, accelerating convergence with fewer labeled examples.
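A minimal sketch of a contrastive (InfoNCE-style) objective over two spectral views of the same satellite patch; the encoder, the band split and all dimensions are hypothetical stand-ins, and the thesis' exact formulation may differ:

import torch
import torch.nn.functional as F

def info_nce(z1, z2, tau=0.1):
    """Contrast paired views: matching rows of z1/z2 are positives,
    all other rows in the batch act as negatives."""
    z1, z2 = F.normalize(z1, dim=1), F.normalize(z2, dim=1)
    logits = z1 @ z2.t() / tau
    labels = torch.arange(len(z1))
    return F.cross_entropy(logits, labels)

encoder = torch.nn.Sequential(torch.nn.Flatten(),
                              torch.nn.Linear(6 * 16 * 16, 128))
patch = torch.rand(32, 12, 16, 16)            # 12-band multi-spectral patches
view1, view2 = patch[:, :6], patch[:, 6:]     # two spectral "views"
loss = info_nce(encoder(view1), encoder(view2))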
Best thesis award 2020 (shared with four other works)
}, author = {Ma{\~n}as,Oscar}, editor = {Rodr{\'\i}guez, Pau and Xavier Gir{\'o}-i-Nieto} } @conference {cPuig-Sitjes20, title = {Strategy for the real-time detection of thermal events on the plasma facing components of Wendelstein 7-X}, booktitle = {31st Symposium on Fusion Technology (SOFT2020)}, year = {2020}, month = {09/2020}, address = {Dubrovnik, Croatia}, abstract = {Wendelstein 7-X (W7-X), the most advanced fusion experiment in the stellarator line, aims at demonstrating the feasibility of the stellarator concept as a future fusion power plant. It is planned to restart operation by the end of 2021 with a high heat flux divertor and water-cooled plasma facing components (PFCs) to demonstrate steady-state operation. With plasma energy limits starting at 1 GJ and gradually increasing to 18 GJ over several experimental campaigns, the PFCs have to be protected from overheating. For that, a fully autonomous system is required in order to prevent damage to the plasma facing components due to thermal events.
During the last experimental campaign, when W7-X was equipped with inertially cooled test divertor units, extensive experience was gained with the preliminary design of the thermal event detection system. At that time, the system was not yet real-time capable and it was not fully automated, requiring manual supervision between discharges. This experience, however, made it possible to prove the validity of some design concepts and to define the new strategy towards the protection of the machine in steady-state operation, when the system will be connected to the Interlock System and the feedback control.
In this work, the design of the real-time thermal event detection system for W7-X for steady-state operation is presented. The system is based on thermography and video diagnostics to monitor the divertor units, the baffles, and the wall heat-shields and panels. It will be implemented on a real-time system and integrated into CoDaC{\textquoteright}s safety infrastructure. The system relies on computer vision and machine learning techniques to perform a spatio-temporal analysis to detect and classify the thermal events and to perform a risk evaluation. The results and the main conclusions drawn from the analysis of the data from the past campaign are reported.
In this work, we propose an effective approach for training unique embedding representations by combining three simultaneous modalities: image, and spoken and textual narratives. The proposed methodology starts from a baseline system that builds an embedding space trained with only spoken narratives and image cues. Our experiments on the EPIC-Kitchen and Places Audio Caption datasets show that introducing the human-generated textual transcriptions of the spoken narratives helps the training procedure, yielding better embedding representations. The triad of speech, image and words allows for a better estimate of the point embedding and improves performance on tasks such as image and speech retrieval, even when the third modality, text, is not present in the task.
\
This work proposes a novel end-to-end convolutional neural network (CNN) architecture to automatically quantify the severity of knee osteoarthritis (OA) using X-ray images, which incorporates trainable attention modules acting as unsupervised fine-grained detectors of the region of interest (ROI). The proposed attention modules can be applied at different levels and scales across any CNN pipeline, helping the network to learn relevant attention patterns over the most informative parts of the image at different resolutions. We test the proposed attention mechanism on existing state-of-the-art CNN architectures as our base models, achieving promising results on the benchmark knee OA datasets from the osteoarthritis initiative (OAI) and multicenter osteoarthritis study (MOST). All the code from our experiments will be publicly available on the GitHub repository: https://github.com/marc-gorriz/KneeOA-CNNAttention
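A sketch of a trainable spatial attention gate of the kind described, applicable at any level of a CNN pipeline; channel sizes are illustrative and the actual modules in the paper may be more elaborate:

import torch
import torch.nn as nn

class SpatialAttention(nn.Module):
    """Learn a soft spatial mask that re-weights feature maps, acting
    as an unsupervised detector of informative regions (e.g. the ROI)."""
    def __init__(self, channels):
        super().__init__()
        self.score = nn.Conv2d(channels, 1, kernel_size=1)

    def forward(self, feats):                 # (batch, C, H, W)
        attn = torch.sigmoid(self.score(feats))
        return feats * attn, attn             # gated features + attention map

gated, attn_map = SpatialAttention(64)(torch.randn(2, 64, 28, 28))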
A key point of interest when valuing commercial advertising space is the probability with which the advertisement will actually be seen. This paper presents a method based on top-view camera measurement, where the probability of viewing is estimated based on the trajectories and head movements of the passers-by in the area of interest. Using a camera with a depth sensor, the heads of the people in the range of view can be detected and modeled. This makes it possible to determine the orientation of the head, which is used to estimate the direction of vision. A tracking-by-detection algorithm computes the trajectory of each user. The attention at each advertising point is estimated based on the trajectories and head orientations of the individuals in the area of interest.
}, keywords = {attention time, depth sensor, top-view camera, tracking}, author = {Manuel L{\'o}pez-Palma and Morros, J.R. and Montserrat Corbal{\'a}n and Javier Gago} } @conference {cBellverb, title = {Budget-aware Semi-Supervised Semantic and Instance Segmentation}, booktitle = {CVPR 2019 DeepVision Workshop}, year = {2019}, month = {05/2019}, publisher = {OpenCVF}, organization = {OpenCVF}, address = {Long Beach, CA, USA}, abstract = {Methods that move towards less supervised scenarios are key for image segmentation, as dense labels demand significant human intervention. Generally, the annotation burden is mitigated by labeling datasets with weaker forms of supervision, e.g. image-level labels or bounding boxes. Another option is semi-supervised settings, which commonly leverage a few strong annotations and a huge number of unlabeled/weakly-labeled data. In this paper, we revisit semi-supervised segmentation schemes and narrow down significantly the annotation budget (in terms of total labeling time of the training set) compared to previous approaches. With a very simple pipeline, we demonstrate that at low annotation budgets, semi-supervised methods outperform by a wide margin weakly-supervised ones for both semantic and instance segmentation. Our approach also outperforms previous semi-supervised works at a much reduced labeling cost. We present results for the Pascal VOC benchmark and unify weakly and semi-supervised approaches by considering the total annotation budget, thus allowing a fairer comparison between methods.
\
In the past decade, deep neural networks have revolutionized computer vision. High-performing deep neural architectures trained for visual recognition tasks have pushed the field towards methods relying on learned image representations instead of hand-crafted ones, seeking to design end-to-end learning methods to solve challenging tasks, ranging from long-lasting ones such as image classification to newly emerging tasks like image captioning.
As this thesis is framed in the context of the rapid evolution of computer vision, we present contributions that are aligned with three major changes in paradigm that the field has recently experienced, namely 1) the power of re-utilizing deep features from pre-trained neural networks for different tasks, 2) the advantage of formulating problems with end-to-end solutions given enough training data, and 3) the growing interest of describing visual data with natural language rather than pre-defined categorical label spaces, which can in turn enable visual understanding beyond scene recognition.
The first part of the thesis is dedicated to the problem of visual instance search, where we particularly focus on obtaining meaningful and discriminative image representations which allow efficient and effective retrieval of similar images given a visual query. Contributions in this part of the thesis involve the construction of sparse Bag-of-Words image representations from convolutional features from a pre-trained image classification neural network, and an analysis of the advantages of fine-tuning a pre-trained object detection network using query images as training data.
The second part of the thesis presents contributions to the problem of image-to-set prediction, understood as the task of predicting a variable-sized collection of unordered elements for an input image. We conduct a thorough analysis of current methods for multi-label image classification, which are able to solve the task in an end-to-end manner by simultaneously estimating both the label distribution and the set cardinality. Further, we extend the analysis of set prediction methods to semantic instance segmentation, and present an end-to-end recurrent model that is able to predict sets of objects (binary masks and categorical labels) in a sequential manner.
Finally, the third part of the dissertation takes insights learned in the previous two parts in order to present deep learning solutions to connect images with natural language in the context of cooking recipes and food images. First, we propose a retrieval-based solution in which the written recipe and the image are encoded into compact representations that allow the retrieval of one given the other. Second, as an alternative to the retrieval approach, we propose a generative model to predict recipes directly from food images, which first predicts ingredients as sets and subsequently generates the rest of the recipe one word at a time by conditioning both on the image and the predicted ingredients.
\
Sign Language is the primary means of communication for the majority of the Deaf and hard-of-hearing communities. Current computational approaches in this general research area have focused specifically on sign language recognition and the translation of sign language to text. However, the reverse problem of translating from spoken to sign language has so far not been widely explored.
The goal of this doctoral research is to explore sign language translation in this generalized setting, i.e. translating from spoken language to sign language and vice versa. Towards that end, we propose a concrete methodology for tackling the problem of speech to sign language translation and introduce How2Sign, the first public, continuous American Sign Language dataset that enables such research. With a parallel corpus of almost 60 hours of sign language videos (collected with both RGB and depth sensor data) and the corresponding speech transcripts for over 2500 instructional videos, How2Sign is a public dataset of unprecedented scale that can be used to advance not only sign language translation, but also a wide range of sign language understanding tasks.
\
Magnetic resonance imaging (MRI) has unveiled specific AD alterations at different stages of the AD pathophysiologic continuum, constituting what has been established as the {\textquoteleft}AD signature{\textquoteright}. To what extent MRI can detect amyloid-related cerebral changes from structural MRI in unimpaired individuals is still an area open for exploration.
}, issn = {1552-5260}, doi = {10.1016/j.jalz.2019.06.2796}, author = {Adri{\`a} Casamitjana and P. Petrone and C. Falcon and M. Artigues and G. Operto and R. Cacciaglia and J.L. Molinuevo and Ver{\'o}nica Vilaplana and J.D. Gispert} } @article {aRoisman18, title = {Differential expression of long non-coding RNAs related to proliferation and histological diversity in follicular lymphomas}, journal = {British Journal of Haematology}, volume = {184}, year = {2019}, month = {Feb 2019}, pages = {373-383}, issn = {ISSN:1365-2141}, doi = {DOI: 10.1111/bjh.15656}, author = {A. Roisman and A. Navarro and G. Clot and G. Castellano and B. Gonzalez-Farre and P. P{\'e}rez-Galan and A. Esteve and M. Dabad and S. Heath and M. Gut and Bosio, M. and P. Bellot and Salembier, P. and Albert Oliveras and I. Slavusky and L. Magnano and H. Horn and A. Rosenwald and G. Ott and M. Aymerich and A. L{\'o}pez-Guillermo and P. Jares and J.I. Mart{\'\i}n-Subero and E. Campo and L. Hern{\'a}ndez} } @conference {cCombaliab, title = {Digitally Stained Confocal Microscopy through Deep Learning}, booktitle = {International Conference on Medical Imaging with Deep Learning (MIDL 2019)}, year = {2019}, month = {07/2019}, address = {London}, abstract = {Specialists have used confocal microscopy in the ex-vivo modality to identify tumors with an overall sensitivity of 96.6\% and specificity of 89.2\%. However, this technology hasn{\textquoteright}t been established yet in standard clinical practice because most pathologists lack the knowledge to interpret its output. In this paper we propose a combination of deep learning and computer vision techniques to digitally stain confocal microscopy images into H\&E-like slides, enabling pathologists to interpret these images without specific training. We use a fully convolutional neural network with a multiplicative residual connection to denoise the confocal microscopy images, and then stain them using a Cycle Consistency Generative Adversarial Network.
}, author = {Marc Combalia and Javiera P{\'e}rez-Anker and Adriana Garc{\'\i}a-Herrera and Ll{\'u}cia Alos and Ver{\'o}nica Vilaplana and Ferran Marques and Susana Puig and Josep Malvehy} } @mastersthesis {xComas19, title = {Exploring Methods for Enhancing Linear Prediction of Video Sequences}, year = {2019}, abstract = {Video prediction has for a long time received attention within the field of computer vision, but it has gained importance during the last decade with the popularization of deep neural networks and their applications to computer vision. In this thesis, the main focus will be to linearize the dynamics of time sequences by exploiting the spatial context that video offers, with the final goal of obtaining better predictions. First, we provide the theoretical basis for dynamics. Then, we present several modifications for an existing deterministic predictor network called Dynamical Atoms-based Network (DYAN) [1], which models time sequences as the output of Linear Time-Invariant (LTI) systems using system identification and dynamics foundations. The solutions present different levels of success, and in some cases they beat the state of the art (SOTA) for at least one dataset in the metrics SSIM, MSE and MMF. We also present two novel convolutional autoencoder architectures (LODAEs) for low-order dynamics manifold embedding, strongly based on deep neural networks, with the primary aim of giving a generalized solution for mapping video sequences into a new manifold, to adapt them to the pipeline of predictors such as DYAN, based on system identification. The results for the LODAEs are promising, as they seem to achieve their goal for a very simple synthetic dataset by lowering the order of the latent space sequences and providing good reconstructions and, in some cases, predictions.
}, author = {Comas, Armand}, editor = {Camps, Octavia and Xavier Gir{\'o}-i-Nieto} } @article {aGene-Molab, title = {Fruit Detection in an Apple Orchard Using a Mobile Terrestrial Laser Scanner}, journal = {Biosystems Engineering}, volume = {187}, year = {2019}, month = {09/2019}, chapter = {171}, abstract = {The development of reliable fruit detection and localization systems provides an opportunity to improve the crop value and management by limiting fruit spoilage and optimizing harvesting practices. Most proposed systems for fruit detection are based on RGB cameras and thus are affected by intrinsic constraints, such as variable lighting conditions. This work presents a new technique that uses a mobile terrestrial laser scanner (MTLS) to detect and localise Fuji apples. An experimental test focused on Fuji apple trees (Malus domestica Borkh. cv. Fuji) was carried out. A 3D point cloud of the scene was generated using an MTLS composed of a Velodyne VLP-16 LiDAR sensor synchronized with an RTK-GNSS satellite navigation receiver. A reflectance analysis of tree elements was performed, obtaining mean apparent reflectance values of 28.9\%, 29.1\%, and 44.3\% for leaves, branches and trunks, and apples, respectively. These results suggest that the apparent reflectance parameter (at 905 nm wavelength) can be useful to detect apples. For that purpose, a four-step fruit detection algorithm was developed. By applying this algorithm, a localization success of 87.5\%, an identification success of 82.4\%, and an F1-score of 0.858 were obtained in relation to the total amount of fruits. These detection rates are similar to those obtained by RGB-based systems, but with the additional advantages of providing direct 3D fruit location information, which is not affected by sunlight variations. From the experimental results, it can be concluded that LiDAR-based technology and, particularly, its reflectance information, has potential for remote apple detection and 3D location.
}, issn = {1537-5110}, doi = {10.1016/j.biosystemseng.2019.08.017}, url = {https://authors.elsevier.com/c/1Zmc45Tbkk9EHW}, author = {Gen{\'e}-Mola, Jordi and Gregorio, Eduard and Guevara, Javier and Auat Cheein, Fernando and Sanz, Ricardo and Escol{\`a}, Alexandre and Llorens Calveras, Jordi and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R.} } @mastersthesis {xCaros, title = {A Generative Dialogue System for Reminiscence Therapy}, year = {2019}, abstract = {With people living longer than ever, the number of cases with neurodegenerative diseases such as Alzheimer{\textquoteright}s or cognitive impairment increases steadily. In Spain it affects more than 1.2 million patients and it is estimated that in 2050 more than 100 million people will be affected. While there are no effective treatments for this terminal disease, therapies such as reminiscence, which stimulate memories of the patient{\textquoteright}s past, are recommended, as they encourage communication and produce mental and emotional benefits for the patient. Currently, reminiscence therapy takes place in hospitals or residences, where the therapists are located. Since people that receive this therapy are old and may have mobility difficulties, we present an AI solution to guide older adults through reminiscence sessions by using their laptop or smartphone.
Our solution consists of a generative dialogue system composed of two deep learning architectures to recognize image and text content. An Encoder-Decoder with Attention is trained to generate questions from photos provided by the user; it is composed of a pretrained Convolutional Neural Network to encode the picture, and a Long Short-Term Memory to decode the image features and generate the question. The second architecture is a sequence-to-sequence model that provides feedback to engage the user in the conversation.
Our experiments show that the best performance is obtained by training the dialogue model with Persona-Dataset and fine-tuning it with the Cornell Movie-Dialogues dataset. Finally, we integrate Telegram as the interface for the user to interact with Elisabot, our trained conversational agent.
This work addresses the challenge of hate speech detection in Internet memes and, unlike any previous work to our knowledge, attempts to use visual information to automatically detect hate speech. Memes are pixel-based multimedia documents that contain photos or illustrations together with phrases which, when combined, usually adopt a funny meaning. However, hate memes are also used to spread hate through social networks, so their automatic detection would help reduce their harmful societal impact. In our experiments, we built a dataset of 5,020 memes to train and evaluate a multi-layer perceptron over the visual and language representations, whether independently or fused. Our results indicate that the model can learn to detect some of the memes, but that the task is far from being solved with this simple architecture. While previous work focuses on linguistic hate speech, our experiments indicate how the visual modality can be much more informative for hate speech detection than the linguistic one in memes.
This work proposes novel hyperparameter-free losses for single-view 3D reconstruction with morphable models (3DMM). We dispense with the hyperparameters used in other works by exploiting geometry, so that the shape of the object and the camera pose are jointly optimized in a single-term expression. This simplification reduces the optimization time and its complexity. Moreover, we propose a novel implicit regularization technique based on random virtual projections that does not require additional 2D or 3D annotations. Our experiments suggest that minimizing a shape reprojection error together with the proposed implicit regularization is especially suitable for applications that require precise alignment between geometry and image spaces, such as augmented reality. We evaluate our losses on a large-scale dataset with 3D ground truth and publish our implementations to facilitate reproducibility and public benchmarking in this field.
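In schematic form (our notation, not taken from the paper), a single-term objective of this kind couples shape and pose only through the reprojection error:
\[ \mathcal{L}(\alpha, R, t) \;=\; \frac{1}{N}\sum_{i=1}^{N}\big\| \pi\big(K(R\,X_i(\alpha)+t)\big) - x_i \big\|_2^2, \qquad X_i(\alpha)=\bar{X}_i + B_i\,\alpha, \]
where $X_i(\alpha)$ are the 3DMM vertices given shape coefficients $\alpha$ ($\bar{X}_i$ the mean shape, $B_i$ the deformation basis), $(R,t)$ the camera pose, $K$ the intrinsics, $\pi$ the perspective projection and $x_i$ the observed 2D landmarks; minimizing this single term jointly optimizes shape and pose without balancing hyperparameters.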
This thesis investigates the importance of motion when predicting saliency in videos. Naturally, humans observe both dynamic and static objects. When we are focused on watching a video, we tend to keep our eyes on the objects that are moving in the scene, on items that we quickly recognize, as well as on those that attract our attention. In this work, different experiments are presented to corroborate this observation. Various approaches are shown implementing an adaptation of the SalBCE neural network using only motion. A simple implementation is proposed for the generation of saliency maps using previously extracted static and dynamic information from the images. The DHF1K dataset has been used for the experiments.
}, keywords = {Motion, Saliency, video}, author = {Caselles, Pol}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @conference {cSalvadorf, title = {Inverse Cooking: Recipe Generation from Food Images}, booktitle = {CVPR}, year = {2019}, month = {06/2019}, publisher = {OpenCVF / IEEE}, organization = {OpenCVF / IEEE}, address = {Long Beach, CA, USA}, abstract = {People enjoy food photography because they appreciate food. Behind each meal there is a story described in a complex recipe and, unfortunately, by simply looking at a food image we do not have access to its preparation process. Therefore, in this paper we introduce an inverse cooking system that recreates cooking recipes given food images. Our system predicts ingredients as sets by means of a novel architecture, modeling their dependencies without imposing any order, and then generates cooking instructions by attending to both image and its inferred ingredients simultaneously. We extensively evaluate the whole system on the large-scale Recipe1M dataset and show that (1) we improve performance w.r.t. previous baselines for ingredient prediction; (2) we are able to obtain high quality recipes by leveraging both image and ingredients; (3) our system is able to produce more compelling recipes than retrieval-based approaches according to human judgment.
\
This article contains data related to the research article entitled {\textquotedblleft}Multi-modal Deep Learning for Fruit Detection Using RGB-D Cameras and their Radiometric Capabilities{\textquotedblright} [1]. The development of reliable fruit detection and localization systems is essential for future sustainable agronomic management of high-value crops. RGB-D sensors have shown potential for fruit detection and localization since they provide 3D information with color data. However, the lack of substantial datasets is a barrier for exploiting the use of these sensors. This article presents the KFuji RGB-DS database, which is composed of 967 multi-modal images of Fuji apples on trees captured using Microsoft Kinect v2 (Microsoft, Redmond, WA, USA). Each image contains information from 3 different modalities: color (RGB), depth (D) and range-corrected IR intensity (S). Ground truth fruit locations were manually annotated, labeling a total of 12,839 apples in the whole dataset. The current dataset is publicly available at http://www.grap.udl.cat/publicacions/datasets.html.
}, keywords = {Depth cameras; RGB-D, Fruit detection, Fruit reflectance, Fuji apple, Multi-modal dataset}, doi = {10.1016/j.dib.2019.104289}, author = {Gen{\'e}-Mola, Jordi and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @article {aGene-Mola, title = {Multi-modal Deep Learning for Fuji Apple Detection Using RGB-D Cameras and their Radiometric Capabilities}, journal = {Computers and Electronics in Agriculture}, volume = {162}, year = {2019}, month = {07/2019}, chapter = {689-698}, abstract = {Fruit detection and localization will be essential for future agronomic management of fruit crops, with applications in yield prediction, yield mapping and automated harvesting. RGB-D cameras are promising sensors for fruit detection given that they provide geometrical information with color data. Some of these sensors work on the principle of time-of-flight (ToF) and, besides color and depth, provide the backscatter signal intensity. However, this radiometric capability has not been exploited for fruit detection applications. This work presents the KFuji RGB-DS database, composed of 967 multi-modal images containing a total of 12,839 Fuji apples. Compilation of the database allowed a study of the usefulness of fusing RGB-D and radiometric information obtained with Kinect v2 for fruit detection. To do so, the signal intensity was range corrected to overcome signal attenuation, obtaining an image that was proportional to the reflectance of the scene. A registration between RGB, depth and intensity images was then carried out. The Faster R-CNN model was adapted for use with five-channel input images: color (RGB), depth (D) and range-corrected intensity signal (S). Results show an improvement of 4.46\% in F1-score when adding depth and range-corrected intensity channels, obtaining an F1-score of 0.898 and an AP of 94.8\% when all channels are used. From our experimental results, it can be concluded that the radiometric capabilities of ToF sensors give valuable information for fruit detection.
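A PyTorch sketch of the five-channel input adaptation, shown here on a plain ResNet stem via torchvision; the paper adapted Faster R-CNN, and the weight-inflation strategy below is an assumption on our part, not the authors' documented procedure:

import torch
import torch.nn as nn
from torchvision.models import resnet50

backbone = resnet50(weights=None)
old = backbone.conv1           # 3-channel stem (pretrained weights would be loaded in practice)
new = nn.Conv2d(5, old.out_channels, kernel_size=7, stride=2,
                padding=3, bias=False)
with torch.no_grad():
    new.weight[:, :3] = old.weight                             # keep RGB filters
    new.weight[:, 3:] = old.weight.mean(dim=1, keepdim=True)   # init D and S channels
backbone.conv1 = new
out = backbone(torch.randn(1, 5, 224, 224))   # forward pass with an RGB-D-S image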
}, keywords = {Agricultural robotics, Convolutional Neural Networks, Fruit detection, Fruit reflectance, Multi-modal faster R-CNN, RGB-D}, doi = {10.1016/j.compag.2019.05.016}, author = {Gen{\'e}-Mola, Jordi and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @mastersthesis {xOriol, title = {Multimodal Hate Speech Detection in Memes}, year = {2019}, abstract = {This thesis explores a multimodal approach to hate speech detection, involving vision and language (text). More specifically, we deal with the context of memes, a form of internet humour which presents additional challenges. We first gather meme data from different sources. This way, we create a hate memes dataset for this task. Then, we use this data for the training and evaluation of statistical models, which are based on state-of-the-art neural networks. We study different ways to fine-tune pretrained descriptors for our specific task. We propose a way to add expert knowledge into the system and orient it into a real-world issue-solving system. We also discuss ways to deal with the issue of a reduced amount of data, experimenting with a self-supervised learning approach for pre-training. We also compare the effect, or contribution, of each modality on the overall performance of the model.
}, author = {Oriol, Benet}, editor = {Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} } @article {aVentura, title = {Multiresolution co-clustering for uncalibrated multiview segmentation}, journal = {Signal Processing: Image Communication}, year = {2019}, abstract = {We propose a technique for coherently co-clustering uncalibrated views of a scene with a contour-based representation. Our work extends a previous framework, an iterative algorithm for segmenting sequences with small variations, whose partition solution space is too restrictive for scenarios where consecutive images present larger variations. To deal with a more flexible scenario, we present three main contributions. First, motion information has been considered both for region adjacency and region similarity. Second, a two-step iterative architecture is proposed to increase the partition solution space. Third, a feasible global optimization that allows jointly processing all the views has been implemented. In addition to the previous contributions, which are based on low-level features, we have also considered introducing higher-level features as semantic information in the co-clustering algorithm. We evaluate these techniques on multiview and temporal datasets, showing that they outperform state-of-the-art approaches.
In this work, we present a novel learning-based approach to reconstruct 3D faces from a single or multiple images. Our method uses a simple yet powerful architecture based on siamese neural networks that helps to extract relevant features from each view while keeping the models small. Instead of minimizing multiple objectives, we propose to simultaneously learn the 3D shape and the individual camera poses by using a single-term loss based on the reprojection error, which generalizes from one to multiple views. This allows globally optimizing the whole scene without having to tune any hyperparameters, and achieving low reprojection errors, which are important for further texture generation. Finally, we train our model on a large-scale dataset with more than 6,000 facial scans. We report competitive results in the 3DFAW 2019 challenge, showing the effectiveness of our method.
}, author = {Ramon, Eduard and Escur, Janna and Xavier Gir{\'o}-i-Nieto} } @article {xGiro-i-Nieto19a, title = {One Perceptron to Rule Them All: Language and Vision}, year = {2019}, abstract = {Deep neural networks have boosted the convergence of multimedia data analytics in a unified framework shared by practitioners in natural language and vision. Image captioning, visual question answering or multimodal translation are some of the first applications of a new and exciting field that exploits the generalization properties of deep neural representations. This talk will provide an overview of how vision and language problems are addressed with deep neural networks, and the exciting challenges being addressed nowadays by the research community.
\
Panel at the 6th Annual Catalan Meeting on Computer Vision.
Obtaining 3D geometry from images is a well-studied problem in the computer vision community. In the concrete case of a single image, a considerable amount of prior knowledge is often required to obtain plausible reconstructions. Recently, deep neural networks in combination with 3D morphable models (3DMM) have been used in order to address the lack of scene information, leading to more accurate results. Nevertheless, the losses employed during the training process are usually a linear combination of terms where the coefficients, also called hyperparameters, must be carefully tuned for each dataset to obtain satisfactory results. In this work we propose a hyperparameter-free loss that exploits the geometry of the problem for learning 3D reconstruction from a single image. The proposed formulation is not dataset dependent, is robust against very large camera poses and jointly optimizes the shape of the object and the camera pose.
Background: Magnetic resonance imaging (MRI) has unveiled specific alterations at different stages of the Alzheimer{\textquoteright}s disease (AD) pathophysiologic continuum, constituting what has been established as the {\textquoteleft}AD signature{\textquoteright}. To what extent MRI can detect amyloid-related cerebral changes from structural MRI in cognitively unimpaired individuals is still an area open for exploration.
Method: Longitudinal 3D-T1 MRI scans were acquired from a subset of the ADNI cohort comprising 403 subjects: 79 controls (Ctrls), 50 preclinical AD (PreAD), 274 MCI and dementia due to AD (MCI/AD). Amyloid CSF was used as the gold-standard measure with established cut-offs (\<192pg/mL) to establish diagnostic categories. Cognitively unimpaired individuals were defined as Ctrls if they were amyloid negative, and PreAD otherwise. The MCI/AD group was amyloid positive. Only subjects with the same diagnostic category at baseline and follow-up visits were considered for the study. Longitudinal morphometric analysis was performed using SPM12 to calculate Jacobian determinant maps. Statistical analysis was carried out on these Jacobian maps to identify structural changes that were significantly different between diagnostic categories. A machine learning classifier was applied on Jacobian determinant maps to predict the presence of abnormal amyloid levels in cognitively unimpaired individuals. The performance of this classifier was evaluated using receiver operating characteristic curve analysis and as a function of the follow-up time between MRI scans. We applied a cost function to assess the benefit of using this classifier in the triaging of individuals in a clinical trial-recruitment setting.
Results: The optimal follow-up time for classification of Ctrls vs PreAD was Δt\>2.5 years and hence, only subjects within this temporal span are used for evaluation (15 Ctrls, 10 PreAD). The longitudinal voxel-based classifier achieved an AUC=0.87 (95\%CI:0.72-0.97). The brain regions that showed the highest discriminative power to detect amyloid abnormalities were the medial, inferior and lateral temporal lobes, precuneus, caudate heads, basal forebrain and lateral ventricles.
Conclusions: Our work supports that machine learning applied to longitudinal brain volumetric changes can be used to predict, with high precision, presence of amyloid abnormalities in cognitively unimpaired subjects. Used as a triaging method to identify a fixed number of amyloid positive individuals, this longitudinal voxelwise classifier is expected to avoid 55\% of unnecessary CSF and/or PET scans and reduce economic cost by 40\%.
}, doi = {https://doi.org/10.1186/s13195-019-0526-8}, url = {https://link.springer.com/article/10.1186/s13195-019-0526-8}, author = {Paula Petrone and Adri{\`a} Casamitjana and Carles Falcon and Miguel Artigues C{\`a}naves and G. Operto and R. Cacciaglia and Jose Luis Molinuevo and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @conference {cHerrera-Palacioa, title = {Recurrent Instance Segmentation using Sequences of Referring Expressions}, booktitle = {NeurIPS workshop on Visually Grounded Interaction and Language (ViGIL)}, year = {2019}, month = {09/2019}, address = {Vancouver, Canada}, abstract = {The goal of this work is segmenting the objects in an image which are referred to by a sequence of linguistic descriptions (referring expressions). We propose a deep neural network with recurrent layers that output a sequence of binary masks, one for each referring expression provided by the user. The recurrent layers in the architecture allow the model to condition each predicted mask on the previous ones, from a spatial perspective within the same image. Our multimodal approach uses off-the-shelf architectures to encode both the image and the referring expressions. The visual branch provides a tensor of pixel embeddings that are concatenated with the phrase embeddings produced by a language encoder. Our experiments on the RefCOCO dataset for still images indicate how the proposed architecture successfully exploits the sequences of referring expressions to solve a pixel-wise task of instance segmentation.
\
The goal of this work is segmenting the object in an image or video which is referred to by a linguistic description (referring expression).\ We propose a deep neural network with recurrent layers that output a sequence of binary masks, one for each referring expression provided by the user.\ The recurrent layers in the architecture allow the model to condition each predicted mask on the previous ones, from a spatial perspective within the same image.\ Our multimodal approach uses off-the-shelf architectures to encode both the image and the referring expressions.\ The visual branch provides a tensor of pixel embeddings that are concatenated with the phrase embeddings produced by a language encoder.\ We focus our study on comparing different configurations to encode and combine the visual and linguistic representations.\ Our experiments on the RefCOCO dataset for still images indicate how the proposed architecture successfully exploits the referring expressions to solve a pixel-wise task of instance segmentation.
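A schematic PyTorch sketch of the recurrent conditioning idea: each expression's mask prediction depends on the masks predicted before it, via a hidden state. The encoders, the GRU fusion and the toy mask decoder below are stand-ins; the actual model uses off-the-shelf image and language encoders with pixel embeddings:

import torch
import torch.nn as nn

class RecurrentReferringSegmenter(nn.Module):
    def __init__(self, d=64):
        super().__init__()
        self.rnn = nn.GRUCell(2 * d, d)          # fuses image + phrase per step
        self.decode = nn.Linear(d, 16 * 16)      # toy mask decoder

    def forward(self, img_feat, phrase_feats):   # (B, d), (B, n_refs, d)
        h = torch.zeros(img_feat.size(0), self.rnn.hidden_size)
        masks = []
        for t in range(phrase_feats.size(1)):    # one step per expression
            h = self.rnn(torch.cat([img_feat, phrase_feats[:, t]], dim=1), h)
            masks.append(torch.sigmoid(self.decode(h)).view(-1, 16, 16))
        return torch.stack(masks, dim=1)          # (B, n_refs, 16, 16)

masks = RecurrentReferringSegmenter()(torch.randn(2, 64), torch.randn(2, 3, 64))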
\
Diabetic retinopathy (DR) is an eye disease associated with diabetes mellitus that affects retinal blood vessels. Early detection is crucial to prevent vision loss. The most common method for detecting the disease is the analysis of digital fundus images, which show lesions of small vessels and functional abnormalities.
Manual detection and segmentation of lesions is a time-consuming task requiring proficient skills. Automatic methods for retinal image analysis could help ophthalmologists in large-scale screening programs of populations with diabetes mellitus, allowing cost-effective and accurate diagnosis.
In this work we propose a fully convolutional neural network with adversarial training to automatically segment DR lesions in funduscopy images.\
}, author = {Nat{\`a}lia Gull{\'o}n and Ver{\'o}nica Vilaplana} } @conference {cVenturaa, title = {RVOS: End-to-End Recurrent Network for Video Object Segmentation}, booktitle = {CVPR}, year = {2019}, month = {06/2019}, publisher = {OpenCVF / IEEE}, organization = {OpenCVF / IEEE}, address = {Long Beach, CA, USA}, abstract = {Multiple-object video object segmentation is a challenging task, especially for the zero-shot case, when no object mask is given at the initial frame and the model has to find the objects to be segmented along the sequence. In our work, we propose RVOS, a recurrent network that is fully end-to-end trainable for multiple-object video object segmentation, with a recurrence module working on two different domains: (i) the spatial, which allows discovering the different object instances within a frame, and (ii) the temporal, which allows keeping the coherence of the segmented objects along time. We train RVOS for zero-shot video object segmentation and are the first ones to report quantitative results for the DAVIS-2017 and YouTube-VOS benchmarks. Further, we adapt RVOS for one-shot video object segmentation by using the masks obtained in previous time-steps as inputs to be processed by the recurrent module. Our model reaches comparable results to state-of-the-art techniques in the YouTube-VOS benchmark and outperforms all previous video object segmentation methods not using online learning in the DAVIS-2017 benchmark.
Magnetic resonance imaging (MRI) provides high resolution brain morphological information and is used as a biomarker in neurodegenerative diseases. Population studies of brain morphology often seek to identify pathological structural changes related to different diagnostic categories (e.g. controls, mild cognitive impairment or dementia), which normally describe highly heterogeneous groups with a single categorical variable. Instead, multiple biomarkers are used as a proxy for pathology and are more powerful in capturing structural variability. Hence, using the joint modeling of brain morphology and biomarkers, we aim at describing structural changes related to any brain condition by means of a few underlying processes. In this regard, we use a multivariate approach based on Projection to Latent Structures in its regression variant (PLSR) to study structural changes related to aging and AD pathology. MRI volumetric and cortical thickness measurements are used for brain morphology, and cerebrospinal fluid (CSF) biomarkers (t-tau, p-tau and amyloid-beta) are used as a proxy for AD pathology. By relating both sets of measurements, PLSR finds a low-dimensional latent space describing AD pathological effects on brain structure. The proposed framework also allows us to separately model aging effects on brain morphology as a confounder variable orthogonal to the pathological effect. The predictive power of the associated latent spaces (i.e. the capacity of predicting biomarker values) is assessed in a cross-validation framework.
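The core of the analysis above is a standard PLS regression between an imaging matrix and a biomarker matrix. A minimal sketch with synthetic stand-ins (all sizes and data are assumptions for illustration):

```python
# PLSR relating imaging features (X) to CSF biomarkers (Y), with CV.
import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_predict

rng = np.random.default_rng(0)
n_subjects, n_rois, n_biomarkers = 200, 84, 3    # hypothetical sizes
X = rng.normal(size=(n_subjects, n_rois))        # volumetric / thickness features
Y = rng.normal(size=(n_subjects, n_biomarkers))  # t-tau, p-tau, amyloid-beta

pls = PLSRegression(n_components=2)              # few latent factors
Y_pred = cross_val_predict(pls, X, Y, cv=5)      # predictive power via CV
pls.fit(X, Y)
print("latent scores shape:", pls.x_scores_.shape)  # (200, 2)
```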
}, keywords = {CSF biomarkers, Latent model, MRI, PLS, preclinical AD}, doi = {10.1109/JBHI.2019.2932565}, author = {Adri{\`a} Casamitjana and Paula Petrone and J.L. Molinuevo and Juan D. Gispert and Ver{\'o}nica Vilaplana} } @conference {cLinardosa, title = {Simple vs complex temporal recurrences for video saliency prediction}, booktitle = {British Machine Vision Conference (BMVC)}, year = {2019}, month = {09/2019}, publisher = {British Machine Vision Association}, organization = {British Machine Vision Association}, address = {Cardiff, Wales / UK.}, abstract = {This paper investigates modifying an existing neural network architecture for static saliency prediction using two types of recurrences that integrate information from the temporal domain. The first modification is the addition of a ConvLSTM within the architecture, while the second is a computationally simple exponential moving average of an internal convolutional state. We use weights pre-trained on the SALICON dataset and fine-tune our model on DHF1K. Our results show that both modifications achieve state-of-the-art results and produce similar saliency maps.
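The computationally simple recurrence referred to above is just an exponential moving average of an internal convolutional state. A minimal sketch, assuming a hypothetical stand-in convolution and illustrative shapes:

```python
# Exponential moving average of an internal conv state across video frames.
import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, 3, padding=1)    # stand-in for the internal conv layer

def ema_recurrence(frames, alpha=0.1):
    """Blend each new conv state with the running average."""
    state = None
    outputs = []
    for x in frames:                    # x: (B, 3, H, W)
        feat = conv(x)
        state = feat if state is None else alpha * feat + (1 - alpha) * state
        outputs.append(state)
    return outputs

frames = [torch.randn(1, 3, 64, 64) for _ in range(5)]
print(ema_recurrence(frames)[-1].shape)  # torch.Size([1, 8, 64, 64])
```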
Quantification of white matter hyperintensities (WMH) of presumed vascular origin is of key importance in many neurological research studies. Advanced measurements are obtained from manual segmentations on brain MR images, which is a laborious procedure. Automatic WMH segmentation methods exist, but a standardized comparison of such methods is lacking. We organized a scientific challenge, in which developers could evaluate their method on a standardized multi-center/-scanner image dataset, giving an objective comparison: the WMH Segmentation Challenge (http://wmh.isi.uu.nl/). Sixty T1+FLAIR images from three MR scanners were released with manual WMH segmentations. A secret test set of 110 images from five MR scanners was used for evaluation. Methods had to be containerized and submitted to the challenge organizers. Five evaluation metrics were used to rank the methods: (1) Dice Similarity Coefficient, (2) modified Hausdorff distance (95th percentile), (3) absolute percentage volume difference, (4) sensitivity for detecting individual lesions, and (5) F1-score for individual lesions. Additionally, methods were ranked on their inter-scanner robustness. Twenty participants submitted their method for evaluation. This paper provides a detailed analysis of the results. In brief, there is a cluster of four methods that rank significantly better than the other methods. There is one clear winner, which also has the best inter-scanner robustness. The challenge remains open for future submissions and provides a public platform for method evaluation.
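Two of the five ranking metrics can be computed directly from binary masks; a small sketch with synthetic NumPy masks (the lesion-level metrics would additionally require connected-component analysis):

```python
# Dice coefficient and absolute percentage volume difference on binary masks.
import numpy as np

def dice(pred, gt):
    """Dice Similarity Coefficient between two binary masks."""
    inter = np.logical_and(pred, gt).sum()
    return 2.0 * inter / (pred.sum() + gt.sum())

def volume_difference(pred, gt):
    """Absolute percentage volume difference."""
    return abs(int(pred.sum()) - int(gt.sum())) / gt.sum() * 100.0

pred = np.zeros((64, 64), bool); pred[10:20, 10:20] = True
gt   = np.zeros((64, 64), bool); gt[12:22, 10:20] = True
print(dice(pred, gt), volume_difference(pred, gt))  # 0.8  0.0
```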
}, keywords = {brain, Evaluation and performance, Magnetic resonance imaging (MRI), segmentation}, issn = {0278-0062}, doi = {10.1109/TMI.2019.2905770}, author = {Hugo Kuijf and Matthijs Biesbroek and Jeroen de Bresser and Rutger Heinen and Simon Andermatt and Mariana Bento and Matt Berseth and Mikhail Belyaev and Jorge Cardoso and Adri{\`a} Casamitjana and Louis Collins and Mahsa Dadar and Achileas Georgiou and Mohsen Ghafoorian and Dakai Jin and April Khademi and Jesse Knight and Hongwei Li and Xavier Llado and Miguel Luna and Qaiser Mahmood and Richard McKinley and Alireza Mehrtash and Sebastien Ourselin and Bo-yong Park and Hyunkin Park and Sang Hyun Park and Simon Pezold and Elodie Puybareau and Leticia Rittner and Carole Sudre and Sergi Valverde and Ver{\'o}nica Vilaplana and Rolan Wiest and Yongchao Xu and Ziyue Xu and Guodong Zeng and Jianguo Zhang and Guoyan Zheng and Christoper Chen and Wiesje van der Flier and Frederik Barkhof and Max Viergever and Geert Jan Biessels} } @conference {cGene-Mola19, title = {Uso de redes neuronales convolucionales para la detecci{\'o}n remota de frutos con c{\'a}maras RGB-D}, booktitle = {Congreso Ib{\'e}rico de Agroingenier{\'\i}a}, year = {2019}, month = {09/2019}, publisher = { Universidad de Zaragoza (UZA)}, organization = { Universidad de Zaragoza (UZA)}, address = {Huesca}, abstract = {Remote fruit detection will be an indispensable tool for the optimized and sustainable agronomic management of the fruit orchards of the future, with applications in harvest forecasting, harvesting robotization and yield mapping. This work proposes the use of RGB-D depth cameras for fruit detection and subsequent 3D localization. The equipment used for data acquisition consists of a self-propelled terrestrial platform fitted with two Microsoft Kinect v2 sensors and an RTK-GNSS positioning system. With this equipment, 3 rows of Fuji apple trees in a commercial orchard were scanned. The acquired dataset is composed of 110 captures containing a total of 12,838 Fuji apples. Fruit detection was performed on the RGB data (colour images provided by the sensor). To this end, the Faster R-CNN object detection convolutional neural network was implemented and trained; it is composed of two modules: a region proposal network and a classification network. Both modules share the first convolutional layers, following the VGG-16 model pre-trained on the ImageNet database. Test results show a detection rate of 91.4\% of the fruits with 15.9\% false positives (F1-score = 0.876). A qualitative evaluation of the detections shows that the false positives correspond to image regions with a pattern very similar to an apple, where it is difficult to determine whether there is an apple or not, even for the human eye. On the other hand, the undetected apples correspond to those that were almost completely occluded by other vegetative organs (leaves or branches) or to apples cut off by the image margins. From the experimental results it is concluded that the Kinect v2 sensor has great potential for fruit detection and 3D localization. The main limitation of the system is that the performance of the depth sensor is degraded under high-illumination conditions.
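As a quick sanity check, the reported detection rate (recall) and false positive rate imply the published F1-score:

```python
# Recall 91.4% with 15.9% false positives implies precision 84.1%,
# which reproduces the stated F1-score of 0.876.
recall = 0.914
precision = 1.0 - 0.159
f1 = 2 * precision * recall / (precision + recall)
print(round(f1, 3))  # 0.876
```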
}, keywords = {Depth cameras, Fruit detection, Convolutional neural networks, RGB-D, Agricultural robotics}, doi = {https://doi.org/10.26754/c_agroing.2019.com.3325}, author = {Gen{\'e}-Mola, Jordi and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R. and Morros, J.R. and Ruiz-Hidalgo, J. and Gregorio, Eduard} } @mastersthesis {xGranero, title = {A Video Database for Analyzing Affective Physiological Responses}, year = {2019}, abstract = {Affective computing, leveraged by machine learning techniques, is advancing rapidly in the task of affect recognition in videos. However, there is a need for more annotated data. Several studies have built huge video datasets with emotion annotations. Others have collected music videos or film scenes datasets with physiological signals. However, none of them approached a solution with both physiological signals and user-generated videos. The work introduced here presents GALLUS, a novel database of user-generated videos with affective physiological responses. The database is composed of 775 videos that have been previously annotated through an online crowdsourcing platform. Physiological responses such as electroencephalography, electrocardiography, galvanic skin response, facial emotion recognition, and eye-gaze have been collected from 30 participants while they watched the stimuli. Our dataset will be made public to foster research in affect recognition.
The goal of this work is to segment, in a video sequence, the objects that are mentioned in a linguistic description of the scene. We have adapted an existing deep neural network that achieves state-of-the-art performance in semi-supervised video object segmentation, adding a linguistic branch that generates an attention map over the video frames, making the segmentation of the objects temporally consistent along the sequence.
}, author = {Herrera-Palacio, Alba and Ventura, C. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xNieto, title = {Video Saliency Prediction with Deep Neural Networks}, year = {2019}, abstract = {Saliency prediction is a topic undergoing intense study in computer vision with a broad range of applications. It consists of predicting where a human will focus their attention in an image or a video. Our work is based on a deep neural network named SalGAN, which was trained on a saliency annotated dataset of static images. In this thesis we investigate different approaches for extending SalGAN to the video domain. To this end, we use the recently proposed saliency annotated video dataset DHF1K to train and evaluate our models. The obtained results indicate that techniques such as depth estimation or coordconv can effectively be used as additional modalities to enhance the saliency prediction of static images obtained with SalGAN, achieving encouraging results in the DHF1K benchmark. Our implementation is based on PyTorch and is publicly available.
In this article we present a web platform used by media producers to monitor world events, detected by VLX-Stories. The event detection system retrieves multi-regional articles from news sites, aggregates them by topic, and summarizes them by extracting and structuring their most relevant entities in order to answer the journalism W{\textquoteright}s: who, what, when and where. The dashboard displays the detected events in a semantically linked space which allows navigation among trending news stories across distinct countries, categories and time. Moreover, detected events are linked to customer contents, helping the editorial process by providing real-time access to breaking news related to their contents. (Demo paper)
}, url = {http://ceur-ws.org/Vol-2456/paper61.pdf}, author = {Fern{\`a}ndez, D{\`e}lia and Bou, Elisenda and Xavier Gir{\'o}-i-Nieto} } @conference {cFernandezd, title = {VLX-Stories: building an online Event Knowledge Base with Emerging Entity detection}, booktitle = {The Semantic Web {\textendash} ISWC 2019}, year = {2019}, month = {10/2019}, pages = {382-399}, publisher = {Springer, Cham}, organization = {Springer, Cham}, chapter = {24}, address = {Auckland, New Zealand}, abstract = {We present an online multilingual system for event detection and comprehension from media feeds. The system retrieves information from news sites and social networks, aggregates them into events (event detection), and summarizes them by extracting semantic labels of its most relevant entities (event representation) in order to answer the journalism W{\textquoteright}s: who, what, when and where. The generated events populate VLX-Stories -an event Knowledge Base (KB)- transforming unstructured text data to a structured knowledge base representation. Our system exploits an external entity Knowledge Base (VLX-KG) to help populate VLX-Stories. At the same time, this external knowledge base can also be extended with a Dynamic Entity Linking (DEL) module, which detects Emerging Entities (EE) on unstructured data and adds them to VLX-KG. The system is currently used in production, detecting over 6000 monthly events from over 3500 news feeds from seven different countries and in three different languages.
Interest in image synthesis has grown rapidly in recent years. A few years ago, a very powerful tool for this task was introduced: Generative Adversarial Networks (GANs). As their ability to generate realistic images has been proven, many researchers are now focusing on cross-modal learning.
Taking advantage of the large amount of information we can extract from speech (such as identity, gender or emotional state), in this work we explore its potential to generate face images of a speaker by conditioning a GAN with his/her voice. We propose the enhancement and evaluation of a deep neural network that is trained from scratch in an end-to-end fashion, generating a face directly from the raw speech waveform without any additional identity information (e.g. reference image or one-hot encoding).
This project focuses on the enhancement of a previous model proposed by Francisco Roldan. As a result of a deep analysis of the former project's strengths and weaknesses, we present a novel dataset collected for this work, with high-quality videos of ten youtubers with notable expressiveness in both the speech and visual signals. In addition, unlike the preliminary project, four different techniques are proposed in order to assess the results.
}, author = {Tubau, Miquel}, editor = {Amanda Duarte and Xavier Gir{\'o}-i-Nieto} } @conference {cDuartea, title = {Wav2Pix: Speech-conditioned Face Generation using Generative Adversarial Networks}, booktitle = {ICASSP}, year = {2019}, month = {05/2019}, publisher = {IEEE}, organization = {IEEE}, address = {Brighton, UK}, abstract = {Speech is a rich biometric signal that contains information about the identity, gender and emotional state of the speaker. In this work, we explore its potential to generate face images of a speaker by conditioning a Generative Adversarial Network (GAN) with raw speech input. We propose a deep neural network that is trained from scratch in an end-to-end fashion, generating a face directly from the raw speech waveform without any additional identity information (e.g. reference image or one-hot encoding). Our model is trained in a self-supervised fashion by exploiting the audio and visual signals naturally aligned in videos. With the purpose of training from video data, we present a novel dataset collected for this work, with high-quality videos of ten youtubers with notable expressiveness in both the speech and visual signals.
This bachelor{\textquoteright}s thesis explores different ways of building a block-based Speech Translation system with the aim of generating huge amounts of parallel speech data. The first goal is to research and manage to run suitable tools to implement each of the three blocks that integrate the Speech Translation system: Speech Recognition, Translation and Speech Synthesis. We experiment with some open-source toolkits and manage to train a speech recognition system and a neural machine translation system. Then, we test them to evaluate their performance. As an alternative option, we use the cloud computing solutions provided by Google Cloud to implement the three sequential blocks and successfully build the overall system. Finally, we present a comparative study of an in-house software development versus a cloud computing implementation.
In the last decade, magnetic resonance imaging has unveiled specific AD alterations at different stages of the AD pathophysiologic continuum that constitute what has been established as the AD signature. To what extent MRI can detect structural changes at the preclinical asymptomatic stage of AD - the preclinical AD signature - is still an area open for exploration. Our work supports the idea that there are brain volumetric changes specific to preclinical AD subjects and defines the preclinical AD signature based on longitudinal data. While some regions show a pattern of atrophy that overlaps with the AD signature, other specific regions exhibit changes that are unique to this early asymptomatic AD stage.
}, author = {P. Petrone and Adri{\`a} Casamitjana and C. Falcon and M. Artigues and G. Operto and S. Skouras and R. Cacciaglia and J.L. Molinuevo and Ver{\'o}nica Vilaplana and J.D. Gispert} } @conference {cFojo, title = {Comparing Fixed and Adaptive Computation Time for Recurrent Neural Network}, booktitle = {International Conference on Learning Representations (ICLR)}, year = {2018}, month = {02/2018}, address = {Vancouver, Canada}, abstract = {Deep networks commonly perform better than shallow ones, but allocating the proper amount of computation for each particular input sample remains an open problem. This issue is particularly challenging in sequential tasks, where the required complexity may vary for different tokens in the input sequence. Adaptive Computation Time (ACT) was proposed as a method for dynamically adapting the computation at each step for Recurrent Neural Networks (RNN). ACT introduces two main modifications to the regular RNN formulation: (1) more than one RNN step may be executed between the moment an input sample is fed to the layer and the moment this layer generates an output, and (2) this number of steps is dynamically predicted depending on the input token and the hidden state of the network. In our work, we aim at gaining intuition about the contribution of these two factors to the overall performance boost observed when augmenting RNNs with ACT. We design a new baseline, Repeat-RNN, which performs a constant number of RNN state updates larger than one before generating an output. Surprisingly, such uniform distribution of the computational resources matches the performance of ACT in the studied tasks. We hope that this finding motivates new research efforts towards designing RNN architectures that are able to dynamically allocate computational resources.
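The Repeat-RNN baseline is simple to state in code: every token updates the recurrent state a fixed number of times before an output is emitted. A minimal sketch, with an assumed GRU cell and illustrative sizes:

```python
# Repeat-RNN: a constant number of state updates per input token.
import torch
import torch.nn as nn

class RepeatRNN(nn.Module):
    def __init__(self, input_size, hidden_size, repeats=3):
        super().__init__()
        self.cell = nn.GRUCell(input_size, hidden_size)
        self.repeats = repeats

    def forward(self, x):                      # x: (T, B, input_size)
        h = x.new_zeros(x.size(1), self.cell.hidden_size)
        outputs = []
        for x_t in x:                          # one token at a time
            for _ in range(self.repeats):      # fixed number of state updates
                h = self.cell(x_t, h)
            outputs.append(h)
        return torch.stack(outputs)            # (T, B, hidden_size)

out = RepeatRNN(10, 32)(torch.randn(5, 2, 10))
print(out.shape)  # torch.Size([5, 2, 32])
```

In contrast to ACT, the repeat count here is a plain hyperparameter rather than a quantity predicted from the input token and hidden state.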
The increasing amount of online videos brings several opportunities for training self-supervised neural networks. The creation of large scale datasets of videos such as the YouTube-8M allows us to deal with this large amount of data in a manageable way. In this work, we find new ways of exploiting this dataset by taking advantage of the multi-modal information it provides. By means of a neural network, we are able to create links between audio and visual documents, by projecting them into a common region of the feature space, obtaining joint audio-visual embeddings. These links are used to retrieve audio samples that fit well to a given silent video, and also to retrieve images that match a given query audio. The results in terms of Recall@K obtained over a subset of YouTube-8M videos show the potential of this unsupervised approach for cross-modal feature learning. We train embeddings for both scales and assess their quality in a retrieval problem, formulated as using the feature extracted from one modality to retrieve the most similar videos based on the features computed in the other modality.
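The Recall@K evaluation described above reduces to ranking cosine similarities between the two embedding sets. A small sketch, assuming the joint audio and visual embeddings are already computed:

```python
# Recall@K for cross-modal retrieval between paired embedding sets.
import torch
import torch.nn.functional as F

def recall_at_k(audio_emb, video_emb, k=5):
    """Fraction of audio queries whose paired video is in the top-k."""
    sims = F.normalize(audio_emb) @ F.normalize(video_emb).t()
    topk = sims.topk(k, dim=1).indices
    targets = torch.arange(len(audio_emb)).unsqueeze(1)
    return (topk == targets).any(dim=1).float().mean().item()

audio = torch.randn(100, 128)   # hypothetical joint embeddings, row i paired
video = torch.randn(100, 128)   # with row i of the other modality
print(recall_at_k(audio, video, k=5))
```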
3D technology is key for a wide range of industries. Medicine, construction, cinema and many other disciplines can nowadays digitalize the world we perceive using 3D reconstruction algorithms, create new objects by means of 3D printers, or analyze the world using 3D detection and segmentation techniques.
These tools are transforming how research and industry problems are addressed. Concretely, in the field of aesthetic surgery, fluent communication between doctors and patients is crucial in order to maximize the success of the operations and the satisfaction of the patients. A new trend in the industry is to incorporate 3D technology during the consultation, with the goal of improving that communication. By reconstructing the body of the patient and simulating aesthetic procedures on it, he or she can see how a concrete modification would look when applied to his or her body. It is also beneficial for the physicians, since they can measure the effectiveness of the applied treatments and also convert more consultations into operations due to an increase in the patient's confidence.
This thesis proposal defines the research directions to follow during an industrial doctorate to be developed at Crisalix Labs, in collaboration with the Image Processing Group at the Universitat Polit{\`e}cnica de Catalunya. Industrial doctorates are promoted by the Government of Catalonia to foster the transfer of knowledge from universities to local industry as an element of innovation and technical excellence.
}, author = {Ramon, Eduard}, editor = {Xavier Gir{\'o}-i-Nieto} } @conference {cGomez, title = {Demonstration of an Open Source Framework for Qualitative Evaluation of CBIR Systems}, booktitle = {ACM Multimedia}, year = {2018}, month = {10/2018}, publisher = {ACM}, organization = {ACM}, address = {Seoul, South Korea}, abstract = {Evaluating image retrieval systems in a quantitative way, for example by computing measures like mean average precision, allows for objective comparisons with a ground-truth. However, in cases where ground-truth is not available, the only alternative is to collect feedback from a user. Thus, qualitative assessments become important to better understand how the system works. Visualizing the results could be, in some scenarios, the only way to evaluate the results obtained and also the only opportunity to identify that a system is failing. This necessitates developing a User Interface (UI) for a Content Based Image Retrieval (CBIR) system that allows visualization of results and improvement via capturing user relevance feedback. A well-designed UI facilitates understanding of the performance of the system, both in cases where it works well and perhaps more importantly those which highlight the need for improvement. Our open-source system implements three components to facilitate researchers to quickly develop these capabilities for their retrieval engine. We present: a web-based user interface to visualize retrieval results and collect user annotations; a server that simplifies connection with any underlying CBIR system; and a server that manages the search engine data.
This paper illustrates the work around English to American Sign Language (ASL) data generation for the speech2signs system, which is devoted to the generation of a sign language interpreter. The current work provides, first, an approximation to the speech2signs system and, second, a video-to-video corpus generator for an end-to-end approximation of speech2signs. In order to generate the desired corpus data, the Google Transformer (a Neural Machine Translation system based completely on attention) will be trained to translate from English to ASL. The dataset used to train the Transformer is the ASLG-PC12.
}, author = {Moreno, Daniel and Costa-juss{\`a}, Marta R. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {x, title = {Exploring Automatic Speech Recognition with TensorFlow}, year = {2018}, abstract = {Advisors: Marta R. Costa-juss{\`a} (TALP, UPC) and Xavier Giro-i-Nieto (GPI, UPC)
Grade: A (9.8/10.9)
Speech recognition is the task aiming to identify words in spoken language and convert them into text. This bachelor{\textquoteright}s thesis focuses on using deep learning techniques to build an end-to-end Speech Recognition system. As a preliminary step, we review the most relevant methods carried out over the last several years. Then, we study one of the latest proposals for this end-to-end approach that uses a sequence to sequence model with attention-based mechanisms. Next, we successfully reproduce the model and test it over the TIMIT database. We analyze the similarities and differences between the current implementation proposal and the original theoretical work. And finally, we experiment and contrast using different parameters (e.g. number of layer units, learning rates and batch sizes) and reduce the Phoneme Error Rate by almost 12\% relative.
The development of reliable fruit detection and localization systems is essential for future sustainable agronomic management of high-value crops. To date, most proposed systems for fruit detection and characterization are based on RGB cameras and thus affected by intrinsic constraints, such as variable lighting conditions and camera calibration. This work presents a new technique that uses a mobile terrestrial laser scanner to detect and localize fruits regardless of the prevailing lighting conditions and without the need of a previous calibration. An experimental test focused on two Fuji apple trees (containing 139 and 145 apples each) was carried out. A 3D point cloud of this scene was generated using a Velodyne VLP-16 LiDAR sensor synchronized with a RTK-GNSS receiver. A reflectivity analysis of tree elements was performed, obtaining mean reflectivity values of 28.9\%, 29.1\%, and 44.3\% for leaves, trunks, and fruits, respectively. These results suggest that the reflectivity parameter can be useful to localize fruits in the tree. From this knowledge, a three-step fruit detection algorithm has been developed: 1) reflectivity thresholding to remove most of the leaves and trunks from the original point cloud; 2) statistical outlier removal to reduce noise; 3) connected components clustering using a density-based algorithm. By applying this algorithm to our dataset, a localization success of 85\%, a detachment success of 78.8\%, and a false detection rate of 15.2\% were obtained. These detection rates are similar to those obtained by current RGB-based systems, but with the additional advantage of providing direct 3D fruit location information (global coordinates) which is not affected by sunlight variations. It can be concluded that LiDAR technology and, particularly, its reflectivity information, might have potential use in fruit detection. Future work should include the application of this fruit detection technique on a wider range of crop types.
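The three-step algorithm maps naturally onto standard tooling; the following sketch runs it on a synthetic point cloud with columns (x, y, z, reflectivity). All thresholds and sizes are illustrative assumptions, not the paper's calibrated values:

```python
# Three-step fruit detection: threshold reflectivity, remove outliers, cluster.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(0)
canopy = rng.uniform(0, 1, size=(5000, 4))               # leaves/trunk clutter
apple = np.hstack([rng.normal([0.5, 0.5, 0.5], 0.01, (200, 3)),
                   rng.uniform(0.45, 0.60, (200, 1))])   # one dense, bright blob
cloud = np.vstack([canopy, apple])

# 1) Reflectivity thresholding (fruits ~44% vs ~29% for leaves and trunks).
pts = cloud[cloud[:, 3] > 0.40][:, :3]

# 2) Statistical outlier removal: drop points far from their nearest neighbors.
dists, _ = NearestNeighbors(n_neighbors=8).fit(pts).kneighbors(pts)
mean_d = dists[:, 1:].mean(axis=1)                       # skip self-distance
pts = pts[mean_d < mean_d.mean() + 2 * mean_d.std()]

# 3) Connected-components clustering with a density-based algorithm.
labels = DBSCAN(eps=0.05, min_samples=10).fit_predict(pts)
n_fruits = len(set(labels)) - (1 if -1 in labels else 0)
print("fruit candidates:", n_fruits)  # 1 for this synthetic scene
```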
}, author = {Gen{\'e}-Mola, Jordi and Gregorio, Eduard and Guevara, Javier and Auat, Fernando and Escol{\`a}, Alexandre and Morros, J.R. and Rosell-Polo, Joan R.} } @mastersthesis {xColl-Pol, title = {The Importance of Time in Visual Attention Models}, year = {2018}, abstract = {Predicting visual attention is a very active field in the computer vision community. Visual attention is a mechanism of the visual system that can select relevant areas within a scene. Models for saliency prediction are intended to automatically predict which regions are likely to be attended by a human observer. Traditionally, ground truth saliency maps are built using only the spatial position of the fixation points, these fixation points being the locations where an observer fixates their gaze when viewing a scene. In this work we explore encoding the temporal information as well, and assess it in the application of predicting saliency maps with deep neural networks. It has been observed that the later fixations in a scanpath are usually selected randomly during visualization, especially in those images with few regions of interest. Therefore, computer vision models have difficulties learning to predict them. In this work, we explore a temporal weighting over the saliency maps to better cope with this random behaviour. The newly proposed saliency representation assigns different weights depending on the position in the sequence of gaze fixations, giving more importance to early timesteps than later ones. We used these maps to train MLNet, a state-of-the-art model for saliency map prediction. MLNet predictions were evaluated and compared to the results obtained when the model was trained using traditional saliency maps. Finally, we show how the temporally weighted saliency maps brought some improvement when used to weight the visual features in an image retrieval task.
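One way to read the temporally weighted representation is an exponential decay over fixation order before the usual Gaussian blurring; the decay rule below is an assumption for illustration, not the exact weighting used in the thesis:

```python
# Ground-truth saliency map with larger weights for earlier fixations.
import numpy as np
from scipy.ndimage import gaussian_filter

def weighted_saliency_map(fixations, shape, decay=0.8, sigma=10):
    """fixations: list of (row, col) points in temporal order."""
    sal = np.zeros(shape)
    for t, (r, c) in enumerate(fixations):
        sal[r, c] += decay ** t          # early timesteps weigh more
    sal = gaussian_filter(sal, sigma)    # spatial spread of each fixation
    return sal / sal.max()

fix = [(30, 40), (100, 120), (60, 200)]  # hypothetical scanpath
print(weighted_saliency_map(fix, (240, 320)).shape)  # (240, 320)
```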
Evolution Strategies (ES) emerged as a scalable alternative to popular Reinforcement Learning (RL) techniques, providing an almost perfect speedup when distributed across hundreds of CPU cores thanks to a reduced communication overhead. Despite providing large improvements in wall-clock time, ES is data inefficient when compared to competing RL methods. One of the main causes of such inefficiency is the collection of large batches of experience, which are discarded after each policy update. In this work, we study how to perform more than one update per batch of experience by means of Importance Sampling while preserving the scalability of the original method. The proposed method, Importance Weighted Evolution Strategies (IW-ES), shows promising results and is a first step towards designing efficient ES algorithms.
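The idea of reusing a batch of experience can be sketched in a few lines: candidates sampled around the previous policy are re-weighted by the ratio of their densities under the current and the original search distributions. The toy objective and all hyper-parameters below are assumptions for illustration:

```python
# Several ES updates from one batch of experience via importance sampling.
import numpy as np

rng = np.random.default_rng(0)
f = lambda th: -np.sum(th ** 2)          # toy reward to maximize
dim, pop, sigma, lr = 10, 50, 0.1, 0.05

theta = rng.normal(size=dim)
eps = rng.normal(size=(pop, dim))
cand = theta + sigma * eps               # perturbed policies (sampled once)
rewards = np.array([f(c) for c in cand]) # evaluated once, then reused
theta_old = theta.copy()

for _ in range(3):                       # more than one update per batch
    # Self-normalized importance weights: density under the current search
    # distribution divided by density under the one the batch came from.
    logw = (-((cand - theta) ** 2).sum(1)
            + ((cand - theta_old) ** 2).sum(1)) / (2 * sigma ** 2)
    w = np.exp(logw - logw.max()); w /= w.sum()
    grad = (w[:, None] * (cand - theta) * rewards[:, None]).sum(0) / sigma ** 2
    theta = theta + lr * grad            # gradient ascent on expected reward
print("reward before/after:", f(theta_old), f(theta))
```

On the first pass theta equals theta_old, so the weights are uniform and the update reduces to the vanilla ES estimator; later passes correct for the shifted search distribution.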
}, author = {V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @conference {cAlsina, title = {An Interactive Lifelog Search Engine for LSC2018}, booktitle = {Lifelog Search Challenge workshop at ICMR2018}, year = {2018}, month = {06/2018}, publisher = {ACM}, organization = {ACM}, address = {Yokohama, Japan}, abstract = {In this work, we describe an interactive lifelog search engine developed for the LSC 2018 search challenge at ACM ICMR 2018. The paper introduces the four-step process required to support lifelog search engines and describes the source data for the search engine as well as the approach to ranking chosen for the iterative search engine. Finally the interface used is introduced before we highlight the limits of the current prototype and suggest opportunities for future work.
This thesis consists of developing an interactive lifelog search engine for the LSC 2018 search challenge at ACM ICMR 2018. This search engine is created in order to browse for images from a given lifelog dataset and display them along with some written information related to them and four other images providing contextualization about the searched one.
First of all, the work introduces the relevance of this project: it presents the main social problems addressed and the aim of the project in dealing with them. It then defines the scope of the project and its main objectives, and reviews the current state of similar prototypes in order to highlight the differences that our project presents.
After the project approach is presented, the thesis walks through the methodology and creation process, going deep into the main aspects, explaining each choice and decision, and remarking on the limits of the current prototype.
Additionally, the project concludes with a results section where the system is tested with six users, who are asked to find three specific images using the search engine. This test is divided into two sections: first, a qualitative section where the users are asked to try the system and fill out a survey on how comfortable it is for them; and a second, more quantitative section, where they rate the speed of the system.
Finally, the project reviews the current and future ethics of lifelogging in general, and closes with conclusions, further investigation and future improvements.
}, author = {Alsina, Adri{\`a}}, editor = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @article {aDimiccoli18, title = {Introduction to the special issue: Egocentric Vision and Lifelogging}, journal = {Journal of Visual Communication and Image Representation}, year = {2018}, month = {06/2018}, abstract = {
Deep learning models not only achieve superior performance in image recognition tasks, but also in predicting where and when users focus their attention. This talk will provide an overview of how convolutional neural networks have been trained to predict saliency maps that describe the probability of fixing the gaze on each image location. Different solutions have been proposed for this task, and our recent work has added a temporal dimension by predicting the gaze scanpath over 360-degree images for VR/AR. These techniques allow simulating eye tracker data without the need for user data collection.
Leishmaniasis is considered a neglected disease that causes thousands of deaths annually in some tropical and subtropical countries. There are various techniques to diagnose leishmaniasis of which manual microscopy is considered to be the gold standard. There is a need for the development of automatic techniques that are able to detect parasites in a robust and unsupervised manner. In this paper we present a procedure for automating the detection process based on a deep learning approach. We train a U-net model that successfully segments leishmania parasites and classifies them into promastigotes, amastigotes and adhered parasites.
}, author = {G{\'o}rriz, Marc and Albert Aparicio and Berta Ravent{\'o}s and Daniel L{\'o}pez-Codina and Ver{\'o}nica Vilaplana and Elisa Sayrol} } @inbook {bGorriz18, title = {Leishmaniasis Parasite Segmentation and Classification Using Deep Learning}, booktitle = { Articulated Motion and Deformable Objects}, volume = {10945}, number = {Lecture Notes in Computer Science}, year = {2018}, pages = {53-62}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {Leishmaniasis is considered a neglected disease that causes thousands of deaths annually in some tropical and subtropical countries. There are various techniques to diagnose leishmaniasis of which manual microscopy is considered to be the gold standard. There is a need for the development of automatic techniques that are able to detect parasites in a robust and unsupervised manner. In this paper we present a procedure for automating the detection process based on a deep learning approach. We train a U-net model that successfully segments leishmania parasites and classifies them into promastigotes, amastigotes and adhered parasites.
}, issn = {978-3-319-94544-6}, doi = {10.1007/978-3-319-94544-6}, author = {G{\'o}rriz, Marc and Albert Aparicio and Berta Ravent{\'o}s and Ver{\'o}nica Vilaplana and Elisa Sayrol and Daniel L{\'o}pez-Codina} } @conference {cFernandezc, title = {Linking Media: adopting Semantic Technologies for multimodal media connection}, booktitle = {International Semantic Web Conference - ISWC (Industry Track)}, year = {2018}, month = {08/2018}, address = {Monterey, CA, USA}, abstract = {Today{\textquoteright}s media and news organizations are constantly generating large amounts of multimedia content, mostly delivered online. As the online media market grows, the management and delivery of contents is becoming a challenge. Computational approaches can help to overcome this challenge by governing different applications such as content creation, production, search, and its promotion and distribution to different audiences. In this abstract we present a success story of the adoption of semantic technologies on the aforementioned applications, which are built on top of a semantic tagging framework, based on a Knowledge Graph (KG). The presented pipeline combines multimodal inputs into a contextual entity linking module, which indexes documents and links them to trends and stories developing on the news. We will describe how documents are linked and provided to media producers through Vilynx{\textquoteright}s platform, which is currently indexing over 20k media documents a day.
The identification of healthy individuals harboring amyloid pathology constitutes one important challenge for secondary prevention clinical trials in Alzheimer{\textquoteright}s disease (AD). Consequently, noninvasive and cost-efficient techniques to detect preclinical AD constitute an unmet need of critical importance. In this manuscript, we apply machine learning to structural MRI (T1 and DTI) of 96 cognitively normal subjects to identify amyloid-positive ones. Models were trained on public ADNI data and validated on an independent local cohort. Used for subject classification in a simulated clinical trial setting, the proposed method is able to save 60\% of unnecessary CSF/PET tests and to reduce the cost of recruitment by 47\%. This recruitment strategy capitalizes on already acquired MRIs to reduce the overall amount of invasive PET/CSF tests in prevention trials, demonstrating a potential value as a tool for AD screening. This protocol could foster the development of secondary prevention strategies for AD.
}, author = {Adri{\`a} Casamitjana and Paula Petrone and Alan Tucholka and Carles Falcon and Stavros Skouras and Jose Luis Molinuevo and Ver{\'o}nica Vilaplana and Juan D. Gispert} } @unpublished {xFernandezb, title = {Multimodal Knowledge Base Population from News Streams for Media Applications}, year = {2018}, month = {07/2018}, type = {Phd thesis proposal}, abstract = {Media producers publish large amounts of multimedia content online - text, audio and video. As the online media market grows, the management and delivery of contents is becoming a challenge. Semantic and linking technologies can be used to organize and exploit these contents. This dissertation addresses the problem of integrating Semantic Web technologies and linked data technologies into Vilynx{\textquoteright}s platform, a system used by media producers to manage and exploit their contents. For that purpose, Knowledge Graphs (KG) and their maintenance through multimodal Knowledge Base Population (KBP) from online data extracted from the Web are studied. The Web is a very large unstructured data source with millions of texts, images, videos and audios. This thesis aims to generate solutions to facilitate automatic learning from these multimodal data and use it in real product applications for media.
This thesis is structured in three parts. The first part covers the construction of a multimodal KG, which will be the core of the system for knowledge extraction, standardization and contextualization.
The second part consists of the construction of the tools that will be used for KBP. For that we will construct a multimodal semantic tagging framework, based on the previously mentioned KG. This block addresses some typical challenges of KBP and data mining, such as named entity recognition (NER), entity linking (EL), context set construction (CSC), structured data creation, standardization, entity matching and data fusion.
The third part focuses on the extraction of knowledge from the Web to populate the knowledge base. As the KG domain is media, we will populate the KG using events detected from news streams from a multimodal perspective. To detect events we will construct a news aggregator system. This part deals with the problems of Topic Detection and Tracking (TDT), Topic Modeling (TM) and multi-document summarization. From these data we will learn relations between world entities that will populate our KG, dealing with the automatic detection and update of concepts and relations. Social media information will also be analyzed to understand trendiness and world interests.
}, keywords = {Entity Detection, Entity Linking, Knowledge Base Population, Knowledge Graph, Linked Technologies, Multi-document Summarization, multimedia, Multimodal Systems, Natural Language Processing, Semantic Web, Topic Detection and Tracking, Topic Modeling}, author = {Fern{\`a}ndez, D{\`e}lia and Bou-Balust, Elisenda and Xavier Gir{\'o}-i-Nieto} } @article {xGiro-i-Nieto18a, title = {One Perceptron to Rule them All}, year = {2018}, abstract = {
}, url = {https://arxiv.org/abs/1802.06822}, author = {Shou, Zheng and Pan, Junting and Chan, Johnatan and Miyazawa, Kazuyuki and Mansour, Hassan and Vetro, Anthony and Xavier Gir{\'o}-i-Nieto and Chang, Shih-Fu} } @conference {cLopez-Palma, title = {Oriented trajectories as a method for audience measurement}, booktitle = {27th International Symposium on Industrial Electronics (ISIE)}, year = {2018}, month = {06/2018}, publisher = {IEEE}, organization = {IEEE}, address = {Cairns, Australia}, abstract = {The quantification of the attention received by advertisements is of paramount importance to determine their effectiveness. In this work, a simple and effective objective method for the assessment of the attention given to advertisements is provided. The proposed method is based on computing the oriented trajectory of the different test persons along with their head pose. This way, it is possible to determine if a given person is looking towards the advertisement. While other similar methods use more complex setups, requiring a camera at each advertisement location, our method needs only a single (or a few) ceiling camera. Despite its apparent simplicity, the method can compute attention measures at each point of the scene.
}, author = {Manuel L{\'o}pez-Palma and Morros, J.R. and Javier Gago and Montserrat Corbal{\'a}n} } @conference {cAssensa, title = {PathGAN: Visual Scanpath Prediction with Generative Adversarial Networks}, booktitle = {ECCV 2018 Workshop on Egocentric Perception, Interaction and Computing (EPIC)}, year = {2018}, month = {07/2018}, publisher = {Springer}, organization = {Springer}, address = {Munich, Germany}, abstract = {We introduce PathGAN, a deep neural network for visual scanpath prediction trained on adversarial examples. A visual scanpath is defined as the sequence of fixation points over an image defined by a human observer with their gaze. PathGAN is composed of two parts, the generator and the discriminator. Both parts extract features from images using off-the-shelf networks, and train recurrent layers to generate or discriminate scanpaths accordingly. In scanpath prediction, the stochastic nature of the data makes it very difficult to generate realistic predictions using supervised learning strategies, but we adopt adversarial training as a suitable alternative. Our experiments prove how PathGAN improves the state of the art of visual scanpath prediction on the Salient360! dataset.
This work obtained the 2nd award in Prediction of Head-gaze Scan-paths for Images, and the 2nd award in Prediction of Eye-gaze Scan-paths for Images at the IEEE ICME 2018 Salient360! Challenge.
Partial Least Squares (PLS) is a mathematical technique that relates two sets of observable variables by means of a few latent explanatory factors. The aim of this study is to use PLS to discover the associations between CSF biomarkers and structural brain imaging in preclinical AD and to disentangle their specific contribution from confounding demographic factors. PLS is able to disentangle the cerebral morphometric patterns associated to preclinical AD stages from other demographic factors. Results with both cortical thickness and volumetric data present significant overlap, thus showing the robustness of this approach. Interestingly, volumetric data showed more significant correlations with CSF Abeta than cortical thickness.
}, author = {Adri{\`a} Casamitjana and P. Petrone and M. Artigues and J.L. Molinuevo and J.D. Gispert and Ver{\'o}nica Vilaplana} } @conference {cSalvadore, title = {Recurrent Neural Networks for Semantic Instance Segmentation}, booktitle = {CVPR 2018 DeepVision Workshop}, year = {2018}, month = {06/2018}, abstract = {We present a recurrent model for semantic instance segmentation that sequentially generates binary masks and their associated class probabilities for every object in an image. Our proposed system is trainable end-to-end from an input image to a sequence of labeled masks and, compared to methods relying on object proposals, does not require post-processing steps on its output. We study the suitability of our recurrent model on three different instance segmentation benchmarks, namely Pascal VOC 2012, CVPPP Plant Leaf Segmentation and Cityscapes. Further, we analyze the object sorting patterns generated by our model and observe that it learns to follow a consistent pattern, which correlates with the activations learned in the encoder part of our network.
}, author = {Amaia Salvador and M{\'\i}riam Bellver and Baradad, Manel and V{\'\i}ctor Campos and Marqu{\'e}s, F. and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cSalvadord, title = {Recurrent Neural Networks for Semantic Instance Segmentation}, booktitle = {ECCV 2018 Women in Computer Vision (WiCV) Workshop}, year = {2018}, month = {12/2017}, abstract = {We present a recurrent model for semantic instance segmentation that sequentially generates pairs of masks and their associated class probabilities for every object in an image. Our proposed system is trainable end-to-end, does not require post-processing steps on its output and is conceptually simpler than current methods relying on object proposals. We observe that our model learns to follow a consistent pattern to generate object sequences, which correlates with the activations learned in the encoder part of our network. We achieve competitive results on three different instance segmentation benchmarks (Pascal VOC 2012, Cityscapes and CVPPP Plant Leaf Segmentation).
The complexity of solving a problem can differ greatly from the complexity of posing that problem. Building a Neural Network capable of dynamically adapting to the complexity of the inputs would be a great feat for the machine learning community. One of the most promising approaches is Adaptive Computation Time for Recurrent Neural Networks (ACT) \parencite{act}. In this thesis, we implement ACT in two of the most used deep learning frameworks, PyTorch and TensorFlow. Both implementations are open source and publicly available. We use these implementations to evaluate the capability of ACT to learn algorithms from examples. We compare ACT with a proposed baseline where each input data sample of the sequence is read a fixed number of times, learned as a hyperparameter during training. Surprisingly, we do not observe any benefit from ACT when compared with this baseline solution, which opens new and unexpected directions for future research.
Convolutional Neural Networks (CNNs) are frequently used to tackle image classification and segmentation problems due to their recently proven successful results. In particular, in the medical domain, it is more and more common to see automated techniques that help doctors in their diagnosis. In this work, we study the retinal lesion segmentation problem using CNNs on the Indian Diabetic Retinopathy Image Dataset (IDRiD). Additionally, the idea of adversarial training used by Generative Adversarial Networks (GANs) will also be added to the previous CNN to improve its results, making segmentation maps more accurate and realistic. A comparison between these two architectures will be made. One of the main challenges we will be facing is the high imbalance between lesions and healthy parts of the retina and the fact that some lesion classes are very scattered in small fractions. Thus, different loss functions, optimizers and training schemes will be studied and evaluated to see which one best addresses our problem.
}, author = {Nat{\`a}lia Gull{\'o}n}, editor = {Ver{\'o}nica Vilaplana} } @conference {cMohedanob, title = {Saliency Weighted Convolutional Features for Instance Search}, booktitle = {Content-Based Multimedia Indexing - CBMI}, year = {2018}, month = {09/2018}, publisher = {IEEE}, organization = {IEEE}, address = {La Rochelle, France}, abstract = {This work explores attention models to weight the contribution of local convolutional representations for the instance search task. We present a retrieval framework based on bags of local convolutional features (BLCF) that benefits from saliency weighting to build an efficient image representation. The use of human visual attention models (saliency) allows significant improvements in retrieval performance without the need to conduct region analysis or spatial verification, and without requiring any feature fine tuning. We investigate the impact of different saliency models, finding that higher performance on saliency benchmarks does not necessarily equate to improved performance when used in instance search tasks. The proposed approach outperforms the state-of-the-art on the challenging INSTRE benchmark by a large margin, and provides similar performance on the Oxford and Paris benchmarks compared to more complex methods that use off-the-shelf representations.
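The saliency weighting itself is a simple spatial re-weighting of the local convolutional features; the sketch below sum-pools the weighted features into a global descriptor, which is a simplification of the bag-of-words assignment actually used in BLCF:

```python
# Saliency-weighted aggregation of local convolutional features.
import torch
import torch.nn.functional as F

feats = torch.randn(1, 512, 14, 14)     # local conv features (B, C, H, W)
saliency = torch.rand(1, 1, 56, 56)     # saliency map from any off-the-shelf model

sal = F.interpolate(saliency, size=feats.shape[-2:], mode="bilinear",
                    align_corners=False)         # match feature resolution
weighted = feats * sal                           # weight each spatial location
descriptor = F.normalize(weighted.sum(dim=(2, 3)))  # global image descriptor
print(descriptor.shape)  # torch.Size([1, 512])
```

No fine-tuning or spatial verification is involved: the weighting only changes how much each location contributes to the aggregated representation.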
We introduce deep neural networks for scanpath and saliency prediction trained on 360-degree images. The scanpath prediction model called SaltiNet is based on a temporal-aware novel representation of saliency information named the saliency volume. The first part of the network consists of a model trained to generate saliency volumes, whose parameters are fit by back-propagation using a binary cross entropy (BCE) loss over downsampled versions of the saliency volumes. Sampling strategies over these volumes are used to generate scanpaths over the 360-degree images. Our experiments show the advantages of using saliency volumes, and how they can be used for related tasks. We also show how a similar architecture achieves state-of-the-art performance for the related task of saliency map prediction. Our source code and trained models are publicly available.
}, url = {https://www.sciencedirect.com/science/article/pii/S0923596518306209}, author = {Assens, Marc and McGuinness, Kevin and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto} } @inbook {bCampos, title = {Sentiment concept embedding for visual affect recognition}, booktitle = {Multimodal Behavior Analysis in the Wild}, year = {2018}, publisher = {Elsevier}, organization = {Elsevier}, edition = {1}, chapter = {16}, abstract = {Automatic sentiment and emotion understanding of general visual content has recently garnered much research attention. However, the large visual variance associated with high-level affective concepts presents a challenge when designing systems with high-performance requirements. One popular approach to bridge the {\textquotedblleft}affective gap{\textquotedblright} between low-level visual features and affective semantics consists of using Adjective Noun Pair (ANP) semantic constructs for concepts, e.g. {\textquotedblleft}beautiful landscape{\textquotedblright} or {\textquotedblleft}scary face{\textquotedblright}, which act as a mid-level representation that can be recognized by visual classifiers while still carrying an affective bias. In this work, we formulate the ANP detection task in images over a continuous space defined over an embedding that captures the inter-concept relationships between ANPs. We show how the compact representations obtained from the embedding extend the discrete concepts in the ontology and can be used for improved visual sentiment and emotion prediction, as well as new applications such as zero-shot ANP detection.
}, url = {https://www.elsevier.com/books/multimodal-behavior-analysis-in-the-wild/alameda-pineda/978-0-12-814601-9}, author = {V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto and Jou, Brendan and Jordi Torres and Chang, Shih-Fu} } @inbook {bCasamitjana18a, title = {Shared Latent Structures Between Imaging Features and Biomarkers in Early Stages of Alzheimer{\textquoteright}s Disease}, booktitle = {PRedictive Intelligence in MEdicine}, volume = {11121}, year = {2018}, pages = {60-67}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, abstract = {In this work, we identify meaningful latent patterns in MR images for patients across the Alzheimer{\textquoteright}s disease (AD) continuum. For this purpose, we apply Projection to Latent Structures (PLS) method using cerebrospinal fluid (CSF) biomarkers (t-tau, p-tau, amyloid-beta) and age as response variables and imaging features as explanatory variables. Freesurfer pipeline is used to compute MRI surface and volumetric features resulting in 68 cortical ROIs and 84 cortical and subcortical ROIs, respectively. The main assumption of this work is that there are two main underlying processes governing brain morphology along the AD continuum: brain aging and dementia. We use two different and orthogonal PLS models to describe each process: PLS-aging and PLS-dementia. To define PLS-aging model we use normal aging subjects and age as predictor and response variables, respectively, while for PLS-dementia we only use demented subjects and biomarkers as response variables.
}, issn = {978-3-030-00320-3}, doi = {10.1007/978-3-030-00320-3}, author = {Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and Paula Petrone and Jose Luis Molinuevo and Juan D. Gispert} } @conference {cCasamitjanac, title = {Shared latent structures between imaging features and biomarkers in early stages of Alzheimer{\textquoteright}s disease}, booktitle = {Workshop on Predictive Intelligence in Medicine (PRIME), MICCAI}, year = {2018}, month = {2018}, address = {Granada, Spain}, abstract = {In this work, we identify meaningful latent patterns in MR images for patients across the Alzheimer{\textquoteright}s disease (AD) continuum. For this purpose, we apply the Projection to Latent Structures (PLS) method using cerebrospinal fluid (CSF) biomarkers (t-tau, p-tau, amyloid-beta) and age as response variables and imaging features as explanatory variables. The FreeSurfer pipeline is used to compute MRI surface and volumetric features, resulting in 68 cortical ROIs and 84 cortical and subcortical ROIs, respectively. The main assumption of this work is that there are two main underlying processes governing brain morphology along the AD continuum: brain aging and dementia. We use two different and orthogonal PLS models to describe each process: PLS-aging and PLS-dementia. To define the PLS-aging model we use normal aging subjects and age as predictor and response variables, respectively, while for PLS-dementia we only use demented subjects and biomarkers as response variables.
}, author = {Adri{\`a} Casamitjana and Ver{\'o}nica Vilaplana and Paula Petrone and Jose Luis Molinuevo and Juan D. Gispert} } @conference {cCampos18, title = {Skip RNN: Learning to Skip State Updates in Recurrent Neural Networks}, booktitle = {International Conference on Learning Representations (ICLR)}, year = {2018}, month = {01/2018}, abstract = {Recurrent Neural Networks (RNNs) continue to show outstanding performance in sequence modeling tasks. However, training RNNs on long sequences often faces challenges like slow inference, vanishing gradients and difficulty in capturing long-term dependencies. In backpropagation through time settings, these issues are tightly coupled with the large, sequential computational graph resulting from unfolding the RNN in time. We introduce the Skip RNN model which extends existing RNN models by learning to skip state updates, shortening the effective size of the computational graph. This model can also be encouraged to perform fewer state updates through a budget constraint. We evaluate the proposed model on various tasks and show how it can reduce the number of required RNN updates while preserving, and sometimes even improving, the performance of the baseline RNN models.
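A minimal sketch of the state-skipping idea: a learned gate decides at each step whether the state is refreshed or copied, so skipped steps cost no cell update. The hard binarization below is shown as a plain threshold; training the real model relies on a straight-through estimator, and all sizes are illustrative:

```python
# Skip-style RNN: a gate chooses between updating and copying the state.
import torch
import torch.nn as nn

class SkipGRU(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.cell = nn.GRUCell(input_size, hidden_size)
        self.gate = nn.Linear(hidden_size, 1)   # predicts update probability

    def forward(self, x):                       # x: (T, B, input_size)
        h = x.new_zeros(x.size(1), self.cell.hidden_size)
        updates = 0.0
        for x_t in x:
            p = torch.sigmoid(self.gate(h))     # update probability from state
            u = (p >= 0.5).float()              # binarized skip decision
            h = u * self.cell(x_t, h) + (1 - u) * h   # update or copy state
            updates += u.mean().item()          # count of executed updates
        return h, updates

h, n_updates = SkipGRU(10, 32)(torch.randn(20, 4, 10))
print(h.shape, n_updates)
```

The budget constraint mentioned in the abstract would enter as a penalty on the accumulated update count during training.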
Image synthesis has been a trending task for the AI community in recent years. Many works have shown the potential of Generative Adversarial Networks (GANs) to deal with tasks such as text or audio to image synthesis. In particular, recent advances in deep learning using audio have inspired many works involving both visual and auditory information. In this work we propose a face synthesis method using audio and/or language representations as inputs. Furthermore, a dataset relating speech utterances with a face and an identity has been built, suitable for other tasks apart from face synthesis, such as speaker recognition or voice conversion.
}, author = {Linardos, Panagiotis and Mohedano, Eva and Chert{\'o}, M{\`o}nica and Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @conference {cDuarte, title = {Towards Speech to Sign Language Translation}, booktitle = {ECCV 2018 Workshop on Shortcomings in Vision and Language}, year = {2018}, month = {08/2018}, abstract = {Sign Language (SL) is the primary means of communication for a majority of the hearing-impaired community. Current computational approaches in this research area have focused specifically on Sign Language Recognition (SLR) and Sign Language Translation (from SL to text) (SLT). However, the reverse problem of translating from spoken language to sign language has so far been unexplored. The goal of our ongoing project is to provide to people with hearing disabilities the audio tracks from online videos, by automatically generating a video-based speech to sign language translation. In this paper, we will point out the shortcomings that limit the advances of this research area and propose first steps towards this end.
}, author = {Amanda Duarte and Camli, Gorkem and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xArenas, title = {Video Understanding through the Disentanglement of Appearance and Motion}, year = {2018}, abstract = {Understanding the inner workings of deep learning algorithms is key to efficiently exploiting the large number of videos that are generated every day. For the self-supervised learning of the spatio-temporal information contained within these videos, there are several types of algorithms based on convolutional neural networks (CNNs) following an auto-encoder style architecture. However, we have verified that this type of model, trained for the frame prediction task, learns this spatio-temporal information jointly, so the model is not able to recognize appearance-motion combinations not seen during training. Our proposed model, called DisNet, can learn appearance and motion separately through disentanglement, so that it solves the generalization and scalability problems. To demonstrate this, we conducted numerous experiments under highly controlled conditions, generating specific datasets that make the conventional model fail for the appearance and motion classification tasks, and analyzing how well our proposal behaves under the same conditions.
\
}, author = {Arenas, Carlos}, editor = {Xavier Gir{\'o}-i-Nieto and V{\'\i}ctor Campos and Palacio, Sebastian} } @conference {cFernandezb, title = {What is going on in the world? A display platform for media understanding}, booktitle = {IEEE Multimedia Information Processing and Retrieval (MIPR) Conference}, year = {2018}, month = {04/2018}, publisher = {IEEE}, organization = {IEEE}, address = {Miami, FL (USA)}, abstract = {News broadcasters and on-line publishers daily generate a large amount of articles and videos describing events currently happening in the world. In this work, we present a system that automatically indexes videos from a library and links them to stories developing in the news. The user interface displays the links between videos and stories in an intuitive manner and allows navigation through related content by using associated tags. This interface is a powerful industrial tool for publishers to index, retrieve and visualize their video content. It helps them identify which topics require more attention or retrieve related content that{\textquoteright}s already been published about the stories.
}, doi = {https://doi.org/10.1109/MIPR.2018.00045}, url = {https://www.youtube.com/watch?v=eaXcB2X-5xY}, author = {Fern{\`a}ndez, D{\`e}lia and David Varas and Bou, Elisenda and Xavier Gir{\'o}-i-Nieto} } @conference {cLopez-Palmaa, title = {Who watches the watchers? Quality control of the human inspection in production lines using Visual Intensity of Attention}, booktitle = {SAAEI 2018}, year = {2018}, month = {07/2018}, address = {Barcelona}, abstract = {On multiple occasions production lines require inspectors, human operators who visualize certain steps of the production and determine the quality of the resulting products. However, inspectors are subject to errors. We propose a method based on computer vision to decide whether the inspector has paid adequate attention at the different inspection points, so that pieces that have not been verified can be marked for rejection or re-inspection. The method uses a top-view ceiling camera that computes the trajectories and areas of vision of the inspector, and determines which products have received the correct amount of attention. The resulting attention figure can be compared with the acceptance range in the inspection protocol to determine whether the inspection is valid.
}, author = {Manuel L{\'o}pez-Palma and Morros, J.R. and Javier Gago and Montserrat Corbal{\'a}n} } @conference {cGorriz, title = {Active Deep Learning for Medical Imaging Segmentation}, booktitle = {Medical Image meets NIPS 2017 Workshop}, year = {2017}, month = {11/2017}, abstract = {We propose a novel Active Learning framework capable of effectively training a convolutional neural network for semantic segmentation of medical imaging with a limited amount of labeled training data. Our contribution is a practical Cost-Effective Active Learning approach that uses Dropout at test time as Monte Carlo sampling to model the pixel-wise uncertainty and to analyze the image information in order to improve the training performance.
Grade: A (9.7/10)
This thesis proposes a novel active learning framework capable of effectively training a convolutional neural network for semantic segmentation of medical imaging with a limited amount of labeled training data. Our approach applies existing active learning techniques to segmentation, which is becoming an important topic today because of the many problems caused by the lack of large amounts of data. We explore different strategies to study the image information and introduce a previously used cost-effective active learning method based on the selection of high-confidence predictions to automatically assign pseudo-labels, with the aim of reducing the manual annotations. First, we built a simple application for handwritten digit classification to get started with the methodology, and then we tested the system with a medical image database for the treatment of melanoma skin cancer. Finally, we compared the traditional training methods with our active learning proposals, specifying the conditions and parameters required for them to be optimal.
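The uncertainty estimation at the core of this approach can be sketched as follows (assuming PyTorch; the function name and the number of stochastic passes are illustrative): dropout is kept active at test time, several forward passes are averaged, and the per-pixel variance serves as the uncertainty that drives sample selection.

    import torch

    def mc_dropout_uncertainty(model, image, n_samples=20):
        model.train()  # keep dropout layers stochastic at inference time
        with torch.no_grad():
            probs = torch.stack([torch.sigmoid(model(image))
                                 for _ in range(n_samples)])
        mean = probs.mean(dim=0)        # averaged segmentation prediction
        uncertainty = probs.var(dim=0)  # pixel-wise uncertainty map
        return mean, uncertainty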
\
Program: Master{\textquoteright}s Degree in Telecommunications Engineering
Grade: A with honours (10.0/10.0)
Image retrieval in realistic scenarios targets large dynamic datasets of unlabeled images. In these cases, training or fine-tuning a model every time new images are added to the database is neither efficient nor scalable. Convolutional neural networks trained for image classification over large datasets have been proven effective feature extractors when transferred to the task of image retrieval. The most successful approaches are based on encoding the activations of convolutional layers, as they convey the image spatial information. Our proposal goes beyond and aims at a local-aware encoding of these features depending on the predicted image semantics, with the advantage of using only the knowledge contained inside the network. In particular, we employ Class Activation Maps (CAMs) to obtain the most discriminative regions from a semantic perspective. Additionally, CAMs are also used to generate object proposals during an unsupervised re-ranking stage after a first fast search. Our experiments on two publicly available datasets for instance retrieval, Oxford5k and Paris6k, demonstrate that our system is competitive and even outperforms the current state of the art when using off-the-shelf models trained on the object classes of ImageNet.
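A minimal sketch of the CAM-based encoding (assuming PyTorch; names and shapes are illustrative, and the exact aggregation used in the work may differ): the class activation map computed from the classification weights re-weights the convolutional activations before they are sum-pooled into a descriptor.

    import torch

    def cam_weighted_descriptor(features, fc_weights, class_idx):
        # features: (C, H, W) activations of the last convolutional layer
        # fc_weights: (num_classes, C) weights of the classification layer
        cam = torch.relu(torch.einsum('c,chw->hw', fc_weights[class_idx], features))
        cam = cam / (cam.max() + 1e-8)  # normalized class activation map
        # spatially weighted sum-pooling of the convolutional features
        return torch.einsum('hw,chw->c', cam, features)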
A fully automatic technique for segmenting the liver and localizing its unhealthy tissues is a convenient tool for diagnosing hepatic diseases and assessing the response to the corresponding treatments. In this work we propose a method to segment the liver and its lesions from Computed Tomography (CT) scans using Convolutional Neural Networks (CNNs), which have achieved good results in a variety of computer vision tasks, including medical imaging. The network that segments the lesions consists of a cascaded architecture, which first focuses on the region of the liver in order to segment the lesions within it. Moreover, we train a detector to localize the lesions, and mask the results of the segmentation network with the positive detections. The segmentation architecture is based on DRIU, a Fully Convolutional Network (FCN) with side outputs that work on feature maps of different resolutions, to finally benefit from the multi-scale information learned by different stages of the network. The main contribution of this work is the use of a detector to localize the lesions, which we show to be beneficial to remove false positives triggered by the segmentation network.
\
Program:\ Master{\textquoteright}s Degree in Telecommunications Engineering
Grade: A with honours (10.0/10.0)
A fully automatic technique for segmenting the liver and localizing its unhealthy tissues is a convenient tool for diagnosing hepatic diseases and also for assessing the response to the corresponding treatments. In this thesis we propose a method to segment the liver and its lesions from Computed Tomography (CT) scans, as well as other anatomical structures and organs of the human body. We have used Convolutional Neural Networks (CNNs), which have achieved good results in a variety of tasks, including medical imaging. The network that segments the lesions consists of a cascaded architecture, which first focuses on the liver region in order to segment the lesion. Moreover, we train a detector to localize the lesions and keep only those pixels from the output of the segmentation network where a lesion is detected. The segmentation architecture is based on DRIU [24], a Fully Convolutional Network (FCN) with side outputs that work on feature maps of different resolutions, to finally benefit from the multi-scale information learned by different stages of the network. Our pipeline is 2.5D, as the input of the network is a stack of consecutive slices of the CT scans. We also study different methods to benefit from the liver segmentation in order to delineate the lesion. The main focus of this work is to use the detector to localize the lesions, as we demonstrate that it helps to remove false positives triggered by the segmentation network. The benefit of using a detector on top of the segmentation is that the detector acquires a more global insight into the healthiness of a liver tissue compared to the segmentation network, whose final output is pixel-wise and is not forced to take a global decision over a whole liver patch. We show experiments with the LiTS dataset for the lesion and liver segmentation. In order to prove the generality of the segmentation network, we also segment several anatomical structures from the Visceral dataset.
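The masking of the segmentation output with positive detections can be sketched as follows (plain numpy; the patch size, score threshold and detection format are assumptions for illustration):

    import numpy as np

    def mask_with_detections(seg_prob, detections, patch=64, thr=0.5):
        # seg_prob: (H, W) pixel-wise lesion probabilities
        # detections: list of ((row, col), score) patch top-left corners
        out = np.zeros_like(seg_prob)
        for (r, c), score in detections:
            if score > thr:  # keep pixels only inside positively detected patches
                out[r:r+patch, c:c+patch] = seg_prob[r:r+patch, c:c+patch]
        return out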
This paper introduces an unsupervised framework to extract semantically rich features for video representation. Inspired by how the human visual system groups objects based on motion cues, we propose a deep convolutional neural network that disentangles motion, foreground and background information. The proposed architecture consists of a 3D convolutional feature encoder for blocks of 16 frames, which is trained for reconstruction tasks over the first and last frames of the sequence. The model is trained with a fraction of videos from the UCF-101 dataset, taking as ground truth the bounding boxes around the activity regions. Qualitative results indicate that the network can successfully update the foreground appearance based on pure-motion features. The benefits of these learned features are shown in a discriminative classification task when compared with a random initialization of the network weights, providing an accuracy gain of over 10\%.
Deep learning algorithms base their success on building high learning capacity models with millions of parameters that are tuned in a data-driven fashion. These models are trained by processing millions of examples, so that the development of more accurate algorithms is usually limited by the throughput of the computing devices on which they are trained. In this work, we explore how the training of a state-of-the-art neural network for computer vision can be parallelized on a distributed GPU cluster. The effect of distributing the training process is addressed from two different points of view. First, the scalability of the task and its performance in the distributed setting are analyzed. Second, the impact of distributed training methods on the final accuracy of the models is studied.
}, keywords = {distributed computing; parallel systems; deep learning; Convolutional Neural Networks}, doi = {https://doi.org/10.1016/j.procs.2017.05.074}, url = {http://www.sciencedirect.com/science/article/pii/S1877050917306129}, author = {V{\'\i}ctor Campos and Sastre, Francesc and Yag{\"u}es, Maurici and M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @inbook {bBellot17, title = {Efficient Combination of Pairwise Feature Networks}, booktitle = {Neural Connectomics Challenge}, year = {2017}, publisher = {Springer International Publishing}, organization = {Springer International Publishing}, chapter = {7}, issn = {978-3-319-53069-7}, doi = {10.1007/978-3-319-53070-3}, author = {P Bellot and Patrick Meyer}, editor = {Battaglia, D and Guyon, I. and Lemaire, V. and Orlandi, J. and Ray, B. and Soriano, J.} } @unpublished {xMohedanoa, title = {Fine-tuning of CNN models for Instance Search with Pseudo-Relevance Feedback}, year = {2017}, publisher = {NIPS 2017 Women in Machine Learning Workshop}, address = {Long Beach, CA, USA}, abstract = {CNN classification models trained on millions of labeled images have been proven to encode {\textquotedblleft}general purpose{\textquotedblright} descriptors in their intermediate layers. These descriptors are useful for a diverse range of computer vision problems~\cite{1}. However, the target task of these models is substantially different to the instance search task. While classification is concerned with distinguishing between different classes, instance search is concerned with identifying concrete instances of a particular class.\
\
In this work we propose an unsupervised approach to fine-tune a model for similarity learning~\cite{2}. For that, we combine two different search engines: one based on off-the-shelf CNN features, and another one on the popular SIFT features. We observe that the information of pre-trained CNN representations and SIFT is in most cases complementary, which allows the generation of high-quality rank lists. The fusion of the two rankings is used to generate training data for a particular dataset. A pseudo-relevance feedback strategy~\cite{3} is used for sampling images from rankings, considering the top images as positive examples of a particular instance and middle-low ranked images as negative examples.
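A sketch of this sampling strategy (plain Python; the fusion by summed rank positions and the cut-off values are assumptions):

    def sample_training_pairs(rank_cnn, rank_sift, top_k=10, neg_range=(100, 200)):
        # fuse the two rankings by summing rank positions (lower is better)
        scores = {}
        for rank in (rank_cnn, rank_sift):
            for pos, img in enumerate(rank):
                scores[img] = scores.get(img, 0) + pos
        fused = sorted(scores, key=scores.get)
        positives = fused[:top_k]                     # top images as positives
        negatives = fused[neg_range[0]:neg_range[1]]  # middle-low ranked as negatives
        return positives, negatives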
}, author = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @article {xCampos, title = {From Pixels to Sentiment: Fine-tuning CNNs for Visual Sentiment Prediction}, journal = {Image and Vision Computing}, year = {2017}, abstract = {Visual multimedia have become an inseparable part of our digital social lives, and they often capture moments tied with deep affections. Automated visual sentiment analysis tools can provide a means of extracting the rich feelings and latent dispositions embedded in these media. In this work, we explore how Convolutional Neural Networks (CNNs), a now de facto computational machine learning tool particularly in the area of Computer Vision, can be specifically applied to the task of visual sentiment prediction. We accomplish this through fine-tuning experiments using a state-of-the-art CNN and via rigorous architecture analysis, we present several modifications that lead to accuracy improvements over prior art on a dataset of images from a popular social media platform. We additionally present visualizations of local patterns that the network learned to associate with image sentiment for insight into how visual positivity (or negativity) is perceived by the model.
}, doi = {http://dx.doi.org/10.1016/j.imavis.2017.01.011}, url = {http://arxiv.org/abs/1604.03489}, author = {V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto} } @inbook {bBellver17, title = {Hierarchical Object Detection with Deep Reinforcement Learning}, booktitle = {Deep Learning for Image Processing Applications}, volume = {31}, year = {2017}, publisher = {IOS Press}, organization = {IOS Press}, address = {Amsterdam, The Netherlands}, abstract = {This work introduces a model for Hierarchical Object Detection with Deep Reinforcement Learning (HOD-DRL). The key idea is to focus on those parts of the image that contain richer information and zoom on them. We train an intelligent agent that, given an image window, is capable of deciding where to focus the attention among five different predefined region candidates (smaller windows). This procedure is iterated, providing a hierarchical image analysis. We compare two different candidate proposal strategies to guide the object search: with and without overlap. Moreover, our work compares two different strategies to extract features from a convolutional neural network for each region proposal: a first one that computes new feature maps for each region proposal, and a second one that computes the feature maps for the whole image to later generate crops for each region proposal. Experiments indicate better results for the overlapping candidate proposal strategy and a loss of performance for the cropped image features due to the loss of spatial resolution. We argue that, while this loss seems unavoidable when working with a large number of object candidates, the much more reduced number of region proposals generated by our reinforcement learning agent makes it feasible to extract features for each location without sharing convolutional computation among regions. Source code and models are available at https://imatge-upc.github.io/detection-2016-nipsws/.
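The five predefined region candidates with overlap can be sketched as follows (plain Python; the overlap factor and window geometry are illustrative of the idea, not the released implementation):

    def candidate_windows(x0, y0, x1, y1, overlap=0.25):
        # four corner sub-windows plus a central one, enlarged to overlap
        w, h = x1 - x0, y1 - y0
        dw, dh = w * (0.5 + overlap / 2), h * (0.5 + overlap / 2)
        return [
            (x0, y0, x0 + dw, y0 + dh),                        # top-left
            (x1 - dw, y0, x1, y0 + dh),                        # top-right
            (x0, y1 - dh, x0 + dw, y1),                        # bottom-left
            (x1 - dw, y1 - dh, x1, y1),                        # bottom-right
            (x0 + w / 4, y0 + h / 4, x1 - w / 4, y1 - h / 4),  # center
        ]

At each step the agent picks one of these windows (or a terminal action) and the procedure recurses on the chosen window.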
}, keywords = {computer Vision, Object detection, reinforcement learning}, issn = {978-1-61499-822-8}, doi = {10.3233/978-1-61499-822-8-164}, url = {http://ebooks.iospress.nl/volumearticle/48029}, author = {M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Jordi Torres} } @article {aFrias-Velazquez, title = {Hierarchical stack filtering: a bitplane-based algorithm for massively parallel processors}, journal = {Journal of Real-Time Image Processing}, year = {2017}, month = {03/2017}, abstract = {Full version available at http://rdcu.be/p6w1
With the development of novel parallel architectures for image processing, the implementation of well-known image operators needs to be reformulated to take advantage of the so-called massive parallelism. In this work, we propose a general algorithm that implements a large class of nonlinear filters, called stack filters, with a 2D-array processor. The proposed method consists of decomposing an image into bitplanes with the bitwise decomposition, and then processing every bitplane hierarchically. The filtered image is reconstructed by simply stacking the filtered bitplanes according to their order of significance. Owing to its hierarchical structure, our algorithm allows us to trade off between image quality and processing time, and to significantly reduce the computation time of low-entropy images. Also, experimental tests show that the processing time of our method is substantially lower than that of classical methods when using large structuring elements. All these features are of interest to a variety of real-time applications based on morphological operations, such as video segmentation and video enhancement.
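A compact sketch of the decompose/filter/stack scheme (numpy and scipy; the median filter stands in here for a generic positive Boolean stack filter):

    import numpy as np
    from scipy.ndimage import median_filter

    def bitplane_stack_filter(img, size=3):
        # img: 2D uint8 image
        out = np.zeros_like(img)
        for b in range(8):
            plane = (img >> b) & 1                      # bitwise decomposition
            filtered = median_filter(plane, size=size)  # filter each bitplane
            out |= filtered.astype(np.uint8) << b       # stack by significance
        return out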
}, keywords = {Array processors, Bitwise decomposition, Morphological operators, Smart camera, Stack filters}, url = {http://rdcu.be/p6w1}, author = {Frias-Velazquez, A. and Morros, J.R. and Garc{\'\i}a, M. and Philips, Wilfried} } @conference {cRomero-Lopeza, title = {The Impact of Segmentation on the Accuracy and Sensitivity of a Melanoma Classifier based on Skin Lesion Images}, booktitle = {Annual Meeting of the Society of Imaging Informatics in Medicine (SIIM)}, year = {2017}, month = {06/2017}, publisher = {Society of Imaging Informatics for Medicine}, organization = {Society of Imaging Informatics for Medicine}, address = {Pittsburgh, PA, USA}, abstract = {The accuracy and sensitivity of a Deep Learning based approach for a 2-class classifier for early melanoma detection based on skin lesion dermoscopic images increase when the classifier is trained with segmented inputs (i.e., images containing only the lesions as binary masks, without the surrounding context) instead of entire images.
Advisors: Eva Mohedano, Kevin McGuinness and Xavier Giro-i-Nieto
Program: Master{\textquoteright}s degree in Telecommunications Engineering (MET)
Grade: A (9.7/10.0)
This thesis introduces an architecture to improve the accuracy of a Convolutional Neural Network (CNN) trained for image classification by exploiting visual saliency predictions from the original images. The base network follows the AlexNet architecture and was trained using 1.2 million images from the ImageNet dataset. Two methodologies were explored to exploit the information from the visual saliency predictions. The first applies the saliency maps directly to existing layers of the CNN, which in some cases were already trained for classification and in others were initialized with random weights. The second merges the information from the saliency maps through a new branch, trained at the same time as the initial CNN. To speed up training, the experiments were run on images reduced to 128x128; at this size the proposed model achieves a 12.39\% increase in Top-1 accuracy with respect to the original CNN, while also reducing the number of parameters compared to AlexNet. For the original 227x227 images, a model that increases Top-1 accuracy by 1.72\% is proposed.
Report broadcast on the Telenot{\'\i}cies Vespre news program of Televisi{\'o} de Catalunya on Sunday, 26 November 2017.
Artificial intelligence programs are capable of creating increasingly realistic images and voices, and open the door to generating lies in a more automated way.
}, keywords = {deep learning, fake news, gan}, url = {http://www.ccma.cat/324/la-meitat-de-les-noticies-que-consumirem-el-2022-seran-falses/noticia/2823178/}, author = {Xavier Gir{\'o}-i-Nieto and Pascual-deLaPuente, Santiago and Mir{\'o}, Vict{\`o}ria and Esteve, Oriol} } @mastersthesis {xCampos17, title = {Learning to Skip State Updates in Recurrent Neural Networks}, year = {2017}, abstract = {Program:\ Master{\textquoteright}s Degree in Telecommunications Engineering
Grade: A with honours (10.0/10.0)
Recurrent Neural Networks (RNNs) continue to show outstanding performance in sequence modeling tasks. However, training RNNs on long sequences often faces challenges such as slow inference, vanishing gradients and difficulty in capturing long-term dependencies. In backpropagation through time settings, these issues are tightly coupled with the large, sequential computational graph resulting from unfolding the RNN in time. We introduce the Skip RNN model, which extends existing RNN models by learning to skip state updates, thus shortening the effective size of the computational graph. This network can be encouraged to perform fewer state updates through a novel loss term. We evaluate the proposed model on various tasks and show how it can reduce the number of required RNN updates while preserving, and sometimes even improving, the performance of the baseline models.
\
The organisation of personal data is receiving increasing research attention due to the challenges we face in gathering, enriching, searching, and visualising such data. Given the increasing ease with which personal data is being gathered by individuals, the concept of a lifelog digital library of rich multimedia and sensory content for every individual is fast becoming a reality. The LTA~2017 workshop aims to bring together academics and practitioners to discuss approaches to lifelog data analytics and applications, and to debate the opportunities and challenges for researchers in this new and challenging area.
}, doi = {10.1145/3123266.3132050}, author = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto and Radeva, Petia and Dimiccoli, M. and Dang-Nguyen, Duc-Tien and Joho, H.} } @conference {cPetrone17, title = {Magnetic Resonance Imaging as a valuable tool for Alzheimer{\textquoteright}s disease screening}, booktitle = {Alzheimer{\textquoteright}s Association International Conference, London, 2017}, year = {2017}, month = {07/2017}, author = {P. Petrone and Ver{\'o}nica Vilaplana and Adri{\`a} Casamitjana and A. Tucholka and C. Falcon and R. Cacciaglia and G. Operto and S. Skouras and J.L. Molinuevo and J.D. Gispert} } @article {aPetrone17, title = {Magnetic Resonance Imaging as a valuable tool for Alzheimer{\textquoteright}s disease screening}, journal = {Alzheimer{\textquoteright}s \& Dementia: The Journal of the Alzheimer{\textquoteright}s Association}, volume = {13}, year = {2017}, month = {07/2017}, pages = {P1245}, doi = {10.1016/j.jalz.2017.07.457}, url = {https://doi.org/10.1016/j.jalz.2017.07.457}, author = {P. Petrone and Ver{\'o}nica Vilaplana and Adri{\`a} Casamitjana and D. Sanchez-Escobedo and A. Tucholka and R. Cacciaglia and G. Operto and S. Skouras and C. Falcon and J.L. Molinuevo and J.D. Gispert} } @conference {cFernandez, title = {More cat than cute? Interpretable Prediction of Adjective-Noun Pairs}, booktitle = {ACM Multimedia 2017 Workshop on Multimodal Understanding of Social, Affective and Subjective Attributes}, year = {2017}, month = {10/2017}, publisher = {ACM SIGMM}, organization = {ACM SIGMM}, address = {Mountain View, CA (USA)}, abstract = {The increasing availability of affect-rich multimedia resources has bolstered interest in understanding sentiment and emotions in and from visual content. Adjective-noun pairs (ANP) are a popular mid-level semantic construct for capturing affect via visually detectable concepts such as {\textquoteleft}{\textquoteleft}cute dog" or {\textquoteleft}{\textquoteleft}beautiful landscape". Current state-of-the-art methods approach ANP prediction by considering each of these compound concepts as individual tokens, ignoring the underlying relationships in ANPs. This work aims at disentangling the contributions of the {\textquoteleft}adjectives{\textquoteright} and {\textquoteleft}nouns{\textquoteright} in the visual prediction of ANPs. Two specialised classifiers, one trained for detecting adjectives and another for nouns, are fused to predict 553 different ANPs. The resulting ANP prediction model is more interpretable as it allows us to study contributions of the adjective and noun components.
Student: Michele Compri
Advisors: Beg{\"u}m Demir (University of Trento) and Xavier Giro-i-Nieto (UPC)
Recent advances in satellite technology have led to an increased volume of remote sensing (RS) image archives, from which retrieving useful information is challenging. Thus, one important research area in remote sensing is the content-based retrieval of RS images (CBIR). The performance of CBIR systems depends on the capability of the RS image features to model the content of the images, as well as on the considered retrieval algorithm that assesses the similarity among the features. Existing CBIR systems in the RS literature assume that each image is categorized by only a single label in terms of a land-cover class that is associated with the most significant content of the image. However, RS images usually have complex content, i.e., there are usually several regions within each image related to multiple land-cover classes. Thus, available CBIR systems are not capable of accurately characterizing and exploiting the high-level semantic content of RS images for retrieval problems.
To overcome these problems and to effectively characterize the high-level semantic content of RS images, we investigate the effectiveness of different deep learning architectures in the framework of multi-label remote sensing image retrieval problems. This is achieved with a two-step strategy. In the first step, a Convolutional Neural Network (CNN) pre-trained for image classification with the ImageNet dataset is used off-the-shelf as a feature extractor. In particular, three popular architectures are explored: 1) VGG16; 2) Inception V3; and 3) ResNet50. VGG16 is a CNN characterized by 16 convolutional layers of stacked 3x3 filters, with intermediate max pooling layers and 3 fully connected layers at the end. Inception V3 is an improved version of the former GoogleNet, which contains more layers but fewer parameters, by removing fully connected layers and using a global average pooling from the last convolutional layer. ResNet50 is even deeper thanks to the introduction of residual layers, which allow data to flow by skipping the convolutional blocks. In the second step of our research, we modify these three off-the-shelf models by fine-tuning their parameters with a subset of RS images and their multi-label information. Experiments carried out on an archive of aerial images show that fine-tuning CNN architectures with images annotated with multi-labels significantly improves the retrieval accuracy with respect to the standard CBIR methods. We find that fine-tuning with a multi-class approach achieves better results than considering each label as an independent class. Due to the space constraints, the detailed information on each step of the proposed method will be given in the full version of the paper.
Image representations extracted from convolutional neural networks (CNNs) outdo hand-crafted features in several computer vision tasks, such as visual image retrieval. This chapter recommends a simple pipeline for encoding the local activations of a convolutional layer of a pretrained CNN utilizing the well-known Bag of Words (BoW) aggregation scheme, called the bag of local convolutional features (BLCF). Matching each local array of activations in a convolutional layer to a visual word results in an assignment map, which is a compact representation relating regions of an image with a visual word. We use the assignment map for fast spatial reranking, finding object localizations that are used for query expansion. We show the suitability of the BoW representation based on local CNN features for image retrieval, attaining state-of-the-art performance on the Oxford and Paris buildings benchmarks. We demonstrate that the BLCF system outperforms the latest procedures using sum pooling for a subgroup of the challenging TRECVid INS benchmark according to the mean Average Precision (mAP) metric.
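A minimal sketch of the BoW encoding and the assignment map (assuming a scikit-learn k-means model as the visual vocabulary, fitted beforehand on local features from a training set; shapes and names are illustrative):

    import numpy as np
    from sklearn.cluster import MiniBatchKMeans

    def blcf_encode(conv_features, kmeans):
        # conv_features: (H, W, C) activations of one convolutional layer
        h, w, c = conv_features.shape
        words = kmeans.predict(conv_features.reshape(-1, c))
        assignment_map = words.reshape(h, w)  # one visual word per location
        bow = np.bincount(words, minlength=kmeans.n_clusters).astype(np.float32)
        return bow / (np.linalg.norm(bow) + 1e-8), assignment_map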
}, issn = {978-1-61499-822-8 }, doi = {10.3233/978-1-61499-822-8-137}, url = {http://ebooks.iospress.nl/volumearticle/48028}, author = {Mohedano, Eva and Amaia Salvador and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Marqu{\'e}s, F.} } @mastersthesis {xBozal, title = {Personalized Image Classification from EEG Signals using Deep Learning}, year = {2017}, abstract = {This thesis explores the semantic classification of images based on the processing of electroencephalogram (EEG) signals generated by the viewer{\textquoteright}s brain. The work extends an existing solution by exploring the gains obtained when the parameters of the classifier are adapted to the user. Firstly, we developed a universal end-to-end model based on deep learning that extracts features from the raw EEG signals and predicts the semantic content of the image among 40 possible classes from the ImageNet dataset. Our main contribution aims at adapting this universal model to new users, in order to build a personalized model based on minimal feedback from the new user. We explored different deep learning architectures and hyperparameters to obtain a better accuracy than the baseline by Spampinato et al. (CVPR 2017). We achieve accuracies of 89.03\% and 90.34\% for the universal and personalized models, respectively.
This thesis explores different approaches using deep learning techniques to predict emotions in videos. Working with videos implies a huge amount of data, including visual frames and acoustic samples. The first step of the project is to extract features to represent the videos in small sets of arrays. This procedure is done using pre-trained models based on Convolutional Networks, the state of the art in visual recognition. Firstly, visual features are extracted using 3D convolutions and acoustic features are extracted using VGG19, a pre-trained convolutional model for images fine-tuned to accept the audio inputs. Later, these features are fed into a Recurrent model capable of exploiting the temporal information. Emotions are measured in terms of valence and arousal, with values in [-1, 1]. Additionally, the same techniques are also used to attempt to predict fear scenes. In consequence, this thesis deals with both regression and classification problems. Several architectures and different parameters have been tested in order to achieve the best performance. Finally, the results will be published in the MediaEval 2017 Challenge and compared to the state-of-the-art solutions.
This thesis explores the application of a deep learning approach for the prediction of media interestingness. Two different models are investigated, one for the prediction of image interestingness and one for the prediction of video interestingness. For the prediction of image interestingness, the ResNet50 network is fine-tuned to obtain the best results. First, some layers are added. Next, the model is trained and fine-tuned using data augmentation, dropout, class weights, and changes to other hyperparameters. For the prediction of video interestingness, first, features are extracted with a 3D convolutional network. Next, an LSTM network is trained and fine-tuned with the features. The final result is a binary label for each image/video: 1 for interesting, 0 for not interesting. Additionally, a confidence value is provided for each prediction. Finally, the Mean Average Precision (MAP) is employed as the evaluation metric to estimate the quality of the final results.
We present a recurrent model for end-to-end instance-aware semantic segmentation that is able to sequentially generate pairs of masks and class predictions. Our proposed system is trainable end-to-end for instance segmentation, does not require further post-processing steps on its output and is conceptually simpler than current methods relying on object proposals. While recent works have proposed recurrent architectures for instance segmentation, these are trained and evaluated for a single category.
Our model is composed of a series of Convolutional LSTMs that are applied in chain with upsampling layers in between to predict a sequence of binary masks and associated class probabilities. Skip connections are incorporated in our model by concatenating the output of the corresponding convolutional layer in the base model with the upsampled output of the ConvLSTM. Binary masks are finally obtained with a 1x1 convolution with sigmoid activation. We concatenate the side outputs of all ConvLSTM layers and apply a per-channel max-pooling operation followed by a single fully-connected layer with softmax activation to obtain the category for each predicted mask.
We train and evaluate our models with the Pascal VOC 2012 dataset. Future work will aim at analyzing and understanding the behavior of the network on other datasets, comparing the system with state-of-the-art solutions, and studying the relationship between the learned object discovery patterns of our model and those of humans.
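The mask and class heads described above can be sketched as follows (assuming PyTorch; channel counts are illustrative and the ConvLSTM stack is omitted):

    import torch.nn as nn

    class MaskAndClassHead(nn.Module):
        def __init__(self, in_channels=64, num_classes=21):
            super().__init__()
            # 1x1 convolution with sigmoid produces each binary mask
            self.mask = nn.Sequential(nn.Conv2d(in_channels, 1, 1), nn.Sigmoid())
            self.cls = nn.Linear(in_channels, num_classes)

        def forward(self, feats):
            # feats: (B, C, H, W) concatenated ConvLSTM side outputs
            mask = self.mask(feats)
            pooled = feats.amax(dim=(2, 3))  # per-channel max-pooling
            return mask, self.cls(pooled).softmax(dim=-1)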
}, author = {Amaia Salvador and Baradad, Manel and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cPana, title = {SalGAN: Visual Saliency Prediction with Generative Adversarial Networks}, booktitle = {CVPR 2017 Scene Understanding Workshop (SUNw)}, year = {2017}, address = {Honolulu, Hawaii, USA}, abstract = {We introduce SalGAN, a deep convolutional neural network for visual saliency prediction trained with adversarial examples. The first stage of the network consists of a generator model whose weights are learned by back-propagation computed from a binary cross entropy (BCE) loss over downsampled versions of the saliency maps. The resulting prediction is processed by a discriminator network trained to solve a binary classification task between the saliency maps generated by the generative stage and the ground truth ones. Our experiments show how adversarial training allows reaching state-of-the-art performance across different metrics when combined with a widely-used loss function like BCE.
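The generator's content loss can be sketched as follows (assuming PyTorch; the downsampling factor is an assumption):

    import torch.nn.functional as F

    def downsampled_bce(pred, gt, scale=0.25):
        # pred, gt: (B, 1, H, W) saliency maps with values in [0, 1]
        pred_s = F.interpolate(pred, scale_factor=scale, mode='bilinear',
                               align_corners=False)
        gt_s = F.interpolate(gt, scale_factor=scale, mode='bilinear',
                             align_corners=False)
        return F.binary_cross_entropy(pred_s, gt_s)

During adversarial training, this content loss is combined with the discriminator's classification loss.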
}, url = {https://arxiv.org/abs/1701.01081}, author = {Pan, Junting and Cristian Canton-Ferrer and McGuinness, Kevin and O{\textquoteright}Connor, N. and Jordi Torres and Elisa Sayrol and Xavier Gir{\'o}-i-Nieto} } @conference {cAssens, title = {SaltiNet: Scan-path Prediction on 360 Degree Images using Saliency Volumes}, booktitle = {ICCV Workshop on Egocentric Perception, Interaction and Computing}, year = {2017}, month = {07/2017}, publisher = {IEEE}, organization = {IEEE}, address = {Venice, Italy}, abstract = {We introduce SaltiNet, a deep neural network for scanpath prediction trained on 360-degree images. The first part of the network consists of a model trained to generate saliency volumes, whose parameters are learned by back-propagation computed from a binary cross entropy (BCE) loss over downsampled versions of the saliency volumes. Sampling strategies over these volumes are used to generate scanpaths over the 360-degree images. Our experiments show the advantages of using saliency volumes, and how they can be used for related tasks.
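One possible sampling strategy over a saliency volume (plain numpy; the (T, H, W) volume layout and the one-fixation-per-temporal-slice rule are assumptions for illustration):

    import numpy as np

    def sample_scanpath(saliency_volume, seed=0):
        rng = np.random.default_rng(seed)
        t, h, w = saliency_volume.shape
        path = []
        for frame in saliency_volume:      # one fixation per temporal slice
            p = frame.ravel() / frame.sum()
            idx = rng.choice(h * w, p=p)   # sample a location by saliency mass
            path.append(divmod(idx, w))    # (row, col) fixation point
        return path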
Winner of three awards at the Salient 360 Challenge at IEEE ICME 2017 (Hong Kong): Best Scan Path, Best Student Scan-path and Audience Award.
\
Deep neural networks have gained popularity in recent years, obtaining outstanding results in a wide range of applications such as computer vision, both in academia and in multiple industry areas. The progress made in recent years cannot be understood without taking into account the technological advancements seen in key domains such as High Performance Computing, more specifically in the Graphics Processing Unit (GPU) domain. These kinds of deep neural networks need massive amounts of data to effectively train the millions of parameters they contain, and this training can take up to days or weeks depending on the computer hardware we are using. In this work, we present how the training of a deep neural network can be parallelized on a distributed GPU cluster. The effect of distributing the training process is addressed from two different points of view. First, the scalability of the task and its performance in the distributed setting are analyzed. Second, the impact of distributed training methods on the training times and final accuracy of the models is studied. We used TensorFlow on top of the GPU cluster of servers with 2 K80 GPU cards, at the Barcelona Supercomputing Center (BSC). The results show an improvement for both focused areas. On one hand, the experiments show promising results towards training a neural network faster: the training time is decreased from 106 hours to 16 hours in our experiments. On the other hand, we can observe how increasing the number of GPUs in one node raises the throughput (images per second) in a near-linear way. Moreover, an additional distributed speedup of 10.3 is achieved with 16 nodes, taking as baseline the speedup of one node.
}, url = {http://easychair.org/smart-program/CCGRID2017/2017-05-15.html$\#$session:13550}, author = {Jordi Torres and Sastre, Francesc and Yag{\"u}es, Maurici and V{\'\i}ctor Campos and Xavier Gir{\'o}-i-Nieto} } @conference {xLidona, title = {Semantic Summarization of Egocentric Photo Stream Events}, booktitle = {ACM Multimedia 2017 Workshop on Lifelogging Tools and Applications}, year = {2017}, month = {10/2017}, publisher = {ACM}, organization = {ACM}, address = {Mountain View, CA, USA}, abstract = {With the rapid increase of users of wearable cameras in recent years and of the amount of data they produce, there is a strong need for automatic retrieval and summarization techniques. This work addresses the problem of automatically summarizing egocentric photo streams captured through a wearable camera by taking an image retrieval perspective. After removing non-informative images by a new CNN-based filter, \ images are ranked by relevance to ensure semantic diversity and \ finally re-ranked by a novelty criterion to reduce redundancy. \ To assess the results, a new evaluation metric is proposed which takes into account the non-uniqueness of the solution. Experimental results applied on a database of 7,110 images from 6 different subjects and evaluated by experts gave 95.74\% of experts satisfaction and a Mean Opinion Score of 4.57 out of 5.0.
The recent emergence of deep learning methods for medical image analysis has enabled the development of intelligent medical imaging-based diagnosis systems that can assist the human expert in making better decisions about a patient{\textquoteright}s health. In this paper we focus on the problem of skin lesion classification, particularly early melanoma detection, and present a deep-learning based approach to solve the problem of classifying a dermoscopic image containing a skin lesion as malignant or benign. \ The proposed solution is built around the VGGNet convolutional neural network architecture and uses the transfer learning paradigm. Experimental results are encouraging: on the ISIC Archive dataset, the proposed method achieves a sensitivity value of 78.66\%, which is significantly higher than the current state of the art on that dataset.
}, keywords = {Convolutional Neural Networks, deep learning, machine learning, Medical Decision Support Systems, Medical Image Analysis, Skin Lesions}, url = {http://upcommons.upc.edu/handle/2117/103386}, author = {Romero-Lopez, Adria and Burdick, Jack and Xavier Gir{\'o}-i-Nieto and Marques, Oge} } @mastersthesis {xRomero-Lopez, title = {Skin Lesion Detection from Dermoscopic Images using Convolutional Neural Networks}, year = {2017}, abstract = {Advisors: Oge Marques (Florida Atlantic University) and Xavier Giro-i-Nieto (UPC)
The recent emergence of machine learning and deep learning methods for medical image analysis has enabled the development of intelligent medical imaging-based diagnosis systems that can assist physicians in making better decisions about a patient{\textquoteright}s health. In particular, skin imaging is a field where these new methods can be applied with a high rate of success.\
This thesis focuses on the problem of automatic skin lesion detection, \ particularly on melanoma detection, by applying semantic segmentation and classification from dermoscopic images using a deep learning based approach.\ For the first problem, a U-Net convolutional neural network architecture is applied for an accurate extraction of the lesion region.\ For the second problem, the current model performs a binary classification (benign versus malignant) that can be used for early melanoma detection. The model is general enough to be extended to multi-class skin lesion classification. The proposed solution is built around the VGG-Net ConvNet architecture and uses the transfer learning paradigm.\ Finally, this work performs a comparative evaluation of classification \ alone (using the entire image) against a combination of the two approaches (segmentation followed by classification) in order to assess which of them achieves better classification results.
\
Recurrent Neural Networks (RNNs) continue to show outstanding performance in sequence modeling tasks. However, training RNNs on long sequences often faces challenges such as slow inference, vanishing gradients and difficulty in capturing long-term dependencies. In backpropagation through time settings, these issues are tightly coupled with the large, sequential computational graph resulting from unfolding the RNN in time. We introduce the Skip RNN model, which extends existing RNN models by learning to skip state updates, thus shortening the effective size of the computational graph. This model can also be encouraged to perform fewer state updates through a budget constraint. We evaluate the proposed model on various tasks and show how it can reduce the number of required RNN updates while preserving, and sometimes even improving, the performance of the baseline RNN models.
\
Program: Bachelor Degree on Telecommunications Science and Technologies (CITTEL)
Grade: A with honours (10.0/10.0)
This thesis explores methodologies for scanpath prediction on images using deep learning frameworks. As a preliminary step, we analyze the characteristics of the data provided by different datasets. We then explore the use of Convolutional Neural Networks (CNN) and Long Short-Term Memory (LSTM) networks for scanpath prediction. We observe that these models fail due to the highly stochastic nature of the data. With the gained insight, we propose a novel time-aware visual saliency representation named Saliency Volume, which averages scanpaths over multiple observers. Next, we explore the SalNet network and adapt it for saliency volume prediction, and we find several ways of generating scanpaths from saliency volumes. Finally, we fine-tuned our model for scanpath prediction on 360-degree images and successfully submitted it to the Salient360! Challenge from ICME. The source code and models are publicly available at https://github.com/massens/saliency-360salient-2017.
The increasing amount of videos online brings several opportunities for training self-supervised neural networks. In this work, we explore cross-modal embeddings between audio and vision by exploiting their alignment on YouTube videos.
\
Joint audio-visual embeddings allow creating links between audio and visual documents by projecting them to a common region of the feature space. They can be applied to enriching radio broadcasts with images, finding soundtracks for user-generated videos or simply enriching a topic search with both audio and video documents.
\
The idea of creating a joint embedding space across modalities has been exploited in other areas [3, 4]. However, joint representations between video frames and their audio have yet to be fully exploited. A similar approach to the proposed one is [2], where a soundtrack was retrieved to match a music video. However, that work did not target a synchronization between both modalities.
\
We aim at training a temporal-aware embedding which can align both audio and visual tracks. We use the visual and audio features provided in the YouTube-8M dataset [1]. The dataset includes features at both the clip and frame (temporal window) level. We train embeddings for both scales and assess their quality in a retrieval problem, formulated as using the feature extracted from one modality to retrieve the most similar videos based on the features computed in the other modality.
\
We aim not only at finding related documents, but also at synchronizing both sequences. The alignment between the two sequences will rely on computing temporal-aware features with recurrent neural networks at different scales. At retrieval time, different scales will be assessed and results evaluated both with ranking metrics and Amazon Mechanical Turk.
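A minimal sketch of the retrieval formulation (assuming PyTorch; the joint dimension and single-layer projections are assumptions, while the visual/audio feature sizes follow the YouTube-8M release):

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class JointEmbedding(nn.Module):
        def __init__(self, dim_visual=1024, dim_audio=128, dim_joint=256):
            super().__init__()
            self.vis = nn.Linear(dim_visual, dim_joint)
            self.aud = nn.Linear(dim_audio, dim_joint)

        def forward(self, v, a):
            # project both modalities into a shared L2-normalized space
            return (F.normalize(self.vis(v), dim=-1),
                    F.normalize(self.aud(a), dim=-1))

    def retrieve(query_audio_emb, visual_bank):
        # rank videos by cosine similarity to an audio query
        return (visual_bank @ query_audio_emb).argsort(descending=True)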
\
References
\
[1] Sami Abu-El-Haija, Nisarg Kothari, Joonseok Lee, Paul Natsev, George Toderici, Balakrishnan Varadarajan, and Sudheendra Vijayanarasimhan. Youtube-8m: A large-scale video classification benchmark. arXiv preprint arXiv:1609.08675, 2016.
\
[2] Sungeun Hong, Woobin Im, and Hyun S Yang. Deep learning for content-based, cross-modal retrieval of videos and music. arXiv preprint arXiv:1704.06761, 2017.
\
[3] Amaia Salvador, Nicholas Hynes, Yusuf Aytar, Javier Marin, Ferda Ofli, Ingmar Weber, and Antonio Torralba. Learning cross-modal embeddings for cooking recipes and food images. In CVPR, 2017.
\
[4] Liwei Wang, Yin Li, and Svetlana Lazebnik. Learning deep structure-preserving image-text embeddings. In CVPR, 2016.
\
}, author = {Amanda Duarte and Sur{\'\i}s, D{\'\i}dac and Amaia Salvador and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cLe, title = {Towards large scale multimedia indexing: A case study on person discovery in broadcast news}, booktitle = {International Workshop on Content-Based Multimedia Indexing - CBMI 2017}, year = {2017}, month = {06/2017}, address = {Firenze, Italy}, abstract = {The rapid growth of multimedia databases and the human interest in their peers make indices representing the location and identity of people in audio-visual documents essential for searching archives. Person discovery in the absence of prior identity knowledge requires accurate association of audio-visual cues and detected names. To this end, we present 3 different strategies to approach this problem: clustering-based naming, verification-based naming, and graph-based naming. Each of these strategies utilizes different recent advances in unsupervised face / speech representation, verification, and optimization. To have a better understanding of the approaches, this paper also provides a quantitative and qualitative comparative study of these approaches using the associated corpus of the Person Discovery challenge at MediaEval 2016. From the results of our experiments, we can observe the pros and cons of each approach, thus paving the way for future promising research directions.
}, author = {Nam Le and Herv{\'e} Bredin and Gabriel Sargent and Miquel India and Paula Lopez-Otero and Claude Barras and Camille Guinaudeau and Guillaume Gravier and Gabriel Barbosa da Fonseca and Izabela Lyon Freire and Zenilton Patroc{\'\i}nio Jr. and Silvio Jamil F. Guimaraes and Gerard Mart{\'\i} and Morros, J.R. and Javier Hernando and Laura Docio-Fernandez and Carmen Garcia-Mateo and Sylvain Meignier and Jean-Marc Odobez} } @mastersthesis {xRoldan, title = {Visual Question Answering 2.0}, year = {2017}, abstract = {This bachelor{\textquoteright}s thesis explores different deep learning techniques to solve the Visual Question-Answering (VQA) task, whose aim is to answer questions about images. We study different Convolutional Neural Networks (CNN) to extract the visual representation from images: Kernelized-CNN (KCNN), VGG-16 and Residual Networks (ResNet). We also analyze the impact of using pre-computed word embeddings trained on large datasets (GloVe embeddings). Moreover, we examine different techniques of joining representations from different modalities. This work was submitted to the second edition of the Visual Question Answering Challenge, and obtained an accuracy of 43.48\%.
The popularization of multimedia content on the Web has raised the need to automatically understand, index and retrieve it. In this paper we present ViTS, an automatic Video Tagging System which learns from videos, their web context and comments shared on social networks. ViTS analyses massive multimedia collections by Internet crawling, and maintains a knowledge base that updates in real time with no need of human supervision. As a result, each video is indexed with a rich set of labels and linked with other related contents. ViTS is an industrial product under exploitation with a vocabulary of over 2.5M concepts, capable of indexing more than 150k videos per month. We compare the quality and completeness of our tags with respect to the ones in the YouTube-8M dataset, and we show how ViTS enhances the semantic annotation of the videos with a larger number of labels (10.04 tags/video), with an accuracy of 80.87\%.
\
There has been a growing interest in applying human computation -- \ particularly crowdsourcing techniques -- to assist in the solution of multimedia, image processing, and computer vision problems which are still too difficult to solve using fully automatic algorithms, and yet relatively easy for humans.
In this paper we focus on a specific problem -- object segmentation within color images -- and compare different solutions which combine color image segmentation algorithms with human efforts, either in the form of an explicit interactive segmentation task or through an implicit collection of valuable human traces with a game.\ We use Click{\textquoteright}n{\textquoteright}Cut, a friendly, web-based, interactive segmentation tool that allows segmentation tasks to be assigned to many users, and Ask{\textquoteright}nSeek, a game with a purpose designed for object detection and segmentation.\
The two main contributions of this paper are: (i) We use the results of Click{\textquoteright}n{\textquoteright}Cut campaigns with different groups of users to examine and quantify the crowdsourcing loss incurred when an interactive segmentation task is assigned to paid crowd-workers, comparing their results to the ones obtained when computer vision experts are asked to perform the same tasks. (ii) Since interactive segmentation tasks are inherently tedious and prone to fatigue, we\ compare the quality \ of the results obtained with Click{\textquoteright}n{\textquoteright}Cut with the ones obtained using a (fun, interactive, and potentially less tedious) game designed for the same purpose. We call this contribution the assessment of the gamification loss, since it refers to how much quality of segmentation results may be lost when we switch to a game-based approach to the same task.\
We demonstrate that the crowdsourcing loss is significant when using all the data points from workers, but decreases substantially (and becomes comparable to the quality of expert users performing similar tasks) after performing a modest amount of data analysis and filtering out of users whose data are clearly not useful. We also show that -- on the other hand -- the gamification loss is significantly more severe: the quality of the results drops roughly by half when switching from a focused (yet tedious) task to a more fun and relaxed game environment.\
}, keywords = {Crowdsourcing, GWAP, Object detection, Object segmentation, Serious games}, issn = {1573-7721}, doi = {10.1007/s11042-015-2897-6}, url = {http://dx.doi.org/10.1007/s11042-015-2897-6}, author = {Carlier, Axel and Amaia Salvador and Cabezas, Ferran and Xavier Gir{\'o}-i-Nieto and Charvillat, Vincent and Marques, Oge} } @conference {cMohedanoa, title = {Bags of Local Convolutional Features for Scalable Instance Search}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)}, year = {2016}, month = {06/2016}, publisher = {ACM}, organization = {ACM}, address = {New York City, NY; USA}, abstract = {Image representations extracted from convolutional neural networks (CNNs) have been shown to outperform hand-crafted features in multiple computer vision tasks, such as visual image retrieval. This work proposes a simple pipeline for encoding the local activations of a convolutional layer of a pre-trained CNN using the well-known bag of words aggregation scheme (BoW). Assigning each local array of activations in a convolutional layer to a visual word produces an \textit{assignment map}, a compact representation that relates regions of an image with a visual word. We use the assignment map for fast spatial reranking, obtaining object localizations that are used for query expansion. We demonstrate the suitability of the Bag of Words representation based on local CNN features for image retrieval, achieving state-of-the-art performance on the Oxford and Paris buildings benchmarks. We show that our proposed system for CNN feature aggregation with BoW outperforms state-of-the-art techniques using sum pooling at a subset of the challenging TRECVid INS benchmark.
Best poster award at ACM ICMR 2016
Overall acceptance rate in ICMR 2016: 30\%\
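As a rough, hedged sketch of the encoding described in the abstract above: each spatial position of a convolutional feature map is hard-assigned to a visual word from a k-means codebook, yielding both the assignment map and an L2-normalised BoW histogram. The library calls are standard scikit-learn/NumPy; names, sizes and the codebook-fitting step are illustrative assumptions, not the exact setup of the paper.

import numpy as np
from sklearn.cluster import KMeans

def fit_codebook(local_descriptors, num_words=1000):
    # local_descriptors: (N, D) conv activations sampled from many images.
    return KMeans(n_clusters=num_words, n_init=4).fit(local_descriptors)

def bow_encode(feature_map, codebook):
    # feature_map: (H, W, D) activations of one convolutional layer.
    h, w, d = feature_map.shape
    descriptors = feature_map.reshape(-1, d)        # one D-dim descriptor per position
    words = codebook.predict(descriptors)           # hard assignment to visual words
    assignment_map = words.reshape(h, w)            # compact map relating regions to words
    hist = np.bincount(words, minlength=codebook.n_clusters).astype(np.float32)
    hist /= np.linalg.norm(hist) + 1e-8             # L2-normalised BoW vector for retrieval
    return hist, assignment_map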
Student: D{\`e}lia Fern{\`a}ndez
Advisors: V{\'\i}ctor Campos (UPC), Brendan Jou (Columbia University), Xavier Gir{\'o}-i-Nieto (UPC) and Shih-Fu Chang (Columbia University)
Grade: A+ (10.0/10.0) - Best Master Thesis award (Class 2016)
One of the main problems in visual affective computing is overcoming the affective gap between low-level visual features and the emotional content of the image. One rising method to capture visual affect is through the use of Adjective-Noun Pairs (ANPs), a mid-level affect representation. This thesis addresses two challenges related to ANPs: representing ANPs in a structured ontology and improving ANP detectability. The first part develops two techniques to exploit relations between adjectives and nouns for automatic ANP clustering. The second part introduces and analyzes a novel deep neural network for ANP prediction. Based on the hypothesis of a different contribution of the adjective and the noun depending on the ANP, the novel network fuses the feature representations of adjectives and nouns from two independently trained convolutional neural networks.
This project addresses a novel problem that has appeared in recent years. The use of egocentric cameras, devices that capture images of what we see, is growing, and these images raise two main problems: big data (by the end of a day we can have thousands of images, some of them similar to each other and some of bad quality or low information) and image retrieval (given such volumes, finding a certain moment is very difficult, and if we do not solve that problem the properties of egocentric images become useless).
This work has two objectives. The first is to explore images that have associated physiological signals, in order to add physiological features to the retrieval instead of basing it only on visual features, as the current state of the art does. For this part we associate interesting images with memorable ones, so that a correlation between memorability and physiological signals can be found. The second objective is to deal with the egocentric paradigm: some recent works show that machine learning algorithms trained with human-taken images cannot be directly extended to egocentric images due to how these images are constructed. Based on previous work by MIT (Massachusetts Institute of Technology), I built a visual game that allows the memorability of images to be manually annotated through simple user interaction (the user does not know that he is annotating images during the game). From this game I computed memorability scores, and I obtained predicted scores from MemNet, the convolutional neural network that MIT presented in its work. I then compared both results in order to decide whether applying these algorithms to egocentric images is possible.
}, author = {Carn{\'e}-Herrera, Marc}, editor = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @conference {cMarsden, title = {Dublin City University and Partners{\textquoteright} Participation in the INS and VTT Tracks at TRECVid 2016}, booktitle = {TRECVID Workshop 2016}, year = {2016}, month = {11/2016}, address = {Gaithersburg, MD, USA}, abstract = {DCU participated with a consortium of colleagues from NUIG and UPC in two tasks,\ INS and VTT. For the INS task we developed a framework consisting of face detection and\ representation and place detection and representation, with a user annotation of top-ranked\ videos. For the VTT task we ran 1,000 concept detectors from the VGG-16 deep CNN on\ 10 keyframes per video and submitted 4 runs for caption re-ranking, based on BM25, Fusion,\ Word2Vec and a fusion of baseline BM25 and Word2Vec. With the same pre-processing for\ caption generation we used an open source image-to-caption CNN-RNN toolkit NeuralTalk2\ to generate a caption for each keyframe and combine them.
}, url = {http://doras.dcu.ie/21484/}, author = {Marsden, Mark and Mohedano, Eva and McGuinness, Kevin and Calafell, Andrea and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Zhou, Jiang and Azevedo, Lucas and Daubert, Tobias and Davis, Brian and H{\"u}rlimann, Manuela and Afli, Haithem and Du, Jinhua and Ganguly, Debasis and Li, Wei and Way, Andy and Smeaton, Alan F.} } @unpublished {xBellver, title = {Efficient search of objects in images using deep reinforcement learning}, journal = {NIPS Women in Machine Learning Workshop}, year = {2016}, type = {Abstract}, address = {Barcelona.}, abstract = {When we humans look at an image, we always perform a sequential extraction of information in order to understand its content. First, we fix our gaze on the most salient part of the image, and from the information extracted we guide our look towards another point of the image, until we have analyzed all of its relevant information. This is our natural and instinctive behaviour to gather information from our surroundings. Traditionally in computer vision, images have been analysed at the local scale following a sliding window scanning, often at different scales. This approach analyses the different parts of the image independently, without constructing a correlation among them. Just by introducing a hierarchical partition of the image, we can more easily exploit the correlation between regions through a top-down scanning which first takes a global view of the image and then sequentially focuses on the local parts that contain the relevant information (e.g. objects or faces). Moreover, if we train a deep architecture that is not based on rewarding regions observed independently, such as traditional object proposals, but rewards successful long-term searches by connecting the different regions observed, we can achieve a sequential detection of objects, which is proven to be richer in information than using simple independent fixations.
\
The goal of this ongoing research is to perform an efficient detection of objects in images. In order to be efficient, the key idea is to focus on those parts of the image which contain richer information and zoom on them, guiding a hierarchical search for objects. An intelligent entity capable of deciding where to focus the attention in the image is trained using deep reinforcement learning techniques. This RL agent first looks at the whole image and decides which partition of a quadtree is richest for finding a certain category of objects. The agent is trained with deep Q-learning, using an architecture similar to the one used by DeepMind [1].
\
This work is based on the key idea that with reinforcement learning we can perform a sequential search that rewards short sequences of searches that obtain the highest long-term reward in terms of intersection over union of predicted bounding boxes and ground truth bounding boxes.
\
The input of the network is a convolutional descriptor of the region observed at the current step and a history vector that describes the previous steps of the search. This idea was also used in [2]. Our main difference from that approach is that we use a fixed hierarchical partition to guide our sequential search. Furthermore, in order to be efficient, sharing convolutional features is a key aspect of our system{\textquoteright}s pipeline. Convolutional features from VGG-16 [3] are extracted from the initial full-resolution picture, and the descriptors for each subpartition are then cropped from this feature map.
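As a rough illustration of this feature-sharing step, the sketch below maps an image sub-window onto the corresponding crop of a pre-computed convolutional feature map. The stride value and the mean pooling are assumptions for the sketch, not the exact setup of this work.

import numpy as np

def crop_region_features(feature_map, box, stride=16):
    # feature_map: (H, W, D) activations of the full image from one forward pass.
    # box: (x0, y0, x1, y1) region in image pixel coordinates.
    # A stride of 16 pixels per activation is assumed (roughly VGG-16 conv5_3);
    # no new forward pass is needed, the descriptor is cropped from the shared map.
    x0, y0, x1, y1 = [int(round(c / stride)) for c in box]
    crop = feature_map[y0:max(y1, y0 + 1), x0:max(x1, x0 + 1), :]
    return crop.mean(axis=(0, 1))  # pooled, fixed-length descriptor for the agent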
\
References:
[1] Mnih, V., Kavukcuoglu, K., Silver, D., Graves, A., Antonoglou, I., Wierstra, D., \& Riedmiller, M. (2013). Playing atari with deep reinforcement learning. arXiv preprint arXiv:1312.5602.
\
[2] Caicedo, J. C., \& Lazebnik, S. (2015). Active object localization with deep reinforcement learning. In Proceedings of the IEEE International Conference on Computer Vision (pp. 2488-2496).
[3] Simonyan, K., \& Zisserman, A. (2014). Very deep convolutional networks for large-scale image recognition. International Conference on Learning Representation 2015.
\
}, author = {M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @article {xCarne-Herrera, title = {EgoMemNet: Visual Memorability Adaptation to Egocentric Images}, year = {2016}, month = {06/2016}, institution = {4th Workshop on Egocentric (First-Person) Vision, CVPR 2016}, address = {Las Vegas, NV, USA}, abstract = {This work explores the adaptation of visual memorability prediction for photos intentionally captured by handheld cameras, to images passively captured from an egocentric point of view by wearable cameras. The estimation of a visual memorability score for an egocentric image is a valuable cue when filtering among the large amount of photos generated by wearable cameras. For this purpose, a new annotation tool and annotated dataset are presented, and used to fine-tune a pre-trained convolutional neural network.
Extended abstract presented as poster in the\ 4th Workshop on Egocentric (First-Person) Vision,\ CVPR 2016.\
This project focuses on the creation of a new type of egocentric (first-person) vision dataset. For that purpose, the EgoMon Gaze \& Video Dataset is presented. The EgoMon dataset was recorded using eye-gaze tracking technology, which studies the movement and position of the eyes. The Tobii glasses (a wearable, head-mounted eye tracker) were the main tool used to record and extract the gaze data for this dataset. The dataset consists of 7 videos (34 minutes each, on average), 13428 frames extracted from the videos (at a frequency of 1 fps), and 7 files with the gaze data (fixation points of the wearer of the glasses) for each frame and video. The videos were recorded in the city of Dublin (Ireland), both indoors and outdoors. The generated dataset has been used to evaluate the performance of a state-of-the-art model for visual saliency prediction on egocentric video.
Grade: B (8.2/10.0)
Image representations derived from pre-trained Convolutional Neural Networks (CNNs) have become the new state of the art in computer vision tasks such as instance retrieval. This work explores the suitability for instance retrieval of image- and region-wise representations pooled from an object detection CNN such as Faster R-CNN. We take advantage of the object proposals learned by a Region Proposal Network (RPN) and their associated CNN features to build an instance search pipeline composed of a first filtering stage followed by a spatial reranking. We further investigate the suitability of Faster R-CNN features when the network is fine-tuned for the same objects one wants to retrieve. We assess the performance of our proposed system with the Oxford Buildings 5k, Paris Buildings 6k and a subset of TRECVid Instance Search 2013, achieving competitive results.
\
Computers are acquiring an increasing ability to detect high-level visual content such as objects in images, but often lack an affective comprehension of this content. Affective computing is useful for the behavioral sciences, with applications in brand monitoring or measuring the effect of advertisements. The main problem in the visual task of mapping affect or emotions to images is overcoming the affective gap between low-level features and the emotional content of the image.
\
One rising method to capture visual affect is through the use of Adjective-Noun Pairs (ANPs). ANPs were introduced as a mid-level affect representation to overcome the affective gap by combining nouns, which define the object content, and adjectives, which add a strong emotional bias, yielding concepts such as {\textquotedblleft}happy dog{\textquotedblright} or {\textquotedblleft}misty morning{\textquotedblright}.
\
Current state-of-the-art methods approach ANP prediction by training visual classifiers on these pairs. In this work, we hypothesize that the visual contributions of nouns and adjectives differ across ANPs. We propose a feature-based intermediate representation for ANP prediction using specialized convolutional networks for adjectives and nouns separately. By fusing representations from nouns and adjectives, the network learns how much each contributes to a given ANP, which a single-tower network does not allow.
\
The specialized noun and adjective networks follow an AlexNet-styled architecture. These networks are fused into an intermediate feature representation, and ANPs are then learned from it using a fully-connected network. We investigate noun and adjective contributions with two kinds of fusion. The first fusion uses the output of the softmax layer: these are class-probability features, so all dimensions have class-correspondence to adjectives and nouns. The second fusion uses the fc7 layer output: these features contain visual information, allowing interpretation of adjective and noun visual relevance. For the feature contributions of each ANP, we compute a deep Taylor decomposition [1].
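A minimal sketch of this kind of two-tower fusion, written with the Keras functional API rather than the AlexNet/Caffe-era setup actually used. The 4096-d fc7 inputs and the 1,200 ANP classes follow the text; the hidden layer size, optimizer and everything else are assumptions.

from tensorflow.keras import layers, Model

adj_in = layers.Input(shape=(4096,), name="adjective_fc7")   # tower 1: adjective features
noun_in = layers.Input(shape=(4096,), name="noun_fc7")       # tower 2: noun features
fused = layers.Concatenate()([adj_in, noun_in])              # intermediate representation
hidden = layers.Dense(2048, activation="relu")(fused)        # learns per-ANP contributions
anp_out = layers.Dense(1200, activation="softmax")(hidden)   # 1,200 ANP classes
model = Model([adj_in, noun_in], anp_out)
model.compile(optimizer="sgd", loss="categorical_crossentropy")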
\
For the experiments, we use a subset of 1,200 ANPs from the tag-based English-MVSO [2] dataset. The ANPs are composed of combinations of 350 adjectives and 617 nouns. With identical settings to the adjective and noun networks, an ANP classification network is trained end-to-end as the baseline. Using the fc7 features, we improve over the baseline in both top-1 and top-5 accuracy. We also observe that adjectives and nouns contribute differently across ANPs; e.g. for the ANP {\textquotedblleft}pregnant woman{\textquotedblright} the adjective contributes the most, while for {\textquotedblleft}cute cat{\textquotedblright} the predominant contribution is in the noun. Using the probability features we find other insights, such as nouns or adjectives co-occurring together: e.g. for {\textquotedblleft}happy halloween{\textquotedblright} the contributions of the nouns {\textquotedblleft}blood{\textquotedblright} and {\textquotedblleft}cat{\textquotedblright}, and of the adjectives {\textquotedblleft}haunted{\textquotedblright} and {\textquotedblleft}dark{\textquotedblright}, are also high.
\
Based on the experimental results, we confirm our hypothesis that adjectives and nouns contribute differently to ANP concepts. Furthermore, our architecture outperforms traditional methods while giving insights on the role of adjectives and nouns in the prediction.
\
[1] Montavon, Gr{\'e}goire, et al. "Deep Taylor Decomposition of Neural Networks." ICML Workshop on Visualization for Deep Learning, 2016.
\
[2] Jou, Brendan, et al. "Visual affect around the world: A large-scale multilingual visual sentiment ontology." ACMM, 2015.
}, author = {Fern{\`a}ndez, D{\`e}lia and V{\'\i}ctor Campos and Jou, Brendan and Xavier Gir{\'o}-i-Nieto and Chang, Shih-Fu} } @conference {cBellver, title = {Hierarchical Object Detection with Deep Reinforcement Learning}, booktitle = {Deep Reinforcement Learning Workshop, NIPS 2016}, year = {2016}, month = {12/2016}, abstract = {We present a method for performing hierarchical object detection in images guided by a deep reinforcement learning agent. The key idea is to focus on those parts of the image that contain richer information and zoom on them. We train an intelligent agent that, given an image window, is capable of deciding where to focus the attention among five different predefined region candidates (smaller windows). This procedure is iterated providing a hierarchical image analysis.
We compare two different candidate proposal strategies to guide the object search: with and without overlap. Moreover, our work compares two different strategies to extract features from a convolutional neural network for each region proposal: a first one that computes new feature maps for each region proposal, and a second one that computes the feature maps for the whole image and later generates crops for each region proposal.
Experiments indicate better results for the overlapping candidate proposal strategy and a loss of performance for the cropped image features due to the loss of spatial resolution. We argue that, while this loss seems unavoidable when working with large numbers of object candidates, the much smaller number of region proposals generated by our reinforcement learning agent makes it affordable to extract features for each location without sharing convolutional computation among regions.
}, author = {M{\'\i}riam Bellver and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Jordi Torres} } @conference {cdeOliveira-Barraa, title = {Large Scale Content-Based Video Retrieval with LIvRE}, booktitle = {14th International Workshop on Content-based Multimedia Indexing (CBMI)}, year = {2016}, month = {06/2016}, publisher = {IEEE}, organization = {IEEE}, address = {Bucharest, Romania}, abstract = {The fast growth of video data requires robust, efficient, and scalable systems to allow for indexing and retrieval. These systems must be accessible from lightweight, portable and usable interfaces to help users in management and search of video content. This demo paper presents LIvRE, an extension of an existing open source tool for image retrieval to support video indexing. LIvRE consists of three main system components (pre-processing, indexing and retrieval), as well as a scalable and responsive HTML5 user interface accessible from a web browser. LIvRE supports image-based queries, which are efficiently matched with the extracted frames of the indexed videos.
}, author = {Gabriel de Oliveira-Barra and Lux, Mathias and Xavier Gir{\'o}-i-Nieto} } @conference {cdeOliveira-Barra, title = {LEMoRe: A Lifelog Engine for Moments Retrieval at the NTCIR-Lifelog LSAT Task}, booktitle = {The 12th NTCIR Conference, Evaluation of Information Access Technologies}, year = {2016}, month = {06/2016}, publisher = {National Institute of Informatics (NII)}, organization = {National Institute of Informatics (NII)}, address = {Tokyo, Japan}, abstract = {Semantic image retrieval from large amounts of egocentric visual data requires leveraging powerful techniques for filling in the semantic gap. This paper introduces LEMoRe, a Lifelog Engine for Moments Retrieval, developed in the context of the Lifelog Semantic Access Task (LSAT) of the NTCIR-12 challenge, and discusses its performance variation on different trials. LEMoRe integrates classical image descriptors with high-level semantic concepts extracted by Convolutional Neural Networks (CNN), powered by a graphic user interface that uses natural language processing. Although this is just a first attempt towards interactive image retrieval from large egocentric datasets and there is large room for improvement of the system components and the user interface, the structure of the system itself and the way the single components cooperate are very promising.
}, url = {http://research.nii.ac.jp/ntcir/workshop/OnlineProceedings12/NTCIR/toc_ntcir.html$\#$Lifelog}, author = {Gabriel de Oliveira-Barra and Xavier Gir{\'o}-i-Nieto and Cartas-Ayala, Alejandro and Radeva, Petia} } @conference {cGurrin, title = {LTA 2016 - The First Workshop on Lifelogging Tools and Applications}, booktitle = {ACM Multimedia}, year = {2016}, month = {10/2016}, publisher = {ACM}, organization = {ACM}, address = {Amsterdam, The Netherlands}, abstract = {The organisation of personal data is receiving increasing research attention due to the challenges that are faced in gathering, enriching, searching and visualising this data. Given the increasing quantities of personal data being gathered by individuals, the concept of a lifelog digital library of rich multimedia and sensory content for every individual is fast becoming a reality. The LTA2016 lifelogging workshop at ACM MM 2016 aims to bring together academics and practitioners to discuss approaches to lifelog data analytics and the applications of same, and to debate the opportunities and challenges for researchers in this new and challenging area.
\
}, author = {Ferri, Andrea}, editor = {Xavier Gir{\'o}-i-Nieto and Jordi Torres and Amaia Salvador} } @mastersthesis {xMasuda-Mora, title = {Open-Ended Visual Question-Answering}, year = {2016}, abstract = {Advisors: Santiago de la Puente and Xavier Gir{\'o}-i-Nieto
Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)
Grade: A with honors (10/10.0)
This thesis studies methods to solve Visual Question-Answering (VQA) tasks with a Deep Learning framework. As a preliminary step, we explore Long Short-Term Memory (LSTM) networks used in Natural Language Processing (NLP) to tackle Question-Answering (text-based). We then modify the previous model to accept an image as an input in addition to the question. For this purpose, we explore the VGG-16 and K-CNN convolutional neural networks to extract visual features from the image. These are merged with the word embedding or with a sentence embedding of the question to predict the answer. This work was successfully submitted to the Visual Question Answering Challenge 2016, where it achieved an accuracy of 53.62\% on the test dataset. The developed software has followed the best programming practices and Python code style, providing a consistent baseline in Keras for different configurations. The source code and models are publicly available at https://github.com/imatge-upc/vqa-2016-cvprw.
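A minimal sketch of the kind of model described, in Keras (which the thesis itself uses): pre-extracted VGG-16 image features are merged with an LSTM embedding of the question to predict one of the most frequent answers. All dimensions, vocabulary sizes and the merge operation here are illustrative assumptions, not the exact configuration of the thesis.

from tensorflow.keras import layers, Model

img_in = layers.Input(shape=(4096,))                    # pre-extracted VGG-16 features
q_in = layers.Input(shape=(None,), dtype="int32")       # tokenised question
q_emb = layers.Embedding(input_dim=10000, output_dim=256)(q_in)
q_vec = layers.LSTM(256)(q_emb)                         # sentence embedding of the question
merged = layers.Concatenate()([layers.Dense(256, activation="relu")(img_in), q_vec])
answer = layers.Dense(1000, activation="softmax")(merged)  # top-1000 answer classes
model = Model([img_in, q_in], answer)
model.compile(optimizer="adam", loss="categorical_crossentropy")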
\
The prediction of salient areas in images has been traditionally addressed with hand-crafted features based on neuroscience principles. This paper, however, addresses the problem with a completely data-driven approach by training a convolutional neural network (convnet). The learning process is formulated as a minimization of a loss function that measures the Euclidean distance of the predicted saliency map with the provided ground truth. The recent publication of large datasets of saliency prediction has provided enough data to train end-to-end architectures that are both fast and accurate. Two designs are proposed: a shallow convnet trained from scratch, and another, deeper solution whose first three layers are adapted from another network trained for classification. To the authors{\textquoteright} knowledge, these are the first end-to-end CNNs trained and tested for the purpose of saliency prediction.
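As an illustrative sketch of the shallow, end-to-end variant (not the exact architecture of the paper): a small convnet regresses the saliency map and is trained by minimising the Euclidean (MSE) distance to the ground-truth map. Input size, depth and filter counts are assumptions.

from tensorflow.keras import layers, Model

x_in = layers.Input(shape=(96, 96, 3))                          # input image (size assumed)
x = layers.Conv2D(32, 5, padding="same", activation="relu")(x_in)
x = layers.Conv2D(64, 3, padding="same", activation="relu")(x)
sal = layers.Conv2D(1, 1, activation="sigmoid")(x)              # predicted saliency map
model = Model(x_in, sal)
model.compile(optimizer="adam", loss="mse")                     # Euclidean loss vs. ground truth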
Student: Alejandro Nespereira
Advisors: Farzad Husain (Catchoom), Tomasz Adamek (Catchoom) and Xavier Gir{\'o}-i-Nieto (UPC)
Program:\ Master in Computer Vision\ (Class of 2016)
This report explores the suitability of using a Siamese Convolutional Neural Network (CNN) for the task of false positive rejection. We present a Siamese CNN model trained with an in-house dataset of weakly textured objects. Our model is able to successfully verify the classifications of an object detection pipeline on previously unseen objects. Additionally, we compare it against a hand-crafted method to benchmark its performance. We demonstrate the usage of our model by learning to discriminate between inter- and intra-object classes on a challenging dataset.
}, author = {Nespereira, Alejandro}, editor = {Husain, Farzad and Adamek, Tomasz and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xMontes, title = {Temporal Activity Detection in Untrimmed Videos with Recurrent Neural Networks}, year = {2016}, abstract = {Advisors: Amaia Salvador and Xavier Gir{\'o}-i-Nieto.
Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)
Grade: A (9.8/10.0)
This thesis explores different approaches using Convolutional and Recurrent Neural Networks to classify and temporally localize activities in videos; furthermore, an implementation to achieve this is proposed.
As a first step, features are extracted from video frames using a state-of-the-art 3D Convolutional Neural Network. These features are fed into a recurrent neural network that solves the activity classification and temporal localization tasks in a simple and flexible way.
Different architectures and configurations have been tested in order to achieve the best performance and learning on the provided video dataset. In addition, different kinds of post-processing over the trained network{\textquoteright}s output have been studied to achieve better results on the temporal localization of activities in the videos.
The results provided by the neural network developed in this thesis were submitted to the ActivityNet Challenge 2016 at CVPR, achieving competitive results with a simple and flexible architecture.
This work proposes a simple pipeline to classify and temporally localize activities in untrimmed videos. Our system uses features from a 3D Convolutional Neural Network (C3D) as input to train a recurrent neural network (RNN) that learns to classify video clips of 16 frames. After clip prediction, we post-process the output of the RNN to assign a single activity label to each video, and determine the temporal boundaries of the activity within the video. We show how our system can achieve competitive results in both tasks with a simple architecture. We evaluate our method in the ActivityNet Challenge 2016, achieving a 0.5874 mAP and a 0.2237 mAP in the classification and detection tasks, respectively. Our code and models are publicly available at: https://imatge-upc.github.io/activitynet-2016-cvprw/
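A hedged sketch of the post-processing stage only: per-clip class probabilities from the RNN are turned into a single video label and the temporal extent of the activity. The thresholding rule and frame rate are assumptions, not necessarily the ones behind the reported mAP figures.

import numpy as np

def localize_activity(clip_probs, fps=25.0, clip_len=16, threshold=0.5):
    # clip_probs: (num_clips, num_classes) softmax outputs, one row per 16-frame clip.
    label = int(np.argmax(clip_probs.mean(axis=0)))        # single activity per video
    active = np.flatnonzero(clip_probs[:, label] > threshold)
    if active.size == 0:
        return label, None                                 # detected class, no localization
    t_start = active[0] * clip_len / fps                   # boundaries in seconds
    t_end = (active[-1] + 1) * clip_len / fps
    return label, (t_start, t_end)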
}, author = {Montes, Alberto and Amaia Salvador and Pascual-deLaPuente, Santiago and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xReyesa, title = {Time-sensitive Egocentric Image Retrieval for Finding Objects in Lifelogs}, year = {2016}, abstract = {Advisors: Eva Mohedano (Insight DCU), Kevin McGuinness (Insight DCU) and Xavier Gir{\'o}-i-Nieto (UPC)
Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)
Grade: A (9.4/10.0)
This work explores diverse practices for conducting an object search over large amounts of egocentric images, taking into account their temporal information. The application of this technology is to identify where personal belongings were lost or forgotten. We develop a pipeline-structured system. Firstly, the images of the day being scanned are sorted based on their probability of depicting the forgotten object. This stage is solved by applying an existing visual search engine based on deep learning features. Secondly, a learned threshold selects the top-ranked images as candidates to contain the object. Finally, the images are reranked based on temporal and diversity criteria. Furthermore, we build a validation environment for assessing the system{\textquoteright}s performance, aiming to find the optimal configuration of its parameters. Due to the lack of related works to compare with, this thesis proposes a novel evaluation framework and metric to assess the problem.
\
This extended abstract presents our research in the generic field of Visual Question-Answering (VQA) focusing on a new branch that aims to generate question-answer pairs based on an image.\ To do so, we use the VQA dataset provided for the VQA challenge to train a Deep Neural Network which has the image as an input and two different outputs, the question and its associated answer.
Extended abstract presented as poster in the Visual Question Answering Challenge Workshop, CVPR 2016.
}, url = {http://www.visualqa.org/abstracts.html}, author = {Masuda-Mora, Issey and Pascual-deLaPuente, Santiago and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xCalafell, title = {Video Retrieval of Specific Persons in Specific Locations}, year = {2016}, abstract = {Student: Andrea Calafell
Advisors: Eva Mohedano (Insight), Kevin McGuinness (Insight), Noel E. O{\textquoteright}Connor (Insight) and Xavier Gir{\'o}-i-Nieto (UPC)
Program: Master in Computer Vision (Class of 2016)
Grade: A (9.0/10.0)
This thesis explores good practices for improving the detection of specific people in specific places. An approach combining recurrent and convolutional neural networks was considered to perform face detection; however, other more conventional methods were also tested, with the best results obtained by exploiting a deformable part model approach. A CNN is also used to obtain the face feature vectors and, with the purpose of helping the face recognition, an approach to perform query expansion has also been developed. Furthermore, in order to evaluate the different configurations on our unlabelled dataset, a user interface has been used to annotate the images and obtain the precision of the system. Finally, different fusion and normalization strategies have been explored with the aim of combining the scores obtained from the face recognition with those obtained from the place recognition.
This project explores the visual memorability of egocentric images, with three main contributions. The first and main contribution is a new annotation tool for visual memorability in egocentric images: a web application that allows the annotation of the visual memorability associated with still images through an online game. The second contribution is a convolutional neural network model for visual memorability prediction that adapts an off-the-shelf model to egocentric images. Moreover, a visualization study has been pursued to localize the regions of the images that are more memorable than others; with these maps, a comparison with saliency maps is explored. This part of the research opens a new branch in visual memorability that consists in using memorability maps for saliency prediction. The memorability of the images is also related to a sentiment analysis by applying a model that predicts that feature. The final contribution joins the visual memorability of images with human behaviour and physical state, finding a relation between memory and physiological signals such as heart rate, galvanic skin response and electroencephalographic signals.
Grade: A with honors (9.8/10.0)
}, author = {Carn{\'e}-Herrera, Marc}, editor = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dVentura16, title = {Visual Object Analysis using Regions and Local Features}, year = {2016}, abstract = {Thesis submission: 10-06-2016
Defense date:\ 08-07-2016
Grade: Excellent Cum Laude \& International Mention
\
The first part of this dissertation focuses on an analysis of the spatial context in semantic image segmentation. First, we review how spatial context has been tackled in the literature by local features and spatial aggregation techniques. From a discussion about whether the context is beneficial or not for object recognition, we extend a Figure-Border-Ground segmentation for local feature aggregation with ground truth annotations to a more realistic scenario where object proposal techniques are used instead. Whereas the Figure and Ground regions represent the object and the surround respectively, the Border is a region around the object contour, which is found to be the region with the richest contextual information for object recognition. Furthermore, we propose a new contour-based spatial aggregation technique of the local features within the object region by a division of the region into four subregions. Both contributions have been tested on a semantic segmentation benchmark with a combination of free and non-free context local features that allows the models to automatically learn whether the context is beneficial or not for each semantic category.
The second part of this dissertation addresses the semantic segmentation of a set of closely-related images from an uncalibrated multiview scenario. State-of-the-art semantic segmentation algorithms fail to correctly segment the objects from some viewpoints when the techniques are independently applied to each viewpoint image. The lack of large annotated datasets for multiview segmentation does not allow obtaining a model that is robust to viewpoint changes. In this second part, we exploit the spatial correlation that exists between the different viewpoint images to obtain a more robust semantic segmentation. First, we review the state-of-the-art co-clustering, co-segmentation and video segmentation techniques that aim to segment the set of images in a generic way, i.e. without considering semantics. Then, a new architecture that considers motion information and provides a multiresolution segmentation is proposed for the co-clustering framework and outperforms state-of-the-art techniques for generic multiview segmentation. Finally, the proposed multiview segmentation is combined with the semantic segmentation results, giving a method for automatic resolution selection and a coherent semantic multiview segmentation.
\
The interest of users in having their lives digitally recorded has grown in recent years thanks to advances in wearable sensors. Wearable cameras are among the most informative ones, but they generate large amounts of images that require automatic analysis to build useful applications upon them. In this work we explore the potential of these devices to find the last appearance of personal objects among the more than 2,000 images that are generated every day. This application could help in developing personal assistants capable of helping users when they do not remember where they left their personal objects. We adapt a previous work on instance search to the specific domain of egocentric vision.
Extended abstract presented as poster in the 4th Workshop on Egocentric (First-Person) Vision,\ CVPR 2016.\
}, author = {Reyes, Cristian and Mohedano, Eva and McGuinness, Kevin and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto} } @conference {cReyes, title = {Where is my Phone? Personal Object Retrieval from Egocentric Images}, booktitle = {Lifelogging Tools and Applications Workshop in ACM Multimedia}, year = {2016}, month = {10/2016}, publisher = {ACM}, organization = {ACM}, address = {Amsterdam, The Netherlands}, abstract = {This work presents a retrieval pipeline and evaluation scheme for the problem of finding the last appearance of personal objects in a large dataset of images captured from a wearable camera. Each personal object is modelled by a small set of images that define a query for a visual search engine. The retrieved results are reranked considering the temporal timestamps of the images to increase the relevance of the later detections. Finally, a temporal interleaving of the results is introduced for robustness against false detections. The Mean Reciprocal Rank is proposed as a metric to evaluate this problem. This application could help in developing personal assistants capable of helping users when they do not remember where they left their personal belongings.
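A minimal sketch of the temporal reranking idea described above, under the assumption of a simple linear recency weight (the exact weighting of the paper is not reproduced here):

import numpy as np

def rerank_by_recency(scores, timestamps):
    # scores: visual relevance per image; timestamps: capture times (epoch seconds).
    t = np.asarray(timestamps, dtype=np.float64)
    recency = (t - t.min()) / max(t.max() - t.min(), 1e-9)   # 0 = oldest, 1 = newest
    combined = np.asarray(scores) * (0.5 + 0.5 * recency)    # favour later detections
    return np.argsort(-combined)                             # image indices, best first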
}, doi = {http://dx.doi.org/10.1145/2983576.2983582}, url = {http://arxiv.org/abs/1608.08139}, author = {Reyes, Cristian and Mohedano, Eva and McGuinness, Kevin and Noel E. O{\textquoteright}Connor and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xCabezas, title = {Co-filtering human interaction and object segmentation}, year = {2015}, abstract = {Advisors: Axel Carlier and Vincent Charvillat (ENSEEIHT-Universit{\'e} de Toulouse) / Amaia Salvador and\ Xavier Gir{\'o}-i-Nieto\ (UPC)\
Degree: Audiovisual Engineering (4 years) at\ Telecom BCN-ETSETB\ (UPC)
Grade: A with honors (9.6/10)
For many years, the problem of object segmentation has been present in the image processing field. Click{\textquoteright}n{\textquoteright}Cut, an existing web tool for interactive object segmentation, helps obtain segmentations of objects by clicking in green (foreground clicks) inside the object to segment, and in red (background clicks) outside it. However, not all humans behave in the same way in front of this web tool, so some human interactions may not help to obtain a good object segmentation, as would be the result of a bad interaction. The main aim of this project is to implement techniques that deal with these bad human interactions in order to obtain the best possible object segmentation.
This paper presents our contribution to the ChaLearn Challenge 2015 on Cultural Event Classification. The challenge in this task is to automatically classify images from 50 different cultural events. Our solution is based on the combination of visual features extracted from convolutional neural networks with temporal information using a hierarchical classifier scheme. We extract visual features from the last three fully connected layers of both CaffeNet (pretrained with ImageNet) and our fine tuned version for the ChaLearn challenge. We propose a late fusion strategy that trains a separate low-level SVM on each of the extracted neural codes. The class predictions of the low-level SVMs form the input to a higher level SVM, which gives the final event scores. We achieve our best result by adding a temporal refinement step into our classification scheme, which is applied directly to the output of each low-level SVM. Our approach penalizes high classification scores based on visual features when their time stamp does not match well an event-specific temporal distribution learned from the training and validation data. Our system achieved the second best result in the \ ChaLearn Challenge 2015 on Cultural Event Classification with a mean average precision of 0.767 on the test set.
\
Visual media are powerful means of expressing emotions and sentiments. The constant generation of new content in social networks highlights the need of automated visual sentiment analysis tools. While Convolutional Neural Networks (CNNs) have established a new state-of-the-art in several vision problems, their application to the task of sentiment analysis is mostly unexplored and there are few studies regarding how to design CNNs for this purpose. In this work, we study the suitability of fine-tuning a CNN for visual sentiment prediction as well as explore performance boosting techniques within this deep learning setting. Finally, we provide a deep-dive analysis into a benchmark, state-of-the-art network architecture to gain insight about how to design patterns for CNNs on the task of visual sentiment prediction.
Acceptance rate at the ASM workshop{\textquoteright}15: 56\% (9/16) [source]
\
Advisors: Carles Ventura-Royo (UPC) and Xavier Gir{\'o}-i-Nieto (UPC)
Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)
Grade: A (9.0/10.0)
The motivation of this work is the efficient exploration of hierarchical partitions for semantic segmentation as a method for locating objects in images. While many efforts have been focused on efficient image search in large-scale databases, few works have addressed the problem of locating and recognizing objects efficiently within a given image. My work takes as input a hierarchical partition of an image that defines a set of regions as candidate locations to contain an object. This approach will be compared to other state-of-the-art algorithms that extract object candidates from an image. The final goal of this work is to semantically segment images efficiently by exploiting the multiscale information provided by a hierarchical partition, maximizing the accuracy of the segmentation when only a few regions of the partition are analysed.
The prediction of saliency areas in images has been traditionally addressed with hand-crafted features based on neuroscience principles. This paper, however, addresses the problem with a completely data-driven approach by training a convolutional network. The learning process is formulated as a minimization of a loss function that measures the Euclidean distance of the predicted saliency map with the provided ground truth. The recent publication of large datasets of saliency prediction has provided enough data to train a not very deep architecture which is both fast and accurate. The convolutional network in this paper, named JuntingNet, won the LSUN 2015 challenge on saliency prediction with a superior performance in all considered metrics.
With the advent of affordable multimedia smartphones, it has become common for people to take videos at events. The larger the event, the larger the amount of videos taken there, and the more videos get shared online. Searching in this mass of videos is a challenging topic. In this paper we present and discuss a prototype software for searching in such videos. We focus only on visual information, and we report on experiments based on a research data set. With a small study we show that our prototype demonstrates promising results by identifying the same scene in different videos taken from different angles, solely based on content-based image retrieval.
}, url = {https://upcommons.upc.edu/handle/2117/76553}, author = {Roldan-Carlos, Jennifer and Lux, Mathias and Xavier Gir{\'o}-i-Nieto and Pia Mu{\~n}oz-Trallero and Anagnostopoulos, Nektarios} } @conference {cMohedano, title = {Exploring EEG for Object Detection and Retrieval}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR) }, year = {2015}, address = {Shanghai, China}, abstract = {This paper explores the potential for using Brain Computer Interfaces (BCI) as a relevance feedback mechanism in content-based image retrieval. We investigate if it is possible to capture useful EEG signals to detect if relevant objects are present in a dataset of realistic and complex images. We perform several experiments using a rapid serial visual presentation (RSVP) of images at different rates (5Hz and 10Hz) on 8 users with different degrees of familiarization with BCI and the dataset. We then use the feedback from the BCI and mouse-based interfaces to retrieve objects in a subset of TRECVid images. We show that it is indeed possible to detect such objects in complex images and, also, that users with previous knowledge of the dataset or experience with the RSVP outperform others. When the users have limited time to annotate the images (100 seconds in our experiments) both interfaces are comparable in performance. Comparing our best users in a retrieval task, we found that EEG-based relevance feedback outperforms mouse-based feedback. The realistic and complex image dataset differentiates our work from previous studies on EEG for image retrieval.
[Extended version in arXiv:1504.02356]
Overall acceptance rate: 33\% (source)
Advisors: Amaia Salvador (UPC), Matthias Zeppelzauer (FH St P{\"o}lten), Xavier Gir{\'o}-i-Nieto (UPC)
Studies: Bachelor Degree in Audiovisual Systems Engineering at Telecom BCN-ETSETB from the Technical University of Catalonia (UPC)
Grade: A with honors (10/10)
This thesis explores good practices for improving the performance of an existing convnet trained with a dataset of clean data when an additional dataset of noisy data is available. We develop techniques to clean the noisy data with the help of the clean one, a family of solutions that we will refer to as denoising, and then we explore the best ordering of the clean and noisy datasets during the fine-tuning of a convnet. Then we study strategies to select the subset of images of the clean data that will improve the classification performance, a practice we will refer to as fracking. Next, we determine how many layers it is actually best to fine-tune in our convnet, given our amount of data. And finally, we compare the classic convnet architecture, where a single network is fine-tuned to solve a multi-class problem, with the case of fine-tuning a convnet for binary classification for each considered class.
\
See https://imatge.upc.edu/web/publications/cultural-event-recognition-visual-convnets-and-temporal-models
This paper explores the potential of brain-computer interfaces in segmenting objects from images. Our approach is centered around designing an effective method for displaying the image parts to the users such that they generate measurable brain reactions. When a block of pixels is displayed, we estimate the probability of that block containing the object of interest using a score based on EEG activity. After several such blocks are displayed in rapid serial visual presentation, the resulting probability map is binarized and combined with the GrabCut algorithm to segment the image into object and background regions. This study extends our previous work, which showed how BCI and simple EEG analysis are useful in locating object boundaries in images.
}, issn = {1573-7721}, doi = {10.1007/s11042-015-2805-0}, url = {http://dx.doi.org/10.1007/s11042-015-2805-0}, author = {Mohedano, Eva and Healy, Graham and Kevin McGuinness and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Smeaton, Alan F.} } @conference {cVentura, title = {Improving Spatial Codification in Semantic Segmentation}, booktitle = {IEEE International Conference on Image Processing (ICIP), 2015}, year = {2015}, month = {09/2015}, publisher = {IEEE}, organization = {IEEE}, address = {Quebec City}, abstract = {This paper explores novel approaches for improving the spatial codification for the pooling of local descriptors to solve the semantic segmentation problem. We propose to partition the image into three regions for each object to be described: Figure, Border and Ground. This partition aims at minimizing the influence of the image context on the object description and vice versa by introducing an intermediate zone around the object contour. Furthermore, we also propose a richer visual descriptor of the object by applying a Spatial Pyramid over the Figure region. Two novel Spatial Pyramid configurations are explored: Cartesian-based and crown-based Spatial Pyramids. We test these approaches with state-of-the-art techniques and show that they improve the Figure-Ground based pooling in the Pascal VOC 2011 and 2012 semantic segmentation challenges.
This document contains supplementary material for the paper "Improving Spatial Codification in Semantic Segmentation" submitted to ICIP 2015. First, there is a section dedicated to the results obtained by categories when ideal object candidates (ground truth masks) are used. Then, an analysis of the results using CPMC and MCG object candidates is also detailed by categories. Finally, visual results for CPMC and MCG are shown.
}, author = {Ventura, C. and Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Kevin McGuinness and Marqu{\'e}s, F. and Noel E. O{\textquoteright}Connor} } @conference {cMcGuinnessa, title = {Insight DCU at TRECVID 2015}, booktitle = {TRECVID 2015 Workshop}, year = {2015}, month = {11/2015}, publisher = {NIST}, organization = {NIST}, address = {Gaithersburg, MD, USA}, abstract = {Insight-DCU participated in the instance search (INS), semantic indexing (SIN), and localization tasks (LOC) this year.
In the INS task we used deep convolutional network features trained on external data and the query data for this year to train our system. We submitted four runs, three based on convolutional network features, and one based on SIFT/BoW. F A insightdcu 1 was an automatic run using features from the last convolutional layer of a deep network with bag-of-words encoding and achieved 0.123 mAP. F A insightdcu 2 modified the previous run to use re-ranking based on an R-CNN model and achieved 0.111 mAP. I A insightdcu 3, our interactive run, achieved 0.269 mAP. Our SIFT-based run F A insightdcu 2 used weak geometric consistency to improve performance over the previous year to 0.187 mAP. Overall we found that using features from the convolutional layers improved performance over features from the fully connected layers used in previous years, and that weak geometric consistency improves performance for local feature ranking.
In the SIN task we again used convolutional network features, this time fine-tuning a network pretrained on external data for the task. We submitted four runs, 2C D A insightdcu.15 1..4, varying the top-level learning algorithm and the use of concept co-occurrence. 2C D A insightdcu.15 1 used a linear SVM top-level learner, and achieved 0.63 mAP. Exploiting concept co-occurrence improved the accuracy of our logistic regression run 2C D A insightdcu.15 3 from 0.058 mAP to 0.6 mAP.
Our LOC system used training data from IACC.1.B and features similar to our INS run, but using a VLAD encoding instead of a bag-of-words. Unfortunately there was a problem with the run that we are still investigating.
Note: UPC and NII participated only in the INS task of this submission.
}, url = {http://www-nlpir.nist.gov/projects/tvpubs/tv.pubs.15.org.html}, author = {Kevin McGuinness and Mohedano, Eva and Amaia Salvador and Zhang, ZhenXing and Marsden, Mark and Wang, Peng and Jargalsaikhan, Iveel and Antony, Joseph and Xavier Gir{\'o}-i-Nieto and Satoh, Shin{\textquoteright}ichi and O{\textquoteright}Connor, N. and Smeaton, Alan F.} } @mastersthesis {xRamos-Caballero15, title = {Keyframe-based Video Summarization Designer}, year = {2015}, abstract = {Advisors: Xavier Gir{\'o}-i-Nieto (UPC) and Horst Eidenberger (TU Wien)
Studies: Bachelor Degree in\ Audiovisual Systems Engineering\ at Telecom BCN-ETSETB from the Technical University of Catalonia (UPC)
Grade: B (8.7/10)
\
This Final Degree Work extends two previous projects: it improves the video keyframe extraction module of one of them, Designer Master, by integrating the algorithms developed in the other, Object Maps.
First, the proposed solution is explained: a shot detection method in which the input video is sampled uniformly, a cumulative pixel-to-pixel difference is computed, and a classifier decides which frames are keyframes.
Finally, to validate our approach we conducted a user study in which both applications were compared. Users were asked to complete a survey regarding different summaries created with the original application and with the one developed in this project. The analysis of the results showed that the new keyframe extraction module slightly improves the application{\textquoteright}s performance and the quality of the generated summaries.
Advisors: Amaia Salvador (UPC), Brendan Jou (Columbia University) and Xavier Gir{\'o}-i-Nieto (UPC)
Visual media are powerful means of expressing emotions and sentiments. The constant generation of new content in social networks highlights the need of automated visual sentiment analysis tools. While Convolutional Neural Networks (CNNs) have established a new state-of-the-art in several vision problems, their application to the task of sentiment analysis is mostly unexplored and there are few studies regarding how to design CNNs for this purpose. In this work, we study the suitability of fine-tuning a CNN for visual sentiment prediction as well as explore performance boosting techniques within this deep learning setting. Finally, we provide a deep-dive analysis into a benchmark, state-of-the-art network architecture to gain insight about how to design patterns for CNNs on the task of visual sentiment prediction.
\
Advisors:\ Mathias Lux\ (Klagenfurt University) and\ Xavier Gir{\'o}-i-Nieto\ (UPC)
Degree: Electronic Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)
Grade: A (9.0/10.0)
This project explores the expansion of the Lucene Image Retrieval Engine (LIRE), an open-source Content-Based Image Retrieval (CBIR) system, for video retrieval on large-scale video datasets. The fast-growing need to store huge amounts of video on servers requires efficient, scalable search and indexing engines capable of assisting users in their management and retrieval. In our tool, queries are formulated by visual examples, allowing users to find the videos, and the moments in time within them, that match the query image. The video dataset used in this scenario comprises over 1,000 hours of different news broadcast channels. This thesis presents an extension and adaptation of LIRE and its plugin for Solr, an open-source enterprise search platform from the Apache Lucene project, for video retrieval based on visual features, as well as a web interface for users on different devices.
The High Efficiency Video Coding standard (HEVC) supports a total of 35 intra prediction modes which aim at reducing spatial redundancy by exploiting pixel correlation within a local neighborhood. In this paper, we show that spatial correlation remains after intra prediction, leading to high-energy prediction residues. We propose a novel scheme for encoding the prediction residues using a Mode Dependent Vector Quantization (MDVQ) which aims at reducing the redundancy in the residual domain. The MDVQ codebook is optimized in a rate-distortion (RD) sense. Experimental results show that the codebook can be independent of the quantization parameter (QP) with no loss in terms of coding efficiency. A bitrate reduction of 1.1\% on average compared to HEVC can be achieved, while further tests indicate that codebook adaptivity could substantially improve the performance.
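As an illustration of the rate-distortion optimisation mentioned above, the sketch below performs a generic Lagrangian codeword search over a candidate codebook; the codebook, the per-index rates and the lambda value are placeholders, not the actual values of the paper.

import numpy as np

def rd_best_codeword(residual, codebook, code_bits, lam):
    # residual: flattened intra prediction residue of one block, shape (N,).
    # codebook: (K, N) candidate codevectors; code_bits: (K,) signalling cost in bits.
    distortion = ((codebook - residual) ** 2).sum(axis=1)  # SSE per candidate
    cost = distortion + lam * code_bits                    # Lagrangian RD cost D + lambda*R
    return int(np.argmin(cost))                            # index of the RD-optimal codeword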
}, author = {Huang, B. and Henry, F. and Guillemot, C. and Salembier, P.} } @conference {cNguyen, title = {NII-HITACHI-UIT at TRECVID 2015 Instance Search}, booktitle = {TRECVID 2015 Workshop}, year = {2015}, month = {11/2015}, publisher = {NIST}, organization = {NIST}, address = {Gaithersburg, MD, USA}, abstract = {In this paper, we propose two methods to improve last year{\textquoteright}s instance search framework. Both are based on a post-processing scheme that tries to rerank the top K shots returned by the BOW model. The first system proposes a query-adaptive weighting technique between the DPM object detector{\textquoteright}s score and the BOW{\textquoteright}s score. In order to find a good weight, we use a neural network which learns characteristics of the query, including the number of features, the number of shared words and the area of the query topic. The second system combines two state-of-the-art object detectors, DPM and Fast RCNN, to estimate object location and similarity score, respectively. The final score is computed using these components together with the BOW-based similarity score returned by the baseline system. The experimental results show that our system improved considerably even with a smaller top K input ranked list. Compared to other teams, we got second place with the same run.
}, url = {http://www-nlpir.nist.gov/projects/tvpubs/tv.pubs.15.org.html}, author = {Nguyen, Vinh-Tiep and Duy-Dinh-Le and Amaia Salvador and Caizhi-Zhu and Nguyen, Dinh-Luan and Tran, Minh-Triet and Ngo Duc, Thanh and Anh Duong, Duc and Satoh, Shin{\textquoteright}ichi and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xGris-Sarabia, title = {Pyxel, una llibreria per a l{\textquoteright}anotació automàtica de fotografies}, year = {2015}, abstract = {Tutor:\ Xavier Gir{\'o}-i-Nieto\ (UPC)\
Degree: Engineering on Audiovisual Systems (4 years) at Escola d{\textquoteright}Enginyeria de Terrassa (UPC)
Given that the generation and distribution of multimedia content has increased enormously, research is being carried out on tools for the automatic or semi-automatic annotation of images, sounds, videos, etc.
This document gathers the information on the development and operation of the Pyxel library, aimed specifically at the annotation of images using both the visual information of the image and the textual information that accompanies it, the metadata.
Pyxel is a set of classes developed in the Python programming language to carry out a complete image annotation chain: it supports the extraction of visual features with SIFT descriptors and the extraction of textual features from metadata with the TF-IDF text processing algorithm, as well as tools to create both vocabularies. It also provides tools for training and applying an SVM classifier.
In order to manage data from large volumes of images, the Pyxel tools are designed to process images in parallel, which is very useful for making optimal use of the resources of a computing service managed with SLURM.
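To make the annotation chain concrete, here is a minimal sketch of the early-fusion step, under the assumption that the SIFT bag-of-visual-words histograms have already been computed with the visual vocabulary; the function and variable names are illustrative and do not correspond to the actual Pyxel API:

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

def train_annotator(visual_bow, metadata_texts, labels):
    """visual_bow: (n_images, n_visual_words) BoW histograms of SIFT
    descriptors, assumed precomputed with the library's visual vocabulary;
    metadata_texts: one metadata string per image."""
    tfidf = TfidfVectorizer()
    textual = tfidf.fit_transform(metadata_texts).toarray()  # TF-IDF features
    features = np.hstack([visual_bow, textual])              # early fusion
    clf = LinearSVC().fit(features, labels)                  # SVM training
    return tfidf, clf
```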
Final grade: A (9/10)
This paper explores processing techniques to deal with noisy data in crowdsourced object segmentation tasks. We use the data collected with "Click{\textquoteright}n{\textquoteright}Cut", an online interactive segmentation tool, and we perform several experiments towards improving the segmentation results. First, we introduce different superpixel-based techniques to filter users{\textquoteright} traces, and assess their impact on the segmentation result. Second, we present different criteria to detect and discard the traces from potential bad users, resulting in a remarkable increase in performance. Finally, we show a novel superpixel-based segmentation algorithm which does not require any prior filtering and is based on weighting each user{\textquoteright}s contribution according to his/her level of expertise.
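As a rough sketch of the expertise-weighting idea, one can estimate a per-user reliability score (e.g. from accuracy on gold-standard images) and aggregate the traces per superpixel weighted by it; the data layout and threshold below are illustrative assumptions, not the exact algorithm of the paper:

```python
import numpy as np

def weighted_superpixel_mask(clicks, expertise, n_superpixels, thresh=0.5):
    """Aggregate users' clicks per superpixel, weighting each user's vote by
    an expertise score. clicks: (user_id, superpixel_id, is_foreground)."""
    votes = np.zeros(n_superpixels)
    weight = np.zeros(n_superpixels)
    for user, sp, fg in clicks:
        votes[sp] += expertise[user] * (1.0 if fg else 0.0)
        weight[sp] += expertise[user]
    score = np.divide(votes, weight, out=np.zeros_like(votes), where=weight > 0)
    return score >= thresh          # superpixels labelled as foreground
```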
Selected among the top 10\% of papers at ICIP 2015 based on the reviewer scores and recommendations.
[Related BSc thesis by Ferran Cabezas]
[IEEE ICIP 2015 conference website]
}, url = {http://arxiv.org/abs/1505.00145}, author = {Cabezas, Ferran and Carlier, Axel and Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Charvillat, Vincent} } @article {aBonet-Carne14a, title = {Quantitative Ultrasound Texture Analysis of Fetal Lungs To Predict Neonatal Respiratory Morbidity}, journal = {Ultrasound in Obstetrics and Gynecology, Wiley}, volume = {45}, year = {2015}, pages = {427{\textendash}433}, author = {E. Bonet-Carne and M. Palacio and T. Cobo and A. Perez-Moreno and M. Lopez and J. P. Piraquive and J. C. Ramirez and F. Marques and E. Gratacos} } @mastersthesis {xPorta, title = {Rapid Serial Visual Presentation for Relevance Feedback in Image Retrieval with EEG Signals}, year = {2015}, abstract = {Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)
Grade: A (9/10)
This thesis explores the potential of relevance feedback for image retrieval using EEG signals for human-computer interaction. This project aims at studying the optimal parameters of a rapid serial visual presentation (RSVP) of frames from a video database when the user is searching for an object instance. The simulations reported in this thesis assess the trade-off between using a small or a large amount of images in each RSVP round that captures the user feedback. While short RSVP rounds allow the system to quickly learn the user{\textquoteright}s intention, RSVP rounds must also be long enough to let users generate the P300 EEG signals which are triggered by relevant images. This work also addresses the problem of how to distribute potentially relevant and non-relevant images in an RSVP round to maximize the probability of displaying each relevant frame at least 1 second apart from any other relevant frame, as this configuration generates a cleaner P300 EEG signal. The presented simulations are based on a realistic set-up for video retrieval with a subset of 1,000 frames from the TRECVID 2014 Instance Search task.
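A minimal sketch of the scheduling constraint discussed above, assuming a fixed presentation rate and enough non-relevant filler frames to pad each gap; the 4 Hz default and all names are illustrative:

```python
def schedule_rsvp(relevant, fillers, rate_hz=4, min_sep_s=1.0):
    """Order one RSVP round so that any two potentially relevant frames are
    separated by at least min_sep_s seconds (min_gap filler slots), which
    favours clean, non-overlapping P300 responses. Assumes len(fillers) is
    at least len(relevant) * min_gap."""
    min_gap = int(rate_hz * min_sep_s)     # filler slots between relevant frames
    sequence, f = [], iter(fillers)
    for r in relevant:
        sequence.append(r)
        for _ in range(min_gap):           # pad with non-relevant frames
            sequence.append(next(f))
    return sequence
```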
}, keywords = {eeg, feedback, image, relevance, retrieval}, author = {Porta, Sergi}, editor = {Amaia Salvador and Mohedano, Eva and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @mastersthesis {xBosch, title = {Region-oriented Convolutional Networks for Object Retrieval}, year = {2015}, abstract = {Advisors: Amaia Salvador and\ Xavier Gir{\'o}-i-Nieto\ (UPC)\
Study program: Engineering on Audiovisual Systems (4 years) at Escola d{\textquoteright}Enginyeria de Terrassa\ (UPC)
Grade: A (9.6/10)
This thesis is framed in the computer vision field, addressing a challenge related to instance search. Instance search consists of searching for occurrences of a certain visual instance on a large collection of visual content, and generating a ranked list of results sorted according to their relevance to a user query. This thesis builds on existing work presented at the TRECVID Instance Search Task in 2014, and explores the use of local deep learning features extracted from object proposals. The performance of different deep learning architectures (at both global and local scales) is evaluated, and a thorough comparison of them is performed. Secondly, this thesis presents the guidelines to follow in order to fine-tune a convolutional neural network for tasks such as image classification, object detection and semantic segmentation. It does so with the final purpose of fine-tuning SDS, a CNN trained for both object detection and semantic segmentation, with the recently released Microsoft COCO dataset.
Student: Aniol Lidon
Advisors: Petia Radeva (UB) and Xavier Gir{\'o}-i-Nieto (UPC)
Program: Master in Computer Vision
Grade: A (9.8/10.0)
This project generates visual summaries of events depicted in egocentric photos taken with a wearable camera. These summaries are addressed to mild-dementia patients in order to exercise their memory on a daily basis. The main contribution is an iterative approach that guarantees the semantic diversity of the summary, together with a novel soft metric to assess subjective results. Medical experts validated the proposed solution with a Mean Opinion Score of 4.6 out of 5.0. The flexibility and quality of the solution were also tested in the 2015 Retrieving Diverse Social Images Task from the international scientific benchmark MediaEval.
This paper presents the results of the UPC-UB-STP team in the 2015 MediaEval Retrieving Diverse Images Task. The goal of the challenge is to provide a ranked list of Flickr photos for a predefined set of queries. Our approach first generates a ranking of images based on a query-independent estimation of their relevance. Only the top results are kept and iteratively re-ranked based on their intra-similarity to introduce diversity.
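A minimal sketch of this relevance-then-diversity strategy, written as an MMR-style greedy criterion; the trade-off parameter and selection rule are illustrative stand-ins rather than the exact published algorithm:

```python
import numpy as np

def diversify(relevance, sim, k=50, alpha=0.7):
    """Greedy re-ranking: keep the top-k results by relevance, then repeatedly
    pick the item trading off query relevance against its maximum similarity
    to the items already selected. relevance: 1D array; sim: 2D matrix."""
    candidates = list(np.argsort(-relevance)[:k])   # keep top-k relevant only
    selected = [candidates.pop(0)]
    while candidates:
        scores = [alpha * relevance[c]
                  - (1 - alpha) * max(sim[c][s] for s in selected)
                  for c in candidates]
        selected.append(candidates.pop(int(np.argmax(scores))))
    return selected
```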
In endoscopic procedures, surgeons work with live video streams from the inside of their subjects. A main source of documentation for these procedures are still frames from the video, identified and taken during the surgery. However, with growing demands and technical means, the streams are saved to storage servers and the surgeons need to retrieve parts of the videos on demand. In this submission we present a demo application for video retrieval based on visual features and late fusion, which allows surgeons to re-find shots taken during the procedure.
[CBMI 2015 Conference website]
Presented in the Special Session on Medical Multimedia Processing (acceptance rate for special sessions = 55\%)
Advisor: Xavier Gir{\'o}-i-Nieto (UPC)
Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)
Grade: A with honors (9.9/10.0)
A saliency map is a model that predicts eye fixations on a visual scene; in other words, it predicts the areas of an image that attract visual attention. Saliency prediction has traditionally been addressed with hand-crafted features inspired by neuroscience principles. This work, however, addresses the problem with a completely data-driven approach by training a convolutional network. The recent publication of large datasets for saliency prediction has provided enough data to train a not very deep network architecture which is both fast and accurate. In our system, named JuntingNet, the learning process is formulated as the minimization of a loss function that measures the Euclidean distance between the predicted saliency map and the provided ground truth. JuntingNet won the CVPR Large-scale Scene UNderstanding (LSUN) 2015 challenge on saliency prediction with a superior performance in all considered metrics.
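To make the loss formulation concrete, the following is a deliberately small convolutional network trained with a Euclidean (MSE) loss between the predicted map and the ground truth; this is a modern, illustrative stand-in, not the original JuntingNet architecture or training code:

```python
import torch
import torch.nn as nn

# Illustrative shallow convnet producing a 1-channel saliency map at 1/4 of
# the input resolution; ground-truth fixation maps are assumed downscaled
# to match the output size.
model = nn.Sequential(
    nn.Conv2d(3, 32, 7, stride=2, padding=3), nn.ReLU(),
    nn.Conv2d(32, 64, 5, stride=2, padding=2), nn.ReLU(),
    nn.Conv2d(64, 1, 3, padding=1), nn.Sigmoid(),
)
loss_fn = nn.MSELoss()                      # Euclidean-distance loss
opt = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

def train_step(images, fixation_maps):
    opt.zero_grad()
    loss = loss_fn(model(images), fixation_maps)
    loss.backward()
    opt.step()
    return loss.item()
```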
See https://imatge.upc.edu/web/resources/end-end-convolutional-networks-saliency-prediction-software.
Advisors: Mathias Lux (Klagenfurt University) and\ Xavier Gir{\'o}-i-Nieto\ (UPC)
Degree: Electronic Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)
Grade: A (9.5/10)
This project explores the potential of LIRE, an existing Content-Based Image Retrieval (CBIR) system, when used to retrieve medical videos. These videos are recordings of the live streams used by surgeons during endoscopic procedures, captured from inside the subject. The growth of such video content stored in servers requires search engines capable of assisting surgeons in their management and retrieval. In our tool, queries are formulated with visual examples, which allow surgeons to re-find shots taken during the procedure. This thesis presents an extension and adaptation of LIRE for video retrieval based on visual features and late fusion. The results are assessed from two perspectives: a quantitative and a qualitative one. While the quantitative one follows the standard practices and metrics for video retrieval, the qualitative assessment has been based on an empirical social study using a semi-interactive web interface. In particular, a thinking-aloud test was applied to analyze whether the user expectations and requirements were fulfilled. Due to the scarcity of surgeons available for the qualitative tests, a second domain was also addressed: videos captured at musical performances. This type of video has also experienced an exponential growth with the advent of affordable multimedia smartphones, available to a large audience. Analogously to the endoscopic videos, searching in a large dataset of such videos is a challenging topic.
Building a visual summary from an egocentric photostream captured by a lifelogging wearable camera is of high interest for different applications (e.g. memory reinforcement). In this paper, we propose a new summarization method based on keyframe selection that uses visual features extracted by means of a convolutional neural network. Our method applies unsupervised clustering to divide the photostream into events, and then extracts the most relevant keyframe for each event. We assess the results with a blind test in which a group of 20 people rated the quality of the summaries.
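A minimal sketch of this pipeline, using k-means over per-frame CNN features as an illustrative stand-in for the unsupervised clustering, and taking the frame closest to each cluster centroid as the keyframe:

```python
import numpy as np
from sklearn.cluster import KMeans

def summarize(features, n_events):
    """Cluster per-frame CNN features into events and pick, for each event,
    the frame closest to the cluster centroid as its keyframe."""
    km = KMeans(n_clusters=n_events, n_init=10).fit(features)
    keyframes = []
    for c in range(n_events):
        idx = np.where(km.labels_ == c)[0]
        d = np.linalg.norm(features[idx] - km.cluster_centers_[c], axis=1)
        keyframes.append(int(idx[np.argmin(d)]))
    return sorted(keyframes)
```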
Studies: Bachelor degree in Engineering of Audiovisual Systems at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)
Grade: A with honors (9.6/10)
This Final Degree Work addresses the problem of visually summarizing sets of images captured by an egocentric camera for lifelogging purposes. First, we group the images (which represent a day of a person{\textquoteright}s life) into distinguishable and significant events. For this purpose, we use visual features extracted with the Caffe software. Second, we explain the design of techniques for extracting representative images through similarity graphs. Finally, we analyze the assessment scores given by different users to whom we presented the visual summaries obtained in this project. We achieved 60\% favorable opinions on the quality of the visual summaries obtained with the techniques developed in this project.
(This thesis report is written in Catalan.)
Advisors: Xavier Gir{\'o}-i-Nieto and Carles Ventura-Royo
This Bachelor of Science thesis addresses the problem of image classification combining two popular visual representations: points and regions. Firstly, the study explores bundling interest points with regions. These regions are generated from an initial SLIC partition using a Binary Partition Tree (BPT), considering different scales of resolution in the segmentation. Secondly, it explores modelling visual classes as a group of points extracted from different images. Based on Naive-Bayes Nearest Neighbor (NBNN), we use 1-Nearest Neighbor with the SURF descriptor on the 17 Category Flower Dataset, with 1360 images of flowers distributed into 17 classes, 80 images per class. We have verified that grouping interest points of the same class improves the F1-score by 9.2\%. However, bundling interest points into regions using segmentation worsens the F1-score by between 1\% and 7\%, depending on the number of regions in the segmentation.
[Extended summary on Bitsearch blog]
Author{\textquoteright}s website: jordisanchez.info
Grade: A (9.3/10)
(This BSc thesis was written in Catalan.)
This paper introduces Click{\textquoteright}n{\textquoteright}Cut, a novel web tool for interactive object segmentation addressed to crowdsourcing tasks. Click{\textquoteright}n{\textquoteright}Cut combines bounding boxes and clicks generated by workers to obtain accurate object segmentations. These segmentations are created by combining precomputed object candidates in a light computational fashion that allows an immediate response from the interface. Click{\textquoteright}n{\textquoteright}Cut has been tested with a crowdsourcing campaign to annotate a subset of the Berkeley Segmentation Dataset (BSDS). Results are competitive with the state of the art, especially in the time to converge to a high-quality segmentation. The data collection campaign included gold standard tests to detect cheaters.
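One way to picture the combination of clicks with precomputed object candidates is the following sketch, which ranks candidate masks by their agreement with foreground and background clicks; this is an illustrative simplification, not the exact selection rule of the tool:

```python
def best_candidate(candidates, fg_clicks, bg_clicks):
    """Rank precomputed object-candidate masks by agreement with worker input:
    reward foreground clicks falling inside a mask and background clicks
    falling outside it. Masks are 2D boolean arrays; clicks are (row, col)."""
    def score(mask):
        fg_in = sum(bool(mask[r, c]) for r, c in fg_clicks)
        bg_out = sum(not mask[r, c] for r, c in bg_clicks)
        return fg_in + bg_out
    return max(candidates, key=score)
```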
[Related master thesis by Amaia Salvador]
[Related Phd thesis by Axel Carlier]
}, keywords = {Crowdsourcing, figure-ground segmentation, human computing, object candidates}, doi = {10.1145/2660114.2660125}, url = {http://dx.doi.org/10.1145/2660114.2660125}, author = {Carlier, Axel and Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Marques, Oge and Charvillat, Vincent} } @mastersthesis {xGutierrez14, title = {Comparaci{\'o} d{\textquoteright}algoritmes de classificaci{\'o} de tipus de pla en imatges de futbol}, year = {2014}, abstract = {The purpose of this project is to analyze and evaluate football image viewpoint classification algorithms and to implement, if possible, a set of upgrades to improve the results of this classification. In particular, we analyze a state-of-the-art publication and compare it to an algorithm created by the Image Processing Group (GPI) of the Polytechnic University of Catalonia. Sometimes, during the analysis of sporting events, it is interesting to automate the extraction of semantic content. In this context, the algorithms compared in this project are football image viewpoint classification algorithms. To carry out the classification, these algorithms use different descriptors calculated on the images. This project originates from the need to compare the Image Processing Group algorithm to current techniques, in order to assess the obtained results and gain a better understanding of the current state of the art. Thus, the state-of-the-art algorithm will be implemented, analyzed and improved if possible. Then, it will be compared to the one created by the Image Processing Group using an extensive database previously selected. Finally, the results will be presented and analyzed.
}, url = {http://upcommons.upc.edu/bitstream/handle/2099.1/20890/Final_Report.pdf?sequence=4\&isAllowed=y}, author = {Coronado Guti{\'e}rrez, D}, editor = {David Varas and Marqu{\'e}s, F.} } @mastersthesis {xTella, title = {Contextless Object Recognition with Shape-enriched SIFT and Bags of Features}, year = {2014}, abstract = {Advisors:\ Xavier Gir{\'o}-i-Nieto\ (UPC) and Matthias Zeppelzauer\ (TU Wien)
Degree: Telecommunications Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)
Currently, there are highly competitive results in the field of object recognition based on the aggregation of point-based features. The aggregation process, typically an average or max-pooling of the features, generates a single vector that represents the image or region that contains the object.
The aggregated point-based features typically describe the texture around the points with descriptors such as SIFT. These descriptors present limitations for wired and textureless objects. A possible solution is the addition of shape-based information. Shape descriptors have been previously used to encode shape information and thus recognise those types of objects, but generally an alignment step is required in order to match every point from one shape to the others, and the computational cost of the similarity assessment is high.
We propose to enrich location- and texture-based features with shape-based ones. Two main architectures are explored: on the one hand, enriching the SIFT descriptors with shape information before they are aggregated; on the other hand, creating the standard Bag of Words histogram and concatenating a shape histogram, classifying them as a single vector.
We evaluate the proposed techniques and the novel features on the Caltech-101 dataset.
Results show that shape features increase the final performance. Our extension of the Bag of Words with a shape-based histogram (BoW+S) results in better performance. However, for a high number of shape features, the BoW+S and enriched-SIFT architectures tend to converge.
Final grade: A with honors (10/10)
}, keywords = {Bag of Words, Interest Points, object candidates, Object detection, segmentation, shape coding, SIFT, textureless objects, wired objects.}, url = {http://hdl.handle.net/2099.1/22390}, author = {Tella, Marcel}, editor = {Zeppelzauer, Matthias and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xSalvador, title = {Exploiting User Interaction and Object Candidates for Instance Retrieval and Object Segmentation}, year = {2014}, abstract = {Author: Amaia Salvador-Aguilera
Advisors:\ Xavier Gir{\'o}-i-Nieto\ (UPC) and Kevin McGuinness (Dublin CIty University)
Degree:\ Master in Computer Vision\ (1 year)
Video: Thesis defense
This thesis addresses two of the main current challenges in computer vision: object segmentation and visual instance retrieval. The methodologies proposed to solve both problems are based on the use of object candidates and human computation in the computer vision loop. On the object segmentation side, this work explores how human computation can be useful to achieve better segmentation results, by combining users{\textquoteright} traces with a segmentation algorithm based on object candidates. On the other hand, the instance retrieval problem is also addressed using object candidates to compute local features, and involving the user in the retrieval loop by applying relevance feedback strategies.
}, keywords = {computer Vision, human computing, instance search, object candidates, segmentation}, author = {Amaia Salvador}, editor = {Xavier Gir{\'o}-i-Nieto and Kevin McGuinness} } @article {aGiro-i-Nieto13, title = {From Global Image Annotation to Interactive Object Segmentation}, journal = {Multimedia Tools and Applications}, volume = {70}, year = {2014}, month = {05/2014}, chapter = {475}, abstract = {This paper presents a graphical environment for the annotation of still images that works both at the global and local scales. At the global scale, each image can be tagged with positive, negative and neutral labels referred to a semantic class from an ontology. These annotations can be used to train and evaluate an image classifier. A finer annotation at a local scale is also available for interactive segmentation of objects. This process is formulated as a selection of regions from a precomputed hierarchical partition called Binary Partition Tree. Three different semi-supervised methods have been presented and evaluated: bounding boxes, scribbles and hierarchical navigation. The implemented Java source code is published under a free software license.
}, keywords = {annotation, Hierarchical, Interaction, Multiscale, segmentation}, doi = {10.1007/s11042-013-1374-3}, author = {Xavier Gir{\'o}-i-Nieto and Martos, Manel and Mohedano, Eva and Jordi Pont-Tuset} } @conference {cMcGuinness, title = {Insight Centre for Data Analytics (DCU) at TRECVid 2014: Instance Search and Semantic Indexing Tasks}, booktitle = {2014 TRECVID Workshop}, year = {2014}, month = {11/2014}, publisher = {National Institute of Standards and Technology (NIST)}, organization = {National Institute of Standards and Technology (NIST)}, address = {Orlando, Florida (USA)}, abstract = {Insight-DCU participated in the instance search (INS) and semantic indexing (SIN) tasks in 2014. Two very different approaches were submitted for instance search: one based on features extracted using pre-trained deep convolutional neural networks (CNNs), and another based on local SIFT features, large-vocabulary visual bag-of-words aggregation, inverted index-based lookup, and geometric verification on the top-N retrieved results. Two interactive runs and two automatic runs were submitted; the best interactive run achieved a mAP of 0.135 and the best automatic 0.12. Our semantic indexing runs were also based on convolutional neural network features, and on Support Vector Machine classifiers with linear and RBF kernels. One run was submitted to the main task, two to the no-annotation task, and one to the progress task. Data for the no-annotation task was gathered from Google Images and ImageNet. The main task run achieved a mAP of 0.086, the best no-annotation run performed close to the main run with a mAP of 0.080, while the progress run achieved 0.043.
[2014 TREC Video Retrieval Evaluation Notebook Papers and Slides]
}, url = {http://hdl.handle.net/2117/24915}, author = {Kevin McGuinness and Mohedano, Eva and Zhang, ZhenXing and Hu, Feiyan and Albatal, Rami and Gurrin, Cathal and O{\textquoteright}Connor, N. and Smeaton, Alan F. and Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Ventura, C.} } @mastersthesis {xPereira, title = {An investigation of eye gaze tracking utilities in image object recognition}, year = {2014}, abstract = {Computer vision has been one of the most revolutionary technologies of the last few decades. This project investigates how to improve an image recognition system (image classifier) using a not widely exploited technology: eye gaze tracking. The aim of this project is to explore the benefits that this technology can bring to an image classifier. The experiment set up in this project is to build a dataset with an eye tracking device and, using cropped parts of the images of different sizes based on the eye tracking data, see how the performance of an image classifier is affected by these images. The results are interesting: since smaller images have to be processed with this method, the system is more efficient. Regarding the performance, it is very similar to the one obtained without using any eye tracking data, so it is arguable to state that it presents an improvement, and it opens new directions of investigation for future work.
Advisors: Xavier Gir{\'o}-i-Nieto (UPC) and Omar Pera (Pixable)
Degree: Electronic Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)
This final degree thesis summarizes the tasks developed during an internship at Pixable Inc. in New York City, together with the tasks related to the MediaEval 2013 evaluation campaign, where I participated with the team of Universitat Politecnica de Catalunya (UPC). The main focus of my work was on the Photofeed service, a photo archive service in the cloud.
The popularisation of the storage of photos on the cloud has opened new opportunities and challenges for the organization and extension of photo collections. In my thesis I have developed a light computational solution for the clustering of web photos based on social events. The proposal combines a first over-segmentation of the photo collections of each user based on temporal cues, as previously proposed in the PhotoTOC algorithm [Platt et al, PACRIM 2003]. In a second stage, the resulting mini-clusters are merged based on contextual metadata such as geolocation, keywords and user IDs.
Closely related to photo clustering, we can also study mail classification. Additional tasks were developed for the Contactive company in this field. In order to solve the problems that Contactive was facing in mail analysis tasks, I developed methods for automatically identifying signature blocks and reply lines in plain-text email messages. This analysis has many potential applications, such as preprocessing email for text-to-speech systems; anonymization of email corpora; improving automatic content-based mail classifiers; and email threading. The method is based on applying machine learning to a sequential representation of an email message, in which each email is represented as a sequence of lines, and each line is represented as a set of features.
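As a sketch of such a sequential line representation, the feature extractor below turns each line into a small set of indicators; the specific features and regular expressions are illustrative guesses, not the actual feature list of the thesis:

```python
import re

def line_features(line, position, total):
    """Illustrative per-line features for signature-block / reply-line
    detection; each email is a sequence of such feature vectors fed to a
    sequential classifier."""
    return {
        "quoted":      line.lstrip().startswith(">"),    # reply-line marker
        "has_phone":   bool(re.search(r"\+?\d[\d\s().-]{7,}", line)),
        "has_email":   bool(re.search(r"\b\S+@\S+\.\S+\b", line)),
        "has_url":     "http" in line or "www." in line,
        "short":       len(line.strip()) < 30,           # signatures are short
        "near_bottom": position > 0.8 * total,           # signatures end mails
        "sig_marker":  line.strip() in ("--", "__"),     # common delimiter
    }
```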
Final grade: A with honors (10/10)
This work aims to promote the cooperation and coordination between different image/video processing courses taught at the UPC in order to enhance the learning results. The main contributions are a) the creation of a common set of materials: graphical demonstrators, collections of problems, question banks, etc., and b) the design of strategies to use this material in the development of generic and specific skills, with special emphasis on promoting independent learning.
}, keywords = {image/video processing, Matlab demonstrators, question Banks, specific skills, teaching material}, author = {Morros, J.R. and Ver{\'o}nica Vilaplana and Ruiz-Hidalgo, J. and Casas, J. and Gasull, A. and Marqu{\'e}s, F. and M. Pard{\`a}s and Salembier, P.} } @mastersthesis {xFerrarons-Betrian, title = {Mobile Visual Search at Catchoom}, year = {2014}, abstract = {Author: Miquel Ferrarons-Betrian
Advisors:\ Xavier Gir{\'o}-i-Nieto\ (UPC) and Tomasz Adamek\ (Catchoom)
Degree: Master in Computer Vision (1 year)
}, keywords = {feature selection, large-scale visual search, Mobile visual search, synthetic views matching, visual word}, author = {Ferrarons-Betrian, Miquel}, editor = {Adamek, Tomasz and Xavier Gir{\'o}-i-Nieto} } @conference {cGallego14a, title = {Multiview Foreground Segmentation using 3D Probabilistic Model}, booktitle = {ICIP, IEEE International Conference on Image Processing}, year = {2014}, month = {10/2014}, abstract = {We propose a complete multi-view foreground segmentation and 3D reconstruction system that defines a 3-dimensional probabilistic model to model the foreground object in the 3 spatial dimensions, thus gathering the information from all the camera views. This 3D model is projected to each one of the views in order to perform the 2D segmentation with the foreground information shared by all the cameras. Then, for each one of the views, a MAP-MRF classification framework is applied between the projected region-based foreground model, the pixel-wise background model and the region-based shadow model defined for each view. The resultant masks are used to compute the next 3-dimensional reconstruction. This system achieves correct results by reducing the false positive and false negative errors in sequences where some camera sensors can present camouflage situations between foreground and background. Moreover, the use of the 3D model opens up possibilities for object recognition or human activity understanding.
}, author = {Gallego, J. and M. Pard{\`a}s} } @conference {cMohedano, title = {Object segmentation in images using EEG signals}, booktitle = {ACM Multimedia}, year = {2014}, month = {11/2014}, address = {Orlando, Florida (USA)}, abstract = {This paper explores the potential of brain-computer interfaces in segmenting objects from images. Our approach is centered around designing an effective method for displaying the image parts to the users such that they generate measurable brain reactions. When an image region, specifically a block of pixels, is displayed we estimate the probability of the block containing the object of interest using a score based on EEG activity. After several such blocks are displayed, the resulting probability map is binarized and combined with the GrabCut algorithm to segment the image into object and background regions. This study shows that BCI and simple EEG analysis are useful in locating object boundaries in images.
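A minimal sketch of the probability-map-to-GrabCut step described above, using OpenCV; the upsampling and the binarization thresholds are illustrative assumptions:

```python
import cv2
import numpy as np

def segment_from_eeg(image, prob_map, lo=0.3, hi=0.7):
    """Turn a block-wise EEG probability map into a GrabCut trimap and run
    GrabCut seeded with it; prob_map is upsampled to the image size."""
    prob = cv2.resize(prob_map, (image.shape[1], image.shape[0]))
    mask = np.full(prob.shape, cv2.GC_PR_BGD, np.uint8)  # default: probable bg
    mask[prob > 0.5] = cv2.GC_PR_FGD                     # probable foreground
    mask[prob < lo] = cv2.GC_BGD                         # confident background
    mask[prob > hi] = cv2.GC_FGD                         # confident foreground
    bgd = np.zeros((1, 65), np.float64)
    fgd = np.zeros((1, 65), np.float64)
    cv2.grabCut(image, mask, None, bgd, fgd, 5, cv2.GC_INIT_WITH_MASK)
    return np.isin(mask, (cv2.GC_FGD, cv2.GC_PR_FGD))    # final object mask
```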
}, keywords = {Brain-computer interfaces, Electroencephalography, GrabCut algorithm, Interactive segmentation, Object segmentation, rapid serial visual presentation}, doi = {10.1145/2647868.2654896}, url = {http://arxiv.org/abs/1408.4363}, author = {Mohedano, Eva and Healy, Graham and Kevin McGuinness and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Smeaton, Alan F.} } @conference {cManchon-Vizuete, title = {Photo Clustering of Social Events by Extending PhotoTOC to a Rich Context}, booktitle = {ICMR 2014 Workshop on Social Events in Web Multimedia (SEWM)}, year = {2014}, month = {04/2014}, publisher = {ACM}, organization = {ACM}, address = {Glasgow, Scotland}, abstract = {The popularisation of the storage of photos on the cloud has opened new opportunities and challenges for the organisation and extension of photo collections. This paper presents a light computational solution for the clustering of web photos based on social events. The proposal combines a first over-segmentation of the photo collections of each user based on temporal cues, as previously proposed in PhotoTOC. On a second stage, the resulting mini-clusters are merged based on contextual metadata such as geolocation, keywords and user IDs. Results indicate that, although temporal cues are very relevant for event clustering, robust solutions should also consider all these additional features.
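As a sketch of the first, temporal pass, the following splits a chronologically sorted photo stream at large time gaps; the fixed 30-minute gap is an illustrative simplification of the adaptive gap criterion used in PhotoTOC, and the second pass (merging mini-clusters by geolocation, keywords and user IDs) would operate on its output:

```python
from datetime import timedelta

def temporal_oversegment(photos, gap=timedelta(minutes=30)):
    """First pass: split a user's chronologically sorted photos into
    mini-clusters wherever the time gap between consecutive shots is large.
    photos: list of dicts with a 'time' datetime field."""
    clusters, current = [], [photos[0]]
    for prev, cur in zip(photos, photos[1:]):
        if cur["time"] - prev["time"] > gap:   # large gap => new mini-cluster
            clusters.append(current)
            current = []
        current.append(cur)
    clusters.append(current)
    return clusters
```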
}, keywords = {Clustering, Event Detection, Photo Collections}, url = {http://hdl.handle.net/2117/23009}, author = {Manchon-Vizuete, Daniel and Gris-Sarabia, Irene and Xavier Gir{\'o}-i-Nieto} } @article {aBonet-Carne14, title = {Quantitative Ultrasound Texture Analysis of Fetal Lungs To Predict Neonatal Respiratory Morbidity}, journal = {Ultrasound in Obstetrics and Gynecology, Wiley}, volume = {44}, year = {2014}, author = {E. Bonet-Carne and M. Palacio and T. Cobo and A. Perez-Moreno and M. Lopez and J. P. Piraquive and J. C. Ramirez and F. Marques and E. Gratacos} } @conference {cGallego14, title = {Robust 3D SFS reconstruction based on reliability maps}, booktitle = {ICIP, IEEE International Conference on Image Processing}, year = {2014}, month = {10/2014}, abstract = {This paper deals with Shape from Silhouette (SfS) volumetric reconstruction in the context of multi-view smart room scenarios. The method that we propose first computes a 2D foreground object segmentation in each one of the views, by using region-based models for the foreground and shadow classes, and a pixel-wise model for the background class. Next, we calculate the reliability maps between the foreground and background/shadow classes in each view, by computing the Hellinger distance between the models. These 2D reliability maps are finally taken into account in the 3D SfS reconstruction algorithm to obtain an enhanced final volumetric reconstruction. The advantage of our system lies in the possibility of obtaining a volumetric representation which automatically defines the optimal tolerance to errors for each one of the voxels of the volume, with a low rate of false positive and false negative errors. The results obtained using our proposal improve the traditional SfS reconstruction computed with a fixed tolerance for the overall volume.
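For concreteness, a minimal sketch of the Hellinger distance on discrete (histogram) models; the models in the paper may be parametric, for which closed-form expressions exist, so this is an illustrative variant:

```python
import numpy as np

def hellinger(p, q):
    """Hellinger distance between two discrete distributions (e.g. foreground
    vs. background/shadow model histograms of one view); bounded in [0, 1],
    so it can be mapped directly to a per-pixel reliability value."""
    p = p / p.sum()
    q = q / q.sum()
    return np.sqrt(max(0.0, 1.0 - np.sqrt(p * q).sum()))
```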
}, author = {Gallego, J. and M. Pard{\`a}s} } @conference {cManchon-Vizuetea, title = {UPC at MediaEval 2014 Social Event Detection Task}, booktitle = {MediaEval 2014 Workshop}, year = {2014}, month = {10/2014}, publisher = {CEUR Workshop Proceedings}, organization = {CEUR Workshop Proceedings}, address = {Barcelona}, abstract = {This document presents the contribution of the UPC team to the Social Event Detection (SED) Subtask 1 in MediaEval 2014. This contribution extends the solution tested in the previous year with a better optimization of the parameters that determine the clustering algorithm, and by introducing an additional pass that considers the merges of all pairs of mini-clusters generated during the first two passes. Our proposal also addresses the problem of incomplete metadata by generating additional textual tags based on geolocation and natural language processing techniques.
Advisors:\ Xavier Gir{\'o}-i-Nieto\ (UPC) and Horst Eidenberger (TU Wien)
Degree: Telecommunications Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)
The aim of this thesis is to design a tool that performs visual instance search mining for news video summarization. This means extracting the relevant content of the video in order to be able to recognize the storyline of the news.
Initially, a sampling of the video is required to get the frames with a desired rate. Then, different relevant contents are detected from each frame, focusing on faces, text and several objects that the user can select. Next, we use a graph-based clustering method in order to recognize them with a high accuracy and select the most representative ones to show them in the visual summary. Furthermore, a graphical user interface in Wt was developed to create an online demo to test the application.
During the development of the application we have been testing the tool with the CCMA dataset. We prepared a web-based survey based on four results from this dataset to check the opinion of the users. We also validated our visual instance mining results by comparing them with the results obtained by applying an algorithm developed at Columbia University for video summarization. We ran the algorithm on a dataset of a few videos on two events: {\textquoteright}Boston bombings{\textquoteright} and the {\textquoteright}search for the Malaysian Airlines flight{\textquoteright}. We carried out another web-based survey in which users could compare our approach with this related work. With these surveys we analyze whether our tool fulfills the requirements we set.
We can conclude that our system extracts visual instances that show the most relevant content of news videos and can be used to summarize these videos effectively.
Final grade: B (7/10)
}, url = {http://hdl.handle.net/2099.1/22362}, author = {Almendros-Guti{\'e}rrez, David}, editor = {Xavier Gir{\'o}-i-Nieto and Eidenberger, Horst} } @conference {cVentura13, title = {Automatic Keyframe Selection based on Mutual Reinforcement Algorithm}, booktitle = {CBMI (Content-Based Multimedia Indexing)}, year = {2013}, month = {09/2013}, address = {Veszprem}, abstract = {This paper addresses the problem of video summarization through the automatic selection of a single representative keyframe. The proposed solution is based on the mutual reinforcement paradigm, where a keyframe is selected thanks to its highest and most frequent similarity to the rest of the considered frames. Two variations of the algorithm are explored: a first one where only frames within the same video are used (intraclip mode), and a second one where the decision also depends on the previously selected keyframes of related videos (interclip mode). These two algorithms were evaluated by a set of professional documentalists from a broadcaster{\textquoteright}s archive, and the results showed that the proposed techniques outperform the semi-manual solution adopted so far in the company.
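One plausible instantiation of the mutual reinforcement paradigm is a power iteration over the frame-to-frame similarity matrix, sketched below; this is an illustrative reading, not necessarily the exact algorithm of the paper:

```python
import numpy as np

def mutual_reinforcement_keyframe(sim):
    """Iterate s <- S s on the (n, n) similarity matrix so that frames similar
    to other highly scored frames accumulate the largest score, then return
    the index of the top-scoring frame as the keyframe."""
    s = np.ones(len(sim)) / len(sim)
    for _ in range(100):
        s_new = sim @ s
        s_new /= np.linalg.norm(s_new)
        if np.allclose(s_new, s):
            break
        s = s_new
    return int(np.argmax(s))
```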
http://cbmi2013.mik.uni-pannon.hu/
Advisors: Xavier Gir{\'o}-i-Nieto\ and\ Horst Eidenberger
School: Vienna University of Technology (Austria)
The amount of digital video content available on the web is constantly increasing. Its handling requires efficient technologies: text search on large databases provides users with a great number of videos, whose content is made accessible by a description. Users need a fast and visual way to access relevant video content effectively. Quick visualisation of content using static image summarisation is a difficult problem, but it is worth addressing because it may solve video navigation problems. Users can very rapidly get an idea of the video with no need to browse through it with a sliding bar, as is normally done.
In this work a system for automatic video summarisation is developed. It creates an object map whose segments are extracted from an input video. It enhances video browsing and the management of large video databases by generating a visual index so that the user can rapidly grasp the most relevant content. Accessing the objects with a simple action requires several technologies that together define a complex information-processing chain.
Firstly, shot boundary detection algorithms are required to reduce the temporal redundancy of the video. Secondly, different relevant objects are extracted from each keyframe (faces, cars, etc.). We also describe a workflow to train detection models using multiple open source solutions. Furthermore, faces are a particular and very relevant semantic class, so we use clustering methods in order to recognise them in an unsupervised recognition process. The image composition of all selected objects and faces is the final stage of the architecture. Composition is defined as the combination of distinct parts to form a whole; therefore, objects have to be rendered in the map in a visually attractive manner.
To validate our approach and assess end-user satisfaction, we conducted a user study against requirements collected by analysing the related literature. We analyse redundancy and informativeness as well as pleasantness.
The results show that our approach effectively creates an image representation for videos and is able to summarise customisable content in an attractive way.
}, url = {http://hdl.handle.net/2099.1/19359}, author = {Martos, Manel}, editor = {Xavier Gir{\'o}-i-Nieto and Eidenberger, Horst} } @mastersthesis {xSalvador13, title = {Crowdsourced Object Segmentation with a Game}, year = {2013}, abstract = {Co-advised with Axel Carlier (INP Toulouse), Vincent Charvillat\ (INP Toulouse) and Oge Marques (Florida Atlantic University).
}, author = {Amaia Salvador}, editor = {Xavier Gir{\'o}-i-Nieto and Carlier, Axel and Charvillat, Vincent and Marques, Oge} } @conference {cSalvador13 , title = {Crowdsourced Object Segmentation with a Game}, booktitle = {ACM Workshop on Crowdsourcing for Multimedia (CrowdMM)}, year = {2013}, month = {10/2013}, address = {Barcelona}, abstract = {We introduce a new algorithm for image segmentation based on crowdsourcing through a game: Ask{\textquoteright}nSeek. The game provides information on the objects of an image, in the form of clicks that are either on the object or on the background. These logs are then used to determine the best segmentation for an object among a set of candidates generated by the state-of-the-art CPMC algorithm. We also introduce a simulator that allows the generation of game logs and therefore gives insight into the number of games needed on an image to reach an acceptable segmentation.
Director: Xavier Gir{\'o}-i-Nieto
Smartphones are more and more present in our society, and compulsory secondary education is no stranger to them. Rather the opposite: they are currently one of the behavioural issues that generates most controversy with teenage students. At the same time, they are a very powerful technology for accessing information and interacting with others, at any time and in any place.
The proposal is that students capture with their mobile phone some element related to what has been worked on in class and that is close to them. The idea is to link theory with practice, the concepts seen in class with everyday life.
The teacher will assign homework related to the class contents, and the student will use the smartphone as the vehicle to carry out the task.
In order to keep a record of the work done, the electronic portfolio methodology will be used. This second tool makes it possible to keep track of everything the student does, to assess it, and to carry out peer assessment.
The thesis report explains the pedagogical foundations of the proposal, how it fits into the regulatory context of compulsory secondary education, practical examples of exercises to which the methodology is applied, how it can be implemented at a technical level, and the results of an experiment carried out with a group of students using the proposed methodology.
This MSc thesis analyzes two ways of improving video retrieval techniques for the instance search problem. On the one hand, "Pairing Interest Points for a better Signature using Sparse Detector{\textquoteright}s Spatial Information" allows the Bag-of-Words model to keep some spatial information. On the other, "Study of the Hamming Embedding Signature Symmetry in Video Retrieval" provides binary signatures that refine the matching based on visual words, and aims to find the best way of matching, taking into account the existing asymmetries between the query image and the videos.
}, url = {http://hdl.handle.net/2099.1/19159}, author = {Garcia-delMolino, Ana}, editor = {Satoh, Shin{\textquoteright}ichi and Ventura, C. and Xavier Gir{\'o}-i-Nieto} } @article {aAlcoverro13, title = {Gesture Control Interface for immersive panoramic displays}, journal = {Multimedia Tools and Applications}, year = {2013}, month = {07/2013}, pages = {1-27}, abstract = {In this paper, we propose a gesture-based interface designed to interact with panoramic scenes. The system combines novel static gestures with a fast hand tracking method. Our proposal is to use static gestures as shortcuts to activate functionalities of the system (e.g. volume up/down, mute, pause, etc.), and hand tracking to freely explore the panoramic video. The overall system is multi-user, and incorporates a user identification module based on face recognition, which is able both to recognize returning users and to add new users online. The system exploits depth data, making it robust to challenging illumination conditions. We show through experimental results the performance of every component of the system compared to the state of the art. We also show the results of a usability study\ performed with several untrained users.
}, issn = {1380-7501}, doi = {10.1007/s11042-013-1605-7}, author = {Alcoverro, M. and Suau, X. and Morros, J.R. and L{\'o}pez-M{\'e}ndez, A. and A. Gil-Moreno and Ruiz-Hidalgo, J. and Casas, J.} } @article {aVentura13, title = {Improving retrieval accuracy of Hierarchical Cellular Trees for generic metric spaces}, journal = {Multimedia Tools and Applications}, year = {2013}, abstract = {Metric Access Methods (MAMs) are indexing techniques which allow working in generic metric spaces. Therefore, MAMs are especially useful for Content-Based Image Retrieval systems based on features which use non-Lp norms as similarity measures. MAMs naturally allow the design of image browsers due to their inherent hierarchical structure. The Hierarchical Cellular Tree (HCT), a MAM-based indexing technique, provides the starting point of our work. In this paper, we describe some limitations detected in the original formulation of the HCT and propose some modifications to both the index building and the search algorithm. First, the covering radius, which is defined as the distance from the representative to the furthest element in a node, may not cover all the elements belonging to the node{\textquoteright}s subtree. Therefore, we propose to redefine the covering radius as the distance from the representative to the furthest element in the node{\textquoteright}s subtree. This new definition is essential to guarantee a correct construction of the HCT. Second, the proposed Progressive Query retrieval scheme can be redesigned to perform the nearest neighbor operation in a more efficient way. We propose a new retrieval scheme which takes advantage of the benefits of the search algorithm used in the index building. Furthermore, while the evaluation of the HCT in the original work was only subjective, we propose an objective evaluation based on two aspects which are crucial in any approximate search algorithm: the retrieval time and the retrieval accuracy. Finally, we illustrate the usefulness of the proposal by presenting some actual applications.
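A minimal sketch of the redefined covering radius, assuming a simple node structure with representative, elements and children attributes; all names are illustrative:

```python
def subtree_elements(node):
    """Yield every element stored anywhere in a node's subtree."""
    yield from node.elements
    for child in node.children:
        yield from subtree_elements(child)

def covering_radius(node, distance):
    """Covering radius as redefined above: the distance from the node's
    representative to the furthest element in the whole subtree, which
    guarantees the radius really covers every element reachable below."""
    return max(distance(node.representative, e) for e in subtree_elements(node))
```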
Advisors: Kevin McGuinness, Xavier Gir{\'o}-i-Nieto, Noel O{\textquoteright}Connor
School: Dublin City University (Ireland)
The main objective of this project is to implement a new way to compute saliency maps and to locate an object in an image by using a brain-computer interface. To achieve this, the project is centered on designing the proper way to display the different parts of the images to the users, in such a way that they generate measurable reactions. Once an image window is shown, the objective is to compute a score based on the EEG activity and compare its result with the current automatic methods for estimating saliency maps. The aim of this work is also to use the EEG map as a seed for another segmentation algorithm that extracts the object from the background of an image. This study provides evidence that BCIs are useful for finding the location of objects in simple images via straightforward EEG analysis, and represents a starting point for locating objects in more complex images.
Related post on BitSearch.
Foreground segmentation in video sequences is an important area of image processing that attracts great interest in the scientific community, since it makes possible the detection of the objects that appear in the sequences under analysis, and allows us to achieve a correct performance of high-level applications which use foreground segmentation as an initial step. The current Ph.D. thesis entitled Parametric Region-Based Foreground Segmentation in Planar and Multi-View Sequences details, in the following pages, the research work carried out within this field. In this investigation, we propose to use parametric probabilistic models at the pixel and region levels in order to model the different classes that are involved in the classification process of the different regions of the image: foreground, background and, in some sequences, shadow. The development is presented in the following chapters as a generalization of the techniques proposed for object segmentation in 2D planar sequences to the 3D multi-view environment, where we establish a cooperative relationship between all the sensors that are recording the scene. Hence, different scenarios have been analyzed in this thesis in order to improve the foreground segmentation techniques: In the first part of this research, we present segmentation methods appropriate for 2D planar scenarios. We start dealing with foreground segmentation in static camera sequences, where a system that combines a pixel-wise background model with region-based foreground and shadow models is proposed in a Bayesian classification framework. The research continues with the application of this method to moving camera scenarios, where the Bayesian framework is developed between foreground and background classes, both characterized with region-based models, in order to obtain a robust foreground segmentation for this kind of sequences. The second stage of the research is devoted to applying these 2D techniques to multi-view acquisition setups, where several cameras are recording the scene at the same time. At the beginning of this section, we propose a foreground segmentation system for sequences recorded by means of color and depth sensors, which combines different probabilistic models created for the background and foreground classes in each one of the views, by taking into account the reliability that each sensor presents. The investigation continues by proposing foreground segmentation methods for multi-view smart room scenarios. In these sections, we design two systems where foreground segmentation and 3D reconstruction are combined in order to improve the results of each process. The proposals end with the presentation of a multi-view segmentation system where a foreground probabilistic model is proposed in the 3D space to gather all the object information that appears in the views. The results presented in each one of the proposals show that the foreground segmentation and also the 3D reconstruction can be improved, in these scenarios, by using parametric probabilistic models for modeling the objects to segment, thus introducing the information of the object in a Bayesian classification framework.
}, url = {http://hdl.handle.net/10803/130813}, author = {Gallego, J.}, editor = {M. Pard{\`a}s} } @article {aGallego13, title = {Region Based Foreground Segmentation Combining Color and Depth Sensors Via Logarithmic Opinion Pool Decision}, journal = {Journal of Visual Communication and Image Representation}, year = {2013}, month = {04/2013}, abstract = {In this paper we present a novel foreground segmentation system that combines color and depth sensor information to perform a more complete Bayesian segmentation between foreground and background classes. The system shows a combination of spatial-color and spatial-depth region-based models for the foreground, as well as color and depth pixel-wise models for the background, in a Logarithmic Opinion Pool decision framework used to correctly combine the likelihoods of each model. A posterior enhancement step based on a trimap analysis is also proposed in order to correct the precision errors that the depth sensor introduces. The results presented in this paper show that our system is robust to color and depth camouflage problems between the foreground object and the background, and also improves the segmentation in the area of the objects{\textquoteright} contours by reducing the false positive detections that appear due to the lack of precision of the depth sensors.
}, doi = {http://dx.doi.org/10.1016/j.jvcir.2013.03.019}, url = {http://www.sciencedirect.com/science/article/pii/S104732031300059X}, author = {Gallego, J. and M. Pard{\`a}s} } @inbook {bLeon13, title = {Region-based caption text extraction}, booktitle = {Lecture Notes in Electrical Engineering (Analysis, Retrieval and Delivery of Multimedia Content)}, volume = {158}, year = {2013}, month = {07/2012}, pages = {21-36}, publisher = {Springer}, organization = {Springer}, address = {New York}, abstract = {This chapter presents a method for caption text detection. The proposed method will be included in a generic indexing system dealing with other semantic concepts which are to be automatically detected as well. To have a coherent detection system, the various object detection algorithms use a common image description, a hierarchical region-based image model. The proposed method takes advantage of texture and geometric features to detect the caption text. Texture features are estimated using wavelet analysis and mainly applied for\ text candidate spotting. In turn,\ text characteristics verification\ relies on geometric features, which are estimated exploiting the region-based image model. Analysis of the region hierarchy provides the final caption text objects. The final step of\ consistency analysis for output\ is performed by a binarization algorithm that robustly estimates the thresholds on the caption text area of support.
}, keywords = {Text detection}, isbn = {978-1-4614-3830-4}, doi = {10.1007/978-1-4614-3831-1_2}, author = {Le{\'o}n, M. and Ver{\'o}nica Vilaplana and Gasull, A. and Marqu{\'e}s, F.} } @conference {cVentura13a, title = {UPC at MediaEval 2013 Hyperlinking Task}, booktitle = {MediaEval 2013 Workshop}, year = {2013}, month = {10/2013}, publisher = {CEUR Workshop Proceedings Vol-1043}, organization = {CEUR Workshop Proceedings Vol-1043}, address = {Barcelona, Catalonia}, abstract = {These working notes present the contribution of the UPC team to the Hyperlinking sub-task of the Search and Hyperlinking Task in MediaEval 2013. Our contribution explores the potential of a solution based only on visual cues.
In particular, every automatically generated shot is represented by a keyframe. The linking between video segments is based on the visual similarity of the keyframes they contain. Visual similarity is assessed with the intersection of bag of features histograms generated with the SURF descriptor.
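A minimal sketch of this visual similarity measure, computed as the intersection of two L1-normalized bag-of-features histograms; the normalization choice is an assumption:

```python
import numpy as np

def bof_similarity(h1, h2):
    """Visual similarity between two keyframes as the intersection of their
    L1-normalized bag-of-features histograms (built from SURF descriptors)."""
    h1 = h1 / h1.sum()
    h2 = h2 / h2.sum()
    return np.minimum(h1, h2).sum()    # 1.0 for identical histograms
```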
These working notes present the contribution of the UPC team to the Social Event Detection (SED) task in MediaEval 2013. The proposal extends the previous PhotoTOC work in the domain of shared collections of photographs stored in cloud services. An initial over-segmentation of the photo collection is later refined by merging pairs of similar clusters.
This document contains the work done at INP Grenoble during the second semester of the academic year 2011-2012, completed in Barcelona during the first months of 2012-2013. The work presented consists of a camera motion study in different types of video in order to group fragments that have some similarity in their content.
The document explains how the data extracted by the program Motion 2D, provided by the French university, are processed in order to represent them in a simplified form using motion histograms. It also explains how the different distances between histograms are calculated and how their similarity is computed.
Three different distances are used: Manhattan, Euclidean and Bhattacharyya, although the project also explains some other, slightly more complicated ones. Different histogram configurations are used, with more or fewer bins to represent the motion.
Every possible combination of the number of bins and the distances is evaluated using a group of 30 video fragments and the K-Means clustering algorithm. The clustering results are evaluated using the F1-score, a very popular measure suitable for both clustering and classification.
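For reference, minimal sketches of the three histogram distances named above, assuming L1-normalized motion histograms; the Bhattacharyya variant shown is the negative log of the Bhattacharyya coefficient, which may differ from the exact form used in the thesis:

```python
import numpy as np

def manhattan(p, q):
    return np.abs(p - q).sum()

def euclidean(p, q):
    return np.sqrt(((p - q) ** 2).sum())

def bhattacharyya(p, q):
    # -ln of the Bhattacharyya coefficient for L1-normalized histograms
    p, q = p / p.sum(), q / q.sum()
    return -np.log(np.sqrt(p * q).sum() + 1e-12)
```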
}, url = {http://hdl.handle.net/2099.1/17337}, author = {Tort, Laura}, editor = {Xavier Gir{\'o}-i-Nieto and Rombaut, Mich{\`e}le and Pellerin, Denis} } @mastersthesis {cVentura13b, title = {Visual Object Analysis Using Regions and Interest Points}, journal = {ACM Multimedia}, year = {2013}, month = {10/2013}, abstract = {This dissertation research will explore region-based and interest-point-based image representations, two of the most-used image models for object detection, image classification, and visual search, among other applications. We will analyze the relationship between both representations with the goal of proposing a new hybrid representation that takes
advantage of the strengths and overcomes the weaknesses of both approaches. More specifically, we will focus on the gPb-owt-ucm segmentation algorithm and the SIFT local features since they are the most contrasted techniques in their respective fields. Furthermore, using an object retrieval benchmark, this dissertation research will analyze three basic questions: (i) the usefulness of an interest points hierarchy based on a contour strength signal, (ii) the influence of the context on both interest points location and description, and (iii) the analysis of regions as spatial support for bundling interest points.
In this paper we present a foreground segmentation and tracking system for monocular static camera sequences and indoor scenarios that achieves correct foreground detection even in complicated scenes where foreground and background colours are similar. The workflow of the system is based on three main steps: an initial foreground detection performs a simple segmentation via Gaussian pixel color modeling and shadow removal. Next, a tracking step uses the foreground segmentation to identify the objects and tracks them using a modified mean shift algorithm. Finally, an enhanced foreground segmentation step is formulated within a Bayesian framework. For this aim, foreground and shadow candidates are used to construct probabilistic foreground and shadow models. The Bayesian framework combines a pixel-wise color background model with spatial-color models for the foreground and shadows. The final classification is performed using the graph-cut algorithm. The tracking step allows a correct updating of the probabilistic models, achieving a foreground segmentation that reduces false negative and false positive detections and obtaining a robust segmentation and tracking of each object in the scene.
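As a rough sketch of the first step only (the initial per-pixel Gaussian test, not the shadow removal, mean shift tracking or graph-cut refinement described in the paper), assuming a per-pixel diagonal-covariance background model; the threshold value is illustrative:

import numpy as np

def initial_foreground_mask(frame, bg_mean, bg_var, tau=3.0):
    # frame, bg_mean, bg_var: float arrays of shape (H, W, 3).
    # A pixel is flagged as foreground when its Mahalanobis distance to the
    # per-pixel Gaussian background model exceeds tau in any color channel.
    d2 = (frame - bg_mean) ** 2 / np.maximum(bg_var, 1e-6)
    return (d2 > tau ** 2).any(axis=2)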
}, keywords = {Foreground segmentation, GMM, Objects tracking, Shadow model, Space-color models}, issn = {0167-8655}, doi = {10.1016/j.patrec.2012.05.004}, url = {http://www.sciencedirect.com/science/article/pii/S016786551200164X}, author = {Gallego, J. and M. Pard{\`a}s and Haro, G.} } @inbook {bGallego12, title = {Foreground objects segmentation for moving camera scenarios based on SCGMM}, booktitle = {Computational Intelligence for Multimedia Understanding}, volume = {7252}, number = {Lecture Notes in Computer Science}, year = {2012}, month = {09/2011}, pages = {195-206}, publisher = {Springer}, organization = {Springer}, address = {Berlin Heidelberg}, abstract = {In this paper we present a new system for segmenting non-rigid objects in moving camera sequences for indoor and outdoor scenarios that achieves a correct object segmentation via a global MAP-MRF framework formulation for the foreground and background classification task. Our proposal, suitable for video indexation applications, receives as input an initial segmentation of the object to segment, and it consists of two region-based parametric probabilistic models to model the spatial (x,y) and color (r,g,b) domains of the foreground and background classes. Both classes rival each other in modeling the regions that appear within a dynamic region of interest that includes the foreground object to segment as well as the background regions that surround the object. The results presented in the paper show the correctness of the object segmentation, reducing the false positive and false negative detections originated by the new background regions that appear near the object.
}, keywords = {moving camera sequences, Object segmentation, SCGMM, video indexation}, issn = {978-3-642-32435-2}, doi = {10.1007/978-3-642-32436-9_17}, url = {http://www.springerlink.com/content/r6u266562h586476/}, author = {Gallego, J. and M. Pard{\`a}s and Solano, M.} } @inbook {bVentura12, title = {Hierarchical Navigation and Visual Search for Video Keyframe Retrieval}, booktitle = {Advances in Multimedia Modeling}, series = {Lecture Notes in Computer Science}, volume = {7131}, year = {2012}, pages = {652-654}, publisher = {Springer Berlin / Heidelberg}, organization = {Springer Berlin / Heidelberg}, abstract = {This work presents a browser that supports two strategies for video browsing: the navigation through visual hierarchies and the retrieval of similar images. The input videos are firstly processed by a keyframe extractor to reduce the temporal redundancy and decrease the number of elements to consider. These generated keyframes are hierarchically clustered with the Hierarchical Cellular Tree (HCT) algorithm, an indexing technique that also allows the creation of data structures suitable for browsing. Different clustering criteria are available in the current implementation, based on four MPEG-7 visual descriptors computed at the global scale. The navigation can directly drive the user to find the video timestamps that best match the query or to a keyframe which is globally similar in visual terms to the query. In the latter case, a visual search engine is also available to find other similar keyframes, based as well on MPEG-7 visual descriptors.
Winners of the Novice Run at the Video Browser Showdown 2012 at the 18th International Conference on MultiMedia Modeling, Klagenfurt, Austria, January 4-6, 2012.
}, keywords = {hierarchical navigation, image retrieval, video browser}, isbn = {978-3-642-27354-4}, doi = {10.1007/978-3-642-27355-1_67}, url = {http://dx.doi.org/10.1007/978-3-642-27355-1_67}, author = {Ventura, C. and Martos, Manel and Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Marqu{\'e}s, F.} } @conference {cGiro-i-Nieto12a, title = {Interactive segmentation and tracking of video objects}, booktitle = {Image Analysis for Multimedia Interactive Services (WIAMIS), 2012 13th International Workshop on}, year = {2012}, month = {05/2012}, publisher = {IEEE}, organization = {IEEE}, address = {Dublin, Ireland}, abstract = {This paper describes a mechanism to interactively segment objects from a sequence of video frames. The extracted object can be later embedded in a different background, associated to local scale metadata or used to train an automatic object detector. The workflow requires the interaction of the user at two stages: the temporal segmentation of the frames containing the object and the generation of an object mask to initialize a video tracker. The mask is defined as a combination of regions generated by an image segmentation algorithm. This framework has been integrated in an annotation tool available to the public.
}, keywords = {Image segmentation, Object segmentation, Proposals, semantics, Signal processing algorithms, Video sequences, Visualization}, doi = {10.1109/WIAMIS.2012.6226749}, url = {http://dx.doi.org/10.1109/WIAMIS.2012.6226749}, author = {Xavier Gir{\'o}-i-Nieto and Martos, Manel} } @conference {cLopez-Mendez12, title = {Metric Learning from Poses for Temporal Clustering of Human Motion}, booktitle = {British Machine Vision Conference 2012}, year = {2012}, month = {09/2012}, address = {Guildford, UK}, abstract = {Temporal clustering of human motion into semantically meaningful behaviors is a challenging task. While unsupervised methods do well to some extent, the obtained clusters often lack a semantic interpretation. In this paper, we propose to learn what makes a sequence of human poses different from others such that it should be annotated as an action. To this end, we formulate the problem as weakly supervised temporal clustering for an unknown number of clusters. Weak supervision is attained by learning a metric from the implicit semantic distances derived from already annotated databases. Such a metric contains some low-level semantic information that can be used to effectively segment a human motion sequence into distinct actions or behaviors. The main advantage of our approach is that metrics can be successfully used across datasets, making our method a compelling alternative to unsupervised methods. Experiments on publicly available mocap datasets show the effectiveness of our approach.
}, doi = {10.5244/C.26.49}, author = {L{\'o}pez-M{\'e}ndez, A. and Gall, J. and Casas, J. and van Gool, L.} } @conference {cGiro-i-Nieto12b, title = {Multiscale annotation of still images with GAT}, booktitle = {Proceedings of the 1st International Workshop on Visual Interfaces for Ground Truth Collection in Computer Vision Applications}, year = {2012}, publisher = {ACM}, organization = {ACM}, address = {Capri, Italy}, abstract = {This paper presents GAT, a Graphical Annotation Tool for still images that works both at the global and local scales. This interface has been designed to assist users in the annotation of images with relation to the semantic classes described in an ontology. Positive, negative and neutral labels can be assigned to both the whole images or parts of them. The user interface is capable of exploiting segmentation data to assist in the selection of objects. Moreover, the annotation capabilities are complemented with additional functionalities that allow the creation and evaluation of an image classifier. The implemented Java source code is published under a free software license.
}, keywords = {annotation, image, interactive, segmentation, semantics}, isbn = {978-1-4503-1405-3}, doi = {10.1145/2304496.2304497}, url = {http://doi.acm.org/10.1145/2304496.2304497}, author = {Xavier Gir{\'o}-i-Nieto and Martos, Manel} } @conference {cGiro-i-Nieto12, title = {Part-Based Object Retrieval With Binary Partition Trees}, booktitle = {Doctoral Consortium in Computer Vision and Pattern Recognition (CVPR)}, year = {2012}, month = {06/2012}, publisher = {IEEE Computer Society}, organization = {IEEE Computer Society}, address = {Providence (RI), USA}, abstract = {This PhD thesis {\textquotedblleft}Part-based Object Retrieval with Binary Partition Trees{\textquotedblright} addresses the problem of visual object retrieval, where a user formulates a query to an image database by providing one or multiple examples of an object of interest. The developed techniques aim both at finding those images in the database that contain the object as well as locating the object in the image and segmenting it from the background. Every considered image, both the ones used as queries and the ones contained in the target database, is represented as a Binary Partition Tree (BPT), the hierarchy of regions previously proposed by Salembier and Garrido (2000). This data structure offers multiple opportunities and challenges when applied to the object retrieval problem.
One application of BPTs appears during the formulation of the query, when the user must interactively segment the query object from the background. Firstly, the BPT can assist in adjusting an initial marker, such as a scribble or bounding box, to the object contours. Secondly, the BPT can also define a navigation path for the user to adjust an initial selection to the appropriate scale. The hierarchical structure of the BPT is also exploited to extract a new type of visual words named Hierarchical Bag of Regions (HBoR). Each region defined in the BPT is characterized with a feature vector that combines a soft quantization on a visual codebook with an efficient bottom-up computation through the BPT. These features allow the definition of a novel feature space, the Parts Space, where each object is located according to the parts that compose it.
HBoR features have been applied to two scenarios for object retrieval, both of them solved by considering the decomposition of the objects in parts. In the first scenario, the query is formulated with a single object exemplar which is to be matched with each BPT in the target database. The matching problem is solved in two stages: an initial top-down one that assumes that the hierarchy from the query is respected in the target BPT, and a second bottom-up one that relaxes this condition and considers region merges which are not in the target BPT. The second scenario where
HBoR features are applied considers a query composed of several visual objects. In this case, the provided exemplars are considered as a training set to build a model of the query concept. This model is composed of two levels, a first one where each part is modelled and detected separately, and a second one that characterises the combinations of parts that describe the complete object. The analysis process exploits the hierarchical nature of the BPT by using a novel classifier that drives an efficient top-down analysis of the target BPTs.
}, author = {Xavier Gir{\'o}-i-Nieto} } @phdthesis {dGiro-i-Nieto12, title = {Part-Based Object Retrieval With Binary Partition Trees}, volume = {Phd}, year = {2012}, month = {05/2012}, pages = {215}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, address = {Barcelona, Catalonia}, abstract = {This thesis addresses the problem of visual object retrieval, where a user formulates a query to an image database by providing one or multiple examples of an object of interest. The presented techniques aim both at finding those images in the database that contain the object as well as locating the object in the image and segmenting it from the background.
Every considered image, both the ones used as queries and the ones contained in the target database, is represented as a Binary Partition Tree (BPT), the hierarchy of regions previously proposed by Salembier and Garrido (2000). This data structure offers multiple opportunities and challenges when applied to the object retrieval problem.
One application of BPTs appears during the formulation of the query, when the user must interactively segment the query object from the background. Firstly, the BPT can assist in adjusting an initial marker, such as a scribble or bounding box, to the object contours. Secondly, BPT can also define a navigation path for the user to adjust an initial selection to the appropriate scale.
The hierarchical structure of the BPT is also exploited to extract a new type of visual words named Hierarchical Bag of Regions (HBoR). Each region defined in the BPT is characterized with a feature vector that combines a soft quantization on a visual codebook with an efficient bottom-up computation through the BPT. These features allow the definition of a novel feature space, the Parts Space, where each object is located according to the parts that compose it.
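A minimal sketch of that bottom-up computation, assuming the soft-assignment histograms of the BPT leaves are given (the dict-based tree layout is an assumption for illustration, not the thesis code):

import numpy as np

def bottom_up_histograms(children, leaf_hist):
    # children: dict mapping each internal BPT node to its (left, right) children.
    # leaf_hist: dict mapping each leaf to its soft-assignment codebook histogram.
    # Each region descriptor is the sum of its children's descriptors, so the
    # whole tree costs one vector addition per merge.
    memo = dict(leaf_hist)
    def hist(node):
        if node not in memo:
            left, right = children[node]
            memo[node] = hist(left) + hist(right)
        return memo[node]
    for node in children:
        hist(node)
    return memo  # one HBoR-style histogram per BPT node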
HBoR features have been applied to two scenarios for object retrieval, both of them solved by considering the decomposition of the objects in parts. In the first scenario, the query is formulated with a single object exemplar which is to be matched with each BPT in the target database. The matching problem is solved in two stages: an initial top-down one that assumes that the hierarchy from the query is respected in the target BPT, and a second bottom-up one that relaxes this condition and considers region merges which are not in the target BPT.
The second scenario where HBoR features are applied considers a query composed of several visual objects, such as a person, a bottle or a logo. In this case, the provided exemplars are considered as a training set to build a model of the query concept. This model is composed of two levels, a first one where each part is modelled and detected separately, and a second one that characterises the combinations of parts that describe the complete object. The analysis process exploits the hierarchical nature of the BPT by using a novel classifier that drives an efficient top-down analysis of the target BPTs.
This paper describes a system developed for the semi-automatic annotation of keyframes in a broadcasting company. The tool aims at assisting archivists, who traditionally label every keyframe manually, by suggesting an automatic annotation that they can intuitively edit and validate. The system is valid for any domain as it uses generic MPEG-7 visual descriptors and binary SVM classifiers. The classification engine has been tested on the multiclass problem of semantic shot detection, a type of metadata used in the company to index new content ingested in the system. The detection performance has been tested in two different domains: soccer and parliament. The core engine is accessed by a Rich Internet Application via a web service. The graphical user interface allows the edition of the suggested labels with an intuitive drag and drop mechanism between rows of thumbnails, each row representing a different semantic shot class. The system has been described as complete and easy to use by the professional archivists at the company.
This paper presents a Problem-Based Learning activity that introduces machine learning content into a study plan based on electronics and signal processing subjects. The activity asks students to follow the same evaluation protocols adopted in scientific challenges, where different research groups test their own machine learning techniques on a common dataset with common evaluation metrics. Teams of students adopt the role of a research group to develop a novel solution for classifying images depending on whether they represent certain semantic concepts. Students must implement software solutions that are tested and presented in class to prove their performance and originality. This paper includes a detailed evaluation of the workload and satisfaction generated by the activity, based on questionnaires answered by the students.
}, keywords = {groups, machine learning, problem-based learning}, url = {http://hdl.handle.net/2117/16354}, author = {Mohedano, Eva and Xavier Gir{\'o}-i-Nieto} } @article {aButko11, title = {Acoustic event detection based on feature-level fusion of audio and video modalities}, journal = {Eurasip journal on advances in signal processing}, volume = {2011}, year = {2011}, pages = {1{\textendash}11}, abstract = {Acoustic event detection (AED) aims at determining the identity of sounds and their temporal position in audio signals. When applied to spontaneously generated acoustic events, AED based only on audio information shows a large amount of errors, which are mostly due to temporal overlaps. Actually, temporal overlaps accounted for more than 70\% of errors in the real-world interactive seminar recordings used in CLEAR 2007 evaluations. In this paper, we improve the recognition rate of acoustic events using information from both audio and video modalities. First, the acoustic data are processed to obtain both a set of spectrotemporal features and the 3D localization coordinates of the sound source. Second, a number of features are extracted from video recordings by means of object detection, motion analysis, and multicamera person tracking to represent the visual counterpart of several acoustic events. A feature-level fusion strategy is used, and a parallel structure of binary HMM-based detectors is employed in our work. The experimental results show that information from both the microphone array and video cameras is useful to improve the detection rate of isolated as well as spontaneously generated acoustic events.
}, issn = {1687-6172}, doi = {10.1155/2011/485738}, url = {http://www.hindawi.com/journals/asp/2011/485738/}, author = {Butko, T. and Cristian Canton-Ferrer and Segura, C. and Xavier Gir{\'o}-i-Nieto and Nadeu, C. and Hernando, J. and Casas, J.} } @mastersthesis {xRubiano11, title = {B{\'u}squeda Visual con Retroacci{\'o}n de Relevancia Basada en Actualizacion de Pesos}, year = {2011}, abstract = {This project presents the design and implementation of different techniques of Relevance Feedback for image searches. These techniques use the user interaction with the search results to estimate the importance of various search criteria in the request submitted by the user. The searches are performed through a linear combination of similarity measures of different MPEG-7 visual descriptors. The results of this project have been compared with those previously obtained in the Final Degree Project by Carles Ventura. For this reason, the system has been evaluated with the reference database, the MPEG-7 Common Color Dataset (CCD).
Video retrieval through text queries is a very common practice in broadcaster archives. The query keywords are compared to the metadata labels that documentalists have previously associated to the video assets. This paper focuses on a ranking strategy to obtain more relevant keyframes among the top hits of the ranked result lists while, at the same time, keeping a diversity of video assets. Previous solutions based on a random walk over a visual similarity graph have been modified to increase the asset diversity by filtering the edges between keyframes depending on their asset. The random walk algorithm is applied separately for every visual feature to avoid any normalization issue between visual similarity metrics. Finally, this work evaluates performance with two separate metrics: the relevance is measured by the Average Precision and the diversity is assessed by the Average Diversity, a new metric presented in this work.
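A minimal sketch of the per-feature random walk (one run per visual similarity metric, which is what avoids cross-metric normalization); the restart probability and iteration count are illustrative assumptions:

import numpy as np

def random_walk_relevance(S, restart=0.15, iters=100):
    # S: (n, n) keyframe similarity matrix for one visual feature; entries
    # between keyframes of the same asset are assumed already zeroed out,
    # mirroring the edge filtering used to promote asset diversity.
    P = S / np.maximum(S.sum(axis=1, keepdims=True), 1e-12)  # row-stochastic
    r = np.full(S.shape[0], 1.0 / S.shape[0])  # uniform initial relevance
    for _ in range(iters):
        r = restart / len(r) + (1.0 - restart) * (r @ P)
    return r  # stationary scores; rank keyframes by r in descending order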
}, isbn = {978-1-4503-0336-1}, doi = {10.1145/1991996.1992052}, url = {http://dx.doi.org/10.1145/1991996.1992052}, author = {Xavier Gir{\'o}-i-Nieto and Alfaro, M. and Marqu{\'e}s, F.} } @conference {cBragos11a, title = {Implementation and first results of the Introduction to Engineering course in the ETSETB-UPC new degrees}, booktitle = {II Conferencia Internacional en Fomento e Innovaci{\'o}n con Nuevas Tecnolog{\'\i}as en la Docencia de la Ingenier{\'\i}a}, year = {2011}, pages = {1{\textendash}4}, isbn = {978-1-4577-0559-5}, doi = {10.1109/FINTDI.2011.5945971}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5945971}, author = {Bragos, R. and Pegueroles, J. and Alarc{\'o}n, E. and Camps, A. and Sarda, J. and Consolacion, C. and Mussons, J. and Pons, O. and Albert Oliveras and Garc{\'\i}a, M. and Onrubia, R. and Elisa Sayrol} } @mastersthesis {xTella11, title = {Interactive Image Processing demonstrations for the web}, year = {2011}, abstract = {This diploma thesis aims to provide a framework for developing web applications for ImagePlus, the software development platform in C++ of the Image Processing Group of the Technical University of Catalonia (UPC). These web applications demonstrate the functionality of the image processing algorithms to any visitor to the group website. Developers also benefit from this graphical user interface because they can easily create Graphical User Interfaces (GUIs) for the processing algorithms
In this paper we present a novel foreground segmentation and 3D reconstruction system for multi-view scenarios. The system achieves correct 3D object reconstruction even when foreground segmentation presents critical misses in some of the views. We introduce the spatial redundancy of the multi-view data into the foreground segmentation process by combining segmentation and the 3D reconstruction in a two steps workflow. First, the segmentation of the objects in each view uses a monocular, region-based foreground segmentation in a MAP-MRF framework for foreground, background and shadow classes. Next, we compute an iterative volume reconstruction in a 3D tolerance loop, obtaining an iteratively enhanced SfS volume. Foreground segmentation is improved by updating the foreground model of each view at each iteration. The results presented in this paper show the improved foreground segmentation and the reduction of errors in the reconstruction of the volume.
}, isbn = {978-1-4577-1304-0}, doi = {10.1109/ICIP.2011.6116731}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=6116731}, author = {Gallego, J. and Salvador, J. and Casas, J. and M. Pard{\`a}s} } @mastersthesis {xAlfaro11 , title = {Reordenaci{\'o} i agrupament d{\textquoteright}imatges d{\textquoteright}una cerca de v{\'\i}deo}, year = {2011}, month = {01/2011}, abstract = {Video retrieval through textual queries is a very common practice in broadcast archives. The query keywords are compared against the metadata that archivists manually annotate on the video assets. Moreover, basic textual searches generate flat result lists in which all results carry the same importance, since they are limited to a binary evaluation of whether or not the search word appears among the metadata associated with the content. They also tend to show very similar content, giving the user an ordered list of results with little visual diversity. The redundancy in the results wastes space in the graphical user interface (GUI) and often forces the user to interact heavily with the interface until the results relevant to the search are located. The contribution of this project is a reranking and clustering strategy to obtain keyframes of higher relevance among the top results while, at the same time, maintaining a diversity of assets. These techniques will thus improve the visualization systems for images resulting from a video search. The global tool is designed to be integrated into the environment of Digition, the audiovisual content manager of the Corporaci{\'o} Catalana de Mitjans Audiovisuals.
This thesis describes the graphical user interface developed for semi-automatic keyframe-based semantic shot annotation and the semantic shot classifiers built. The graphical user interface aims to optimize the current indexation process by replacing manual annotation with automatic annotation and validation. The system is based on supervised learning binary classifiers and web services. The graphical user interface provides the necessary tools to fix and validate the automatic detections and to learn from the user feedback to retrain the system and improve it. Results of the classifier evaluation, performed using cross-validation methods, show a good performance in terms of precision and recall. The graphical user interface has been described as complete and easy to use by a professional documentalist at a broadcast company.
One of the challenges in the development of an image retrieval system is to achieve an efficient indexing scheme, since both developers and users, who routinely submit requests to find a multimedia element in a large database, can be frustrated by the long computational time of the search.
Traditional indexing schemes neither fulfil the dynamic indexing requirement, which allows elements to be added to or removed from the structure, nor fit well in high-dimensional feature spaces due to the so-called {\textquotedblleft}curse of dimensionality{\textquotedblright}.
After analysing several indexing techniques from the literature, we decided to implement an indexing scheme called Hierarchical Cellular Tree (HCT), which was designed to bring an effective solution especially for indexing large multimedia databases. The HCT has allowed us to improve the performance of our image retrieval system based on the MPEG-7 visual descriptors. We have also made some contributions by proposing modifications to the original HCT which have improved its performance. In particular, we have proposed a redefinition of the covering radius that considers not only the elements belonging to the cell but also all the elements hanging from that cell. Since this consideration implies a much more computationally costly algorithm, we have proposed an approximation by excess for the covering radius value. However, we have also implemented a method that updates the covering radius to its actual value whenever desired. In addition, the pre-emptive insertion method has been adapted as a search technique in order to improve the performance of the retrieval scheme called Progressive Query, which was originally proposed for use over the HCT.
Furthermore, the HCT indexing scheme has also been adapted to a server/client architecture by using a messaging system called KSC, which keeps the HCT loaded on a server waiting for the query requests launched by the several clients of the retrieval system. In addition, the tool used to request a search over the indexed database has been given a graphical user interface, named GOS (Graphic Object Searcher), which allows the user to order retrievals in a friendlier way.
This paper presents BitSearch, a web blog written by a team of thesis students where they share the evolution of their work. The blog is aimed at improving the communication not only between the advisor and the students, but also at motivating the students through the public exposure of their research development. Basic writing guidelines are provided by the professor in order to guarantee the quality of the posts and provide good metadata for their retrieval, both by author or by a generic text query from a search engine. After one year online, BitSearch has published 176 posts written by 19 students, in addition to the professor{\textquoteright}s own contributions. Statistics show more than 7,000 accesses to its pages from visitors from more than 100 countries all over the globe. The blog tool has helped in improving the guidance of the students{\textquoteright} activity, developing a sense of team work among authors as well as a more progressive preparation of the final dissertation.
}, keywords = {blog, ~online~learning, ~web-based~tools}, isbn = {978-84-8458-324-0}, url = {http://hdl.handle.net/2117/11371}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cGiro-i-Nieto10, title = {Digimatge, a Rich Internet Application for video retrieval from a Multimedia Asset Management system}, booktitle = {11th ACM SIGMM International Conference on Multimedia Information Retrieval}, year = {2010}, abstract = {This paper describes the integration of two new services aimed at assisting in the retrieval of video content from an existing Multimedia Asset Manager (MAM) of a TV broadcaster archive. The first tool suggests tags after a first textual query, and the second ranks the keyframes of retrieved assets according to their visual similarity. Both applications were integrated as web services that are accessed from a Rich Internet Application via REST calls.
}, isbn = {978-1-60558-815-5}, doi = {10.1145/1743384.1743458}, url = {http://upcommons.upc.edu/e-prints/urlFiles?idDrac=2249267}, author = {Xavier Gir{\'o}-i-Nieto and Salla, R. and Vives, X.} } @conference {cGallego10, title = {Enhanced bayesian foreground segmentation using brightness and color distortion region-based model for shadow removal}, booktitle = {2010 IEEE International Conference on Image Processing}, year = {2010}, pages = {3449{\textendash}3452}, isbn = {1522-4880}, doi = {10.1109/ICIP.2010.5653897}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5653897\&tag=1}, author = {Gallego, J. and M. Pard{\`a}s} } @mastersthesis {xmunoz-trallero10, title = {Extensi{\'o} d{\textquoteright}una interf{\'\i}cie de cerca d{\textquoteright}imatges a les consultes amb regions}, year = {2010}, abstract = {This is a university-industry project that presents a multimodal search interface built from the integration of several existing tools, creating a global tool that combines textual searches with query-by-example, whether the query is based on an image or on a region of that image. The global tool is integrated into the environment of Digition, the audiovisual content manager of the Corporaci{\'o} Catalana de Mitjans Audiovisuals. The main contribution of this project is the creation of a region-based image search interface and its integration into the global tool.
This article presents GAT, a Graphical Annotation Tool based on a region-based hierarchical representation of images. The proposed solution uses Partition Trees to navigate through the image segments which are automatically defined at different spatial scales. Moreover, the system focuses on the navigation through ontologies for a semantic annotation of objects and of the parts that compose them. The tool has been designed under usability criteria to minimize the user interaction by trying to predict the future selection of regions and semantic classes. The implementation uses MPEG-7/XML input and output data to allow interoperability with any type of Partition Tree. This tool is publicly available and its source code can be downloaded under a free software license.
}, issn = {1380-7501}, doi = {10.1007/s11042-009-0389-2}, url = {http://www.springerlink.com/content/j78782k762617352/}, author = {Xavier Gir{\'o}-i-Nieto and Camps, N. and Marqu{\'e}s, F.} } @article {xCortes10, title = {GOS: b{\'u}squeda visual de im{\'a}genes}, number = {25}, year = {2010}, pages = {36{\textendash}44}, keywords = {i3media}, issn = {1698-7047}, url = {http://upcommons.upc.edu/e-prints/urlFiles?idDrac=2251008}, author = {Cort{\'e}s, S.}, editor = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @article {aGudmundsson10, title = {Improved 3D reconstruction in smart-room environments using ToF imaging}, journal = {Computer vision and image understanding}, volume = {114}, number = {12}, year = {2010}, month = {12/2010}, pages = {1376{\textendash}1384}, abstract = {This paper presents a general analysis framework towards exploiting the underlying hierarchical and scalable structure of an articulated object for pose estimation and tracking. Scalable human body models are introduced as an ordered set of articulated models fulfilling an inclusive hierarchy. The concept of annealing is applied to derive a generic particle filtering scheme able to perform a sequential filtering over the set of models contained in the scalable human body model. Two annealing loops are employed, the standard likelihood annealing and the newly introduced structural annealing, leading to a robust, progressive and efficient analysis of the input data. The validity of this scheme is tested by performing markerless human motion capture in a multi-camera environment employing the standard HumanEva annotated datasets. Finally, quantitative results are presented and compared with other existing HMC techniques.
}, issn = {1077-3142}, doi = {10.1016/j.cviu.2010.07.011}, url = {http://www.sciencedirect.com/science/article/pii/S1077314210001736}, author = {Gudmundsson, S. and M. Pard{\`a}s and Casas, J. and Sveinsson, J. and Aanaes, H. and Larsen, R.} } @mastersthesis {x10, title = {Interf{\'\i}cie gr{\`a}fica d{\textquoteright}usuari per a l{\textquoteright}avaluaci{\'o} de classificadors d{\textquoteright}imatges}, year = {2010}, abstract = {This final degree project was carried out with the goal of creating a graphical interface capable of evaluating different classifiers in a graphical and very intuitive way. The graphical interface is implemented within the Graphic Annotation Tool (GAT), a free software application created by the Image Processing Group of UPC. The project is based on creating manual annotations in order to train a supervised learning algorithm. The results of this classification are visualized in the new GAT tab named Classifier. This is made possible by integrating the training and detection engines, incorporating the cross-validation algorithm, and making it possible to choose among different classifiers. To evaluate these results, precision, recall, false positives and false negatives are computed.
Smartphones are fully integrated into society. Numerous television channels, content providers and web portals develop applications to watch live television or play on-demand videos in real time. In this project, a video-on-demand service for the iPhone is developed, which plays videos in real time using Apple{\textquoteright}s HTTP Live Streaming protocol. This report details how to configure a video server and the development of an iPhone application that plays videos in real time. The server is programmed and configured on Linux, and free software is used to adapt the video to the requirements of HTTP Live Streaming. The video metadata are also stored in a database, which is accessed from the iPhone to show them in the interface. The iPhone application allows browsing the video catalogue, visualizing the video metadata such as the title, the description or a thumbnail of the video, and playing it in real time on the device. It was developed on Mac, programming in Objective-C and using the tools of the iPhone development kit (iPhone SDK).
This paper presents the system architecture of a Content-Based Image Retrieval system implemented as a web service. The proposed solution is composed of two parts, a client running a graphical user interface for query formulation and a server where the search engine explores an image repository. The separation of the user interface and the search engine follows a Software as a Service (SaaS) model, a type of cloud computing design where a single core system is online and available to authorized clients. The proposed architecture follows the REST software architecture and the HTTP protocol for communications, two solutions that, combined with metadata coded in RDF, make the proposed system ready for its integration in the semantic web. User queries are formulated by visual examples through a graphical interface and content is remotely accessed also through HTTP communication. Visual descriptors and similarity measures implemented in this work are mostly defined in the MPEG-7 standard, while textual metadata is coded according to the Dublin Core specifications.
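A hedged sketch of what a client-side query might look like under this architecture; the endpoint, parameter names and descriptor value are hypothetical, since the paper fixes only the style (REST over HTTP, MPEG-7 descriptors, RDF/Dublin Core metadata):

import requests

# Hypothetical service URL and query parameters, for illustration only.
resp = requests.get(
    "http://example.org/cbir/search",
    params={"query": "keyframe-042", "descriptor": "ColorLayout", "max": 20},
    headers={"Accept": "application/rdf+xml"},  # results metadata coded in RDF
    timeout=10,
)
resp.raise_for_status()
print(resp.text)  # ranked result list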
}, isbn = {978-1-4503-0117-6}, doi = {10.1145/1816041.1816093}, url = {http://doi.acm.org/10.1145/1816041.1816093}, author = {Xavier Gir{\'o}-i-Nieto and Ventura, C. and Jordi Pont-Tuset and Cort{\'e}s, S. and Marqu{\'e}s, F.} } @mastersthesis {xSancho10, title = {Tweet@TV: Televisi{\'o} social en 140 car{\`a}cters}, year = {2010}, abstract = {This Final Degree Project focuses on this facet of interactive television services: social television. During its realization, an application was developed to access a social network in an integrated way, synchronized with television consumption. Following the research line of Manel Martos{\textquoteright} Final Degree Project, Adaptaci{\'o} i distribuci{\'o} de continguts web per IPTV, this project was carried out at the company Activa Multim{\`e}dia Digital of the Corporaci{\'o} Catalana de Mitjans Audiovisuals between February and May 2010, within the framework of the CREA-IPTV project.
Awarded the second prize of the 2010 Premios Liberalizaci{\'o}n de las Telecomunicaciones call of the Colegio de Ingenieros T{\'e}cnicos de Telecomunicaci{\'o}n (COITT), Spain.
Award for the best final degree project in Telecommunication Engineering in Telematic Services, given by Accenture (academic year 2009-2010).
Acoustic events produced in meeting environments may contain useful information for perceptually aware interfaces and multimodal behavior analysis. In this paper, a system to detect and recognize these events from a multimodal perspective is presented combining information from multiple cameras and microphones. First, spectral and temporal features are extracted from a single audio channel and spatial localization is achieved by exploiting cross-correlation among microphone arrays. Second, several video cues obtained from multi-person tracking, motion analysis, face recognition, and object detection provide the visual counterpart of the acoustic events to be detected. A multimodal data fusion at score level is carried out using two approaches: weighted mean average and fuzzy integral. Finally, a multimodal database containing a rich variety of acoustic events has been recorded including manual annotations of the data. A set of metrics allow assessing the performance of the presented algorithms. This dataset is made publicly available for research purposes.
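As a minimal sketch of the weighted-mean variant of the score-level fusion (the fuzzy integral alternative is not shown), with an illustrative weight that would in practice be tuned on held-out data:

import numpy as np

def fuse_scores(audio_scores, video_scores, w_audio=0.6):
    # Score-level fusion for one binary event detector: a convex combination
    # of audio and video classifier scores computed over the same time window.
    a = np.asarray(audio_scores, dtype=float)
    v = np.asarray(video_scores, dtype=float)
    return w_audio * a + (1.0 - w_audio) * v  # threshold the result to detect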
}, doi = {10.1109/CVPRW.2009.5204264}, author = {Cristian Canton-Ferrer and Butko, T. and Segura, C. and Xavier Gir{\'o}-i-Nieto and Nadeu, C. and Hernando, J. and Casas, J.} } @conference {cGallego09, title = {Bayesian foreground segmentation and tracking using pixel-wise background model and region-based foreground model}, booktitle = {16th IEEE International Conference on Image Processing}, year = {2009}, pages = {3205{\textendash}3208}, doi = {10.1109/ICIP.2009.5414380}, url = {http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=\&arnumber=5414380}, author = {Gallego, J. and M. Pard{\`a}s and Haro, G.} } @conference {cLeon09, title = {Caption text extraction for indexing purposes using a hierarchical region-based image model}, booktitle = {16th International Conference on Image Processing}, year = {2009}, pages = {1869{\textendash}1872}, isbn = {978-1-4244-5655-0}, doi = {http://dx.doi.org/10.1109/ICIP.2009.5413607}, url = {http://hdl.handle.net/2117/7940}, author = {Le{\'o}n, M. and Ver{\'o}nica Vilaplana and Gasull, A. and Marqu{\'e}s, F.} } @conference {cButko09, title = {Improving Detection of Acoustic Events Using Audiovisual Data and Feature Level Fusion}, booktitle = {10th Annual Conference of the International Speech Communication Association}, year = {2009}, pages = {1147{\textendash}1150}, isbn = {978-1-61567-692-7}, url = {http://gps-tsc.upc.es/imatge/_Xgiro/research/publications/2009/interspeech.pdf}, author = {Butko, T. and Cristian Canton-Ferrer and Segura, C. and Xavier Gir{\'o}-i-Nieto and Nadeu, C. and Hernando, J. and Casas, J.} } @mastersthesis {dCortes09, title = {Interfaz gr{\'a}fica de usuario para la b{\'u}squeda de im{\'a}genes basada en im{\'a}genes}, year = {2009}, month = {07/2009}, type = {BSc}, abstract = {This Final Degree Project responds to the need to create tools for accessing multimedia content, new tools that ease the retrieval of all the stored audiovisual information. The Graphic Object Searcher (GOS) is a graphical interface for searching images hosted in large databases, starting from an example image and search criteria established by the user. Carrying out this project makes it possible to work in the two technological areas currently experiencing the greatest growth: the multimedia sector (audiovisual content management) and information technologies (IT) (computing at the service of communication). These two areas tend to join efforts in a society devoted to the use and consumption of audiovisual content through multiple platforms and devices in every economic and social sector (leisure, education, services, etc.). Any professional in the audiovisual sector must acquire knowledge and experience in both areas to build their career.
Through this grant, the online application LAVICAD (LAboratori VIrtual de COmunicacions Anal{\`o}giques i Digitals) has been extended; it is offered in an integrated way within the e-learning platform COM@WEB. LAVICAD is a tool programmed in Java and Matlab and consists of a set of simulators of the physical layer of communication systems. All the simulators are presented online and can be used by students from any computer without installing any special software. During the 2007-2008 academic year, two lines of work were developed, among others. On the one hand, the applet emulating the physical layer of digital terrestrial television was programmed, as a reference for teaching advanced communication systems. On the other hand, work was done on programming new functionalities of the LAVICAD tool that help professors follow and assess the continued work of students. In particular, the generation of a database was programmed, containing the information of the users who have connected and the results obtained when running a given simulator. The two lines developed should allow, during the current academic year, consolidating the use of the different simulators for teaching the subjects involved in the project.
}, url = {http://hdl.handle.net/2099/7235}, author = {Cabrera, M. and Xavier Gir{\'o}-i-Nieto and Rey, F. and Gasull, A. and Casas, J. and Villares, J. and Fernandez, J. and Sala {\'A}lvarez, josep and Espinosa Fricke, Pedro and Fern{\'a}ndez, Carlos Marcos and Cort{\'e}s, S. and Farr{\'e}, Miquel {\`A}ngel} } @inbook {bVilaplana08, title = {Face and speech interaction}, booktitle = {Multimodal user interfaces: from signals to interaction}, year = {2008}, pages = {85{\textendash}118}, isbn = {978-3-540-78344-2}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F. and Gurban, M. and Thiran, J.} } @conference {cCabrera08, title = {Lavicad: laboratorio virtual de comunicaciones anal{\'o}gicas y digitales}, booktitle = {XXIII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {2008}, month = {09/2008}, pages = {1{\textendash}4}, address = {Madrid, Spain}, abstract = {The presented experience consists of the {\textquotedblleft}design of{\textquotedblright} and {\textquotedblleft}experimentation with{\textquotedblright} a virtual laboratory of analog and digital communications: LAVICAD. It has proven to be a useful tool for verifying the performance of different communication systems and signal processing techniques, topics typically covered in undergraduate courses of the telecommunications engineering curriculum. The communication systems have been designed and implemented as Java applets and are freely accessible. They can be run at the e-learning platform comweb.upc.edu. The different communication systems present different levels of user interactivity, and when students execute a system integrated in a comweb course, the obtained results can be supervised by the professor as an evaluation and assessment tool. From a pedagogical point of view, the main advantage of using a virtual laboratory is that it can facilitate the learning of certain matters, acting as a connection between the model of knowledge based on concepts and theories and their practical understanding and experimentation.
This paper describes the design of an indexing system for a video database. The system uses region-based manual annotations of keyframes to create models to automatically annotate new keyframes also at the region level. The presented architecture includes user interfaces for training and querying the system, internal databases to manage ingested content and modelled semantic classes, as well as communication interfaces to allow the system interconnection. The scheme is designed to work as a plug-in to an external Multimedia Asset Management (MAM) system.
This paper presents a set of techniques for the detection of composite objects in video recordings of a controlled environment. Firstly, a selective region-based analysis is performed by tuning the algorithm to the perceptual characteristics of the object in the environment. Secondly, the controlled perceptual and semantic variabilities of the object are addressed by the detection analysis thanks to a frame by frame update of the object models, and by allowing multiple models for a single object. The proposed techniques are illustrated in the detection of laptops from a zenithal view in a smart room.
COMalaWEB stands for Comunicaciones en la Web (Communications on the Web) and is an e-learning platform that offers different online teaching resources to university students with the goal of facilitating study, preferably in the areas of signal processing and communications.
The platform is interactive and has been designed to promote e-learning experimentation and the acquisition of study habits based on the use of new technologies.
In the short term, the system will be able to generate automatic study itineraries for each student as a function of the performance obtained in self-assessment tests and from information packaged in metadata and integrated in a database of learning objects.
The medium- and long-term objective of the COMalaWEB project is to become a meeting point on the web for students, professors and professionals related to the area of telecommunications and university-level teaching.
One of the courses integrated in COMalaWEB is LaViCAD, the virtual laboratory of analog and digital communications, freely distributed and accessible and based on the simulation of different communication systems that can be used both in face-to-face teaching and in distance education.
The content of the project can currently be consulted on the platform: http://comweb.upc.edu/
This paper presents an annotation tool for the manual and region-based annotation of still images. The selection of regions is achieved by navigating through a Partition Tree, a data structure that offers a multiscale representation of the image. The user interface provides a framework for the annotation of both atomic and composite semantic classes and generates an MPEG-7 XML compliant file.
This paper presents two enhancements for the creation and analysis of Binary Partition Trees (BPTs). Firstly, the classic creation of BPT based on colour is expanded to include syntactic criteria derived from human perception. Secondly, a method to include semantic information in the BPT analysis is shown thanks to the definition of the BPT Semantic Neighborhood and the introduction of Semantic Trees. Both techniques aim at bridging the semantic gap between signal and semantics following a bottom-up and a top-down approach, respectively.
}, isbn = {978-3-540-49335-8}, doi = {10.1007/11930334_15}, url = {http://www.springerlink.com/content/u7201mw06545w057/}, author = {Ferran, C. and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Casas, J.} } @conference {cGiro-i-Nieto06, title = {The edition of the Wikipedia as an academic activity}, booktitle = {4rt. Congr{\'e}s Internacional de Doc{\`e}ncia Unversit{\`a}ria i Innovaci{\'o}}, year = {2006}, month = {07/2006}, pages = {{\textendash}}, address = {Barcelona, Catalonia}, abstract = {This paper presents a learning activity around the Wikipedia, a free online encyclopaedia written by its users. Students are asked to write and review entries related to the course topics following a collaborative environment provided by the wiki tools. This paper proposes a seventeen-step methodology for this task in the framework of an academic course organized by topics. The activity has been successfully introduced in two different schools of the Technical University of Catalonia. In its first edition, 81 new articles were added by 64 students from the EUETIT; in the second experience, 60 articles were created and 14 reviewed by 43 students from the ETSETB.
}, keywords = {wikipedia, ~cooperative, ~online}, isbn = {84-8458-240-X}, url = {http://hdl.handle.net/2117/13157}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Casas, J.} } @inbook {bGiro-i-Nieto06, title = {From partition trees to semantic trees}, booktitle = {Multimedia Content Representation, Classification and Security}, volume = {4105/2006}, number = {4105}, year = {2006}, pages = {306{\textendash}313}, abstract = {This paper proposes a solution to bridge the gap between semantic and visual information formulated as a structural pattern recognition problem. Instances of semantic classes expressed by Description Graphs are detected on a region-based representation of visual data expressed with a Binary Partition Tree. The detection process builds instances of Semantic Trees on the top of the Binary Partition Tree using an encyclopedia of models organised as a hierarchy. At the leaves of the Semantic Tree, classes are defined by perceptual models containing a list of low-level descriptors. The proposed solution is assessed in different environments to show its flexibility.
}, issn = {0302-9743}, doi = {10.1007/11848035_41}, url = {http://www.springerlink.com/content/j148713624k48u3r/}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cGiro-i-Nieto06a, title = {From Partition Trees to Semantic Trees}, booktitle = {International Workshop on Multimedia Content Representation, Classification and Security}, year = {2006}, pages = {306{\textendash}313}, isbn = {3-540-39392-7}, doi = {10.1007/11848035}, url = {http://dx.doi.org/10.1007/11848035_41}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cGiro-i-Nieto06b, title = {From Partition Trees to Semantic Trees}, booktitle = {2ndas Jornades UPC de Investigaci{\'o}n en Autom{\'a}tica, Visi{\'o}n y Rob{\'o}tica}, year = {2006}, pages = {187{\textendash}194}, isbn = {84-7653-885-5}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cGiro-i-Nieto06c, title = {Solucions de programari lliure en un projecte d{\textquoteright}adaptaci{\'o} de dades a XML}, booktitle = {V Jornades de Programari Lliure de la UPC}, year = {2006}, author = {Xavier Gir{\'o}-i-Nieto and Camps, N.} } @inbook {bGiro-i-Nieto05, title = {Automatic extraction and analysis of visual objects information}, booktitle = {Multimedia content and the semantic web}, year = {2005}, pages = {203{\textendash}221}, publisher = {Wiley}, organization = {Wiley}, chapter = {7}, isbn = {978-0-470-85753-3}, doi = {10.1002/0470012617.ch7}, url = {http://eu.wiley.com/WileyCDA/WileyTitle/productCd-0470857536.html}, author = {Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Marqu{\'e}s, F. and Salembier, P.} } @conference {cGiro-i-Nieto05, title = {Detection of Semantic Objects using Description Graphs}, booktitle = {IEEE International Conference on Image Processing}, year = {2005}, address = {Genova, Italy}, abstract = {This paper presents a technique to detect instances of classes (objects) according to their semantic definition in the form of a description graph. Classes are defined as combinations of instances of lower level semantic classes and allow the definition of a semantic tree that organizes classes in semantic levels. At the bottom level of the semantic tree, classes are defined by a perceptual model containing a list of low-level descriptors. The proposed detection algorithm follows a bottom-up/top-down approach, building semantic trees on a region-based representation of the media. The flexibility of the approach is assessed on different examples of planar objects, such as frontal faces, groups of islands, flags and traffic signs.
}, keywords = {Detection algorithms, Explosions, Face detection, Image databases, Indexing, MPEG 7 Standard, Object detection, Testing, Traffic control, Tree graphs}, isbn = {0-7803-9135-7}, doi = {10.1109/ICIP.2005.1529972}, url = {http://dx.doi.org/10.1109/ICIP.2005.1529972}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cGarcia05, title = {Functionalities for mapping 2D images and 3D world objects in a Multicamera Environment}, booktitle = {6th International Workshop on Image Analysis for Multimedia Interactive Services}, year = {2005}, abstract = {We present four functionalities intended to improve the ability of image detection and tracking algorithms to understand a scene in a multicamera system. The redundancy of several available projections of any 3D object onto different cameras might ease video analysis tasks. When some prior information about the 3D object or any of its projections is known, geometric constraints can help to restrict search areas in the images under analysis. The functionalities presented also tackle the problem of selecting the best camera at any time, or computing projected areas of 3D objects in images.
}, author = {Garcia, O. and Casas, J.} } @conference {cVilaplana05, title = {Region-based extraction and analysis of visual objects information}, booktitle = {Fourth International Workshop on Content-Based Multimedia Indexing, CBMI 2005}, year = {2005}, address = {Riga, Latvia}, abstract = {In this paper, we propose a strategy to detect objects from still images that relies on combining two types of models: a perceptual and a structural model. The algorithms that are proposed for both types of models make use of a region-based description of the image relying on a Binary Partition Tree. Perceptual models link the low-level signal description with semantic classes of limited variability. Structural models represent the common structure of all instances by decomposing the semantic object into simpler objects and by defining the relations between them using a Description Graph.
}, isbn = {0-7803-6293-4}, author = {Ver{\'o}nica Vilaplana and Xavier Gir{\'o}-i-Nieto and Salembier, P. and Marqu{\'e}s, F.} } @article {xGiro-i-Nieto04, title = {La converg{\`e}ncia de la TV cap al PC}, year = {2004}, month = {03/2004}, institution = {Diari Avui}, type = {Newspaper}, address = {Barcelona, Catalonia}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cCabrera04, title = {LaViCAD: LABORATORIO VIRTUAL DE COMUNICACIONES ANAL{\'O}GICAS Y DIGITALES}, booktitle = {4rt. Congr{\'e}s Internacional de Doc{\`e}ncia Universit{\`a}ria i Innovaci{\'o}}, year = {2004}, pages = {1{\textendash}20}, isbn = {84-8458-240-X}, author = {Cabrera, M. and Fernandez, J. and Berzosa, C. and Francisco, V. and Gasull, A.} } @conference {cGiro-i-Nieto04, title = {The Moonlight project: bringing light to our satellite}, booktitle = {Proceedings of the 4th International Conference on Solar Power from Space SPS{\textquoteright}04, together with the 5th International Conference on Wireless Power Transmission WPT 5}, year = {2004}, pages = {99{\textendash}100}, isbn = {92-9092-878-6}, author = {Xavier Gir{\'o}-i-Nieto and Aragon, M. and Prats, X. and Acero, L.} } @conference {cGiro-i-Nieto03, title = {Detection of Semantic Entities using Description Graphs}, booktitle = {4th European Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS)}, year = {2003}, month = {04/2003}, address = {London, England}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {c03, title = {The Moon Orbital Mirror}, booktitle = {54th International Astronautical Congress (IAC)}, year = {2003}, month = {10/2003}, address = {Bremen, Germany}, abstract = {The current state of space exploration suggests that one of the next steps is the establishment of a permanent base on the Moon. This paper describes the problems related to the lunar night, explains the benefits that the Moon Orbital Mirror would offer and gives a possible design for the satellite.
}, url = {http://www.zarm.uni-bremen.de/iaf2003/abstracts/data/pdf/IAC-03-IAA.1.1.02.PDF}, author = {Acero, Llu{\'\i}s and {\`A}. Arag{\'o}n and Xavier Gir{\'o}-i-Nieto and Prats, Xavier} } @inbook {cGiro-i-Nieto03a, title = {Semantic Entity Detection Using Description Graphs}, booktitle = {Digital Media Processing for Multimedia Interactive Services}, year = {2003}, pages = {39{\textendash}42}, publisher = {World Scientific Publishing Co.}, organization = {World Scientific Publishing Co.}, address = {Singapore}, abstract = {This paper presents a technique for the detection of Semantic Entities (SEs) in multimedia content. A definition of a SE in terms of lower-level SEs and their Relations (Rs) is proposed using Description Graphs (DGs). By analyzing the a/v information, an instance DG is built to be compared with a model DG of the SE. As a result, a confidence value is computed to express how well the SE is represented in the content. Examples of the use of this approach are presented in two different applications: detection of frontal faces and recognition of clusters of islands.
}, isbn = {981-238-355-7}, url = {http://books.google.es/books?id=vVvJINURimIC\&printsec=frontcover\&hl=ca\&source=gbs_ge_summary_r\&cad=0$\#$v=onepage\&q\&f=false}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @article {aAvrithis03, title = {Unified Access to Heterogeneous Audiovisual Archives}, journal = {Journal of universal computer science}, volume = {9}, number = {6}, year = {2003}, pages = {510{\textendash}519}, abstract = {In this paper, an integrated information system is presented that offers enhanced search and retrieval capabilities to users of heterogeneous digital audiovisual (a/v) archives. This innovative system exploits the advances in handling a/v content and related metadata, as introduced by MPEG-4 and worked out by MPEG-7, to offer advanced services characterized by the tri-fold semantic phrasing of the request (query), unified handling and personalized response. The proposed system targets the intelligent extraction of semantic information from a/v and text-related data, taking into account the nature of the queries that users may issue and the context determined by user profiles. It also personalizes the response in order to provide end users with the desired information. From a technical point of view, the FAETHON system plays the role of an intermediate access server residing between the end users and multiple heterogeneous audiovisual archives organized according to the new MPEG standards.
}, issn = {0948-6968}, doi = {10.3217/jucs-009-06-0510}, url = {http://www.jucs.org/jucs_9_6}, author = {Avrithis, Y. and Stamou, G. and Wallace, M. and Marqu{\'e}s, F. and Salembier, P. and Xavier Gir{\'o}-i-Nieto and Haas, W. and Vallant, H. and Zufferey, M.} } @conference {cAvrithis03, title = {Unified access to heterogeneous audiovisual archives}, booktitle = {International Conference on Knowledge Management}, year = {2003}, pages = {1{\textendash}2}, address = {Graz, Austria}, author = {Avrithis, Y. and Stamou, G. and Wallace, M. and Marqu{\'e}s, F. and Salembier, P. and Xavier Gir{\'o}-i-Nieto and Haas, W. and Vallant, H. and Zufferey, M.} } @article {aSchelkens03, title = {Wavelet coding of volumetric medical datasets}, journal = {IEEE Transactions on Medical Imaging}, volume = {22}, year = {2003}, month = {03/2003}, pages = {441{\textendash}458}, abstract = {Several techniques based on the three-dimensional (3-D) discrete cosine transform (DCT) have been proposed for volumetric data coding. These techniques fail to provide lossless coding coupled with quality and resolution scalability, which is a significant drawback for medical applications. This paper gives an overview of several state-of-the-art 3-D wavelet coders that do meet these requirements and proposes new compression methods exploiting the quadtree and block-based coding concepts, layered zero-coding principles, and context-based arithmetic coding. Additionally, a new 3-D DCT-based coding scheme is designed and used for benchmarking. The proposed wavelet-based coding algorithms produce embedded data streams that can be decoded up to the lossless level and support the desired set of functionality constraints. Moreover, objective and subjective quality evaluation on various medical volumetric datasets shows that the proposed algorithms provide competitive lossy and lossless compression results when compared with the state-of-the-art.
}, keywords = {Algorithms, Computer-Assisted, Data Compression, discrete cosine transforms, embedded coding, embedded data streams, functionality constraints, image coding, Image Enhancement, Image Interpretation, Imaging, JPEG2000, layered zero coding, lossless compression, medical diagnostic imaging, medical image compression, medical image processing, Numerical Analysis, objective quality evaluation, progressive image transmission, quadtree coding, reviews, Signal Processing, subjective quality evaluation, Three-Dimensional, volumetric coding, wavelet transforms}, issn = {0278-0062}, doi = {10.1109/TMI.2003.809582}, author = {Schelkens, P. and Munteanu, A. and Barbarien, J. and Galca, M. and Xavier Gir{\'o}-i-Nieto and Cornelis, J.} } @conference {cGarrido02, title = {A framework for the retrieval of multiple regions using Binary Partition Trees and low level descriptors}, booktitle = {11th European Signal Processing Conference, EUSIPCO 2002}, year = {2002}, pages = {512{\textendash}516}, address = {Toulouse, France}, author = {Garrido, L. and Salembier, P.} } @conference {cVallverdu02, title = {Graphical study of signals and systems}, booktitle = {14th annual World Conference on Educational Multimedia, Hypermedia \& Telecommunications}, year = {2002}, isbn = {0-8186-7919-0}, author = {Vallverdu, F. and Elisa Sayrol and Gasull, A. and Salavedra, J. and Moreno, A.} } @phdthesis {dGarrido02, title = {Hierarchical Region Based Processing of Images and Video Sequences: Application to Filtering, Segmentation and Information Retrieval}, year = {2002}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, abstract = {This work discusses the usefulness of hierarchical region-based representations for image and video processing. Region-based representations offer a way to perform a first level of abstraction and reduce the number of elements to process with respect to the classical pixel-based representation. In this work the two representations that have proven useful for region-based processing are reviewed, namely region adjacency graphs and trees, and it is discussed why tree-based representations are better suited for our purpose. In fact, trees represent the image in a hierarchical way, and efficient and complex processing techniques can be applied to them. Two major issues are discussed in this work: how the hierarchical representation may be created from a given image and how the tree may be manipulated or processed. Two tree-based representations have been developed: the Max-Tree and the Binary Partition Tree. The Max-Tree structures in a compact way the connected components that arise from all possible level sets of a gray-level image. It is suitable for the implementation of anti-extensive connected operators, ranging from classical ones (for instance, the area filter) to new ones (such as the motion filter developed in this work). The Binary Partition Tree structures the set of regions that are obtained during the execution of a region merging algorithm. Developed to overcome some of the drawbacks imposed by the Max-Tree {\textendash} in particular the lack of flexibility in the tree creation and the lack of self-duality of the tree representation {\textendash}, it has proven to be a useful representation for a rather large range of applications, as shown in this work. Processing strategies are focused on pruning techniques. 
Pruning techniques remove some of the branches of the tree based on an analysis algorithm applied to the nodes of the tree. Pruning techniques applied to the Max-Tree lead to anti-extensive operators, whereas self-dual operators are obtained on the Binary Partition Tree if the tree is created in a self-dual manner. The pruning techniques that have been developed in this work target the following applications: filtering, segmentation and content-based image retrieval. The filtering (in the context of connected operators) and segmentation applications are based on the same principle: the nodes of the tree are analyzed according to a fixed criterion, and the decision to remove or preserve a node usually relies on a threshold applied to the measured criterion. Pruning is then performed according to this decision. As a result, the image associated with the pruned tree represents a filtered or segmented version of the original image according to the selected criterion. Some of the criteria that are discussed in this work are based, for instance, on area, motion, marker \& propagation or a rate-distortion strategy. The problem of the lack of robustness of classical decision approaches for non-increasing criteria is discussed and solved by means of an optimization strategy based on the Viterbi algorithm. Content-based image retrieval is the third application we have focused on in this work. Hierarchical region-based representations are particularly well suited for this purpose since they allow the image to be represented, and thus its regions to be described, at different scales of resolution. In this work we focus on an image retrieval system which supports low-level queries based on visual descriptors and spatial relationships. For that purpose, region descriptors are attached to the nodes of the tree. Two types of queries are discussed: the single region query, in which the query is made up of one region, and the multiple region query, in which the query is made up of a set of regions. In the former, visual descriptors are used to perform the retrieval, whereas in the latter both visual descriptors and spatial relationships are used. Moreover, a relevance feedback approach is presented to avoid the need to manually set the weights associated with each descriptor. An important aspect that has been taken into account throughout this work is the efficient implementation of the algorithms that have been developed for both the creation and the processing of the tree. In the case of tree creation, efficiency has been obtained mainly through the use of hierarchical queues, whereas in the processing step, analysis algorithms based on recursive strategies are used to obtain efficient implementations.
}, url = {http://hdl.handle.net/10803/6878}, author = {Garrido, L.}, editor = {Salembier, P.} } @conference {cGiro-i-Nieto02, title = {MPEG-7 Descriptors for Earth Observation Satellites}, booktitle = {International Astronautical Congress}, year = {2002}, month = {09/2002}, pages = {1{\textendash}4}, publisher = {International Astronautical Federation}, organization = {International Astronautical Federation}, address = {Houston, Texas (USA)}, abstract = {The amount of digital multimedia information has experienced spectacular growth in recent years thanks to advances in digital image, video and audio acquisition systems. In response to the need to organize all this information, ISO/IEC has developed a new standard for multimedia content description called MPEG-7. Among other topics, MPEG-7 defines a set of multimedia descriptors that can be automatically generated using signal processing techniques. Earth Observation Satellites generate large quantities of images stored in enormous databases that can take advantage of the new standard. Automatic indexing of these images using MPEG-7 metadata can improve their content management as well as simplify interaction between independent databases. This paper gives an overall description of the MPEG-7 standard, focusing on the low-level Visual Descriptors. These descriptors can be grouped into four categories: color, texture, shape and motion. Visual Color Descriptors represent the color distribution of an image in terms of a specified color space. Visual Texture Descriptors define the visual pattern of an image according to its homogeneities and non-homogeneities. Visual Shape Descriptors describe the shape of 2D and 3D objects while being invariant to scaling, rotation and translation. Motion Descriptors give the essential characteristics of objects and camera motions.
These descriptors can be used individually or in combination to index and retrieve satellite images of the Earth from a database. For example, oceans and glaciers can be discerned based on their Color Descriptors, cities and deserts distinguished based on their Texture Descriptors, island images grouped using Shape Descriptors, and cyclone trajectories studied and compared using Motion Descriptors.
}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Marcello, J. and F. Eugenio} } @conference {cGasull02, title = {Oil Spills Detection in SAR Images using Mathematical Morphology}, booktitle = {11th European Signal Processing Conference (EUSIPCO 2002)}, year = {2002}, pages = {25{\textendash}28}, author = {Gasull, A. and F{\'a}bregas, F.X. and Jim{\'e}nez, J. and Marqu{\'e}s, F. and Moreno, V. and Herrero, M.} } @book {eSayrol02, title = {Senyals i sistemes anal{\`o}gics: una introducci{\'o} pr{\`a}ctica}, year = {2002}, isbn = {84-8301-610-9}, url = {http://www.edicionsupc.es}, author = {Elisa Sayrol and Gasull, A. and Moreno, A. and Salavedra, J. and Vallverdu, F.} } @article {aSalembier02, title = {Visual Segment Tree Creation for MPEG-7 Description Schemes}, journal = {Pattern recognition}, volume = {35}, number = {1}, year = {2002}, pages = {563{\textendash}579}, issn = {0031-3203}, author = {Salembier, P. and Llach, J. and Garrido, L.} } @conference {cPineda01, title = {Estudio de campos de golf mediante t{\'e}cnicas de segmentaci{\'o}n}, booktitle = {IX Congreso Nacional de Teledetecci{\'o}n}, year = {2001}, address = {Lleida, Spain}, author = {Pineda, N and Jorge, J and Garrido, L. and Salembier, P.} } @conference {cSayrol01a, title = {Graphical Study of Signals and Systems}, booktitle = {International Conference on Acoustics, Speech and Signal Processing ICASSP{\textquoteright}01}, year = {2001}, isbn = {0-7803-1775-0}, author = {Elisa Sayrol and Gasull, A. and Moreno, A. and Salavedra, J. and Vallverdu, F.} } @book {eSayrol01, title = {Senyals i sistemes anal{\`o}gics: una introducci{\'o} pr{\`a}ctica}, year = {2001}, isbn = {84-8301-510-2}, url = {http://www.edicionsupc.es}, author = {Elisa Sayrol and Gasull, A. and Moreno, A. and Salavedra, J. and Vallverdu, F.} } @conference {cSchelkens00, title = {3D compression of medical data based on cube-splitting and embedded block coding}, booktitle = {ProRISC/IEEE Workshop}, year = {2000}, month = {12/2000}, address = {Veldhoven, The Netherlands}, author = {Schelkens, P. and Xavier Gir{\'o}-i-Nieto and Barbarien, J. and Cornelis, J.} } @conference {cGasull00, title = {619 - un ejemplo de dise{\~n}o del laboratorio para asignaturas con cr{\'e}ditos te{\'o}ricos y pr{\'a}cticos}, booktitle = {VIII Congreso Universitario de Innovaci{\'o}n Educativa en las Ense{\~n}anzas T{\'e}cnicas}, year = {2000}, pages = {233{\textendash}240}, isbn = {84-7585-380-3}, author = {Gasull, A. and Moreno, A. and Salavedra, J. and Elisa Sayrol and Vallverdu, F.} } @conference {cEugenio00a, title = {Accurate and automatic NOAA-AVHRR image navigation using a global contour matching approach}, booktitle = {International Geoscience and Remote Sensing Symposium}, year = {2000}, pages = {639{\textendash}642}, isbn = {0-7803-6362-0}, author = {F. Eugenio and Marqu{\'e}s, F. and G{\'o}mez, L. and Suarez, E. and Rovaris, E.} } @article {aSalembier00, title = {Binary partition tree as an efficient representation for image processing, segmentation and information retrieval}, journal = {IEEE transactions on image processing}, volume = {9}, number = {4}, year = {2000}, pages = {561{\textendash}576}, abstract = {This paper discusses the interest of binary partition trees as a region-oriented image representation. Binary partition trees concentrate in a compact and structured representation a set of meaningful regions that can be extracted from an image.
They offer a multiscale representation of the image and define a translation invariant 2-connectivity rule among regions. As shown in this paper, this representation can be used for a large number of processing goals such as filtering, segmentation, information retrieval and visual browsing. Furthermore, the processing of the tree representation leads to very efficient algorithms. Finally, for some applications, it may be interesting to compute the binary partition tree once and to store it for subsequent use for various applications. In this context, the paper shows that the amount of bits necessary to encode a binary partition tree remains moderate.
Many medical data acquisition devices and multispectral imaging techniques produce three-dimensional image data. These images must be stored on devices with limited space or transmitted through limited-bandwidth channels. Compression techniques are an extremely valuable tool to reduce these expensive resource requirements.
However, compression techniques have so far been developed mainly for the more popular two-dimensional images. The classical approach to 3D compression splits the volumetric image into slices and applies a two-dimensional coding technique to each slice. This is clearly inefficient, because 2D techniques only exploit the image correlation along the X and Y axes. In volumetric images a new Z axis appears, whose correlation must also be exploited to achieve the best results.
The basis for all current image and video compression standards is DCT-based coding. These techniques split the image into NxN blocks and transform each block from the spatial domain into the DCT domain. Typical examples are first-generation coders, like JPEG, which produce a non-structured, unique bit-stream. This technique could easily be adapted to three dimensions by splitting the volume into NxNxN blocks and applying a 3D DCT. However, one encounters two problems. First, the DCT transform is lossy, and medical practice cannot tolerate any distortion that could lead to a faulty diagnosis. Secondly, contemporary transmission techniques make use of concepts like rate, quality and resolution scalability, features that are not fully supported by DCT techniques.
Coders using a wavelet transform as a front-end are good candidates to overcome these problems. They scan the bit-planes one by one to generate a structured bit-stream. This bit-stream can be truncated to give more or less quality or resolution, and such coders are classified as second-generation coders. A typical example of 3D wavelet coding is octave zero-tree based coding [Bil99, Xio99, Kim99, Kim00, Sch00a], which currently tends to deliver the best compression performance. However, it is difficult to control the bit-stream structure since it depends on the coder{\textquoteright}s data flow.
The new image compression standard JPEG2000 uses a third-generation technique, called EBCOT, which incorporates an abstract interface to enable reordering of the generated code packages. In this way a fully controllable bit-stream structure is achieved. For example, the bit-stream can be organized so that resolution or quality scalability is supported. The current verification model (VM7.0) of JPEG2000, however, does not include three-dimensional coding. The only support given for multidimensional and/or multi-spectral images is the possibility of executing a wavelet transform along the component axis. Unfortunately, the code supporting this feature was still buggy at the time this document was written.
Adapting this third-generation coding technique to a three-dimensional environment was the aim of this thesis. The input volume is transformed into the wavelet domain with the 3D wavelet front-end described and implemented by Schelkens et al. [Sch00a] and Barbarien [Joeri{\textquoteright}s thesis]. It is then coded by a hybrid technique combining Cube-Splitting with a JPEG2000 EBCOT module modified to support the third dimension. The Cube-Splitting module codes large zero-volumes very efficiently, while the EBCOT coder is responsible for coding the (sub)volumes containing significant samples. Hence, the implemented coder is called CS-EBCOT.
}, keywords = {coding, volumetric coding}, author = {Xavier Gir{\'o}-i-Nieto} } @conference {cGasull99, title = {Editor gr{\'a}fico de figuras MATLAB}, booktitle = {III Congreso de Usuarios de MATLAB}, year = {1999}, pages = {219{\textendash}227}, isbn = {84-699-1358-1}, author = {Gasull, A. and Elisa Sayrol and Moreno, A. and Vallverdu, F. and Salavedra, J. and Albert Oliveras} } @article {pMarques99a, title = {Partition Decoding Method and Device}, number = {994017614}, year = {1999}, month = {07/1999}, type = {Invention patent}, address = {International}, issn = {994017614}, author = {Marqu{\'e}s, F. and Gomila, C. and Gasull, A.} } @conference {cGarrido99, title = {Representing and retrieving regions using binary partition trees}, booktitle = {1999 IEEE International Conference on Image Processing, ICIP 1999}, year = {1999}, address = {Kobe, Japan}, isbn = {0-7803-5470-2}, author = {Garrido, L. and Salembier, P. and Casas, J.} } @conference {cSayrol99a, title = {Simulaci{\'o}n digital de se{\~n}ales y sistemas anal{\'o}gicos}, booktitle = {III Congreso de Usuarios de MATLAB}, year = {1999}, pages = {67{\textendash}76}, isbn = {84-699-1358-1}, author = {Sayrol E. and Gasull, A. and Moreno, A. and Vallverdu, F. and Salavedra, J. and Albert Oliveras} } @article {aSalembier98, title = {Antiextensive connected operators for image and sequence processing}, journal = {IEEE transactions on image processing}, volume = {7}, number = {4}, year = {1998}, pages = {555{\textendash}570}, abstract = {This paper deals with a class of morphological operators called connected operators. These operators filter the signal by merging its flat zones. As a result, they do not create any new contours and are very attractive for filtering tasks where the contour information has to be preserved. This paper shows that connected operators work implicitly on a structured representation of the image made of flat zones. The max-tree is proposed as a suitable and efficient structure to deal with the processing steps involved in antiextensive connected operators. A formal definition of the various processing steps involved in the operator is proposed and, as a result, several lines of generalization are developed. First, the notion of connectivity and its definition are analyzed. Several modifications of the traditional approach are presented. They lead to connected operators that are able to deal with texture. They also allow the definition of connected operators with less leakage than the classical ones. Second, a set of simplification criteria are proposed and discussed. They lead to simplicity-, entropy-, and motion-oriented operators. The problem of using a nonincreasing criterion is analyzed. Its solution is formulated as an optimization problem that can be very efficiently solved by a Viterbi (1979) algorithm. Finally, several implementation issues are discussed showing that these operators can be very efficiently implemented.
}, issn = {1057-7149}, doi = {10.1109/83.663500}, url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?arnumber=663500}, author = {Salembier, P. and Albert Oliveras and Garrido, L.} } @conference {cSalembier98, title = {Auto-dual connected operators based on iterative merging algorithms}, booktitle = {International Symposium on Mathematical Morphology and its applications to image and signal processing, ISMM 1998}, year = {1998}, pages = {183{\textendash}190}, address = {Amsterdam, The Netherlands}, isbn = {1522-4880}, author = {Salembier, P. and Garrido, L. and Garc{\'\i}a, D.} } @conference {cSalembier98b, title = {Binary partition tree as an efficient representation for filtering, segmentation and information retrieval}, booktitle = {IEEE International Conference on Image Processing, ICIP 1998}, year = {1998}, address = {Chicago (IL), USA}, author = {Salembier, P. and Garrido, L.} } @conference {cSalembier98a, title = {Connected operators for sprite creation and layered representation of image sequences}, booktitle = {9th European Signal Processing Conference, EUSIPCO 1998}, year = {1998}, pages = {2105{\textendash}2108}, address = {Rhodes, Greece}, isbn = {978-1-4503-0159-6}, author = {Salembier, P. and Pujol, O. and Garrido, L.} } @article {aGarrido98, title = {Extensive Operators in Partition Lattices for Image Sequence Analysis}, journal = {Signal processing}, volume = {66}, number = {2}, year = {1998}, pages = {157{\textendash}180}, issn = {0165-1684}, author = {Garrido, L. and Salembier, P. and Garc{\'\i}a, D.} } @article {aMarques98, title = {Prediction of image partitions using Fourier descriptors: application to segmentation-based coding schemes}, journal = {IEEE transactions on image processing}, volume = {7}, number = {4}, year = {1998}, pages = {529{\textendash}542}, issn = {1057-7149}, author = {Marqu{\'e}s, F. and Llorens, B. and Gasull, A.} } @conference {cGarrido98, title = {Region-based analysis of video sequences with a general merging algorithm}, booktitle = {9th European Signal Processing Conference, EUSIPCO 1998}, year = {1998}, pages = {1693{\textendash}1696}, address = {Rhodes, Greece}, isbn = {960-7620-06-4}, author = {Garrido, L. and Salembier, P.} } @conference {cVilaplana98, title = {Region-based segmentation and tracking of human faces}, booktitle = {9th European Signal Processing Conference, EUSIPCO 1998}, year = {1998}, pages = {311{\textendash}314}, address = {Rhodes, Greece}, isbn = {960-7620-06-4}, author = {Ver{\'o}nica Vilaplana and Marqu{\'e}s, F. and Salembier, P. and Garrido, L.} } @conference {cGarrido97a, title = {Anti-extensive Connected Operators with Application to Image Sequences}, booktitle = {VII Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1997}, pages = {151{\textendash}156}, address = {Barcelona, Spain}, isbn = {84-922529-0-1}, author = {Garrido, L. and Salembier, P. and Albert Oliveras} } @conference {cGarrido97b, title = {A hierarchical technique for image sequence analysis}, booktitle = {Workshop on Image Analysis for Multimedia Application Services, WIAMIS{\textquoteright}97}, year = {1997}, pages = {13{\textendash}20}, address = {Louvain-la-Neuve, Belgium}, author = {Garrido, L. and Marqu{\'e}s, F. and M. Pard{\`a}s and Salembier, P. 
and Ver{\'o}nica Vilaplana} } @conference {cSalembier97, title = {Image sequence analysis and merging algorithms}, booktitle = {International Workshop on Very Low Bit-rate Video, VLBV{\textquoteright}97}, year = {1997}, pages = {1{\textendash}8}, address = {Link{\"o}ping, Sweden}, isbn = {0-7803-9752-5}, author = {Salembier, P. and Garrido, L. and Garc{\'\i}a, D.} } @conference {cGarrido97, title = {Motion analysis of image sequences using connected operators}, booktitle = {SPIE Visual Communications and Image Processing, VCIP{\textquoteright}97}, year = {1997}, pages = {546{\textendash}557}, address = {San Jose, CA, USA}, isbn = {0-8194-2435-8}, author = {Garrido, L. and Albert Oliveras and Salembier, P.} } @conference {cOliveras97, title = {Stereo image analysis using connected operators}, booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing, ICASSP 1997}, year = {1997}, pages = {3169{\textendash}3172}, address = {Munich, Germany}, isbn = {978-989-8109-05-7}, author = {Albert Oliveras and Garrido, L. and Salembier, P.} } @inbook {bSalembier96, title = {Coding of partition sequences}, booktitle = {Video coding: the second generation approach}, year = {1996}, pages = {125{\textendash}170}, publisher = {Kluwer}, organization = {Kluwer}, edition = {L. Torres and M. Kunt (Eds.)}, isbn = {0 7923 9680 4}, author = {Salembier, P. and Marqu{\'e}s, F. and Gasull, A.} } @article {pOliveras96, title = {Filtering Method and Corresponding Filtering System}, number = {96402925.0-}, year = {1996}, type = {Invention patent}, author = {Albert Oliveras and Salembier, P. and Garrido, L.} } @conference {cSalembier96b, title = {Motion connected operators for image sequences}, booktitle = {VIII European Signal Processing Conference, EUSIPCO{\textquoteright}96}, year = {1996}, pages = {1083{\textendash}1086}, address = {Trieste, Italy}, isbn = {84-600-9597-5}, author = {Salembier, P. and Albert Oliveras and Garrido, L.} } @article {aSayrol96, title = {Motion estimation using higher-order statistics}, journal = {IEEE transactions on image processing}, volume = {5}, number = {6}, year = {1996}, pages = {1077{\textendash}1084}, issn = {1057-7149}, author = {Elisa Sayrol and Gasull, A. and R. Fonollosa, Javier} } @conference {cMarques96a, title = {Partition coding using multigrid chain code and motion compensation}, booktitle = {IEEE INTERNATIONAL CONFERENCE ON IMAGE PROCESSING}, year = {1996}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cSayrol95a, title = {Estimation of motion parameters using HOS}, booktitle = {IEEE SIGNAL PROCESSING/ATHOS WORKSHOP ON HIGHER-ORDER STATISTICS}, year = {1995}, pages = {262{\textendash}265}, isbn = {1522-4880}, author = {Elisa Sayrol and Gasull, A. and R. Fonollosa, Javier} } @conference {cSayrol95, title = {Fourth-order statistics cost functions: applications to time delay estimation and image motion estimation}, booktitle = {VI SPANISH SYMPOSIUM ON PATTERN RECOGNITION AND IMAGE ANALYSIS}, year = {1995}, pages = {543{\textendash}548}, isbn = {978-1-4244-9564-1}, author = {Elisa Sayrol and Gasull, A. and R. Fonollosa, Javier} } @article {aSayrol95, title = {Image Restoration using the W-Slice Method}, journal = {IEEE transactions on image processing}, volume = {4}, number = {4}, year = {1995}, pages = {1174{\textendash}1181}, abstract = {We propose the use of higher order statistics (HOS)-based methods to address the problem of image restoration. 
The restoration strategy is based on the fact that the phase information of the original image and its HOS are not distorted by some types of blurring. The difficulties associated with the combination of 2-D signals and their HOS are reduced by means of the Radon transform. Two methods that apply the weight-slice algorithm over the projections are developed. Simulation results illustrate the performance of the proposed methods.
}, issn = {1057-7149}, author = {Elisa Sayrol and Gasull, A. and Nikias, C.} } @conference {cMarques95a, title = {Interpolation and extrapolation of image partitions using Fourier descriptors: application to segmentation-based coding schemes}, booktitle = {IEEE INTERNATIONAL CONFERENCE ON IMAGE PROCESSING{\textquoteright}95.}, year = {1995}, pages = {584{\textendash}587}, author = {Marqu{\'e}s, F. and Llorens, B. and Gasull, A.} } @article {aSalembier95, title = {Region-based video coding using mathematical morphology}, journal = {Proceedings of the IEEE}, volume = {83}, number = {6}, year = {1995}, pages = {843{\textendash}857}, issn = {0018-9219}, author = {Salembier, P. and Torres, L. and Meyer, F. and Gu, C.} } @conference {cMarques94a, title = {Hierarchical image sequence model for segmentation: application to region-based sequence coding}, booktitle = {VISUAL COMMUNICATION AND IMAGE PROCESSING}, year = {1994}, pages = {554{\textendash}563}, author = {Marqu{\'e}s, F. and Gasull, A. and Vera, V.} } @phdthesis {dSayrol94, title = {Higher-order statistics applications in image sequence processing}, year = {1994}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, url = {http://hdl.handle.net/10803/6950}, author = {Elisa Sayrol}, editor = {Gasull, A.} } @conference {cPardas94a, title = {Motion region overlapping for segmentation-based video coding}, booktitle = {International Conference on Image Processing, ICIP{\textquoteright}94}, year = {1994}, pages = {428{\textendash}431}, address = {Austin, Texas}, isbn = {0-7803-5470-2}, author = {M. Pard{\`a}s and Salembier, P. and Gonzalez, B.} } @conference {cGimeno94, title = {A new approach to texture coding using stochastic vector quantization}, booktitle = {IEEE International Conference on Image Processing{\textquoteright}94}, year = {1994}, pages = {119{\textendash}123}, abstract = {A new method for texture coding which combines 2-D linear prediction and stochastic vector quantization is presented in this paper. To encode a texture, a linear predictor is computed first. Next, a codebook following the prediction error model is generated and the prediction error is encoded with VQ, using an algorithm which takes into account the pixels surrounding the block being encoded. In the decoder, the error image is decoded first and then filtered as a whole, using the prediction filter. Hence, correlation between pixels is not lost from one block to another and a good reproduction quality can be achieved.
}, isbn = {0-8186-6950-0}, doi = {10.1109/ICIP.1994.413287}, author = {Gimeno, D. and Torres, L. and Casas, J.} } @conference {cMarques94b, title = {Recursive image sequence segmentation by hierarchical models}, booktitle = {12th IAPR International Conference on Pattern Recognition}, year = {1994}, pages = {523{\textendash}525}, isbn = {0-8186-6265-4}, author = {Marqu{\'e}s, F. and Vera, V. and Gasull, A.} } @conference {cMarques94, title = {Top-down 3D image sequence segmentation technique controlled by morphological}, booktitle = {EUSIPCO-94.}, year = {1994}, pages = {415{\textendash}418}, author = {Marqu{\'e}s, F. and Vera, V. and Gasull, A.} } @conference { cSalembier94, title = {Very low bit rate video coding using morphological segmentation and contour/texture motion compensation}, booktitle = {12th International Conference on Pattern Recognition, ICPR 1994}, year = {1994}, address = {Jerusalem, Israel}, author = {Salembier, P. and Gu, C. and M. Pard{\`a}s and Kunt, M} } @conference {cGasull93, title = {Character recognition and document analysis by morphological techniques}, booktitle = {MATHEMATICAL MORPHOLOGY}, year = {1993}, pages = {198{\textendash}203}, author = {Gasull, A. and Corbera, L. and Marqu{\'e}s, F.} } @conference {cMontolio93, title = {Character recognition and document analysis by morphological techniques}, booktitle = {WORKSHOP ON MATHEMATICAL MORPHOLOGY AND ITS APPLICATIONS TO SIGNAL PROCESSING}, year = {1993}, pages = {198{\textendash}203}, author = {Montolio, P. and Gasull, A. and Corbera, L. and Marqu{\'e}s, F.} } @conference {cSayrol93a, title = {Image analysis using higher-order statistics and the Radon transform}, booktitle = {IEEE SIGNAL PROCESSING WORKSHOP ON HIGH-ORDER-STATISTICS}, year = {1993}, pages = {76{\textendash}80}, author = {Elisa Sayrol and Nikias, C. and Gasull, A.} } @conference {cSayrol93, title = {Image analysis using higher-order statistics and the Radon transform}, booktitle = {WORKSHOP ON HIGHER-ORDER STATISTICS}, year = {1993}, author = {Elisa Sayrol and Gasull, A.} } @conference {cSalembier93f, title = {Morphological segmentation-based coding of image sequences}, booktitle = {IEEE European Conference on Circuits Theory and Design}, year = {1993}, pages = {1245{\textendash}1250}, address = {Davos, Switzerland}, isbn = {0-7803-5470-2}, author = {Salembier, P. and Torres, L. and M. Pard{\`a}s and Marqu{\'e}s, F. and HIERRO, P. and Gasull, A.} } @conference {cMarques93, title = {Shape and location coding for contour images}, booktitle = {PROC. OF THE 1993 PICTURE CODING SYMPOSIUM}, year = {1993}, pages = {61{\textendash}63}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cMarques93b, title = {Stochastic image model for segmentation. Application to image coding}, booktitle = {SCANDINAVIAN CONFERENCE ON IMAGE ANALYSIS}, year = {1993}, pages = {265{\textendash}272}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cMarques93a, title = {Unsupervised segmentation controlled by morphological contrast ext}, booktitle = {ICASSP}, year = {1993}, pages = {517{\textendash}520}, author = {Marqu{\'e}s, F. 
and Gasull, A.} } @inbook {bGasull92, title = {Analysis and optimization of the K-Means algorithm for remote sensing applications}, booktitle = {Pattern recognition and image analysis}, year = {1992}, pages = {0{\textendash}0}, isbn = {9810208812}, url = {http://cataleg.upc.edu/search~S1*cat/?searchtype=i\&searcharg=+9810208812\&searchscope=1\&SORT=D\&extended=0\&SUBMIT=Cerca\&searchlimits=\&searchorigarg=t+Proceedings+of++Workshop+on+Network+Robot+Systems+2009}, author = {Gasull, A. and Monte, E. and Torres, L. and Montolio, P. and Marqu{\'e}s, F.} } @conference {cGasull92c, title = {Coagulation time detection by means of a real-time image processing}, booktitle = {14th Annual International Conference of the IEEE Engineering in Medicine and Biology Society}, year = {1992}, pages = {1948{\textendash}1949}, author = {Gasull, A. and Vallverdu, F. and Marqu{\'e}s, F.} } @conference {cMarques92b, title = {Codificacion de imagenes: un metodo de segunda generacion}, booktitle = {VII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1992}, pages = {144{\textendash}148}, author = {Marqu{\'e}s, F. and Gasull, A.} } @article {aTorres92, title = {Compresi{\'o}n de audio e imagen para sistemas multimedia}, journal = {Bit numerical mathematics}, number = {76}, year = {1992}, pages = {50{\textendash}58}, issn = {0006-3835}, author = {Torres, L. and Moreno, A. and Masgrau, E. and Gasull, A.} } @conference {cCasas92a, title = {Fuzzy classification of Remote Sensing images: a pseudocolor representation of fuzzy partitions}, booktitle = {SPIE Neural and Stochastic Methods in Image and Signal Processing}, year = {1992}, month = {07/1992}, publisher = {SPIE}, organization = {SPIE}, address = {San Diego, CA}, doi = {10.1117/12.130844}, author = {Casas, J. and Hillion, A. and Roux, C. and Torres, L. and Gasull, A.} } @conference {cSalembier92b, title = {Morphological detection based on size and contrast criteria}, booktitle = {14th Annual International Conference of the IEEE Engineering in Medicine and Biology Society}, year = {1992}, pages = {1930{\textendash}1931}, address = {Paris, France}, author = {Salembier, P. and Gasull, A. and Marqu{\'e}s, F. and Elisa Sayrol} } @phdthesis {dMarques92, title = {Multiresolution image segmentation based on compound random fields: Application to image coding}, year = {1992}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, url = {http://hdl.handle.net/10803/6910}, author = {Marqu{\'e}s, F.}, editor = {Gasull, A.} } @conference {cGasull92, title = {Non-linear techniques for image interpolation}, booktitle = {VI European Signal Processing Conference}, year = {1992}, pages = {1473{\textendash}1476}, isbn = {0 444 89587 6}, author = {Gasull, A. and Marqu{\'e}s, F. and Torres, L.} } @conference {cMarques92c, title = {Segmentacion de imagenes multiespectrales con tecnicas piramidales}, booktitle = {VII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1992}, pages = {372{\textendash}376}, author = {Marqu{\'e}s, F. 
and Gasull, A.} } @conference {cMarques92a, title = {Segmentacion no supervisada de imagenes mediante campos aleatorios}, booktitle = {SIMPOSIUM NACIONAL DE RECONOCIMIENTO DE FORMAS Y ANALISIS DE IMAGENES}, year = {1992}, pages = {55{\textendash}62}, isbn = {3-8007-2300-X}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cGasull92b, title = {T{\'e}cnicas de preprocesado para la segmentaci{\'o}n de im{\'a}genes}, booktitle = {VII Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1992}, pages = {367{\textendash}371}, author = {Gasull, A. and Marqu{\'e}s, F. and Torres, L.} } @conference {cGasull92a, title = {T{\'e}cnicas de preprocesado para la segmentaci{\'o}n de im{\'a}genes}, booktitle = {U.R.S.I. 92}, year = {1992}, pages = {367{\textendash}371}, isbn = {84-600-8219-9}, author = {Gasull, A. and Marqu{\'e}s, F. and Montolio, P. and Torres, L.} } @conference {cCasas92, title = {Una interpretaci{\'o}n colorim{\'e}trica en clasificaciones fuzzy de im{\'a}genes de teledetecci{\'o}n}, booktitle = {V Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1992}, pages = {281{\textendash}287}, author = {Casas, J. and Torres, L. and Gasull, A.} } @conference {cGasull91, title = {Analisis de no estacionariedades en la interpolacion de imagenes}, booktitle = {VI Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1991}, pages = {985{\textendash}989}, isbn = {84-600-7766-7}, author = {Gasull, A. and Marqu{\'e}s, F. and Torres, L.} } @conference {cMarques91, title = {Coding-oriented segmentation based on G-M random}, booktitle = {International Conference on Acoustics, Speech and Signal Processing 1991}, year = {1991}, pages = {2749{\textendash}2752}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cMarques91b, title = {Obtencion de un esqueleto morfologico sin puntos redundantes}, booktitle = {VI Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1991}, pages = {980{\textendash}984}, author = {Marqu{\'e}s, F. and Gasull, A. and Torres, L.} } @conference {cMarques91a, title = {Segmentacion de imagenes mediante modelos de gibbs-markov}, booktitle = {VI Simposium Nacional de la Uni{\'o}n Cient{\'\i}fica Internacional de Radio}, year = {1991}, pages = {975{\textendash}979}, author = {Marqu{\'e}s, F. and Gasull, A.} } @conference {cGasull90a, title = {Analisis y optimizacion del algoritmo k-means aplicado a teledeteccion en ima}, booktitle = {IV Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1990}, pages = {17{\textendash}23}, author = {Gasull, A. and Torres, L.} } @article {aSanz90, title = {Aspectos Cl{\'\i}nicos-toxicol{\'o}gicos en los trabajadores de una industria productora de cromatos}, journal = {Annual review of pharmacology and toxicology}, volume = {7}, year = {1990}, pages = {1:13{\textendash}1:20}, issn = {0362-1642}, author = {Sanz, P. and Ribas, B. and Cobo, E. and Gadea, E. and Marqu{\'e}s, F. and Sol{\'e}, E. and Corbella, J.} } @conference {cGasull90, title = {Contour extraction and image preprocessing of echocardiographic images using r}, booktitle = {Latvian Signal Processing International Conference}, year = {1990}, pages = {26{\textendash}30}, author = {Gasull, A. and Torres, L.} } @conference {cGasull90b, title = {Eleccion de componentes principales para la clasific. 
no supervisada de image}, booktitle = {IV Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1990}, pages = {9{\textendash}16}, author = {Gasull, A. and Torres, L.} } @conference {cTorres90, title = {Temporal automatic edge detection of echocardiographic images}, booktitle = {ICASSP}, year = {1990}, pages = {2149{\textendash}2152}, isbn = {078030033}, author = {Torres, L. and Gasull, A.} } @conference {cGasull89a, title = {Automated left ventricular contour extraction and volume calculation from echocardiographic images}, booktitle = {Ultrasonics International 89}, year = {1989}, pages = {1{\textendash}4}, author = {Gasull, A. and Alquezar, R. and Torres, L. and Sallent, S. and Marqu{\'e}s, F. and Vidal, J. and Sangr{\'a}, E.} } @conference {cGasull89, title = {Automatic left ventricular contour for volume calculation}, booktitle = {Ultrasonics International 89}, year = {1989}, pages = {123{\textendash}126}, author = {Gasull, A. and Vazquez, G.} } @conference {cVazquez89, title = {Constant variance transversal filtering for adaptive channel equalization}, booktitle = {INTERNATIONAL CONFERENCE ON SONAR SIGNAL PROCESSING}, year = {1989}, pages = {212{\textendash}215}, isbn = {0-7923-9733-9}, author = {Vazquez, G. and Gasull, A. and Sanchez, J. and Lagunas, M.} } @conference {cSallent88, title = {Codificaci{\'o}n piramidal generalizada}, booktitle = {III Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1988}, pages = {1{\textendash}2}, isbn = {142440469X/1520-6149}, author = {Sallent, S. and Torres, L. and Gasull, A.} } @conference {cGasull88, title = {Detecci{\'o}n autom{\'a}tica de contornos en im{\'a}genes ecogr{\'a}ficas}, booktitle = {III Simposium Nacional de Reconocimiento de Formas y An{\'a}lisis de Im{\'a}genes}, year = {1988}, pages = {1{\textendash}4}, author = {Gasull, A. and Marqu{\'e}s, F. and Sallent, S. and Torres, L. and Vidal, J.} }