@article {ade-Mas-Gimenez23, title = {Gradient-Based Metrics for the Evaluation of Image Defogging}, journal = {World Electric Vehicle Journal}, volume = {14}, year = {2023}, month = {09/2023}, chapter = {254}, abstract = {

Fog, haze, and smoke are common atmospheric phenomena that dramatically compromise the overall visibility of a scene, critically affecting features such as the illumination, contrast, and contour detection of objects. The decrease in visibility compromises the performance of computer vision algorithms such as pattern recognition and segmentation, some of which are highly relevant to decision-making in the field of autonomous vehicles. Several dehazing methods have been proposed that either need to estimate fog parameters through physical models or are statistically based. However, physical parameters depend greatly on the scene conditions, and statistically based methods require large datasets of natural foggy images together with the original images without fog, i.e., the ground truth, for evaluation. Obtaining proper fog-less ground truth images for pixel-to-pixel evaluation is costly and time-consuming, and this hinders progress in the field. This paper aims to tackle this issue by proposing gradient-based metrics for image defogging evaluation that require neither a ground truth image without fog nor a physical model. A comparison of the proposed metrics with the metrics already used in the NTIRE 2018 defogging challenge, as well as with several state-of-the-art defogging evaluation metrics, is performed to prove their effectiveness in a general situation, showing results comparable to conventional metrics and an improvement in the no-reference scenario. A Matlab implementation of the proposed metrics has been developed and is open-sourced in a public GitHub repository.

}, doi = {10.3390/wevj14090254}, url = {https://www.mdpi.com/2032-6653/14/9/254}, author = {Gerard de-Mas-Gim{\'e}nez and Pablo Garc{\'\i}a-G{\'o}mez and Casas, J. and S. Royo} } @phdthesis {dMosella-Montoro23, title = {Graph Convolutional Neural Networks for 3D Data Analysis}, volume = {Phd}, year = {2023}, school = {Universitat Polit{\`e}cnica de Catalunya}, address = {Barcelona}, abstract = {

Deep Learning allows the extraction of complex features directly from raw input data, eliminating the need for hand-crafted features from the classical Machine Learning pipeline. This new paradigm brought a boost in performance across several domains, including computer vision, natural language processing and audio processing. However, there are still challenges when dealing with unorganized structures. This thesis addresses this challenge using Graph Convolutional Neural Networks, a new set of techniques capable of managing graph structures that can be used for processing 3D data.

The first part of the thesis focuses on the Graph Analysis task, in which we study the capabilities of Graph Convolutional Neural Networks to capture the intrinsic geometric information of 3D data. We propose the Attention Graph Convolution layer, which learns to infer the kernel used during the convolution, taking into account the particularities of each neighbourhood of the graph. We explore two variants of the Attention Graph Convolution layer, one that follows a residual approach and another that allows the convolution to combine different neighbourhood domains. Furthermore, we propose a set of 3D pooling layers that mimic the behaviour of the pooling layers found in common 2D Convolutional Neural Network architectures. Finally, we present a 2D-3D Fusion block capable of merging the 3D geometric information obtained from a Graph Convolutional Neural Network with the texture information obtained by a 2D Convolutional Neural Network. We evaluate the presented contributions on the RGB-D Scene Classification task.

The second part of this thesis focuses on the Node Analysis task, which consists of extracting features at the node level, taking into account the neighbourhood structure. We present the Multi-Aggregator Graph Convolution layer, which uses a multiple-aggregator approach to generalize better to unseen topologies and learn better local representations. In addition, it reduces the memory footprint with respect to the Attention Graph Convolution layer. Finally, we analyze the capabilities of our proposed Graph Convolution layers to deal with heterogeneous graphs, where the nodes of the graph may belong to different modalities. We evaluate the presented contributions on the Computer Graphics task of skinning a character mesh. Specifically, we propose a Two-Stream Graph Neural Network capable of predicting the skinning weights of a 3D character.

}, url = {http://hdl.handle.net/10803/689400}, author = {Mosella-Montoro, Albert}, editor = {Ruiz-Hidalgo, J.} } @conference {cMas-Montserrat, title = {Generative Moment Matching Networks for Genotype Simulation}, booktitle = {44th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC{\textquoteright}22)}, year = {2022}, abstract = {

The generation of synthetic genomic sequences using neural networks has the potential to overcome privacy and data sharing restrictions and to mitigate potential bias within datasets due to the under-representation of some population groups. However, there is no consensus on which architectures, training procedures, and evaluation metrics should be used when simulating single nucleotide polymorphism (SNP) sequences with neural networks. In this paper, we explore the use of Generative Moment Matching Networks (GMMNs) for SNP simulation, we present some architectural and procedural changes to properly train the networks, and we introduce an evaluation scheme to qualitatively and quantitatively assess the quality of the simulated sequences.

}, author = {Mas-Montserrat, Daniel and Perera, Maria and Barrab{\'e}s, M{\'\i}riam and Geleta, Margarita and Xavier Gir{\'o}-i-Nieto and Ioannidis, Alexander G.} } @mastersthesis {xTarres21, title = {GAN-based Image Colourisation with Feature Reconstruction Loss}, year = {2021}, abstract = {

Automatic image colourisation is a complex and ambiguous task due to having multiple correct solutions. Previous approaches have produced desaturated results unless they relied on significant user interaction. In this thesis we study the state of the art for colourisation and propose an automatic colourisation approach based on generative adversarial networks that incorporates a feature reconstruction loss during training. The generative network is framed in an adversarial model that learns how to colourise by incorporating a perceptual understanding of colour. Qualitative and quantitative results show the capacity of the proposed method to colourise images in a realistic way, boosting the colourfulness and perceptual realism of previous GAN-based methodologies. We also study and propose a second approach that incorporates segmentation information in the GAN framework, and report quantitative and qualitative results.

}, author = {Laia Tarr{\'e}s}, editor = {Mrak, Marta and Xavier Gir{\'o}-i-Nieto} } @conference {cTarres, title = {GAN-based Image Colourisation with Feature Reconstruction Loss}, booktitle = {CVPR 2021 Women in Computer Vision Workshop}, year = {2021}, month = {06/2021}, address = {Virtual}, abstract = {

Image colourisation is the task of adding plausible colour to grayscale images. This transformation requires obtaining a three-dimensional colour-valued mapping from a real-valued grayscale image, which leads to an underdetermined problem because the grayscale semantics and texture provide cues for multiple possible colour mappings. The goal of image colourisation is not necessarily to recover the ground truth colour, but to produce an output that is perceived as natural by a human observer. Our work takes as a baseline a scheme based on an end-to-end trainable convolutional neural network (CNN) trained with a smooth L1 loss to predict the $ab$ channels of a colour image given the $L$ channel. We introduce an extra perceptual reconstruction loss during training to improve the capabilities of the adversarial model that we adopt as a baseline.

}, author = {Laia Tarr{\'e}s and G{\'o}rriz, Marc and Xavier Gir{\'o}-i-Nieto and Mrak, Marta} } @mastersthesis {xKazakos20, title = {Generation of Synthetic Referring Expressions for Object Segmentation in Videos}, year = {2020}, abstract = {

Integrating computer vision with natural language processing has achieved significant progress over the last years owing to the continuous evolution of deep learning. A novel vision and language task, which is tackled in the present Master thesis, is referring video object segmentation, in which a language query defines which instance to segment from a video sequence. One of the biggest challenges for this task is the lack of relatively large annotated datasets, since a tremendous amount of time and human effort is required for annotation. Moreover, existing datasets suffer from poor-quality annotations, in the sense that approximately one out of ten language expressions fails to uniquely describe the target object.

The purpose of the present Master thesis is to address these challenges by proposing a novel method for generating synthetic referring expressions for an image (video frame). This method produces synthetic referring expressions by using only the ground-truth annotations of the objects as well as their attributes, which are detected by a state-of-the-art object detection deep neural network. One of the advantages of the proposed method is that its formulation allows its application to any object detection or segmentation dataset.

By using the proposed method, the first large-scale dataset with synthetic referring expressions for video object segmentation is created, based on an existing large benchmark dataset for video instance segmentation. A statistical analysis and comparison of the created synthetic dataset with existing ones is also provided in the present Master thesis.

The conducted experiments on three different datasets used for referring video object segmentation prove the efficiency of the generated synthetic data. More specifically, the obtained results demonstrate that by pre-training a deep neural network with the proposed synthetic dataset, one can improve the ability of the network to generalize across different datasets, without any additional annotation cost.

}, author = {Kazakos, Ioannis}, editor = {Xavier Gir{\'o}-i-Nieto} } @article {aGarcia-Gomez20, title = {Geometric Model and Calibration Method for a Solid-State LiDAR}, journal = {Sensors}, volume = {20}, year = {2020}, month = {05/2020}, pages = {2898}, abstract = {

This paper presents a novel calibration method for solid-state LiDAR devices based on a geometrical description of their scanning system, which has variable angular resolution. Determining this distortion across the entire Field-of-View of the system yields accurate and precise measurements, enabling the device to be combined with other sensors. On the one hand, the geometrical model is formulated using the well-known Snell{\textquoteright}s law and the intrinsic optical assembly of the system; on the other hand, the proposed method describes the scanned scenario with an intuitive camera-like approach relating pixel locations to scanning directions. Simulations and experimental results show that the model fits real devices and that the calibration procedure accurately maps their variant resolution, so undistorted representations of the observed scenario can be provided. Thus, the calibration method proposed in this work is applicable and valid for existing scanning systems, improving their precision and accuracy by an order of magnitude.

}, keywords = {solid-state LiDAR; LiDAR calibration; distortion correction; FOV mapping}, issn = {1424-8220}, doi = {10.3390/s20102898}, url = {https://www.mdpi.com/1424-8220/20/10/2898}, author = {Pablo Garc{\'\i}a-G{\'o}mez and S. Royo and Noel Rodrigo and Casas, J.} } @article {9103248, title = {Grounded Sequence to Sequence Transduction}, journal = {IEEE Journal of Selected Topics in Signal Processing}, volume = {14}, year = {2020}, month = {05/2020}, pages = {577-591}, abstract = {

Speech recognition and machine translation have made major progress over the past decades, providing practical systems to map one language sequence to another. Although multiple modalities such as sound and video are becoming increasingly available, the state-of-the-art systems are inherently unimodal, in the sense that they take a single modality {\textemdash} either speech or text {\textemdash} as input. Evidence from human learning suggests that additional modalities can provide disambiguating signals crucial for many language tasks. In this article, we describe the How2 dataset, a large, open-domain collection of videos with transcriptions and their translations. We then show how this single dataset can be used to develop systems for a variety of language tasks and present a number of models meant as starting points. Across tasks, we find that building multimodal architectures that perform better than their unimodal counterparts remains a challenge. This leaves plenty of room for the exploration of more advanced solutions that fully exploit the multimodal nature of the How2 dataset, and the general direction of multimodal learning with other datasets as well.

}, author = {L. Specia and L. Barrault and O. Caglayan and Amanda Duarte and D. Elliott and S. Gella and N. Holzenberger and C. Lala and S. J. Lee and J. Libovicky and P. Madhyastha and F. Metze and K. Mulligan and A. Ostapenko and S. Palaskar and R. Sanabria and J. Wang and R. Arora} } @mastersthesis {xCaros, title = {A Generative Dialogue System for Reminiscence Therapy}, year = {2019}, abstract = {

With people living longer than ever, the number of cases of neurodegenerative diseases such as Alzheimer{\textquoteright}s or of cognitive impairment increases steadily. In Spain more than 1.2 million patients are affected, and it is estimated that by 2050 more than 100 million people will be affected. While there are no effective treatments for this terminal disease, therapies such as reminiscence, which stimulate memories of the patient{\textquoteright}s past, are recommended, as they encourage communication and produce mental and emotional benefits for the patient. Currently, reminiscence therapy takes place in hospitals or residences, where the therapists are located. Since people who receive this therapy are old and may have mobility difficulties, we present an AI solution to guide older adults through reminiscence sessions using their laptop or smartphone.

Our solution consists of a generative dialogue system composed of two deep learning architectures that recognize image and text content. An Encoder-Decoder with Attention is trained to generate questions from photos provided by the user; it is composed of a pretrained Convolutional Neural Network to encode the picture and a Long Short-Term Memory network to decode the image features and generate the question. The second architecture is a sequence-to-sequence model that provides feedback to engage the user in the conversation.

The experiments show that we obtain the best performance by training the dialogue model with the Persona-Chat dataset and fine-tuning it with the Cornell Movie-Dialogs dataset. Finally, we integrate Telegram as the interface for the user to interact with Elisabot, our trained conversational agent.

}, author = {Caros, Mariona}, editor = {Radeva, Petia and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xRos18, title = {Generative Adversarial Networks for Anomaly Detection in Images}, year = {2018}, abstract = {

Anomaly detection is used to identify abnormal observations that don{\textquoteright}t follow a normal pattern. In this work, we use the power of Generative Adversarial Networks in sampling from image distributions to perform anomaly detection with images and to identify local anomalous segments within these images. We also explore the potential application of this method to support the pathological analysis of biological tissues.

}, author = {Guillem Batiste}, editor = {Ver{\'o}nica Vilaplana} } @conference {cLin16, title = {Graph based Dynamic Segmentation of Generic Objects in 3D}, booktitle = {CVPR SUNw: Scene Understanding Workshop}, year = {2016}, month = {06/2016}, address = {Las Vegas, US}, abstract = {

We propose a novel 3D segmentation method for RGBD stream data to deal with the 3D object segmentation task in a generic scenario with frequent object interactions. The method makes two main contributions while being generic and not requiring initialization: first, a novel tree structure representation for the point cloud of the scene is proposed; second, a dynamic management mechanism for connected component splits and merges exploits this tree structure representation.

}, url = {http://sunw.csail.mit.edu/posters.html}, author = {X. Lin and Casas, J. and M. Pard{\`a}s} } @article {aAlcoverro13, title = {Gesture Control Interface for immersive panoramic displays}, journal = {Multimedia Tools and Applications}, year = {2013}, month = {07/2013}, pages = {1-27}, abstract = {

In this paper, we propose a gesture-based interface designed to interact with panoramic scenes. The system combines novel static gestures with a fast hand tracking method. Our proposal is to use static gestures as shortcuts to activate functionalities of the system (i.e. volume up/down, mute, pause, etc.), and hand tracking to freely explore the panoramic video. The overall system is multi-user and incorporates a user identification module based on face recognition, which is able both to recognize returning users and to add new users online. The system exploits depth data, making it robust to challenging illumination conditions. We show through experimental results the performance of every component of the system compared to the state of the art. We also show the results of a usability study performed with several untrained users.

}, issn = {1380-7501}, doi = {10.1007/s11042-013-1605-7}, author = {Alcoverro, M. and Suau, X. and Morros, J.R. and L{\'o}pez-M{\'e}ndez, A. and A. Gil-Moreno and Ruiz-Hidalgo, J. and Casas, J.} } @conference {cKochale13, title = {Gesture controlled interactive rendering in a panoramic scene}, booktitle = {European Interactive TV Conference, EuroITV}, year = {2013}, month = {06/2013}, address = {Como, Italy}, url = {http://upcommons.upc.edu/e-prints/handle/2117/20470}, author = {Kochale, A. and Ruiz-Hidalgo, J. and M. Borsum} } @conference {cZoric13, title = {Gesture Interaction with Rich TV Content in the Social Setting}, booktitle = {Exploring and Enhancing the User Experience for Television, Workshop of ACM SIGCHI Conference on Human Factors in Computing Systems, CHI{\textquoteright}13}, year = {2013}, month = {04/2013}, address = {Paris, France}, abstract = {

The appearance of new immersive TV content has increased the interactive possibilities presented to viewers. Increased interactivity is seen as a valuable feature for viewing richer television content, but new functionalities are limited by what can be done naturally and intuitively using available devices like remote controls. Therefore, new interaction techniques, such as visual gesture control systems, have appeared, aiming to enhance the viewing experience. In this work we begin uncovering the potential and challenges of gesture interaction with ultra high definition video for people watching TV together. As a first step, we have conducted a study with a group of people interacting with such content using a gesture-based system in the home environment.

}, url = {http://livingroomexperience.wikispaces.com/}, author = {Zoric, Goranka and Engstr{\"o}m, Arvid and Barkhuus, Louise and Ruiz-Hidalgo, J. and Kochale, A.} } @article {aBosio12, title = {Gene Expression Data Classification Combining Hierarchical Representation and Efficient Feature Selection}, journal = {Journal of Biological Systems}, volume = {20}, year = {2012}, pages = {349-375}, abstract = {

A general framework for microarray data classification is proposed in this paper. It produces precise and reliable classifiers through a two-step approach. First, the original feature set is enhanced by a new set of features called metagenes. These new features are obtained through a hierarchical clustering process on the original data. Two different metagene generation rules have been analyzed, called Treelets clustering and Euclidean clustering. Metagene creation is attractive for several reasons: first, metagenes can improve classification since they broaden the available feature space and capture the common behavior of similar genes, reducing the residual measurement noise. Furthermore, by analyzing some of the metagenes chosen for classification with gene set enrichment analysis algorithms, it is shown how metagenes can summarize the behavior of functionally related probe sets. Additionally, metagenes can point out still-undocumented, highly discriminant probe sets that are numerically related to other probes endowed with prior biological information, contributing to the knowledge discovery process.

The second step of the framework is feature selection, which applies the Improved Sequential Floating Forward Selection (IFFS) algorithm to properly choose a subset for classification from the available feature set composed of genes and metagenes. Considering the microarray sample scarcity problem, a reliability measure is introduced alongside the classical error rate to improve the feature selection process. Different scoring schemes are studied to choose the best one using both error rate and reliability. The Linear Discriminant Analysis (LDA) classifier has been used throughout this work due to its good characteristics, but the proposed framework can be used with almost any classifier. The potential of the proposed framework has been evaluated by analyzing all the publicly available datasets offered by the MicroArray Quality Control study, phase II (MAQC-II). The comparative results showed that the proposed framework can compete with a wide variety of state-of-the-art alternatives and can obtain the best mean performance if a particular setup is chosen. A Monte Carlo simulation confirmed that the proposed framework obtains stable and repeatable results.

}, doi = {10.1142/S0218339012400025}, url = {http://www.worldscientific.com/doi/abs/10.1142/S0218339012400025}, author = {Bosio, M. and P. Bellot and Salembier, P. and Albert Oliveras} } @article {aGiro-i-Nieto10, title = {GAT, a Graphical Annotation Tool for semantic regions}, journal = {Multimedia tools and applications}, volume = {46}, number = {2/3 (2010)}, year = {2010}, pages = {155{\textendash}174}, abstract = {

This article presents GAT, a Graphical Annotation Tool based on a region-based hierarchical representation of images. The proposed solution uses Partition Trees to navigate through the image segments, which are automatically defined at different spatial scales. Moreover, the system focuses on navigation through ontologies for a semantic annotation of objects and of the parts that compose them. The tool has been designed under usability criteria to minimize user interaction by trying to predict the future selection of regions and semantic classes. The implementation uses MPEG-7/XML input and output data to allow interoperability with any type of Partition Tree. The tool is publicly available and its source code can be downloaded under a free software license.

}, issn = {1380-7501}, doi = {10.1007/s11042-009-0389-2}, url = {http://www.springerlink.com/content/j78782k762617352/}, author = {Xavier Gir{\'o}-i-Nieto and Camps, N. and Marqu{\'e}s, F.} } @phdthesis {dRolon10, title = {Generalized Lifting for Sparse Image Representation and Coding}, year = {2010}, school = {Universitat Polit{\`e}cnica de Catalunya (UPC)}, type = {phd}, author = {Rol{\'o}n, J.}, editor = {Salembier, P.} } @article {xCortes10, title = {GOS: b{\'u}squeda visual de im{\'a}genes}, number = {25}, year = {2010}, pages = {36{\textendash}44}, keywords = {i3media}, issn = {1698-7047}, url = {http://upcommons.upc.edu/e-prints/urlFiles?idDrac=2251008}, author = {Cort{\'e}s, S.}, editor = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @conference {cRolon09a, title = {Generalized lifting with adaptive local pdf estimation for image coding}, booktitle = {Picture coding symposium, PCS 2009}, year = {2009}, address = {Chicago, USA}, author = {Rol{\'o}n, J. and Mendon{\c c}a, E. and Salembier, P.} } @conference {cFrias-Velazquez09, title = {Gray-scale erosion algorithm based on image bitwise decomposition: application to focal plane processors}, booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing 2009}, year = {2009}, pages = {845{\textendash}848}, doi = {10.1109/ICASSP.2009.4959716}, url = {http://hdl.handle.net/2117/9156}, author = {Frias-Velazquez, A. and Morros, J.R.} } @conference {cCalderero08a, title = {General region merging approaches based on information theory statistical measures}, booktitle = {IEEE International Conference on Image Processing}, year = {2008}, pages = {3016{\textendash}3019}, isbn = {1-4244-1764-3}, author = {Calderero, F. and Marqu{\'e}s, F.} } @conference {cCalderero08, title = {General Region Merging Based on First Order Markov Information Theory Statistical Measures}, booktitle = {16th European Signal Processing Conference}, year = {2008}, author = {Calderero, F. and Marqu{\'e}s, F.} } @conference {cRolon07, title = {Generalized Lifting For Sparse Image Representation and Coding}, booktitle = {Picture Coding Symposium, PCS 2007}, year = {2007}, pages = {234{\textendash}238}, address = {Lisbon, Portugal}, isbn = {88-86179-83-9}, author = {Rol{\'o}n, J. and Salembier, P.} } @article {aSole07a, title = {Generalized lifting prediction optimization applied to lossless image compression}, journal = {IEEE signal processing letters}, volume = {14}, number = {10}, year = {2007}, pages = {695{\textendash}698}, issn = {1070-9908}, author = {Sol{\'e}, J. and Salembier, P.} } @conference {cDimiccoli07a, title = {Geometrical Filtering Scheme with Connected Operators and Image Inpainting}, booktitle = {SPIE Visual Communcations and Image Processing 2007}, year = {2007}, pages = {1{\textendash}14}, address = {San Jose, CA, USA}, author = {Dimiccoli, M. and Salembier, P.} } @conference {cDorea06, title = {Generation of long-term color and motion coherent partitions}, booktitle = {International Conference on Image Processing}, year = {2006}, pages = {581{\textendash}584}, author = {Dorea, C. and M. Pard{\`a}s and Marqu{\'e}s, F.} } @conference {cVallverdu02, title = {Graphical study of signals and systems}, booktitle = {14th annual World Conference on Educational Multimedia, Hypermedia \& Telecommunications}, year = {2002}, isbn = {0-8186-7919-0}, author = {Vallverdu, F. and Elisa Sayrol and Gasull, A. and Salavedra, J. 
and Moreno, A.} } @conference {cSayrol01a, title = {Graphical Study of Signals and Systems}, booktitle = {International Conference on Acoustics, Speech and Signal Processing ICASSP{\textquoteright}01}, year = {2001}, isbn = {0-7803-1775-0}, author = {Elisa Sayrol and Gasull, A. and Moreno, A. and Salavedra, J. and Vallverdu, F.} } @article {aMarques97, title = {General requirements for coding oriented segmentation of video sequences}, journal = {Annales des t{\'e}lecommunications. Annals of telecommunications}, volume = {52}, number = {7-8}, year = {1997}, pages = {359{\textendash}366}, issn = {0003-4347}, author = {Marqu{\'e}s, F. and Meyer, F. and M. Pard{\`a}s and Salembier, P.} } @conference {cOliveras96, title = {Generalized connected operators}, booktitle = {SPIE Visual Communication and Image Processing, VCIP{\textquoteright}96}, year = {1996}, pages = {2727{\textendash}2771}, address = {Orlando, Florida, USA}, author = {Albert Oliveras and Salembier, P.} }