@article {aPardas23, title = {Stromal tissue segmentation in Ki67 histology images based on cytokeratin-19 stain translation}, journal = {JOURNAL OF MEDICAL IMAGING}, volume = {10}, year = {2023}, month = {06/2023}, abstract = {

Purpose

The diagnosis and prognosis of breast cancer rely on histopathology image analysis. In this context, proliferation markers, especially Ki67, are increasingly important. The diagnosis using these markers is based on the quantification of proliferation, which involves counting Ki67-positive and Ki67-negative tumoral cells in epithelial regions, thus excluding stromal cells. However, stromal cells are often very difficult to distinguish from negative tumoral cells in Ki67 images, which frequently leads to errors in automatic analysis.

Approach

We study the use of automatic semantic segmentation based on convolutional neural networks (CNNs) to separate stromal and epithelial areas in Ki67-stained images. CNNs need to be accurately trained on extensive databases with associated ground truth. As such databases are not publicly available, we propose a method to produce them with minimal manual labelling effort. Inspired by the procedure used by pathologists, we have produced the database relying on knowledge transfer from cytokeratin-19 images to Ki67 using an image-to-image (I2I) translation network.

Results

The automatically produced stroma masks are manually corrected and used to train a CNN that predicts very accurate stroma masks for unseen Ki67 images. An F-score of 0.87 is achieved. Examples of the effect on the Ki67 score show the importance of stroma segmentation.

Conclusions

An I2I translation method has proved very useful for building ground-truth labels in a task where manual labeling is infeasible. With reduced correction effort, a dataset can be built to train neural networks for the difficult problem of separating epithelial regions from stroma in stained images, where separation is very hard without additional information.

}, issn = {2329-4302}, doi = {10.1117/1.JMI.10.3.037502}, author = {M. Pard{\`a}s and D. Anglada and M. Espina and F. Marques and Salembier, P.} } @article {aPedersen22, title = {Experimental confirmation of efficient island divertor operation and successful neoclassical transport optimization in Wendelstein 7-X}, journal = {Nuclear Fusion}, volume = {62}, year = {2022}, month = {04/2022}, issn = {1741-4326}, doi = {10.1088/1741-4326/ac2cf5}, author = {Thomas Sunn Pedersen and others and Casas, J. and Salembier, P.} } @conference {cRamonb, title = {H3D-Net: Few-Shot High-Fidelity 3D Head Reconstruction}, booktitle = {International Conference on Computer Vision (ICCV)}, year = {2021}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Virtual}, abstract = {

Recent learning approaches that implicitly represent surface geometry using coordinate-based neural representations have shown impressive results in the problem of multi-view 3D reconstruction. The effectiveness of these techniques is, however, subject to the availability of a large number (several tens) of input views of the scene, and computationally demanding optimizations. In this paper, we tackle these limitations for the specific problem of few-shot full 3D head reconstruction, by endowing coordinate-based representations with a probabilistic shape prior that enables faster convergence and better generalization when using few input images (down to three). First, we learn a shape model of 3D heads from thousands of incomplete raw scans using implicit representations. At test time, we jointly overfit two coordinate-based neural networks to the scene, one modeling the geometry and another estimating the surface radiance, using implicit differentiable rendering. We devise a two-stage optimization strategy in which the learned prior is used to initialize and constrain the geometry during an initial optimization phase. Then, the prior is unfrozen and fine-tuned to the scene. By doing this, we achieve high-fidelity head reconstructions, including hair and shoulders, with a high level of detail that consistently outperforms both state-of-the-art 3D Morphable Model methods in the few-shot scenario and non-parametric methods when large sets of views are available.

}, author = {Ramon, Eduard and Triginer, Gil and Escur, Janna and Pumarola, Albert and Garcia, Jaime and Xavier Gir{\'o}-i-Nieto and Moreno, Francesc} } @article {xEscobar21, title = {Object Model Adaptation for Multiple Object Tracking}, year = {2021}, abstract = {

Multiple object tracking is a widely used task in applications ranging from bioengineering to security. In this paper we propose a variation of RVOS that adds the center estimation of detected instances, by means of a second head in the decoder which is assigned the task of detecting the arithmetic center of the corresponding object{\textquoteright}s bounding box. We have trained the model using three variants of the cross-entropy loss, adapted to tackle the class imbalance caused by the fact that the center of an object is represented by only one pixel of the image, and have obtained promising results.

}, author = {Escobar, Miquel}, editor = {Girbau, A. and Ventura, C. and Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F.} } @mastersthesis {xEscur, title = {Attention-based multi-view 3D reconstruction models}, year = {2020}, abstract = {

The attention mechanism has been successfully used in multiple tasks in the fields of Computer Vision and Natural Language Processing, but has never been applied to 3D reconstruction problems. In this work, we explore the potential of attention in a multi-view 3D face reconstruction pipeline. On one hand, we use spatial attention when extracting the features of the input images, taking advantage of the interpretability it provides us. This allows us to validate the proper behaviour of the model. On the other hand, we want to make this multi-view setup invariant to the order of the input images{\textquoteright} views. To do so, instead of concatenating the features of the different views, we use part of the Transformer architecture as a symmetric merging function, which is based on a multi-head self-attention mechanism, showing an improvement in the performance.

}, author = {Escur, Janna}, editor = {Ramon, Eduard and Xavier Gir{\'o}-i-Nieto} } @conference {cFernandezf, title = {Enhancing Online Knowledge Graph Population with Semantic Knowledge}, booktitle = {19th International Semantic Web Conference (ISWC)}, year = {2020}, month = {11/2020}, address = {Virtual}, abstract = {

Knowledge Graphs (KG) are becoming essential to organize, represent and store the world{\textquoteright}s knowledge, but they still rely heavily on humanly-curated structured data. Information Extraction (IE) tasks, like disambiguating entities and relations from unstructured text, are key to automate KG population. However, Natural Language Processing (NLP) methods alone cannot guarantee the validity of the facts extracted and may introduce erroneous information into the KG. This work presents an end-to-end system that combines Semantic Knowledge and Validation techniques with NLP methods to provide KG population of novel facts from clustered news events. The contributions of this paper are two-fold: First, we present a novel method for including entity-type knowledge into a Relation Extraction model, improving F1-Score over the baseline with the TACRED and TypeRE datasets. Second, we increase the precision by adding data validation on top of the Relation Extraction method. These two contributions are combined in an industrial pipeline for automatic KG population over aggregated news, demonstrating increased data validity when performing online learning from unstructured web data. Finally, the TypeRE and AggregatedNewsRE datasets, built to benchmark these results, are also published to foster future research in this field.

}, keywords = {Data Validation, Knowledge Graph, Relation Extraction}, author = {Fern{\`a}ndez, D{\`e}lia and Rimmek, Joan Marco and Espadaler, Joan and Garolera, Blai and Barja, Adri{\`a} and Codina, Marc and Sastre, Marc and Xavier Gir{\'o}-i-Nieto and Riveiro, Juan Carlos and Bou-Balust, Elisenda} } @article {9103248, title = {Grounded Sequence to Sequence Transduction}, journal = {IEEE Journal of Selected Topics in Signal Processing}, volume = {14}, year = {2020}, month = {05/2020}, pages = {577-591}, abstract = {

Speech recognition and machine translation have made major progress over the past decades, providing practical systems to map one language sequence to another. Although multiple modalities such as sound and video are becoming increasingly available, the state-of-the-art systems are inherently unimodal, in the sense that they take a single modality {\textemdash} either speech or text {\textemdash} as input. Evidence from human learning suggests that additional modalities can provide disambiguating signals crucial for many language tasks. In this article, we describe the How2 dataset, a large, open-domain collection of videos with transcriptions and their translations. We then show how this single dataset can be used to develop systems for a variety of language tasks and present a number of models meant as starting points. Across tasks, we find that building multimodal architectures that perform better than their unimodal counterpart remains a challenge. This leaves plenty of room for the exploration of more advanced solutions that fully exploit the multimodal nature of the How2 dataset, and the general direction of multimodal learning with other datasets as well.

}, author = {L. Specia and L. Barrault and O. Caglayan and Amanda Duarte and D. Elliott and S. Gella and N. Holzenberger and C. Lala and S. J. Lee and J. Libovicky and P. Madhyastha and F. Metze and K. Mulligan and A. Ostapenko and S. Palaskar and R. Sanabria and J. Wang and R. Arora} } @conference {cIsart, title = {CNN-based bacilli detection in sputum samples for tuberculosis diagnosis}, booktitle = {International Symposium on Biomedical Imaging (ISBI 2019)}, year = {2019}, month = {04/2019}, abstract = {

Tuberculosis (TB) is one of the infectious diseases that cause the most deaths in low- and middle-income countries. A low-cost method to diagnose TB consists of analyzing sputum smear samples through microscope observation. Manual identification and counting of bacilli is a very time-consuming task, and the sensitivity of the diagnosis depends on the availability of skilled technicians. We propose a computer vision technique based on a convolutional neural network (CNN) to automatically segment and count bacilli in sputum samples and predict the infection level.

}, author = {Antoni Isart and Mateu Espasa and Ver{\'o}nica Vilaplana and Elisa Sayrol} } @article {aRoisman18, title = {Differential expression of long non-coding RNAs related to proliferation and histological diversity in follicular lymphomas}, journal = {British Journal of Haematology}, volume = {184}, year = {2019}, month = {Feb 2019}, pages = {373-383}, issn = {1365-2141}, doi = {10.1111/bjh.15656}, author = {A. Roisman and A. Navarro and G. Clot and G. Castellano and B. Gonzalez-Farre and P. P{\'e}rez-Galan and A. Esteve and M. Dabad and S. Heath and M. Gut and Bosio, M. and P. Bellot and Salembier, P. and Albert Oliveras and I. Slavusky and L. Magnano and H. Horn and A. Rosenwald and G. Ott and M. Aymerich and A. L{\'o}pez-Guillermo and P. Jares and J.I. Mart{\'\i}n-Subero and E. Campo and L. Hern{\'a}ndez} } @article {aGene-Molab, title = {Fruit Detection in an Apple Orchard Using a Mobile Terrestrial Laser Scanner}, journal = {Biosystems Engineering}, volume = {187}, year = {2019}, month = {09/2019}, chapter = {171}, abstract = {

The development of reliable fruit detection and localization systems provides an opportunity to improve crop value and management by limiting fruit spoilage and optimizing harvesting practices. Most proposed systems for fruit detection are based on RGB cameras and thus are affected by intrinsic constraints, such as variable lighting conditions. This work presents a new technique that uses a mobile terrestrial laser scanner (MTLS) to detect and localise Fuji apples. An experimental test focused on Fuji apple trees (Malus domestica Borkh. cv. Fuji) was carried out. A 3D point cloud of the scene was generated using an MTLS composed of a Velodyne VLP-16 LiDAR sensor synchronized with an RTK-GNSS satellite navigation receiver. A reflectance analysis of tree elements was performed, obtaining mean apparent reflectance values of 28.9\%, 29.1\%, and 44.3\% for leaves, branches and trunks, and apples, respectively. These results suggest that the apparent reflectance parameter (at 905 nm wavelength) can be useful to detect apples. For that purpose, a four-step fruit detection algorithm was developed. By applying this algorithm, a localization success of 87.5\%, an identification success of 82.4\%, and an F1-score of 0.858 were obtained in relation to the total amount of fruits. These detection rates are similar to those obtained by RGB-based systems, but with the additional advantage of providing direct 3D fruit location information, which is not affected by sunlight variations. From the experimental results, it can be concluded that LiDAR-based technology and, particularly, its reflectance information, has potential for remote apple detection and 3D location.

}, issn = {1537-5110}, doi = {10.1016/j.biosystemseng.2019.08.017}, url = {https://authors.elsevier.com/c/1Zmc45Tbkk9EHW}, author = {Gen{\'e}-Mola, Jordi and Gregorio, Eduard and Guevara, Javier and Auat Cheein, Fernando and Sanz, Ricardo and Escol{\`a}, Alexandre and Llorens Calveras, Jordi and Morros, J.R. and Ruiz-Hidalgo, J. and Ver{\'o}nica Vilaplana and Rosell-Polo, Joan R.} } @conference {cRamona, title = {Multi-View 3D Face Reconstruction in the Wild using Siamese Networks}, booktitle = {ICCV 2019 Workshop on 3D Face Alignment in the Wild Challenge Workshop (3DFAW)}, year = {2019}, month = {11/2019}, publisher = {IEEE/CVF}, organization = {IEEE/CVF}, address = {Seoul, South Korea}, abstract = {

In this work, we present a novel learning-based approach to reconstruct 3D faces from a single or multiple images. Our method uses a simple yet powerful architecture based on siamese neural networks that helps to extract relevant features from each view while keeping the models small. Instead of minimizing multiple objectives, we propose to simultaneously learn the 3D shape and the individual camera poses by using a single-term loss based on the reprojection error, which generalizes from one to multiple views. This allows us to globally optimize the whole scene without having to tune any hyperparameters and to achieve low reprojection errors, which are important for further texture generation. Finally, we train our model on a large-scale dataset with more than 6,000 facial scans. We report competitive results in the 3DFAW 2019 challenge, showing the effectiveness of our method.

}, author = {Ramon, Eduard and Escur, Janna and Xavier Gir{\'o}-i-Nieto} } @conference {cDuartea, title = {Wav2Pix: Speech-conditioned Face Generation using Generative Adversarial Networks}, booktitle = {ICASSP}, year = {2019}, month = {05/2019}, publisher = {IEEE}, organization = {IEEE}, address = {Brighton, UK}, abstract = {

Speech is a rich biometric signal that contains information about the identity, gender and emotional state of the speaker. In this work, we explore its potential to generate face images of a speaker by conditioning a Generative Adversarial Network (GAN) with raw speech input. We propose a deep neural network that is trained from scratch in an end-to-end fashion, generating a face directly from the raw speech waveform without any additional identity information (e.g. reference image or one-hot encoding). Our model is trained in a self-supervised fashion by exploiting the audio and visual signals naturally aligned in videos. With the purpose of training from video data, we present a novel dataset collected for this work, with high-quality videos of ten YouTubers with notable expressiveness in both the speech and visual signals.

}, doi = {10.1109/ICASSP.2019.8682970}, url = {http://hdl.handle.net/2117/167073}, author = {Amanda Duarte and Rold{\'a}n, Francisco and Tubau, Miquel and Escur, Janna and Pascual-deLaPuente, Santiago and Amaia Salvador and Mohedano, Eva and McGuinness, Kevin and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {x, title = {Exploring Automatic Speech Recognition with TensorFlow}, year = {2018}, abstract = {

Advisors: Marta R. Costa-juss{\`a} (TALP, UPC) and Xavier Gir{\'o}-i-Nieto (GPI, UPC)

Grade: A (9.8/10)

Speech recognition is the task of identifying words in spoken language and converting them into text. This bachelor{\textquoteright}s thesis focuses on using deep learning techniques to build an end-to-end Speech Recognition system. As a preliminary step, we review the most relevant methods proposed over the last several years. Then, we study one of the latest proposals for this end-to-end approach, which uses a sequence-to-sequence model with attention-based mechanisms. Next, we successfully reproduce the model and test it on the TIMIT database. We analyze the similarities and differences between the current implementation and the original theoretical work. Finally, we experiment with different parameters (e.g. number of layer units, learning rates and batch sizes) and reduce the Phoneme Error Rate by almost 12\% relative.

}, author = {Escur, Janna}, editor = {Costa-juss{\`a}, Marta R. and Xavier Gir{\'o}-i-Nieto} } @conference {cGene-Mola18, title = {Fruit Detection Using Mobile Terrestrial Laser Scanning}, booktitle = {AgEng 2018}, year = {2018}, month = {07/2018}, address = {Wageningen (Netherlands)}, abstract = {

The development of reliable fruit detection and localization systems is essential for future sustainable agronomic management of high-value crops. To date, most proposed systems for fruit detection and characterization are based on RGB cameras and are thus affected by intrinsic constraints, such as variable lighting conditions and camera calibration. This work presents a new technique that uses a mobile terrestrial laser scanner to detect and localize fruits regardless of the prevailing lighting conditions and without the need for a previous calibration. An experimental test focused on two Fuji apple trees (containing 139 and 145 apples each) was carried out. A 3D point cloud of this scene was generated using a Velodyne VLP-16 LiDAR sensor synchronized with an RTK-GNSS receiver. A reflectivity analysis of tree elements was performed, obtaining mean reflectivity values of 28.9\%, 29.1\%, and 44.3\% for leaves, trunks, and fruits, respectively. These results suggest that the reflectivity parameter can be useful to localize fruits in the tree. From this knowledge, a three-step fruit detection algorithm has been developed: 1) reflectivity thresholding to remove most of the leaves and trunks from the original point cloud; 2) statistical outlier removal to reduce noise; 3) connected components clustering using a density-based algorithm. By applying this algorithm to our dataset, a localization success of 85\%, a detachment success of 78.8\%, and a false detection rate of 15.2\% were obtained. These detection rates are similar to those obtained by current RGB-based systems, but with the additional advantage of providing direct 3D fruit location information (global coordinates), which is not affected by sunlight variations. It can be concluded that LiDAR technology and, particularly, its reflectivity information, might have potential use in fruit detection. Future work should include the application of this fruit detection technique to a wider range of crop types.

}, author = {Gen{\'e}-Mola, Jordi and Gregorio, Eduard and Guevara, Javier and Auat, Fernando and Escol{\`a}, Alexandre and Morros, J.R. and Rosell-Polo, Joan R.} } @article {xGiro-i-Nieto17, title = {La meitat de les not{\'\i}cies que consumirem el 2022 seran falses}, year = {2017}, publisher = {Corporaci{\'o} Catalana de Mitjans Audiovisuals}, address = {Sant Joan Desp{\'\i}}, abstract = {

Report broadcast on the Telenot{\'\i}cies Vespre news program of Televisi{\'o} de Catalunya on Sunday, 26 November 2017.

Artificial intelligence programs are capable of creating increasingly realistic images and voices, and they open the door to generating lies in a more automated way.

}, keywords = {deep learning, fake news, gan}, url = {http://www.ccma.cat/324/la-meitat-de-les-noticies-que-consumirem-el-2022-seran-falses/noticia/2823178/}, author = {Xavier Gir{\'o}-i-Nieto and Pascual-deLaPuente, Santiago and Mir{\'o}, Vict{\`o}ria and Esteve, Oriol} } @conference {cFernandeza, title = {ViTS: Video Tagging System from Massive Web Multimedia Collections}, booktitle = {ICCV 2017 Workshop on Web-scale Vision and Social Media }, year = {2017}, month = {10/2017}, address = {Venice, Italy}, abstract = {

The popularization of multimedia content on the Web has raised the need to automatically understand, index and retrieve it. In this paper we present ViTS, an automatic Video Tagging System which learns from videos, their web context and comments shared on social networks. ViTS analyses massive multimedia collections by Internet crawling, and maintains a knowledge base that updates in real time with no need for human supervision. As a result, each video is indexed with a rich set of labels and linked with other related contents. ViTS is an industrial product under exploitation with a vocabulary of over 2.5M concepts, capable of indexing more than 150k videos per month. We compare the quality and completeness of our tags with respect to the ones in the YouTube-8M dataset, and we show how ViTS enhances the semantic annotation of the videos with a larger number of labels (10.04 tags/video), with an accuracy of 80.87\%.

}, author = {Fern{\`a}ndez, D{\`e}lia and David Varas and Espadaler, Joan and Ferreira, Jordi and Woodward, Alejandro and Rodr{\'\i}guez, David and Xavier Gir{\'o}-i-Nieto and Riveiro, Juan Carlos and Bou, Elisenda} } @conference {cPoignant16, title = {The CAMOMILE Collaborative Annotation Platform for Multi-modal, Multi-lingual and Multi-media Documents}, booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)}, year = {2016}, month = {05/2016}, address = {Portoro{\v z} (Slovenia)}, abstract = {

In this paper, we describe the organization and the implementation of the CAMOMILE collaborative annotation framework for multimodal, multimedia, multilingual (3M) data. Given the versatile nature of the analysis which can be performed on 3M data, the structure of the server was kept intentionally simple in order to preserve its genericity, relying on standard Web technologies. Layers of annotations, defined as data associated to a media fragment from the corpus, are stored in a database and can be managed through standard interfaces with authentication. Interfaces tailored specifically to the task at hand can then be developed in an agile way, relying on simple but reliable services for the management of the centralized annotations. We then present our implementation of an active learning scenario for person annotation in video, relying on the CAMOMILE server; during a dry run experiment, the manual annotation of 716 speech segments was thus propagated to 3504 labeled tracks. The code of the CAMOMILE framework is distributed in open source.

}, keywords = {active learning, Annotation tool, collaborative annotation, multimedia, person annotation}, isbn = {978-2-9517408-9-1}, url = {http://www.lrec-conf.org/proceedings/lrec2016/pdf/456_Paper.pdf}, author = {Johann Poignant and Mateusz Budnik and Herv{\'e} Bredin and Claude Barras and Mickael Stefas and Pierrick Bruneau and Gilles Adda and Laurent Besacier and Hazim Ekenel and Gil Francopoulo and Javier Hernando and Joseph Mariani and Morros, J.R. and Georges Qu{\'e}not and Sophie Rosset and Thomas Tamisier} } @mastersthesis {xRamos-Caballero15, title = {Keyframe-based Video Summarization Designer}, year = {2015}, abstract = {

Advisors: Xavier Gir{\'o}-i-Nieto (UPC) and Horst Eidenberger (TU Wien)

Studies: Bachelor Degree in Audiovisual Systems Engineering at Telecom BCN-ETSETB from the Technical University of Catalonia (UPC)

Grade: B (8.7/10)

This Final Degree Work extends two previous projects: it improves the video keyframe extraction module of one of them, Designer Master, by integrating the algorithms that were developed in the other, Object Maps.

First, the proposed solution is explained. It consists of a shot detection method in which the input video is sampled uniformly, a cumulative pixel-to-pixel difference is then computed, and a classifier decides which frames are keyframes.

Finally, to validate our approach we conducted a user study in which both applications were compared. Users were asked to complete a survey about different summaries created with the original application and with the one developed in this project. The results showed that the improved keyframe extraction module slightly increases the application{\textquoteright}s performance and the quality of the generated summaries.

}, author = {Ramos-Caballero, Carlos}, editor = {Eidenberger, Horst and Xavier Gir{\'o}-i-Nieto} } @article {aBarkhuus14, title = {New interaction modes for rich panoramic live video experiences}, journal = {Behaviour \& Information Technology}, volume = {33}, year = {2014}, month = {07/2014}, chapter = {859-869}, abstract = {

The possibilities of panoramic video are based on the capabilities of high-resolution digital video streams and higher bandwidth{\textquoteright}s opportunities to broadcast, stream and transfer large content across platforms. With these opportunities also come challenges, such as how to focus on sub-parts of the video stream and interact with the content shown on a large screen. In this paper, we present studies of two different interaction modes with a large-scale panoramic video for live experiences; we focus on interactional challenges and explore whether it is (1) possible to develop new interactional methods for approaching this type of high-resolution content and (2) feasible for users to interact with the content in these new ways. We developed prototypes for two different interaction modes: an individual system on a mobile device, either a tablet or a mobile phone, for interacting with the content on the same device, and a non-touch gesture-based system for the home or small-group interaction. We present pilot studies in which we explore the possibilities and challenges of these two interaction modes for panoramic content.

}, keywords = {interaction modes, interactive television, panoramic video, pilot studies}, doi = {10.1080/0144929X.2014.914975}, url = {http://www.tandfonline.com/doi/full/10.1080/0144929X.2014.914975}, author = {Barkhuus, Louise and Zoric, Goranka and Engstr{\"o}m, Arvid and Ruiz-Hidalgo, J. and Verzijp, Nico} } @mastersthesis {xAlmendros-Gutierrez14, title = {Visual instance mining of news videos using a graph-based approach}, year = {2014}, abstract = {

Advisors: Xavier Gir{\'o}-i-Nieto (UPC) and Horst Eidenberger (TU Wien)

Degree: Telecommunications Engineering (5 years) at Telecom BCN-ETSETB (UPC)

The aim of this thesis is to design a tool that performs visual instance search mining for news video summarization. This means extracting the relevant content of the video so that the storyline of the news can be recognized.

Initially, a sampling of the video is required to get the frames at a desired rate. Then, different relevant contents are detected in each frame, focusing on faces, text and several objects that the user can select. Next, we use a graph-based clustering method in order to recognize them with high accuracy and select the most representative ones to show in the visual summary. Furthermore, a graphical user interface in Wt was developed to create an online demo to test the application.

During the development of the application we tested the tool on the CCMA dataset. We prepared a web-based survey based on four results from this dataset to gather users{\textquoteright} opinions. We also validated our visual instance mining results by comparing them with the results obtained with an algorithm for video summarization developed at Columbia University. We ran the algorithm on a dataset of a few videos on two events: {\textquoteright}Boston bombings{\textquoteright} and the {\textquoteright}search of the Malaysian airlines flight{\textquoteright}. We carried out another web-based survey in which users could compare our approach with this related work. With these surveys we analyze whether our tool fulfills the requirements we set up.

We can conclude that our system extracts visual instances that show the most relevant content of news videos and can be used to summarize these videos effectively.

Final grade: B (7/10)

}, url = {http://hdl.handle.net/2099.1/22362}, author = {Almendros-Guti{\'e}rrez, David}, editor = {Xavier Gir{\'o}-i-Nieto and Eidenberger, Horst} } @conference {cVentura13, title = {Automatic Keyframe Selection based on Mutual Reinforcement Algorithm}, booktitle = {CBMI (Content-Based Multimedia Indexing)}, year = {2013}, month = {09/2013}, address = {Veszprem}, abstract = {

This paper addresses the problem of video summarization through the automatic selection of a single representative keyframe. The proposed solution is based on the mutual reinforcement paradigm, where a keyframe is selected thanks to its highest and most frequent similarity to the rest of the considered frames. Two variations of the algorithm are explored: a first one where only frames within the same video are used (intraclip mode), and a second one where the decision also depends on the previously selected keyframes of related videos (interclip mode). These two algorithms were evaluated by a set of professional documentalists from a broadcaster{\textquoteright}s archive, and the results showed that the proposed techniques outperform the semi-manual solution adopted so far in the company.


}, keywords = {mutual reinforcement algorithm, video summarization}, isbn = {978-1-4799-0955-1}, doi = {10.1109/CBMI.2013.6576548}, url = {http://dx.doi.org/10.1109/CBMI.2013.6576548}, author = {Ventura, C. and Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Daniel Giribet and Eusebio Carasusan} } @mastersthesis {xMartos13, title = {Content-based Video Summarisation to Object Maps}, year = {2013}, abstract = {

Advisors: Xavier Gir{\'o}-i-Nieto and Horst Eidenberger

School: Vienna University of Technology (Austria)

The amount of digital video content available on the web is constantly increasing. Handling it requires efficient technologies: text search on large databases gives users access to a great number of videos, with results made accessible by a description. Users need a fast and visual way to access relevant video content effectively. Quick visualisation of content using static image summarisation is a challenging problem, but it is worth tackling because it may solve video navigation problems: users can very rapidly get an idea of the video with no need to browse through it with a sliding bar, as normally done.

In this work a system for automatic video summarisation is developed. It creates an object map whose segments are extracted from an input video. It enhances video browsing and the management of large video databases by generating a visual index, so that the user can rapidly grasp the most relevant content and access it with a simple action. This requires several technologies that together define a complex information-processing pipeline.

Firstly, shot boundary detection algorithms are required to reduce the temporal redundancy of the video. Secondly, different relevant objects are extracted from each keyframe (faces, cars, etc.). We also describe a workflow to train detection models using multiple open source solutions. Furthermore, faces are a particular and very relevant semantic class, so we use clustering methods in order to recognise them in an unsupervised process. The image composition of all selected objects and faces is the final stage of the architecture. Composition is defined as the combination of distinct parts to form a whole; therefore, objects have to be rendered in the map in a visually attractive manner.

To validate our approach and assess end-user satisfaction, we conducted a user study in which we evaluate the system against requirements collected from the related literature. We analyse redundancy and informativeness as well as pleasantness.

The results show that our approach effectively creates an image representation for videos and is able to summarise customisable content in an attractive way.



}, url = {http://hdl.handle.net/2099.1/19359}, author = {Martos, Manel}, editor = {Xavier Gir{\'o}-i-Nieto and Eidenberger, Horst} } @conference {cZoric13, title = {Gesture Interaction with Rich TV Content in the Social Setting}, booktitle = {Exploring and Enhancing the User Experience for Television, Workshop of ACM SIGCHI Conference on Human Factors in Computing Systems, CHI{\textquoteright}13}, year = {2013}, month = {04/2013}, address = {Paris, France}, abstract = {

The appearance of new immersive TV content has increased the interactive possibilities presented to viewers. Increased interactivity is seen as a valuable feature for viewing richer television content, but new functionalities are limited by what can be done naturally and intuitively using available devices like remote controls. Therefore, new interaction techniques, such as visual gesture control systems, have appeared, aiming to enhance the viewers{\textquoteright} experience. In this work we begin uncovering the potential and challenges of gesture interaction with ultra high definition video for people watching TV together. As a first step we have done a study with a group of people interacting with such content using a gesture-based system in the home environment.

}, url = {http://livingroomexperience.wikispaces.com/}, author = {Zoric, Goranka and Engstr{\"o}m, Arvid and Barkhuus, Louise and Ruiz-Hidalgo, J. and Kochale, A.} } @article {aMolina11 , title = {Real-time user independent hand gesture recognition from time-of-flight camera video using static and dynamic models}, journal = {Machine vision and applications}, volume = {24}, year = {2013}, month = {08/2011}, pages = {187{\textendash}204}, chapter = {187}, abstract = {

The use of hand gestures offers an alternative to commonly used human-computer interfaces, providing a more intuitive way of navigating among menus and multimedia applications. This paper presents a system for hand gesture recognition devoted to controlling windows applications. Starting from the images captured by a time-of-flight camera (a camera that produces images with an intensity level inversely proportional to the depth of the objects observed), the system performs hand segmentation as well as a low-level extraction of potentially relevant features related to the morphological representation of the hand silhouette. Classification based on these features discriminates between a set of possible static hand postures, which results, combined with the estimated motion pattern of the hand, in the recognition of dynamic hand gestures. The whole system works in real time, allowing practical interaction between user and application.

}, issn = {0932-8092}, doi = {10.1007/s00138-011-0364-6}, url = {http://www.springerlink.com/content/062m51v58073572h/fulltext.pdf}, author = {Molina, J. and Escudero-Vi{\~n}olo, M. and Signorelo, A. and M. Pard{\`a}s and Ferran, C. and Bescos, J. and Marqu{\'e}s, F. and Mart{\'\i}nez, J.} } @article {aCalderero12, title = {Multispectral Cooperative Partition Sequence Fusion for Joint Classification and Hierarchical Segmentation}, journal = {Geoscience and Remote Sensing Letters, IEEE}, volume = {9}, year = {2012}, pages = {1012-1016}, abstract = {

In this letter, a region-based fusion methodology is presented for joint classification and hierarchical segmentation of specific ground cover classes from high-spatial-resolution remote sensing images. Multispectral information is fused at the partition level using nonlinear techniques, which allows the different relevance of the various bands to be fully exploited. A hierarchical segmentation is performed for each individual band, and the ensuing segmentation results are fused in an iterative and cooperative way. At each iteration, a consensus partition is obtained based on information theory and is combined with a specific ground cover classification. Here, the proposed approach is applied to the extraction and segmentation of vegetation areas. The result is a hierarchy of partitions with the most relevant information of the vegetation areas at different levels of resolution. This system has been tested for vegetation analysis in high-spatial-resolution images from the QuickBird and GeoEye satellites.

}, keywords = {GeoEye satellite, geophysical image processing, geophysical techniques, ground cover classification, hierarchical segmentation, high-spatial-resolution remote sensing images, image classification, image fusion, image region analysis, Image segmentation, information fusion, information theory, joint classification, Joints, Merging, multispectral cooperative partition sequence fusion, multispectral images, multispectral information, nonlinear techniques, partition level, QuickBird satellite, region merging, region-based fusion methodology, Remote sensing, Spatial resolution, specific ground cover classes, Vegetation mapping}, issn = {1545-598X}, doi = {10.1109/LGRS.2012.2188776}, author = {Calderero, F. and F. Eugenio and Marcello, J. and Marqu{\'e}s, F.} } @conference {cMarcello09, title = {Cloud motion estimation in seviri image sequences}, booktitle = {2009 IEEE International Geoscience and Remote Sensing Symposium}, year = {2009}, pages = {642{\textendash}645}, isbn = {978-1-4244-3394-0}, doi = {10.1109/IGARSS.2009.5417842}, url = {http://hdl.handle.net/2117/9492}, author = {Marcello, J. and F. Eugenio and Marqu{\'e}s, F.} } @conference {cCalderero09, title = {Hierarchical segmentation of vegetation areas in high spatial resolution images by fusion of multispectral information}, booktitle = {2009 IEEE International Geoscience and Remote Sensing Symposium}, year = {2009}, pages = {232{\textendash}235}, isbn = {978-1-4244-3394-0}, doi = {10.1109/IGARSS.2009.5417329}, url = {http://hdl.handle.net/2117/9494}, author = {Calderero, F. and Marqu{\'e}s, F. and Marcello, J. and F. Eugenio} } @conference {cCabrera09, title = {LAVICAD: LAboratori VIrtual de Comunicacions Anal{\`o}giques i Digitals}, booktitle = {Jornada d{\textquoteright}Innovaci{\'o} Docent - RIMA (JID-RIMA)}, year = {2009}, month = {02/2009}, publisher = {UPCommons}, organization = {UPCommons}, address = {Barcelona, Catalonia}, abstract = {

Through this grant, the web application LAVICAD (LAboratori VIrtual de COmunicacions Anal{\`o}giques i Digitals) has been extended; it is offered integrated within the e-learning platform COM@WEB. LAVICAD is a tool programmed in Java and Matlab and consists of a set of simulators of the physical layer of communication systems. All the simulators are available online and can be used by students from any computer without the need to install any special software. During the 2007-2008 academic year, two lines of work were developed, among others. On the one hand, an applet emulating the physical layer of digital terrestrial television was programmed, as a reference for teaching advanced communication systems. On the other hand, new functionalities of the LAVICAD tool were programmed to help teachers monitor and assess the continuous work of students. In particular, a database was implemented that stores information about the users who have logged in and the results they obtained when running a given simulator. The two lines of development should make it possible, during the current academic year, to consolidate the use of the different simulators in the teaching of the courses involved in the project.

}, url = {http://hdl.handle.net/2099/7235}, author = {Cabrera, M. and Xavier Gir{\'o}-i-Nieto and Rey, F. and Gasull, A. and Casas, J. and Villares, J. and Fernandez, J. and Sala {\'A}lvarez, josep and Espinosa Fricke, Pedro and Fern{\'a}ndez, Carlos Marcos and Cort{\'e}s, S. and Farr{\'e}, Miquel {\`A}ngel} } @article {aOliveras08, title = {Elevated basal hepcidin levels in the liver may inhibit the development of malaria infection: Another piece towards solving the malaria puzzle?}, journal = {Medical hypotheses}, volume = {70}, number = {3}, year = {2008}, pages = {630{\textendash}634}, issn = {0306-9877}, doi = {10.1016/j.mehy.2007.07.021}, url = {http://www.sciencedirect.com/science/article/B6WN2-4PK8B8Y-5/2/f5ccc9584cc2e8c2095731c9be9d4a31}, author = {Albert Oliveras and Espel-Masferrer, E.} } @article {aMarcello08, title = {Motion estimation techniques to automatically track oceanographic thermal structures in multi-sensor image sequences}, journal = {IEEE transactions on geoscience and remote sensing}, volume = {46}, number = {9}, year = {2008}, pages = {2743{\textendash}2762}, issn = {0196-2892}, author = {Marcello, J. and F. Eugenio and Marqu{\'e}s, F. and Hernandez-Guerra, A. and Gasull, A.} } @book {eAguilar07, title = {Diccionari de Telecomunicacions}, year = {2007}, isbn = {978-84-412-1459-0}, author = {Aguilar, M. and Alcober, J. and Altes, J. and Aragones, X. and Artigas, D. and Bardes, D. and Barlabe, A. and Bragos, R. and Calderer, J. and Cardama, A. and Casademont, J. and Casals, L. and Comer{\'o}n, A. and Cotrina, J. and Cruz, L. and Dios, V. and Duxans, H. and Esparza, O. and Esquerra, I. and Garcia, D. and Garcias, P. and Gomez, C. and Gorricho, J. and Guinjoan, F. and Hesselbach, X. and Liria, A. and Lopez, J. and Madrenas, J. and Madue{\~n}o, M. and Mestre, F. and Monte, E. and Morros, J.R. and Mu{\~n}oz, J. and Pallar{\'e}s, E. and Pons, J. and Recolons, J. and Rincon, D. and Riu, P. and Pradell, L. and Pascual-Iserte, A. and Prat, L. and Rey, F. and Villares, J.} } @phdthesis {dMarcello06, title = {Desarrollo de t{\'e}cnicas de procesado de im{\'a}genes, multitemporales y multisensoriales, de teledetecci{\'o}n para la detecci{\'o}n y seguimiento de estructuras oceanogr{\'a}ficas}, year = {2006}, school = {Universidad de Las Palmas de Gran Canaria (ULPGC)}, type = {phd}, author = {Marcello, J.}, editor = {Marqu{\'e}s, F. and F. Eugenio} } @article {aMarcello05, title = {Automatic tool for the precise detection of upwelling and filaments in remote sensing imagery}, journal = {IEEE transactions on geoscience and remote sensing}, volume = {43}, number = {7}, year = {2005}, pages = {1605{\textendash}1616}, issn = {0196-2892}, author = {Marcello, J. and Marqu{\'e}s, F. and F. Eugenio} } @conference {cEugenio04, title = {An automated multisensor satellite imagery registration technique based on the optimization of contour features}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium 2004}, year = {2004}, pages = {1410{\textendash}1413}, author = {F. Eugenio and Marcello, J. and Marqu{\'e}s, F.} } @conference {cMarcello04, title = {Precise upwelling and filaments automatic extraction from multisensorial imagery}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium 2004}, year = {2004}, pages = {2018{\textendash}2021}, author = {Marcello, J. and F. 
Eugenio and Marqu{\'e}s, F.} } @article {aEugenio03, title = {Automatic satellite image georeferencing using a contour matching approach}, journal = {IEEE transactions on geoscience and remote sensing}, volume = {41}, number = {12}, year = {2003}, pages = {2869{\textendash}2880}, issn = {0196-2892}, author = {F. Eugenio and Marqu{\'e}s, F.} } @conference {cEugenio03a, title = {Automatic structures detection and spatial registration using multisensor satellite imagery}, booktitle = {Proceedings of the International Geoscience and Remote Sensing Symposium}, year = {2003}, pages = {1038{\textendash}1040}, author = {F. Eugenio and Rovaris, E. and Marcello, J. and Marqu{\'e}s, F.} } @conference {cEugenio03, title = {Marine coastal dynamic study using an automatic structure detection and spatial registration tool}, booktitle = {IEEE International Geoscience and Remote Sensing Symposium (IGARSS-03)}, year = {2003}, pages = {1{\textendash}3}, isbn = {0-7803-7930-6}, author = {F. Eugenio and Marcello, J. and Marqu{\'e}s, F.} } @conference {cMarcelo02, title = {Automatic feature extraction from multisensorial oceanographic imagery}, booktitle = {International Geoscience and Remote Sensing Symposium, 2002. IGARSS {\textquoteright}02. 2002 IEEE}, year = {2002}, pages = {4{\textendash}8}, isbn = {0-7803-7536-0}, author = {Marcello, J. and Marqu{\'e}s, F. and F. Eugenio} } @conference {cEugenio02, title = {A contour-based approach to automatic and accurate registration of multitemporal and multisensor satellite imagery}, booktitle = {International Geoscience and Remote Sensing Symposium, 2002. IGARSS {\textquoteright}02. 2002 IEEE}, year = {2002}, pages = {1{\textendash}3}, isbn = {0-7803-7536-0}, author = {F. Eugenio and Marqu{\'e}s, F. and Marcello, J.} } @conference {cGiro-i-Nieto02, title = {MPEG-7 Descriptors for Earth Observation Satellites}, booktitle = {International Astronautical Congress}, year = {2002}, month = {09/2002}, pages = {1{\textendash}4}, publisher = {International Astronautical Federation}, organization = {International Astronautical Federation}, address = {Houston, Texas (USA)}, abstract = {

The amount of digital multimedia information has experienced spectacular growth during the last years thanks to the advances in digital systems for image, video and audio acquisition. As a response to the need of organizing all this information, ISO/IEC has developed a new standard for multimedia content description called MPEG-7. Among other topics, MPEG-7 defines a set of multimedia descriptors that can be automatically generated using signal processing techniques. Earth Observation Satellites generate large quantities of images stored in enormous databases that can take advantage of the new standard. An automatic indexation of these images using MPEG-7 metadata can improve their content management as well as simplify interaction between independent databases. This paper gives an overall description of the MPEG-7 standard, focusing on the low-level Visual Descriptors. These descriptors can be grouped into four categories: color, texture, shape and motion. Visual Color Descriptors represent the color distribution of an image in terms of a specified color space. Visual Texture Descriptors define the visual pattern of an image according to its homogeneities and non-homogeneities. Visual Shape Descriptors describe the shape of 2D and 3D objects while being invariant to scaling, rotation and translation. Motion Descriptors give the essential characteristics of object and camera motions.

These descriptors can be used individually or in combination to index and retrieve satellite images of the Earth from a database. For example, oceans and glaciers can be discerned based on their Color Descriptors, cities and deserts based on the Texture Descriptors, island images can be grouped using the Shape Descriptors, and cyclone trajectories can be studied and compared using Motion Descriptors.

}, author = {Xavier Gir{\'o}-i-Nieto and Marqu{\'e}s, F. and Marcello, J. and F. Eugenio} } @conference {cEugenio01, title = {Pixel and sub-pixel accuracy in satellite image georeferencing using an automatic contour matching approach}, booktitle = {IEEE International Conference on Image Processing}, year = {2001}, isbn = {0-7803-6727-8}, author = {F. Eugenio and Marqu{\'e}s, F. and Marcello, J.} } @article {aEugenio01, title = {A real-time automatic acquisition, processing and distribution system for AVHRR and SeaWIFS imagery}, journal = {IEEE geoscience electronics society newsletter}, volume = {-}, number = {Issue 20}, year = {2001}, pages = {10{\textendash}15}, issn = {0161-7869}, author = {F. Eugenio and Marcello, J. and Marqu{\'e}s, F. and Hernandez-Guerra, A. and Rovaris, E.} } @conference {cEugenio00a, title = {Accurate and automatic NOAA-AVHRR image navigation using a global contour matching approach}, booktitle = {International Geoscience and remote Sensing Symposium}, year = {2000}, pages = {639{\textendash}642}, isbn = {0-7803-6362-0}, author = {F. Eugenio and Marqu{\'e}s, F. and G{\'o}mez, L. and Suarez, E. and Rovaris, E.} } @conference {cEugenio00, title = {A contour matching approach for accurate NOAA-AVHRR image navigation}, booktitle = {10th European Signal Processing Conference (EUSIPCO 2000)}, year = {2000}, isbn = {952-15-0447-1}, author = {F. Eugenio and Marqu{\'e}s, F. and Suarez, E. and Rovaris, E.} } @conference {cSayrol99a, title = {Simulaci{\'o}n digital de se{\~n}ales y sistemas anal{\'o}gicos}, booktitle = {III Congreso de Usuarios de MATLAB}, year = {1999}, pages = {67{\textendash}76}, isbn = {84-699-1358-1}, author = {Sayrol E. and Gasull, A. and Moreno, A. and Vallverdu, F. and Salavedra, J. and Albert Oliveras} } @conference {cCasas94d, title = {Morphological scheme for morphometric analysis of epidermal biopsy images}, booktitle = {International Symposium on Mathematical Morphology and its Applications to Image Processing II}, year = {1994}, address = {Fontainebleau}, isbn = {0-7923-3093-5}, author = {Casas, J. and Esteban, P. and Moreno, A. and Carrera, M.} } @inbook {bCasas94, title = {Morphological scheme for morphometric analysis of epidermal biopsy images}, booktitle = {Mathematical Morphology and its Applications to Image Processing}, volume = {2}, number = {Computational Imaging and Vision}, year = {1994}, pages = {325{\textendash}331}, publisher = {Springer}, organization = {Springer}, address = {Dordrecht}, abstract = {

This paper addresses the problem of morphometric analysis of microscope images from cutaneous biopsy samples. A morphological scheme is applied for the automatic measurement of histologic parameters of the epidermis. It consists of an unsupervised segmentation approach that is strongly based on an {\textquoteright}a priori{\textquoteright} model of the images. The watershed algorithm has proven to be a very powerful tool for the introduction of such {\textquoteright}a priori{\textquoteright} information, because the segmentation process can be conveniently guided by strategic markers in order to detect the desired structures. This permits the automatic measurement of objective parameters which are highly correlated with the evolution of some skin diseases.

}, doi = {10.1007/978-94-011-1040-2_42}, author = {Casas, J. and Esteban, P. and Moreno, A. and Carrera, M.}, editor = {Serra, J. and Soille, P.} }