@mastersthesis {xGranero, title = {A Video Database for Analyzing Affective Physiological Responses}, year = {2019}, abstract = {
Affective computing, leveraged by machine learning techniques, is advancing rapidly in the task of affect recognition in videos. However, there is a need for more annotated data. Several studies have built large video datasets with emotion annotations. Others have collected datasets of music videos or film scenes with physiological signals. However, none of them combines physiological signals with user-generated videos. The work introduced here presents GALLUS, a novel database of user-generated videos with affective physiological responses. The database is composed of 775 videos that have been previously annotated through an online crowdsourcing platform. Physiological responses such as electroencephalography, electrocardiography, galvanic skin response, facial emotion recognition, and eye-gaze have been collected from 30 participants while they watched the stimuli. Our dataset will be made public to foster research in affect recognition.
The goal of this work is to segment, in a video sequence, the objects which are mentioned in a linguistic description of the scene. We have adapted an existing deep neural network that achieves state-of-the-art performance in semi-supervised video object segmentation, adding a linguistic branch that generates an attention map over the video frames, making the segmentation of the objects temporally consistent along the sequence.
\
\
}, author = {Herrera-Palacio, Alba and Ventura, C. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xNieto, title = {Video Saliency Prediction with Deep Neural Networks}, year = {2019}, abstract = {Saliency prediction is a topic undergoing intense study in computer vision with a broad range of applications. It consists in predicting where a human observer{\textquoteright}s attention will fall in an image or a video. Our work is based on a deep neural network named SalGAN, which was trained on a saliency-annotated dataset of static images. In this thesis we investigate different approaches for extending SalGAN to the video domain. To this end, we use the recently proposed saliency-annotated video dataset DHF1K to train and evaluate our models. The obtained results indicate that techniques such as depth estimation or CoordConv can effectively be used as additional modalities to enhance the saliency prediction of static images obtained with SalGAN, achieving encouraging results on the DHF1K benchmark. Our work is based on PyTorch and is publicly available.
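As a rough illustration of the CoordConv idea mentioned above, here is a hedged sketch (function and tensor names are ours, not the thesis code) of appending normalized coordinate channels to the input frames so the network receives explicit spatial position information:

```python
# Hedged sketch of CoordConv-style input augmentation (illustrative, not
# the thesis code): append normalized y/x coordinate channels to frames.
import torch

def add_coord_channels(frames: torch.Tensor) -> torch.Tensor:
    """frames: (batch, channels, height, width) -> two extra channels appended."""
    b, _, h, w = frames.shape
    ys = torch.linspace(-1.0, 1.0, h).view(1, 1, h, 1).expand(b, 1, h, w)
    xs = torch.linspace(-1.0, 1.0, w).view(1, 1, 1, w).expand(b, 1, h, w)
    return torch.cat([frames, ys, xs], dim=1)

frames = torch.rand(4, 3, 192, 256)     # a batch of RGB frames
augmented = add_coord_channels(frames)  # shape (4, 5, 192, 256)
```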
In this article we present a web platform used by media producers to monitor world events detected by VLX-Stories. The event detection system retrieves multi-regional articles from news sites, aggregates them by topic, and summarizes them by extracting and structuring their most relevant entities in order to answer the journalism W{\textquoteright}s: who, what, when and where. The dashboard displays detected events online in a semantically linked space which allows navigation among trending news stories across distinct countries, categories and time spans. Moreover, detected events are linked to customer contents, supporting the editorial process by providing real-time access to breaking news related to their contents. (Demo paper)
}, url = {http://ceur-ws.org/Vol-2456/paper61.pdf}, author = {Fern{\`a}ndez, D{\`e}lia and Bou, Elisenda and Xavier Gir{\'o}-i-Nieto} } @conference {cFernandezd, title = {VLX-Stories: building an online Event Knowledge Base with Emerging Entity detection}, booktitle = {The Semantic Web {\textendash} ISWC 2019}, year = {2019}, month = {10/2019}, pages = {382-399}, publisher = {Springer, Cham}, organization = {Springer, Cham}, chapter = {24}, address = {Auckland, New Zealand}, abstract = {We present an online multilingual system for event detection and comprehension from media feeds. The system retrieves information from news sites and social networks, aggregates it into events (event detection), and summarizes them by extracting semantic labels of their most relevant entities (event representation) in order to answer the journalism W{\textquoteright}s: who, what, when and where. The generated events populate VLX-Stories, an event Knowledge Base (KB), transforming unstructured text data into a structured knowledge base representation. Our system exploits an external entity Knowledge Base (VLX-KG) to help populate VLX-Stories. At the same time, this external knowledge base can also be extended with a Dynamic Entity Linking (DEL) module, which detects Emerging Entities (EE) in unstructured data and adds them to VLX-KG. The system is currently used in production, detecting over 6000 monthly events from over 3500 news feeds from seven different countries and in three different languages.
Understanding the inner workings of deep learning algorithms is key to efficiently exploiting the large number of videos that are generated every day. For the self-supervised learning of the spatio-temporal information contained within these videos, there are several types of algorithms based on convolutional neural networks (CNNs) following an auto-encoder style architecture. However, we have verified that these models, trained for the frame prediction task, learn this spatio-temporal information jointly, so the model is not able to recognize appearance-motion combinations not seen during training. Our proposed model, called DisNet, can learn appearance and motion separately through disentanglement, thereby solving the generalization and scalability problems. To demonstrate this, we conducted numerous experiments under highly controlled conditions, generating specific datasets that make the conventional model fail at the appearance and motion classification tasks, and analyzing how well our proposal behaves under the same conditions.
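A minimal sketch of the kind of two-branch disentanglement described above, under our own assumptions (this is not the actual DisNet architecture): one encoder sees only a static frame, another sees only a pair of consecutive frames, and their features are combined only at the frame-prediction decoder.

```python
# Hedged sketch of a disentangling two-branch predictor (assumed layout,
# not the actual DisNet architecture).
import torch
import torch.nn as nn

class TwoBranchPredictor(nn.Module):
    def __init__(self):
        super().__init__()
        self.appearance = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU())
        self.motion = nn.Sequential(nn.Conv2d(6, 16, 3, padding=1), nn.ReLU())
        self.decoder = nn.Conv2d(32, 3, 3, padding=1)   # predicts the next frame

    def forward(self, last_frame, frame_pair):
        a = self.appearance(last_frame)                 # static content only
        m = self.motion(frame_pair)                     # dynamics from two stacked frames
        return self.decoder(torch.cat([a, m], dim=1))

model = TwoBranchPredictor()
pred = model(torch.rand(2, 3, 64, 64), torch.rand(2, 6, 64, 64))
```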
\
}, author = {Arenas, Carlos}, editor = {Xavier Gir{\'o}-i-Nieto and V{\'\i}ctor Campos and Palacio, Sebastian} } @mastersthesis {xRoldan, title = {Visual Question Answering 2.0}, year = {2017}, abstract = {This bachelor{\textquoteright}s thesis explores different deep learning techniques to solve the Visual Question-Answering (VQA) task, whose aim is to answer questions about images. We study different Convolutional Neural Networks (CNNs) to extract the visual representation from images: Kernelized-CNN (KCNN), VGG-16 and Residual Networks (ResNet). We also analyze the impact of using pre-computed word embeddings trained on large datasets (GloVe embeddings). Moreover, we examine different techniques for joining representations from different modalities. This work was submitted to the second edition of the Visual Question Answering Challenge, obtaining an accuracy of 43.48\%.
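As a hedged sketch of the modality-joining strategies mentioned above (shapes and layer names are illustrative, not the thesis code), two common options are concatenation and the element-wise product of the visual and question embeddings, each feeding an answer classifier:

```python
# Hedged sketch of two common VQA fusion strategies (illustrative only).
import torch
import torch.nn as nn

img = torch.rand(8, 2048)            # e.g. ResNet visual features
question = torch.rand(8, 2048)       # e.g. GloVe-based question embedding

concat_head = nn.Linear(4096, 1000)  # classifier over 1000 candidate answers
product_head = nn.Linear(2048, 1000)

logits_concat = concat_head(torch.cat([img, question], dim=1))
logits_product = product_head(img * question)   # Hadamard (element-wise) fusion
```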
The popularization of multimedia content on the Web has raised the need to automatically understand, index and retrieve it. In this paper we present ViTS, an automatic Video Tagging System which learns from videos, their web context and comments shared on social networks. ViTS analyses massive multimedia collections by Internet crawling, and maintains a knowledge base that updates in real time with no need for human supervision. As a result, each video is indexed with a rich set of labels and linked with other related contents. ViTS is an industrial product under exploitation with a vocabulary of over 2.5M concepts, capable of indexing more than 150k videos per month. We compare the quality and completeness of our tags with respect to the ones in the YouTube-8M dataset, and we show how ViTS enhances the semantic annotation of the videos with a larger number of labels (10.04 tags/video), with an accuracy of 80.87\%.
\
Student: Andrea Calafell
Advisors: Eva Mohedano (Insight), Kevin McGuinness (Insight), Noel E. O{\textquoteright}Connor (Insight) and Xavier Gir{\'o}-i-Nieto (UPC)
Program: Master in Computer Vision (Class of 2016)
Grade: A (9.0/10.0)
This thesis explores good practices for improving the detection of specific people in specific places. An approach combining recurrent and convolutional neural networks has been considered to perform face detection. However, other more conventional methods have been tested, obtaining the best results with a deformable part model approach. A CNN is also used to obtain the face feature vectors and, with the purpose of helping face recognition, an approach to perform query expansion has also been developed. Furthermore, in order to evaluate the different configurations on our non-labelled dataset, a user interface has been used to annotate the images and measure the precision of the system. Finally, different fusion and normalization strategies have been explored with the aim of combining the scores obtained from the face recognition with the ones obtained from the place recognition.
This project explores visual memorability of egocentric images in different ways, with three main contributions. The first and main contribution is a new tool for annotating visual memorability in egocentric images: a web application that allows the annotation of the visual memorability associated to still images through an online game. The second contribution is a convolutional neural network model for visual memorability prediction that adapts an off-the-shelf model to egocentric images. Moreover, a visualization study has been pursued to localize the regions of the images that are more memorable than others; with these maps, a comparison with saliency maps is explored. This part of the research opens a new branch in visual memorability that consists in using memorability maps for saliency prediction. The memorability of the images is also related to sentiment analysis by applying a model that predicts that feature. The final contribution joins visual memorability of images with human behaviour and physical state, finding a relation between memory and physiological signals such as heart rate, galvanic skin response and electroencephalographic signals.
Grade: A with honors (9.8/10)
}, author = {Carn{\'e}-Herrera, Marc}, editor = {Gurrin, Cathal and Xavier Gir{\'o}-i-Nieto} } @phdthesis {dVentura16, title = {Visual Object Analysis using Regions and Local Features}, year = {2016}, abstract = {Thesis submission: 10-06-2016
Defense date:\ 08-07-2016
Grade: Excellent Cum Laude \& International Mention
\
The first part of this dissertation focuses on an analysis of the spatial context in semantic image segmentation. First, we review how spatial context has been tackled in the literature by local features and spatial aggregation techniques. From a discussion about whether the context is beneficial or not for object recognition, we extend a Figure-Border-Ground segmentation for local feature aggregation with ground truth annotations to a more realistic scenario where object proposal techniques are used instead. Whereas the Figure and Ground regions represent the object and the surround respectively, the Border is a region around the object contour, which is found to be the region with the richest contextual information for object recognition. Furthermore, we propose a new contour-based spatial aggregation technique that aggregates the local features within the object region by dividing the region into four subregions. Both contributions have been tested on a semantic segmentation benchmark with a combination of context-free and context-based local features that allows the models to automatically learn whether the context is beneficial or not for each semantic category.
The second part of this dissertation addresses semantic segmentation for a set of closely related images from an uncalibrated multiview scenario. State-of-the-art semantic segmentation algorithms fail to correctly segment the objects from some viewpoints when the techniques are applied independently to each viewpoint image. The lack of large annotated datasets for multiview segmentation does not allow obtaining a model that is robust to viewpoint changes. In this second part, we exploit the spatial correlation that exists between the different viewpoint images to obtain a more robust semantic segmentation. First, we review the state-of-the-art co-clustering, co-segmentation and video segmentation techniques that aim to segment the set of images in a generic way, i.e. without considering semantics. Then, a new architecture that considers motion information and provides a multiresolution segmentation is proposed for the co-clustering framework, outperforming state-of-the-art techniques for generic multiview segmentation. Finally, the proposed multiview segmentation is combined with the semantic segmentation results, giving a method for automatic resolution selection and a coherent semantic multiview segmentation.
\
\
In endoscopic procedures, surgeons work with live video streams from the inside of their subjects. A main source of documentation for procedures are still frames from the video, identified and captured during surgery. However, with growing demands and technical means, the streams are saved to storage servers and the surgeons need to retrieve parts of the videos on demand. In this submission we present a demo application for video retrieval based on visual features and late fusion, which allows surgeons to re-find shots taken during the procedure.
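A hedged sketch of score-level late fusion of the kind described above (weights and feature types are illustrative, not the demo's code): per-feature similarity scores are min-max normalized and merged with a weighted sum before ranking the shots.

```python
# Hedged sketch of score-level late fusion for shot retrieval.
import numpy as np

def late_fusion(score_lists, weights):
    """score_lists: list of (n_shots,) similarity arrays, one per visual feature."""
    fused = np.zeros_like(score_lists[0], dtype=float)
    for scores, w in zip(score_lists, weights):
        span = scores.max() - scores.min() + 1e-9
        fused += w * (scores - scores.min()) / span   # min-max normalize, then weight
    return np.argsort(-fused)                         # shot indices, best match first

color_scores = np.random.rand(100)    # e.g. color-descriptor similarities per shot
edge_scores = np.random.rand(100)     # e.g. edge-descriptor similarities per shot
ranking = late_fusion([color_scores, edge_scores], weights=[0.6, 0.4])
```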
[CBMI 2015 Conference website]
Presented in the Special Session on Medical Multimedia Processing (acceptance rate for special sessions: 55\%)
\
Advisor: Xavier Gir{\'o}-i-Nieto (UPC)
Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)
Grade: A with honors (9.9/10.0)
A saliency map is a model that predicts eye fixations on a visual scene. The prediction of salient areas in images has traditionally been addressed with hand-crafted features inspired by neuroscience principles. This work, however, addresses the problem with a completely data-driven approach by training a convolutional network. The recent publication of large saliency prediction datasets has provided enough data to train a not-very-deep network architecture which is both fast and accurate. In our system, named JuntingNet, the learning process is formulated as the minimization of a loss function that measures the Euclidean distance between the predicted saliency map and the provided ground truth. JuntingNet won the CVPR Large-scale Scene UNderstanding (LSUN) 2015 challenge on saliency prediction with a superior performance in all considered metrics.
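A minimal sketch of the training objective described above, with a placeholder network and random data standing in for the actual JuntingNet code: one gradient step minimizing the Euclidean (MSE) distance between predicted and ground-truth saliency maps.

```python
# Hedged sketch of Euclidean-loss saliency training (placeholder model/data).
import torch
import torch.nn as nn

model = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
                      nn.Conv2d(16, 1, 3, padding=1), nn.Sigmoid())
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()                    # per-pixel Euclidean loss

images = torch.rand(4, 3, 96, 128)          # input images
gt_maps = torch.rand(4, 1, 96, 128)         # ground-truth saliency maps

optimizer.zero_grad()
loss = criterion(model(images), gt_maps)    # one training step
loss.backward()
optimizer.step()
```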
See https://imatge.upc.edu/web/resources/end-end-convolutional-networks-saliency-prediction-software.
Advisors: Mathias Lux (Klagenfurt University) and\ Xavier Gir{\'o}-i-Nieto\ (UPC)
Degree: Electronic Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)
Grade: A (9.5/10)
This project explores the potential of LIRE, an existing Content-Based Image Retrieval (CBIR) system, when used to retrieve medical videos. These videos are recordings of the live streams used by surgeons during endoscopic procedures, captured from inside the subject. The growth of such video content stored on servers requires search engines capable of assisting surgeons in its management and retrieval. In our tool, queries are formulated by visual examples, which allow surgeons to re-find shots taken during the procedure. This thesis presents an extension and adaptation of LIRE for video retrieval based on visual features and late fusion. The results are assessed from two perspectives: a quantitative and a qualitative one. While the quantitative one follows the standard practices and metrics for video retrieval, the qualitative assessment has been based on an empirical social study using a semi-interactive web interface. In particular, a thinking-aloud test was applied to analyze whether the user expectations and requirements were fulfilled. Due to the scarcity of surgeons available for the qualitative tests, a second domain was also addressed: videos captured at musical performances. This type of video has also experienced exponential growth with the advent of affordable multimedia smartphones, available to a large audience. Analogously to the endoscopic videos, searching in a large dataset of such videos is a challenging topic.
Building a visual summary from an egocentric photostream captured by a lifelogging wearable camera is of high interest for different applications (e.g. memory reinforcement). In this paper, we propose a new summarization method based on keyframe selection that uses visual features extracted by means of a convolutional neural network. Our method applies unsupervised clustering to divide the photostreams into events, and finally extracts the most relevant keyframe for each event. We assess the results with a blind taste test in which a group of 20 people rated the quality of the summaries.
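A hedged sketch of the keyframe-selection step described above (K-Means is our stand-in for the unsupervised clustering; features are placeholders, not the paper's code): CNN descriptors are clustered into events and the most central photo of each cluster is kept as its keyframe.

```python
# Hedged sketch: cluster CNN features into events, keep most central photo.
import numpy as np
from sklearn.cluster import KMeans

features = np.random.rand(500, 4096)        # one CNN descriptor per photo
km = KMeans(n_clusters=8, n_init=10).fit(features)

keyframes = []
for c in range(km.n_clusters):
    members = np.where(km.labels_ == c)[0]
    dists = np.linalg.norm(features[members] - km.cluster_centers_[c], axis=1)
    keyframes.append(members[np.argmin(dists)])   # representative photo per event
```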
\
Studies: Bachelor degree in Engineering of Audiovisual Systems at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)
Grade: A with honors (9.6/10)
This Final Degree Work approaches the problem of visually summarizing sets of images captured by an egocentric camera for lifelogging purposes. First, we try to group the images (which represent a day of a person{\textquoteright}s life) into distinguishable and significant events. For this purpose, we use visual features extracted with the Caffe software. Second, we explain the design of techniques for extracting representative images through similarity graphs. Finally, we analyze the assessment scores given by different users to whom we presented the visual summaries obtained in this project. We achieved 60\% favorable opinions on the quality of the visual summaries obtained with the techniques developed in this project.
(This thesis report is written in Catalan)
Advisors:\ Xavier Gir{\'o}-i-Nieto\ (UPC) and Horst Eidenberger (TU Wien)
Degree: Telecommunications Engineering (5 years) at\ Telecom BCN-ETSETB\ (UPC)
The aim of this thesis is to design a tool that performs visual instance mining for news video summarization. This means extracting the relevant content of the video in order to recognize the storyline of the news.
Initially, the video is sampled to obtain frames at the desired rate. Then, relevant content is detected in each frame, focusing on faces, text and several objects that the user can select. Next, we use a graph-based clustering method in order to recognize these instances with high accuracy and select the most representative ones for the visual summary. Furthermore, a graphical user interface in Wt was developed to create an online demo to test the application.
During the development of the application we tested the tool with the CCMA dataset. We prepared a web-based survey based on four results from this dataset to gather the opinion of the users. We also validated our visual instance mining results by comparing them with the results obtained by applying an algorithm developed at Columbia University for video summarization. We ran the algorithm on a dataset of a few videos about two events: the {\textquoteright}Boston bombings{\textquoteright} and the {\textquoteright}search for the Malaysian Airlines flight{\textquoteright}. We carried out another web-based survey in which users could compare our approach with this related work. With these surveys we analyze whether our tool fulfills the requirements we set.
We can conclude that our system extracts visual instances that show the most relevant content of news videos and can be used to summarize these videos effectively.
Final grade: B (7/10)
}, url = {http://hdl.handle.net/2099.1/22362}, author = {Almendros-Guti{\'e}rrez, David}, editor = {Xavier Gir{\'o}-i-Nieto and Eidenberger, Horst} } @mastersthesis {xTort13, title = {Video Clustering Using Camera Motion}, year = {2013}, abstract = {This document contains the work done at INP Grenoble during the second semester of the academic year 2011-2012, completed in Barcelona during the first months of 2012-2013. The work presented consists of a camera motion study in different types of video in order to group fragments that have some similarity in content.
The document explains how the data extracted by the Motion 2D program, provided by the French university, are processed in order to represent them in a simplified way using motion histograms. It also explains how the different distances between histograms are calculated and how their similarity is computed.
Three different distances are used: Manhattan, Euclidean and Bhattacharyya, although the project also describes some other, slightly more complex ones. Different histogram configurations are used, with more or fewer bins to represent the motion.
Every possible combination of number of bins and distance is evaluated using a group of 30 video fragments and the K-Means clustering algorithm. The clustering results are evaluated using the F1-score, a popular measure suitable for both clustering and classification.
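A hedged sketch of the setup described above (illustrative, not the original code): motion histograms compared with the three distances and clustered with K-Means; note the Bhattacharyya coefficient assumes normalized histograms.

```python
# Hedged sketch: three histogram distances plus K-Means clustering.
import numpy as np
from sklearn.cluster import KMeans

def manhattan(h1, h2):
    return np.abs(h1 - h2).sum()

def euclidean(h1, h2):
    return np.sqrt(((h1 - h2) ** 2).sum())

def bhattacharyya(h1, h2):
    return -np.log(np.sqrt(h1 * h2).sum() + 1e-12)   # assumes normalized inputs

hists = np.random.rand(30, 16)                # 30 fragments, 16-bin motion histograms
hists /= hists.sum(axis=1, keepdims=True)     # normalize for Bhattacharyya
labels = KMeans(n_clusters=4, n_init=10).fit_predict(hists)
```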
}, url = {http://hdl.handle.net/2099.1/17337}, author = {Tort, Laura}, editor = {Xavier Gir{\'o}-i-Nieto and Rombaut, Mich{\`e}le and Pellerin, Denis} } @mastersthesis {cVentura13b, title = {Visual Object Analysis Using Regions and Interest Points}, journal = {ACM Multimedia}, year = {2013}, month = {10/2013}, abstract = {This dissertation research will explore region-based and interest-point-based image representations, two of the most-used image models for object detection, image classification, and visual search, among other applications. We will analyze the relationship between both representations with the goal of proposing a new hybrid representation that takes advantage of the strengths and overcomes the weaknesses of both approaches. More specifically, we will focus on the gPb-owt-ucm segmentation algorithm and the SIFT local features since they are the most contrasted techniques in their respective fields. Furthermore, using an object retrieval benchmark, this dissertation research will analyze three basic questions: (i) the usefulness of an interest point hierarchy based on a contour strength signal, (ii) the influence of the context on both interest point location and description, and (iii) the analysis of regions as spatial support for bundling interest points.
This paper presents a variational framework for obtaining super-resolved video sequences, based on the observation that reconstruction-based Super-Resolution (SR) algorithms are limited by two factors: registration exactitude and Point Spread Function (PSF) estimation accuracy. To minimize the impact of the first limiting factor, a small-scale linear inpainting algorithm is proposed to provide smooth SR video frames. To improve on the second limiting factor, fast PSF local estimation and total variation-based denoising are proposed. Experimental results reflect the improvements provided by the proposed method when compared to classic SR approaches.
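For reference, a minimal sketch of the classical Rudin-Osher-Fatemi (ROF) energy that total variation-based denoising builds on (an assumed reference form; the paper's exact functional may differ):

```latex
% Classical ROF total-variation denoising energy (assumed reference form):
% u is the denoised frame, f the observed frame, and \lambda weights
% data fidelity against smoothness.
\min_{u} \int_{\Omega} |\nabla u| \, dx \;+\; \frac{\lambda}{2} \int_{\Omega} (u - f)^2 \, dx
```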
}, url = {http://ieeexplore.ieee.org/xpl/articleDetails.jsp?tp=\&arnumber=6460315\&contentType=Conference+Publications\&searchField\%3DSearch_All\%26queryText\%3Dvariational+reconstruction}, author = {Salvador, J. and Rivero, D. and Kochale, A. and Ruiz-Hidalgo, J.} } @conference {cLopez-Mendez10, title = {Virtual view appearance representation for human motion analysis in multi-view environments}, booktitle = {18th European Signal Processing Conference}, year = {2010}, pages = {959{\textendash}963}, isbn = {2076-1465}, url = {http://hdl.handle.net/2117/8747}, author = {L{\'o}pez-M{\'e}ndez, A. and Cristian Canton-Ferrer and Casas, J.} } @conference {cAlcoverro09, title = {Visual hull reconstruction algorithms comparison: towards robustness to silhouette errors}, booktitle = {International Conference on Computer Vision Theory and Applications 2009}, year = {2009}, pages = {464{\textendash}469}, isbn = {978-989-8111-69-2}, url = {http://upcommons.upc.edu/e-prints/urlFiles?idDrac=2642455}, author = {Alcoverro, M. and M. Pard{\`a}s} } @conference {cCanton-Ferrer09b, title = {Voxel based annealed particle filtering for markerless 3D articulated motion capture}, booktitle = {3DTV Conference: The True Vision - Capture, Transmission and Display of 3D Video, 2009}, year = {2009}, pages = {1{\textendash}4}, doi = {10.1109/3DTV.2009.5069645}, url = {http://ieeexplore.ieee.org/xpl/tocresult.jsp?isnumber=5069609\&isYear=2009}, author = {Cristian Canton-Ferrer and Casas, J. and M. Pard{\`a}s} } @conference {cSalvador06a, title = {Voxelitzaci{\'o} Adaptada a les Imatges en Entorns Multic{\`a}mera}, booktitle = {2ones Jornades UPC de Investigaci{\'o}n en Autom{\'a}tica, Visi{\'o}n y Rob{\'o}tica}, year = {2006}, pages = {1{\textendash}6}, isbn = {978-3-540-26042-4}, author = {Salvador, J. and Casas, J.} } @article {aSalembier02, title = {Visual Segment Tree Creation for MPEG-7 Description Schemes}, journal = {Pattern recognition}, volume = {35}, number = {1}, year = {2002}, pages = {563{\textendash}579}, issn = {0031-3203}, author = {Salembier, P. and Llach, J. and Garrido, L.} } @conference {cSalembier00b, title = {Visual Segment Tree Creation for MPEG-7 Description Schemes}, booktitle = {IEEE International Conference on Multimedia and Expo, ICME{\textquoteright}2000 }, year = {2000}, pages = {56{\textendash}61}, address = {New York City, NY, USA}, author = {Salembier, P. and Llach, J. and Garrido, L.} } @mastersthesis {xGiro-i-Nieto00, title = {Volumetric Data Compression based on Cube-Splitting and Embedded Block Coding by Optimized Truncation}, year = {2000}, abstract = {Many medical data acquisition devices or multispectral imaging techniques produce three-dimensional image data. These images must be stored in limited space devices or transmitted through limited bandwidth channels. Compression techniques are an extremely valuable tool to reduce the expensive resource requirements.
However, compression techniques have mostly been developed for the more popular two-dimensional images. Splitting the volumetric image into slices and applying a two-dimensional coding technique to each slice is the philosophy followed by the classical approach to 3D compression. This is clearly inefficient, because 2D techniques only exploit the image correlation along the X and Y axes. In volumetric images a new Z axis appears, whose correlation must also be exploited to achieve the best results.
The basis for all current image and video compression standards is DCT-based coding. For these techniques the computation is based on splitting the image into NxN blocks and transforming them from the spatial domain into the DCT domain. Typical examples are first-generation coders, like JPEG, which produce a non-structured, unique bit-stream. This technique could easily be adapted to three dimensions by splitting the volume into NxNxN blocks and applying a 3D DCT. However, one encounters two problems. First, the DCT transform is lossy, and medical practice cannot tolerate any distortion that could lead to a faulty diagnosis. Secondly, contemporary transmission techniques make use of concepts like rate, quality and resolution scalability, features that are not fully supported by DCT techniques.
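A hedged sketch of the naive 3D extension discussed above (illustrative only; the thesis ultimately uses a wavelet front-end instead): the volume is split into NxNxN blocks and a 3D DCT is applied to each.

```python
# Hedged sketch: block-wise 3D DCT over a volume (illustrative only).
import numpy as np
from scipy.fft import dctn

N = 8
volume = np.random.rand(64, 64, 64)          # placeholder volumetric image
coeffs = np.empty_like(volume)
for z in range(0, volume.shape[0], N):
    for y in range(0, volume.shape[1], N):
        for x in range(0, volume.shape[2], N):
            block = volume[z:z+N, y:y+N, x:x+N]
            coeffs[z:z+N, y:y+N, x:x+N] = dctn(block, norm='ortho')
```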
Coders using a wavelet transform as front-end are good candidates to overcome these problems. They scan the bit-planes one by one to generate a structured bit-stream. This bit-stream can be truncated to give more or less quality or resolution, and such coders are classified as second-generation coders. A typical example of 3D wavelet coding is octave zero-tree based coding [Bil99, Xio99, Kim99, Kim00, Sch00a], which currently tends to deliver the best compression performance. However, it is difficult to control the bit-stream structure since it is dependent on the coder{\textquoteright}s data flow.
The new image compression standard JPEG2000 uses a third-generation technique, called EBCOT, incorporating an abstract interface to enable reordering of the generated code packages. In this way a fully controllable bit-stream structure is achieved. For example, the bit-stream can be organized so that resolution or quality scalability is supported. The current verification model (VM7.0) of JPEG2000, however, does not include three-dimensional coding. The only support given for multidimensional and/or multi-spectral images is the possibility to execute a wavelet transform along the component axis. Unfortunately, the code supporting this feature was still buggy at the time this document was written.
Adapting this third-generation coding technique to a three-dimensional environment was the aim of this thesis. The input volume is transformed with the 3D wavelet front-end described and implemented by Schelkens et al. [Sch00a] and Barbarien [Joeri{\textquoteright}s thesis]. It is then coded by a hybrid technique combining Cube-Splitting with a JPEG2000 EBCOT module modified to support the third dimension. The Cube-Splitting module codes big zero-volumes very efficiently, while the EBCOT coder is responsible for coding the (sub)volumes containing significant samples. Hence, the implemented coder is called CS-EBCOT.
}, keywords = {coding, volumetric coding}, author = {Xavier Gir{\'o}-i-Nieto} } @inbook {bMarques99, title = {Video compression standards}, booktitle = {Electronic imaging technology}, year = {1999}, pages = {31{\textendash}64}, publisher = {SPIE Optical Engineering Press}, organization = {SPIE Optical Engineering Press}, edition = {Edward R. Dougherty (Ed.)}, isbn = {0819430374}, author = {Marqu{\'e}s, F. and Salembier, P.} } @conference {cMarcotegui99, title = {A video generation tool allowing friendly user interaction}, booktitle = {1999 IEEE International Conference on Image Processing}, year = {1999}, isbn = {0-7803-5470-2}, author = {Marcotegui, B. and Correia, P. and Marqu{\'e}s, F. and Mech, R. and Rosa, R. and Wollborn, M. and Zanoguera, F.} } @conference {cPardas98, title = {Video Object Segmentation introducing depth and motion information}, booktitle = {IEEE International Conference on Image Processing}, year = {1998}, isbn = {-}, author = {M. Pard{\`a}s} } @article {pPardas97, title = {Video coding method and corresponding coding and decoding systems}, number = {9693276.5-}, year = {1997}, type = {Invention patent}, author = {M. Pard{\`a}s and Salembier, P. and Ayuso, X. and Mart{\'\i}, E.} } @conference {cSalembier96a, title = {Very low rate video coding using active triangular mesh}, booktitle = {IEEE International Conference on Acoustics, Speech \& Signal Processing, ICASSP 1996}, year = {1996}, pages = {97{\textendash}110}, address = {Atlanta (GA), USA}, isbn = {84-699-1358-1}, author = {Salembier, P. and Ayuso, X.} } @conference {cMorros96, title = {Video sequence segmentation based on rate-distortion theory}, booktitle = {SPIE Visual Communication and Image Processing, VCIP{\textquoteright}96}, year = {1996}, month = {02/1996}, publisher = {Proc. SPIE 2727, 1185}, organization = {Proc. SPIE 2727, 1185}, address = {Orlando, Florida, USA}, abstract = {\
}, author = {Morros, J.R. and Marqu{\'e}s, F. and M. Pard{\`a}s and Salembier, P.} } @conference { cSalembier94, title = {Very low bit rate video coding using morphological segmentation and contour/texture motion compensation}, booktitle = {12th International Conference on Pattern Recognition, ICPR 1994}, year = {1994}, address = {Jerusalem, Israel}, author = {Salembier, P. and Gu, C. and M. Pard{\`a}s and Kunt, M} }