@article{aGiro-i-Nieto13,
  author   = {Gir{\'o}-i-Nieto, Xavier and Martos, Manel and Mohedano, Eva and Pont-Tuset, Jordi},
  title    = {From Global Image Annotation to Interactive Object Segmentation},
  journal  = {Multimedia Tools and Applications},
  volume   = {70},
  year     = {2014},
  month    = may,
  pages    = {475--502},
  doi      = {10.1007/s11042-013-1374-3},
  keywords = {annotation, hierarchical, interaction, multiscale, segmentation},
  abstract = {This paper presents a graphical environment for the annotation of still images that works both at the global and local scales. At the global scale, each image can be tagged with positive, negative and neutral labels referred to a semantic class from an ontology. These annotations can be used to train and evaluate an image classifier. A finer annotation at a local scale is also available for interactive segmentation of objects. This process is formulated as a selection of regions from a precomputed hierarchical partition called Binary Partition Tree. Three different semi-supervised methods have been presented and evaluated: bounding boxes, scribbles and hierarchical navigation. The implemented Java source code is published under a free software license.},
}

@mastersthesis{xMartos13,
  author   = {Martos, Manel},
  title    = {Content-based Video Summarisation to Object Maps},
  school   = {Vienna University of Technology},
  address  = {Vienna, Austria},
  year     = {2013},
  url      = {http://hdl.handle.net/2099.1/19359},
  editor   = {Gir{\'o}-i-Nieto, Xavier and Eidenberger, Horst},
  note     = {Advisors: Xavier Gir{\'o}-i-Nieto and Horst Eidenberger},
  abstract = {The amount of digital video content available in the web is constantly increasing. Its handling requires efficient technologies: text search on large databases provides users a great amount of videos; the content results are accessible by a description. Users need a fast and visual way to access relevant video content effectively. Quick visualisation of content using static image summarisation is a sophisticated problem. However, it is worth it because it may solve video navigation problems. Users can very rapidly get an idea of the video with no need to browse through it with a sliding bar as normally done.

In this work a system for automatic video summarisation is developed. It creates an object map the segments of which are extracted from an input video. It allows enhancing video browsing and large video databases management generating a visual index so that the user can rapidly grasp the most relevant content. Finally, accessing them with a simple action requires several technologies that define a complex information processing.

Firstly, shot boundary detection algorithms are required to reduce time redundancy of the video. Secondly, different relevant objects are extracted from each keyframe (faces, cars, etc.). We also describe a workflow to train detection models using multiple open source solutions. Furthermore, faces are a particular and very relevant semantic class. For this reason, we use clustering methods in order to recognise them in an unsupervised recognition process. The image composition of all selected objects and faces is the final stage of the architecture. Composition is defined as the combination of distinct parts to form a whole, therefore, objects have to be rendered in the map in a visually attractive manner.

To validate our approach and assess end-user satisfaction, we conducted a user study in which we compare requirements collected by analysing related literature. We analyse redundancy and informativeness as well as pleasantness.

The results show that our approach effectively creates an image representation for videos and is able to summarise customisable content in an attractive way.},
}

@incollection{bVentura12,
  author    = {Ventura, C. and Martos, Manel and Gir{\'o}-i-Nieto, Xavier and Vilaplana, Ver{\'o}nica and Marqu{\'e}s, F.},
  title     = {Hierarchical Navigation and Visual Search for Video Keyframe Retrieval},
  booktitle = {Advances in Multimedia Modeling},
  series    = {Lecture Notes in Computer Science},
  volume    = {7131},
  year      = {2012},
  pages     = {652--654},
  publisher = {Springer Berlin / Heidelberg},
  isbn      = {978-3-642-27354-4},
  doi       = {10.1007/978-3-642-27355-1_67},
  keywords  = {hierarchical navigation, image retrieval, video browser},
  note      = {Winner of the Novice Run at the Video Browser Showdown 2012 at the 18th International Conference on MultiMedia Modeling, Klagenfurt, Austria, January 4--6, 2012},
  abstract  = {This work presents a browser that supports two strategies for video browsing: the navigation through visual hierarchies and the retrieval of similar images. The input videos are firstly processed by a keyframe extractor to reduce the temporal redundancy and decrease the number of elements to consider. These generated keyframes are hierarchically clustered with the Hierarchical Cellular Tree (HCT) algorithm, an indexing technique that also allows the creation of data structures suitable for browsing. Different clustering criteria are available, in the current implementation, based on four MPEG-7 visual descriptors computed at the global scale. The navigation can directly drive the user to find the video timestamps that best match the query or to a keyframe which is globally similar in visual terms to the query. In the latter case, a visual search engine is also available to find other similar keyframes, based as well on MPEG-7 visual descriptors.},
}

@inproceedings{cGiro-i-Nieto12a,
  author    = {Gir{\'o}-i-Nieto, Xavier and Martos, Manel},
  title     = {Interactive segmentation and tracking of video objects},
  booktitle = {13th International Workshop on Image Analysis for Multimedia Interactive Services (WIAMIS)},
  year      = {2012},
  month     = may,
  publisher = {IEEE},
  address   = {Dublin, Ireland},
  doi       = {10.1109/WIAMIS.2012.6226749},
  keywords  = {image segmentation, object segmentation, proposals, semantics, signal processing algorithms, video sequences, visualization},
  abstract  = {This paper describes a mechanism to interactively segment objects from a sequence of video frames. The extracted object can be later embedded in a different background, associated to local scale metadata or used to train an automatic object detector. The workflow requires the interaction of the user at two stages: the temporal segmentation of the frames containing the object and the generation of an object mask to initialize a video tracker. The mask is defined as a combination of regions generated by an image segmentation algorithm. This framework has been integrated in an annotation tool available to the public.},
}

@inproceedings{cGiro-i-Nieto12b,
  author    = {Gir{\'o}-i-Nieto, Xavier and Martos, Manel},
  title     = {Multiscale annotation of still images with {GAT}},
  booktitle = {Proceedings of the 1st International Workshop on Visual Interfaces for Ground Truth Collection in Computer Vision Applications},
  year      = {2012},
  publisher = {ACM},
  address   = {Capri, Italy},
  isbn      = {978-1-4503-1405-3},
  doi       = {10.1145/2304496.2304497},
  keywords  = {annotation, image, interactive, segmentation, semantics},
  abstract  = {This paper presents GAT, a Graphical Annotation Tool for still images that works both at the global and local scales. This interface has been designed to assist users in the annotation of images with relation to the semantic classes described in an ontology. Positive, negative and neutral labels can be assigned to both the whole images or parts of them. The user interface is capable of exploiting segmentation data to assist in the selection of objects. Moreover, the annotation capabilities are complemented with additional functionalities that allow the creation and evaluation of an image classifier. The implemented Java source code is published under a free software license.},
}

@incollection{bCarcel12,
  author    = {Carcel, Elisabet and Martos, Manel and Gir{\'o}-i-Nieto, Xavier and Marqu{\'e}s, F.},
  title     = {Rich Internet Application for Semi-automatic Annotation of Semantic Shots on Keyframes},
  booktitle = {Computational Intelligence for Multimedia Understanding},
  series    = {Lecture Notes in Computer Science},
  volume    = {7242},
  year      = {2012},
  pages     = {172--182},
  publisher = {Springer-Verlag},
  address   = {Pisa, Italy},
  isbn      = {978-3-642-32435-2},
  doi       = {10.1007/978-3-642-32436-9_15},
  keywords  = {annotation, classification, MPEG-7 visual descriptors, RIA, semantic shot},
  abstract  = {This paper describes a system developed for the semi-automatic annotation of keyframes in a broadcasting company. The tool aims at assisting archivists who traditionally label every keyframe manually by suggesting them an automatic annotation that they can intuitively edit and validate. The system is valid for any domain as it uses generic MPEG-7 visual descriptors and binary SVM classifiers. The classification engine has been tested on the multiclass problem of semantic shot detection, a type of metadata used in the company to index new content ingested in the system. The detection performance has been tested in two different domains: soccer and parliament. The core engine is accessed by a Rich Internet Application via a web service. The graphical user interface allows the edition of the suggested labels with an intuitive drag and drop mechanism between rows of thumbnails, each row representing a different semantic shot class. The system has been described as complete and easy to use by the professional archivists at the company.},
}