@mastersthesis {xDomenech, title = {Hiding Images in their Spoken Narratives}, year = {2022}, abstract = {

Steganography is the technique of hiding secret data within an ordinary, non-secret, file or message in order to avoid its detection. Throughout our work, we study the case where the hidden secret data is an image and the non-secret data or cover signal is an audio. To this end, we use a recently proposed residual architecture operating on top of short-time discrete cosine transform (STDCT) audio spectrograms. In our work, we evaluate the above mentioned residual steganography architecture with the Localized Narratives dataset, explore the feasibility of using short-time fourier transform (STFT) audio spectrograms instead of STDCTs to improve the efficiency of the system, investigate the use of hidden signals permuted with the objective to spread the audio corruption of the revealed images, apply averaged audio windows to improve quality results and tested the system in real-world distortions.

}, author = {Teresa Domenech}, editor = {McGuinness, Kevin and Pons, Jordi and Xavier Gir{\'o}-i-Nieto} } @conference {cGeleta21, title = {PixInWav: Residual Steganography for Hiding Pixels in Audio}, booktitle = {ICASSP}, year = {2022}, month = {06/2021}, abstract = {

Steganography comprises the mechanics of hiding data in a host media that may be publicly available. While previous works focused on unimodal setups (e.g., hiding images in images, or hiding audio in audio), PixInWav targets the multimodal case of hiding images in audio. To this end, we propose a novel residual architecture operating on top of short-time discrete cosine transform (STDCT) audio spectrograms. Among our results, we find that the residual audio steganography setup we propose allows independent encoding of the hidden image from the host audio without compromising quality. Accordingly, while previous works require both host and hidden signals to hide a signal, PixInWav can encode images offline --- which can be later hidden, in a residual fashion, into any audio signal. Finally, we test our scheme in a lab setting to transmit images over airwaves from a loudspeaker to a microphone verifying our theoretical insights and obtaining promising results.

Paper, poster \& video on IEEE SigPort.
Full paper on\ arXiv,\ UPCommons and IEEE Explore.
Source code on github.
Tweets by @ritageleta: [1], [2], [3].
Tweets by @DocXavi: [1], [2], [3].
CVPR 2021 Women in Computer Vision Workshop.

\ Deep Learning Barcelona Symposium 2022

Presentation from the early stages of the project (January 2021):

}, author = {Geleta, Margarita and Punt{\'\i}, Cristina and McGuinness, Kevin and Pons, Jordi and Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xGraneroa, title = {2D to 3D body pose estimation for sign language with Deep Learning}, year = {2020}, abstract = {

This project aims at leveraging the challenge of using 3D poses for Sign Language translation or animation by transforming 2D pose datasets into 3D ones. The goal is, using a 3D dataset of American Sign Language, to train a deep neural network that will predict the depth coordinates of the skeleton keypoints from 2D coordinates. Specifically, it will be explored a Long Short-Term Memory network, an architecture broadly used for sequence to sequence tasks. The conclusions extracted on this report are that despite some of the results being good enough to be used for actual 3D SL annotation, the majority of them lack the precision to do so, and they are too variant with respect to the dataset split. It is also concluded that the solutions approached here could be improved by adding some regularization methods, more powerful hardware to run better experiments, and new input features such as keypoint visibility.

Software

}, author = {P{\'e}rez-Granero, Pol}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xPunti20, title = {PixInPix: Hidding Pixels in Pixels}, year = {2020}, abstract = {

PixInPix is an steganography hidding system of images within other images.\ The system designed is able to create, from an cover image and a message, a new steganography image. This new stego-image looks as similar as possible as the cover but has the message hidden in it.\ Our approach adopts the U-net architecture and combines two reconstruction losses to provide a simple yet effective approach tested in low resolution images from MNIST, CIFAR and ImageNet.

}, author = {Punt{\'\i}, Cristina}, editor = {McGuinness, Kevin and Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} } @conference {cGorrizc, title = {Assessing Knee OA Severity with CNN attention-based end-to-end architectures}, booktitle = {International Conference on Medical Imaging with Deep Learning (MIDL) 2019}, year = {2019}, month = {02/2019}, publisher = {JMLR}, organization = {JMLR}, address = {London, United Kingdom}, abstract = {

This work proposes a novel end-to-end convolutional neural network (CNN) architecture to\ automatically quantify the severity of knee osteoarthritis (OA) using X-Ray images, which\ incorporates trainable attention modules acting as unsupervised fine-grained detectors of\ the region of interest (ROI). The proposed attention modules can be applied at different\ levels and scales across any CNN pipeline helping the network to learn relevant attention\ patterns over the most informative parts of the image at different resolutions. We test\ the proposed attention mechanism on existing state-of-the-art CNN architectures as our\ base models, achieving promising results on the benchmark knee OA datasets from the\ osteoarthritis initiative (OAI) and multicenter osteoarthritis study (MOST). All the codes\ from our experiments will be publicly available on the github repository: https://github.com/marc-gorriz/KneeOA-CNNAttention

2nd International Conference on Medical Imaging (MIDL) 2019
Paper in Proceeding,\ OpenReview\ and UPCommons.

}, url = {http://proceedings.mlr.press/v102/gorriz19a.html}, author = {G{\'o}rriz, Marc and Antony, Joseph and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @mastersthesis {xCaselles, title = {Integrating low-level motion cues in deep video saliency}, year = {2019}, abstract = {

This thesis investigates the importance of motion when predicting saliency in videos. Naturally, humans observe both dynamic and static objects. When we are focused on watching a video, we tend to keep our eyes on the objects that are moving in the scene, items that we quickly recognize, as well as to those that attract our attention. In this work, different experiments are presented to corroborate this implication. Various approaches will be shown implementing an adaptation of the SalBCE neural network by using only motion. A simple implementation is proposed for the generation of saliency maps using previously extracted static and dynamic information from the images. The DHF1K dataset has been used for the experiment{\textquoteright}s realization.

}, keywords = {Motion, Saliency, video}, author = {Caselles, Pol}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @conference {cLinardosa, title = {Simple vs complex temporal recurrences for video saliency prediction}, booktitle = {British Machine Vision Conference (BMVC)}, year = {2019}, month = {09/2019}, publisher = {British Machine Vision Association}, organization = {British Machine Vision Association}, address = {Cardiff, Wales / UK.}, abstract = {

This paper investigates modifying an existing neural network architecture for static saliency prediction using two types of recurrences that integrate information from the temporal domain. The first modification is the addition of a ConvLSTM within the architecture, while the second is a computationally simple exponential moving average of an internal convolutional state. We use weights pre-trained on the SALICON dataset and fine-tune our model on DHF1K. Our results show that both modifications achieve state-of-the-art results and produce similar saliency maps.

}, url = {https://bmvc2019.org/wp-content/uploads/papers/0952-paper.pdf}, author = {Linardos, Panagiotis and Mohedano, Eva and Nieto, Juan Jos{\'e} and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto and McGuinness, Kevin} } @mastersthesis {xNieto, title = {Video Saliency Prediction with Deep Neural Networks}, year = {2019}, abstract = {

Saliency prediction is a topic undergoing intense study in computer vision with a broad range of applications. It consists in predicting where the attention is going to be received in an image or a video by a human. Our work is based on a deep neural network named SalGAN, which was trained on a saliency annotated dataset of static images. In this thesis we investigate different approaches for extending SalGAN to the video domain. To this end, we investigate the recently proposed saliency annotated video dataset DHF1K to train and evaluate our models. The obtained results indicate that techniques such as depth estimation or coordconv can effectively be used as additional modalities to enhance the saliency prediction of static images obtained with SalGAN, achieving encouraging results in the DHF1K benchmark. Our work is based on pytorch and it is publicly available here.

Source code

Video Saliency Prediction with Deep Neural Networks - Juan Jose Nieto - DCU 2019 from Universitat Polit{\`e}cnica de Catalunya

}, author = {Nieto, Juan Jos{\'e}}, editor = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @conference {cDuartea, title = {Wav2Pix: Speech-conditioned Face Generation using Generative Adversarial Networks}, booktitle = {ICASSP}, year = {2019}, month = {05/2019}, publisher = {IEEE}, organization = {IEEE}, address = {Brighton, UK}, abstract = {

Speech is a rich biometric signal that contains information\ about the identity, gender and emotional state of the speaker.\ In this work, we explore its potential to generate face images\ of a speaker by conditioning a Generative Adversarial\ Network (GAN) with raw speech input. We propose a deep\ neural network that is trained from scratch in an end-to-end\ fashion, generating a face directly from the raw speech waveform\ without any additional identity information (e.g reference\ image or one-hot encoding). Our model is trained in a\ self-supervised fashion by exploiting the audio and visual signals\ naturally aligned in videos. With the purpose of training\ from video data, we present a novel dataset collected for this\ work, with high-quality videos of ten youtubers with notable\ expressiveness in both the speech and visual signals.

Project page
Paper on IEEExplore, UPCommons and\ arXiv
ICASSP 2019

}, doi = {10.1109/ICASSP.2019.8682970}, url = {http://hdl.handle.net/2117/167073}, author = {Amanda Duarte and Rold{\'a}n, Francisco and Tubau, Miquel and Escur, Janna and Pascual-deLaPuente, Santiago and Amaia Salvador and Mohedano, Eva and McGuinness, Kevin and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cGomez, title = {Demonstration of an Open Source Framework for Qualitative Evaluation of CBIR Systems}, booktitle = {ACM Multimedia}, year = {2018}, month = {10/2018}, publisher = {ACM}, organization = {ACM}, address = {Seoul, South Korea}, abstract = {

Evaluating image retrieval systems in a quantitative way, for example by computing measures like mean average precision, allows for objective comparisons with a ground-truth. However, in cases where ground-truth is not\ available, the only alternative is to collect feedback from a user. Thus, qualitative assessments become important to better understand how the system works. Visualizing the results could be, in some scenarios, the only way to evaluate the results obtained and also the only opportunity to identify that a system is failing. This necessitates developing a User Interface (UI) for a Content Based Image Retrieval (CBIR) system that allows visualization of results and improvement via capturing user relevance feedback. A well-designed UI facilitates understanding of the performance of the system, both in cases where it works well and perhaps more importantly those which highlight the need for improvement. Our open-source system implements three components to facilitate researchers to quickly develop these capabilities for their retrieval engine. We present: a web-based user interface to visualize retrieval results and collect user annotations; a server that simplifies\ connection with any underlying CBIR system; and a server that manages the search engine data.\

User Interface for an Image Retrieval Engine System from Universitat Polit{\`e}cnica de Catalunya

}, doi = {10.1145/3240508.3241395}, url = {https://dl.acm.org/citation.cfm?id=3241395}, author = {Gomez, Paula and Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @mastersthesis {xColl-Pol, title = {The Importance of Time in Visual Attention Models}, year = {2018}, abstract = {

Predicting visual attention is a very active field in the computer vision community. Visual attention is a mechanism of the visual system that can select relevant areas within a scene. Models for saliency prediction are intended to automatically predict which regions are likely to be attended by a human observer. Traditionally, ground truth saliency maps are built using only the spatial position of the fixation points, being these xation points the locations where an observer fixates the gaze when viewing a scene. In this work we explore encoding the temporal information as well, and assess it in the application of prediction saliency maps with deep neural networks. It has been observed that the later fixations in a scanpath are usually selected randomly during visualization, specially in those images with few regions of interest. Therefore, computer vision models have dificulties learning to predict them. In this work, we explore a temporal weighting over the saliency maps to better cope with this random behaviour. The newly proposed saliency representation assigns dierent weights depending on the position in the sequence of gaze fixations, giving more importance to early timesteps than later ones. We used this maps to train MLNet, a state of the art for predicting saliency maps. MLNet predictions were evaluated and compared to the results obtained when the model has been trained using traditional saliency maps. Finally, we show how the temporally weighted saliency maps brought some improvement when used to weight the visual features in an image retrieval task.

Source code and models

The Importance of Time in Visual Attention Models from Universitat Polit{\`e}cnica de Catalunya

}, author = {Coll-Pol, Marta}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @conference {cAssensa, title = {PathGAN: Visual Scanpath Prediction with Generative Adversarial Networks}, booktitle = {ECCV 2018 Workshop on Egocentric Perception, Interaction and Compution (EPIC)}, year = {2018}, month = {07/2018}, publisher = {Springer}, organization = {Springer}, address = {Munich, Germany}, abstract = {

We introduce PathGAN, a deep neural network for visual scanpath prediction trained on adversarial examples. A visual scanpath is defined as the sequence of fixation points over an image defined by a human observer with its gaze. PathGAN is composed of two parts, the generator and the discriminator. Both parts extract features from images using off-the-shelf networks, and train recurrent layers to generate or discriminate scanpaths accordingly. In scanpath prediction, the stochastic nature of the data makes it very difficult to generate realistic predictions using supervised learning strategies, but we adopt adversarial training as a suitable alternative. Our experiments prove how PathGAN improves the state of the art of visual scanpath prediction on the Salient360! dataset.

This work obtained the\ 2nd award in Prediction of Head-gaze Scan-paths for Images, and the 2nd award in Prediction of Eye-gaze Scan-paths for Images at the IEEE ICME 2018 Salient360! Challenge.

}, doi = {10.1007/978-3-030-11021-5_25}, url = {https://doi.org/10.1007/978-3-030-11021-5_25}, author = {Assens, Marc and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @conference {cMohedanob, title = {Saliency Weighted Convolutional Features for Instance Search}, booktitle = {Content-Based Multimedia Indexing - CBMI}, year = {2018}, month = {09/2018}, publisher = {IEEE}, organization = {IEEE}, address = {La Rochelle, France}, abstract = {

This work explores attention models to weight the contribution of local convolutional representations for the instance search task. We present a retrieval framework based on bags of local convolutional features (BLCF) that benefits from saliency weighting to build an efficient image representation. The use of human visual attention models (saliency) allows significant improvements in retrieval performance without the need to conduct region analysis or spatial verification, and without requiring any feature fine tuning. We investigate the impact of different saliency models, finding that higher performance on saliency benchmarks does not necessarily equate to improved performance when used in instance search tasks. The proposed approach outperforms the state-of-the-art on the challenging INSTRE benchmark by a large margin, and provides similar performance on the Oxford and Paris benchmarks compared to more complex methods that use off-the-shelf representations.

Saliency Weighted Convolutional Features for Instance Search from Universitat Polit{\`e}cnica de Catalunya

}, author = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @article {aAssens, title = {Scanpath and Saliency Prediction on 360 Degree Images}, journal = {Elsevier Signal Processing: Image Communication}, year = {2018}, abstract = {

We introduce deep neural networks for scanpath and saliency prediction trained on 360-degree images. The scanpath prediction model called SaltiNet is based on a temporal-aware novel representation of saliency information named the saliency volume. The first part of the network consists of a model trained to generate saliency volumes, whose parameters are fit by back-propagation using a binary cross entropy (BCE) loss over downsampled versions of the saliency volumes. Sampling strategies over these volumes are used to generate scanpaths over the 360-degree images. Our experiments show the advantages of using saliency volumes, and how they can be used for related tasks. We also show how a similar architecture achieves state-of-the-art performance for the related task of saliency map prediction. Our source code and trained models available here.

}, url = {https://www.sciencedirect.com/science/article/pii/S0923596518306209}, author = {Assens, Marc and McGuinness, Kevin and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xRoldana, title = {Speech-conditioned Face Generation with Deep Adversarial Networks}, year = {2018}, abstract = {

Image synthesis have been a trending task for the AI community in recent years. Many works have shown the potential of Generative Adversarial Networks (GANs) to deal with tasks such as text or audio to image synthesis. In particular, recent advances in deep learning using audio have inspired many works involving both visual and auditory information. In this work we propose a face synthesis method using audio and/or language representations as inputs. Furthermore, a dataset which relates speech utterances with a face and an identity has been built, fitting for other tasks apart from face synthesis such as speaker recognition or voice conversion.

\ Source code

Speech Conditioned Face Generation with Deep Adversarial Networks from Universitat Polit{\`e}cnica de Catalunya

}, author = {Rold{\'a}n, Francisco}, editor = {Pascual-deLaPuente, Santiago and Amaia Salvador and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @unpublished {xMohedanoa, title = {Fine-tuning of CNN models for Instance Search with Pseudo-Relevance Feedback}, year = {2017}, publisher = {NIPS 2017 Women in Machine Learning Workshop}, address = {Long Beach, CA, USA}, abstract = {

CNN classification models trained on millions of labeled images have been proven to encode {\textquotedblleft}general purpose{\textquotedblright} descriptors in their intermediate layers. These descriptors are useful for a diverse range of computer vision problems~\cite{1}. However, the target task of these models is substantially different to the instance search task. While classification is concerned with distinguishing between different classes, instance search is concerned with identifying concrete instances of a particular class.\

In this work we propose an unsupervised approach to finetune a model for similarity learning~\cite{2}. For that, we combine two different search engines: one based on off-the-shelf CNN features, and another one on the popular SIFT features. As shown in the figure below, we observe that the information of pre-trained CNN representations and SIFT is in most of the cases complementary, which allows the generation of high quality rank lists. The fusion of the two rankings is used to generate training data for a particular dataset. A pseudo-relevance feedback strategy~\cite{3} is used for sampling images from rankings, considering the top images as positive examples of a particular instance and middle-low ranked images as negative examples.

}, author = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @mastersthesis {xArazo, title = {The impact of visual saliency prediction in image classification}, year = {2017}, abstract = {

Advisors: Eva Mohedano, Kevin McGuinness and Xavier Giro-i-Nieto

Program: Master{\textquoteright}s degree in Telecommunications Engineering (MET)

Grade: A (9.7/10.0)

This thesis introduces an architecture to improve the accuracy of a Convolutional Neural Network trained for image classification using visual saliency predictions from the original images. In this thesis the accuracy of a Convolutional Neural Network (CNN) trained for classification has been improved using saliency maps from the original images. The network had an AlexNet architecture and was trained using 1.2 million images from the Imagenet dataset. Two methods had been explored in order to exploit the information from the visual saliency predictions. The first methodologies implemented applied the saliency maps directly to the existing layers of the CNN, which in some cases were already trained for classification and in other they were initialized with random weights. In the second methodology the information from the saliency maps was merged from a new branch, trained at the same time as the initial CNN. In order to speed up the training of the networks the experiments were implemented using images reduced to 128x128. With this sizes the proposed model achieves 12.39\% increase in Top-1 accuracy performance with respect to the original CNN, and additionally reduces the number of parameters needed compared to AlexNet. Regarding the original size images 227x227 a model that increases 1.72\% Top-1 accuracy is proposed. To accelerate the training process of the network the images have been reduced. The methodology that provides the higher improvement in accuracy will be implemented using the original size of the images. The results will be compared to those obtained from the network trained only with the original images. All the methodologies proposed are implemented in a network previously trained for classification. Additionally the most successful methodologies will be implemented in the training of a network. The results will provide information about the best way to add saliency maps to improve the accuracy.

The impact of visual saliency prediction in image classification from Xavier Giro

}, author = {Arazo, Eric}, editor = {McGuinness, Kevin and Mohedano, Eva and Xavier Gir{\'o}-i-Nieto} } @inbook {bMohedano17, title = {Object Retrieval with Deep Convolutional Features}, booktitle = {Deep Learning for Image Processing Applications}, volume = {31}, number = {Advances in Parallel Computing}, year = {2017}, publisher = {IOS Press}, organization = {IOS Press}, address = {Amsterdam, The Netherlands}, abstract = {

Image representations extracted from convolutional neural networks (CNNs) outdo hand-crafted features in several computer vision tasks, such as visual image retrieval. This chapter recommends a simple pipeline for encoding the local activations of a convolutional layer of a pretrained CNN utilizing the well-known Bag of Words (BoW) aggregation scheme and called bag of local convolutional features (BLCF). Matching each local array of activations in a convolutional layer to a visual word results in an assignment map, which is a compact representation relating regions of an image with a visual word. We use the assignment map for fast spatial reranking, finding object localizations that are used for query expansion. We show the suitability of the BoW representation based on local CNN features for image retrieval, attaining state-of-the-art performance on the Oxford and Paris buildings benchmarks. We demonstrate that the BLCF system outperforms the latest procedures using sum pooling for a subgroup of the challenging TRECVid INS benchmark according to the mean Average Precision (mAP) metric.

}, issn = {978-1-61499-822-8 }, doi = {10.3233/978-1-61499-822-8-137}, url = {http://ebooks.iospress.nl/volumearticle/48028}, author = {Mohedano, Eva and Amaia Salvador and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Marqu{\'e}s, F.} } @conference {cPana, title = {SalGAN: Visual Saliency Prediction with Generative Adversarial Networks}, booktitle = {CVPR 2017 Scene Understanding Workshop (SUNw)}, year = {2017}, address = {Honolulu, Hawaii, USA}, abstract = {

We introduce SalGAN, a deep convolutional neural network for visual saliency prediction trained with adversarial examples. The first stage of the network consists of a generator model whose weights are learned by back-propagation computed from a binary cross entropy (BCE) loss over downsampled versions of the saliency maps. The resulting prediction is processed by a discriminator network trained to solve a binary classification task between the saliency maps generated by the generative stage and the ground truth ones. Our experiments show how adversarial training allows reaching state-of-the-art performance across different metrics when combined with a widely-used loss function like BCE.

}, url = {https://arxiv.org/abs/1701.01081}, author = {Pan, Junting and Cristian Canton-Ferrer and McGuinness, Kevin and O{\textquoteright}Connor, N. and Jordi Torres and Elisa Sayrol and Xavier Gir{\'o}-i-Nieto} } @conference {cAssens, title = {SaltiNet: Scan-path Prediction on 360 Degree Images using Saliency Volumes}, booktitle = {ICCV Workshop on Egocentric Perception, Interaction and Computing}, year = {2017}, month = {07/2017}, publisher = {IEEE}, organization = {IEEE}, address = {Venice, Italy}, abstract = {

We introduce SaltiNet, a deep neural network for scanpath prediction trained on 360-degree images. The first part of the network consists of a model trained to generate saliency volumes, whose parameters are learned by back-propagation computed from a binary cross entropy (BCE) loss over downsampled versions of the saliency volumes. Sampling strategies over these volumes are used to generate scanpaths over the 360-degree images. Our experiments show the advantages of using saliency volumes, and how they can be used for related tasks.

Winner of three awards at the Salient 360 Challenge at IEEE ICME 2017 (Hong Kong): Best Scan Path, Best Student Scan-path and Audience Award.

SaltiNet: The Temporal Dimension of Visual Attention Models from Xavier Giro-i-Nieto

}, isbn = {978-1-5386-1034-3}, doi = {10.1109/ICCVW.2017.275}, url = {http://ieeexplore.ieee.org/document/8265485/}, author = {Assens, Marc and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N.} } @mastersthesis {xAssens, title = {The Temporal Dimension of Visual Attention Models}, year = {2017}, abstract = {

Program: Bachelor Degree on Telecommunications Science and Technologies (CITTEL)

Grade: A with honours (10.0/10.0)

This thesis explores methodologies for scanpath prediction on images using deep learning\ frameworks.\ As a preliminary step, we analyze the characteristics of the data provided by dierent datasets.\ We then explore the use of Convolutional Neural Networks (CNN) and Long-Short-Term-Memory\ (LSTM) newtworks for scanpath prediction. We observe that these models fail due to the high\ stochastic nature of the data.\ With the gained insight, we propose a novel time-aware visual saliency representation named\ Saliency Volume, that averages scanpaths over multiple observers.\ Next, we explore the SalNet network and adapt it for saliency volume prediction, and we find\ several ways of generating scanpaths from saliency volumes.\ Finally, we ne-tuned our model for scanpaht prediction on 360-degree images and successfully\ submitted it to the Salient360! Challenge from ICME. The source code and models are publicly\ available at https://github.com/massens/saliency-360salient-2017.

The Temporal Dimension of Visual Attention Models from Xavier Giro-i-Nieto

}, author = {Assens, Marc}, editor = {McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and Noel E. O{\textquoteright}Connor} } @conference {cMohedanoa, title = {Bags of Local Convolutional Features for Scalable Instance Search}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR)}, year = {2016}, month = {06/2016}, publisher = {ACM}, organization = {ACM}, address = {New York City, NY; USA}, abstract = {

Image representations extracted from convolutional neural networks (CNNs) have been shown to outperform hand-crafted features in multiple computer vision tasks, such as visual image retrieval. This work proposes a simple pipeline for encoding the local activations of a convolutional layer of a pre-trained CNN using the well-known bag of words aggregation scheme (BoW). Assigning each local array of activations in a convolutional layer to a visual word produces an \textit{assignment map}, a compact representation that relates regions of an image with a visual word. We use the assignment map for fast spatial reranking, obtaining object localizations that are used for query expansion. We demonstrate the suitability of the Bag of Words representation based on local CNN features for image retrieval, achieving state-of-the-art performance on the Oxford and Paris buildings benchmarks. We show that our proposed system for CNN feature aggregation with BoW outperforms state-of-the-art techniques using sum pooling at a subset of the challenging TRECVid INS benchmark.

Best poster award at ACM ICMR 2016

Preprint on arXiv

Project page with source code

Post on GitXiv

Overall acceptance rate in ICMR 2016: 30\%\

2016-05-Seminar-AmaiaSalvador-DeepVision from Image Processing Group on Vimeo.

Convolutional Features for Instance Search from Xavier Giro

}, keywords = {Bag of Words, Convolutional Neural Networks, Instance Retrieval}, isbn = {978-1-4503-4359-6}, doi = {http://dx.doi.org/10.1145/2911996.2912061}, url = {http://dx.doi.org/10.1145/2911996.2912061}, author = {Mohedano, Eva and Amaia Salvador and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Marqu{\'e}s, F.} } @conference {cMarsden, title = {Dublin City University and Partners{\textquoteright} Participation in the INS and VTT Tracks at TRECVid 2016}, booktitle = {TRECVID Workshop 2016}, year = {2016}, month = {11/2016}, address = {Gaithersburg, MD, USA}, abstract = {

DCU participated with a consortium of colleagues from NUIG and UPC in two tasks,\ INS and VTT. For the INS task we developed a framework consisting of face detection and\ representation and place detection and representation, with a user annotation of top-ranked\ videos. For the VTT task we ran 1,000 concept detectors from the VGG-16 deep CNN on\ 10 keyframes per video and submitted 4 runs for caption re-ranking, based on BM25, Fusion,\ Word2Vec and a fusion of baseline BM25 and Word2Vec. With the same pre-processing for\ caption generation we used an open source image-to-caption CNN-RNN toolkit NeuralTalk2\ to generate a caption for each keyframe and combine them.

}, url = {http://doras.dcu.ie/21484/}, author = {Marsden, Mark and Mohedano, Eva and McGuinness, Kevin and Calafell, Andrea and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Zhou, Jiang and Azevedo, Lucas and Daubert, Tobias and Davis, Brian and H{\"u}rlimann, Manuela and Afli, Haithem and Du, Jinhua and Ganguly, Debasis and Li, Wei and Way, Andy and Smeaton, Alan F.} } @conference {cPan, title = {Shallow and Deep Convolutional Networks for Saliency Prediction}, booktitle = {IEEE Conference on Computer Vision and Pattern Recognition, CVPR}, year = {2016}, month = {06/2016}, publisher = {Computer Vision Foundation / IEEE}, organization = {Computer Vision Foundation / IEEE}, address = {Las Vegas, NV, USA}, abstract = {

The prediction of salient areas in images has been traditionally addressed with hand-crafted features based on neuroscience principles. This paper, however, addresses the problem with a completely data-driven approach by training a convolutional neural network (convnet). The learning process is formulated as a minimization of a loss function that measures the Euclidean distance of the predicted saliency map with the provided ground truth. The recent publication of large datasets of saliency prediction has provided enough data to train end-to-end architectures that are both fast and accurate. Two designs are proposed: a shallow convnet trained from scratch, and a another deeper solution whose first three layers are adapted from another network trained for classification. To the authors knowledge, these are the first end-to-end CNNs trained and tested for the purpose of saliency prediction.

Project page
Preprint on arXiv
Page on gitXiv
Acceptance rate in CVPR 2016: 29.9\%

}, url = {http://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Pan_Shallow_and_Deep_CVPR_2016_paper.pdf}, author = {Pan, Junting and McGuinness, Kevin and Elisa Sayrol and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xReyesa, title = {Time-sensitive Egocentric Image Retrieval for Fidings Objects in Lifelogs}, year = {2016}, abstract = {

Advisors: Eva Mohedano (Insight DCU), Kevin McGuinness (Insight DCU) and\ Xavier Gir{\'o}-i-Nieto\ (UPC)

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at\ Telecom BCN-ETSETB\ from the Technical University of Catalonia (UPC)

Grade: A (9.4/10.0)

This work explores diverse practices for conducting an object search from large amounts of egocentric images taking into account their temporal information. The application of this technology is to identify where personal belongings were lost or forgotten. We develop a pipeline-structured system. Firstly, the images of the day being scanned are sorted based on their probability to depict the forgotten object. This stage is solved by applying an existing visual search engine based on deep learning features. Secondly, a learned threshold selects the top ranked images as candidates to contain the object. Finally the images are reranked based on temporal and diversity criteria. Furthermore, we build a validation environment for assessing the system{\textquoteright}s performance aiming to find the optimal configuration of its parameters. Due to the lack of related works to be compared with, this thesis proposes an novel evaluation framework and metric to assess the problem.

Project page

[Tfg cristian reyes] time sensitive egocentric image retrieval for finding objects in lifelogs from Xavier Giro

}, author = {Reyes, Cristian}, editor = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xCalafell, title = {Video Retrieval of Specific Persons in Specific Locations}, year = {2016}, abstract = {

Student: Andrea Calafell

Advisors: Eva Mohedano (Insight), Kevin McGuinness (Insight), Noel E. O{\textquoteright}Connor (Insight) and Xavier Gir{\'o}-i-Nieto (UPC)

Program: Master in Computer Vision (Class of 2016)

Grade: A (9.0/10.0)

This thesis explores good practices for improving the detection of specific people in specific places. An approach combining recurrent and convolutional neural network have been considered to perform face detection. However, other more conventional methods have been tested, obtaining the best results by exploiting a deformable part model approach. A CNN is also used to obtain the face feature vectors and, \ with the purpose of helping in the face recognition, an approach to perform query expansion has been also developed. Furthermore, in order to be able to evaluate the different configurations in our non-labelled dataset, a user interface has been used to annotate the images and be able to obtain a precision of the system. Finally, different fusion and normalization strategies has been explored with the aim of combining the scores obtained from the face recognition with the ones obtained in the place recognition.

Video Retrieval of Specific Persons in Specific Locations from Xavier Giro

}, author = {Calafell, Andrea}, editor = {Mohedano, Eva and McGuinness, Kevin and Xavier Gir{\'o}-i-Nieto} } @article {xReyes, title = {Where did I leave my phone ?}, year = {2016}, month = {06/2016}, institution = {4th Workshop on Egocentric (First-Person) Vision, CVPR 2016}, type = {Extended abstract}, address = {Las Vegas, NV, USA}, abstract = {

The interest of users in having their lives digitally recorded has grown in the last years thanks to the advances on wearable sensors.\ Wearable cameras are one of the most informative ones, but they generate large amounts of images that require automatic analysis to build useful applications upon them.\ In this work we explore the potential of these devices to find the last appearance of personal objects among the more than 2,000 images that are generated everyday.\ This application could help into developing personal assistants capable of helping users when they do not remember where they left their personal objects.\ We adapt a previous work on instance search to the specific domain of egocentric vision.

Extended abstract presented as poster in the 4th Workshop on Egocentric (First-Person) Vision,\ CVPR 2016.\

Project page on GitHub

}, author = {Reyes, Cristian and Mohedano, Eva and McGuinness, Kevin and O{\textquoteright}Connor, N. and Xavier Gir{\'o}-i-Nieto} } @conference {cReyes, title = {Where is my Phone? Personal Object Retrieval from Egocentric Images}, booktitle = {Lifelogging Tools and Applications Workshop in ACM Multimedia}, year = {2016}, month = {10/2016}, publisher = {ACM}, organization = {ACM}, address = {Amsterdam, The Netherlands}, abstract = {

This work presents a retrieval pipeline and evaluation scheme for the problem of finding the last appearance of personal objects in a large dataset of images captured from a wearable camera. Each personal object is modelled by a small set of images that define a query for a visual search engine.The retrieved results are reranked considering the temporal timestamps of the images to increase the relevance of the later detections. Finally, a temporal interleaving of the results is introduced for robustness against false detections. The Mean Reciprocal Rank is proposed as a metric to evaluate this problem. This application could help into developing personal assistants capable of helping users when they do not remember where they left their personal belongings.

}, doi = {http://dx.doi.org/10.1145/2983576.2983582}, url = {http://arxiv.org/abs/1608.08139}, author = {Reyes, Cristian and Mohedano, Eva and McGuinness, Kevin and Noel E. O{\textquoteright}Connor and Xavier Gir{\'o}-i-Nieto} } @conference {cMohedano, title = {Exploring EEG for Object Detection and Retrieval}, booktitle = {ACM International Conference on Multimedia Retrieval (ICMR) }, year = {2015}, address = {Shanghai, China}, abstract = {

This paper explores the potential for using Brain Computer Interfaces (BCI) as a relevance feedback mechanism in content-based image retrieval. We investigate if it is possible to capture useful EEG signals to detect if relevant objects are present in a dataset of realistic and complex images. \ We perform several experiments using a rapid serial visual presentation (RSVP) of images at different rates (5Hz and 10Hz) on 8 users with different degrees of familiarization with BCI and the dataset. We then use the feedback from the BCI and mouse-based interfaces to retrieve objects in a subset of TRECVid images. We show that it is indeed possible detect such objects in complex images and, also, that users with previous knowledge on the dataset or experience with the RSVP outperform others. When the users have limited time to annotate the images (100 seconds in our experiments) both interfaces are comparable in performance. Comparing our best users in a retrieval task, we found that EEG-based relevance feedback outperforms mouse-based feedback. The realistic and complex image dataset differentiates our work from previous studies on EEG for image retrieval.\

[Extended version in arXiv:1504.02356]

[ACM ICMR 2015 website]\

Overall acceptance rate: 33\% (source)

}, doi = {10.1145/2671188.2749368}, url = {http://arxiv.org/abs/1504.02356}, author = {Mohedano, Eva and Amaia Salvador and Porta, Sergi and Xavier Gir{\'o}-i-Nieto and Kevin McGuinness and Healy, Graham and O{\textquoteright}Connor, N.} } @article {aMohedano, title = {Improving Object Segmentation by using EEG signals and Rapid Serial Visual Presentation}, journal = {Multimedia Tools and Applications}, year = {2015}, month = {07/2015}, abstract = {

This paper explores the potential of brain-computer interfaces in segmenting objects from images. Our approach is centered around designing an effective method for displaying the image parts to the users such that they generate measurable brain reactions. When a block of pixels is displayed, we estimate the probability of that block containing the object of interest using a score based on EEG activity. After several such blocks are displayed in rapid visual serial presentation, the resulting probability map is binarized and combined with the GrabCut algorithm to segment the image into object and background regions. This study extends our previous work that showed how BCI and simple EEG analysis are useful in locating object boundaries in images

}, issn = {1573-7721}, doi = {10.1007/s11042-015-2805-0}, url = {http://dx.doi.org/10.1007/s11042-015-2805-0}, author = {Mohedano, Eva and Healy, Graham and Kevin McGuinness and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Smeaton, Alan F.} } @conference {cVentura, title = {Improving Spatial Codification in Semantic Segmentation}, booktitle = {IEEE International Conference on Image Processing (ICIP), 2015}, year = {2015}, month = {09/2015}, publisher = {IEEE}, organization = {IEEE}, address = {Quebec City}, abstract = {

This paper explores novel approaches for improving the spatial codification for the pooling of local descriptors to solve the semantic segmentation problem. We propose to partition the image into three regions for each object to be described: Figure, Border and Ground. This partition aims at minimizing the influence of the image context on the object description and vice versa by introducing an intermediate zone around the object contour. Furthermore, we also propose a richer visual descriptor of the object by applying a Spatial Pyramid over the Figure region. Two novel Spatial Pyramid configurations are explored: Cartesian-based and crown-based Spatial Pyramids. We test these approaches with state-of-the-art techniques and show that they improve the Figure-Ground based pooling in the Pascal VOC 2011 and 2012 semantic segmentation challenges.

Preprint in arXiv

Improving Spatial Codification in Semantic Segmentation from Xavier Giro

}, url = {http://arxiv.org/abs/1505.07409}, author = {Ventura, C. and Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Kevin McGuinness and Marqu{\'e}s, F. and O{\textquoteright}Connor, N.} } @article {xVentura, title = {Improving Spatial Codification in Semantic Segmentation (Supplementary Material)}, year = {2015}, month = {09/2015}, abstract = {

This document contains supplementary material for the paper "Improving Spatial Codification in Semantic Segmentation" submitted to ICIP 2015. First, there is a section dedicated to the results obtained by categories when ideal object candidates (ground truth masks) are used. Then, an analysis of the results using CPMC and MCG object candidates also detailed by categories. Finally, visual results for CPMC and MCG are showed.

}, author = {Ventura, C. and Xavier Gir{\'o}-i-Nieto and Ver{\'o}nica Vilaplana and Kevin McGuinness and Marqu{\'e}s, F. and Noel E. O{\textquoteright}Connor} } @conference {cMcGuinnessa, title = {Insight DCU at TRECVID 2015}, booktitle = {TRECVID 2015 Workshop}, year = {2015}, month = {11/2015}, publisher = {NIST}, organization = {NIST}, address = {Gaithersburg, MD, USA}, abstract = {

Insight-DCU participated in the instance search (INS), semantic indexing (SIN), and localization tasks (LOC) this year.

In the INS task we used deep convolutional network features trained on external data and the query data for this year to train our system. We submitted four runs, three based on convolutional network features, and one based on SIFT/BoW. F A insightdcu 1 was an automatic run using features from the last convolutional layer of a deep network with bag-of-words encoding and achieved 0.123 mAP. F A insightdcu 2 modied the previous run to use re-ranking based on an R-CNN model and achieved 0.111 mAP. I A insightdcu 3, our interactive run, achieved 0.269 mAP. Our SIFT-based run F A insightdcu 2 used weak geometric consistency to improve performance over the previous year to 0.187 mAP. Overall we found that using features from the convolutional layers improved performance over features from the fully connected layers used in previous years, and that weak geometric consistency improves performance for local feature ranking.

In the SIN task we again used convolutional network features, this time netuning a network pretrained on external data for the task. We submitted four runs, 2C D A insightdcu.15 1..4 varying the top-level learning algorithm and use of concept co-occurance. 2C D A insightdcu.15 1 used a linear SVM top-level learner, and achieved 0.63 mAP. Exploiting concept co-occurance improved the accuracy of our logistic regression run 2C D A insightdcu.15 3 from 0.058 mAP to 0.6 2C D A insightdcu.15 3.

Our LOC system used training data from IACC.1.B and features similar to our INS run, but using a VLAD encoding instead of a bag-of-words. Unfortunately there was problem with the run that we are still investigating.

Note: UPC and NII participated only in the INS task of this submission.

}, url = {http://www-nlpir.nist.gov/projects/tvpubs/tv.pubs.15.org.html}, author = {Kevin McGuinness and Mohedano, Eva and Amaia Salvador and Zhang, ZhenXing and Marsden, Mark and Wang, Peng and Jargalsaikhan, Iveel and Antony, Joseph and Xavier Gir{\'o}-i-Nieto and Satoh, Shin{\textquoteright}ichi and O{\textquoteright}Connor, N. and Smeaton, Alan F.} } @mastersthesis {xSalvador, title = {Exploiting User Interaction and Object Candidates for Instance Retrieval and Object Segmentation}, year = {2014}, abstract = {

Author: Amaia Salvador-Aguilera

Advisors:\ Xavier Gir{\'o}-i-Nieto\ (UPC) and Kevin McGuinness (Dublin CIty University)

Degree:\ Master in Computer Vision\ (1 year)

Video: Thesis defense

This thesis addresses two of the main challenges nowadays for computer vision: object segmentation and visual instance retrieval. The methodologies proposed to solve both problems are based on the use of object candidates and human computation in the computer vision loop. In the object segmentation side, this work explores how human computation can be useful to achieve better segmentation results, by combining users{\textquoteright} traces with a segmentation algorithm based on object candidates. On the other hand, the instance retrieval problem is also addressed using object candidates to compute local features, and involving the user in the retrieval loop by applying relevance feedback strategies.

}, keywords = {computer Vision, human computing, instance search, object candidates, segmentation}, author = {Amaia Salvador}, editor = {Xavier Gir{\'o}-i-Nieto and Kevin McGuinness} } @conference {cMcGuinness, title = {Insight Centre for Data Analytics (DCU) at TRECVid 2014: Instance Search and Semantic Indexing Tasks}, booktitle = {2014 TRECVID Workshop}, year = {2014}, month = {11/2014}, publisher = {National Institute of Standards and Technology (NIST)}, organization = {National Institute of Standards and Technology (NIST)}, address = {Orlando, Florida (USA)}, abstract = {

Insight-DCU participated in the instance search (INS) and semantic indexing (SIN) tasks in 2014. Two very different approaches were submitted for instance search, one based on features extracted using pre-trained deep convolutional neural networks (CNNs), and another based on local SIFT features, large vocabulary visual bag-of-words aggregation, inverted index-based lookup, and geometric verification on the top-N retrieved results. Two interactive runs and two automatic runs were submitted, the best interactive runs achieved a mAP of 0.135 and the best automatic 0.12. Our semantic indexing runs were based also on using convolutional neural network features, and on Support Vector Machine classifiers with linear and RBF kernels. One run was submitted to the main task, two to the no annotation task, and one to the progress task. Data for the no-annotation task was gathered from Google Images and ImageNet. The main task run has achieved a mAP of 0.086, the best no-annotation runs had a close performance to the main run by achieving a mAP of 0.080, while the progress run had 0.043.

[2014 TREC Video Retrieval Evaluation Notebook Papers and Slides]

Poster \

}, url = {http://hdl.handle.net/2117/24915}, author = {Kevin McGuinness and Mohedano, Eva and Zhang, ZhenXing and Hu, Feiyan and Albatal, Rami and Gurrin, Cathal and O{\textquoteright}Connor, N. and Smeaton, Alan F. and Amaia Salvador and Xavier Gir{\'o}-i-Nieto and Ventura, C.} } @conference {cMohedano, title = {Object segmentation in images using EEG signals}, booktitle = {ACM Multimedia}, year = {2014}, month = {11/2014}, address = {Orlando, Florida (USA)}, abstract = {

This paper explores the potential of brain-computer interfaces in segmenting objects from images. Our approach is centered around designing an effective method for displaying the image parts to the users such that they generate measurable brain reactions. When an image region, specifically a block of pixels, is displayed we estimate the probability of the block containing the object of interest using a score based on EEG activity. After several such blocks are displayed, the resulting probability map is binarized and combined with the GrabCut algorithm to segment the image into object and background regions. This study shows that BCI and simple EEG analysis are useful in locating object boundaries in images.

}, keywords = {Brain-computer interfaces, Electroencephalography, GrabCut algorithm, Interactive segmentation, Object segmentation, rapid serial visual presentation}, doi = {10.1145/2647868.2654896}, url = {http://arxiv.org/abs/1408.4363}, author = {Mohedano, Eva and Healy, Graham and Kevin McGuinness and Xavier Gir{\'o}-i-Nieto and O{\textquoteright}Connor, N. and Smeaton, Alan F.} } @mastersthesis {xMohedano13, title = {Investigating EEG for Saliency and Segmentation Applications in Image Processing}, year = {2013}, abstract = {

Advisors: Kevin McGuinness, Xavier Gir{\'o}-i-Nieto, Noel O{\textquoteright}Connor

School: Dublin City University (Ireland)

The main objective of this project is to implement a new way to compute saliency maps and to locate an object in an image by using a brain-computer interface. To achieve this, the project is centered in designing the proper way to display the different parts of the images to the users in such a way that they generate measurable reactions. Once an image window is shown, the objective is to compute a score based on the EEG activity and compare its result with the current automatic methods to estimate saliency maps. Also, the aim of this work is to use the EEG map as a seed for another segmentation algorithm that will extract the object from the background in an image. This study provides evidence that BCI are useful to find the location of the objects in a simple images via straightforward EEG analysis and this represents the starting point to locate objects in more complex images.