@inproceedings{cMarsden,
  title     = {{Dublin City University} and Partners' Participation in the {INS} and {VTT} Tracks at {TRECVid} 2016},
  booktitle = {TRECVID Workshop 2016},
  year      = {2016},
  month     = nov,
  address   = {Gaithersburg, MD, USA},
  abstract  = {DCU participated with a consortium of colleagues from NUIG and UPC in two tasks, INS and VTT. For the INS task we developed a framework consisting of face detection and representation and place detection and representation, with a user annotation of top-ranked videos. For the VTT task we ran 1,000 concept detectors from the VGG-16 deep CNN on 10 keyframes per video and submitted 4 runs for caption re-ranking, based on BM25, Fusion, Word2Vec and a fusion of baseline BM25 and Word2Vec. With the same pre-processing for caption generation we used an open source image-to-caption CNN-RNN toolkit NeuralTalk2 to generate a caption for each keyframe and combine them.},
  url       = {http://doras.dcu.ie/21484/},
  author    = {Marsden, Mark and Mohedano, Eva and McGuinness, Kevin and Calafell, Andrea and Gir{\'o}-i-Nieto, Xavier and O'Connor, Noel E. and Zhou, Jiang and Azevedo, Lucas and Daubert, Tobias and Davis, Brian and H{\"u}rlimann, Manuela and Afli, Haithem and Du, Jinhua and Ganguly, Debasis and Li, Wei and Way, Andy and Smeaton, Alan F.}
}

@mastersthesis{xCalafell,
  title         = {Video Retrieval of Specific Persons in Specific Locations},
  year          = {2016},
  abstract      = {This thesis explores good practices for improving the detection of specific people in specific places. An approach combining recurrent and convolutional neural network have been considered to perform face detection. However, other more conventional methods have been tested, obtaining the best results by exploiting a deformable part model approach. A CNN is also used to obtain the face feature vectors and, with the purpose of helping in the face recognition, an approach to perform query expansion has been also developed. Furthermore, in order to be able to evaluate the different configurations in our non-labelled dataset, a user interface has been used to annotate the images and be able to obtain a precision of the system. Finally, different fusion and normalization strategies has been explored with the aim of combining the scores obtained from the face recognition with the ones obtained in the place recognition.},
  note          = {Student: Andrea Calafell. Advisors: Eva Mohedano (Insight), Kevin McGuinness (Insight), Noel E. O'Connor (Insight) and Xavier Gir{\'o}-i-Nieto (UPC). Program: Master in Computer Vision (Class of 2016). Grade: A (9.0/10.0)},
  internal-note = {Required field 'school' missing; program is the (joint) Master in Computer Vision -- confirm awarding institution before adding},
  author        = {Calafell, Andrea},
  editor        = {Mohedano, Eva and McGuinness, Kevin and Gir{\'o}-i-Nieto, Xavier}
}

@inproceedings{cSalvadora,
  title     = {Cultural Event Recognition with Visual {ConvNets} and Temporal Models},
  booktitle = {CVPR ChaLearn Looking at People Workshop 2015},
  year      = {2015},
  month     = jun,
  abstract  = {This paper presents our contribution to the ChaLearn Challenge 2015 on Cultural Event Classification. The challenge in this task is to automatically classify images from 50 different cultural events. Our solution is based on the combination of visual features extracted from convolutional neural networks with temporal information using a hierarchical classifier scheme. We extract visual features from the last three fully connected layers of both CaffeNet (pretrained with ImageNet) and our fine tuned version for the ChaLearn challenge. We propose a late fusion strategy that trains a separate low-level SVM on each of the extracted neural codes. The class predictions of the low-level SVMs form the input to a higher level SVM, which gives the final event scores. We achieve our best result by adding a temporal refinement step into our classification scheme, which is applied directly to the output of each low-level SVM. Our approach penalizes high classification scores based on visual features when their time stamp does not match well an event-specific temporal distribution learned from the training and validation data. Our system achieved the second best result in the ChaLearn Challenge 2015 on Cultural Event Classification with a mean average precision of 0.767 on the test set.},
  url       = {http://www.cv-foundation.org/openaccess/content_cvpr_workshops_2015/W09/papers/Salvador_Cultural_Event_Recognition_2015_CVPR_paper.pdf},
  author    = {Salvador, Amaia and Zeppelzauer, Matthias and Manchon-Vizuete, Daniel and Calafell, Andrea and Gir{\'o}-i-Nieto, Xavier}
}

@mastersthesis{xCalafell-Oros,
  title    = {Fine-tuning a Convolutional Network for Cultural Event Recognition},
  year     = {2015},
  type     = {Bachelor's thesis},
  school   = {Universitat Polit{\`e}cnica de Catalunya},
  abstract = {This thesis explores good practices for improving the performance of an existing convnet trained with a dataset of clean data when an additional dataset of noisy data is available. We develop techniques to clean the noisy data with the help of the clean one, a family of solutions that we will refer to as denoising, and then we explore the best sorting of the clean and noisy datasets during the fine-tuning of a convnet. Then we study strategies to select the subset of images of the clean data that will improve the classification performance, a practice we will refer to as fracking. Next, we determine how many layers are actually better to fine-tune in our convnet, given our amount of data. And finally, we compare the classic convnet architecture where a single network is fine-tuned to solve a multi-class problem with the case of fine-tuning a convnet for binary classification for each considered class.},
  note     = {Advisors: Amaia Salvador (UPC), Matthias Zeppelzauer (FH St P{\"o}lten), Xavier Gir{\'o}-i-Nieto (UPC). Studies: Bachelor Degree in Audiovisual Systems Engineering at Telecom BCN-ETSETB, Technical University of Catalonia (UPC). Grade: A with honors (10/10). Related publication: https://imatge.upc.edu/web/publications/cultural-event-recognition-visual-convnets-and-temporal-models},
  url      = {http://hdl.handle.net/2117/78391},
  author   = {Calafell, Andrea},
  editor   = {Salvador, Amaia and Zeppelzauer, Matthias and Gir{\'o}-i-Nieto, Xavier}
}