@mastersthesis{xMontes,
  title    = {Temporal Activity Detection in Untrimmed Videos with Recurrent Neural Networks},
  author   = {Montes, Alberto},
  school   = {Universitat Polit{\`e}cnica de Catalunya (UPC)},
  year     = {2016},
  type     = {Bachelor's thesis},
  note     = {Advisors: Amaia Salvador and Xavier Gir{\'o}-i-Nieto. Bachelor degree in Science and Telecommunication Technologies Engineering at Telecom BCN-ETSETB, Universitat Polit{\`e}cnica de Catalunya (UPC). Grade: A (9.8/10.0)},
  keywords = {deep learning, neural networks, videos},
  abstract = {This thesis explore different approaches using Convolutional and Recurrent Neural Networks to classify and temporally localize activities on videos, furthermore an implementation to achieve it has been proposed. As the first step, features have been extracted from video frames using an state of the art 3D Convolutional Neural Network. This features are fed in a recurrent neural network that solves the activity classification and temporally location tasks in a simple and flexible way. Different architectures and configurations have been tested in order to achieve the best performance and learning of the video dataset provided. In addition it has been studied different kind of post processing over the trained network{\textquoteright}s output to achieve a better results on the temporally localization of activities on the videos. The results provided by the neural network developed in this thesis have been submitted to the ActivityNet Challenge 2016 of the CVPR, achieving competitive results using a simple and flexible architecture.},
}

@inproceedings{cMontes,
  title     = {Temporal Activity Detection in Untrimmed Videos with Recurrent Neural Networks},
  author    = {Montes, Alberto and Salvador, Amaia and Pascual-deLaPuente, Santiago and Gir{\'o}-i-Nieto, Xavier},
  booktitle = {1st {NIPS} Workshop on Large Scale Computer Vision Systems 2016},
  year      = {2016},
  month     = dec,
  url       = {https://imatge-upc.github.io/activitynet-2016-cvprw/},
  abstract  = {This work proposes a simple pipeline to classify and temporally localize activities in untrimmed videos. Our system uses features from a 3D Convolutional Neural Network (C3D) as input to train a a recurrent neural network (RNN) that learns to classify video clips of 16 frames. After clip prediction, we post-process the output of the RNN to assign a single activity label to each video, and determine the temporal boundaries of the activity within the video. We show how our system can achieve competitive results in both tasks with a simple architecture. We evaluate our method in the ActivityNet Challenge 2016, achieving a 0.5874 mAP and a 0.2237 mAP in the classification and detection tasks, respectively. Our code and models are publicly available at: https://imatge-upc.github.io/activitynet-2016-cvprw/},
}