@inproceedings{cShou,
  title     = {Online Action Detection in Untrimmed, Streaming Videos},
  booktitle = {European Conference on Computer Vision ({ECCV})},
  year      = {2018},
  month     = feb,
  address   = {Munich, Germany},
  url       = {https://arxiv.org/abs/1802.06822},
  author    = {Shou, Zheng and Pan, Junting and Chan, Johnatan and Miyazawa, Kazuyuki and Mansour, Hassan and Vetro, Anthony and Gir{\'o}-i-Nieto, Xavier and Chang, Shih-Fu}
}

@inproceedings{cPana,
  title     = {{SalGAN}: Visual Saliency Prediction with Generative Adversarial Networks},
  booktitle = {{CVPR} 2017 Scene Understanding Workshop ({SUNw})},
  year      = {2017},
  address   = {Honolulu, Hawaii, USA},
  abstract  = {We introduce SalGAN, a deep convolutional neural network for visual saliency prediction trained with adversarial examples. The first stage of the network consists of a generator model whose weights are learned by back-propagation computed from a binary cross entropy (BCE) loss over downsampled versions of the saliency maps. The resulting prediction is processed by a discriminator network trained to solve a binary classification task between the saliency maps generated by the generative stage and the ground truth ones. Our experiments show how adversarial training allows reaching state-of-the-art performance across different metrics when combined with a widely-used loss function like BCE.},
  url       = {https://arxiv.org/abs/1701.01081},
  author    = {Pan, Junting and Canton-Ferrer, Cristian and McGuinness, Kevin and O{\textquoteright}Connor, N. and Torres, Jordi and Sayrol, Elisa and Gir{\'o}-i-Nieto, Xavier}
}

@inproceedings{cPan,
  title     = {Shallow and Deep Convolutional Networks for Saliency Prediction},
  booktitle = {{IEEE} Conference on Computer Vision and Pattern Recognition ({CVPR})},
  year      = {2016},
  month     = jun,
  publisher = {Computer Vision Foundation / IEEE},
  address   = {Las Vegas, NV, USA},
  abstract  = {The prediction of salient areas in images has been traditionally addressed with hand-crafted features based on neuroscience principles. This paper, however, addresses the problem with a completely data-driven approach by training a convolutional neural network (convnet). The learning process is formulated as a minimization of a loss function that measures the Euclidean distance of the predicted saliency map with the provided ground truth. The recent publication of large datasets of saliency prediction has provided enough data to train end-to-end architectures that are both fast and accurate. Two designs are proposed: a shallow convnet trained from scratch, and another, deeper solution whose first three layers are adapted from another network trained for classification. To the authors' knowledge, these are the first end-to-end CNNs trained and tested for the purpose of saliency prediction.},
  url       = {http://www.cv-foundation.org/openaccess/content_cvpr_2016/papers/Pan_Shallow_and_Deep_CVPR_2016_paper.pdf},
  author    = {Pan, Junting and McGuinness, Kevin and Sayrol, Elisa and O{\textquoteright}Connor, N. and Gir{\'o}-i-Nieto, Xavier}
}

@misc{xPan,
  title         = {End-to-end Convolutional Network for Saliency Prediction},
  year          = {2015},
  month         = jul,
  eprint        = {1507.01422},
  archiveprefix = {arXiv},
  abstract      = {The prediction of saliency areas in images has been traditionally addressed with hand crafted features based on neuroscience principles. This paper however addresses the problem with a completely data-driven approach by training a convolutional network. The learning process is formulated as a minimization of a loss function that measures the Euclidean distance of the predicted saliency map with the provided ground truth. The recent publication of large datasets of saliency prediction has provided enough data to train a not very deep architecture which is both fast and accurate. The convolutional network in this paper, named JuntingNet, won the LSUN 2015 challenge on saliency prediction with a superior performance in all considered metrics.},
  url           = {http://arxiv.org/abs/1507.01422},
  author        = {Pan, Junting and Gir{\'o}-i-Nieto, Xavier}
}

@mastersthesis{xPana,
  title    = {Visual Saliency Prediction using Deep Learning Techniques},
  school   = {Technical University of Catalonia ({UPC})},
  type     = {Bachelor's thesis},
  year     = {2015},
  note     = {Advisor: Xavier Gir{\'o}-i-Nieto. Bachelor degree in Science and Telecommunication Technologies Engineering at Telecom BCN-ETSETB. Grade: A with honors (9.9/10.0)},
  abstract = {A saliency map is a model that predicts eye fixations on a visual scene. The prediction of saliency areas in images has been traditionally addressed with hand crafted features inspired on neuroscience principles. This work however addresses the problem with a completely data-driven approach by training a convolutional network. The recent publication of large datasets of saliency prediction has provided enough data to train a not very deep network architecture which is both fast and accurate. In our system, named JuntingNet, the learning process is formulated as a minimization of a loss function that measures the Euclidean distance of the predicted saliency map with the provided ground truth. The convolutional network developed in this work, named JuntingNet, won the CVPR Large-scale Scene UNderstanding (LSUN) 2015 challenge on saliency prediction with a superior performance in all considered metrics.},
  url      = {https://imatge.upc.edu/web/resources/end-end-convolutional-networks-saliency-prediction-software},
  author   = {Pan, Junting},
  editor   = {Gir{\'o}-i-Nieto, Xavier}
}