@inproceedings{cGirbau21,
  title     = {Multiple Object Tracking with Mixture Density Networks for Trajectory Estimation},
  booktitle = {CVPR 2021 Robust Video Scene Understanding: Tracking and Video Segmentation (RVSU) Workshop},
  year      = {2021},
  url       = {https://arxiv.org/abs/2106.10950},
  author    = {Girbau, A. and Gir{\'o}-i-Nieto, Xavier and Rius, Ignasi and Marqu{\'e}s, F.},
  abstract  = {Multiple object tracking faces several challenges that may be alleviated with trajectory information. Knowing the posterior locations of an object helps disambiguating and solving situations such as occlusions, re-identification, and identity switching. In this work, we show that trajectory estimation can become a key factor for tracking, and present TrajE, a trajectory estimator based on recurrent mixture density networks, as a generic module that can be added to existing object trackers. To provide several trajectory hypotheses, our method uses beam search. Also, relying on the same estimated trajectory, we propose to reconstruct a track after an occlusion occurs. We integrate TrajE into two state of the art tracking algorithms, CenterTrack [63] and Tracktor [3]. Their respective performances in the MOTChallenge 2017 test set are boosted 6.3 and 0.3 points in MOTA score, and 1.8 and 3.1 in IDF1, setting a new state of the art for the CenterTrack+TrajE configuration.},
}

@article{xEscobar21,
  title         = {Object Model Adaptation for Multiple Object Tracking},
  year          = {2021},
  author        = {Escobar, Miquel},
  editor        = {Girbau, A. and Ventura, C. and Gir{\'o}-i-Nieto, Xavier and Marqu{\'e}s, F.},
  internal-note = {NOTE(review): required journal field is missing for @article; entry has editors (advisors?) and may really be a student thesis or report -- confirm venue and entry type},
  abstract      = {Multiple object tracking is a broadly used task in multiple applications, all the way from bioengineering to security applications. In this paper we propose a variation of RVOS by adding the center estimation of detected instances, by means of a second head in the decoder which is assigned the task of detecting the corresponding object{\textquoteright}s bounding box arithmetic center. We have trained the model using three variants of the cross-entropy loss, which has been adapted to tackle the class imbalance caused by the fact that the center of an object is represented by only one pixel of the image, and have obtained some promising results.},
}

@phdthesis{dGirbau21,
  title         = {Sports broadcasting and multiple object tracking with deep learning methods},
  year          = {2021},
  month         = mar,
  type          = {Industrial},
  author        = {Girbau, A.},
  editor        = {Rius, Ignasi and Gir{\'o}-i-Nieto, Xavier and Marqu{\'e}s, F.},
  internal-note = {NOTE(review): required school field is missing -- presumably Universitat Polit{\`e}cnica de Catalunya given the author affiliations; confirm before use},
  abstract      = {Since less than a decade ago, deep learning techniques started to dominate many different fields, revolutionizing the possibilities of artificial intelligence. Seeing their potential, industrial sectors started to invest in applying such technologies as key components of the company strategy. This thesis has been developed in an industrial context, in AutomaticTV. The main focus along this period has been the transfer of knowledge and know-how between academia and industry, development of tools to exploit this knowledge, the exploration of new techniques for future challenges, and, from an academic research perspective, contributions to the multiple object tracking problem.
The first part of the thesis is devoted to the introduction of deep learning technologies to AutomaticTV, a company dedicated to automatic sports analysis and broadcasting, and the development of tools and tasks that surround the application.
The second part of this thesis introduces the contributions to the multiple object tracking challenge. We present TrajE, a trajectory estimator based on mixture density networks and beam search, used to boost the performance of existing multiple object trackers, and introduce an occlusion reconstruction step using the estimated trajectory information. By adding TrajE to an existing multiple object tracker, we boost its performance by 6.3, 1.8 points in MOTA and IDF1 scores respectively, becoming the new state of the art in the MOTChallenge dataset.},
}

@inproceedings{cVenturaa,
  title        = {{RVOS}: End-to-End Recurrent Network for Video Object Segmentation},
  booktitle    = {CVPR},
  year         = {2019},
  month        = jun,
  publisher    = {OpenCVF / IEEE},
  organization = {OpenCVF / IEEE},
  address      = {Long Beach, CA, USA},
  url          = {http://openaccess.thecvf.com/content_CVPR_2019/html/Ventura_RVOS_End-To-End_Recurrent_Network_for_Video_Object_Segmentation_CVPR_2019_paper.html},
  author       = {Ventura, C. and Bellver, M{\'\i}riam and Girbau, A. and Salvador, Amaia and Marqu{\'e}s, F. and Gir{\'o}-i-Nieto, Xavier},
  abstract     = {Multiple object video object segmentation is a challenging task, specially for the zero-shot case, when no object mask is given at the initial frame and the model has to find the objects to be segmented along the sequence. In our work, we propose RVOS, a recurrent network that is fully end-to-end trainable for multiple object video object segmentation, with a recurrence module working on two different domains: (i) the spatial, which allows to discover the different object instances within a frame, and (ii) the temporal, which allows to keep the coherence of the segmented objects along time. We train RVOS for zero-shot video object segmentation and are the first ones to report quantitative results for DAVIS-2017 and YouTube-VOS benchmarks. Further, we adapt RVOS for one-shot video object segmentation by using the masks obtained in previous time-steps as inputs to be processed by the recurrent module. Our model reaches comparable results to state-of-the-art techniques in YouTube-VOS benchmark and outperforms all previous video object segmentation methods not using online learning in the DAVIS-2017 benchmark.},
}