@inproceedings{cIndia18,
  title     = {{UPC} Multimodal Speaker Diarization System for the 2018 {Albayzin} Challenge},
  booktitle = {IberSpeech 2018},
  year      = {2018},
  month     = nov,
  address   = {Barcelona},
  abstract  = {This paper presents the UPC system proposed for the Multimodal Speaker Diarization task of the 2018 Albayzin Challenge. This approach works by processing individually the speech and the image signal. In the speech domain, speaker diarization is performed using identity embeddings created by a triplet loss DNN that uses i-vectors as input. The triplet DNN is trained with an additional regularization loss that minimizes the variance of both positive and negative distances. A sliding window is then used to compare speech segments with enrollment speaker targets using cosine distance between the embeddings. To detect identities from the face modality, a face detector followed by a face tracker has been used on the videos. For each cropped face a feature vector is obtained using a Deep Neural Network based on the ResNet 34 architecture, trained using a metric learning triplet loss (available from dlib library). For each track the face feature vector is obtained by averaging the features obtained for each one of the frames of that track. Then, this feature vector is compared with the features extracted from the images of the enrollment identities. The proposed system is evaluated on the RTVE2018 database.},
  keywords  = {Face Diarization, Multimodal Systems, Speaker Diarization},
  author    = {India, M. and Sagastiberri, I. and Palau, P. and Sayrol, Elisa and Morros, J. R. and Hernando, J.},
}

@inproceedings{cIndia,
  title     = {{UPC} System for the 2016 {MediaEval} Multimodal Person Discovery in Broadcast {TV} Task},
  booktitle = {MediaEval 2016 Workshop},
  year      = {2016},
  month     = oct,
  address   = {Hilversum, The Netherlands},
  abstract  = {The UPC system works by extracting monomodal signal segments (face tracks, speech segments) that overlap with the person names overlaid in the video signal. These segments are assigned directly with the name of the person and used as a reference to compare against the non-overlapping (unassigned) signal segments. This process is performed independently both on the speech and video signals. A simple fusion scheme is used to combine both monomodal annotations into a single one.},
  author    = {India, M. and Mart{\'\i}, G. and Cotillas, C. and Bouritsas, G. and Sayrol, Elisa and Morros, J. R. and Hernando, J.},
}

@inproceedings{cIndia15,
  title     = {{UPC} System for the 2015 {MediaEval} Multimodal Person Discovery in Broadcast {TV} Task},
  booktitle = {MediaEval 2015 Workshop},
  year      = {2015},
  month     = sep,
  address   = {Wurzen, Germany},
  abstract  = {This paper describes a system to identify people in broadcast TV shows in a purely unsupervised manner. The system outputs the identity of people that appear, talk and can be identified by using information appearing in the show (in our case, text with person names). Three types of monomodal technologies are used: speech diarization, video diarization and text detection / named entity recognition. These technologies are combined using a linear programming approach where some restrictions are imposed.},
  author    = {India, M. and Varas, David and Vilaplana, Ver{\'o}nica and Morros, J. R. and Hernando, J.},
}