@conference {cTarresa, title = {Sign Language Translation from Instructional Videos}, booktitle = {CVPR 2023 Women in Computer Vision Workshop}, year = {2023}, month = {04/2023}, publisher = {Computer Vision Foundation / IEEE}, organization = {Computer Vision Foundation / IEEE}, address = {Vancouver, Canada}, abstract = {
The advances in automatic sign language translation (SLT) to spoken languages have mostly been benchmarked with datasets of limited size and restricted domains. Our work advances the state of the art by providing the first baseline results on How2Sign, a large and broad dataset. We train a Transformer over I3D video features, using the reduced BLEU as the reference metric for validation instead of the widely used BLEU score. We report a BLEU score of 8.03 and publish the first open-source implementation of its kind to promote further advances.
The Sign Language Translation (SLT) task has been addressed with multiple approaches in recent years. In this work we aim to investigate the impact of using different types of visual sign language representations for SLT. For this investigation we use the state of the art in SLT, the Sign Language Transformers model. We compare the translation performance obtained with two types of body pose estimation models used as skeleton extractors, and with 2D CNN features trained on the test dataset. The latter perform best, and I3D features outperform the pose estimation-based ones.
}, author = {Maram A. Mohamed}, editor = {Laia Tarr{\'e}s and Xavier Gir{\'o}-i-Nieto} } @article {xBudria22, title = {Multimodal 3D Hand Pose Enhancement for Sign Language}, year = {2022}, abstract = {The application of recent deep learning breakthroughs to the domain of sign language has yielded very promising results. However, sign language processing systems depend on large amounts of labeled high-quality data to work properly. Current hand pose estimation methods are often unreliable and do not always produce estimates of sufficient quality. To mitigate this issue, we explore the applicability of the novel Body2Hands method to obtain high-quality hand pose estimates.
Introduction to Research, BSc Data Science and Engineering, Autumn 2021:
The end goal of Sign Language Translation is either to produce spoken sentences from sign videos or to generate sign videos from their corresponding written transcriptions. This task has been addressed with multiple approaches in recent years. Moreover, it has been shown that taking advantage of sign gloss representations substantially improves the model{\textquoteright}s performance on this task. Therefore, in this work we replicate the state-of-the-art Transformer-based approach to the task and evaluate it on the multimodal American Sign Language How2Sign dataset. Furthermore, we provide baseline recognition and translation results that represent a starting point for further research on the topic. In addition, we provide a new sentence-based alignment for the How2Sign videos, as their existing alignment was with speech, which we have used to tackle the Sign Language Translation task properly.
Sign Language Translation is an open problem whose goal is to generate written sentences from sign videos. In recent years, research in this field has mainly addressed the Sign Language Recognition task, which consists of understanding the input signs and transcribing them into sequences of annotations. Moreover, current studies show that taking advantage of the latter task helps to learn meaningful representations and can be seen as an intermediate step towards the end goal of translation.
In this work, we present a method to generate automatic pseudo-glosses from written sentences, which can serve as a replacement for real glosses. This addresses the issue of gloss collection, which requires manual annotation and is extremely costly.
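As a hypothetical illustration (the exact rules of the proposed method may differ), simple lemmatization and stopword-removal rules could reduce the sentence {\textquotedblleft}she is going to the store{\textquotedblright} to the pseudo-gloss sequence SHE GO STORE.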
Furthermore, we introduce a new implementation, built on Fairseq, of the Transformer-based approach introduced by Camgoz et al., which is jointly trained to solve the recognition and translation tasks. In addition, we provide new baseline results with both implementations: first, on the Phoenix dataset, we present results that outperform those reported by Camgoz et al., and, second, on the How2Sign dataset, we present the first results on the translation task. These results can serve as a baseline for future research in the field.
This paper describes the system developed at the Universitat Polit{\`e}cnica de Catalunya for the Workshop on Machine Translation 2022 Sign Language Translation Task, in particular, for the sign-to-text direction. We use a Transformer model implemented with the Fairseq modeling toolkit. We have experimented with the vocabulary size, data augmentation techniques and pretraining the model with the PHOENIX-14T dataset. Our system obtains a 0.50 BLEU score on the test set, improving the organizers{\textquoteright} baseline by 0.38 BLEU. We note the poor results for both the baseline and our system, and thus the limited reliability of our findings.
}, author = {Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Xavier Gir{\'o}-i-Nieto and Jordi Torres} } @mastersthesis {xBudria, title = {Topic Detection from Sign Language Videos}, year = {2022}, abstract = {Significant progress has been made recently on challenging tasks in automatic sign language understanding, such as sign language recognition, translation and production. However, most works have focused on datasets with relatively few samples, short recordings and limited vocabulary and signing space. Moreover, they have neglected the less complex task of sign language video classification, whose analogue in spoken language, namely text classification, has been widely addressed. For this reason, in this work, we introduce the novel task of sign language topic detection. We base our experiments on How2Sign, a large-scale video dataset spanning multiple semantic domains. The contributions of this thesis are twofold. First, we present the first study of sign language topic detection in continuous sign language videos, providing baseline models for this task. Second, we perform a comparison between different visual features and deep learning architectures that are commonly employed in the sign language understanding literature. We implement our modelling pipelines in Fairseq, a PyTorch library that is extensively used in the spoken language community. Modular, extensible code for running our experiments is provided alongside this thesis.
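Formally (notation ours), topic detection can be framed as standard video classification: given visual features $v_{1:T}$ extracted from a signing video, predict the topic $\hat{c} = \arg\max_{c \in \mathcal{C}} p(c \mid v_{1:T})$ over a fixed set of semantic categories $\mathcal{C}$.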
}, author = {{\'A}lvaro Budria}, editor = {Laia Tarr{\'e}s and Xavier Gir{\'o}-i-Nieto} } @conference {cBudria, title = {Topic Detection in Continuous Sign Language Videos}, booktitle = {Accessibility, Vision, and Autonomy Meet (AVA) CVPR Workshop}, year = {2022}, month = {06/2022}, abstract = {Significant progress has been made recently on challenging tasks in automatic sign language understanding, such as sign language recognition, translation and production. However, these works have focused on datasets with relatively few samples, short recordings and limited vocabulary and signing space. In this work, we introduce the novel task of sign language topic detection. We base our experiments on How2Sign, a large-scale video dataset spanning multiple semantic domains. We provide strong baselines for the task of topic detection, and present a comparison between different visual features commonly used in the domain of sign language.
}, author = {{\'A}lvaro Budria and Laia Tarr{\'e}s and Gerard I. G{\'a}llego and Moreno, Francesc and Jordi Torres and Xavier Gir{\'o}-i-Nieto} } @conference {cTarres, title = {GAN-based Image Colourisation with Feature Reconstruction Loss}, booktitle = {CVPR 2021 Women in Computer Vision Workshop}, year = {2021}, month = {06/2021}, address = {Virtual}, abstract = {Image colourisation is the task of adding plausible colour to grayscale images. This transformation requires obtaining a three-dimensional colour-valued mapping from a real-valued grayscale image, which leads to an underdetermined problem, because the grayscale semantics and texture provide cues for multiple possible colour mappings. The goal of image colourisation is not to recover the ground truth colour, but to produce a colourisation that is perceived as natural by a human observer. Our work takes as a baseline a scheme based on an end-to-end trainable convolutional neural network (CNN) trained with a smooth L1 loss to predict the $ab$ channels of a colour image given the $L$ channel. We introduce an extra perceptual reconstruction loss during training to improve the capabilities of the adversarial model that we adopt as a baseline.
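For reference (notation ours, assuming the standard smooth L1 formulation with a unit transition point), the baseline loss on a per-pixel residual $x$ is $\ell(x) = 0.5x^2$ if $|x| < 1$ and $\ell(x) = |x| - 0.5$ otherwise, averaged over the predicted $ab$ channels.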
}, author = {Laia Tarr{\'e}s and G{\'o}rriz, Marc and Xavier Gir{\'o}-i-Nieto and Mrak, Marta} } @mastersthesis {xTarres21, title = {GAN-based Image Colourisation with Feature Reconstruction Loss}, year = {2021}, abstract = {Automatic image colourisation is a complex and ambiguous task due to having multiple correct solutions. Previous approaches have produced desaturated results unless they rely on significant user interaction. In this thesis we study the state of the art in colourisation and propose an automatic colourisation approach based on generative adversarial networks that incorporates a feature reconstruction loss during training. The generative network is framed in an adversarial model that learns how to colourise by incorporating a perceptual understanding of colour. Qualitative and quantitative results show the capacity of the proposed method to colourise images in a realistic way, boosting the colourfulness and perceptual realism of previous GAN-based methodologies. We also study and propose a second approach that incorporates segmentation information into the GAN framework, for which we also report quantitative and qualitative results.
}, author = {Laia Tarr{\'e}s}, editor = {Mrak, Marta and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xIturralde, title = {Towards video alignment across cameras with sign language 2D poses}, year = {2021}, abstract = {This thesis is part of a project from the Image Group at UPC focused on sign language translation using deep learning technologies. It builds on top of an existing database called How2Sign, which contains more than 83 hours of sign language translation videos. This database has textual annotations aligned to a front RGB camera. The same scenes are also captured by a side RGB camera and a front RGB-D camera. These three cameras are not synchronized, so it is necessary to align the segments annotated on the front RGB camera to the other cameras. This thesis explores a solution based on the cross-correlation operator. Our work processes the coordinates of the joints of the subject appearing in the videos, rather than operating on pixels as in conventional image or video processing. The first part of this thesis investigates the properties of the cross-correlation function by locating short video segments within a long recording based on automatically extracted 2D human poses; the experiments studied the impact of adding noise. The second part applies cross-correlation to align two videos of the same scene recorded with different cameras from different points of view.
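In a minimal formulation of this approach (notation ours), let $x[n]$ and $y[n]$ be a joint-coordinate time series from the two recordings. The discrete cross-correlation $(x \star y)[k] = \sum_{n} x[n]\,y[n+k]$ is evaluated over candidate lags $k$, and the temporal offset between the cameras is estimated as $\hat{k} = \arg\max_{k} (x \star y)[k]$.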
Convolutional Neural Networks have gained popularity in recent years due to their performance in image analysis, both in classification and segmentation. Especially in the medical field, it is increasingly common to use automatic techniques to help specialists with diagnosis.
In this thesis, the problem of skin lesion classification is studied. The study is based on the ISIC Challenges, given the collaboration with Hospital Cl{\'\i}nic de Barcelona, and we contribute to the development of the database for the ISIC Challenge 2019.
One of the key points of this work is obtaining a model that accurately classifies the provided database. To do so, we study residual neural networks and an ensemble of them to further improve the results.
The purpose of this project is therefore the study, analysis and evaluation of variants and modifications of residual neural networks, adapted to our problem through an ensemble of them. In the process, the neural network must also tackle the problem of class imbalance.
}, author = {Laia Tarr{\'e}s}, editor = {Ver{\'o}nica Vilaplana and Marc Combalia} }