@mastersthesis {xOriola, title = {Species-agnostic Local Ancestry Inference on Genomic Data with Convolutions}, year = {2021}, abstract = {

Local Ancestry Inference (LAI) is the high-resolution prediction of ancestry (African, European, ...) along a DNA sequence. LAI is becoming increasingly important in DNA sequence analysis for the study of human ancestry and migrations, and it is also necessary for polygenic risk score research (the prediction of traits and disease risk). Most current LAI models are built for a specific species, set of ancestries and chromosomes, so a new model must be trained from scratch for every slightly different setting. This creates a substantial barrier for research and industry to move across LAI scenarios. In this thesis we present SALAI-Net, the first statistical method for LAI with a reference panel that can be used on any set of species and ancestries (species-agnostic). Loter, the state of the art among species-agnostic models with a reference panel, is based on a dynamic programming algorithm; however, it is slow and does not perform well with small reference panels. Our model is based on a novel hand-engineered template matching block followed by a convolutional smoothing filter optimized to minimize cross-entropy loss on a training dataset. The right choice of DNA sequence encoding, similarity features and architecture is what allows our model to generalize well to unseen ancestries, species and chromosomes. We benchmark our models on whole-genome human data and test their ability to generalize to dogs when trained on human data. Our models outperform the state-of-the-art method by a large margin in accuracy across different settings and datasets, while being up to two orders of magnitude faster. Our model also shows close to no generalization gap when switching between species.
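
To make the architecture concrete, the following is a minimal PyTorch sketch of the idea described above: alleles encoded as +1/-1, a template-matching step that scores each reference haplotype per window, and a learned 1-D convolutional smoothing of those scores. All names, window sizes and shapes are illustrative assumptions, not the SALAI-Net implementation.

    import torch
    import torch.nn as nn

    class MatchAndSmooth(nn.Module):
        # Illustrative sketch, not the thesis code: window size and kernel are assumptions.
        def __init__(self, window=75, kernel=5):
            super().__init__()
            self.window = window
            # learned 1-D smoothing filter over per-window similarity scores
            self.smooth = nn.Conv1d(1, 1, kernel_size=kernel, padding=kernel // 2)

        def forward(self, query, refs):
            # query: (snps,) and refs: (n_refs, snps), both encoded as +1/-1 alleles
            agree = query.unsqueeze(0) * refs                      # +1 where alleles match
            n_win = agree.shape[1] // self.window
            agree = agree[:, : n_win * self.window]
            scores = agree.view(refs.shape[0], n_win, self.window).mean(-1)
            return self.smooth(scores.unsqueeze(1)).squeeze(1)     # smoothed (n_refs, n_win)

    # toy usage: 8 reference haplotypes of 1500 SNPs, i.e. 20 windows of 75 SNPs
    q = torch.randint(0, 2, (1500,)).float() * 2 - 1
    r = torch.randint(0, 2, (8, 1500)).float() * 2 - 1
    print(MatchAndSmooth()(q, r).shape)  # torch.Size([8, 20])
    # during training, per-window scores would be aggregated per ancestry and the
    # smoothing filter optimized with a cross-entropy loss, as stated in the abstract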

}, author = {Oriol, Benet}, editor = {Mas-Montserrat, Daniel and Ioannidis, Alexander G. and Xavier Gir{\'o}-i-Nieto} } @conference {cBernal, title = {Transcription-Enriched Joint Embeddings for Spoken Descriptions of Images and Videos}, booktitle = {CVPR 2020 Workshop on Egocentric Perception, Interaction and Computing}, year = {2020}, month = {06/2020}, publisher = {arXiv}, organization = {arXiv}, address = {Seattle, WA, USA}, abstract = {

In this work, we propose an effective approach for training unique embedding representations by combining three simultaneous modalities: images and spoken and textual narratives. The proposed methodology departs from a baseline system that learns an embedding space trained with only spoken narratives and image cues. Our experiments on the EPIC-Kitchens and Places Audio Caption datasets show that introducing the human-generated textual transcriptions of the spoken narratives helps the training procedure, yielding better embedding representations. The triad of speech, image and words allows for a better estimate of the point embedding and improves performance on tasks such as image and speech retrieval, even when the third modality, text, is not present in the task.
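
As a rough illustration of the kind of system described (not the paper's code), the sketch below projects precomputed image, speech and text features into one shared space and trains it with a simple pairwise margin ranking loss over all modality pairs; the feature dimensions and loss details are assumptions.

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class TriModalEmbed(nn.Module):
        # hypothetical feature dimensions for image, speech and text descriptors
        def __init__(self, d_img=2048, d_speech=1024, d_text=768, d_joint=512):
            super().__init__()
            self.img = nn.Linear(d_img, d_joint)
            self.speech = nn.Linear(d_speech, d_joint)
            self.text = nn.Linear(d_text, d_joint)

        def forward(self, img, speech, text):
            # L2-normalise so dot products are cosine similarities
            return (F.normalize(self.img(img), dim=-1),
                    F.normalize(self.speech(speech), dim=-1),
                    F.normalize(self.text(text), dim=-1))

    def pair_loss(a, b, margin=0.1):
        # matching pairs (the diagonal) should score higher than any
        # mismatched pair in the batch by at least `margin`
        sim = a @ b.t()
        pos = sim.diag().unsqueeze(1)
        cost = (margin + sim - pos).clamp(min=0)
        mask = 1.0 - torch.eye(sim.size(0), device=sim.device)
        return (cost * mask).mean()

    model = TriModalEmbed()
    img, speech, text = torch.randn(4, 2048), torch.randn(4, 1024), torch.randn(4, 768)
    zi, zs, zt = model(img, speech, text)
    loss = pair_loss(zi, zs) + pair_loss(zi, zt) + pair_loss(zs, zt)
    loss.backward()  # retrieval at test time only needs the two modalities involved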

}, author = {Oriol, Benet and Luque, J. and Diego, Ferran and Xavier Gir{\'o}-i-Nieto} } @conference {cOriol, title = {Hate Speech in Pixels: Detection of Offensive Memes towards Automatic Moderation}, booktitle = {NeurIPS 2019 Workshop on AI for Social Good}, year = {2019}, month = {09/2019}, address = {Vancouver, Canada}, abstract = {

This work addresses the challenge of hate speech detection in Internet memes, and attempts to use visual information to automatically detect hate speech, unlike any previous work we are aware of. Memes are pixel-based multimedia documents that contain photos or illustrations together with phrases which, when combined, usually adopt a funny meaning. However, hate memes are also used to spread hate through social networks, so their automatic detection would help reduce their harmful societal impact. Our results indicate that the model can learn to detect some of the memes, but that the task is far from being solved with this simple architecture. While previous work focuses on linguistic hate speech, our experiments indicate that in memes the visual modality can be much more informative for hate speech detection than the linguistic one. In our experiments, we built a dataset of 5,020 memes to train and evaluate a multi-layer perceptron over the visual and language representations, either independently or fused.
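
For illustration only (dimensions and details are assumptions, not the paper's exact setup), the multi-layer perceptron mentioned above could look like the following, trained over precomputed visual and language features either per modality or fused by concatenation.

    import torch
    import torch.nn as nn

    def make_mlp(d_in, d_hidden=256, n_classes=2):
        # small classifier head: hate vs. not hate
        return nn.Sequential(nn.Linear(d_in, d_hidden), nn.ReLU(),
                             nn.Linear(d_hidden, n_classes))

    img_only = make_mlp(2048)          # visual representation only (assumed dim)
    txt_only = make_mlp(768)           # language representation only (assumed dim)
    fused    = make_mlp(2048 + 768)    # early fusion by concatenation

    img_feat, txt_feat = torch.randn(8, 2048), torch.randn(8, 768)
    logits = fused(torch.cat([img_feat, txt_feat], dim=-1))   # (8, 2)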


}, author = {Oriol, Benet and Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} } @mastersthesis {xOriol, title = {Multimodal Hate Speech Detection in Memes}, year = {2019}, abstract = {

This thesis explores a multimodal approach to hate speech detection, involving vision and language (text). More specifically, we deal with memes, a form of Internet humour that presents additional challenges. We first gather meme data from different sources to create a hate memes dataset for this task. We then use this data to train and evaluate statistical models based on state-of-the-art neural networks. We study different ways to fine-tune pretrained descriptors for our specific task, and we propose a way to add expert knowledge into the system and orient it towards a real-world issue-solving system. We also discuss ways to deal with the limited amount of data, experimenting with a self-supervised learning approach for pre-training. Finally, we compare the contribution of each modality to the overall performance of the model.
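
As a generic sketch of the fine-tuning choices discussed above (stand-in modules and dimensions are assumptions, not the thesis code), one can freeze a pretrained descriptor and train only a small task head, or additionally unfreeze its top layers:

    import torch
    import torch.nn as nn

    descriptor = nn.Sequential(            # stand-in for a pretrained image/text descriptor
        nn.Linear(2048, 1024), nn.ReLU(),
        nn.Linear(1024, 512), nn.ReLU())
    head = nn.Linear(512, 2)               # hate / not-hate classification head

    for p in descriptor.parameters():      # option 1: keep the whole descriptor frozen
        p.requires_grad = False
    for p in descriptor[-2:].parameters(): # option 2: also fine-tune its top block
        p.requires_grad = True

    params = [p for p in list(descriptor.parameters()) + list(head.parameters())
              if p.requires_grad]
    optimizer = torch.optim.Adam(params, lr=1e-4)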

}, author = {Oriol, Benet}, editor = {Cristian Canton-Ferrer and Xavier Gir{\'o}-i-Nieto} }