@inproceedings{cDuartea,
  title     = {{Wav2Pix}: Speech-conditioned Face Generation using Generative Adversarial Networks},
  booktitle = {IEEE International Conference on Acoustics, Speech and Signal Processing ({ICASSP})},
  year      = {2019},
  month     = may,
  publisher = {IEEE},
  address   = {Brighton, UK},
  abstract  = {Speech is a rich biometric signal that contains information about the identity, gender and emotional state of the speaker. In this work, we explore its potential to generate face images of a speaker by conditioning a Generative Adversarial Network (GAN) with raw speech input. We propose a deep neural network that is trained from scratch in an end-to-end fashion, generating a face directly from the raw speech waveform without any additional identity information (e.g.\ reference image or one-hot encoding). Our model is trained in a self-supervised fashion by exploiting the audio and visual signals naturally aligned in videos. With the purpose of training from video data, we present a novel dataset collected for this work, with high-quality videos of ten youtubers with notable expressiveness in both the speech and visual signals.},
  doi       = {10.1109/ICASSP.2019.8682970},
  url       = {http://hdl.handle.net/2117/167073},
  author    = {Duarte, Amanda and Rold{\'a}n, Francisco and Tubau, Miquel and Escur, Janna and Pascual-deLaPuente, Santiago and Salvador, Amaia and Mohedano, Eva and McGuinness, Kevin and Torres, Jordi and Gir{\'o}-i-Nieto, Xavier},
}