@mastersthesis {xMasuda-Mora, title = {Open-Ended Visual Question-Answering}, year = {2016}, abstract = {

Advisors: Santiago Pascual de la Puente and Xavier Gir{\'o}-i-Nieto

Studies: Bachelor degree in Science and Telecommunication Technologies Engineering at Telecom BCN-ETSETB, Technical University of Catalonia (UPC)

Grade: A with honors (10/10.0)

This thesis studies methods to solve Visual Question-Answering (VQA) tasks with a Deep Learning framework. As a preliminary step, we explore Long Short-Term Memory (LSTM) networks used in Natural Language Processing (NLP) to tackle text-based Question-Answering. We then modify the previous model to accept an image as input in addition to the question. For this purpose, we explore the VGG-16 and K-CNN convolutional neural networks to extract visual features from the image. These features are merged with the word embedding or with a sentence embedding of the question to predict the answer. This work was successfully submitted to the Visual Question Answering Challenge 2016, where it achieved an accuracy of 53.62\% on the test dataset. The developed software follows best programming practices and Python code style, providing a consistent baseline in Keras for different configurations. The source code and models are publicly available at https://github.com/imatge-upc/vqa-2016-cvprw.
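A minimal sketch of the fusion idea described above, assuming precomputed VGG-16 fc7 image features and illustrative vocabulary, sequence-length, and layer sizes (this is not the exact configuration released in the repository):

# Sketch only: merge precomputed VGG-16 image features with an LSTM
# sentence embedding of the question, then classify over candidate answers.
# All dimensions below are illustrative assumptions.
from keras.layers import Input, Embedding, LSTM, Dense, concatenate
from keras.models import Model

vocab_size, max_q_len, num_answers = 10000, 25, 1000  # assumed values

img_feats = Input(shape=(4096,), name='vgg16_features')   # precomputed visual features
question = Input(shape=(max_q_len,), name='question_tokens')

q_embed = Embedding(vocab_size, 300)(question)   # word embeddings of the question
q_encode = LSTM(512)(q_embed)                    # sentence embedding of the question

merged = concatenate([img_feats, q_encode])      # fuse visual and textual features
answer = Dense(num_answers, activation='softmax')(merged)

model = Model(inputs=[img_feats, question], outputs=answer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])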

}, url = {https://upcommons.upc.edu/handle/2117/89671}, author = {Masuda-Mora, Issey}, editor = {Pascual-deLaPuente, Santiago and Gir{\'o}-i-Nieto, Xavier} }