% Audio-visual acoustic event detection (AED) publications, 2008-2011.
% Conventions used below: one field per line, authors in "Last, First" form,
% page ranges written with a double hyphen, month given as a bare macro.
% In the surname Giro-i-Nieto the lowercase particle i is brace-protected so
% that classic BibTeX (which splits name tokens on hyphens) does not parse it
% as a "von" part and break the surname apart.

% Journal article: feature-level fusion of audio and video for AED.
@article{aButko11,
  author   = {Butko, T. and Canton-Ferrer, Cristian and Segura, C. and Gir{\'o}-{i}-Nieto, Xavier and Nadeu, C. and Hernando, J. and Casas, J.},
  title    = {Acoustic event detection based on feature-level fusion of audio and video modalities},
  journal  = {{EURASIP} Journal on Advances in Signal Processing},
  volume   = {2011},
  year     = {2011},
  pages    = {1--11},
  issn     = {1687-6172},
  doi      = {10.1155/2011/485738},
  url      = {http://www.hindawi.com/journals/asp/2011/485738/},
  abstract = {Acoustic event detection (AED) aims at determining the identity of sounds and their temporal position in audio signals. When applied to spontaneously generated acoustic events, AED based only on audio information shows a large amount of errors, which are mostly due to temporal overlaps. Actually, temporal overlaps accounted for more than 70\% of errors in the real-world interactive seminar recordings used in CLEAR 2007 evaluations. In this paper, we improve the recognition rate of acoustic events using information from both audio and video modalities. First, the acoustic data are processed to obtain both a set of spectrotemporal features and the 3D localization coordinates of the sound source. Second, a number of features are extracted from video recordings by means of object detection, motion analysis, and multicamera person tracking to represent the visual counterpart of several acoustic events. A feature-level fusion strategy is used, and a parallel structure of binary HMM-based detectors is employed in our work. The experimental results show that information from both the microphone array and video cameras is useful to improve the detection rate of isolated as well as spontaneously generated acoustic events.},
}

% Workshop paper: score-level multimodal fusion (weighted mean, fuzzy integral).
@inproceedings{cCanton-Ferrer09,
  author    = {Canton-Ferrer, Cristian and Butko, T. and Segura, C. and Gir{\'o}-{i}-Nieto, Xavier and Nadeu, C. and Hernando, J. and Casas, J.},
  title     = {Audiovisual Event Detection Towards Scene Understanding},
  booktitle = {2009 IEEE Computer-Society Conference on Computer Vision and Pattern Recognition Workshops},
  year      = {2009},
  pages     = {840--847},
  doi       = {10.1109/CVPRW.2009.5204264},
  abstract  = {Acoustic events produced in meeting environments may contain useful information for perceptually aware interfaces and multimodal behavior analysis. In this paper, a system to detect and recognize these events from a multimodal perspective is presented combining information from multiple cameras and microphones. First, spectral and temporal features are extracted from a single audio channel and spatial localization is achieved by exploiting cross-correlation among microphone arrays. Second, several video cues obtained from multi-person tracking, motion analysis, face recognition, and object detection provide the visual counterpart of the acoustic events to be detected. A multimodal data fusion at score level is carried out using two approaches: weighted mean average and fuzzy integral. Finally, a multimodal database containing a rich variety of acoustic events has been recorded including manual annotations of the data. A set of metrics allow assessing the performance of the presented algorithms. This dataset is made publicly available for research purposes.},
}

% Conference paper (10th ISCA annual conference, 2009).
@inproceedings{cButko09,
  author    = {Butko, T. and Canton-Ferrer, Cristian and Segura, C. and Gir{\'o}-{i}-Nieto, Xavier and Nadeu, C. and Hernando, J. and Casas, J.},
  title     = {Improving Detection of Acoustic Events Using Audiovisual Data and Feature Level Fusion},
  booktitle = {10th Annual Conference of the International Speech Communication Association},
  year      = {2009},
  pages     = {1147--1150},
  isbn      = {978-1-61567-692-7},
  url       = {http://gps-tsc.upc.es/imatge/_Xgiro/research/publications/2009/interspeech.pdf},
}

% Conference paper (Interspeech 2008); month was exported as "09/2008".
@inproceedings{cButko08,
  author       = {Butko, T. and Temko, A. and Nadeu, C. and Canton-Ferrer, Cristian},
  title        = {Fusion of Audio and Video Modalities for Detection of Acoustic Events},
  booktitle    = {Interspeech 2008, 9th Annual Conference of the International Speech Communication Association},
  year         = {2008},
  month        = sep,
  pages        = {123--126},
  publisher    = {ISCA},
  organization = {ISCA},
  address      = {Brisbane, Australia},
  isbn         = {978-1-61567-378-0},
  url          = {http://www.isca-speech.org/archive/interspeech_2008/i08_0123.html},
}

% Workshop paper: fuzzy-integral fusion of video information for AED.
@inproceedings{cButko08a,
  author    = {Butko, T. and Temko, A. and Nadeu, C. and Canton-Ferrer, Cristian},
  title     = {Inclusion of video information for detection of acoustic events using the fuzzy integral},
  booktitle = {Machine Learning for Multimodal Interaction: 5th International Workshop},
  year      = {2008},
  pages     = {74--85},
  isbn      = {978-3540858522},
}