% Conference paper in the 2009 CVPR Workshops proceedings.
% All authors use the unambiguous "Last, First" form; the compound surname
% containing a lowercase "i" is braced so it is never split into von/Last parts.
@inproceedings{cCanton-Ferrer09,
  author    = {Canton-Ferrer, Cristian and Butko, T. and Segura, C. and {Gir{\'o}-i-Nieto}, Xavier and Nadeu, C. and Hernando, J. and Casas, J.},
  title     = {Audiovisual Event Detection Towards Scene Understanding},
  booktitle = {2009 {IEEE} Computer-Society Conference on Computer Vision and Pattern Recognition Workshops},
  year      = {2009},
  pages     = {840--847},
  doi       = {10.1109/CVPRW.2009.5204264},
  abstract  = {Acoustic events produced in meeting environments may contain useful information for perceptually aware interfaces and multimodal behavior analysis. In this paper, a system to detect and recognize these events from a multimodal perspective is presented combining information from multiple cameras and microphones. First, spectral and temporal features are extracted from a single audio channel and spatial localization is achieved by exploiting cross-correlation among microphone arrays. Second, several video cues obtained from multi-person tracking, motion analysis, face recognition, and object detection provide the visual counterpart of the acoustic events to be detected. A multimodal data fusion at score level is carried out using two approaches: weighted mean average and fuzzy integral. Finally, a multimodal database containing a rich variety of acoustic events has been recorded including manual annotations of the data. A set of metrics allow assessing the performance of the presented algorithms. This dataset is made publicly available for research purposes.},
}