@inproceedings{dfd921d07ead4da68a28a2c92cf76d0e,
  title     = {{Visual Graph} Reasoning Network},
  abstract  = {Visual question answering (VQA) is a fundamental and challenging cross-modal task. This task requires the model to fully understand the image's content and reason out the answer based on the question. Existing VQA models understand visual content mainly based on bottom-up or grid features. However, both types of vision features have some drawbacks. The discreteness and independence of bottom-up features prevent models from adequately performing relational reasoning. Image segmentation by grid features leads to the fragmentation of meaningful visual regions, limiting the cross-modal alignment capability of the model. Therefore, we proposed a more flexible method called Visual Graph. It can connect different patches according to semantic similarity and spatial relevance to model the potential relationships and cluster the adjacent homologous patches. Based on the Visual Graph, we designed a Visual Graph Reasoning Network for VQA. We evaluated our model on GQA and VQA-v2. The experimental results show that our models can achieve excellent performance between single models.},
  keywords  = {Cross-modal, Visual Graph, Visual Question Answering, Visual Reasoning},
  author    = {Li, Dingbang and Lin, Xin and Cai, Haibin and Chen, Wenzhou},
  note      = {Publisher Copyright: {\textcopyright} 2023 IEEE.; 48th IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2023; Conference date: 04-06-2023 Through 10-06-2023},
  year      = {2023},
  doi       = {10.1109/ICASSP49357.2023.10094852},
  language  = {English},
  series    = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  booktitle = {ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing, Proceedings},
  address   = {United States},
}