% Conference paper (ICTAI 2023): optimal-transport-based intra-/inter-modal alignment for visual dialog.
% The language and address values are stored in English (ASCII) so classic 8-bit BibTeX can process the entry.
@inproceedings{d8d86e5649e44885b86b1db321469fb7,
  author    = {Zheng, Renjie and Chen, Qin and Zhou, Jie and Tian, Junfeng and He, Liang},
  title     = {Modeling Intra- and Inter-Modal Alignment with Optimal Transport for Visual Dialog},
  booktitle = {Proceedings - 2023 {IEEE} 35th International Conference on Tools with Artificial Intelligence, {ICTAI} 2023},
  series    = {Proceedings - International Conference on Tools with Artificial Intelligence, {ICTAI}},
  pages     = {805--812},
  year      = {2023},
  publisher = {IEEE Computer Society},
  address   = {United States},
  doi       = {10.1109/ICTAI59109.2023.00123},
  language  = {English},
  keywords  = {alignment, optimal transport, visual dialog},
  abstract  = {Visual dialog aims to address a sequence of questions by effectively reasoning over both the dialog history and image content. While existing methods primarily focus on devising various attention mechanisms to capture interactions between different modalities, explicit signals encouraging semantic alignment in the visual dialog are seldom utilized. In this paper, we present a novel approach that leverages Optimal Transport to provide explicit and interpretable training signals to guide intra- and inter-modal alignment for the text and image in the visual dialog. Specifically, our approach consists of two kinds of alignment modules, Word-Word Alignment (WWA) and Region-Word Alignment (RWA). The WWA module learns latent relationships between a given question and a dialog history to align different concepts or pronouns that represent the same entity. As for the RWA module, it models the internal structures of text and images with graphs and performs graph matching for region-word alignment. We perform experiments on the benchmark dataset Visdial v1.0, and the experimental results show that our proposed approach achieves new state-of-the-art performance with respect to most metrics.},
  note      = {Publisher Copyright: {\textcopyright} 2023 IEEE.; 35th IEEE International Conference on Tools with Artificial Intelligence, ICTAI 2023 ; Conference date: 06-11-2023 Through 08-11-2023},
}