% Conference paper from the IJCNN 2023 proceedings (see booktitle and note).
% The language and address values were exported as Chinese text; they are
% stored here as their plain-ASCII English equivalents so the entry also
% works with classic 8-bit BibTeX and pdfLaTeX without CJK font support.
% NOTE(review): address holds a country, not the publisher's city, and the
% conference metadata still lives in the free-form note field; confirm the
% publisher location before relying on it in a style that prints address.
@inproceedings{30102874684f42acb31c74ec1957aa9f,
  author    = {Liu, Jiawei and Lin, Xin and He, Liang},
  title     = {Image Alone Are Not Enough: A General Semantic-Augmented Transformer-Based Framework for Image Captioning},
  booktitle = {{IJCNN} 2023 - International Joint Conference on Neural Networks, Proceedings},
  series    = {Proceedings of the International Joint Conference on Neural Networks},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2023},
  doi       = {10.1109/IJCNN54540.2023.10191656},
  language  = {English},
  keywords  = {Part-Of-Speech, Transformer, image captioning, visual-linguistic fusion},
  abstract  = {Image captioning has long been widely regarded as a modal transformation task from visual to linguistic modality. Most current research focuses on the information transformation between single modalities dominated by visual features, while less attention is paid to the interaction between visual features and linguistic features. This rigid single-modal conversion method is prone to information confusion and loss during the conversion process, making it difficult for the model to generate accurate and detailed captions. In this paper, we propose a general Semantic-Augmented Transformer-Based (SAT) framework to facilitate smoother transformation between the two modalities. In the encoding stage, we use the fine-grained description of each region to fuse with the corresponding image features to make the image feature representation closer to the text feature representation. In the decoding stage, the caption's part-of-speech information is used as prior knowledge to constrain the model to pay more attention to the details in the image rather than only to the prominent entities for fine-grained captions. We extensively evaluate our framework on various state-of-the-art transformer-based models. Experiments demonstrate that these models have superior performance on the MS-COCO dataset under our framework.},
  note      = {Publisher Copyright: {\textcopyright} 2023 IEEE.; 2023 International Joint Conference on Neural Networks, IJCNN 2023 ; Conference date: 18-06-2023 Through 23-06-2023},
}