% Conference paper in the ICME 2024 proceedings (see booktitle/series below).
% The language and address values are given in English; the coined name in the
% title is brace-protected so sentence-casing styles keep its capitalisation.
@inproceedings{ea928b434f664029ab3e5de5af73cfd0,
  author    = {Ye, Jiabo and Tian, Junfeng and Yang, Xiaoshan and Zhang, Zhenru and Hu, Anwen and Yan, Ming and Zhang, Ji and He, Liang and Lin, Xin},
  title     = {{VG-Annotator}: Vision-Language Models as Query Annotators for Unsupervised Visual Grounding},
  booktitle = {2024 IEEE International Conference on Multimedia and Expo, ICME 2024},
  series    = {Proceedings - IEEE International Conference on Multimedia and Expo},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2024},
  doi       = {10.1109/ICME57554.2024.10688227},
  language  = {English},
  keywords  = {Instruction Tuning, Unsupervised Learning, Visual Grounding},
  abstract  = {Visual grounding focuses on localizing objects referred to by natural language queries. Existing fully and weakly supervised methods rely on a mass of language queries for training. However, collecting natural language queries corresponding to specific objects by annotators is expensive. To reduce the reliance on human-written queries, we propose a novel unsupervised visual grounding framework named VG-Annotator. Different from the existing unsupervised methods that rely on manually designed rules to link objects and language queries. The key idea of VG-Annotator lies in that vision-language pre-trained (VLP) generation models can be language query annotators. Thanks to the powerful multi-modal understanding ability implicitly learned from large-scale pre-training, we consider stimulating models to explicitly generate appropriate descriptions for specific objects in natural language. To this end, we explore a series of multi-modal instructions to indicate which object should be described. We also introduce a supervised fine-tuning process to teach the vision-language models to follow the instructions. Extensive experiments show that the proposed method obtains high-quality language queries. The visual grounding model trained with the generated queries outperforms state-of-the-art unsupervised methods on five widely used datasets.},
  note      = {Publisher Copyright: {\textcopyright} 2024 IEEE.; 2024 IEEE International Conference on Multimedia and Expo, ICME 2024 ; Conference date: 15-07-2024 Through 19-07-2024},
}