% Conference paper (ISPP 2023, published in the Proceedings of SPIE series).
% Names are stored in "Last, First" form; acronyms are brace-protected so
% bibliography styles that recase fields leave them intact.
@inproceedings{b9aa6de9d24b4dada23dcbbd2e6cfdf4,
  author    = {Wang, Jian and Cai, Haibin},
  title     = {Query Graph Attention for Video Relation Detection},
  booktitle = {International Conference on Image, Signal Processing, and Pattern Recognition, {ISPP} 2023},
  editor    = {Batista, Paulo and Pachori, Ram Bilas},
  series    = {Proceedings of {SPIE} - The International Society for Optical Engineering},
  publisher = {SPIE},
  address   = {United States},
  year      = {2023},
  doi       = {10.1117/12.2681229},
  language  = {English},
  keywords  = {Graph convolutional network, Transformer, Video relation detection},
  abstract  = {As a bridge to connect vision and language, visual relations between objects, visual relation provide a more comprehensive visual content understanding beyond objects. Most previous works adopt the track-to-detect framework for video visual relation detection (VidVRD), which cannot capture long-term spatio-temporal contexts in different stages and also suffers from inefficiency. In this work, we propose a query-based method for video visual relation detection. Our model exploits graph structure to autoregressively generate relation graphs with spatio-temporal contexts and uses an attentional graph convolutional network to fuse the contexts. Experiments on benchmark datasets ImageNet-VidVRD demonstrate the accuracy of our method.},
  note      = {Publisher Copyright: {\textcopyright} 2023 SPIE.; 2023 International Conference on Image, Signal Processing, and Pattern Recognition, ISPP 2023 ; Conference date: 24-02-2023 Through 26-02-2023},
}