@inproceedings{0dac36aedcf647b896d5baea98069688,
title = "FashionKLIP: Enhancing E-Commerce Image-Text Retrieval with Fashion Multi-Modal Conceptual Knowledge Graph",
abstract = "Image-text retrieval is a core task in the multimodal domain, which arises a lot of attention from both research and industry communities. Recently, the booming of visionlanguage pre-trained (VLP) models has greatly enhanced the performance of cross-modal retrieval. However, the fine-grained interactions between objects from different modalities are far from well-established. This issue becomes more severe in the e-commerce domain, which lacks sufficient training data and fine-grained cross-modal knowledge. To alleviate the problem, this paper proposes a novel e-commerce knowledge-enhanced VLP model FashionKLIP. We first automatically establish a multi-modal conceptual knowledge graph from large-scale e-commerce image-text data, and then inject the prior knowledge into the VLP model to align across modalities at the conceptual level. The experiments conducted on a public benchmark dataset demonstrate that FashionKLIP effectively enhances the performance of e-commerce image-text retrieval upon stateof-the-art VLP models by a large margin. The application of the method in real industrial scenarios also proves the feasibility and efficiency of FashionKLIP.",
author = "Xiaodan Wang and Chengyu Wang and Lei Li and Zhixu Li and Ben Chen and Linbo Jin and Jun Huang and Yanghua Xiao and Ming Gao",
note = "Publisher Copyright: {\textcopyright} ACL 2023.All rights reserved.; 61st Annual Meeting of the Association for Computational Linguistics, ACL 2023 ; Conference date: 09-07-2023 Through 14-07-2023",
year = "2023",
doi = "10.18653/v1/2023.acl-industry.16",
language = "英语",
series = "Proceedings of the Annual Meeting of the Association for Computational Linguistics",
publisher = "Association for Computational Linguistics (ACL)",
pages = "149--158",
booktitle = "Industry Track",
address = "澳大利亚",
}