% Conference paper in the DASFAA 2023 proceedings (Springer LNCS series).
% NOTE(review): "address" was exported as a localized country name and is
% translated here; BibTeX convention is the publisher's city -- confirm the
% publisher/city against the volume's title page before relying on it.
@inproceedings{c9c40fe0ecde4db798366c059cb9b95d,
  author    = {Niu, Yuean and Xu, Zhizhen and Xu, Chen and Wang, Jiaqiang},
  title     = {Accelerating Recommendation Inference via {GPU} Streams},
  booktitle = {Database Systems for Advanced Applications - 28th International Conference, {DASFAA} 2023, Proceedings},
  editor    = {Wang, Xin and Sapino, Maria Luisa and Han, Wook-Shin and El Abbadi, Amr and Dobbie, Gill and Feng, Zhiyong and Shao, Yingxiao and Yin, Hongzhi},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  address   = {Germany},
  year      = {2023},
  pages     = {546--561},
  doi       = {10.1007/978-3-031-30637-2_36},
  isbn      = {9783031306365},
  language  = {English},
  keywords  = {CUDA Stream, Inference Service, Operator Assignment, Parallelization, Recommendation model},
  abstract  = {Deep Learning based recommendation is common in various recommendation services and widely used in the industry. To predict user preferences accurately, state-of-the-art recommendation models contain an increasing number of features and various methods of feature interaction, which both lengthen inference time. We observe that the embedding lookup and feature interaction of different features in a recommendation model is independent of each other. However, current deep learning frameworks (e.g., TensorFlow, PyTorch) are oblivious to this independence, and schedule the operators to execute sequentially in a single computational stream. In this work, we exploit multiple CUDA streams to parallelize the execution of embedding lookup and feature interaction. To further overlap the processing of different sparse features and minimize synchronization overhead, we propose a topology-aware operator assignment algorithm to schedule operators to computational streams. We implement a prototype, namely StreamRec, based on TensorFlow XLA. Our experiments show that StreamRec is able to reduce latency by up to 27.8\% and increase throughput by up to 52\% in comparison to the original TensorFlow XLA.},
  note      = {Publisher Copyright: {\textcopyright} 2023, The Author(s), under exclusive license to Springer Nature Switzerland AG.; 28th International Conference on Database Systems for Advanced Applications, DASFAA 2023 ; Conference date: 17-04-2023 Through 20-04-2023},
}