% Conference paper: DS2, presented at ICDE 2021 (IEEE Computer Society), pp. 1865--1870.
% The citation key is the exporter-generated identifier; it is kept unchanged so existing \cite commands still resolve.
% Localized (non-ASCII) values for "language" and "address" were translated to English so the entry
% is safe under classic 8-bit BibTeX and renders consistently in English bibliographies.
% NOTE(review): "address" holds a country as exported; BibTeX expects the publisher's city -- confirm and refine.
@inproceedings{9a75218ed6704db0b18e3510d19b4203,
  author    = {He, Zeyu and Li, Zhifang and Peng, Xiaoshuang and Weng, Chuliang},
  title     = {{DS2}: Handling data skew using data stealings over high-speed networks},
  booktitle = {Proceedings - 2021 IEEE 37th International Conference on Data Engineering, ICDE 2021},
  series    = {Proceedings - International Conference on Data Engineering},
  pages     = {1865--1870},
  publisher = {IEEE Computer Society},
  address   = {United States},
  year      = {2021},
  month     = apr,
  doi       = {10.1109/ICDE51399.2021.00168},
  language  = {English},
  keywords  = {Data Skew, Data Stealing, OLAP, RDMA},
  abstract  = {Distributed in-memory computing systems have dramatic performance improvement over traditional disk-based systems, which makes them widely used in large-scale data processing applications. Unfortunately, uneven and unpredictable data distributions caused by data skew have a significant impact on the performance. In Spark, when data skew happens, some tasks will process much more data than other tasks and become the performance bottleneck. The traditional approaches to handling data skew are based on sampling and repartitioning, which incur additional overhead. In this paper, we divide data skew in distributed data processing systems into intra-node and inter-node skew. Based on data stealing, we proposed DS2 to handle both intra-node and inter-node data skew. It aims to improve the performance under data skew, without involving additional overhead. DS2 first balances the skewed data distribution in the local and then handles the inter-node skew by RDMA during execution. It achieves up to 2.96$\times$ speedup on the aggregation operator and 2.81$\times$ speedup on the join operator.},
  note      = {Publisher Copyright: {\textcopyright} 2021 IEEE.; 37th IEEE International Conference on Data Engineering, ICDE 2021 ; Conference date: 19-04-2021 Through 22-04-2021},
}