% NLPCC 2024 conference paper published in a Springer LNCS proceedings volume. The citation key is kept as exported so existing citations keep resolving.
@inproceedings{42a087e554774978922508b89fbd6d8b,
  title     = {{Bread}: A Hybrid Approach for Instruction Data Mining Through Balanced Retrieval and Dynamic Data Sampling},
  abstract  = {Recent advancements in Instruction Tuning (IT) have shown promise for aligning Large Language Models (LLMs) with users{\textquoteright} intentions, yet its efficacy is often compromised by dependence on high-quality datasets. Previous works have concentrated on the aggregation or production of huge IT datasets through human labor or significant cost-intensive LLM APIs, which lacks adequate mechanisms to guarantee the quality of the resulting data. Moreover, training on such amount of IT data is both time-consuming and costly. To address these issues, we present Bread (Instruction Mining through Balanced REtrieval And Dynamic Data Sampling), a novel approach designed to minimize the requisite volume of IT data. Bread uses a two-stage strategy combining balanced retrieval and dynamic sampling to focus on data diversity and quality, offering a cost-saving solution without relying on any specific LLMs. Experimental results suggest that Bread outperforms baselines and shows great flexibility across various IT datasets and LLMs, thereby marking a step forward in efficient Instruction Tuning. Our code is available at https://github.com/mihara-bot/Bread.},
  keywords  = {Data Selection, Instruction Tuning, Large Language Models},
  author    = {Zhuang, Xinlin and Mao, Xin and Jiang, Yuan Hao and Wu, Hongyi and Zhao, Shangqing and Cai, Li and Liu, Shu and Chen, Yang and Song, Yuxiang and Jia, Chenghao and Zhou, Yuhao and Lan, Man},
  note      = {Publisher Copyright: {\textcopyright} The Author(s), under exclusive license to Springer Nature Singapore Pte Ltd. 2025.; 13th CCF International Conference on Natural Language Processing and Chinese Computing, NLPCC 2024 ; Conference date: 01-11-2024 Through 03-11-2024},
  year      = {2025},
  doi       = {10.1007/978-981-97-9434-8_18},
  language  = {English},
  isbn      = {9789819794331},
  series    = {Lecture Notes in Computer Science (including subseries Lecture Notes in Artificial Intelligence and Lecture Notes in Bioinformatics)},
  publisher = {Springer Science and Business Media Deutschland GmbH},
  pages     = {229--240},
  editor    = {Wong, Derek F. and Wei, Zhongyu and Yang, Muyun},
  booktitle = {Natural Language Processing and Chinese Computing - 13th National CCF Conference, NLPCC 2024, Proceedings},
  address   = {Germany},
}