% Conference paper in the APWeb 2015 proceedings (Lecture Notes in Computer Science), pages 509--521.
@inproceedings{78460a106b464533a0b50b48edd2e14d,
title = "Random-based algorithm for efficient entity matching",
abstract = "Most of the state-of-the-art MapReduce-based entity matching methods inherit traditional Entity Resolution techniques on centralized system and focus on data blocking strategies for structured entities in order to solve the load balancing problem occurred in distributed environment. In this paper, we propose a MapReduce-based entity matching framework for Entity Matching on semi-structured and unstructured data. Each entity is represented by a high dimensional vector generated from description data. In order to reduce network transmission, we produce lower dimensional bit-vectors called signatures for those entity vectors based on Locality Sensitive Hash (LSH) function. Our LSH is required for promising cosine similarity. A series of random algorithms are designed to ensure the performance for entity matching. Moreover, our design contains a solution for reducing redundant computation by one round of additional MapReduce job. Experiments show that our approach has a huge advantages on both processing speed and accuracy compared to the other methods.",
author = "Chao, Pingfu and Gao, Zhu and Li, Yuming and Fang, Junhua and Zhang, Rong and Zhou, Aoying",
note = "Publisher Copyright: {\textcopyright} Springer International Publishing Switzerland 2015.; 17th Asia-Pacific Web Conference, APWeb 2015 ; Conference date: 18-09-2015 Through 20-09-2015",
year = "2015",
doi = "10.1007/978-3-319-25255-1_42",
language = "English",
isbn = "9783319252544",
series = "Lecture Notes in Computer Science",
publisher = "Springer Verlag",
pages = "509--521",
editor = "Cheng, Reynold and Cui, Bin and Zhang, Zhenjie and Cai, Ruichu and Xu, Jia",
booktitle = "Web Technologies and Applications - 17th Asia-Pacific Web Conference, APWeb 2015, Proceedings",
address = "Germany",
}