% Conference paper from the CGC/SCA 2012 proceedings: extracting user
% information and user-generated content from Web forums without explicit rules.
% NOTE(review): the conference dates in the note field are written in ISO form,
% read from a day-first export (1-3 November 2012) -- confirm against the
% proceedings front matter. The language field uses the ASCII identifier
% "english" so that classic BibTeX and language-aware styles both accept it.
@inproceedings{f79fbd56daf6483aa3476433901534e4,
  author    = {Zhang, Jingwei and Jin, Cheqing and Lin, Yuming and Gong, Xueqing},
  title     = {Forum data extraction without explicit rules},
  booktitle = {Proceedings - 2nd International Conference on Cloud and Green Computing and 2nd International Conference on Social Computing and Its Applications, CGC/SCA 2012},
  year      = {2012},
  pages     = {460--465},
  doi       = {10.1109/CGC.2012.72},
  isbn      = {9780769548647},
  language  = {english},
  keywords  = {forum data extraction, user-generated content},
  abstract  = {Web forum data contributed by millions of users are the mixture of well-formed user information and free-format user-generated content. Though easy to read for users, forum data are difficult to be analyzed by computer systems because of various surrounding HTML tags. It is challenging to extract forum data from a large number of Web sites automatically since these sites may have different styles. In this paper, we propose an approach to extract user information and user-generated content from multiple forum sites by using both structural and textual characteristics of forums. A structural induction process and a term combination computation process are introduced to assure extraction accuracy and automation. Extensive experiments on real-life data sets show the effectiveness of our proposed method.},
  note      = {2nd International Conference on Cloud and Green Computing, CGC 2012, Held Jointly with the 2nd International Conference on Social Computing and Its Applications, SCA 2012; Conference dates: 2012-11-01 through 2012-11-03},
}