% Conference paper in the SoCPaR 2013 proceedings (conference held 15-18 Dec 2013), pp. 66--71.
% The key is the exporting system's opaque identifier; it is kept unchanged so existing citations still resolve.
% No "series" field: the exported value merely repeated "booktitle", which makes
% standard styles print the conference name twice.
@inproceedings{719fb65890044d398b9914265e58dbcb,
  author    = {Li, Qiang and He, Liang and Lin, Xin},
  title     = {Categorical term frequency probability based feature selection for document categorization},
  booktitle = {2013 International Conference on Soft Computing and Pattern Recognition, {SoCPaR} 2013},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {66--71},
  year      = {2013},
  doi       = {10.1109/SOCPAR.2013.7054103},
  language  = {English},
  keywords  = {categorical distribution, document categorization, feature selection, term frequency, variance mean},
  abstract  = {Document categorization technology heavily relies on the categorical distribution of features. Those terms which occur unevenly in various categories have strong distinguishable information as to categorization. At first, we give the definition of CTFP (Categorical Term Frequency Probability), which will be used to accurately reflect the categorical characteristics of terms on each category. Then, the CTFP-VM (Variance-Mean based on CTFP) feature selection criterion is introduced to reveal the category distribution difference. After computing and ranking the variance mean based on CTFP distribution for each term, feature sets are obtained for document categorization. We perform the document categorization experiments on SVM classifiers with the well-known Reuters-21578 and 20 news-18828 corpuses as unbalanced and balanced corpus respectively. Experiments compare the novel methods with other conventional feature selection algorithms and the proposed method achieves the best feature set for document categorization. The experimental results also demonstrate that the proposed variance mean feature selection method base on CTFP not only has better F1-metric for document categorization but excellent corpus adaptability.},
  note      = {Publisher Copyright: {\textcopyright} 2013 IEEE.; 2013 International Conference on Soft Computing and Pattern Recognition, SoCPaR 2013 ; Conference date: 15-12-2013 Through 18-12-2013},
}