@inproceedings{eb95196db5e04fc38b29e2aa37c768e6,
title = "CLMAE: A Liter and Faster Masked Autoencoders",
abstract = "Self-supervised pre-training has been widely utilized on various vision tasks and gains a great success. However, pre-training on big datasets suffers a lengthy training schedule and large memory consumption. To alleviate these problems, we propose a light-weighted model called Convolutional Lite Masked AutoEncoder (CLMAE). To improve the convergence speed of the transformer during pre-training. We introduce two-stage convolutional progressive patch embedding and an additional convolution in the feed-forward layer, which promote better correlation among patches in the spatial dimensions. The most important design is called cross-layer parameter sharing mechanism, which reduces model parameters with little impact on the performance. We find that sharing parameters among layers not only improves the parameter efficiency, but also acts as a form of regularization that stabilizes the training. Experimental results on downstream tasks show the effectiveness and generalization ability of CLMAE, which accelerates the training process significantly (by 5× for ViT-B and MAE) and reduces a quarter of parameters (by 25M fewer for ViT-B), with a competitive accuracy (82.8\% on ImageNet-1K).",
keywords = "Pre-training, masked autoencoders",
author = "Yiran Song and Lizhuang Ma",
note = "Publisher Copyright: {\textcopyright} 2023 IEEE.; 48th IEEE International Conference on Acoustics, Speech and Signal Processing, ICASSP 2023 ; Conference date: 04-06-2023 Through 10-06-2023",
year = "2023",
doi = "10.1109/ICASSP49357.2023.10096059",
language = "英语",
series = "ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings",
publisher = "Institute of Electrical and Electronics Engineers Inc.",
booktitle = "ICASSP 2023 - 2023 IEEE International Conference on Acoustics, Speech and Signal Processing, Proceedings",
address = "美国",
}