% Conference paper in the COLING 2016 proceedings (Technical Papers volume).
% Typed as inproceedings so that styles print the booktitle field; the former
% series value only repeated the proceedings title and is merged into booktitle.
% The copyright field is not read by standard styles; it is kept for reference.
@inproceedings{cc68324d0b08472d88f7ce3bbd1fcbfb,
  author    = {Dang, Anh and Moh'd, Abidalrahman and Islam, Aminul and Minghim, Rosane and Smit, Michael and Milios, Evangelos},
  title     = {{Reddit} temporal {N-gram} corpus and its applications on paraphrase and semantic similarity in social media using a topic-based latent semantic analysis},
  booktitle = {{COLING} 2016 -- 26th International Conference on Computational Linguistics, Proceedings of {COLING} 2016: Technical Papers},
  pages     = {3553--3564},
  publisher = {Association for Computational Linguistics, ACL Anthology},
  year      = {2016},
  isbn      = {9784879747020},
  language  = {English},
  note      = {Conference date: 11--16 December 2016},
  copyright = {Publisher Copyright: {\textcopyright} 1963--2018 ACL.},
  abstract  = {This paper introduces a new large-scale n-gram corpus that is created specifically from social media text. Two distinguishing characteristics of this corpus are its monthly temporal attribute and that it is created from 1.65 billion comments of user-generated text in Reddit. The usefulness of this corpus is exemplified and evaluated by a novel Topic-based Latent Semantic Analysis (TLSA) algorithm. The experimental results show that unsupervised TLSA outperforms all the state-of-the-art unsupervised and semi-supervised methods in SEMEVAL 2015: paraphrase and semantic similarity in Twitter tasks.},
}