% PhD thesis, Kitami Institute of Technology (北見工業大学), OAI record 02000332.
% Abstract moved from `note` to `abstract`: standard styles print `note` verbatim
% in the bibliography, which is not wanted for a full abstract.
@phdthesis{oai:kitami-it.repo.nii.ac.jp:02000332,
  author   = {Eronen, Juuso Kalevi Kristian},
  title    = {素性密度及びクロスリンガルゼロショット転移学習による多言語のネットいじめ自動検出の改良に関する研究},
  school   = {北見工業大学},
  year     = {2022},
  month    = sep,
  abstract = {In this thesis, I study two different methods for improving multilingual automatic cyberbullying detection. First, I study the effectiveness of Feature Density (FD) using different linguistically-backed feature preprocessing methods in order to estimate dataset complexity, which in turn is used to comparatively estimate the potential performance of machine learning (ML) classifiers prior to any training. I hypothesize that estimating dataset complexity allows for the reduction of the number of required experiment iterations, making it possible to optimize the resource-intensive training of ML models which is becoming a serious issue due to the increases in available dataset sizes and the ever rising popularity of models based on Deep Neural Networks (DNN). The problem of constantly increasing needs for more powerful computational resources is also affecting the environment due to the alarmingly growing amount of CO$_2$ emissions caused by training of large-scale ML models. I use cyberbullying datasets collected for multiple languages, namely English, Japanese and Polish. The difference in linguistic complexity of datasets allows me to additionally discuss the efficacy of linguistically-backed word preprocessing. Second, I study the selection of transfer languages for automatic abusive language detection. I demonstrate the effectiveness of cross-lingual transfer learning for zero-shot abusive language detection. This way it is possible to use existing data from higher-resource languages to build better detection systems for languages lacking data. The datasets are from eight different languages from three language families. I measure the distance between the languages using several language similarity measures, especially by quantifying the World Atlas of Language Structures. I show that there is a correlation between linguistic similarity and classifier performance, making it possible to choose an optimal transfer language for zero-shot abusive language detection. Next, I demonstrate that this method is also generally applicable to multiple Natural Language Processing tasks, specifically sentiment analysis, named entity recognition and dependency parsing. I show that there is also a correlation between linguistic similarity and zero-shot cross-lingual transfer performance for these tasks, allowing me to select an ideal transfer language in order to aid with the problem of dealing with languages that do not currently have a sufficient amount of data. Lastly, I show that the World Atlas of Language Structures can be quantified into an effective linguistic similarity method.},
}