ISSN: 2534-5192 (electronic) – 2681-8566 (print)


◀ Ballier, Pacquetet & Arnold | Proceedings



ISBN: 978-2-9570549-0-9
e-ISBN: 978-2-9570549-1-6

Download (57.57 MB)

Vocalic and Consonantal Grapheme Classification through Spectral Decomposition
Patricia Thaine & Gerald Penn
Download (352.36 kB)

Abstract. We consider two related problems in this paper. Given an undeciphered alphabetic writing system or mono-alphabetic cipher, determine: (1) which of its letters correspond to vowels and which to consonants; and (2) whether the writing system is a vocalic alphabet or an abjad. We are able to show that a very simple spectral decomposition based on character co-occurrences provides nearly perfect performance with respect to answering both question types.

DOI: https://doi.org/10.36824/2018-graf-thai


Berg, K. (2012). “Identifying Graphematic Units”. In: Written Language & Literacy 15.1, pp. 26–45.

Daniels, Peter T. and William Bright (1996). The World’s Writing Systems. Oxford: Oxford University Press.

Gillogly, Jim (2002). Voynich Manuscript.

Goldsmith, J. and A. Xanthos (2009). “Learning Phonological Categories”. In: Language 85.1, pp. 4–38.

Guy, Jacques B.M. (1991a). “Statistical Properties of Two Folios of the Voynich Manuscript”. In: Cryptologia 15.3, pp. 207–218.

_____________ (1991b). “Vowel Identification: An Old (but Good) Algorithm”. In: Cryptologia 15.3, pp. 258–262.

Kim, Young-Bum and Benjamin Snyder (2013). “Unsupervised Consonant-Vowel Prediction over Hundreds of Languages”. In: Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics. Sofia, Bulgaria, pp. 1527–1536.

Moler, Cleve and Donald Morrison (1983). “Singular Value Analysis of Cryptograms”. In: American Mathematical Monthly 90, pp. 78–87.

Ohaver, Merle E. (1933). Cryptogram Solving. Columbus, OH: Etcetera Press.

Reddy, Sravana and Kevin Knight (2011). “What We Know about the Voynich Manuscript”. In: Proceedings of the 5th ACL-HLT Workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities. Portland, OR, pp. 78–86.

Strang, Gilbert (2005). Linear Algebra and Its Applications. 4th ed. Pacific Grove, CA: Brooks/Cole Publishing Company.

Stubbs, Michael and Isabel Barth (2003). “Using Recurrent Phrases as Text-Type Discriminators: A Quantitative Method and Some Findings”. In: Functions of Language 10.1, pp. 61–104.

Sukhotin, Boris V. [Сухотин, Борис В.] (1962). “Экспериментальное выделение классов букв с помощью электронной вычислительной машины [Experimental Selection of Letter Classes with the Help of Electronic Digital Machines]”. In: Проблемы структурной лингвистики [Problems of Structural Linguistics] 234, pp. 198–206.

Weiner, Edmund (2013). Early Modern English Pronunciation and Spelling. http://public.oed.com/aspects-of-english/english-in-time/early-modern-english-pronunciation-and-spelling/ [Accessed: 2014-07-29].

@misc{VoynichManuscript,
  author = {Gillogly, Jim},
  title  = {{Voynich Manuscript}},
  year   = {2002},
  note   = {\url{http://www.ic.unicamp.br/~stolfi/voynich/mirror/gillogly/voynich.orig} [Accessed: 2014-07-30]},
}

@article{berg12,
  author  = {Berg, K.},
  title   = {{Identifying Graphematic Units}},
  journal = {Written Language \& Literacy},
  volume  = {15},
  number  = {1},
  pages   = {26--45},
  year    = {2012},
}

@book{adler2001cross,
  author    = {Adler, Leonore Loeb and Gielen, Uwe Peter},
  title     = {{Cross-Cultural Topics in Psychology}},
  publisher = {Greenwood Publishing Group},
  address   = {Westport, CT},
  year      = {2001},
}

@misc{ager1omniglot,
  author = {Ager, Simon},
  title  = {{Writing direction index}},
  year   = {2015},
  note   = {\url{http://www.omniglot.com/writing/direction.htm} [Accessed: 2014-07-30]},
}

@book{anderson1993writing,
  author    = {Anderson, Lloyd B.},
  title     = {{The Writing System of La Mojarra and Associated Monuments}},
  volume    = {1},
  publisher = {Ecological Linguistics},
  year      = {1993},
}

@inproceedings{belkin2002using,
  author    = {Belkin, Mikhail and Goldsmith, John},
  title     = {{Using Eigenvectors of the Bigram Graph to Infer Morpheme Identity}},
  booktitle = {{Proceedings of the ACL-02 Workshop on Morphological and Phonological Learning---Volume 6}},
  pages     = {41--47},
  year      = {2002},
}

@book{chadwick1990decipherment,
  author    = {Chadwick, John},
  title     = {{The Decipherment of Linear B}},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  year      = {1990},
}

@misc{Currier1976voynich,
  author = {Currier, Prescott H.},
  title  = {{Papers on the Voynich Manuscript}},
  year   = {2013},
  note   = {\url{http://www.voynich.nu/extra/curr_pdfs.html} [Accessed: 2014-07-30]},
}

@book{daniels1996world,
  author    = {Daniels, Peter T. and Bright, William},
  title     = {{The World's Writing Systems}},
  publisher = {Oxford University Press},
  address   = {Oxford},
  year      = {1996},
}

@article{farmer2004collapse,
  author  = {Farmer, Steve and Sproat, Richard and Witzel, Michael},
  title   = {{The Collapse of the Indus-Script Thesis: The Myth of a Literate Harappan Civilization}},
  journal = {Electronic Journal of Vedic Studies},
  volume  = {11},
  number  = {2},
  pages   = {19--57},
  year    = {2004},
}

@misc{Gillogly2002Voynich,
  author = {Gillogly, Jim},
  title  = {{Voynich Manuscript}},
  year   = {2002},
  note   = {\url{http://www.ic.unicamp.br/~stolfi/voynich/mirror/gillogly/voynich.orig} [Accessed: 2014-07-30]},
}

@inproceedings{goldsmith2004signatures,
  author    = {Goldsmith, John and Hu, Yu},
  title     = {{From Signatures to Finite State Automata}},
  booktitle = {{Midwest Computational Linguistics Colloquium}},
  address   = {Bloomington, Indiana},
  year      = {2004},
}

@article{goldsmith2001unsupervised,
  author  = {Goldsmith, John},
  title   = {{Unsupervised Learning of the Morphology of a Natural Language}},
  journal = {Computational Linguistics},
  volume  = {27},
  number  = {2},
  pages   = {153--198},
  year    = {2001},
}

@article{guy1991statistical,
  author  = {Guy, Jacques B.M.},
  title   = {{Statistical Properties of Two Folios of the Voynich Manuscript}},
  journal = {Cryptologia},
  volume  = {15},
  number  = {3},
  pages   = {207--218},
  year    = {1991},
}

@article{guy1991vowel,
  author  = {Guy, Jacques B.M.},
  title   = {{Vowel Identification: An Old (but Good) Algorithm}},
  journal = {Cryptologia},
  volume  = {15},
  number  = {3},
  pages   = {258--262},
  year    = {1991},
}

@article{houston2003has,
  author  = {Houston, Stephen D. and Coe, Michael D.},
  title   = {{Has Isthmian Writing Been Deciphered?}},
  journal = {Mexicon},
  volume  = {25},
  number  = {6},
  pages   = {151--161},
  year    = {2003},
}

@inproceedings{hu2005sed,
  author    = {Hu, Yu and Matveeva, Irina and Goldsmith, John and Sprague, Colin},
  title     = {{The SED Heuristic for Morpheme Discovery: A Look at Swahili}},
  booktitle = {{Proceedings of the Workshop on Psychocomputational Models of Human Language Acquisition}},
  pages     = {28--35},
  year      = {2005},
}

@article{justeson1993decipherment,
  author  = {Justeson, John S. and Kaufman, Terrence},
  title   = {{A Decipherment of Epi-Olmec Hieroglyphic Writing}},
  journal = {Science},
  volume  = {259},
  number  = {5102},
  pages   = {1703--1711},
  year    = {1993},
}

@inproceedings{kim2013unsupervised,
  author    = {Kim, Young-Bum and Snyder, Benjamin},
  title     = {{Unsupervised Consonant-Vowel Prediction over Hundreds of Languages}},
  booktitle = {{Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics}},
  pages     = {1527--1536},
  address   = {Sofia, Bulgaria},
  year      = {2013},
}

@inproceedings{kim2012universal,
  author    = {Kim, Young-Bum and Snyder, Benjamin},
  title     = {{Universal Grapheme-to-Phoneme Prediction over Latin Alphabets}},
  booktitle = {{Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning}},
  pages     = {332--343},
  year      = {2012},
}

@inproceedings{knight2011copiale,
  author    = {Knight, Kevin and Megyesi, Beáta and Schaefer, Christiane},
  title     = {{The Copiale Cipher}},
  booktitle = {{Proceedings of the 4th Workshop on Building and Using Comparable Corpora: Comparable Corpora and the Web}},
  pages     = {2--9},
  year      = {2011},
}

@inproceedings{knight2006unsupervised,
  author    = {Knight, Kevin and Nair, Anish and Rathod, Nishit and Yamada, Kenji},
  title     = {{Unsupervised Analysis for Decipherment Problems}},
  booktitle = {{Proceedings of the COLING/ACL Conference}},
  pages     = {499--506},
  year      = {2006},
}

@inproceedings{knight1999computational,
  author    = {Knight, Kevin and Yamada, Kenji},
  title     = {{A Computational Approach to Deciphering Unknown Scripts}},
  booktitle = {{ACL Workshop on Unsupervised Learning in Natural Language Processing}},
  pages     = {37--44},
  year      = {1999},
}

@article{kober1946inflection,
  author  = {Kober, Alice E.},
  title   = {{Inflection in Linear Class B: 1-Declension}},
  journal = {American Journal of Archaeology},
  pages   = {268--276},
  year    = {1946},
}

@article{de2006discovering,
  author  = {Lin, Shou-de and Knight, Kevin},
  title   = {{Discovering the Linear Writing Order of a Two-Dimensional Ancient Hieroglyphic Script}},
  journal = {Artificial Intelligence},
  volume  = {170},
  number  = {4},
  pages   = {409--421},
  year    = {2006},
}

@incollection{macri1996rongorongo,
  author    = {Macri, Martha J.},
  title     = {{RongoRongo of Easter Island}},
  editor    = {Daniels, Peter T. and Bright, William},
  booktitle = {{The World's Writing Systems}},
  pages     = {183--188},
  publisher = {Oxford University Press},
  address   = {Oxford},
  year      = {1996},
}

@book{melchert2003luwians,
  author    = {Melchert, H. Craig},
  title     = {{The Luwians}},
  publisher = {Brill},
  address   = {Leiden},
  year      = {2003},
}

@article{moler1983singular,
  author  = {Moler, Cleve and Morrison, Donald},
  title   = {{Singular Value Analysis of Cryptograms}},
  journal = {American Mathematical Monthly},
  volume  = {90},
  pages   = {78--87},
  year    = {1983},
}

@inproceedings{mukherjee2009discovering,
  author    = {Mukherjee, Animesh and Choudhury, Monojit and Kannan, Ravi},
  title     = {{Discovering Global Patterns in Linguistic Networks through Spectral Analysis: A Case Study of the Consonant Inventories}},
  booktitle = {{Proceedings of the 12th Conference of the European Chapter of the Association for Computational Linguistics}},
  pages     = {585--593},
  year      = {2009},
}

@article{mukherjee2007modeling,
  author  = {Mukherjee, Animesh and Choudhury, Monojit and Basu, Anupam and Ganguly, Niloy},
  title   = {{Modeling the Co-occurrence Principles of the Consonant Inventories: A Complex Network Approach}},
  journal = {International Journal of Modern Physics C},
  volume  = {18},
  number  = {2},
  pages   = {281--295},
  year    = {2007},
}

@book{ohaver33,
  author    = {Ohaver, Merle E.},
  title     = {{Cryptogram Solving}},
  publisher = {Etcetera Press},
  address   = {Columbus, OH},
  year      = {1933},
}

@inproceedings{penn2006quantitative,
  author    = {Penn, Gerald and Choma, Travis},
  title     = {{Quantitative Methods for Classifying Writing Systems}},
  booktitle = {{Proceedings of the Human Language Technology Conference of the NAACL, Companion Volume: Short Papers}},
  pages     = {117--120},
  year      = {2006},
}

@article{rao2010probabilistic,
  author  = {Rao, Rajesh P.N.},
  title   = {{Probabilistic Analysis of an Ancient Undeciphered Script}},
  journal = {IEEE Computer},
  volume  = {43},
  number  = {4},
  pages   = {76--80},
  year    = {2010},
}

@article{rao2009markov,
  author  = {Rao, Rajesh P.N. and Yadav, Nisha and Vahia, Mayank N. and Joglekar, Hrishikesh and Adhikari, R. and Mahadevan, Iravatham},
  title   = {{A Markov Model of the Indus Script}},
  journal = {Proceedings of the National Academy of Sciences},
  volume  = {106},
  number  = {33},
  pages   = {13685--13690},
  year    = {2009},
}

@inproceedings{ravi2011bayesian,
  author    = {Ravi, Sujith and Knight, Kevin},
  title     = {{Bayesian Inference for Zodiac and Other Homophonic Ciphers}},
  booktitle = {{Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies---Volume 1}},
  pages     = {239--247},
  year      = {2011},
}

@inproceedings{reddy2011we,
  author    = {Reddy, Sravana and Knight, Kevin},
  title     = {{What We Know about the Voynich Manuscript}},
  booktitle = {{Proceedings of the 5th ACL-HLT Workshop on Language Technology for Cultural Heritage, Social Sciences, and Humanities}},
  pages     = {78--86},
  address   = {Portland, OR},
  year      = {2011},
}

@book{robinson2002lost,
  author    = {Robinson, Andrew},
  title     = {{Lost Languages: The Enigma of the World's Undeciphered Scripts}},
  publisher = {McGraw-Hill},
  address   = {New York},
  year      = {2002},
}

@book{saussure1966course,
  author     = {Saussure, Ferdinand de},
  editor     = {Bally, Charles and Sechehaye, Albert and Riedlinger, Albert},
  translator = {Baskin, Wade},
  title      = {{Course in General Linguistics}},
  publisher  = {McGraw-Hill},
  address    = {New York},
  year       = {1966},
}

@inproceedings{snyder2010statistical,
  author    = {Snyder, Benjamin and Barzilay, Regina and Knight, Kevin},
  title     = {{A Statistical Model for Lost Language Decipherment}},
  booktitle = {{Proceedings of the 48th Annual Meeting of the Association for Computational Linguistics}},
  pages     = {1048--1057},
  year      = {2010},
}

@book{sproat2000computational,
  author    = {Sproat, Richard William},
  title     = {{A Computational Theory of Writing Systems}},
  publisher = {Cambridge University Press},
  address   = {Cambridge},
  year      = {2000},
}

@article{sproat2010ancient,
  author  = {Sproat, Richard},
  title   = {{Ancient Symbols, Computational Linguistics, and the Reviewing Practices of the General Science Journals}},
  journal = {Computational Linguistics},
  volume  = {36},
  number  = {3},
  pages   = {585--594},
  year    = {2010},
}

@book{Strang,
  author    = {Strang, Gilbert},
  title     = {{Linear Algebra and Its Applications}},
  edition   = {4},
  publisher = {Brooks/Cole Publishing Company},
  address   = {Pacific Grove, CA},
  year      = {2005},
}

@article{stubbs-barth03,
  author  = {Stubbs, Michael and Barth, Isabel},
  title   = {{Using Recurrent Phrases as Text-Type Discriminators: A Quantitative Method and Some Findings}},
  journal = {Functions of Language},
  volume  = {10},
  number  = {1},
  pages   = {61--104},
  year    = {2003},
}

@article{sukhotin62,
  author          = {Sukhotin, B.V.},
  author_original = {Сухотин, Б.В.},
  author+an       = {1=ru-Cyrl},
  title           = {{Экспериментальное выделение классов букв с помощью электронной вычислительной
                     ма\-ши\-ны [Experimental Selection of Letter Classes with the Help of Electronic
                     Digital Machines]}},
  journal         = {Проблемы структурной лингвистики [Problems of Structural Linguistics]},
  volume          = {234},
  pages           = {198--206},
  year            = {1962},
}

@misc{OED,
  author = {Weiner, Edmund},
  title  = {{Early Modern English Pronunciation and Spelling}},
  note   = {\url{http://public.oed.com/aspects-of-english/english-in-time/early-modern-english-pronunciation-and-spelling/} [Accessed: 2014-07-29]},
  year   = {2013},
}

@inproceedings{wu2012corpora,
  author    = {Wu, Katherine and Solman, Jennifer and Linehan, Ruth and Sproat, Richard},
  title     = {{Corpora of Non-linguistic Symbol Systems}},
  booktitle = {{LSA Annual Meeting Extended Abstracts}},
  year      = {2012},
}

@article{goldsmith-xanthos09,
  author  = {Goldsmith, John and Xanthos, Aris},
  title   = {{Learning Phonological Categories}},
  journal = {Language},
  volume  = {85},
  number  = {1},
  pages   = {4--38},
  year    = {2009},
}

Patricia Thaine & Gerald Penn (2019), Vocalic and Consonantal Grapheme Classification through Spectral Decomposition, in Proceedings of Graphemics in the 21st Century, Brest 2018 (Yannis Haralambous, Ed.), Brest: Fluxus Editions, 367–386

@inproceedings{gla1-thai,
  author    = {Thaine, Patricia and Penn, Gerald},
  editor    = {Haralambous, Yannis},
  title     = {{Vocalic and Consonantal Grapheme Classification through Spectral Decomposition}},
  booktitle = {{Proceedings of Graphemics in the 21st Century, Brest 2018}},
  pages     = {367--386},
  publisher = {Fluxus Editions},
  address   = {Brest},
  year      = {2019},
  doi       = {10.36824/2018-graf-thai},
}