{"_id":"grapheme-splitter","_rev":"225112","name":"grapheme-splitter","description":"A JavaScript library that breaks strings into their individual user-perceived characters. It supports emojis!","dist-tags":{"latest":"1.0.4"},"maintainers":[{"name":"orling","email":"orlin.georgiev@gmail.com"}],"time":{"modified":"2021-06-03T15:42:17.000Z","created":"2017-07-20T20:37:11.585Z","1.0.4":"2018-09-11T20:25:43.410Z","1.0.3":"2018-08-23T04:12:08.802Z","1.0.2":"2017-07-20T20:50:54.145Z","1.0.1":"2017-07-20T20:37:11.585Z"},"users":{},"author":{"name":"Orlin Georgiev"},"repository":{"type":"git","url":"git+https://github.com/orling/grapheme-splitter.git"},"versions":{"1.0.4":{"name":"grapheme-splitter","version":"1.0.4","description":"A JavaScript library that breaks strings into their individual user-perceived characters. It supports emojis!","homepage":"https://github.com/orling/grapheme-splitter","author":{"name":"Orlin Georgiev"},"contributors":[{"name":"Lucas Tadeu Teixeira","email":"lucas@fastmail.nl","url":"https://lucas.is"}],"main":"index.js","license":"MIT","keywords":["utf-8","strings","emoji","split"],"scripts":{"test":"tape tests/grapheme_splitter_tests.js"},"repository":{"type":"git","url":"git+https://github.com/orling/grapheme-splitter.git"},"bugs":{"url":"https://github.com/orling/grapheme-splitter/issues"},"dependencies":{},"devDependencies":{"tape":"^4.6.3"},"gitHead":"0609d90dcbc93b42d8ceebb7aec0eda38b5d916d","_id":"grapheme-splitter@1.0.4","_npmVersion":"6.4.0","_nodeVersion":"6.11.0","_npmUser":{"name":"orling","email":"orlin.georgiev@gmail.com"},"dist":{"shasum":"9cf3a665c6247479896834af35cf1dbb4400767e","size":34119,"noattachment":false,"key":"/grapheme-splitter/-/grapheme-splitter-1.0.4.tgz","tarball":"http://registry.cnpm.dingdandao.com/grapheme-splitter/download/grapheme-splitter-1.0.4.tgz"},"maintainers":[{"name":"orling","email":"orlin.georgiev@gmail.com"}],"directories":{},"_npmOperationalInternal":{"host":"s3://npm-registry-packages","tmp":"tmp/grapheme-splitter_1.0.4_1536697543257_0.5835726576037328"},"_hasShrinkwrap":false,"publish_time":1536697543410,"_cnpm_publish_time":1536697543410},"1.0.3":{"name":"grapheme-splitter","version":"1.0.3","description":"A JavaScipt library that breaks strings into their individual user-perceived characters. It supports emojis!","homepage":"https://github.com/orling/grapheme-splitter","author":{"name":"Orlin Georgiev"},"contributors":[{"name":"Lucas Tadeu Teixeira","email":"lucas@fastmail.nl","url":"https://lucas.is"}],"main":"index.js","license":"MIT","keywords":["utf-8","strings","emoji","split"],"scripts":{"test":"tape tests/grapheme_splitter_tests.js"},"repository":{"type":"git","url":"git+https://github.com/orling/grapheme-splitter.git"},"bugs":{"url":"https://github.com/orling/grapheme-splitter/issues"},"dependencies":{},"devDependencies":{"tape":"^4.6.3"},"gitHead":"642ea8d2e6f26156e90ff8e8bcb00af0676b97ec","_id":"grapheme-splitter@1.0.3","_npmVersion":"6.4.0","_nodeVersion":"6.11.0","_npmUser":{"name":"orling","email":"orlin.georgiev@gmail.com"},"dist":{"shasum":"6ffffdd44311862ada843f9cd3e7d05eda9f411c","size":34122,"noattachment":false,"key":"/grapheme-splitter/-/grapheme-splitter-1.0.3.tgz","tarball":"http://registry.cnpm.dingdandao.com/grapheme-splitter/download/grapheme-splitter-1.0.3.tgz"},"maintainers":[{"name":"orling","email":"orlin.georgiev@gmail.com"}],"directories":{},"_npmOperationalInternal":{"host":"s3://npm-registry-packages","tmp":"tmp/grapheme-splitter_1.0.3_1534997528574_0.18999743495129384"},"_hasShrinkwrap":false,"publish_time":1534997528802,"_cnpm_publish_time":1534997528802},"1.0.2":{"name":"grapheme-splitter","version":"1.0.2","description":"A JavaScipt library that breaks strings into their individual user-perceived characters. It supports emojis!","homepage":"https://github.com/orling/grapheme-splitter","author":{"name":"Orlin Georgiev"},"contributors":[{"name":"Lucas Tadeu Teixeira","email":"lucas@fastmail.nl","url":"https://lucas.is"}],"main":"index.js","license":"MIT","keywords":["utf-8","strings","emoji","split"],"scripts":{"test":"tape tests/grapheme_splitter_tests.js"},"repository":{"type":"git","url":"git+https://github.com/orling/grapheme-splitter.git"},"bugs":{"url":"https://github.com/orling/grapheme-splitter/issues"},"dependencies":{},"devDependencies":{"tape":"^4.6.3"},"engines":{"npm":"~7.3.0"},"gitHead":"663679c01d7576470a3b5889a8ea166fb57f10a0","_id":"grapheme-splitter@1.0.2","_shasum":"639e9dc1bf065892c643de31daa27cf58b1068e2","_from":".","_npmVersion":"3.10.10","_nodeVersion":"6.11.0","_npmUser":{"name":"orling","email":"orlin.georgiev@gmail.com"},"dist":{"shasum":"639e9dc1bf065892c643de31daa27cf58b1068e2","size":33263,"noattachment":false,"key":"/grapheme-splitter/-/grapheme-splitter-1.0.2.tgz","tarball":"http://registry.cnpm.dingdandao.com/grapheme-splitter/download/grapheme-splitter-1.0.2.tgz"},"maintainers":[{"name":"orling","email":"orlin.georgiev@gmail.com"}],"_npmOperationalInternal":{"host":"s3://npm-registry-packages","tmp":"tmp/grapheme-splitter-1.0.2.tgz_1500583853152_0.11804638826288283"},"directories":{},"publish_time":1500583854145,"_cnpm_publish_time":1500583854145,"_hasShrinkwrap":false},"1.0.1":{"name":"grapheme-splitter","version":"1.0.1","description":"A JavaScipt library that breaks strings into their individual user-perceived characters. It supports emojis!","homepage":"https://github.com/orling/grapheme-splitter","author":{"name":"Orlin Georgiev"},"contributors":[{"name":"Lucas Tadeu Teixeira","email":"lucas@fastmail.nl","url":"https://lucas.is"}],"main":"index.js","license":"MIT","keywords":["utf-8","strings","emoji","split"],"scripts":{"test":"tape tests/grapheme_splitter_tests.js"},"repository":{"type":"git","url":"git+https://github.com/orling/grapheme-splitter.git"},"bugs":{"url":"https://github.com/orling/grapheme-splitter/issues"},"dependencies":{},"devDependencies":{"tape":"^4.6.3"},"engines":{"npm":"~7.3.0"},"gitHead":"27cbe6a2606b80f5322b735947ca4b5dd133d915","_id":"grapheme-splitter@1.0.1","_shasum":"445abeddfab3e4a250049978d38990e3c4bd1a7f","_from":".","_npmVersion":"3.10.10","_nodeVersion":"6.11.0","_npmUser":{"name":"orling","email":"orlin.georgiev@gmail.com"},"dist":{"shasum":"445abeddfab3e4a250049978d38990e3c4bd1a7f","size":33272,"noattachment":false,"key":"/grapheme-splitter/-/grapheme-splitter-1.0.1.tgz","tarball":"http://registry.cnpm.dingdandao.com/grapheme-splitter/download/grapheme-splitter-1.0.1.tgz"},"maintainers":[{"name":"orling","email":"orlin.georgiev@gmail.com"}],"_npmOperationalInternal":{"host":"s3://npm-registry-packages","tmp":"tmp/grapheme-splitter-1.0.1.tgz_1500583030522_0.4711209151428193"},"directories":{},"publish_time":1500583031585,"_hasShrinkwrap":false,"_cnpm_publish_time":1500583031585}},"readme":"# Background\r\n\r\nIn JavaScript there is not always a one-to-one relationship between string characters and what a user would call a separate visual \"letter\". Some symbols are represented by several characters. This can cause issues when splitting strings and inadvertently cutting a multi-char letter in half, or when you need the actual number of letters in a string.\r\n\r\nFor example, emoji characters like \"????\",\"????\",\"????\",\"????\" and \"????\" are represented by two JavaScript characters each (high surrogate and low surrogate). That is, \r\n\r\n```javascript\r\n\"????\".length == 2\r\n```\r\nThe combined emoji are even longer:\r\n```javascript\r\n\"????️‍????\".length == 6\r\n```\r\n\r\nWhat's more, some languages often include combining marks - characters that are used to modify the letters before them. Common examples are the German letter ü and the Spanish letter ñ. Sometimes they can be represented alternatively both as a single character and as a letter + combining mark, with both forms equally valid:\r\n    \r\n```javascript\r\nvar two = \"ñ\"; // unnormalized two-char n+◌̃  , i.e. \"\\u006E\\u0303\";\r\nvar one = \"ñ\"; // normalized single-char, i.e. \"\\u00F1\"\r\nconsole.log(one!=two); // prints 'true'\r\n```\r\n\r\nUnicode normalization, as performed by the popular punycode.js library or ECMAScript 6's String.normalize, can **sometimes** fix those differences and turn two-char sequences into single characters. But it is **not** enough in all cases. Some languages like Hindi make extensive use of combining marks on their letters, that have no dedicated single-codepoint Unicode sequences, due to the sheer number of possible combinations.\r\nFor example, the Hindi word \"अनुच्छेद\" is comprised of 5 letters and 3 combining marks:\r\n\r\nअ + न + ु + च + ् + छ + े + द\r\n\r\nwhich is in fact just 5 user-perceived letters:\r\n\r\nअ + नु + च् + छे + द\r\n\r\nand which Unicode normalization would not combine properly.\r\nThere are also the unusual letter+combining mark combinations which have no dedicated Unicode codepoint. The string Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘ obviously has 5 separate letters, but is in fact comprised of 58 JavaScript characters, most of which are combining marks.\r\n\r\nEnter the grapheme-splitter.js library. It can be used to properly split JavaScript strings into what a human user would call separate letters (or \"extended grapheme clusters\" in Unicode terminology), no matter what their internal representation is. It is an implementation on the [Default Grapheme Cluster Boundary](http://unicode.org/reports/tr29/#Default_Grapheme_Cluster_Table) of [UAX #29](http://www.unicode.org/reports/tr29/). \r\n\r\n# Installation\r\n\r\nYou can use the index.js file directly as-is. Or you you can install `grapheme-splitter` to your project using the NPM command below:\r\n\r\n```\r\n$ npm install --save grapheme-splitter\r\n```\r\n\r\n# Tests\r\n\r\nTo run the tests on `grapheme-splitter`, use the command below:\r\n\r\n```\r\n$ npm test\r\n```\r\n\r\n# Usage\r\n\r\nJust initialize and use:\r\n\r\n```javascript\r\nvar splitter = new GraphemeSplitter();\r\n\r\n// split the string to an array of grapheme clusters (one string each)\r\nvar graphemes = splitter.splitGraphemes(string);\r\n\r\n// iterate the string to an iterable iterator of grapheme clusters (one string each)\r\nvar graphemes = splitter.iterateGraphemes(string);\r\n\r\n// or do this if you just need their number\r\nvar graphemeCount = splitter.countGraphemes(string);\r\n```\r\n\r\n# Examples\r\n\r\n```javascript\r\nvar splitter = new GraphemeSplitter();\r\n\r\n// plain latin alphabet - nothing spectacular\r\nsplitter.splitGraphemes(\"abcd\"); // returns [\"a\", \"b\", \"c\", \"d\"]\r\n\r\n// two-char emojis and six-char combined emoji\r\nsplitter.splitGraphemes(\"????????????????????????️‍????\"); // returns [\"????\",\"????\",\"????\",\"????\",\"????\",\"????️‍????\"]\r\n\r\n// diacritics as combining marks, 10 JavaScript chars\r\nsplitter.splitGraphemes(\"Ĺo͂řȩm̅\"); // returns [\"Ĺ\",\"o͂\",\"ř\",\"ȩ\",\"m̅\"]\r\n\r\n// individual Korean characters (Jamo), 4 JavaScript chars\r\nsplitter.splitGraphemes(\"뎌쉐\"); // returns [\"뎌\",\"쉐\"]\r\n\r\n// Hindi text with combining marks, 8 JavaScript chars\r\nsplitter.splitGraphemes(\"अनुच्छेद\"); // returns [\"अ\",\"नु\",\"च्\",\"छे\",\"द\"]\r\n\r\n// demonic multiple combining marks, 75 JavaScript chars\r\nsplitter.splitGraphemes(\"Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞\"); // returns [\"Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍\",\"A̴̵̜̰͔ͫ͗͢\",\"L̠ͨͧͩ͘\",\"G̴̻͈͍͔̹̑͗̎̅͛́\",\"Ǫ̵̹̻̝̳͂̌̌͘\",\"!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞\"]\r\n```\r\n\r\n# TypeScript\r\n\r\nGrapheme splitter includes TypeScript declarations.\r\n\r\n```typescript\r\nimport GraphemeSplitter = require('grapheme-splitter')\r\n\r\nconst splitter = new GraphemeSplitter()\r\n\r\nconst split: string[] = splitter.splitGraphemes('Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞')\r\n```\r\n\r\n# Acknowledgements\r\n\r\nThis library is heavily influenced by Devon Govett's excellent grapheme-breaker CoffeeScript library at https://github.com/devongovett/grapheme-breaker with an emphasis on ease of integration and pure JavaScript implementation.\r\n\r\n\r\n\r\n","_attachments":{},"homepage":"https://github.com/orling/grapheme-splitter","bugs":{"url":"https://github.com/orling/grapheme-splitter/issues"},"license":"MIT"}