Final touches for combining character support

2026-03-24 17:50:15 +09:00 · 2020-05-20 17:00:56 -04:00
parent 79c0ea17ad
commit ff0683d6d0
5 changed files with 128 additions and 18 deletions
--- a/internal/util/lua.go
+++ b/internal/util/lua.go
@@ -1,15 +1,11 @@
 package util

-import (
-	"unicode/utf8"
-)
-
 // LuaRuneAt is a helper function for lua plugins to return the rune
 // at an index within a string
 func LuaRuneAt(str string, runeidx int) string {
 	i := 0
 	for len(str) > 0 {
-		r, size := utf8.DecodeRuneInString(str)
+		r, _, size := DecodeCharacterInString(str)

 		str = str[size:]

@@ -26,7 +22,7 @@ func LuaRuneAt(str string, runeidx int) string {
 func LuaGetLeadingWhitespace(s string) string {
 	ws := []byte{}
 	for len(s) > 0 {
-		r, size := utf8.DecodeRuneInString(s)
+		r, _, size := DecodeCharacterInString(s)
 		if r == ' ' || r == '\t' {
 			ws = append(ws, byte(r))
 		} else {
@@ -40,6 +36,6 @@ func LuaGetLeadingWhitespace(s string) string {

 // LuaIsWordChar returns true if the first rune in a string is a word character
 func LuaIsWordChar(s string) bool {
-	r, _ := utf8.DecodeRuneInString(s)
+	r, _, _ := DecodeCharacterInString(s) // only the leading rune is tested; combining runes and size are discarded
 	return IsWordChar(r)
 }
--- a/internal/util/unicode.go
+++ b/internal/util/unicode.go
@@ -5,6 +5,17 @@ import (
 	"unicode/utf8"
 )

+// Unicode is annoying. A "code point" (rune in Go-speak) may need up to
+// 4 bytes to represent it. In general, a code point will represent a
+// complete character, but this is not always the case. A character with
+// accents may be made up of multiple code points (the code point for the
+// original character, and additional code points for each accent/marking).
+// The functions below are meant to help deal with these additional "combining"
+// code points. In underlying operations (search, replace, etc.), micro will
+// treat a character with combining code points as just the original code point.
+// For rendering, micro will display the combining characters. It's not perfect,
+// but it's pretty good.
+
 // combining character range table
 var combining = &unicode.RangeTable{
 	R16: []unicode.Range16{
@@ -35,6 +46,25 @@ func DecodeCharacter(b []byte) (rune, []rune, int) {
 	return r, combc, size
 }

+// DecodeCharacterInString decodes the first character of str, where a character is a
+// rune along with any accompanying combining runes; it returns (rune, combining runes, total size).
+func DecodeCharacterInString(str string) (rune, []rune, int) {
+	r, size := utf8.DecodeRuneInString(str)
+	str = str[size:]
+	c, s := utf8.DecodeRuneInString(str) // look ahead at the rune following r
+
+	var combc []rune // stays nil when no combining rune follows r
+	for unicode.In(c, combining) {
+		combc = append(combc, c)
+		size += s // size accumulates the leading rune plus every combining rune consumed
+
+		str = str[s:]
+		c, s = utf8.DecodeRuneInString(str)
+	}
+
+	return r, combc, size
+}
+
 // CharacterCount returns the number of characters in a byte array
 // Similar to utf8.RuneCount but for unicode characters
 func CharacterCount(b []byte) int {
--- a/internal/util/util.go
+++ b/internal/util/util.go
@@ -13,7 +13,6 @@ import (
 	"strings"
 	"time"
 	"unicode"
-	"unicode/utf8"

 	"github.com/blang/semver"
 	runewidth "github.com/mattn/go-runewidth"
@@ -82,7 +81,7 @@ func SliceEndStr(str string, index int) string {
 			return str[totalSize:]
 		}

-		_, size := utf8.DecodeRuneInString(str[totalSize:])
+		_, _, size := DecodeCharacterInString(str[totalSize:])
 		totalSize += size
 		i++
 	}
@@ -119,7 +118,7 @@ func SliceStartStr(str string, index int) string {
 			return str[:totalSize]
 		}

-		_, size := utf8.DecodeRuneInString(str[totalSize:])
+		_, _, size := DecodeCharacterInString(str[totalSize:])
 		totalSize += size
 		i++
 	}