From ff0683d6d0c958232734f71d09dae1ad89dd50cb Mon Sep 17 00:00:00 2001
From: Zachary Yedidia <zyedidia@gmail.com>
Date: Wed, 20 May 2020 17:00:56 -0400
Subject: [PATCH] Final touches for combining character support

---
 internal/util/lua.go         | 10 ++---
 internal/util/unicode.go     | 30 +++++++++++++
 internal/util/util.go        |  5 +--
 pkg/highlight/highlighter.go | 15 +++----
 pkg/highlight/unicode.go     | 86 ++++++++++++++++++++++++++++++++++++
 5 files changed, 128 insertions(+), 18 deletions(-)
 create mode 100644 pkg/highlight/unicode.go

diff --git a/internal/util/lua.go b/internal/util/lua.go
index 8d1ce772..eda14981 100644
--- a/internal/util/lua.go
+++ b/internal/util/lua.go
@@ -1,15 +1,11 @@
 package util
 
-import (
-	"unicode/utf8"
-)
-
 // LuaRuneAt is a helper function for lua plugins to return the rune
 // at an index within a string
 func LuaRuneAt(str string, runeidx int) string {
 	i := 0
 	for len(str) > 0 {
-		r, size := utf8.DecodeRuneInString(str)
+		r, _, size := DecodeCharacterInString(str)
 
 		str = str[size:]
 
@@ -26,7 +22,7 @@ func LuaRuneAt(str string, runeidx int) string {
 func LuaGetLeadingWhitespace(s string) string {
 	ws := []byte{}
 	for len(s) > 0 {
-		r, size := utf8.DecodeRuneInString(s)
+		r, _, size := DecodeCharacterInString(s)
 		if r == ' ' || r == '\t' {
 			ws = append(ws, byte(r))
 		} else {
@@ -40,6 +36,6 @@ func LuaGetLeadingWhitespace(s string) string {
 
 // LuaIsWordChar returns true if the first rune in a string is a word character
 func LuaIsWordChar(s string) bool {
-	r, _ := utf8.DecodeRuneInString(s)
+	r, _, _ := DecodeCharacterInString(s)
 	return IsWordChar(r)
 }
diff --git a/internal/util/unicode.go b/internal/util/unicode.go
index 477bc691..d5a4d022 100644
--- a/internal/util/unicode.go
+++ b/internal/util/unicode.go
@@ -5,6 +5,17 @@ import (
 	"unicode/utf8"
 )
 
+// Unicode is annoying. A "code point" (rune in Go-speak) may need up to
+// 4 bytes to represent it. In general, a code point will represent a
+// complete character, but this is not always the case. A character with
+// accents may be made up of multiple code points (the code point for the
+// original character, and additional code points for each accent/marking).
+// The functions below are meant to help deal with these additional "combining"
+// code points. In underlying operations (search, replace, etc...), micro will
+// treat a character with combining code points as just the original code point.
+// For rendering, micro will display the combining characters. It's not perfect
+// but it's pretty good.
+
 // combining character range table
 var combining = &unicode.RangeTable{
 	R16: []unicode.Range16{
@@ -35,6 +46,25 @@ func DecodeCharacter(b []byte) (rune, []rune, int) {
 	return r, combc, size
 }
 
+// DecodeCharacterInString decodes the first character of str and returns its
+// base rune, any combining runes that directly follow it, and the bytes consumed.
+func DecodeCharacterInString(str string) (rune, []rune, int) {
+	r, size := utf8.DecodeRuneInString(str) // base rune; it is not checked against combining
+	str = str[size:]
+	c, s := utf8.DecodeRuneInString(str) // look ahead one rune
+
+	var combc []rune // stays nil when no combining rune follows
+	for unicode.In(c, combining) {
+		combc = append(combc, c)
+		size += s // size covers the base rune plus every combining rune absorbed
+
+		str = str[s:]
+		c, s = utf8.DecodeRuneInString(str)
+	}
+
+	return r, combc, size
+}
+
 // CharacterCount returns the number of characters in a byte array
 // Similar to utf8.RuneCount but for unicode characters
 func CharacterCount(b []byte) int {
diff --git a/internal/util/util.go b/internal/util/util.go
index 44a9a12f..a63a3518 100644
--- a/internal/util/util.go
+++ b/internal/util/util.go
@@ -13,7 +13,6 @@ import (
 	"strings"
 	"time"
 	"unicode"
-	"unicode/utf8"
 
 	"github.com/blang/semver"
 	runewidth "github.com/mattn/go-runewidth"
@@ -82,7 +81,7 @@ func SliceEndStr(str string, index int) string {
 			return str[totalSize:]
 		}
 
-		_, size := utf8.DecodeRuneInString(str[totalSize:])
+		_, _, size := DecodeCharacterInString(str[totalSize:])
 		totalSize += size
 		i++
 	}
@@ -119,7 +118,7 @@ func SliceStartStr(str string, index int) string {
 			return str[:totalSize]
 		}
 
-		_, size := utf8.DecodeRuneInString(str[totalSize:])
+		_, _, size := DecodeCharacterInString(str[totalSize:])
 		totalSize += size
 		i++
 	}
diff --git a/pkg/highlight/highlighter.go b/pkg/highlight/highlighter.go
index 560c41a6..440b1ff0 100644
--- a/pkg/highlight/highlighter.go
+++ b/pkg/highlight/highlighter.go
@@ -3,7 +3,6 @@ package highlight
 import (
 	"regexp"
 	"strings"
-	"unicode/utf8"
 )
 
 func sliceStart(slc []byte, index int) []byte {
@@ -15,7 +14,7 @@ func sliceStart(slc []byte, index int) []byte {
 			return slc[totalSize:]
 		}
 
-		_, size := utf8.DecodeRune(slc[totalSize:])
+		_, _, size := DecodeCharacter(slc[totalSize:])
 		totalSize += size
 		i++
 	}
@@ -32,7 +31,7 @@ func sliceEnd(slc []byte, index int) []byte {
 			return slc[:totalSize]
 		}
 
-		_, size := utf8.DecodeRune(slc[totalSize:])
+		_, _, size := DecodeCharacter(slc[totalSize:])
 		totalSize += size
 		i++
 	}
@@ -47,9 +46,9 @@ func runePos(p int, str []byte) int {
 		return 0
 	}
 	if p >= len(str) {
-		return utf8.RuneCount(str)
+		return CharacterCount(str)
 	}
-	return utf8.RuneCount(str[:p])
+	return CharacterCount(str[:p])
 }
 
 func combineLineMatch(src, dst LineMatch) LineMatch {
@@ -112,7 +111,7 @@ func findIndex(regex *regexp.Regexp, skip *regexp.Regexp, str []byte, canMatchSt
 	var strbytes []byte
 	if skip != nil {
 		strbytes = skip.ReplaceAllFunc(str, func(match []byte) []byte {
-			res := make([]byte, utf8.RuneCount(match))
+			res := make([]byte, CharacterCount(match))
 			return res
 		})
 	} else {
@@ -148,7 +147,7 @@ func findAllIndex(regex *regexp.Regexp, str []byte, canMatchStart, canMatchEnd b
 }
 
 func (h *Highlighter) highlightRegion(highlights LineMatch, start int, canMatchEnd bool, lineNum int, line []byte, curRegion *region, statesOnly bool) LineMatch {
-	lineLen := utf8.RuneCount(line)
+	lineLen := CharacterCount(line)
 	if start == 0 {
 		if !statesOnly {
 			if _, ok := highlights[0]; !ok {
@@ -236,7 +235,7 @@ func (h *Highlighter) highlightRegion(highlights LineMatch, start int, canMatchE
 }
 
 func (h *Highlighter) highlightEmptyRegion(highlights LineMatch, start int, canMatchEnd bool, lineNum int, line []byte, statesOnly bool) LineMatch {
-	lineLen := utf8.RuneCount(line)
+	lineLen := CharacterCount(line)
 	if lineLen == 0 {
 		if canMatchEnd {
 			h.lastRegion = nil
diff --git a/pkg/highlight/unicode.go b/pkg/highlight/unicode.go
new file mode 100644
index 00000000..eea53e53
--- /dev/null
+++ b/pkg/highlight/unicode.go
@@ -0,0 +1,86 @@
+package highlight
+
+import (
+	"unicode"
+	"unicode/utf8"
+)
+
+// combining lists the Unicode ranges whose runes are treated as combining characters
+var combining = &unicode.RangeTable{
+	R16: []unicode.Range16{
+		{0x0300, 0x036f, 1}, // combining diacritical marks
+		{0x1ab0, 0x1aff, 1}, // combining diacritical marks extended
+		{0x1dc0, 0x1dff, 1}, // combining diacritical marks supplement
+		{0x20d0, 0x20ff, 1}, // combining diacritical marks for symbols
+		{0xfe20, 0xfe2f, 1}, // combining half marks
+	},
+}
+
+// DecodeCharacter decodes the first character of b and returns its base
+// rune, any combining runes that directly follow it, and the bytes consumed.
+func DecodeCharacter(b []byte) (rune, []rune, int) {
+	r, size := utf8.DecodeRune(b) // base rune; it is not checked against combining
+	b = b[size:]
+	c, s := utf8.DecodeRune(b) // look ahead one rune
+
+	var combc []rune // stays nil when no combining rune follows
+	for unicode.In(c, combining) {
+		combc = append(combc, c)
+		size += s // size covers the base rune plus every combining rune absorbed
+
+		b = b[s:]
+		c, s = utf8.DecodeRune(b)
+	}
+
+	return r, combc, size
+}
+
+// DecodeCharacterInString decodes the first character of str and returns its
+// base rune, any combining runes that directly follow it, and the bytes consumed.
+func DecodeCharacterInString(str string) (rune, []rune, int) {
+	r, size := utf8.DecodeRuneInString(str) // base rune; it is not checked against combining
+	str = str[size:]
+	c, s := utf8.DecodeRuneInString(str) // look ahead one rune
+
+	var combc []rune // stays nil when no combining rune follows
+	for unicode.In(c, combining) {
+		combc = append(combc, c)
+		size += s // size covers the base rune plus every combining rune absorbed
+
+		str = str[s:]
+		c, s = utf8.DecodeRuneInString(str)
+	}
+
+	return r, combc, size
+}
+
+// CharacterCount returns the number of characters in a byte array.
+// Like utf8.RuneCount, except runes in the combining table are not counted.
+func CharacterCount(b []byte) int {
+	s := 0
+
+	for len(b) > 0 {
+		r, size := utf8.DecodeRune(b)
+		if !unicode.In(r, combining) { // combining runes add nothing to the count
+			s++
+		}
+
+		b = b[size:]
+	}
+
+	return s
+}
+
+// CharacterCountInString returns the number of characters in a string.
+// Like utf8.RuneCountInString, except runes in the combining table are not counted.
+func CharacterCountInString(str string) int {
+	s := 0
+
+	for _, r := range str {
+		if !unicode.In(r, combining) { // combining runes add nothing to the count
+			s++
+		}
+	}
+
+	return s
+}