From ff0683d6d0c958232734f71d09dae1ad89dd50cb Mon Sep 17 00:00:00 2001 From: Zachary Yedidia Date: Wed, 20 May 2020 17:00:56 -0400 Subject: [PATCH] Final touches for combining character support --- internal/util/lua.go | 10 ++--- internal/util/unicode.go | 30 +++++++++++++ internal/util/util.go | 5 +-- pkg/highlight/highlighter.go | 15 +++---- pkg/highlight/unicode.go | 86 ++++++++++++++++++++++++++++++++++++ 5 files changed, 128 insertions(+), 18 deletions(-) create mode 100644 pkg/highlight/unicode.go diff --git a/internal/util/lua.go b/internal/util/lua.go index 8d1ce772..eda14981 100644 --- a/internal/util/lua.go +++ b/internal/util/lua.go @@ -1,15 +1,11 @@ package util -import ( - "unicode/utf8" -) - // LuaRuneAt is a helper function for lua plugins to return the rune // at an index within a string func LuaRuneAt(str string, runeidx int) string { i := 0 for len(str) > 0 { - r, size := utf8.DecodeRuneInString(str) + r, _, size := DecodeCharacterInString(str) str = str[size:] @@ -26,7 +22,7 @@ func LuaRuneAt(str string, runeidx int) string { func LuaGetLeadingWhitespace(s string) string { ws := []byte{} for len(s) > 0 { - r, size := utf8.DecodeRuneInString(s) + r, _, size := DecodeCharacterInString(s) if r == ' ' || r == '\t' { ws = append(ws, byte(r)) } else { @@ -40,6 +36,6 @@ func LuaGetLeadingWhitespace(s string) string { // LuaIsWordChar returns true if the first rune in a string is a word character func LuaIsWordChar(s string) bool { - r, _ := utf8.DecodeRuneInString(s) + r, _, _ := DecodeCharacterInString(s) return IsWordChar(r) } diff --git a/internal/util/unicode.go b/internal/util/unicode.go index 477bc691..d5a4d022 100644 --- a/internal/util/unicode.go +++ b/internal/util/unicode.go @@ -5,6 +5,17 @@ import ( "unicode/utf8" ) +// Unicode is annoying. A "code point" (rune in Go-speak) may need up to +// 4 bytes to represent it. In general, a code point will represent a +// complete character, but this is not always the case. 
A character with +// accents may be made up of multiple code points (the code point for the +// original character, and additional code points for each accent/marking). +// The functions below are meant to help deal with these additional "combining" +// code points. In underlying operations (search, replace, etc...), micro will +// treat a character with combining code points as just the original code point. +// For rendering, micro will display the combining characters. It's not perfect +// but it's pretty good. + // combining character range table var combining = &unicode.RangeTable{ R16: []unicode.Range16{ @@ -35,6 +46,25 @@ func DecodeCharacter(b []byte) (rune, []rune, int) { return r, combc, size } +// DecodeCharacterInString returns the next character from a string +// A character is a rune along with any accompanying combining runes +func DecodeCharacterInString(str string) (rune, []rune, int) { + r, size := utf8.DecodeRuneInString(str) + str = str[size:] + c, s := utf8.DecodeRuneInString(str) + + var combc []rune + for unicode.In(c, combining) { + combc = append(combc, c) + size += s + + str = str[s:] + c, s = utf8.DecodeRuneInString(str) + } + + return r, combc, size +} + // CharacterCount returns the number of characters in a byte array // Similar to utf8.RuneCount but for unicode characters func CharacterCount(b []byte) int { diff --git a/internal/util/util.go b/internal/util/util.go index 44a9a12f..a63a3518 100644 --- a/internal/util/util.go +++ b/internal/util/util.go @@ -13,7 +13,6 @@ import ( "strings" "time" "unicode" - "unicode/utf8" "github.com/blang/semver" runewidth "github.com/mattn/go-runewidth" @@ -82,7 +81,7 @@ func SliceEndStr(str string, index int) string { return str[totalSize:] } - _, size := utf8.DecodeRuneInString(str[totalSize:]) + _, _, size := DecodeCharacterInString(str[totalSize:]) totalSize += size i++ } @@ -119,7 +118,7 @@ func SliceStartStr(str string, index int) string { return str[:totalSize] } - _, size := 
utf8.DecodeRuneInString(str[totalSize:]) + _, _, size := DecodeCharacterInString(str[totalSize:]) totalSize += size i++ } diff --git a/pkg/highlight/highlighter.go b/pkg/highlight/highlighter.go index 560c41a6..440b1ff0 100644 --- a/pkg/highlight/highlighter.go +++ b/pkg/highlight/highlighter.go @@ -3,7 +3,6 @@ package highlight import ( "regexp" "strings" - "unicode/utf8" ) func sliceStart(slc []byte, index int) []byte { @@ -15,7 +14,7 @@ func sliceStart(slc []byte, index int) []byte { return slc[totalSize:] } - _, size := utf8.DecodeRune(slc[totalSize:]) + _, _, size := DecodeCharacter(slc[totalSize:]) totalSize += size i++ } @@ -32,7 +31,7 @@ func sliceEnd(slc []byte, index int) []byte { return slc[:totalSize] } - _, size := utf8.DecodeRune(slc[totalSize:]) + _, _, size := DecodeCharacter(slc[totalSize:]) totalSize += size i++ } @@ -47,9 +46,9 @@ func runePos(p int, str []byte) int { return 0 } if p >= len(str) { - return utf8.RuneCount(str) + return CharacterCount(str) } - return utf8.RuneCount(str[:p]) + return CharacterCount(str[:p]) } func combineLineMatch(src, dst LineMatch) LineMatch { @@ -112,7 +111,7 @@ func findIndex(regex *regexp.Regexp, skip *regexp.Regexp, str []byte, canMatchSt var strbytes []byte if skip != nil { strbytes = skip.ReplaceAllFunc(str, func(match []byte) []byte { - res := make([]byte, utf8.RuneCount(match)) + res := make([]byte, CharacterCount(match)) return res }) } else { @@ -148,7 +147,7 @@ func findAllIndex(regex *regexp.Regexp, str []byte, canMatchStart, canMatchEnd b } func (h *Highlighter) highlightRegion(highlights LineMatch, start int, canMatchEnd bool, lineNum int, line []byte, curRegion *region, statesOnly bool) LineMatch { - lineLen := utf8.RuneCount(line) + lineLen := CharacterCount(line) if start == 0 { if !statesOnly { if _, ok := highlights[0]; !ok { @@ -236,7 +235,7 @@ func (h *Highlighter) highlightRegion(highlights LineMatch, start int, canMatchE } func (h *Highlighter) highlightEmptyRegion(highlights LineMatch, 
start int, canMatchEnd bool, lineNum int, line []byte, statesOnly bool) LineMatch { - lineLen := utf8.RuneCount(line) + lineLen := CharacterCount(line) if lineLen == 0 { if canMatchEnd { h.lastRegion = nil diff --git a/pkg/highlight/unicode.go b/pkg/highlight/unicode.go new file mode 100644 index 00000000..eea53e53 --- /dev/null +++ b/pkg/highlight/unicode.go @@ -0,0 +1,86 @@ +package highlight + +import ( + "unicode" + "unicode/utf8" +) + +// combining character range table +var combining = &unicode.RangeTable{ + R16: []unicode.Range16{ + {0x0300, 0x036f, 1}, // combining diacritical marks + {0x1ab0, 0x1aff, 1}, // combining diacritical marks extended + {0x1dc0, 0x1dff, 1}, // combining diacritical marks supplement + {0x20d0, 0x20ff, 1}, // combining diacritical marks for symbols + {0xfe20, 0xfe2f, 1}, // combining half marks + }, +} + +// DecodeCharacter returns the next character from an array of bytes +// A character is a rune along with any accompanying combining runes +func DecodeCharacter(b []byte) (rune, []rune, int) { + r, size := utf8.DecodeRune(b) + b = b[size:] + c, s := utf8.DecodeRune(b) + + var combc []rune + for unicode.In(c, combining) { + combc = append(combc, c) + size += s + + b = b[s:] + c, s = utf8.DecodeRune(b) + } + + return r, combc, size +} + +// DecodeCharacterInString returns the next character from a string +// A character is a rune along with any accompanying combining runes +func DecodeCharacterInString(str string) (rune, []rune, int) { + r, size := utf8.DecodeRuneInString(str) + str = str[size:] + c, s := utf8.DecodeRuneInString(str) + + var combc []rune + for unicode.In(c, combining) { + combc = append(combc, c) + size += s + + str = str[s:] + c, s = utf8.DecodeRuneInString(str) + } + + return r, combc, size +} + +// CharacterCount returns the number of characters in a byte array +// Similar to utf8.RuneCount but for unicode characters +func CharacterCount(b []byte) int { + s := 0 + + for len(b) > 0 { + r, size := utf8.DecodeRune(b) 
+ if !unicode.In(r, combining) { + s++ + } + + b = b[size:] + } + + return s +} + +// CharacterCountInString returns the number of characters in a string +// Similar to utf8.RuneCountInString but for unicode characters +func CharacterCountInString(str string) int { + s := 0 + + for _, r := range str { + if !unicode.In(r, combining) { + s++ + } + } + + return s +}