Final touches for combining character support

This commit is contained in:
Zachary Yedidia
2020-05-20 17:00:56 -04:00
parent 79c0ea17ad
commit ff0683d6d0
5 changed files with 128 additions and 18 deletions

View File

@@ -1,15 +1,11 @@
package util
import (
"unicode/utf8"
)
// LuaRuneAt is a helper function for lua plugins to return the rune
// at an index within a string
func LuaRuneAt(str string, runeidx int) string {
i := 0
for len(str) > 0 {
r, size := utf8.DecodeRuneInString(str)
r, _, size := DecodeCharacterInString(str)
str = str[size:]
@@ -26,7 +22,7 @@ func LuaRuneAt(str string, runeidx int) string {
func LuaGetLeadingWhitespace(s string) string {
ws := []byte{}
for len(s) > 0 {
r, size := utf8.DecodeRuneInString(s)
r, _, size := DecodeCharacterInString(s)
if r == ' ' || r == '\t' {
ws = append(ws, byte(r))
} else {
@@ -40,6 +36,6 @@ func LuaGetLeadingWhitespace(s string) string {
// LuaIsWordChar returns true if the first rune in a string is a word character.
// Only the leading rune of s is passed to IsWordChar; the remaining values
// returned by the decode call are discarded.
func LuaIsWordChar(s string) bool {
// NOTE(review): the next two lines look like the before/after pair of a
// diff hunk (utf8 decode replaced by DecodeCharacterInString) — confirm
// that only one of them is present in the actual file.
r, _ := utf8.DecodeRuneInString(s)
r, _, _ := DecodeCharacterInString(s)
return IsWordChar(r)
}

View File

@@ -5,6 +5,17 @@ import (
"unicode/utf8"
)
// Unicode is annoying. A "code point" (rune in Go-speak) may need up to
// 4 bytes to represent it. In general, a code point will represent a
// complete character, but this is not always the case. A character with
// accents may be made up of multiple code points (the code point for the
// original character, and additional code points for each accent/marking).
// The functions below are meant to help deal with these additional "combining"
// code points. In underlying operations (search, replace, etc.), micro will
// treat a character with combining code points as just the original code point.
// For rendering, micro will display the combining characters. It's not perfect,
// but it's pretty good.
// combining character range table
var combining = &unicode.RangeTable{
R16: []unicode.Range16{
@@ -35,6 +46,25 @@ func DecodeCharacter(b []byte) (rune, []rune, int) {
return r, combc, size
}
// DecodeCharacterInString decodes the first character of str.
// A character is a base rune followed by zero or more combining runes.
// It returns the base rune, the combining runes that directly follow it
// (nil when there are none), and the total number of bytes occupied by the
// base rune together with those combining runes.
func DecodeCharacterInString(str string) (rune, []rune, int) {
	base, total := utf8.DecodeRuneInString(str)

	var marks []rune
	for {
		// Peek at the rune that starts right after everything consumed so far.
		next, width := utf8.DecodeRuneInString(str[total:])
		if !unicode.In(next, combining) {
			break
		}
		// next belongs to the current character: record it and consume it.
		marks = append(marks, next)
		total += width
	}

	return base, marks, total
}
// CharacterCount returns the number of characters in a byte array
// Similar to utf8.RuneCount but for unicode characters
func CharacterCount(b []byte) int {

View File

@@ -13,7 +13,6 @@ import (
"strings"
"time"
"unicode"
"unicode/utf8"
"github.com/blang/semver"
runewidth "github.com/mattn/go-runewidth"
@@ -82,7 +81,7 @@ func SliceEndStr(str string, index int) string {
return str[totalSize:]
}
_, size := utf8.DecodeRuneInString(str[totalSize:])
_, _, size := DecodeCharacterInString(str[totalSize:])
totalSize += size
i++
}
@@ -119,7 +118,7 @@ func SliceStartStr(str string, index int) string {
return str[:totalSize]
}
_, size := utf8.DecodeRuneInString(str[totalSize:])
_, _, size := DecodeCharacterInString(str[totalSize:])
totalSize += size
i++
}