mirror of
				https://gitea.com/Lydanne/buildx.git
				synced 2025-11-04 18:13:42 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			269 lines
		
	
	
		
			8.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			269 lines
		
	
	
		
			8.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
package uniseg
 | 
						|
 | 
						|
import "unicode/utf8"
 | 
						|
 | 
						|
// States of the grapheme cluster parser. A state summarizes the code points
// seen so far; grTransitions defines how each state is entered and left.
const (
	// No specific context. Also used as the "any state" wildcard when looking
	// up grTransitions and as the fallback when no transition matches.
	grAny = iota
	// The last code point was a CR.
	grCR
	// The last code point was an LF or a Control character.
	grControlLF
	// The last code point was an L (GB6).
	grL
	// The last code point was an LV or a V (GB7).
	grLVV
	// The last code point was an LVT or a T (GB8).
	grLVTT
	// The last code point was a Prepend character (GB9b).
	grPrepend
	// An Extended_Pictographic code point, optionally followed by Extend code
	// points, was seen (GB11).
	grExtendedPictographic
	// A ZWJ followed the sequence described by grExtendedPictographic (GB11).
	grExtendedPictographicZWJ
	// An odd number of consecutive Regional Indicators was seen (GB12/GB13).
	grRIOdd
	// An even number of consecutive Regional Indicators was seen (GB12/GB13).
	grRIEven
)
 | 
						|
 | 
						|
// Breaking instructions produced by the grapheme cluster parser. They apply to
// the position between the last parsed code point and the next one.
const (
	// Do not break here; both code points belong to the same cluster.
	grNoBoundary = iota
	// Break here; the next code point starts a new cluster.
	grBoundary
)
 | 
						|
 | 
						|
// The grapheme cluster parser's state transitions. Maps (state, property) to
 | 
						|
// (new state, breaking instruction, rule number). The breaking instruction
 | 
						|
// always refers to the boundary between the last and next code point.
 | 
						|
//
 | 
						|
// This map is queried as follows:
 | 
						|
//
 | 
						|
//   1. Find specific state + specific property. Stop if found.
 | 
						|
//   2. Find specific state + any property.
 | 
						|
//   3. Find any state + specific property.
 | 
						|
//   4. If only (2) or (3) (but not both) was found, stop.
 | 
						|
//   5. If both (2) and (3) were found, use state and breaking instruction from
 | 
						|
//      the transition with the lower rule number, prefer (3) if rule numbers
 | 
						|
//      are equal. Stop.
 | 
						|
//   6. Assume grAny and grBoundary.
 | 
						|
var grTransitions = map[[2]int][3]int{
 | 
						|
	// GB5
 | 
						|
	{grAny, prCR}:      {grCR, grBoundary, 50},
 | 
						|
	{grAny, prLF}:      {grControlLF, grBoundary, 50},
 | 
						|
	{grAny, prControl}: {grControlLF, grBoundary, 50},
 | 
						|
 | 
						|
	// GB4
 | 
						|
	{grCR, prAny}:        {grAny, grBoundary, 40},
 | 
						|
	{grControlLF, prAny}: {grAny, grBoundary, 40},
 | 
						|
 | 
						|
	// GB3.
 | 
						|
	{grCR, prLF}: {grAny, grNoBoundary, 30},
 | 
						|
 | 
						|
	// GB6.
 | 
						|
	{grAny, prL}: {grL, grBoundary, 9990},
 | 
						|
	{grL, prL}:   {grL, grNoBoundary, 60},
 | 
						|
	{grL, prV}:   {grLVV, grNoBoundary, 60},
 | 
						|
	{grL, prLV}:  {grLVV, grNoBoundary, 60},
 | 
						|
	{grL, prLVT}: {grLVTT, grNoBoundary, 60},
 | 
						|
 | 
						|
	// GB7.
 | 
						|
	{grAny, prLV}: {grLVV, grBoundary, 9990},
 | 
						|
	{grAny, prV}:  {grLVV, grBoundary, 9990},
 | 
						|
	{grLVV, prV}:  {grLVV, grNoBoundary, 70},
 | 
						|
	{grLVV, prT}:  {grLVTT, grNoBoundary, 70},
 | 
						|
 | 
						|
	// GB8.
 | 
						|
	{grAny, prLVT}: {grLVTT, grBoundary, 9990},
 | 
						|
	{grAny, prT}:   {grLVTT, grBoundary, 9990},
 | 
						|
	{grLVTT, prT}:  {grLVTT, grNoBoundary, 80},
 | 
						|
 | 
						|
	// GB9.
 | 
						|
	{grAny, prExtend}: {grAny, grNoBoundary, 90},
 | 
						|
	{grAny, prZWJ}:    {grAny, grNoBoundary, 90},
 | 
						|
 | 
						|
	// GB9a.
 | 
						|
	{grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
 | 
						|
 | 
						|
	// GB9b.
 | 
						|
	{grAny, prPreprend}: {grPrepend, grBoundary, 9990},
 | 
						|
	{grPrepend, prAny}:  {grAny, grNoBoundary, 92},
 | 
						|
 | 
						|
	// GB11.
 | 
						|
	{grAny, prExtendedPictographic}:                     {grExtendedPictographic, grBoundary, 9990},
 | 
						|
	{grExtendedPictographic, prExtend}:                  {grExtendedPictographic, grNoBoundary, 110},
 | 
						|
	{grExtendedPictographic, prZWJ}:                     {grExtendedPictographicZWJ, grNoBoundary, 110},
 | 
						|
	{grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
 | 
						|
 | 
						|
	// GB12 / GB13.
 | 
						|
	{grAny, prRegionalIndicator}:    {grRIOdd, grBoundary, 9990},
 | 
						|
	{grRIOdd, prRegionalIndicator}:  {grRIEven, grNoBoundary, 120},
 | 
						|
	{grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
 | 
						|
}
 | 
						|
 | 
						|
// Graphemes is an iterator over the Unicode extended grapheme clusters of a
// string, as specified in Unicode Standard Annex #29. A grapheme cluster is a
// "user-perceived character" and frequently spans several code points. For
// example, the "woman kissing woman" emoji is made of 8 code points: woman +
// ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ + woman. The
// rules of Annex #29 decide how code points are grouped into such clusters.
type Graphemes struct {
	// The decoded code points of the string being iterated.
	codePoints []rune

	// The byte offset of each code point within the original string, followed
	// by one extra entry holding the byte length of the original string. Hence
	// len(indices) == len(codePoints) + 1.
	indices []int

	// The current grapheme cluster as the half-open range [start, end) of
	// indices into codePoints/indices. The range is empty (start == end) while
	// no cluster has been produced yet and again once the iteration is over.
	start int
	end   int

	// The index into codePoints of the next code point to be parsed.
	pos int

	// The current state of the code point parser (one of the gr* states).
	state int
}
 | 
						|
 | 
						|
// NewGraphemes returns a new grapheme cluster iterator.
 | 
						|
func NewGraphemes(s string) *Graphemes {
 | 
						|
	l := utf8.RuneCountInString(s)
 | 
						|
	codePoints := make([]rune, l)
 | 
						|
	indices := make([]int, l+1)
 | 
						|
	i := 0
 | 
						|
	for pos, r := range s {
 | 
						|
		codePoints[i] = r
 | 
						|
		indices[i] = pos
 | 
						|
		i++
 | 
						|
	}
 | 
						|
	indices[l] = len(s)
 | 
						|
	g := &Graphemes{
 | 
						|
		codePoints: codePoints,
 | 
						|
		indices:    indices,
 | 
						|
	}
 | 
						|
	g.Next() // Parse ahead.
 | 
						|
	return g
 | 
						|
}
 | 
						|
 | 
						|
// Next advances the iterator by one grapheme cluster and returns false if no
 | 
						|
// clusters are left. This function must be called before the first cluster is
 | 
						|
// accessed.
 | 
						|
func (g *Graphemes) Next() bool {
 | 
						|
	g.start = g.end
 | 
						|
 | 
						|
	// The state transition gives us a boundary instruction BEFORE the next code
 | 
						|
	// point so we always need to stay ahead by one code point.
 | 
						|
 | 
						|
	// Parse the next code point.
 | 
						|
	for g.pos <= len(g.codePoints) {
 | 
						|
		// GB2.
 | 
						|
		if g.pos == len(g.codePoints) {
 | 
						|
			g.end = g.pos
 | 
						|
			g.pos++
 | 
						|
			break
 | 
						|
		}
 | 
						|
 | 
						|
		// Determine the property of the next character.
 | 
						|
		nextProperty := property(g.codePoints[g.pos])
 | 
						|
		g.pos++
 | 
						|
 | 
						|
		// Find the applicable transition.
 | 
						|
		var boundary bool
 | 
						|
		transition, ok := grTransitions[[2]int{g.state, nextProperty}]
 | 
						|
		if ok {
 | 
						|
			// We have a specific transition. We'll use it.
 | 
						|
			g.state = transition[0]
 | 
						|
			boundary = transition[1] == grBoundary
 | 
						|
		} else {
 | 
						|
			// No specific transition found. Try the less specific ones.
 | 
						|
			transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}]
 | 
						|
			transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}]
 | 
						|
			if okAnyProp && okAnyState {
 | 
						|
				// Both apply. We'll use a mix (see comments for grTransitions).
 | 
						|
				g.state = transAnyState[0]
 | 
						|
				boundary = transAnyState[1] == grBoundary
 | 
						|
				if transAnyProp[2] < transAnyState[2] {
 | 
						|
					g.state = transAnyProp[0]
 | 
						|
					boundary = transAnyProp[1] == grBoundary
 | 
						|
				}
 | 
						|
			} else if okAnyProp {
 | 
						|
				// We only have a specific state.
 | 
						|
				g.state = transAnyProp[0]
 | 
						|
				boundary = transAnyProp[1] == grBoundary
 | 
						|
				// This branch will probably never be reached because okAnyState will
 | 
						|
				// always be true given the current transition map. But we keep it here
 | 
						|
				// for future modifications to the transition map where this may not be
 | 
						|
				// true anymore.
 | 
						|
			} else if okAnyState {
 | 
						|
				// We only have a specific property.
 | 
						|
				g.state = transAnyState[0]
 | 
						|
				boundary = transAnyState[1] == grBoundary
 | 
						|
			} else {
 | 
						|
				// No known transition. GB999: Any x Any.
 | 
						|
				g.state = grAny
 | 
						|
				boundary = true
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		// If we found a cluster boundary, let's stop here. The current cluster will
 | 
						|
		// be the one that just ended.
 | 
						|
		if g.pos-1 == 0 /* GB1 */ || boundary {
 | 
						|
			g.end = g.pos - 1
 | 
						|
			break
 | 
						|
		}
 | 
						|
	}
 | 
						|
 | 
						|
	return g.start != g.end
 | 
						|
}
 | 
						|
 | 
						|
// Runes returns a slice of runes (code points) which corresponds to the current
 | 
						|
// grapheme cluster. If the iterator is already past the end or Next() has not
 | 
						|
// yet been called, nil is returned.
 | 
						|
func (g *Graphemes) Runes() []rune {
 | 
						|
	if g.start == g.end {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	return g.codePoints[g.start:g.end]
 | 
						|
}
 | 
						|
 | 
						|
// Str returns a substring of the original string which corresponds to the
 | 
						|
// current grapheme cluster. If the iterator is already past the end or Next()
 | 
						|
// has not yet been called, an empty string is returned.
 | 
						|
func (g *Graphemes) Str() string {
 | 
						|
	if g.start == g.end {
 | 
						|
		return ""
 | 
						|
	}
 | 
						|
	return string(g.codePoints[g.start:g.end])
 | 
						|
}
 | 
						|
 | 
						|
// Bytes returns a byte slice which corresponds to the current grapheme cluster.
 | 
						|
// If the iterator is already past the end or Next() has not yet been called,
 | 
						|
// nil is returned.
 | 
						|
func (g *Graphemes) Bytes() []byte {
 | 
						|
	if g.start == g.end {
 | 
						|
		return nil
 | 
						|
	}
 | 
						|
	return []byte(string(g.codePoints[g.start:g.end]))
 | 
						|
}
 | 
						|
 | 
						|
// Positions returns the interval of the current grapheme cluster as byte
 | 
						|
// positions into the original string. The first returned value "from" indexes
 | 
						|
// the first byte and the second returned value "to" indexes the first byte that
 | 
						|
// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
 | 
						|
// the original string "str". If Next() has not yet been called, both values are
 | 
						|
// 0. If the iterator is already past the end, both values are 1.
 | 
						|
func (g *Graphemes) Positions() (int, int) {
 | 
						|
	return g.indices[g.start], g.indices[g.end]
 | 
						|
}
 | 
						|
 | 
						|
// Reset puts the iterator into its initial state such that the next call to
 | 
						|
// Next() sets it to the first grapheme cluster again.
 | 
						|
func (g *Graphemes) Reset() {
 | 
						|
	g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
 | 
						|
	g.Next() // Parse ahead again.
 | 
						|
}
 | 
						|
 | 
						|
// GraphemeClusterCount returns the number of user-perceived characters
 | 
						|
// (grapheme clusters) for the given string. To calculate this number, it
 | 
						|
// iterates through the string using the Graphemes iterator.
 | 
						|
func GraphemeClusterCount(s string) (n int) {
 | 
						|
	g := NewGraphemes(s)
 | 
						|
	for g.Next() {
 | 
						|
		n++
 | 
						|
	}
 | 
						|
	return
 | 
						|
}
 |