mirror of
				https://gitea.com/Lydanne/buildx.git
				synced 2025-11-04 10:03:42 +08:00 
			
		
		
		
	
		
			
				
	
	
		
			133 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Ragel
		
	
	
	
	
	
			
		
		
	
	
			133 lines
		
	
	
		
			3.8 KiB
		
	
	
	
		
			Ragel
		
	
	
	
	
	
package textseg
 | 
						|
 | 
						|
import (
 | 
						|
    "errors"
 | 
						|
    "unicode/utf8"
 | 
						|
)
 | 
						|
 | 
						|
// Generated from grapheme_clusters.rl. DO NOT EDIT
 | 
						|
%%{
 | 
						|
  # (except you are actually in grapheme_clusters.rl here, so edit away!)
 | 
						|
 | 
						|
  # Name the state machine defined by this specification.
  machine graphclust;
 | 
						|
  # Ask Ragel to write the machine's static data at this point in the
  # generated Go file (package level, outside any function).
  write data;
 | 
						|
}%%
 | 
						|
 | 
						|
// Error is a package-level error value carrying the message
// "invalid UTF8 text".
// NOTE(review): nothing visible in this file returns it — confirm where it
// is used before relying on it.
var Error = errors.New("invalid UTF8 text")
 | 
						|
 | 
						|
// ScanGraphemeClusters is a split function for bufio.Scanner that splits
 | 
						|
// on grapheme cluster boundaries.
 | 
						|
func ScanGraphemeClusters(data []byte, atEOF bool) (int, []byte, error) {
 | 
						|
    if len(data) == 0 {
 | 
						|
        return 0, nil, nil
 | 
						|
    }
 | 
						|
 | 
						|
    // Ragel state
 | 
						|
	cs := 0 // Current State
 | 
						|
	p := 0  // "Pointer" into data
 | 
						|
	pe := len(data) // End-of-data "pointer"
 | 
						|
    ts := 0
 | 
						|
    te := 0
 | 
						|
    act := 0
 | 
						|
    eof := pe
 | 
						|
 | 
						|
    // Make Go compiler happy
 | 
						|
    _ = ts
 | 
						|
    _ = te
 | 
						|
    _ = act
 | 
						|
    _ = eof
 | 
						|
 | 
						|
    startPos := 0
 | 
						|
    endPos := 0
 | 
						|
 | 
						|
    %%{
 | 
						|
        include GraphemeCluster "grapheme_clusters_table.rl";
 | 
						|
 | 
						|
        action start {
 | 
						|
            startPos = p
 | 
						|
        }
 | 
						|
 | 
						|
        action end {
 | 
						|
            endPos = p
 | 
						|
        }
 | 
						|
 | 
						|
        action emit {
 | 
						|
            return endPos+1, data[startPos:endPos+1], nil
 | 
						|
        }
 | 
						|
 | 
						|
        ZWJGlue = ZWJ (Glue_After_Zwj | E_Base_GAZ Extend* E_Modifier?)?;
 | 
						|
        AnyExtender = Extend | ZWJGlue | SpacingMark;
 | 
						|
        Extension = AnyExtender*;
 | 
						|
        ReplacementChar = (0xEF 0xBF 0xBD);
 | 
						|
 | 
						|
        CRLFSeq = CR LF;
 | 
						|
        ControlSeq = Control | ReplacementChar;
 | 
						|
        HangulSeq = (
 | 
						|
            L+ (((LV? V+ | LVT) T*)?|LV?) |
 | 
						|
            LV V* T* |
 | 
						|
            V+ T* |
 | 
						|
            LVT T* |
 | 
						|
            T+
 | 
						|
        ) Extension;
 | 
						|
        EmojiSeq = (E_Base | E_Base_GAZ) Extend* E_Modifier? Extension;
 | 
						|
        ZWJSeq = ZWJGlue Extension;
 | 
						|
        EmojiFlagSeq = Regional_Indicator Regional_Indicator? Extension;
 | 
						|
 | 
						|
        UTF8Cont = 0x80 .. 0xBF;
 | 
						|
        AnyUTF8 = (
 | 
						|
            0x00..0x7F |
 | 
						|
            0xC0..0xDF . UTF8Cont |
 | 
						|
            0xE0..0xEF . UTF8Cont . UTF8Cont |
 | 
						|
            0xF0..0xF7 . UTF8Cont . UTF8Cont . UTF8Cont
 | 
						|
        );
 | 
						|
 | 
						|
        # OtherSeq is any character that isn't at the start of one of the extended sequences above, followed by extension
 | 
						|
        OtherSeq = (AnyUTF8 - (CR|LF|Control|ReplacementChar|L|LV|V|LVT|T|E_Base|E_Base_GAZ|ZWJ|Regional_Indicator|Prepend)) Extension;
 | 
						|
 | 
						|
        # PrependSeq is prepend followed by any of the other patterns above, except control characters which explicitly break
 | 
						|
        PrependSeq = Prepend+ (HangulSeq|EmojiSeq|ZWJSeq|EmojiFlagSeq|OtherSeq)?;
 | 
						|
 | 
						|
        CRLFTok = CRLFSeq >start @end;
 | 
						|
        ControlTok = ControlSeq >start @end;
 | 
						|
        HangulTok = HangulSeq >start @end;
 | 
						|
        EmojiTok = EmojiSeq >start @end;
 | 
						|
        ZWJTok = ZWJSeq >start @end;
 | 
						|
        EmojiFlagTok = EmojiFlagSeq >start @end;
 | 
						|
        OtherTok = OtherSeq >start @end;
 | 
						|
        PrependTok = PrependSeq >start @end;
 | 
						|
 | 
						|
        main := |*
 | 
						|
            CRLFTok => emit;
 | 
						|
            ControlTok => emit;
 | 
						|
            HangulTok => emit;
 | 
						|
            EmojiTok => emit;
 | 
						|
            ZWJTok => emit;
 | 
						|
            EmojiFlagTok => emit;
 | 
						|
            PrependTok => emit;
 | 
						|
            OtherTok => emit;
 | 
						|
 | 
						|
            # any single valid UTF-8 character would also be valid per spec,
 | 
						|
            # but we'll handle that separately after the loop so we can deal
 | 
						|
            # with requesting more bytes if we're not at EOF.
 | 
						|
        *|;
 | 
						|
 | 
						|
        write init;
 | 
						|
        write exec;
 | 
						|
    }%%
 | 
						|
 | 
						|
    // If we fall out here then we were unable to complete a sequence.
 | 
						|
    // If we weren't able to complete a sequence then either we've
 | 
						|
    // reached the end of a partial buffer (so there's more data to come)
 | 
						|
    // or we have an isolated symbol that would normally be part of a
 | 
						|
    // grapheme cluster but has appeared in isolation here.
 | 
						|
 | 
						|
    if !atEOF {
 | 
						|
        // Request more
 | 
						|
        return 0, nil, nil
 | 
						|
    }
 | 
						|
 | 
						|
    // Just take the first UTF-8 sequence and return that.
 | 
						|
    _, seqLen := utf8.DecodeRune(data)
 | 
						|
    return seqLen, data[:seqLen], nil
 | 
						|
}
 |