From bc75601622b5ef29abf593c715c84c5dbf16323f Mon Sep 17 00:00:00 2001 From: Lee ByeongJun Date: Mon, 20 Jan 2025 03:24:43 +0900 Subject: [PATCH 1/2] buffer --- fixer_v2/query/buffer.go | 165 ++++++++++++++ fixer_v2/query/buffer_test.go | 415 ++++++++++++++++++++++++++++++++++ fixer_v2/query/hole_test.go | 37 +++ fixer_v2/query/internal.go | 32 +-- fixer_v2/query/parser.go | 1 - 5 files changed, 634 insertions(+), 16 deletions(-) create mode 100644 fixer_v2/query/buffer.go create mode 100644 fixer_v2/query/buffer_test.go diff --git a/fixer_v2/query/buffer.go b/fixer_v2/query/buffer.go new file mode 100644 index 0000000..3e86cf5 --- /dev/null +++ b/fixer_v2/query/buffer.go @@ -0,0 +1,165 @@ +package query + +import ( + "fmt" + "io" + "strings" +) + +// TODO: should handle Unicode characters? +// TODO: make thread-safe + +// buffer represents a state machine based parser buffer that tracks character transitions +// and accumulates tokens. It maintains internal state for parsing both meta-variables +// and regular text tokens. +type buffer struct { + data []byte // Raw input bytes + length int // Length of input data + index int // Current position in data + + last States // Previous state + state States // Current state + class Classes // Character class of current byte + + tokenStart int // Starting position of current token + tokenValue strings.Builder // Accumulates characters for current token +} + +// newBuffer creates a new buffer instance initialized with the input string. +// The buffer starts in the GO (initial) state. +func newBuffer(input string) *buffer { + return &buffer{ + data: []byte(input), + length: len(input), + index: 0, + last: GO, + state: GO, + } +} + +// startToken begins accumulating a new token by recording the start position +// and resetting the token value builder. This should be called at the start +// of parsing any new token. +func (b *buffer) startToken() { + b.tokenStart = b.index + b.tokenValue.Reset() +} + +// getClass determines the character class of the current byte in the buffer. +// Returns `C_OTHER` if beyond buffer bounds. +func (b *buffer) getClass() Classes { + if b.index >= b.length { + return C_OTHER + } + return getCharacterClass(b.data[b.index]) +} + +// transition performs a state transition based on the current character and state. +// Returns the next state and we can detect any error that occurred during transition. +func (b *buffer) transition() (States, error) { + if b.index >= b.length { + return __, io.EOF + } + + b.class = b.getClass() + nextState := StateTransitionTable[b.state][b.class] + + // check for error state + if nextState == ER { + return ER, fmt.Errorf("invalid syntax at position %d", b.index) + } + + // update state + b.last = b.state + b.state = nextState + + return b.state, nil +} + +// parseMetaVariable parses a meta-variable pattern like :[name] or :[name:type] +// and returns the corresponding HoleConfig. +// +// The parsing process: +// 1. Starts with ':' character +// 2. Accumulates characters while tracking state transitions +// 3. Handles closing brackets (CB or QB states) +// 4. Optionally processes quantifiers (*, +, ?) +func (b *buffer) parseMetaVariable() (*HoleConfig, error) { + b.startToken() + + // check initial state + if b.index >= b.length || b.data[b.index] != ':' { + return nil, fmt.Errorf("expected ':' at position %d", b.index) + } + + for b.index < b.length { + state, err := b.transition() + if err != nil { + return nil, err + } + + // process current character + b.tokenValue.WriteByte(b.data[b.index]) + b.index++ + + // CB(closing bracket) or QB(double closing bracket) state reached + if state == CB || state == QB { + // check if next character is quantifier + if b.index < b.length && isQuantifier(b.data[b.index]) { + b.tokenValue.WriteByte(b.data[b.index]) + b.index++ + state = QT + } + + // create token + value := b.tokenValue.String() + config, err := ParseHolePattern(value) + if err != nil { + return nil, err + } + return config, nil + } + } + + return nil, fmt.Errorf("incomplete meta variable at position %d", b.tokenStart) +} + +// parseText parses regular text content until a special character or meta-variable +// pattern is encountered. Handles both regular text and whitespace. +// +// The parsing process: +// 1. Accumulates characters while in TX or WS states +// 2. Stops at special characters (CL, OB, DB states) +// 3. Returns accumulated text or error if no text found +func (b *buffer) parseText() (string, error) { + b.startToken() + + for b.index < b.length { + state, err := b.transition() + if err != nil && err != io.EOF { + return "", err + } + + currentChar := b.data[b.index] + + // stop at special characters or meta-variable start + if state == CL || state == OB || state == DB { + break + } + + // handle whitespace or regular text + if state == TX || state == WS { + b.tokenValue.WriteByte(currentChar) + b.index++ + continue + } + + break + } + + if b.tokenValue.Len() == 0 { + return "", fmt.Errorf("no text found at position %d", b.tokenStart) + } + + return b.tokenValue.String(), nil +} diff --git a/fixer_v2/query/buffer_test.go b/fixer_v2/query/buffer_test.go new file mode 100644 index 0000000..a3e78c1 --- /dev/null +++ b/fixer_v2/query/buffer_test.go @@ -0,0 +1,415 @@ +package query + +import ( + "testing" +) + +func TestNewBuffer(t *testing.T) { + tests := []struct { + name string + input string + wantLen int + wantLast States + wantState States + }{ + { + name: "empty input", + input: "", + wantLen: 0, + wantLast: GO, + wantState: GO, + }, + { + name: "simple input", + input: "test", + wantLen: 4, + wantLast: GO, + wantState: GO, + }, + { + name: "with metavariable", + input: ":[test]", + wantLen: 7, + wantLast: GO, + wantState: GO, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := newBuffer(tt.input) + if b.length != tt.wantLen { + t.Errorf("newBuffer().length = %v, want %v", b.length, tt.wantLen) + } + if b.last != tt.wantLast { + t.Errorf("newBuffer().last = %v, want %v", b.last, tt.wantLast) + } + if b.state != tt.wantState { + t.Errorf("newBuffer().state = %v, want %v", b.state, tt.wantState) + } + }) + } +} + +func TestBuffer_StartToken(t *testing.T) { + b := newBuffer("test input") + b.index = 5 + b.tokenValue.WriteString("existing") + + b.startToken() + + if b.tokenStart != 5 { + t.Errorf("buffer.tokenStart = %v, want %v", b.tokenStart, 5) + } + if b.tokenValue.Len() != 0 { + t.Errorf("buffer.tokenValue length = %v, want 0", b.tokenValue.Len()) + } +} + +func TestBuffer_GetClass(t *testing.T) { + tests := []struct { + name string + input string + index int + want Classes + }{ + { + name: "colon", + input: ":", + index: 0, + want: C_COLON, + }, + { + name: "left bracket", + input: "[", + index: 0, + want: C_LBRACK, + }, + { + name: "identifier", + input: "abc", + index: 0, + want: C_IDENT, + }, + { + name: "out of bounds", + input: "", + index: 0, + want: C_OTHER, + }, + { + name: "whitespace", + input: " \t\n", + index: 0, + want: C_SPACE, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := newBuffer(tt.input) + b.index = tt.index + if got := b.getClass(); got != tt.want { + t.Errorf("buffer.getClass() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestBuffer_Transition(t *testing.T) { + tests := []struct { + name string + input string + initState States + wantState States + wantErr bool + }{ + { + name: "start of metavariable", + input: ":[test]", + initState: GO, + wantState: CL, + wantErr: false, + }, + { + name: "invalid transition", + input: "]", + initState: GO, + wantState: ER, + wantErr: true, + }, + { + name: "empty input", + input: "", + initState: GO, + wantState: __, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := newBuffer(tt.input) + b.state = tt.initState + + got, err := b.transition() + if (err != nil) != tt.wantErr { + t.Errorf("buffer.transition() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.wantState { + t.Errorf("buffer.transition() = %v, want %v", got, tt.wantState) + } + }) + } +} + +func TestBuffer_ParseMetaVariable(t *testing.T) { + tests := []struct { + name string + input string + want *HoleConfig + wantErr bool + }{ + { + name: "simple metavariable", + input: ":[test]", + want: &HoleConfig{ + Name: "test", + Type: HoleAny, + Quantifier: QuantNone, + }, + wantErr: false, + }, + { + name: "typed metavariable", + input: ":[test:identifier]", + want: &HoleConfig{ + Name: "test", + Type: HoleIdentifier, + Quantifier: QuantNone, + }, + wantErr: false, + }, + { + name: "incomplete metavariable", + input: ":[test", + want: nil, + wantErr: true, + }, + { + name: "simple metavariable with plus quantifier", + input: ":[test]+", + want: &HoleConfig{ + Name: "test", + Type: HoleAny, + Quantifier: QuantOneOrMore, + }, + wantErr: false, + }, + { + name: "simple metavariable with star quantifier", + input: ":[test]*", + want: &HoleConfig{ + Name: "test", + Type: HoleAny, + Quantifier: QuantZeroOrMore, + }, + wantErr: false, + }, + { + name: "simple metavariable with question mark quantifier", + input: ":[test]?", + want: &HoleConfig{ + Name: "test", + Type: HoleAny, + Quantifier: QuantZeroOrOne, + }, + wantErr: false, + }, + { + name: "typed metavariable with whitespace type", + input: ":[ws:whitespace]", + want: &HoleConfig{ + Name: "ws", + Type: HoleWhitespace, + Quantifier: QuantNone, + }, + wantErr: false, + }, + { + name: "typed metavariable but no type", + input: ":[test:]", + want: nil, + wantErr: true, + }, + { + name: "typed metavariable but only colon", + input: ":[:]", + want: nil, + wantErr: true, + }, + { + name: "typed metavariable with block type and quantifier", + input: ":[b:block]*", + want: &HoleConfig{ + Name: "b", + Type: HoleBlock, + Quantifier: QuantZeroOrMore, + }, + wantErr: false, + }, + { + name: "invalid metavariable - empty name", + input: ":[]", + want: nil, + wantErr: true, + }, + { + name: "invalid metavariable - invalid type", + input: ":[test:invalid]", + want: nil, + wantErr: true, + }, + { + name: "invalid metavariable - multiple colons", + input: ":[test:type:extra]", + want: nil, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := newBuffer(tt.input) + got, err := b.parseMetaVariable() + if (err != nil) != tt.wantErr { + t.Errorf("buffer.parseMetaVariable() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !tt.wantErr && !compareHoleConfig(got, tt.want) { + t.Errorf("buffer.parseMetaVariable() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestBuffer_ParseText(t *testing.T) { + tests := []struct { + name string + input string + want string + wantErr bool + }{ + { + name: "simple text", + input: "hello", + want: "hello", + wantErr: false, + }, + { + name: "long text", + input: "Hello world This is a test string with some content", + want: "Hello world This is a test string with some content", + wantErr: false, + }, + { + name: "text until special char", + input: "hello:[test]", + want: "hello", + wantErr: false, + }, + { + name: "only whitespace", + input: " \t\n", + want: " \t\n", + wantErr: false, + }, + { + name: "text until metavariable", + input: "hello:[var]", + want: "hello", + wantErr: false, + }, + { + name: "text between metavariables", + input: ":[var1]middle:[var2]", + want: "", + wantErr: true, + }, + { + name: "empty input", + input: "", + want: "", + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + b := newBuffer(tt.input) + got, err := b.parseText() + if (err != nil) != tt.wantErr { + t.Errorf("buffer.parseText() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("buffer.parseText() = %v, want %v", got, tt.want) + } + }) + } +} + +func compareHoleConfig(a, b *HoleConfig) bool { + if a == nil || b == nil { + return a == b + } + return a.Name == b.Name && + a.Type == b.Type && + a.Quantifier == b.Quantifier +} + +func BenchmarkBuffer_ParseMetaVariable(b *testing.B) { + cases := []struct { + name string + input string + }{ + { + name: "simple", + input: ":[var]", + }, + { + name: "identifier_with_quantifier", + input: ":[test:identifier]*", + }, + { + name: "block_with_quantifier", + input: ":[block:block]+", + }, + { + name: "expression_optional", + input: ":[expr:expression]?", + }, + { + name: "whitespace", + input: ":[ws:whitespace]", + }, + { + name: "multiple hole expressions", + input: ":[[var:identifier]]+ :[[expr:expression]]?", + }, + } + + for _, tc := range cases { + b.Run(tc.name, func(b *testing.B) { + buffer := newBuffer(tc.input) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + buffer.index = 0 + buffer.state = GO + _, _ = buffer.parseMetaVariable() + } + }) + } +} diff --git a/fixer_v2/query/hole_test.go b/fixer_v2/query/hole_test.go index 60d3622..edc641b 100644 --- a/fixer_v2/query/hole_test.go +++ b/fixer_v2/query/hole_test.go @@ -263,3 +263,40 @@ func TestMatchHoleWithConfig(t *testing.T) { }) } } + +func BenchmarkParseHolePattern(b *testing.B) { + patterns := []struct { + name string + pattern string + }{ + { + name: "simple", + pattern: ":[var]", + }, + { + name: "identifier_with_quantifier", + pattern: ":[[name:identifier]]*", + }, + { + name: "block_with_quantifier", + pattern: ":[[block:block]]+", + }, + { + name: "complex_expression", + pattern: ":[[expr:expression]]?", + }, + { + name: "multiple hole expressions", + pattern: ":[[var:identifier]]+ :[[expr:expression]]?", + }, + } + + for _, p := range patterns { + b.Run(p.name, func(b *testing.B) { + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, _ = ParseHolePattern(p.pattern) + } + }) + } +} diff --git a/fixer_v2/query/internal.go b/fixer_v2/query/internal.go index 8516f53..3801a06 100644 --- a/fixer_v2/query/internal.go +++ b/fixer_v2/query/internal.go @@ -59,6 +59,8 @@ type ( Classes int8 // Represents character classes in the pattern ) +const __ States = -1 + // States represent different stages of lexical analysis: // - GO (0) - Initial state, ready to start processing input // - OK (1) - Accept state, token successfully recognized @@ -114,21 +116,21 @@ const ( // 3. After quantifiers (QT), we can continue with any valid pattern start // 4. TX (text) state allows transitioning back to pattern parsing var StateTransitionTable = [14][9]States{ - // COLON LBRACK RBRACK LBRACE RBRACE SPACE IDENT QUANT OTHER - /* GO 0*/ { CL, OB, ER, BR, BR, WS, TX, ER, ER }, - /* OK 1*/ { CL, OB, ER, BR, BR, WS, TX, ER, ER }, - /* CL 2*/ { TX, OB, ER, ER, ER, ER, ID, ER, ER }, - /* OB 3*/ { TX, DB, ER, ER, ER, ER, NM, ER, ER }, - /* DB 4*/ { TX, ER, ER, ER, ER, ER, NM, ER, ER }, - /* NM 5*/ { ID, ER, CB, ER, ER, ER, NM, ER, ER }, - /* ID 6*/ { ER, ER, CB, ER, ER, ER, ID, ER, ER }, - /* CB 7*/ { OK, ER, QB, ER, ER, WS, TX, QT, ER }, - /* QB 8*/ { OK, ER, ER, ER, ER, WS, TX, QT, ER }, - /* QT 9*/ { CL, ER, ER, BR, BR, WS, TX, ER, ER }, - /* TX10*/ { CL, ER, ER, BR, BR, WS, TX, ER, ER }, - /* WS11*/ { CL, ER, ER, BR, BR, WS, TX, ER, ER }, - /* BR12*/ { CL, ER, ER, BR, OK, WS, TX, ER, ER }, - /* ER13*/ { ER, ER, ER, ER, ER, ER, ER, ER, ER }, + // COLON LBRACK RBRACK LBRACE RBRACE SPACE IDENT QUANT OTHER + /* GO 0*/ { CL, OB, ER, BR, BR, WS, TX, ER, ER }, + /* OK 1*/ { CL, OB, ER, BR, BR, WS, TX, ER, ER }, + /* CL 2*/ { TX, OB, ER, ER, ER, ER, ID, ER, ER }, + /* OB 3*/ { TX, DB, ER, ER, ER, ER, NM, ER, ER }, + /* DB 4*/ { TX, ER, ER, ER, ER, ER, NM, ER, ER }, + /* NM 5*/ { ID, ER, CB, ER, ER, ER, NM, ER, ER }, + /* ID 6*/ { ER, ER, CB, ER, ER, ER, ID, ER, ER }, + /* CB 7*/ { OK, ER, QB, ER, ER, WS, TX, QT, ER }, + /* QB 8*/ { OK, ER, ER, ER, ER, WS, TX, QT, ER }, + /* QT 9*/ { CL, ER, ER, BR, BR, WS, TX, ER, ER }, + /* TX10*/ { CL, ER, ER, BR, BR, WS, TX, ER, ER }, + /* WS11*/ { CL, ER, ER, BR, BR, WS, TX, ER, ER }, + /* BR12*/ { CL, ER, ER, BR, OK, WS, TX, ER, ER }, + /* ER13*/ { ER, ER, ER, ER, ER, ER, ER, ER, ER }, } // isFinalState determines whether a given state is a final (accepting) state. diff --git a/fixer_v2/query/parser.go b/fixer_v2/query/parser.go index db3996f..9416f5f 100644 --- a/fixer_v2/query/parser.go +++ b/fixer_v2/query/parser.go @@ -34,7 +34,6 @@ func (p *Parser) Parse() Node { return rootNode } -// parseNode parses a single node based on the current token // parseNode parses a single node based on the current token func (p *Parser) parseNode() Node { token := p.tokens[p.current] From b7b5f0416efca926782f55ae1e847ef518dd7f0c Mon Sep 17 00:00:00 2001 From: Lee ByeongJun Date: Mon, 20 Jan 2025 03:28:24 +0900 Subject: [PATCH 2/2] ci --- fixer_v2/query/buffer.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fixer_v2/query/buffer.go b/fixer_v2/query/buffer.go index 3e86cf5..9b7cc28 100644 --- a/fixer_v2/query/buffer.go +++ b/fixer_v2/query/buffer.go @@ -1,6 +1,7 @@ package query import ( + "errors" "fmt" "io" "strings" @@ -136,7 +137,7 @@ func (b *buffer) parseText() (string, error) { for b.index < b.length { state, err := b.transition() - if err != nil && err != io.EOF { + if err != nil && !errors.Is(err, io.EOF) { return "", err }