summaryrefslogtreecommitdiff
path: root/vendor/github.com/hashicorp/hil/scanner/scanner.go
blob: bab86c67a63e0044e404e969bb9258a235a9534e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
package scanner

import (
	"unicode"
	"unicode/utf8"

	"github.com/hashicorp/hil/ast"
)

// Scan returns a channel that receives Tokens from the given input string.
//
// The scanner's job is just to partition the string into meaningful parts.
// It doesn't do any transformation of the raw input string, so the caller
// must deal with any further interpretation required, such as parsing INTEGER
// tokens into real ints, or dealing with escape sequences in LITERAL or
// STRING tokens.
//
// Strings in the returned tokens are slices from the original string.
//
// startPos should be set to ast.InitPos unless the caller knows that
// this interpolation string is part of a larger file and knows the position
// of the first character in that larger file.
func Scan(s string, startPos ast.Pos) <-chan *Token {
	ch := make(chan *Token)
	go scan(s, ch, startPos)
	return ch
}

// scan is the worker behind Scan: it tokenizes s, sending each token on ch,
// and closes ch when done. It alternates between literal processing (outside
// any ${ .. } sequence) and interpolation processing (inside one), tracking
// quote nesting so literals can be classified as LITERAL or STRING tokens.
// After emitting an INVALID token it sends a synthetic EOF and stops, since
// further scanning would likely just produce more garbage.
func scan(s string, ch chan<- *Token, pos ast.Pos) {
	// 'remain' starts off as the whole string but we gradually
	// slice off the front of it as we work our way through.
	remain := s

	// nesting keeps track of how many ${ .. } sequences we are
	// inside, so we can recognize the minor differences in syntax
	// between outer string literals (LITERAL tokens) and quoted
	// string literals (STRING tokens).
	nesting := 0

	// We're going to flip back and forth between parsing literals/strings
	// and parsing interpolation sequences ${ .. } until we reach EOF or
	// some INVALID token.
All:
	for {
		startPos := pos
		// Literal string processing first, since the beginning of
		// a string is always outside of an interpolation sequence.
		literalVal, terminator := scanLiteral(remain, pos, nesting > 0)

		if len(literalVal) > 0 {
			litType := LITERAL
			if nesting > 0 {
				litType = STRING
			}
			ch <- &Token{
				Type:    litType,
				Content: literalVal,
				Pos:     startPos,
			}
			remain = remain[len(literalVal):]
		}

		// The terminator token (BEGIN, CQUOTE, EOF or INVALID) is emitted
		// too, and consumed from the remaining input.
		ch <- terminator
		remain = remain[len(terminator.Content):]
		pos = terminator.Pos
		// Safe to use len() here because none of the terminator tokens
		// can contain UTF-8 sequences.
		pos.Column = pos.Column + len(terminator.Content)

		switch terminator.Type {
		case INVALID:
			// Synthetic EOF after invalid token, since further scanning
			// is likely to just produce more garbage.
			ch <- &Token{
				Type:    EOF,
				Content: "",
				Pos:     pos,
			}
			break All
		case EOF:
			// All done!
			break All
		case BEGIN:
			nesting++
		case CQUOTE:
			// nothing special to do
		default:
			// Should never happen
			panic("invalid string/literal terminator")
		}

		// Now we do the processing of the insides of ${ .. } sequences.
		// This loop terminates when we encounter either a closing } or
		// an opening ", which will cause us to return to literal processing.
	Interpolation:
		for {

			token, size, newPos := scanInterpolationToken(remain, pos)
			ch <- token
			remain = remain[size:]
			pos = newPos

			switch token.Type {
			case INVALID:
				// Synthetic EOF after invalid token, since further scanning
				// is likely to just produce more garbage.
				ch <- &Token{
					Type:    EOF,
					Content: "",
					Pos:     pos,
				}
				break All
			case EOF:
				// All done
				// (though a syntax error that we'll catch in the parser)
				break All
			case END:
				nesting--
				if nesting < 0 {
					// Can happen if there are unbalanced ${ and } sequences
					// in the input, which we'll catch in the parser.
					nesting = 0
				}
				break Interpolation
			case OQUOTE:
				// Beginning of nested quoted string
				break Interpolation
			}
		}
	}

	close(ch)
}

// scanInterpolationToken returns the token found at the start of the given
// string, followed by the number of bytes that were consumed from the string
// and the adjusted source position.
//
// Note that the number of bytes consumed can be more than the length of
// the returned token contents if the string begins with whitespace, since
// it will be silently consumed before reading the token.
func scanInterpolationToken(s string, startPos ast.Pos) (*Token, int, ast.Pos) {
	pos := startPos
	consumed := 0

	// Discard leading whitespace, keeping the source position in sync.
	for len(s) > 0 && byteIsSpace(s[0]) {
		if s[0] == '\n' {
			pos.Line++
			pos.Column = 1
		} else {
			pos.Column++
		}
		consumed++
		s = s[1:]
	}

	// Ran out of input while still inside the sequence.
	if len(s) == 0 {
		eof := &Token{
			Type:    EOF,
			Content: "",
			Pos:     pos,
		}
		return eof, consumed, pos
	}

	// Small helpers so each case below stays compact. mk builds a token at
	// the current position; has2 reports whether s begins with the given
	// two-character operator.
	mk := func(ty TokenType, content string) *Token {
		return &Token{
			Type:    ty,
			Content: content,
			Pos:     pos,
		}
	}
	has2 := func(op string) bool {
		return len(s) >= 2 && s[:2] == op
	}

	first := s[0]
	var token *Token

	switch first {
	case '(', ')', '[', ']', ',', '.', '+', '-', '*', '/', '%', '?', ':':
		// Self-representing punctuation: the byte value doubles as its
		// entry in the TokenType enumeration.
		token = mk(TokenType(first), s[:1])
	case '}':
		token = mk(END, s[:1])
	case '"':
		token = mk(OQUOTE, s[:1])
	case '!':
		if has2("!=") {
			token = mk(NOTEQUAL, s[:2])
		} else {
			token = mk(BANG, s[:1])
		}
	case '<':
		if has2("<=") {
			token = mk(LTE, s[:2])
		} else {
			token = mk(LT, s[:1])
		}
	case '>':
		if has2(">=") {
			token = mk(GTE, s[:2])
		} else {
			token = mk(GT, s[:1])
		}
	case '=':
		if has2("==") {
			token = mk(EQUAL, s[:2])
		} else {
			// A single equals is not a valid operator.
			token = mk(INVALID, s[:1])
		}
	case '&':
		if has2("&&") {
			token = mk(AND, s[:2])
		} else {
			token = mk(INVALID, s[:1])
		}
	case '|':
		if has2("||") {
			token = mk(OR, s[:2])
		} else {
			token = mk(INVALID, s[:1])
		}
	default:
		switch {
		case first >= '0' && first <= '9':
			num, numType := scanNumber(s)
			token = mk(numType, num)
		case stringStartsWithIdentifier(s):
			ident, runeLen := scanIdentifier(s)
			ty := IDENTIFIER
			if ident == "true" || ident == "false" {
				ty = BOOL
			}
			token = mk(ty, ident)
			// Return directly: identifiers may contain multi-byte UTF-8
			// sequences, so the column advances by rune count rather
			// than byte count, unlike the shared handling below.
			pos.Column = pos.Column + runeLen
			return token, consumed + len(ident), pos
		default:
			// Not a legal token start; consume one whole rune so we
			// don't emit a partial UTF-8 sequence.
			_, byteLen := utf8.DecodeRuneInString(s)
			token = mk(INVALID, s[:byteLen])
			// One rune consumed means one column, regardless of bytes.
			pos.Column = pos.Column + 1
			return token, consumed + byteLen, pos
		}
	}

	// Every token reaching here is plain ASCII, so its byte length and
	// its width in columns are the same.
	consumed = consumed + len(token.Content)
	pos.Column = pos.Column + len(token.Content)

	return token, consumed, pos
}

// Returns the (possibly-empty) prefix of the given string that represents
// a literal, followed by the token that marks the end of the literal.
//
// When nested is true we are inside a quoted string, which changes the
// terminator rules: a closing '"' ends the literal (CQUOTE), a backslash
// may escape a quote, and running out of input mid-string is INVALID
// rather than EOF.
func scanLiteral(s string, startPos ast.Pos, nested bool) (string, *Token) {
	litLen := 0
	pos := startPos
	var terminator *Token
	for {

		if litLen >= len(s) {
			if nested {
				// We've ended in the middle of a quoted string,
				// which means this token is actually invalid.
				return "", &Token{
					Type:    INVALID,
					Content: s,
					Pos:     startPos,
				}
			}
			terminator = &Token{
				Type:    EOF,
				Content: "",
				Pos:     pos,
			}
			break
		}

		next := s[litLen]

		// "${" begins an interpolation sequence, unless the '$' is
		// escaped by doubling it.
		if next == '$' && len(s) > litLen+1 {
			follow := s[litLen+1]

			if follow == '{' {
				terminator = &Token{
					Type:    BEGIN,
					Content: s[litLen : litLen+2],
					Pos:     pos,
				}
				pos.Column = pos.Column + 2
				break
			} else if follow == '$' {
				// Double-$ escapes the special processing of $,
				// so we will consume both characters here.
				pos.Column = pos.Column + 2
				litLen = litLen + 2
				continue
			}
		}

		// special handling that applies only to quoted strings
		if nested {
			if next == '"' {
				terminator = &Token{
					Type:    CQUOTE,
					Content: s[litLen : litLen+1],
					Pos:     pos,
				}
				pos.Column = pos.Column + 1
				break
			}

			// Escaped quote marks do not terminate the string.
			//
			// All we do here in the scanner is avoid terminating a string
			// due to an escaped quote. The parser is responsible for the
			// full handling of escape sequences, since it's able to produce
			// better error messages than we can produce in here.
			if next == '\\' && len(s) > litLen+1 {
				follow := s[litLen+1]

				if follow == '"' {
					// \" escapes the special processing of ",
					// so we will consume both characters here.
					pos.Column = pos.Column + 2
					litLen = litLen + 2
					continue
				}
			}
		}

		if next == '\n' {
			pos.Column = 1
			pos.Line++
			litLen++
		} else {
			pos.Column++

			// "Column" measures runes, so we need to actually consume
			// a valid UTF-8 character here.
			_, size := utf8.DecodeRuneInString(s[litLen:])
			litLen = litLen + size
		}

	}

	return s[:litLen], terminator
}

// scanNumber returns the extent of the prefix of the string that represents
// a valid number, along with what type of number it represents: INTEGER or
// FLOAT.
//
// scanNumber does only basic character analysis: numbers consist of digits
// and periods, with at least one period signalling a FLOAT. It's the parser's
// responsibility to validate the form and range of the number, such as
// ensuring that a FLOAT actually contains only one period, etc.
func scanNumber(s string) (string, TokenType) {
	period := -1
	byteLen := 0
	numType := INTEGER
	for byteLen < len(s) {
		next := s[byteLen]
		if next != '.' && (next < '0' || next > '9') {
			break
		}

		if next == '.' {
			// If we've already seen a period, break out; the second
			// period is not part of this number.
			if period >= 0 {
				break
			}

			period = byteLen
			numType = FLOAT
		}

		byteLen++
	}

	// If the number ends in a period, then we're not a float: we're just
	// an integer that happens to be followed by a '.' token. Doing this
	// check after the loop (rather than only when a non-digit byte
	// follows, as before) makes "5." at end-of-input consistently an
	// INTEGER instead of an ill-formed FLOAT.
	if period == byteLen-1 {
		byteLen--
		numType = INTEGER
	}

	return s[:byteLen], numType
}

// scanIdentifier returns the extent of the prefix of the string that
// represents a valid identifier, along with the length of that prefix
// in runes.
//
// Identifiers may contain utf8-encoded non-Latin letters, which will
// cause the returned "rune length" to be shorter than the byte length
// of the returned string.
//
// Identifier characters are letters, marks, numbers, underscores, hyphens
// and periods, plus '*' in the splat position: a star is only legal
// immediately after a period, and must itself be followed by a period to
// remain part of the identifier (so "a.*b" scans as just "a").
func scanIdentifier(s string) (string, int) {
	byteLen := 0
	runeLen := 0
	for {
		if byteLen >= len(s) {
			break
		}

		nextRune, size := utf8.DecodeRuneInString(s[byteLen:])
		if !(nextRune == '_' ||
			nextRune == '-' ||
			nextRune == '.' ||
			nextRune == '*' ||
			unicode.IsNumber(nextRune) ||
			unicode.IsLetter(nextRune) ||
			unicode.IsMark(nextRune)) {
			break
		}

		// If we reach a star, it must follow a period to be part of the
		// same identifier. The byteLen > 0 guard also prevents an
		// out-of-range index (panic) that the previous version hit when
		// called on a string beginning with '*'.
		if nextRune == '*' && (byteLen == 0 || s[byteLen-1] != '.') {
			break
		}

		// If our previous character was a star, then the current must
		// be a period. Otherwise, undo the star (and the period that
		// introduced it) and exit. The rune count is rolled back in
		// step with the byte count — the previous version left runeLen
		// overcounted here, corrupting column tracking in the caller.
		if byteLen > 0 && s[byteLen-1] == '*' && nextRune != '.' {
			byteLen--
			runeLen--
			if byteLen > 0 && s[byteLen-1] == '.' {
				byteLen--
				runeLen--
			}

			break
		}

		byteLen = byteLen + size
		runeLen = runeLen + 1
	}

	return s[:byteLen], runeLen
}

// byteIsSpace reports whether b is one of the whitespace bytes permitted
// inside interpolation sequences: space, tab, carriage return or newline.
// This is deliberately more restrictive than unicode.IsSpace.
func byteIsSpace(b byte) bool {
	return b == ' ' || b == '\t' || b == '\r' || b == '\n'
}

// stringStartsWithIdentifier reports whether the given string begins with
// a character that is a legal start of an identifier: an underscore, an
// ASCII letter, or a UTF-8 sequence decoding to a Unicode letter.
func stringStartsWithIdentifier(s string) bool {
	if s == "" {
		return false
	}

	switch b := s[0]; {
	case b >= 'a' && b <= 'z', b >= 'A' && b <= 'Z', b == '_':
		// Common ASCII starts, answered without decoding.
		return true
	case utf8.RuneStart(b):
		// The first byte opens a UTF-8 sequence that might decode to
		// a Unicode letter.
		r, _ := utf8.DecodeRuneInString(s)
		return unicode.IsLetter(r)
	default:
		// A UTF-8 continuation byte can't begin anything valid.
		return false
	}
}