ref: d6c16afde0ce62cfea73447f30d6ed2b8ef4b411
parent: 6f3716dc22e373097a38d053f5415feca602f330
author: Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
date: Wed Oct 17 08:57:09 EDT 2018
Move the shortcode parser to the new pageparser package

See #5324
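
A minimal usage sketch of the relocated API (illustrative only, not part of the
diff below; the sample input and the main-package wrapper are assumptions):

    package main

    import (
        "fmt"

        "github.com/gohugoio/hugo/parser/pageparser"
    )

    func main() {
        // Tokenize a content snippet with the moved shortcode lexer.
        pt := pageparser.Parse(`Hello {{< myshortcode param1 >}} world`)
        for {
            item := pt.Next()
            switch {
            case item.IsEOF(), item.IsError():
                // Item implements fmt.Stringer; errors print their message.
                fmt.Println("done:", item)
                return
            case item.IsText():
                fmt.Printf("text: %q\n", item.Val)
            default:
                fmt.Printf("token: %s\n", item)
            }
        }
    }
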
--- a/hugolib/shortcode.go
+++ b/hugolib/shortcode.go
@@ -22,6 +22,8 @@
"regexp"
"sort"
+ "github.com/gohugoio/hugo/parser/pageparser"
+
_errors "github.com/pkg/errors"
"strings"
@@ -478,18 +480,18 @@
// pageTokens state:
// - before: positioned just before the shortcode start
// - after: shortcode(s) consumed (plural when they are nested)
-func (s *shortcodeHandler) extractShortcode(ordinal int, pt *pageTokens, p *PageWithoutContent) (*shortcode, error) {
+func (s *shortcodeHandler) extractShortcode(ordinal int, pt *pageparser.Tokens, p *PageWithoutContent) (*shortcode, error) {
	sc := &shortcode{ordinal: ordinal}
	var isInner = false
- var currItem item
var cnt = 0
var nestedOrdinal = 0
// TODO(bep) 2errors revisit after https://github.com/gohugoio/hugo/issues/5324
-	msgf := func(i item, format string, args ...interface{}) string {
+	msgf := func(i pageparser.Item, format string, args ...interface{}) string {
		format = format + ":%d:"
- c1 := strings.Count(pt.lexer.input[:i.pos], "\n") + 1
+ // TODO(bep) 2errors
+ c1 := 32 // strings.Count(pt.lexer.input[:i.pos], "\n") + 1
		c2 := bytes.Count(p.frontmatter, []byte{'\n'})
		args = append(args, c1+c2)
return fmt.Sprintf(format, args...)
@@ -498,18 +500,17 @@
Loop:
	for {
-		currItem = pt.next()
-
-		switch currItem.typ {
-		case tLeftDelimScWithMarkup, tLeftDelimScNoMarkup:
-			next := pt.peek()
-			if next.typ == tScClose {
+		currItem := pt.Next()
+		switch {
+		case currItem.IsLeftShortcodeDelim():
+			next := pt.Peek()
+			if next.IsShortcodeClose() {
				continue
			}
			if cnt > 0 {
				// nested shortcode; append it to inner content
- pt.backup3(currItem, next)
+ pt.Backup3(currItem, next)
nested, err := s.extractShortcode(nestedOrdinal, pt, p)
nestedOrdinal++
if nested.name != "" {@@ -522,12 +523,12 @@
}
			} else {
-				sc.doMarkup = currItem.typ == tLeftDelimScWithMarkup
+ sc.doMarkup = currItem.IsShortcodeMarkupDelimiter()
}
cnt++
- case tRightDelimScWithMarkup, tRightDelimScNoMarkup:
+ case currItem.IsRightShortcodeDelim():
// we trust the template on this:
// if there's no inner, we're done
			if !isInner {
@@ -534,27 +535,27 @@
return sc, nil
}
- case tScClose:
- next := pt.peek()
+ case currItem.IsShortcodeClose():
+ next := pt.Peek()
			if !isInner {
-				if next.typ == tError {
+				if next.IsError() {
					// return that error, more specific
continue
}
- return sc, errors.New(msgf(next, "shortcode %q has no .Inner, yet a closing tag was provided", next.val))
+ return sc, errors.New(msgf(next, "shortcode %q has no .Inner, yet a closing tag was provided", next.Val))
}
-			if next.typ == tRightDelimScWithMarkup || next.typ == tRightDelimScNoMarkup {
+			if next.IsRightShortcodeDelim() {
				// self-closing
- pt.consume(1)
+ pt.Consume(1)
			} else {
-				pt.consume(2)
+ pt.Consume(2)
}
return sc, nil
- case tText:
- sc.inner = append(sc.inner, currItem.val)
- case tScName:
- sc.name = currItem.val
+ case currItem.IsText():
+ sc.inner = append(sc.inner, currItem.Val)
+ case currItem.IsShortcodeName():
+ sc.name = currItem.Val
// We pick the first template for an arbitrary output format
// if more than one. It is "all inner or no inner".
			tmpl := getShortcodeTemplateForTemplateKey(scKey{}, sc.name, p.s.Tmpl)
@@ -568,18 +569,18 @@
return sc, _errors.Wrap(err, msgf(currItem, "failed to handle template for shortcode %q", sc.name))
}
- case tScParam:
-			if !pt.isValueNext() {
+		case currItem.IsShortcodeParam():
+			if !pt.IsValueNext() {
				continue
-			} else if pt.peek().typ == tScParamVal {
+			} else if pt.Peek().IsShortcodeParamVal() {
				// named params
				if sc.params == nil {
					params := make(map[string]string)
- params[currItem.val] = pt.next().val
+ params[currItem.Val] = pt.Next().Val
sc.params = params
				} else {
					if params, ok := sc.params.(map[string]string); ok {
-						params[currItem.val] = pt.next().val
+ params[currItem.Val] = pt.Next().Val
					} else {
						return sc, errShortCodeIllegalState
}
@@ -589,11 +590,11 @@
// positional params
				if sc.params == nil {
					var params []string
- params = append(params, currItem.val)
+ params = append(params, currItem.Val)
sc.params = params
				} else {
					if params, ok := sc.params.([]string); ok {
-						params = append(params, currItem.val)
+ params = append(params, currItem.Val)
sc.params = params
					} else {
						return sc, errShortCodeIllegalState
@@ -602,9 +603,9 @@
}
}
- case tError, tEOF:
+ case currItem.IsDone():
// handled by caller
- pt.backup()
+ pt.Backup()
break Loop
}
@@ -624,7 +625,7 @@
// the parser takes a string;
// since this is an internal API, it could make sense to use the mutable []byte all the way, but
// it seems that the time isn't really spent in the byte copy operations, and the impl. gets a lot cleaner
- pt := &pageTokens{lexer: newShortcodeLexer("parse-page", stringToParse, pos(startIdx))}+ pt := pageparser.ParseFrom(stringToParse, startIdx)
result := bp.GetBuffer()
defer bp.PutBuffer(result)
@@ -632,20 +633,19 @@
// the parser is guaranteed to return items in proper order or fail, so …
// … it's safe to keep some "global" state
- var currItem item
var currShortcode shortcode
var ordinal int
Loop:
	for {
-		currItem = pt.next()
+ currItem := pt.Next()
-		switch currItem.typ {
-		case tText:
- result.WriteString(currItem.val)
- case tLeftDelimScWithMarkup, tLeftDelimScNoMarkup:
+		switch {
+		case currItem.IsText():
+ result.WriteString(currItem.Val)
+ case currItem.IsLeftShortcodeDelim():
// let extractShortcode handle left delim (will do so recursively)
- pt.backup()
+ pt.Backup()
currShortcode, err := s.extractShortcode(ordinal, pt, p)
@@ -665,11 +665,11 @@
result.WriteString(placeHolder)
ordinal++
s.shortcodes.Add(placeHolder, currShortcode)
- case tEOF:
+ case currItem.IsEOF():
break Loop
- case tError:
+ case currItem.IsError():
err := fmt.Errorf("%s:shortcode:%d: %s",- p.pathOrTitle(), (p.lineNumRawContentStart() + pt.lexer.lineNum() - 1), currItem)
+ p.pathOrTitle(), (p.lineNumRawContentStart() + pt.LineNumber() - 1), currItem)
currShortcode.err = err
return result.String(), err
}
--- a/hugolib/shortcodeparser.go
+++ /dev/null
@@ -1,586 +1,0 @@
-// Copyright 2015 The Hugo Authors. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package hugolib
-
-import (
- "fmt"
- "strings"
- "unicode"
- "unicode/utf8"
-)
-
-// The lexical scanning below is highly inspired by the great talk given by
-// Rob Pike called "Lexical Scanning in Go" (it's on YouTube, Google it!).
-// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
-
-// parsing
-
-type pageTokens struct {- lexer *pagelexer
- token [3]item // 3-item look-ahead is what we currently need
- peekCount int
-}
-
-func (t *pageTokens) next() item {- if t.peekCount > 0 {- t.peekCount--
- } else {- t.token[0] = t.lexer.nextItem()
- }
- return t.token[t.peekCount]
-}
-
-// backs up one token.
-func (t *pageTokens) backup() {- t.peekCount++
-}
-
-// backs up two tokens.
-func (t *pageTokens) backup2(t1 item) {- t.token[1] = t1
- t.peekCount = 2
-}
-
-// backs up three tokens.
-func (t *pageTokens) backup3(t2, t1 item) {- t.token[1] = t1
- t.token[2] = t2
- t.peekCount = 3
-}
-
-// check for non-error and non-EOF types coming next
-func (t *pageTokens) isValueNext() bool {- i := t.peek()
- return i.typ != tError && i.typ != tEOF
-}
-
-// look at, but do not consume, the next item
-// repeated, sequential calls will return the same item
-func (t *pageTokens) peek() item {- if t.peekCount > 0 {- return t.token[t.peekCount-1]
- }
- t.peekCount = 1
- t.token[0] = t.lexer.nextItem()
- return t.token[0]
-}
-
-// convencience method to consume the next n tokens, but back off Errors and EOF
-func (t *pageTokens) consume(cnt int) {- for i := 0; i < cnt; i++ {- token := t.next()
- if token.typ == tError || token.typ == tEOF {- t.backup()
- break
- }
- }
-}
-
-// lexical scanning
-
-// position (in bytes)
-type pos int
-
-type item struct {- typ itemType
- pos pos
- val string
-}
-
-func (i item) String() string {- switch {- case i.typ == tEOF:
- return "EOF"
- case i.typ == tError:
- return i.val
- case i.typ > tKeywordMarker:
- return fmt.Sprintf("<%s>", i.val)- case len(i.val) > 20:
- return fmt.Sprintf("%.20q...", i.val)- }
- return fmt.Sprintf("[%s]", i.val)-}
-
-type itemType int
-
-const (
- tError itemType = iota
- tEOF
-
- // shortcode items
- tLeftDelimScNoMarkup
- tRightDelimScNoMarkup
- tLeftDelimScWithMarkup
- tRightDelimScWithMarkup
- tScClose
- tScName
- tScParam
- tScParamVal
-
- //itemIdentifier
- tText // plain text, used for everything outside the shortcodes
-
- // preserved for later - keywords come after this
- tKeywordMarker
-)
-
-const eof = -1
-
-// returns the next state in scanner.
-type stateFunc func(*pagelexer) stateFunc
-
-type pagelexer struct {- name string
- input string
- state stateFunc
- pos pos // input position
- start pos // item start position
- width pos // width of last element
- lastPos pos // position of the last item returned by nextItem
-
- // shortcode state
- currLeftDelimItem itemType
- currRightDelimItem itemType
- currShortcodeName string // is only set when a shortcode is in opened state
- closingState int // > 0 = on its way to be closed
- elementStepNum int // step number in element
- paramElements int // number of elements (name + value = 2) found first
- openShortcodes map[string]bool // set of shortcodes in open state
-
- // items delivered to client
- items []item
-}
-
-// note: the input position here is normally 0 (start), but
-// can be set if position of first shortcode is known
-func newShortcodeLexer(name, input string, inputPosition pos) *pagelexer {- lexer := &pagelexer{- name: name,
- input: input,
- currLeftDelimItem: tLeftDelimScNoMarkup,
- currRightDelimItem: tRightDelimScNoMarkup,
- pos: inputPosition,
- openShortcodes: make(map[string]bool),
- items: make([]item, 0, 5),
- }
- lexer.runShortcodeLexer()
- return lexer
-}
-
-// main loop
-// this looks kind of funky, but it works
-func (l *pagelexer) runShortcodeLexer() {- for l.state = lexTextOutsideShortcodes; l.state != nil; {- l.state = l.state(l)
- }
-}
-
-// state functions
-
-const (
- leftDelimScNoMarkup = "{{<"- rightDelimScNoMarkup = ">}}"
- leftDelimScWithMarkup = "{{%"- rightDelimScWithMarkup = "%}}"
- leftComment = "/*" // comments in this context us used to to mark shortcodes as "not really a shortcode"
- rightComment = "*/"
-)
-
-func (l *pagelexer) next() rune {- if int(l.pos) >= len(l.input) {- l.width = 0
- return eof
- }
-
- // looks expensive, but should produce the same iteration sequence as the string range loop
- // see: http://blog.golang.org/strings
- runeValue, runeWidth := utf8.DecodeRuneInString(l.input[l.pos:])
- l.width = pos(runeWidth)
- l.pos += l.width
- return runeValue
-}
-
-// peek, but no consume
-func (l *pagelexer) peek() rune {- r := l.next()
- l.backup()
- return r
-}
-
-// steps back one
-func (l *pagelexer) backup() {- l.pos -= l.width
-}
-
-// sends an item back to the client.
-func (l *pagelexer) emit(t itemType) {- l.items = append(l.items, item{t, l.start, l.input[l.start:l.pos]})- l.start = l.pos
-}
-
-// special case, do not send '\\' back to client
-func (l *pagelexer) ignoreEscapesAndEmit(t itemType) {- val := strings.Map(func(r rune) rune {- if r == '\\' {- return -1
- }
- return r
- }, l.input[l.start:l.pos])
- l.items = append(l.items, item{t, l.start, val})- l.start = l.pos
-}
-
-// gets the current value (for debugging and error handling)
-func (l *pagelexer) current() string {- return l.input[l.start:l.pos]
-}
-
-// ignore current element
-func (l *pagelexer) ignore() {- l.start = l.pos
-}
-
-// nice to have in error logs
-func (l *pagelexer) lineNum() int {- return strings.Count(l.input[:l.lastPos], "\n") + 1
-}
-
-// nil terminates the parser
-func (l *pagelexer) errorf(format string, args ...interface{}) stateFunc {- l.items = append(l.items, item{tError, l.start, fmt.Sprintf(format, args...)})- return nil
-}
-
-// consumes and returns the next item
-func (l *pagelexer) nextItem() item {- item := l.items[0]
- l.items = l.items[1:]
- l.lastPos = item.pos
- return item
-}
-
-// scans until an opening shortcode opening bracket.
-// if no shortcodes, it will keep on scanning until EOF
-func lexTextOutsideShortcodes(l *pagelexer) stateFunc {- for {- if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) {- if l.pos > l.start {- l.emit(tText)
- }
- if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) {- l.currLeftDelimItem = tLeftDelimScWithMarkup
- l.currRightDelimItem = tRightDelimScWithMarkup
- } else {- l.currLeftDelimItem = tLeftDelimScNoMarkup
- l.currRightDelimItem = tRightDelimScNoMarkup
- }
- return lexShortcodeLeftDelim
-
- }
- if l.next() == eof {- break
- }
- }
- // Done!
- if l.pos > l.start {- l.emit(tText)
- }
- l.emit(tEOF)
- return nil
-}
-
-func lexShortcodeLeftDelim(l *pagelexer) stateFunc {- l.pos += pos(len(l.currentLeftShortcodeDelim()))
- if strings.HasPrefix(l.input[l.pos:], leftComment) {- return lexShortcodeComment
- }
- l.emit(l.currentLeftShortcodeDelimItem())
- l.elementStepNum = 0
- l.paramElements = 0
- return lexInsideShortcode
-}
-
-func lexShortcodeComment(l *pagelexer) stateFunc {- posRightComment := strings.Index(l.input[l.pos:], rightComment+l.currentRightShortcodeDelim())
- if posRightComment <= 1 {- return l.errorf("comment must be closed")- }
- // we emit all as text, except the comment markers
- l.emit(tText)
- l.pos += pos(len(leftComment))
- l.ignore()
- l.pos += pos(posRightComment - len(leftComment))
- l.emit(tText)
- l.pos += pos(len(rightComment))
- l.ignore()
- l.pos += pos(len(l.currentRightShortcodeDelim()))
- l.emit(tText)
- return lexTextOutsideShortcodes
-}
-
-func lexShortcodeRightDelim(l *pagelexer) stateFunc {- l.closingState = 0
- l.pos += pos(len(l.currentRightShortcodeDelim()))
- l.emit(l.currentRightShortcodeDelimItem())
- return lexTextOutsideShortcodes
-}
-
-// either:
-// 1. param
-// 2. "param" or "param\"
-// 3. param="123" or param="123\"
-// 4. param="Some \"escaped\" text"
-func lexShortcodeParam(l *pagelexer, escapedQuoteStart bool) stateFunc {-
- first := true
- nextEq := false
-
- var r rune
-
- for {- r = l.next()
- if first {- if r == '"' {- // a positional param with quotes
- if l.paramElements == 2 {- return l.errorf("got quoted positional parameter. Cannot mix named and positional parameters")- }
- l.paramElements = 1
- l.backup()
- return lexShortcodeQuotedParamVal(l, !escapedQuoteStart, tScParam)
- }
- first = false
- } else if r == '=' {- // a named param
- l.backup()
- nextEq = true
- break
- }
-
- if !isAlphaNumericOrHyphen(r) {- l.backup()
- break
- }
- }
-
- if l.paramElements == 0 {- l.paramElements++
-
- if nextEq {- l.paramElements++
- }
- } else {- if nextEq && l.paramElements == 1 {- return l.errorf("got named parameter '%s'. Cannot mix named and positional parameters", l.current())- } else if !nextEq && l.paramElements == 2 {- return l.errorf("got positional parameter '%s'. Cannot mix named and positional parameters", l.current())- }
- }
-
- l.emit(tScParam)
- return lexInsideShortcode
-
-}
-
-func lexShortcodeQuotedParamVal(l *pagelexer, escapedQuotedValuesAllowed bool, typ itemType) stateFunc {- openQuoteFound := false
- escapedInnerQuoteFound := false
- escapedQuoteState := 0
-
-Loop:
- for {- switch r := l.next(); {- case r == '\\':
- if l.peek() == '"' {- if openQuoteFound && !escapedQuotedValuesAllowed {- l.backup()
- break Loop
- } else if openQuoteFound {- // the coming quoute is inside
- escapedInnerQuoteFound = true
- escapedQuoteState = 1
- }
- }
- case r == eof, r == '\n':
- return l.errorf("unterminated quoted string in shortcode parameter-argument: '%s'", l.current())- case r == '"':
- if escapedQuoteState == 0 {- if openQuoteFound {- l.backup()
- break Loop
-
- } else {- openQuoteFound = true
- l.ignore()
- }
- } else {- escapedQuoteState = 0
- }
-
- }
- }
-
- if escapedInnerQuoteFound {- l.ignoreEscapesAndEmit(typ)
- } else {- l.emit(typ)
- }
-
- r := l.next()
-
- if r == '\\' {- if l.peek() == '"' {- // ignore the escaped closing quote
- l.ignore()
- l.next()
- l.ignore()
- }
- } else if r == '"' {- // ignore closing quote
- l.ignore()
- } else {- // handled by next state
- l.backup()
- }
-
- return lexInsideShortcode
-}
-
-// scans an alphanumeric inside shortcode
-func lexIdentifierInShortcode(l *pagelexer) stateFunc {- lookForEnd := false
-Loop:
- for {- switch r := l.next(); {- case isAlphaNumericOrHyphen(r):
- // Allow forward slash inside names to make it possible to create namespaces.
- case r == '/':
- default:
- l.backup()
- word := l.input[l.start:l.pos]
- if l.closingState > 0 && !l.openShortcodes[word] {- return l.errorf("closing tag for shortcode '%s' does not match start tag", word)- } else if l.closingState > 0 {- l.openShortcodes[word] = false
- lookForEnd = true
- }
-
- l.closingState = 0
- l.currShortcodeName = word
- l.openShortcodes[word] = true
- l.elementStepNum++
- l.emit(tScName)
- break Loop
- }
- }
-
- if lookForEnd {- return lexEndOfShortcode
- }
- return lexInsideShortcode
-}
-
-func lexEndOfShortcode(l *pagelexer) stateFunc {- if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {- return lexShortcodeRightDelim
- }
- switch r := l.next(); {- case isSpace(r):
- l.ignore()
- default:
- return l.errorf("unclosed shortcode")- }
- return lexEndOfShortcode
-}
-
-// scans the elements inside shortcode tags
-func lexInsideShortcode(l *pagelexer) stateFunc {- if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {- return lexShortcodeRightDelim
- }
- switch r := l.next(); {- case r == eof:
- // eol is allowed inside shortcodes; this may go to end of document before it fails
- return l.errorf("unclosed shortcode action")- case isSpace(r), isEndOfLine(r):
- l.ignore()
- case r == '=':
- l.ignore()
- return lexShortcodeQuotedParamVal(l, l.peek() != '\\', tScParamVal)
- case r == '/':
- if l.currShortcodeName == "" {- return l.errorf("got closing shortcode, but none is open")- }
- l.closingState++
- l.emit(tScClose)
- case r == '\\':
- l.ignore()
- if l.peek() == '"' {- return lexShortcodeParam(l, true)
- }
- case l.elementStepNum > 0 && (isAlphaNumericOrHyphen(r) || r == '"'): // positional params can have quotes
- l.backup()
- return lexShortcodeParam(l, false)
- case isAlphaNumeric(r):
- l.backup()
- return lexIdentifierInShortcode
- default:
- return l.errorf("unrecognized character in shortcode action: %#U. Note: Parameters with non-alphanumeric args must be quoted", r)- }
- return lexInsideShortcode
-}
-
-// state helpers
-
-func (l *pagelexer) currentLeftShortcodeDelimItem() itemType {- return l.currLeftDelimItem
-}
-
-func (l *pagelexer) currentRightShortcodeDelimItem() itemType {- return l.currRightDelimItem
-}
-
-func (l *pagelexer) currentLeftShortcodeDelim() string {- if l.currLeftDelimItem == tLeftDelimScWithMarkup {- return leftDelimScWithMarkup
- }
- return leftDelimScNoMarkup
-
-}
-
-func (l *pagelexer) currentRightShortcodeDelim() string {- if l.currRightDelimItem == tRightDelimScWithMarkup {- return rightDelimScWithMarkup
- }
- return rightDelimScNoMarkup
-}
-
-// helper functions
-
-func isSpace(r rune) bool {- return r == ' ' || r == '\t'
-}
-
-func isAlphaNumericOrHyphen(r rune) bool {- // let unquoted YouTube ids as positional params slip through (they contain hyphens)
- return isAlphaNumeric(r) || r == '-'
-}
-
-func isEndOfLine(r rune) bool {- return r == '\r' || r == '\n'
-}
-
-func isAlphaNumeric(r rune) bool {- return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
-}
--- a/hugolib/shortcodeparser_test.go
+++ /dev/null
@@ -1,207 +1,0 @@
-// Copyright 2015 The Hugo Authors. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-package hugolib
-
-import (
- "testing"
-)
-
-type shortCodeLexerTest struct {- name string
- input string
- items []item
-}
-
-var (
- tstEOF = item{tEOF, 0, ""}- tstLeftNoMD = item{tLeftDelimScNoMarkup, 0, "{{<"}- tstRightNoMD = item{tRightDelimScNoMarkup, 0, ">}}"}- tstLeftMD = item{tLeftDelimScWithMarkup, 0, "{{%"}- tstRightMD = item{tRightDelimScWithMarkup, 0, "%}}"}- tstSCClose = item{tScClose, 0, "/"}- tstSC1 = item{tScName, 0, "sc1"}- tstSC2 = item{tScName, 0, "sc2"}- tstSC3 = item{tScName, 0, "sc3"}- tstSCSlash = item{tScName, 0, "sc/sub"}- tstParam1 = item{tScParam, 0, "param1"}- tstParam2 = item{tScParam, 0, "param2"}- tstVal = item{tScParamVal, 0, "Hello World"}-)
-
-var shortCodeLexerTests = []shortCodeLexerTest{- {"empty", "", []item{tstEOF}},- {"spaces", " \t\n", []item{{tText, 0, " \t\n"}, tstEOF}},- {"text", `to be or not`, []item{{tText, 0, "to be or not"}, tstEOF}},- {"no markup", `{{< sc1 >}}`, []item{tstLeftNoMD, tstSC1, tstRightNoMD, tstEOF}},- {"with EOL", "{{< sc1 \n >}}", []item{tstLeftNoMD, tstSC1, tstRightNoMD, tstEOF}},-
- {"forward slash inside name", `{{< sc/sub >}}`, []item{tstLeftNoMD, tstSCSlash, tstRightNoMD, tstEOF}},-
- {"simple with markup", `{{% sc1 %}}`, []item{tstLeftMD, tstSC1, tstRightMD, tstEOF}},- {"with spaces", `{{< sc1 >}}`, []item{tstLeftNoMD, tstSC1, tstRightNoMD, tstEOF}},- {"mismatched rightDelim", `{{< sc1 %}}`, []item{tstLeftNoMD, tstSC1,- {tError, 0, "unrecognized character in shortcode action: U+0025 '%'. Note: Parameters with non-alphanumeric args must be quoted"}}},- {"inner, markup", `{{% sc1 %}} inner {{% /sc1 %}}`, []item{- tstLeftMD,
- tstSC1,
- tstRightMD,
- {tText, 0, " inner "},- tstLeftMD,
- tstSCClose,
- tstSC1,
- tstRightMD,
- tstEOF,
- }},
- {"close, but no open", `{{< /sc1 >}}`, []item{- tstLeftNoMD, {tError, 0, "got closing shortcode, but none is open"}}},- {"close wrong", `{{< sc1 >}}{{< /another >}}`, []item{- tstLeftNoMD, tstSC1, tstRightNoMD, tstLeftNoMD, tstSCClose,
- {tError, 0, "closing tag for shortcode 'another' does not match start tag"}}},- {"close, but no open, more", `{{< sc1 >}}{{< /sc1 >}}{{< /another >}}`, []item{- tstLeftNoMD, tstSC1, tstRightNoMD, tstLeftNoMD, tstSCClose, tstSC1, tstRightNoMD, tstLeftNoMD, tstSCClose,
- {tError, 0, "closing tag for shortcode 'another' does not match start tag"}}},- {"close with extra keyword", `{{< sc1 >}}{{< /sc1 keyword>}}`, []item{- tstLeftNoMD, tstSC1, tstRightNoMD, tstLeftNoMD, tstSCClose, tstSC1,
- {tError, 0, "unclosed shortcode"}}},- {"Youtube id", `{{< sc1 -ziL-Q_456igdO-4 >}}`, []item{- tstLeftNoMD, tstSC1, {tScParam, 0, "-ziL-Q_456igdO-4"}, tstRightNoMD, tstEOF}},- {"non-alphanumerics param quoted", `{{< sc1 "-ziL-.%QigdO-4" >}}`, []item{- tstLeftNoMD, tstSC1, {tScParam, 0, "-ziL-.%QigdO-4"}, tstRightNoMD, tstEOF}},-
- {"two params", `{{< sc1 param1 param2 >}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, tstParam2, tstRightNoMD, tstEOF}},
- // issue #934
- {"self-closing", `{{< sc1 />}}`, []item{- tstLeftNoMD, tstSC1, tstSCClose, tstRightNoMD, tstEOF}},
- // Issue 2498
- {"multiple self-closing", `{{< sc1 />}}{{< sc1 />}}`, []item{- tstLeftNoMD, tstSC1, tstSCClose, tstRightNoMD,
- tstLeftNoMD, tstSC1, tstSCClose, tstRightNoMD, tstEOF}},
- {"self-closing with param", `{{< sc1 param1 />}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, tstSCClose, tstRightNoMD, tstEOF}},
- {"multiple self-closing with param", `{{< sc1 param1 />}}{{< sc1 param1 />}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, tstSCClose, tstRightNoMD,
- tstLeftNoMD, tstSC1, tstParam1, tstSCClose, tstRightNoMD, tstEOF}},
- {"multiple different self-closing with param", `{{< sc1 param1 />}}{{< sc2 param1 />}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, tstSCClose, tstRightNoMD,
- tstLeftNoMD, tstSC2, tstParam1, tstSCClose, tstRightNoMD, tstEOF}},
- {"nested simple", `{{< sc1 >}}{{< sc2 >}}{{< /sc1 >}}`, []item{- tstLeftNoMD, tstSC1, tstRightNoMD,
- tstLeftNoMD, tstSC2, tstRightNoMD,
- tstLeftNoMD, tstSCClose, tstSC1, tstRightNoMD, tstEOF}},
- {"nested complex", `{{< sc1 >}}ab{{% sc2 param1 %}}cd{{< sc3 >}}ef{{< /sc3 >}}gh{{% /sc2 %}}ij{{< /sc1 >}}kl`, []item{- tstLeftNoMD, tstSC1, tstRightNoMD,
- {tText, 0, "ab"},- tstLeftMD, tstSC2, tstParam1, tstRightMD,
- {tText, 0, "cd"},- tstLeftNoMD, tstSC3, tstRightNoMD,
- {tText, 0, "ef"},- tstLeftNoMD, tstSCClose, tstSC3, tstRightNoMD,
- {tText, 0, "gh"},- tstLeftMD, tstSCClose, tstSC2, tstRightMD,
- {tText, 0, "ij"},- tstLeftNoMD, tstSCClose, tstSC1, tstRightNoMD,
- {tText, 0, "kl"}, tstEOF,- }},
-
- {"two quoted params", `{{< sc1 "param nr. 1" "param nr. 2" >}}`, []item{- tstLeftNoMD, tstSC1, {tScParam, 0, "param nr. 1"}, {tScParam, 0, "param nr. 2"}, tstRightNoMD, tstEOF}},- {"two named params", `{{< sc1 param1="Hello World" param2="p2Val">}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, tstVal, tstParam2, {tScParamVal, 0, "p2Val"}, tstRightNoMD, tstEOF}},- {"escaped quotes", `{{< sc1 param1=\"Hello World\" >}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, tstVal, tstRightNoMD, tstEOF}},
- {"escaped quotes, positional param", `{{< sc1 \"param1\" >}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, tstRightNoMD, tstEOF}},
- {"escaped quotes inside escaped quotes", `{{< sc1 param1=\"Hello \"escaped\" World\" >}}`, []item{- tstLeftNoMD, tstSC1, tstParam1,
- {tScParamVal, 0, `Hello `}, {tError, 0, `got positional parameter 'escaped'. Cannot mix named and positional parameters`}}},- {"escaped quotes inside nonescaped quotes",- `{{< sc1 param1="Hello \"escaped\" World" >}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, {tScParamVal, 0, `Hello "escaped" World`}, tstRightNoMD, tstEOF}},- {"escaped quotes inside nonescaped quotes in positional param",- `{{< sc1 "Hello \"escaped\" World" >}}`, []item{- tstLeftNoMD, tstSC1, {tScParam, 0, `Hello "escaped" World`}, tstRightNoMD, tstEOF}},- {"unterminated quote", `{{< sc1 param2="Hello World>}}`, []item{- tstLeftNoMD, tstSC1, tstParam2, {tError, 0, "unterminated quoted string in shortcode parameter-argument: 'Hello World>}}'"}}},- {"one named param, one not", `{{< sc1 param1="Hello World" p2 >}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, tstVal,
- {tError, 0, "got positional parameter 'p2'. Cannot mix named and positional parameters"}}},- {"one named param, one quoted positional param", `{{< sc1 param1="Hello World" "And Universe" >}}`, []item{- tstLeftNoMD, tstSC1, tstParam1, tstVal,
- {tError, 0, "got quoted positional parameter. Cannot mix named and positional parameters"}}},- {"one quoted positional param, one named param", `{{< sc1 "param1" param2="And Universe" >}}`, []item{- tstLeftNoMD, tstSC1, tstParam1,
- {tError, 0, "got named parameter 'param2'. Cannot mix named and positional parameters"}}},- {"ono positional param, one not", `{{< sc1 param1 param2="Hello World">}}`, []item{- tstLeftNoMD, tstSC1, tstParam1,
- {tError, 0, "got named parameter 'param2'. Cannot mix named and positional parameters"}}},- {"commented out", `{{</* sc1 */>}}`, []item{- {tText, 0, "{{<"}, {tText, 0, " sc1 "}, {tText, 0, ">}}"}, tstEOF}},- {"commented out, with asterisk inside", `{{</* sc1 "**/*.pdf" */>}}`, []item{- {tText, 0, "{{<"}, {tText, 0, " sc1 \"**/*.pdf\" "}, {tText, 0, ">}}"}, tstEOF}},- {"commented out, missing close", `{{</* sc1 >}}`, []item{- {tError, 0, "comment must be closed"}}},- {"commented out, misplaced close", `{{</* sc1 >}}*/`, []item{- {tError, 0, "comment must be closed"}}},-}
-
-func TestShortcodeLexer(t *testing.T) {- t.Parallel()
- for i, test := range shortCodeLexerTests {- items := collect(&test)
- if !equal(items, test.items) {- t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, items, test.items)- }
- }
-}
-
-func BenchmarkShortcodeLexer(b *testing.B) {- b.ResetTimer()
- for i := 0; i < b.N; i++ {- for _, test := range shortCodeLexerTests {- items := collect(&test)
- if !equal(items, test.items) {- b.Errorf("%s: got\n\t%v\nexpected\n\t%v", test.name, items, test.items)- }
- }
- }
-}
-
-func collect(t *shortCodeLexerTest) (items []item) {- l := newShortcodeLexer(t.name, t.input, 0)
- for {- item := l.nextItem()
- items = append(items, item)
- if item.typ == tEOF || item.typ == tError {- break
- }
- }
- return
-}
-
-// no positional checking, for now ...
-func equal(i1, i2 []item) bool {- if len(i1) != len(i2) {- return false
- }
- for k := range i1 {- if i1[k].typ != i2[k].typ {- return false
- }
- if i1[k].val != i2[k].val {- return false
- }
- }
- return true
-}
--- /dev/null
+++ b/parser/pageparser/shortcodeparser.go
@@ -1,0 +1,644 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pageparser
+
+import (
+ "fmt"
+ "strings"
+ "unicode"
+ "unicode/utf8"
+)
+
+// The lexical scanning below is highly inspired by the great talk given by
+// Rob Pike called "Lexical Scanning in Go" (it's on YouTube, Google it!).
+// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
+
+// parsing
+
+type Tokens struct {
+	lexer *pagelexer
+ token [3]Item // 3-item look-ahead is what we currently need
+ peekCount int
+}
+
+func (t *Tokens) Next() Item {
+	if t.peekCount > 0 {
+		t.peekCount--
+	} else {
+		t.token[0] = t.lexer.nextItem()
+ }
+ return t.token[t.peekCount]
+}
+
+// backs up one token.
+func (t *Tokens) Backup() {
+	t.peekCount++
+}
+
+// backs up two tokens.
+func (t *Tokens) Backup2(t1 Item) {
+	t.token[1] = t1
+ t.peekCount = 2
+}
+
+// backs up three tokens.
+func (t *Tokens) Backup3(t2, t1 Item) {
+	t.token[1] = t1
+ t.token[2] = t2
+ t.peekCount = 3
+}
+
+// check for non-error and non-EOF types coming next
+func (t *Tokens) IsValueNext() bool {
+	i := t.Peek()
+ return i.typ != tError && i.typ != tEOF
+}
+
+// look at, but do not consume, the next item
+// repeated, sequential calls will return the same item
+func (t *Tokens) Peek() Item {
+	if t.peekCount > 0 {
+		return t.token[t.peekCount-1]
+ }
+ t.peekCount = 1
+ t.token[0] = t.lexer.nextItem()
+ return t.token[0]
+}
+
+// Consume is a convencience method to consume the next n tokens,
+// but back off Errors and EOF.
+func (t *Tokens) Consume(cnt int) {
+	for i := 0; i < cnt; i++ {
+		token := t.Next()
+		if token.typ == tError || token.typ == tEOF {
+			t.Backup()
+ break
+ }
+ }
+}
+
+// LineNumber returns the current line number. Used for logging.
+func (t *Tokens) LineNumber() int {
+	return t.lexer.lineNum()
+}
+
+// lexical scanning
+
+// position (in bytes)
+type pos int
+
+type Item struct {
+	typ itemType
+ pos pos
+ Val string
+}
+
+func (i Item) IsText() bool {
+	return i.typ == tText
+}
+
+func (i Item) IsShortcodeName() bool {
+	return i.typ == tScName
+}
+
+func (i Item) IsLeftShortcodeDelim() bool {
+	return i.typ == tLeftDelimScWithMarkup || i.typ == tLeftDelimScNoMarkup
+}
+
+func (i Item) IsRightShortcodeDelim() bool {
+	return i.typ == tRightDelimScWithMarkup || i.typ == tRightDelimScNoMarkup
+}
+
+func (i Item) IsShortcodeClose() bool {
+	return i.typ == tScClose
+}
+
+func (i Item) IsShortcodeParam() bool {
+	return i.typ == tScParam
+}
+
+func (i Item) IsShortcodeParamVal() bool {
+	return i.typ == tScParamVal
+}
+
+func (i Item) IsShortcodeMarkupDelimiter() bool {
+	return i.typ == tLeftDelimScWithMarkup || i.typ == tRightDelimScWithMarkup
+}
+
+func (i Item) IsDone() bool {
+	return i.typ == tError || i.typ == tEOF
+}
+
+func (i Item) IsEOF() bool {
+	return i.typ == tEOF
+}
+
+func (i Item) IsError() bool {
+	return i.typ == tError
+}
+
+func (i Item) String() string {
+	switch {
+	case i.typ == tEOF:
+ return "EOF"
+ case i.typ == tError:
+ return i.Val
+ case i.typ > tKeywordMarker:
+ return fmt.Sprintf("<%s>", i.Val)+ case len(i.Val) > 20:
+ return fmt.Sprintf("%.20q...", i.Val)+ }
+ return fmt.Sprintf("[%s]", i.Val)+}
+
+type itemType int
+
+const (
+ tError itemType = iota
+ tEOF
+
+ // shortcode items
+ tLeftDelimScNoMarkup
+ tRightDelimScNoMarkup
+ tLeftDelimScWithMarkup
+ tRightDelimScWithMarkup
+ tScClose
+ tScName
+ tScParam
+ tScParamVal
+
+ //itemIdentifier
+ tText // plain text, used for everything outside the shortcodes
+
+ // preserved for later - keywords come after this
+ tKeywordMarker
+)
+
+const eof = -1
+
+// returns the next state in scanner.
+type stateFunc func(*pagelexer) stateFunc
+
+type pagelexer struct {
+	name string
+ input string
+ state stateFunc
+ pos pos // input position
+ start pos // item start position
+ width pos // width of last element
+ lastPos pos // position of the last item returned by nextItem
+
+ // shortcode state
+ currLeftDelimItem itemType
+ currRightDelimItem itemType
+ currShortcodeName string // is only set when a shortcode is in opened state
+ closingState int // > 0 = on its way to be closed
+ elementStepNum int // step number in element
+ paramElements int // number of elements (name + value = 2) found first
+ openShortcodes map[string]bool // set of shortcodes in open state
+
+ // items delivered to client
+ items []Item
+}
+
+func Parse(s string) *Tokens {
+	return ParseFrom(s, 0)
+}
+
+func ParseFrom(s string, from int) *Tokens {
+	return &Tokens{lexer: newShortcodeLexer("default", s, pos(from))}
+}
+
+// note: the input position here is normally 0 (start), but
+// can be set if position of first shortcode is known
+func newShortcodeLexer(name, input string, inputPosition pos) *pagelexer {
+	lexer := &pagelexer{
+		name: name,
+ input: input,
+ currLeftDelimItem: tLeftDelimScNoMarkup,
+ currRightDelimItem: tRightDelimScNoMarkup,
+ pos: inputPosition,
+ openShortcodes: make(map[string]bool),
+ items: make([]Item, 0, 5),
+ }
+ lexer.runShortcodeLexer()
+ return lexer
+}
+
+// main loop
+// this looks kind of funky, but it works
+func (l *pagelexer) runShortcodeLexer() {
+	for l.state = lexTextOutsideShortcodes; l.state != nil; {
+		l.state = l.state(l)
+ }
+}
+
+// state functions
+
+const (
+	leftDelimScNoMarkup    = "{{<"
+	rightDelimScNoMarkup   = ">}}"
+	leftDelimScWithMarkup  = "{{%"
+	rightDelimScWithMarkup = "%}}"
+ leftComment = "/*" // comments in this context us used to to mark shortcodes as "not really a shortcode"
+ rightComment = "*/"
+)
+
+func (l *pagelexer) next() rune {
+	if int(l.pos) >= len(l.input) {
+		l.width = 0
+ return eof
+ }
+
+ // looks expensive, but should produce the same iteration sequence as the string range loop
+ // see: http://blog.golang.org/strings
+ runeValue, runeWidth := utf8.DecodeRuneInString(l.input[l.pos:])
+ l.width = pos(runeWidth)
+ l.pos += l.width
+ return runeValue
+}
+
+// peek, but no consume
+func (l *pagelexer) peek() rune {
+	r := l.next()
+ l.backup()
+ return r
+}
+
+// steps back one
+func (l *pagelexer) backup() {
+	l.pos -= l.width
+}
+
+// sends an item back to the client.
+func (l *pagelexer) emit(t itemType) {
+	l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos]})
+	l.start = l.pos
+}
+
+// special case, do not send '\\' back to client
+func (l *pagelexer) ignoreEscapesAndEmit(t itemType) {
+	val := strings.Map(func(r rune) rune {
+		if r == '\\' {
+			return -1
+ }
+ return r
+ }, l.input[l.start:l.pos])
+	l.items = append(l.items, Item{t, l.start, val})
+	l.start = l.pos
+}
+
+// gets the current value (for debugging and error handling)
+func (l *pagelexer) current() string {
+	return l.input[l.start:l.pos]
+}
+
+// ignore current element
+func (l *pagelexer) ignore() {
+	l.start = l.pos
+}
+
+// nice to have in error logs
+func (l *pagelexer) lineNum() int {
+	return strings.Count(l.input[:l.lastPos], "\n") + 1
+}
+
+// nil terminates the parser
+func (l *pagelexer) errorf(format string, args ...interface{}) stateFunc {
+	l.items = append(l.items, Item{tError, l.start, fmt.Sprintf(format, args...)})
+	return nil
+}
+
+// consumes and returns the next item
+func (l *pagelexer) nextItem() Item {
+	item := l.items[0]
+ l.items = l.items[1:]
+ l.lastPos = item.pos
+ return item
+}
+
+// scans until an opening shortcode opening bracket.
+// if no shortcodes, it will keep on scanning until EOF
+func lexTextOutsideShortcodes(l *pagelexer) stateFunc {+ for {+ if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || strings.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup) {+ if l.pos > l.start {+ l.emit(tText)
+ }
+ if strings.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) {+ l.currLeftDelimItem = tLeftDelimScWithMarkup
+ l.currRightDelimItem = tRightDelimScWithMarkup
+ } else {+ l.currLeftDelimItem = tLeftDelimScNoMarkup
+ l.currRightDelimItem = tRightDelimScNoMarkup
+ }
+ return lexShortcodeLeftDelim
+
+ }
+ if l.next() == eof {+ break
+ }
+ }
+ // Done!
+ if l.pos > l.start {+ l.emit(tText)
+ }
+ l.emit(tEOF)
+ return nil
+}
+
+func lexShortcodeLeftDelim(l *pagelexer) stateFunc {+ l.pos += pos(len(l.currentLeftShortcodeDelim()))
+ if strings.HasPrefix(l.input[l.pos:], leftComment) {+ return lexShortcodeComment
+ }
+ l.emit(l.currentLeftShortcodeDelimItem())
+ l.elementStepNum = 0
+ l.paramElements = 0
+ return lexInsideShortcode
+}
+
+func lexShortcodeComment(l *pagelexer) stateFunc {+ posRightComment := strings.Index(l.input[l.pos:], rightComment+l.currentRightShortcodeDelim())
+ if posRightComment <= 1 {+ return l.errorf("comment must be closed")+ }
+ // we emit all as text, except the comment markers
+ l.emit(tText)
+ l.pos += pos(len(leftComment))
+ l.ignore()
+ l.pos += pos(posRightComment - len(leftComment))
+ l.emit(tText)
+ l.pos += pos(len(rightComment))
+ l.ignore()
+ l.pos += pos(len(l.currentRightShortcodeDelim()))
+ l.emit(tText)
+ return lexTextOutsideShortcodes
+}
+
+func lexShortcodeRightDelim(l *pagelexer) stateFunc {+ l.closingState = 0
+ l.pos += pos(len(l.currentRightShortcodeDelim()))
+ l.emit(l.currentRightShortcodeDelimItem())
+ return lexTextOutsideShortcodes
+}
+
+// either:
+// 1. param
+// 2. "param" or "param\"
+// 3. param="123" or param="123\"
+// 4. param="Some \"escaped\" text"
+func lexShortcodeParam(l *pagelexer, escapedQuoteStart bool) stateFunc {+
+ first := true
+ nextEq := false
+
+ var r rune
+
+ for {+ r = l.next()
+ if first {+ if r == '"' {+ // a positional param with quotes
+ if l.paramElements == 2 {+ return l.errorf("got quoted positional parameter. Cannot mix named and positional parameters")+ }
+ l.paramElements = 1
+ l.backup()
+ return lexShortcodeQuotedParamVal(l, !escapedQuoteStart, tScParam)
+ }
+ first = false
+ } else if r == '=' {+ // a named param
+ l.backup()
+ nextEq = true
+ break
+ }
+
+ if !isAlphaNumericOrHyphen(r) {+ l.backup()
+ break
+ }
+ }
+
+ if l.paramElements == 0 {+ l.paramElements++
+
+ if nextEq {+ l.paramElements++
+ }
+ } else {+ if nextEq && l.paramElements == 1 {+ return l.errorf("got named parameter '%s'. Cannot mix named and positional parameters", l.current())+ } else if !nextEq && l.paramElements == 2 {+ return l.errorf("got positional parameter '%s'. Cannot mix named and positional parameters", l.current())+ }
+ }
+
+ l.emit(tScParam)
+ return lexInsideShortcode
+
+}
+
+func lexShortcodeQuotedParamVal(l *pagelexer, escapedQuotedValuesAllowed bool, typ itemType) stateFunc {+ openQuoteFound := false
+ escapedInnerQuoteFound := false
+ escapedQuoteState := 0
+
+Loop:
+ for {+ switch r := l.next(); {+ case r == '\\':
+ if l.peek() == '"' {+ if openQuoteFound && !escapedQuotedValuesAllowed {+ l.backup()
+ break Loop
+ } else if openQuoteFound {+ // the coming quoute is inside
+ escapedInnerQuoteFound = true
+ escapedQuoteState = 1
+ }
+ }
+ case r == eof, r == '\n':
+ return l.errorf("unterminated quoted string in shortcode parameter-argument: '%s'", l.current())+ case r == '"':
+ if escapedQuoteState == 0 {+ if openQuoteFound {+ l.backup()
+ break Loop
+
+ } else {+ openQuoteFound = true
+ l.ignore()
+ }
+ } else {+ escapedQuoteState = 0
+ }
+
+ }
+ }
+
+ if escapedInnerQuoteFound {+ l.ignoreEscapesAndEmit(typ)
+ } else {+ l.emit(typ)
+ }
+
+ r := l.next()
+
+ if r == '\\' {+ if l.peek() == '"' {+ // ignore the escaped closing quote
+ l.ignore()
+ l.next()
+ l.ignore()
+ }
+ } else if r == '"' {+ // ignore closing quote
+ l.ignore()
+ } else {+ // handled by next state
+ l.backup()
+ }
+
+ return lexInsideShortcode
+}
+
+// scans an alphanumeric inside shortcode
+func lexIdentifierInShortcode(l *pagelexer) stateFunc {+ lookForEnd := false
+Loop:
+ for {+ switch r := l.next(); {+ case isAlphaNumericOrHyphen(r):
+ // Allow forward slash inside names to make it possible to create namespaces.
+ case r == '/':
+ default:
+ l.backup()
+ word := l.input[l.start:l.pos]
+ if l.closingState > 0 && !l.openShortcodes[word] {+ return l.errorf("closing tag for shortcode '%s' does not match start tag", word)+ } else if l.closingState > 0 {+ l.openShortcodes[word] = false
+ lookForEnd = true
+ }
+
+ l.closingState = 0
+ l.currShortcodeName = word
+ l.openShortcodes[word] = true
+ l.elementStepNum++
+ l.emit(tScName)
+ break Loop
+ }
+ }
+
+ if lookForEnd {+ return lexEndOfShortcode
+ }
+ return lexInsideShortcode
+}
+
+func lexEndOfShortcode(l *pagelexer) stateFunc {+ if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {+ return lexShortcodeRightDelim
+ }
+ switch r := l.next(); {+ case isSpace(r):
+ l.ignore()
+ default:
+ return l.errorf("unclosed shortcode")+ }
+ return lexEndOfShortcode
+}
+
+// scans the elements inside shortcode tags
+func lexInsideShortcode(l *pagelexer) stateFunc {+ if strings.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {+ return lexShortcodeRightDelim
+ }
+ switch r := l.next(); {+ case r == eof:
+ // eol is allowed inside shortcodes; this may go to end of document before it fails
+ return l.errorf("unclosed shortcode action")+ case isSpace(r), isEndOfLine(r):
+ l.ignore()
+ case r == '=':
+ l.ignore()
+ return lexShortcodeQuotedParamVal(l, l.peek() != '\\', tScParamVal)
+ case r == '/':
+ if l.currShortcodeName == "" {+ return l.errorf("got closing shortcode, but none is open")+ }
+ l.closingState++
+ l.emit(tScClose)
+ case r == '\\':
+ l.ignore()
+ if l.peek() == '"' {+ return lexShortcodeParam(l, true)
+ }
+ case l.elementStepNum > 0 && (isAlphaNumericOrHyphen(r) || r == '"'): // positional params can have quotes
+ l.backup()
+ return lexShortcodeParam(l, false)
+ case isAlphaNumeric(r):
+ l.backup()
+ return lexIdentifierInShortcode
+ default:
+ return l.errorf("unrecognized character in shortcode action: %#U. Note: Parameters with non-alphanumeric args must be quoted", r)+ }
+ return lexInsideShortcode
+}
+
+// state helpers
+
+func (l *pagelexer) currentLeftShortcodeDelimItem() itemType {+ return l.currLeftDelimItem
+}
+
+func (l *pagelexer) currentRightShortcodeDelimItem() itemType {+ return l.currRightDelimItem
+}
+
+func (l *pagelexer) currentLeftShortcodeDelim() string {+ if l.currLeftDelimItem == tLeftDelimScWithMarkup {+ return leftDelimScWithMarkup
+ }
+ return leftDelimScNoMarkup
+
+}
+
+func (l *pagelexer) currentRightShortcodeDelim() string {+ if l.currRightDelimItem == tRightDelimScWithMarkup {+ return rightDelimScWithMarkup
+ }
+ return rightDelimScNoMarkup
+}
+
+// helper functions
+
+func isSpace(r rune) bool {+ return r == ' ' || r == '\t'
+}
+
+func isAlphaNumericOrHyphen(r rune) bool {+ // let unquoted YouTube ids as positional params slip through (they contain hyphens)
+ return isAlphaNumeric(r) || r == '-'
+}
+
+func isEndOfLine(r rune) bool {+ return r == '\r' || r == '\n'
+}
+
+func isAlphaNumeric(r rune) bool {+ return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
+}
--- /dev/null
+++ b/parser/pageparser/shortcodeparser_test.go
@@ -1,0 +1,207 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pageparser
+
+import (
+ "testing"
+)
+
+type shortCodeLexerTest struct {+ name string
+ input string
+ items []Item
+}
+
+var (
+ tstEOF = Item{tEOF, 0, ""}+ tstLeftNoMD = Item{tLeftDelimScNoMarkup, 0, "{{<"}+ tstRightNoMD = Item{tRightDelimScNoMarkup, 0, ">}}"}+ tstLeftMD = Item{tLeftDelimScWithMarkup, 0, "{{%"}+ tstRightMD = Item{tRightDelimScWithMarkup, 0, "%}}"}+ tstSCClose = Item{tScClose, 0, "/"}+ tstSC1 = Item{tScName, 0, "sc1"}+ tstSC2 = Item{tScName, 0, "sc2"}+ tstSC3 = Item{tScName, 0, "sc3"}+ tstSCSlash = Item{tScName, 0, "sc/sub"}+ tstParam1 = Item{tScParam, 0, "param1"}+ tstParam2 = Item{tScParam, 0, "param2"}+ tstVal = Item{tScParamVal, 0, "Hello World"}+)
+
+var shortCodeLexerTests = []shortCodeLexerTest{+ {"empty", "", []Item{tstEOF}},+ {"spaces", " \t\n", []Item{{tText, 0, " \t\n"}, tstEOF}},+ {"text", `to be or not`, []Item{{tText, 0, "to be or not"}, tstEOF}},+ {"no markup", `{{< sc1 >}}`, []Item{tstLeftNoMD, tstSC1, tstRightNoMD, tstEOF}},+ {"with EOL", "{{< sc1 \n >}}", []Item{tstLeftNoMD, tstSC1, tstRightNoMD, tstEOF}},+
+ {"forward slash inside name", `{{< sc/sub >}}`, []Item{tstLeftNoMD, tstSCSlash, tstRightNoMD, tstEOF}},+
+ {"simple with markup", `{{% sc1 %}}`, []Item{tstLeftMD, tstSC1, tstRightMD, tstEOF}},+ {"with spaces", `{{< sc1 >}}`, []Item{tstLeftNoMD, tstSC1, tstRightNoMD, tstEOF}},+ {"mismatched rightDelim", `{{< sc1 %}}`, []Item{tstLeftNoMD, tstSC1,+ {tError, 0, "unrecognized character in shortcode action: U+0025 '%'. Note: Parameters with non-alphanumeric args must be quoted"}}},+ {"inner, markup", `{{% sc1 %}} inner {{% /sc1 %}}`, []Item{+ tstLeftMD,
+ tstSC1,
+ tstRightMD,
+ {tText, 0, " inner "},+ tstLeftMD,
+ tstSCClose,
+ tstSC1,
+ tstRightMD,
+ tstEOF,
+ }},
+ {"close, but no open", `{{< /sc1 >}}`, []Item{+ tstLeftNoMD, {tError, 0, "got closing shortcode, but none is open"}}},+ {"close wrong", `{{< sc1 >}}{{< /another >}}`, []Item{+ tstLeftNoMD, tstSC1, tstRightNoMD, tstLeftNoMD, tstSCClose,
+ {tError, 0, "closing tag for shortcode 'another' does not match start tag"}}},+ {"close, but no open, more", `{{< sc1 >}}{{< /sc1 >}}{{< /another >}}`, []Item{+ tstLeftNoMD, tstSC1, tstRightNoMD, tstLeftNoMD, tstSCClose, tstSC1, tstRightNoMD, tstLeftNoMD, tstSCClose,
+ {tError, 0, "closing tag for shortcode 'another' does not match start tag"}}},+ {"close with extra keyword", `{{< sc1 >}}{{< /sc1 keyword>}}`, []Item{+ tstLeftNoMD, tstSC1, tstRightNoMD, tstLeftNoMD, tstSCClose, tstSC1,
+ {tError, 0, "unclosed shortcode"}}},+ {"Youtube id", `{{< sc1 -ziL-Q_456igdO-4 >}}`, []Item{+ tstLeftNoMD, tstSC1, {tScParam, 0, "-ziL-Q_456igdO-4"}, tstRightNoMD, tstEOF}},+ {"non-alphanumerics param quoted", `{{< sc1 "-ziL-.%QigdO-4" >}}`, []Item{+ tstLeftNoMD, tstSC1, {tScParam, 0, "-ziL-.%QigdO-4"}, tstRightNoMD, tstEOF}},+
+ {"two params", `{{< sc1 param1 param2 >}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, tstParam2, tstRightNoMD, tstEOF}},
+ // issue #934
+ {"self-closing", `{{< sc1 />}}`, []Item{+ tstLeftNoMD, tstSC1, tstSCClose, tstRightNoMD, tstEOF}},
+ // Issue 2498
+ {"multiple self-closing", `{{< sc1 />}}{{< sc1 />}}`, []Item{+ tstLeftNoMD, tstSC1, tstSCClose, tstRightNoMD,
+ tstLeftNoMD, tstSC1, tstSCClose, tstRightNoMD, tstEOF}},
+ {"self-closing with param", `{{< sc1 param1 />}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, tstSCClose, tstRightNoMD, tstEOF}},
+ {"multiple self-closing with param", `{{< sc1 param1 />}}{{< sc1 param1 />}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, tstSCClose, tstRightNoMD,
+ tstLeftNoMD, tstSC1, tstParam1, tstSCClose, tstRightNoMD, tstEOF}},
+ {"multiple different self-closing with param", `{{< sc1 param1 />}}{{< sc2 param1 />}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, tstSCClose, tstRightNoMD,
+ tstLeftNoMD, tstSC2, tstParam1, tstSCClose, tstRightNoMD, tstEOF}},
+ {"nested simple", `{{< sc1 >}}{{< sc2 >}}{{< /sc1 >}}`, []Item{+ tstLeftNoMD, tstSC1, tstRightNoMD,
+ tstLeftNoMD, tstSC2, tstRightNoMD,
+ tstLeftNoMD, tstSCClose, tstSC1, tstRightNoMD, tstEOF}},
+ {"nested complex", `{{< sc1 >}}ab{{% sc2 param1 %}}cd{{< sc3 >}}ef{{< /sc3 >}}gh{{% /sc2 %}}ij{{< /sc1 >}}kl`, []Item{+ tstLeftNoMD, tstSC1, tstRightNoMD,
+ {tText, 0, "ab"},+ tstLeftMD, tstSC2, tstParam1, tstRightMD,
+ {tText, 0, "cd"},+ tstLeftNoMD, tstSC3, tstRightNoMD,
+ {tText, 0, "ef"},+ tstLeftNoMD, tstSCClose, tstSC3, tstRightNoMD,
+ {tText, 0, "gh"},+ tstLeftMD, tstSCClose, tstSC2, tstRightMD,
+ {tText, 0, "ij"},+ tstLeftNoMD, tstSCClose, tstSC1, tstRightNoMD,
+ {tText, 0, "kl"}, tstEOF,+ }},
+
+ {"two quoted params", `{{< sc1 "param nr. 1" "param nr. 2" >}}`, []Item{+ tstLeftNoMD, tstSC1, {tScParam, 0, "param nr. 1"}, {tScParam, 0, "param nr. 2"}, tstRightNoMD, tstEOF}},+ {"two named params", `{{< sc1 param1="Hello World" param2="p2Val">}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, tstVal, tstParam2, {tScParamVal, 0, "p2Val"}, tstRightNoMD, tstEOF}},+ {"escaped quotes", `{{< sc1 param1=\"Hello World\" >}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, tstVal, tstRightNoMD, tstEOF}},
+ {"escaped quotes, positional param", `{{< sc1 \"param1\" >}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, tstRightNoMD, tstEOF}},
+ {"escaped quotes inside escaped quotes", `{{< sc1 param1=\"Hello \"escaped\" World\" >}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1,
+ {tScParamVal, 0, `Hello `}, {tError, 0, `got positional parameter 'escaped'. Cannot mix named and positional parameters`}}},+ {"escaped quotes inside nonescaped quotes",+ `{{< sc1 param1="Hello \"escaped\" World" >}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, {tScParamVal, 0, `Hello "escaped" World`}, tstRightNoMD, tstEOF}},+ {"escaped quotes inside nonescaped quotes in positional param",+ `{{< sc1 "Hello \"escaped\" World" >}}`, []Item{+ tstLeftNoMD, tstSC1, {tScParam, 0, `Hello "escaped" World`}, tstRightNoMD, tstEOF}},+ {"unterminated quote", `{{< sc1 param2="Hello World>}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam2, {tError, 0, "unterminated quoted string in shortcode parameter-argument: 'Hello World>}}'"}}},+ {"one named param, one not", `{{< sc1 param1="Hello World" p2 >}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, tstVal,
+ {tError, 0, "got positional parameter 'p2'. Cannot mix named and positional parameters"}}},+ {"one named param, one quoted positional param", `{{< sc1 param1="Hello World" "And Universe" >}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1, tstVal,
+ {tError, 0, "got quoted positional parameter. Cannot mix named and positional parameters"}}},+ {"one quoted positional param, one named param", `{{< sc1 "param1" param2="And Universe" >}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1,
+ {tError, 0, "got named parameter 'param2'. Cannot mix named and positional parameters"}}},+ {"ono positional param, one not", `{{< sc1 param1 param2="Hello World">}}`, []Item{+ tstLeftNoMD, tstSC1, tstParam1,
+ {tError, 0, "got named parameter 'param2'. Cannot mix named and positional parameters"}}},+ {"commented out", `{{</* sc1 */>}}`, []Item{+ {tText, 0, "{{<"}, {tText, 0, " sc1 "}, {tText, 0, ">}}"}, tstEOF}},+ {"commented out, with asterisk inside", `{{</* sc1 "**/*.pdf" */>}}`, []Item{+ {tText, 0, "{{<"}, {tText, 0, " sc1 \"**/*.pdf\" "}, {tText, 0, ">}}"}, tstEOF}},+ {"commented out, missing close", `{{</* sc1 >}}`, []Item{+ {tError, 0, "comment must be closed"}}},+ {"commented out, misplaced close", `{{</* sc1 >}}*/`, []Item{+ {tError, 0, "comment must be closed"}}},+}
+
+func TestShortcodeLexer(t *testing.T) {+ t.Parallel()
+ for i, test := range shortCodeLexerTests {+ items := collect(&test)
+ if !equal(items, test.items) {+ t.Errorf("[%d] %s: got\n\t%v\nexpected\n\t%v", i, test.name, items, test.items)+ }
+ }
+}
+
+func BenchmarkShortcodeLexer(b *testing.B) {+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {+ for _, test := range shortCodeLexerTests {+ items := collect(&test)
+ if !equal(items, test.items) {+ b.Errorf("%s: got\n\t%v\nexpected\n\t%v", test.name, items, test.items)+ }
+ }
+ }
+}
+
+func collect(t *shortCodeLexerTest) (items []Item) {+ l := newShortcodeLexer(t.name, t.input, 0)
+ for {+ item := l.nextItem()
+ items = append(items, item)
+ if item.typ == tEOF || item.typ == tError {+ break
+ }
+ }
+ return
+}
+
+// no positional checking, for now ...
+func equal(i1, i2 []Item) bool {+ if len(i1) != len(i2) {+ return false
+ }
+ for k := range i1 {+ if i1[k].typ != i2[k].typ {+ return false
+ }
+ if i1[k].Val != i2[k].Val {+ return false
+ }
+ }
+ return true
+}
--
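
A small sketch (illustrative, not part of the patch) of the Tokens look-ahead
contract that hugolib/shortcode.go relies on, using only the exported API added
above; the test name and sample input are assumptions:

    package pageparser

    import "testing"

    // Peek returns the upcoming item without consuming it, Next consumes it,
    // and Backup pushes the last consumed item back onto the look-ahead buffer.
    func TestPeekNextBackupSketch(t *testing.T) {
        pt := Parse(`{{< sc1 param1 >}}`)

        first := pt.Next() // left delimiter
        if !first.IsLeftShortcodeDelim() {
            t.Fatalf("expected left delimiter, got %s", first)
        }

        peeked := pt.Peek()
        consumed := pt.Next()
        if peeked != consumed || !consumed.IsShortcodeName() || consumed.Val != "sc1" {
            t.Fatalf("Peek/Next mismatch: %s vs %s", peeked, consumed)
        }

        pt.Backup() // un-consume the name; the next Next() returns it again
        if again := pt.Next(); again.Val != "sc1" {
            t.Fatalf("Backup did not restore the item, got %s", again)
        }
    }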