diff options
author | Ralph Amissah <ralph.amissah@gmail.com> | 2021-02-19 17:10:51 -0500 |
---|---|---|
committer | Ralph Amissah <ralph.amissah@gmail.com> | 2021-02-24 16:46:47 -0500 |
commit | 02ca32ae0a5bc290918d2b2a3288e385b9cc6b11 (patch) | |
tree | 06379785e8a0165a7deb981c2eba362894820634 /src/ext_depends/D-YAML/source/dyaml/scanner.d | |
parent | build from static source-tree pre fetch depends (diff) |
external & build dependences in src tree
- external & build dependences boost licensed
- ext_depends (external depends)
- D-YAML
- tinyendian
- d2sqlite3
- imageformats
- build_depends
- dub2nix
Diffstat (limited to 'src/ext_depends/D-YAML/source/dyaml/scanner.d')
-rw-r--r-- | src/ext_depends/D-YAML/source/dyaml/scanner.d | 1788 |
1 files changed, 1788 insertions, 0 deletions
diff --git a/src/ext_depends/D-YAML/source/dyaml/scanner.d b/src/ext_depends/D-YAML/source/dyaml/scanner.d new file mode 100644 index 0000000..2009521 --- /dev/null +++ b/src/ext_depends/D-YAML/source/dyaml/scanner.d @@ -0,0 +1,1788 @@ + +// Copyright Ferdinand Majerech 2011-2014. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +/// YAML scanner. +/// Code based on PyYAML: http://www.pyyaml.org +module dyaml.scanner; + + +import core.stdc.string; + +import std.algorithm; +import std.array; +import std.conv; +import std.ascii : isAlphaNum, isDigit, isHexDigit; +import std.exception; +import std.string; +import std.typecons; +import std.traits : Unqual; +import std.utf; + +import dyaml.escapes; +import dyaml.exception; +import dyaml.queue; +import dyaml.reader; +import dyaml.style; +import dyaml.token; + +package: +/// Scanner produces tokens of the following types: +/// STREAM-START +/// STREAM-END +/// DIRECTIVE(name, value) +/// DOCUMENT-START +/// DOCUMENT-END +/// BLOCK-SEQUENCE-START +/// BLOCK-MAPPING-START +/// BLOCK-END +/// FLOW-SEQUENCE-START +/// FLOW-MAPPING-START +/// FLOW-SEQUENCE-END +/// FLOW-MAPPING-END +/// BLOCK-ENTRY +/// FLOW-ENTRY +/// KEY +/// VALUE +/// ALIAS(value) +/// ANCHOR(value) +/// TAG(value) +/// SCALAR(value, plain, style) + +alias isBreak = among!('\0', '\n', '\r', '\u0085', '\u2028', '\u2029'); + +alias isBreakOrSpace = among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'); + +alias isWhiteSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'); + +alias isNonLinebreakWhitespace = among!(' ', '\t'); + +alias isNonScalarStartCharacter = among!('-', '?', ':', ',', '[', ']', '{', '}', + '#', '&', '*', '!', '|', '>', '\'', '"', '%', '@', '`', ' ', '\t', '\0', '\n', + '\r', '\u0085', '\u2028', '\u2029'); + +alias isURIChar = among!('-', ';', '/', '?', ':', '@', '&', '=', '+', '$', ',', + '_', '.', '!', '~', '*', '\'', '(', ')', '[', ']', '%'); + +alias isNSChar = among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029'); + +alias isBChar = among!('\n', '\r', '\u0085', '\u2028', '\u2029'); + +alias isFlowScalarBreakSpace = among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029', '\'', '"', '\\'); + +/// Marked exception thrown at scanner errors. +/// +/// See_Also: MarkedYAMLException +class ScannerException : MarkedYAMLException +{ + mixin MarkedExceptionCtors; +} + +/// Generates tokens from data provided by a Reader. +struct Scanner +{ + private: + /// A simple key is a key that is not denoted by the '?' indicator. + /// For example: + /// --- + /// block simple key: value + /// ? not a simple key: + /// : { flow simple key: value } + /// We emit the KEY token before all keys, so when we find a potential simple + /// key, we try to locate the corresponding ':' indicator. Simple keys should be + /// limited to a single line and 1024 characters. + /// + /// 16 bytes on 64-bit. + static struct SimpleKey + { + /// Character index in reader where the key starts. + uint charIndex = uint.max; + /// Index of the key token from start (first token scanned being 0). + uint tokenIndex; + /// Line the key starts at. + uint line; + /// Column the key starts at. + ushort column; + /// Is this required to be a simple key? + bool required; + /// Is this struct "null" (invalid)?. + bool isNull; + } + + /// Block chomping types. + enum Chomping + { + /// Strip all trailing line breaks. '-' indicator. + strip, + /// Line break of the last line is preserved, others discarded. Default. + clip, + /// All trailing line breaks are preserved. '+' indicator. + keep + } + + /// Reader used to read from a file/stream. + Reader reader_; + /// Are we done scanning? + bool done_; + + /// Level of nesting in flow context. If 0, we're in block context. + uint flowLevel_; + /// Current indentation level. + int indent_ = -1; + /// Past indentation levels. Used as a stack. + Appender!(int[]) indents_; + + /// Processed tokens not yet emitted. Used as a queue. + Queue!Token tokens_; + + /// Number of tokens emitted through the getToken method. + uint tokensTaken_; + + /// Can a simple key start at the current position? A simple key may start: + /// - at the beginning of the line, not counting indentation spaces + /// (in block context), + /// - after '{', '[', ',' (in the flow context), + /// - after '?', ':', '-' (in the block context). + /// In the block context, this flag also signifies if a block collection + /// may start at the current position. + bool allowSimpleKey_ = true; + + /// Possible simple keys indexed by flow levels. + SimpleKey[] possibleSimpleKeys_; + + public: + /// Construct a Scanner using specified Reader. + this(Reader reader) @safe nothrow + { + // Return the next token, but do not delete it from the queue + reader_ = reader; + fetchStreamStart(); + } + + /// Advance to the next token + void popFront() @safe + { + ++tokensTaken_; + tokens_.pop(); + } + + /// Return the current token + const(Token) front() @safe + { + enforce(!empty, "No token left to peek"); + return tokens_.peek(); + } + + /// Return whether there are any more tokens left. + bool empty() @safe + { + while (needMoreTokens()) + { + fetchToken(); + } + return tokens_.empty; + } + + private: + /// Most scanning error messages have the same format; so build them with this + /// function. + string expected(T)(string expected, T found) + { + return text("expected ", expected, ", but found ", found); + } + + /// Determine whether or not we need to fetch more tokens before peeking/getting a token. + bool needMoreTokens() @safe pure + { + if(done_) { return false; } + if(tokens_.empty) { return true; } + + /// The current token may be a potential simple key, so we need to look further. + stalePossibleSimpleKeys(); + return nextPossibleSimpleKey() == tokensTaken_; + } + + /// Fetch at token, adding it to tokens_. + void fetchToken() @safe + { + // Eat whitespaces and comments until we reach the next token. + scanToNextToken(); + + // Remove obsolete possible simple keys. + stalePossibleSimpleKeys(); + + // Compare current indentation and column. It may add some tokens + // and decrease the current indentation level. + unwindIndent(reader_.column); + + // Get the next character. + const dchar c = reader_.peekByte(); + + // Fetch the token. + if(c == '\0') { return fetchStreamEnd(); } + if(checkDirective()) { return fetchDirective(); } + if(checkDocumentStart()) { return fetchDocumentStart(); } + if(checkDocumentEnd()) { return fetchDocumentEnd(); } + // Order of the following checks is NOT significant. + switch(c) + { + case '[': return fetchFlowSequenceStart(); + case '{': return fetchFlowMappingStart(); + case ']': return fetchFlowSequenceEnd(); + case '}': return fetchFlowMappingEnd(); + case ',': return fetchFlowEntry(); + case '!': return fetchTag(); + case '\'': return fetchSingle(); + case '\"': return fetchDouble(); + case '*': return fetchAlias(); + case '&': return fetchAnchor(); + case '?': if(checkKey()) { return fetchKey(); } goto default; + case ':': if(checkValue()) { return fetchValue(); } goto default; + case '-': if(checkBlockEntry()) { return fetchBlockEntry(); } goto default; + case '|': if(flowLevel_ == 0) { return fetchLiteral(); } break; + case '>': if(flowLevel_ == 0) { return fetchFolded(); } break; + default: if(checkPlain()) { return fetchPlain(); } + } + + throw new ScannerException("While scanning for the next token, found character " ~ + "\'%s\', index %s that cannot start any token" + .format(c, to!int(c)), reader_.mark); + } + + + /// Return the token number of the nearest possible simple key. + uint nextPossibleSimpleKey() @safe pure nothrow @nogc + { + uint minTokenNumber = uint.max; + foreach(k, ref simpleKey; possibleSimpleKeys_) + { + if(simpleKey.isNull) { continue; } + minTokenNumber = min(minTokenNumber, simpleKey.tokenIndex); + } + return minTokenNumber; + } + + /// Remove entries that are no longer possible simple keys. + /// + /// According to the YAML specification, simple keys + /// - should be limited to a single line, + /// - should be no longer than 1024 characters. + /// Disabling this will allow simple keys of any length and + /// height (may cause problems if indentation is broken though). + void stalePossibleSimpleKeys() @safe pure + { + foreach(level, ref key; possibleSimpleKeys_) + { + if(key.isNull) { continue; } + if(key.line != reader_.line || reader_.charIndex - key.charIndex > 1024) + { + enforce(!key.required, + new ScannerException("While scanning a simple key", + Mark(reader_.name, key.line, key.column), + "could not find expected ':'", reader_.mark)); + key.isNull = true; + } + } + } + + /// Check if the next token starts a possible simple key and if so, save its position. + /// + /// This function is called for ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'. + void savePossibleSimpleKey() @safe pure + { + // Check if a simple key is required at the current position. + const required = (flowLevel_ == 0 && indent_ == reader_.column); + assert(allowSimpleKey_ || !required, "A simple key is required only if it is " ~ + "the first token in the current line. Therefore it is always allowed."); + + if(!allowSimpleKey_) { return; } + + // The next token might be a simple key, so save its number and position. + removePossibleSimpleKey(); + const tokenCount = tokensTaken_ + cast(uint)tokens_.length; + + const line = reader_.line; + const column = reader_.column; + const key = SimpleKey(cast(uint)reader_.charIndex, tokenCount, line, + cast(ushort)min(column, ushort.max), required); + + if(possibleSimpleKeys_.length <= flowLevel_) + { + const oldLength = possibleSimpleKeys_.length; + possibleSimpleKeys_.length = flowLevel_ + 1; + //No need to initialize the last element, it's already done in the next line. + possibleSimpleKeys_[oldLength .. flowLevel_] = SimpleKey.init; + } + possibleSimpleKeys_[flowLevel_] = key; + } + + /// Remove the saved possible key position at the current flow level. + void removePossibleSimpleKey() @safe pure + { + if(possibleSimpleKeys_.length <= flowLevel_) { return; } + + if(!possibleSimpleKeys_[flowLevel_].isNull) + { + const key = possibleSimpleKeys_[flowLevel_]; + enforce(!key.required, + new ScannerException("While scanning a simple key", + Mark(reader_.name, key.line, key.column), + "could not find expected ':'", reader_.mark)); + possibleSimpleKeys_[flowLevel_].isNull = true; + } + } + + /// Decrease indentation, removing entries in indents_. + /// + /// Params: column = Current column in the file/stream. + void unwindIndent(const int column) @safe + { + if(flowLevel_ > 0) + { + // In flow context, tokens should respect indentation. + // The condition should be `indent >= column` according to the spec. + // But this condition will prohibit intuitively correct + // constructions such as + // key : { + // } + + // In the flow context, indentation is ignored. We make the scanner less + // restrictive than what the specification requires. + // if(pedantic_ && flowLevel_ > 0 && indent_ > column) + // { + // throw new ScannerException("Invalid intendation or unclosed '[' or '{'", + // reader_.mark) + // } + return; + } + + // In block context, we may need to issue the BLOCK-END tokens. + while(indent_ > column) + { + indent_ = indents_.data.back; + assert(indents_.data.length); + indents_.shrinkTo(indents_.data.length - 1); + tokens_.push(blockEndToken(reader_.mark, reader_.mark)); + } + } + + /// Increase indentation if needed. + /// + /// Params: column = Current column in the file/stream. + /// + /// Returns: true if the indentation was increased, false otherwise. + bool addIndent(int column) @safe + { + if(indent_ >= column){return false;} + indents_ ~= indent_; + indent_ = column; + return true; + } + + + /// Add STREAM-START token. + void fetchStreamStart() @safe nothrow + { + tokens_.push(streamStartToken(reader_.mark, reader_.mark, reader_.encoding)); + } + + ///Add STREAM-END token. + void fetchStreamEnd() @safe + { + //Set intendation to -1 . + unwindIndent(-1); + removePossibleSimpleKey(); + allowSimpleKey_ = false; + possibleSimpleKeys_.destroy; + + tokens_.push(streamEndToken(reader_.mark, reader_.mark)); + done_ = true; + } + + /// Add DIRECTIVE token. + void fetchDirective() @safe + { + // Set intendation to -1 . + unwindIndent(-1); + // Reset simple keys. + removePossibleSimpleKey(); + allowSimpleKey_ = false; + + auto directive = scanDirective(); + tokens_.push(directive); + } + + /// Add DOCUMENT-START or DOCUMENT-END token. + void fetchDocumentIndicator(TokenID id)() + if(id == TokenID.documentStart || id == TokenID.documentEnd) + { + // Set indentation to -1 . + unwindIndent(-1); + // Reset simple keys. Note that there can't be a block collection after '---'. + removePossibleSimpleKey(); + allowSimpleKey_ = false; + + Mark startMark = reader_.mark; + reader_.forward(3); + tokens_.push(simpleToken!id(startMark, reader_.mark)); + } + + /// Aliases to add DOCUMENT-START or DOCUMENT-END token. + alias fetchDocumentStart = fetchDocumentIndicator!(TokenID.documentStart); + alias fetchDocumentEnd = fetchDocumentIndicator!(TokenID.documentEnd); + + /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token. + void fetchFlowCollectionStart(TokenID id)() @safe + { + // '[' and '{' may start a simple key. + savePossibleSimpleKey(); + // Simple keys are allowed after '[' and '{'. + allowSimpleKey_ = true; + ++flowLevel_; + + Mark startMark = reader_.mark; + reader_.forward(); + tokens_.push(simpleToken!id(startMark, reader_.mark)); + } + + /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token. + alias fetchFlowSequenceStart = fetchFlowCollectionStart!(TokenID.flowSequenceStart); + alias fetchFlowMappingStart = fetchFlowCollectionStart!(TokenID.flowMappingStart); + + /// Add FLOW-SEQUENCE-START or FLOW-MAPPING-START token. + void fetchFlowCollectionEnd(TokenID id)() + { + // Reset possible simple key on the current level. + removePossibleSimpleKey(); + // No simple keys after ']' and '}'. + allowSimpleKey_ = false; + --flowLevel_; + + Mark startMark = reader_.mark; + reader_.forward(); + tokens_.push(simpleToken!id(startMark, reader_.mark)); + } + + /// Aliases to add FLOW-SEQUENCE-START or FLOW-MAPPING-START token/ + alias fetchFlowSequenceEnd = fetchFlowCollectionEnd!(TokenID.flowSequenceEnd); + alias fetchFlowMappingEnd = fetchFlowCollectionEnd!(TokenID.flowMappingEnd); + + /// Add FLOW-ENTRY token; + void fetchFlowEntry() @safe + { + // Reset possible simple key on the current level. + removePossibleSimpleKey(); + // Simple keys are allowed after ','. + allowSimpleKey_ = true; + + Mark startMark = reader_.mark; + reader_.forward(); + tokens_.push(flowEntryToken(startMark, reader_.mark)); + } + + /// Additional checks used in block context in fetchBlockEntry and fetchKey. + /// + /// Params: type = String representing the token type we might need to add. + /// id = Token type we might need to add. + void blockChecks(string type, TokenID id)() + { + enum context = type ~ " keys are not allowed here"; + // Are we allowed to start a key (not neccesarily a simple one)? + enforce(allowSimpleKey_, new ScannerException(context, reader_.mark)); + + if(addIndent(reader_.column)) + { + tokens_.push(simpleToken!id(reader_.mark, reader_.mark)); + } + } + + /// Add BLOCK-ENTRY token. Might add BLOCK-SEQUENCE-START in the process. + void fetchBlockEntry() @safe + { + if(flowLevel_ == 0) { blockChecks!("Sequence", TokenID.blockSequenceStart)(); } + + // It's an error for the block entry to occur in the flow context, + // but we let the parser detect this. + + // Reset possible simple key on the current level. + removePossibleSimpleKey(); + // Simple keys are allowed after '-'. + allowSimpleKey_ = true; + + Mark startMark = reader_.mark; + reader_.forward(); + tokens_.push(blockEntryToken(startMark, reader_.mark)); + } + + /// Add KEY token. Might add BLOCK-MAPPING-START in the process. + void fetchKey() @safe + { + if(flowLevel_ == 0) { blockChecks!("Mapping", TokenID.blockMappingStart)(); } + + // Reset possible simple key on the current level. + removePossibleSimpleKey(); + // Simple keys are allowed after '?' in the block context. + allowSimpleKey_ = (flowLevel_ == 0); + + Mark startMark = reader_.mark; + reader_.forward(); + tokens_.push(keyToken(startMark, reader_.mark)); + } + + /// Add VALUE token. Might add KEY and/or BLOCK-MAPPING-START in the process. + void fetchValue() @safe + { + //Do we determine a simple key? + if(possibleSimpleKeys_.length > flowLevel_ && + !possibleSimpleKeys_[flowLevel_].isNull) + { + const key = possibleSimpleKeys_[flowLevel_]; + possibleSimpleKeys_[flowLevel_].isNull = true; + Mark keyMark = Mark(reader_.name, key.line, key.column); + const idx = key.tokenIndex - tokensTaken_; + + assert(idx >= 0); + + // Add KEY. + // Manually inserting since tokens are immutable (need linked list). + tokens_.insert(keyToken(keyMark, keyMark), idx); + + // If this key starts a new block mapping, we need to add BLOCK-MAPPING-START. + if(flowLevel_ == 0 && addIndent(key.column)) + { + tokens_.insert(blockMappingStartToken(keyMark, keyMark), idx); + } + + // There cannot be two simple keys in a row. + allowSimpleKey_ = false; + } + // Part of a complex key + else + { + // We can start a complex value if and only if we can start a simple key. + enforce(flowLevel_ > 0 || allowSimpleKey_, + new ScannerException("Mapping values are not allowed here", reader_.mark)); + + // If this value starts a new block mapping, we need to add + // BLOCK-MAPPING-START. It'll be detected as an error later by the parser. + if(flowLevel_ == 0 && addIndent(reader_.column)) + { + tokens_.push(blockMappingStartToken(reader_.mark, reader_.mark)); + } + + // Reset possible simple key on the current level. + removePossibleSimpleKey(); + // Simple keys are allowed after ':' in the block context. + allowSimpleKey_ = (flowLevel_ == 0); + } + + // Add VALUE. + Mark startMark = reader_.mark; + reader_.forward(); + tokens_.push(valueToken(startMark, reader_.mark)); + } + + /// Add ALIAS or ANCHOR token. + void fetchAnchor_(TokenID id)() @safe + if(id == TokenID.alias_ || id == TokenID.anchor) + { + // ALIAS/ANCHOR could be a simple key. + savePossibleSimpleKey(); + // No simple keys after ALIAS/ANCHOR. + allowSimpleKey_ = false; + + auto anchor = scanAnchor(id); + tokens_.push(anchor); + } + + /// Aliases to add ALIAS or ANCHOR token. + alias fetchAlias = fetchAnchor_!(TokenID.alias_); + alias fetchAnchor = fetchAnchor_!(TokenID.anchor); + + /// Add TAG token. + void fetchTag() @safe + { + //TAG could start a simple key. + savePossibleSimpleKey(); + //No simple keys after TAG. + allowSimpleKey_ = false; + + tokens_.push(scanTag()); + } + + /// Add block SCALAR token. + void fetchBlockScalar(ScalarStyle style)() @safe + if(style == ScalarStyle.literal || style == ScalarStyle.folded) + { + // Reset possible simple key on the current level. + removePossibleSimpleKey(); + // A simple key may follow a block scalar. + allowSimpleKey_ = true; + + auto blockScalar = scanBlockScalar(style); + tokens_.push(blockScalar); + } + + /// Aliases to add literal or folded block scalar. + alias fetchLiteral = fetchBlockScalar!(ScalarStyle.literal); + alias fetchFolded = fetchBlockScalar!(ScalarStyle.folded); + + /// Add quoted flow SCALAR token. + void fetchFlowScalar(ScalarStyle quotes)() + { + // A flow scalar could be a simple key. + savePossibleSimpleKey(); + // No simple keys after flow scalars. + allowSimpleKey_ = false; + + // Scan and add SCALAR. + auto scalar = scanFlowScalar(quotes); + tokens_.push(scalar); + } + + /// Aliases to add single or double quoted block scalar. + alias fetchSingle = fetchFlowScalar!(ScalarStyle.singleQuoted); + alias fetchDouble = fetchFlowScalar!(ScalarStyle.doubleQuoted); + + /// Add plain SCALAR token. + void fetchPlain() @safe + { + // A plain scalar could be a simple key + savePossibleSimpleKey(); + // No simple keys after plain scalars. But note that scanPlain() will + // change this flag if the scan is finished at the beginning of the line. + allowSimpleKey_ = false; + auto plain = scanPlain(); + + // Scan and add SCALAR. May change allowSimpleKey_ + tokens_.push(plain); + } + + pure: + + ///Check if the next token is DIRECTIVE: ^ '%' ... + bool checkDirective() @safe + { + return reader_.peekByte() == '%' && reader_.column == 0; + } + + /// Check if the next token is DOCUMENT-START: ^ '---' (' '|'\n') + bool checkDocumentStart() @safe + { + // Check one char first, then all 3, to prevent reading outside the buffer. + return reader_.column == 0 && + reader_.peekByte() == '-' && + reader_.prefix(3) == "---" && + reader_.peek(3).isWhiteSpace; + } + + /// Check if the next token is DOCUMENT-END: ^ '...' (' '|'\n') + bool checkDocumentEnd() @safe + { + // Check one char first, then all 3, to prevent reading outside the buffer. + return reader_.column == 0 && + reader_.peekByte() == '.' && + reader_.prefix(3) == "..." && + reader_.peek(3).isWhiteSpace; + } + + /// Check if the next token is BLOCK-ENTRY: '-' (' '|'\n') + bool checkBlockEntry() @safe + { + return !!reader_.peek(1).isWhiteSpace; + } + + /// Check if the next token is KEY(flow context): '?' + /// + /// or KEY(block context): '?' (' '|'\n') + bool checkKey() @safe + { + return (flowLevel_ > 0 || reader_.peek(1).isWhiteSpace); + } + + /// Check if the next token is VALUE(flow context): ':' + /// + /// or VALUE(block context): ':' (' '|'\n') + bool checkValue() @safe + { + return flowLevel_ > 0 || reader_.peek(1).isWhiteSpace; + } + + /// Check if the next token is a plain scalar. + /// + /// A plain scalar may start with any non-space character except: + /// '-', '?', ':', ',', '[', ']', '{', '}', + /// '#', '&', '*', '!', '|', '>', '\'', '\"', + /// '%', '@', '`'. + /// + /// It may also start with + /// '-', '?', ':' + /// if it is followed by a non-space character. + /// + /// Note that we limit the last rule to the block context (except the + /// '-' character) because we want the flow context to be space + /// independent. + bool checkPlain() @safe + { + const c = reader_.peek(); + if(!c.isNonScalarStartCharacter) + { + return true; + } + return !reader_.peek(1).isWhiteSpace && + (c == '-' || (flowLevel_ == 0 && (c == '?' || c == ':'))); + } + + /// Move to the next non-space character. + void findNextNonSpace() @safe + { + while(reader_.peekByte() == ' ') { reader_.forward(); } + } + + /// Scan a string of alphanumeric or "-_" characters. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanAlphaNumericToSlice(string name)(const Mark startMark) + { + size_t length; + dchar c = reader_.peek(); + while(c.isAlphaNum || c.among!('-', '_')) { c = reader_.peek(++length); } + + enforce(length > 0, new ScannerException("While scanning " ~ name, + startMark, expected("alphanumeric, '-' or '_'", c), reader_.mark)); + + reader_.sliceBuilder.write(reader_.get(length)); + } + + /// Scan and throw away all characters until next line break. + void scanToNextBreak() @safe + { + while(!reader_.peek().isBreak) { reader_.forward(); } + } + + /// Scan all characters until next line break. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanToNextBreakToSlice() @safe + { + uint length; + while(!reader_.peek(length).isBreak) + { + ++length; + } + reader_.sliceBuilder.write(reader_.get(length)); + } + + + /// Move to next token in the file/stream. + /// + /// We ignore spaces, line breaks and comments. + /// If we find a line break in the block context, we set + /// allowSimpleKey` on. + /// + /// We do not yet support BOM inside the stream as the + /// specification requires. Any such mark will be considered as a part + /// of the document. + void scanToNextToken() @safe + { + // TODO(PyYAML): We need to make tab handling rules more sane. A good rule is: + // Tabs cannot precede tokens + // BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END, + // KEY(block), VALUE(block), BLOCK-ENTRY + // So the checking code is + // if <TAB>: + // allowSimpleKey_ = false + // We also need to add the check for `allowSimpleKey_ == true` to + // `unwindIndent` before issuing BLOCK-END. + // Scanners for block, flow, and plain scalars need to be modified. + + for(;;) + { + //All whitespace in flow context is ignored, even whitespace + // not allowed in other contexts + if (flowLevel_ > 0) + { + while(reader_.peekByte().isNonLinebreakWhitespace) { reader_.forward(); } + } + else + { + findNextNonSpace(); + } + if(reader_.peekByte() == '#') { scanToNextBreak(); } + if(scanLineBreak() != '\0') + { + if(flowLevel_ == 0) { allowSimpleKey_ = true; } + } + else + { + break; + } + } + } + + /// Scan directive token. + Token scanDirective() @safe + { + Mark startMark = reader_.mark; + // Skip the '%'. + reader_.forward(); + + // Scan directive name + reader_.sliceBuilder.begin(); + scanDirectiveNameToSlice(startMark); + const name = reader_.sliceBuilder.finish(); + + reader_.sliceBuilder.begin(); + + // Index where tag handle ends and suffix starts in a tag directive value. + uint tagHandleEnd = uint.max; + if(name == "YAML") { scanYAMLDirectiveValueToSlice(startMark); } + else if(name == "TAG") { tagHandleEnd = scanTagDirectiveValueToSlice(startMark); } + char[] value = reader_.sliceBuilder.finish(); + + Mark endMark = reader_.mark; + + DirectiveType directive; + if(name == "YAML") { directive = DirectiveType.yaml; } + else if(name == "TAG") { directive = DirectiveType.tag; } + else + { + directive = DirectiveType.reserved; + scanToNextBreak(); + } + + scanDirectiveIgnoredLine(startMark); + + return directiveToken(startMark, endMark, value, directive, tagHandleEnd); + } + + /// Scan name of a directive token. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanDirectiveNameToSlice(const Mark startMark) @safe + { + // Scan directive name. + scanAlphaNumericToSlice!"a directive"(startMark); + + enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'), + new ScannerException("While scanning a directive", startMark, + expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark)); + } + + /// Scan value of a YAML directive token. Returns major, minor version separated by '.'. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanYAMLDirectiveValueToSlice(const Mark startMark) @safe + { + findNextNonSpace(); + + scanYAMLDirectiveNumberToSlice(startMark); + + enforce(reader_.peekByte() == '.', + new ScannerException("While scanning a directive", startMark, + expected("digit or '.'", reader_.peek()), reader_.mark)); + // Skip the '.'. + reader_.forward(); + + reader_.sliceBuilder.write('.'); + scanYAMLDirectiveNumberToSlice(startMark); + + enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'), + new ScannerException("While scanning a directive", startMark, + expected("digit or '.'", reader_.peek()), reader_.mark)); + } + + /// Scan a number from a YAML directive. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanYAMLDirectiveNumberToSlice(const Mark startMark) @safe + { + enforce(isDigit(reader_.peek()), + new ScannerException("While scanning a directive", startMark, + expected("digit", reader_.peek()), reader_.mark)); + + // Already found the first digit in the enforce(), so set length to 1. + uint length = 1; + while(reader_.peek(length).isDigit) { ++length; } + + reader_.sliceBuilder.write(reader_.get(length)); + } + + /// Scan value of a tag directive. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + /// + /// Returns: Length of tag handle (which is before tag prefix) in scanned data + uint scanTagDirectiveValueToSlice(const Mark startMark) @safe + { + findNextNonSpace(); + const startLength = reader_.sliceBuilder.length; + scanTagDirectiveHandleToSlice(startMark); + const handleLength = cast(uint)(reader_.sliceBuilder.length - startLength); + findNextNonSpace(); + scanTagDirectivePrefixToSlice(startMark); + + return handleLength; + } + + /// Scan handle of a tag directive. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanTagDirectiveHandleToSlice(const Mark startMark) @safe + { + scanTagHandleToSlice!"directive"(startMark); + enforce(reader_.peekByte() == ' ', + new ScannerException("While scanning a directive handle", startMark, + expected("' '", reader_.peek()), reader_.mark)); + } + + /// Scan prefix of a tag directive. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanTagDirectivePrefixToSlice(const Mark startMark) @safe + { + scanTagURIToSlice!"directive"(startMark); + enforce(reader_.peek().among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'), + new ScannerException("While scanning a directive prefix", startMark, + expected("' '", reader_.peek()), reader_.mark)); + } + + /// Scan (and ignore) ignored line after a directive. + void scanDirectiveIgnoredLine(const Mark startMark) @safe + { + findNextNonSpace(); + if(reader_.peekByte() == '#') { scanToNextBreak(); } + enforce(reader_.peek().isBreak, + new ScannerException("While scanning a directive", startMark, + expected("comment or a line break", reader_.peek()), reader_.mark)); + scanLineBreak(); + } + + + /// Scan an alias or an anchor. + /// + /// The specification does not restrict characters for anchors and + /// aliases. This may lead to problems, for instance, the document: + /// [ *alias, value ] + /// can be interpteted in two ways, as + /// [ "value" ] + /// and + /// [ *alias , "value" ] + /// Therefore we restrict aliases to ASCII alphanumeric characters. + Token scanAnchor(const TokenID id) @safe + { + const startMark = reader_.mark; + const dchar i = reader_.get(); + + reader_.sliceBuilder.begin(); + if(i == '*') { scanAlphaNumericToSlice!"an alias"(startMark); } + else { scanAlphaNumericToSlice!"an anchor"(startMark); } + // On error, value is discarded as we return immediately + char[] value = reader_.sliceBuilder.finish(); + + enum anchorCtx = "While scanning an anchor"; + enum aliasCtx = "While scanning an alias"; + enforce(reader_.peek().isWhiteSpace || + reader_.peekByte().among!('?', ':', ',', ']', '}', '%', '@'), + new ScannerException(i == '*' ? aliasCtx : anchorCtx, startMark, + expected("alphanumeric, '-' or '_'", reader_.peek()), reader_.mark)); + + if(id == TokenID.alias_) + { + return aliasToken(startMark, reader_.mark, value); + } + if(id == TokenID.anchor) + { + return anchorToken(startMark, reader_.mark, value); + } + assert(false, "This code should never be reached"); + } + + /// Scan a tag token. + Token scanTag() @safe + { + const startMark = reader_.mark; + dchar c = reader_.peek(1); + + reader_.sliceBuilder.begin(); + scope(failure) { reader_.sliceBuilder.finish(); } + // Index where tag handle ends and tag suffix starts in the tag value + // (slice) we will produce. + uint handleEnd; + + if(c == '<') + { + reader_.forward(2); + + handleEnd = 0; + scanTagURIToSlice!"tag"(startMark); + enforce(reader_.peekByte() == '>', + new ScannerException("While scanning a tag", startMark, + expected("'>'", reader_.peek()), reader_.mark)); + reader_.forward(); + } + else if(c.isWhiteSpace) + { + reader_.forward(); + handleEnd = 0; + reader_.sliceBuilder.write('!'); + } + else + { + uint length = 1; + bool useHandle; + + while(!c.isBreakOrSpace) + { + if(c == '!') + { + useHandle = true; + break; + } + ++length; + c = reader_.peek(length); + } + + if(useHandle) + { + scanTagHandleToSlice!"tag"(startMark); + handleEnd = cast(uint)reader_.sliceBuilder.length; + } + else + { + reader_.forward(); + reader_.sliceBuilder.write('!'); + handleEnd = cast(uint)reader_.sliceBuilder.length; + } + + scanTagURIToSlice!"tag"(startMark); + } + + enforce(reader_.peek().isBreakOrSpace, + new ScannerException("While scanning a tag", startMark, expected("' '", reader_.peek()), + reader_.mark)); + + char[] slice = reader_.sliceBuilder.finish(); + return tagToken(startMark, reader_.mark, slice, handleEnd); + } + + /// Scan a block scalar token with specified style. + Token scanBlockScalar(const ScalarStyle style) @safe + { + const startMark = reader_.mark; + + // Scan the header. + reader_.forward(); + + const indicators = scanBlockScalarIndicators(startMark); + + const chomping = indicators[0]; + const increment = indicators[1]; + scanBlockScalarIgnoredLine(startMark); + + // Determine the indentation level and go to the first non-empty line. + Mark endMark; + uint indent = max(1, indent_ + 1); + + reader_.sliceBuilder.begin(); + alias Transaction = SliceBuilder.Transaction; + // Used to strip the last line breaks written to the slice at the end of the + // scalar, which may be needed based on chomping. + Transaction breaksTransaction = Transaction(&reader_.sliceBuilder); + // Read the first indentation/line breaks before the scalar. + size_t startLen = reader_.sliceBuilder.length; + if(increment == int.min) + { + auto indentation = scanBlockScalarIndentationToSlice(); + endMark = indentation[1]; + indent = max(indent, indentation[0]); + } + else + { + indent += increment - 1; + endMark = scanBlockScalarBreaksToSlice(indent); + } + + // int.max means there's no line break (int.max is outside UTF-32). + dchar lineBreak = cast(dchar)int.max; + + // Scan the inner part of the block scalar. + while(reader_.column == indent && reader_.peekByte() != '\0') + { + breaksTransaction.commit(); + const bool leadingNonSpace = !reader_.peekByte().among!(' ', '\t'); + // This is where the 'interesting' non-whitespace data gets read. + scanToNextBreakToSlice(); + lineBreak = scanLineBreak(); + + + // This transaction serves to rollback data read in the + // scanBlockScalarBreaksToSlice() call. + breaksTransaction = Transaction(&reader_.sliceBuilder); + startLen = reader_.sliceBuilder.length; + // The line breaks should actually be written _after_ the if() block + // below. We work around that by inserting + endMark = scanBlockScalarBreaksToSlice(indent); + + // This will not run during the last iteration (see the if() vs the + // while()), hence breaksTransaction rollback (which happens after this + // loop) will never roll back data written in this if() block. + if(reader_.column == indent && reader_.peekByte() != '\0') + { + // Unfortunately, folding rules are ambiguous. + + // This is the folding according to the specification: + if(style == ScalarStyle.folded && lineBreak == '\n' && + leadingNonSpace && !reader_.peekByte().among!(' ', '\t')) + { + // No breaks were scanned; no need to insert the space in the + // middle of slice. + if(startLen == reader_.sliceBuilder.length) + { + reader_.sliceBuilder.write(' '); + } + } + else + { + // We need to insert in the middle of the slice in case any line + // breaks were scanned. + reader_.sliceBuilder.insert(lineBreak, startLen); + } + + ////this is Clark Evans's interpretation (also in the spec + ////examples): + // + //if(style == ScalarStyle.folded && lineBreak == '\n') + //{ + // if(startLen == endLen) + // { + // if(!" \t"d.canFind(reader_.peekByte())) + // { + // reader_.sliceBuilder.write(' '); + // } + // else + // { + // chunks ~= lineBreak; + // } + // } + //} + //else + //{ + // reader_.sliceBuilder.insertBack(lineBreak, endLen - startLen); + //} + } + else + { + break; + } + } + + // If chompint is Keep, we keep (commit) the last scanned line breaks + // (which are at the end of the scalar). Otherwise re remove them (end the + // transaction). + if(chomping == Chomping.keep) { breaksTransaction.commit(); } + else { breaksTransaction.end(); } + if(chomping != Chomping.strip && lineBreak != int.max) + { + // If chomping is Keep, we keep the line break but the first line break + // that isn't stripped (since chomping isn't Strip in this branch) must + // be inserted _before_ the other line breaks. + if(chomping == Chomping.keep) + { + reader_.sliceBuilder.insert(lineBreak, startLen); + } + // If chomping is not Keep, breaksTransaction was cancelled so we can + // directly write the first line break (as it isn't stripped - chomping + // is not Strip) + else + { + reader_.sliceBuilder.write(lineBreak); + } + } + + char[] slice = reader_.sliceBuilder.finish(); + return scalarToken(startMark, endMark, slice, style); + } + + /// Scan chomping and indentation indicators of a scalar token. + Tuple!(Chomping, int) scanBlockScalarIndicators(const Mark startMark) @safe + { + auto chomping = Chomping.clip; + int increment = int.min; + dchar c = reader_.peek(); + + /// Indicators can be in any order. + if(getChomping(c, chomping)) + { + getIncrement(c, increment, startMark); + } + else + { + const gotIncrement = getIncrement(c, increment, startMark); + if(gotIncrement) { getChomping(c, chomping); } + } + + enforce(c.among!(' ', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'), + new ScannerException("While scanning a block scalar", startMark, + expected("chomping or indentation indicator", c), reader_.mark)); + + return tuple(chomping, increment); + } + + /// Get chomping indicator, if detected. Return false otherwise. + /// + /// Used in scanBlockScalarIndicators. + /// + /// Params: + /// + /// c = The character that may be a chomping indicator. + /// chomping = Write the chomping value here, if detected. + bool getChomping(ref dchar c, ref Chomping chomping) @safe + { + if(!c.among!('+', '-')) { return false; } + chomping = c == '+' ? Chomping.keep : Chomping.strip; + reader_.forward(); + c = reader_.peek(); + return true; + } + + /// Get increment indicator, if detected. Return false otherwise. + /// + /// Used in scanBlockScalarIndicators. + /// + /// Params: + /// + /// c = The character that may be an increment indicator. + /// If an increment indicator is detected, this will be updated to + /// the next character in the Reader. + /// increment = Write the increment value here, if detected. + /// startMark = Mark for error messages. + bool getIncrement(ref dchar c, ref int increment, const Mark startMark) @safe + { + if(!c.isDigit) { return false; } + // Convert a digit to integer. + increment = c - '0'; + assert(increment < 10 && increment >= 0, "Digit has invalid value"); + + enforce(increment > 0, + new ScannerException("While scanning a block scalar", startMark, + expected("indentation indicator in range 1-9", "0"), reader_.mark)); + + reader_.forward(); + c = reader_.peek(); + return true; + } + + /// Scan (and ignore) ignored line in a block scalar. + void scanBlockScalarIgnoredLine(const Mark startMark) @safe + { + findNextNonSpace(); + if(reader_.peekByte()== '#') { scanToNextBreak(); } + + enforce(reader_.peek().isBreak, + new ScannerException("While scanning a block scalar", startMark, + expected("comment or line break", reader_.peek()), reader_.mark)); + + scanLineBreak(); + } + + /// Scan indentation in a block scalar, returning line breaks, max indent and end mark. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + Tuple!(uint, Mark) scanBlockScalarIndentationToSlice() @safe + { + uint maxIndent; + Mark endMark = reader_.mark; + + while(reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) + { + if(reader_.peekByte() != ' ') + { + reader_.sliceBuilder.write(scanLineBreak()); + endMark = reader_.mark; + continue; + } + reader_.forward(); + maxIndent = max(reader_.column, maxIndent); + } + + return tuple(maxIndent, endMark); + } + + /// Scan line breaks at lower or specified indentation in a block scalar. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + Mark scanBlockScalarBreaksToSlice(const uint indent) @safe + { + Mark endMark = reader_.mark; + + for(;;) + { + while(reader_.column < indent && reader_.peekByte() == ' ') { reader_.forward(); } + if(!reader_.peek().among!('\n', '\r', '\u0085', '\u2028', '\u2029')) { break; } + reader_.sliceBuilder.write(scanLineBreak()); + endMark = reader_.mark; + } + + return endMark; + } + + /// Scan a qouted flow scalar token with specified quotes. + Token scanFlowScalar(const ScalarStyle quotes) @safe + { + const startMark = reader_.mark; + const quote = reader_.get(); + + reader_.sliceBuilder.begin(); + + scanFlowScalarNonSpacesToSlice(quotes, startMark); + + while(reader_.peek() != quote) + { + scanFlowScalarSpacesToSlice(startMark); + scanFlowScalarNonSpacesToSlice(quotes, startMark); + } + reader_.forward(); + + auto slice = reader_.sliceBuilder.finish(); + return scalarToken(startMark, reader_.mark, slice, quotes); + } + + /// Scan nonspace characters in a flow scalar. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanFlowScalarNonSpacesToSlice(const ScalarStyle quotes, const Mark startMark) + @safe + { + for(;;) + { + dchar c = reader_.peek(); + + size_t numCodePoints; + while(!reader_.peek(numCodePoints).isFlowScalarBreakSpace) { ++numCodePoints; } + + if (numCodePoints > 0) { reader_.sliceBuilder.write(reader_.get(numCodePoints)); } + + c = reader_.peek(); + if(quotes == ScalarStyle.singleQuoted && c == '\'' && reader_.peek(1) == '\'') + { + reader_.forward(2); + reader_.sliceBuilder.write('\''); + } + else if((quotes == ScalarStyle.doubleQuoted && c == '\'') || + (quotes == ScalarStyle.singleQuoted && c.among!('"', '\\'))) + { + reader_.forward(); + reader_.sliceBuilder.write(c); + } + else if(quotes == ScalarStyle.doubleQuoted && c == '\\') + { + reader_.forward(); + c = reader_.peek(); + if(c.among!(escapes)) + { + reader_.forward(); + // Escaping has been moved to Parser as it can't be done in + // place (in a slice) in case of '\P' and '\L' (very uncommon, + // but we don't want to break the spec) + char[2] escapeSequence = ['\\', cast(char)c]; + reader_.sliceBuilder.write(escapeSequence); + } + else if(c.among!(escapeHexCodeList)) + { + const hexLength = dyaml.escapes.escapeHexLength(c); + reader_.forward(); + + foreach(i; 0 .. hexLength) { + enforce(reader_.peek(i).isHexDigit, + new ScannerException("While scanning a double quoted scalar", startMark, + expected("escape sequence of hexadecimal numbers", + reader_.peek(i)), reader_.mark)); + } + char[] hex = reader_.get(hexLength); + + enforce((hex.length > 0) && (hex.length <= 8), + new ScannerException("While scanning a double quoted scalar", startMark, + "overflow when parsing an escape sequence of " ~ + "hexadecimal numbers.", reader_.mark)); + + char[2] escapeStart = ['\\', cast(char) c]; + reader_.sliceBuilder.write(escapeStart); + reader_.sliceBuilder.write(hex); + + } + else if(c.among!('\n', '\r', '\u0085', '\u2028', '\u2029')) + { + scanLineBreak(); + scanFlowScalarBreaksToSlice(startMark); + } + else + { + throw new ScannerException("While scanning a double quoted scalar", startMark, + text("found unsupported escape character ", c), + reader_.mark); + } + } + else { return; } + } + } + + /// Scan space characters in a flow scalar. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// spaces into that slice. + void scanFlowScalarSpacesToSlice(const Mark startMark) @safe + { + // Increase length as long as we see whitespace. + size_t length; + while(reader_.peekByte(length).among!(' ', '\t')) { ++length; } + auto whitespaces = reader_.prefixBytes(length); + + // Can check the last byte without striding because '\0' is ASCII + const c = reader_.peek(length); + enforce(c != '\0', + new ScannerException("While scanning a quoted scalar", startMark, + "found unexpected end of buffer", reader_.mark)); + + // Spaces not followed by a line break. + if(!c.among!('\n', '\r', '\u0085', '\u2028', '\u2029')) + { + reader_.forward(length); + reader_.sliceBuilder.write(whitespaces); + return; + } + + // There's a line break after the spaces. + reader_.forward(length); + const lineBreak = scanLineBreak(); + + if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); } + + // If we have extra line breaks after the first, scan them into the + // slice. + const bool extraBreaks = scanFlowScalarBreaksToSlice(startMark); + + // No extra breaks, one normal line break. Replace it with a space. + if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); } + } + + /// Scan line breaks in a flow scalar. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// line breaks into that slice. + bool scanFlowScalarBreaksToSlice(const Mark startMark) @safe + { + // True if at least one line break was found. + bool anyBreaks; + for(;;) + { + // Instead of checking indentation, we check for document separators. + const prefix = reader_.prefix(3); + enforce(!(prefix == "---" || prefix == "...") || + !reader_.peek(3).isWhiteSpace, + new ScannerException("While scanning a quoted scalar", startMark, + "found unexpected document separator", reader_.mark)); + + // Skip any whitespaces. + while(reader_.peekByte().among!(' ', '\t')) { reader_.forward(); } + + // Encountered a non-whitespace non-linebreak character, so we're done. + if(!reader_.peek().among!(' ', '\n', '\r', '\u0085', '\u2028', '\u2029')) { break; } + + const lineBreak = scanLineBreak(); + anyBreaks = true; + reader_.sliceBuilder.write(lineBreak); + } + return anyBreaks; + } + + /// Scan plain scalar token (no block, no quotes). + Token scanPlain() @safe + { + // We keep track of the allowSimpleKey_ flag here. + // Indentation rules are loosed for the flow context + const startMark = reader_.mark; + Mark endMark = startMark; + const indent = indent_ + 1; + + // We allow zero indentation for scalars, but then we need to check for + // document separators at the beginning of the line. + // if(indent == 0) { indent = 1; } + + reader_.sliceBuilder.begin(); + + alias Transaction = SliceBuilder.Transaction; + Transaction spacesTransaction; + // Stop at a comment. + while(reader_.peekByte() != '#') + { + // Scan the entire plain scalar. + size_t length; + dchar c = reader_.peek(length); + for(;;) + { + const cNext = reader_.peek(length + 1); + if(c.isWhiteSpace || + (flowLevel_ == 0 && c == ':' && cNext.isWhiteSpace) || + (flowLevel_ > 0 && c.among!(',', ':', '?', '[', ']', '{', '}'))) + { + break; + } + ++length; + c = cNext; + } + + // It's not clear what we should do with ':' in the flow context. + enforce(flowLevel_ == 0 || c != ':' || + reader_.peek(length + 1).isWhiteSpace || + reader_.peek(length + 1).among!(',', '[', ']', '{', '}'), + new ScannerException("While scanning a plain scalar", startMark, + "found unexpected ':' . Please check " ~ + "http://pyyaml.org/wiki/YAMLColonInFlowContext for details.", + reader_.mark)); + + if(length == 0) { break; } + + allowSimpleKey_ = false; + + reader_.sliceBuilder.write(reader_.get(length)); + + endMark = reader_.mark; + + spacesTransaction.commit(); + spacesTransaction = Transaction(&reader_.sliceBuilder); + + const startLength = reader_.sliceBuilder.length; + scanPlainSpacesToSlice(); + if(startLength == reader_.sliceBuilder.length || + (flowLevel_ == 0 && reader_.column < indent)) + { + break; + } + } + + spacesTransaction.end(); + char[] slice = reader_.sliceBuilder.finish(); + + return scalarToken(startMark, endMark, slice, ScalarStyle.plain); + } + + /// Scan spaces in a plain scalar. + /// + /// Assumes that the caller is building a slice in Reader, and puts the spaces + /// into that slice. + void scanPlainSpacesToSlice() @safe + { + // The specification is really confusing about tabs in plain scalars. + // We just forbid them completely. Do not use tabs in YAML! + + // Get as many plain spaces as there are. + size_t length; + while(reader_.peekByte(length) == ' ') { ++length; } + char[] whitespaces = reader_.prefixBytes(length); + reader_.forward(length); + + const dchar c = reader_.peek(); + if(!c.isNSChar) + { + // We have spaces, but no newline. + if(whitespaces.length > 0) { reader_.sliceBuilder.write(whitespaces); } + return; + } + + // Newline after the spaces (if any) + const lineBreak = scanLineBreak(); + allowSimpleKey_ = true; + + static bool end(Reader reader_) @safe pure + { + const prefix = reader_.prefix(3); + return ("---" == prefix || "..." == prefix) + && reader_.peek(3).among!(' ', '\t', '\0', '\n', '\r', '\u0085', '\u2028', '\u2029'); + } + + if(end(reader_)) { return; } + + bool extraBreaks; + + alias Transaction = SliceBuilder.Transaction; + auto transaction = Transaction(&reader_.sliceBuilder); + if(lineBreak != '\n') { reader_.sliceBuilder.write(lineBreak); } + while(reader_.peek().isNSChar) + { + if(reader_.peekByte() == ' ') { reader_.forward(); } + else + { + const lBreak = scanLineBreak(); + extraBreaks = true; + reader_.sliceBuilder.write(lBreak); + + if(end(reader_)) { return; } + } + } + transaction.commit(); + + // No line breaks, only a space. + if(lineBreak == '\n' && !extraBreaks) { reader_.sliceBuilder.write(' '); } + } + + /// Scan handle of a tag token. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanTagHandleToSlice(string name)(const Mark startMark) + { + dchar c = reader_.peek(); + enum contextMsg = "While scanning a " ~ name; + enforce(c == '!', + new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark)); + + uint length = 1; + c = reader_.peek(length); + if(c != ' ') + { + while(c.isAlphaNum || c.among!('-', '_')) + { + ++length; + c = reader_.peek(length); + } + enforce(c == '!', + new ScannerException(contextMsg, startMark, expected("'!'", c), reader_.mark)); + ++length; + } + + reader_.sliceBuilder.write(reader_.get(length)); + } + + /// Scan URI in a tag token. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanTagURIToSlice(string name)(const Mark startMark) + { + // Note: we do not check if URI is well-formed. + dchar c = reader_.peek(); + const startLen = reader_.sliceBuilder.length; + { + uint length; + while(c.isAlphaNum || c.isURIChar) + { + if(c == '%') + { + auto chars = reader_.get(length); + reader_.sliceBuilder.write(chars); + length = 0; + scanURIEscapesToSlice!name(startMark); + } + else { ++length; } + c = reader_.peek(length); + } + if(length > 0) + { + auto chars = reader_.get(length); + reader_.sliceBuilder.write(chars); + length = 0; + } + } + // OK if we scanned something, error otherwise. + enum contextMsg = "While parsing a " ~ name; + enforce(reader_.sliceBuilder.length > startLen, + new ScannerException(contextMsg, startMark, expected("URI", c), reader_.mark)); + } + + // Not @nogc yet because std.utf.decode is not @nogc + /// Scan URI escape sequences. + /// + /// Assumes that the caller is building a slice in Reader, and puts the scanned + /// characters into that slice. + void scanURIEscapesToSlice(string name)(const Mark startMark) + { + import core.exception : UnicodeException; + // URI escapes encode a UTF-8 string. We store UTF-8 code units here for + // decoding into UTF-32. + Appender!string buffer; + + + enum contextMsg = "While scanning a " ~ name; + while(reader_.peekByte() == '%') + { + reader_.forward(); + char[2] nextByte = [reader_.peekByte(), reader_.peekByte(1)]; + + enforce(nextByte[0].isHexDigit && nextByte[1].isHexDigit, + new ScannerException(contextMsg, startMark, + expected("URI escape sequence of 2 hexadecimal " ~ + "numbers", nextByte), reader_.mark)); + + buffer ~= nextByte[].to!ubyte(16); + + reader_.forward(2); + } + try + { + foreach (dchar chr; buffer.data) + { + reader_.sliceBuilder.write(chr); + } + } + catch (UnicodeException) + { + throw new ScannerException(contextMsg, startMark, + "Invalid UTF-8 data encoded in URI escape sequence", + reader_.mark); + } + } + + + /// Scan a line break, if any. + /// + /// Transforms: + /// '\r\n' : '\n' + /// '\r' : '\n' + /// '\n' : '\n' + /// '\u0085' : '\n' + /// '\u2028' : '\u2028' + /// '\u2029 : '\u2029' + /// no break : '\0' + dchar scanLineBreak() @safe + { + // Fast path for ASCII line breaks. + const b = reader_.peekByte(); + if(b < 0x80) + { + if(b == '\n' || b == '\r') + { + if(reader_.prefix(2) == "\r\n") { reader_.forward(2); } + else { reader_.forward(); } + return '\n'; + } + return '\0'; + } + + const c = reader_.peek(); + if(c == '\x85') + { + reader_.forward(); + return '\n'; + } + if(c == '\u2028' || c == '\u2029') + { + reader_.forward(); + return c; + } + return '\0'; + } +} |