From 0a398692e6b01b5b715b5612b3144ff01f22113d Mon Sep 17 00:00:00 2001 From: Tatu Saloranta Date: Thu, 19 Sep 2013 12:12:11 -0700 Subject: [PATCH] Finished #47 implementation --- .../jackson/core/base/ParserMinimalBase.java | 2 - .../core/json/ReaderBasedJsonParser.java | 131 ++++++++------ .../core/json/UTF8StreamJsonParser.java | 163 ++++++++---------- 3 files changed, 153 insertions(+), 143 deletions(-) diff --git a/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java b/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java index 8241b75663..ef26325509 100644 --- a/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java +++ b/src/main/java/com/fasterxml/jackson/core/base/ParserMinimalBase.java @@ -16,8 +16,6 @@ * Note that 'minimal' here mostly refers to minimal number of fields * (size) and functionality that is specific to certain types * of parser implementations; but not necessarily to number of methods. - * - * @author Tatu Saloranta */ public abstract class ParserMinimalBase extends JsonParser diff --git a/src/main/java/com/fasterxml/jackson/core/json/ReaderBasedJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/ReaderBasedJsonParser.java index 8a810712b3..a947fd6c96 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/ReaderBasedJsonParser.java +++ b/src/main/java/com/fasterxml/jackson/core/json/ReaderBasedJsonParser.java @@ -17,6 +17,13 @@ public final class ReaderBasedJsonParser extends ParserBase { + // Latin1 encoding is not supported, but we do use 8-bit subset for + // pre-processing task, to simplify first pass, keep it fast. + protected final static int[] _icLatin1 = CharTypes.getInputCodeLatin1(); + + // White-space processing is done all the time, pre-fetch as well + private final static int[] _icWS = CharTypes.getInputCodeWS(); + /* /********************************************************** /* Input configuration @@ -664,10 +671,10 @@ public JsonToken nextToken() case '7': case '8': case '9': - t = parseNumberText(i); + t = _parseNumber(i); break; default: - t = _handleUnexpectedValue(i); + t = _handleOddValue(i); break; } @@ -818,7 +825,6 @@ public void close() throws IOException /* /********************************************************** /* Internal methods, number parsing - /* (note: in 1.8 and prior, part of "ReaderBasedNumericParser" /********************************************************** */ @@ -837,8 +843,7 @@ public void close() throws IOException * deferred, since it is usually the most complicated and costliest * part of processing. */ - protected JsonToken parseNumberText(int ch) - throws IOException, JsonParseException + protected JsonToken _parseNumber(int ch) throws IOException { /* Although we will always be complete with respect to textual * representation (that is, all characters will be parsed), @@ -950,7 +955,7 @@ protected JsonToken parseNumberText(int ch) } while (false); _inputPtr = negative ? (startPtr+1) : startPtr; - return parseNumberText2(negative); + return _parseNumber2(negative); } /** @@ -960,8 +965,7 @@ protected JsonToken parseNumberText(int ch) * that it has to explicitly copy contents to the text buffer * instead of just sharing the main input buffer. */ - private JsonToken parseNumberText2(boolean negative) - throws IOException, JsonParseException + private JsonToken _parseNumber2(boolean negative) throws IOException { char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); int outPtr = 0; @@ -1084,8 +1088,7 @@ private JsonToken parseNumberText2(boolean negative) * Method called when we have seen one zero, and want to ensure * it is not followed by another */ - private char _verifyNoLeadingZeroes() - throws IOException, JsonParseException + private char _verifyNoLeadingZeroes() throws IOException { // Ok to have plain "0" if (_inputPtr >= _inputEnd && !loadMore()) { @@ -1120,8 +1123,7 @@ private char _verifyNoLeadingZeroes() * Method called if expected numeric value (due to leading sign) does not * look like a number */ - protected JsonToken _handleInvalidNumberStart(int ch, boolean negative) - throws IOException, JsonParseException + protected JsonToken _handleInvalidNumberStart(int ch, boolean negative) throws IOException { if (ch == 'I') { if (_inputPtr >= _inputEnd) { @@ -1170,7 +1172,7 @@ protected String _parseName(int i) throws IOException final int inputLen = _inputEnd; if (ptr < inputLen) { - final int[] codes = CharTypes.getInputCodeLatin1(); + final int[] codes = _icLatin1; final int maxCode = codes.length; do { @@ -1190,10 +1192,10 @@ protected String _parseName(int i) throws IOException int start = _inputPtr; _inputPtr = ptr; - return _parseFieldName2(start, hash, INT_QUOTE); + return _parseName2(start, hash, INT_QUOTE); } - private String _parseFieldName2(int startPtr, int hash, int endChar) throws IOException + private String _parseName2(int startPtr, int hash, int endChar) throws IOException { _textBuffer.resetWithShared(_inputBuffer, startPtr, (_inputPtr - startPtr)); @@ -1313,7 +1315,7 @@ protected String _parseAposName() throws IOException final int inputLen = _inputEnd; if (ptr < inputLen) { - final int[] codes = CharTypes.getInputCodeLatin1(); + final int[] codes = _icLatin1; final int maxCode = codes.length; do { @@ -1334,14 +1336,14 @@ protected String _parseAposName() throws IOException int start = _inputPtr; _inputPtr = ptr; - return _parseFieldName2(start, hash, '\''); + return _parseName2(start, hash, '\''); } /** * Method for handling cases where first non-space character * of an expected value token is not legal for standard JSON content. */ - protected JsonToken _handleUnexpectedValue(int i) throws IOException + protected JsonToken _handleOddValue(int i) throws IOException { // Most likely an error, unless we are to allow single-quote-strings switch (i) { @@ -1484,7 +1486,7 @@ protected void _finishString() throws IOException final int inputLen = _inputEnd; if (ptr < inputLen) { - final int[] codes = CharTypes.getInputCodeLatin1(); + final int[] codes = _icLatin1; final int maxCode = codes.length; do { @@ -1621,22 +1623,34 @@ protected void _skipCR() throws IOException private int _skipWS() throws IOException { + final int[] codes = _icWS; while (_inputPtr < _inputEnd || loadMore()) { int i = (int) _inputBuffer[_inputPtr++]; - if (i > INT_SPACE) { - if (i != INT_SLASH) { - return i; - } + if (i >= 64) { + return i; + } + switch (codes[i]) { + case -1: + _throwInvalidSpace(i); + case 0: + return i; + case 1: + continue; + case '\n': + ++_currInputRow; + _currInputRowStart = _inputPtr; + break; + case '\r': + _skipCR(); + break; + case '/': _skipComment(); - } else if (i != INT_SPACE) { - if (i == INT_LF) { - ++_currInputRow; - _currInputRowStart = _inputPtr; - } else if (i == INT_CR) { - _skipCR(); - } else if (i != INT_TAB) { - _throwInvalidSpace(i); + break; + case '#': + if (!_skipYAMLComment()) { + return i; } + break; } } throw _constructError("Unexpected end-of-input within/between "+_parsingContext.getTypeDesc()+" entries"); @@ -1644,24 +1658,34 @@ private int _skipWS() throws IOException private int _skipWSOrEnd() throws IOException { - while ((_inputPtr < _inputEnd) || loadMore()) { + final int[] codes = _icWS; + while (_inputPtr < _inputEnd || loadMore()) { int i = (int) _inputBuffer[_inputPtr++]; - if (i > INT_SPACE) { - if (i == INT_SLASH) { - _skipComment(); - continue; - } - return i; - } - if (i != INT_SPACE) { - if (i == INT_LF) { - ++_currInputRow; - _currInputRowStart = _inputPtr; - } else if (i == INT_CR) { - _skipCR(); - } else if (i != INT_TAB) { - _throwInvalidSpace(i); + if (i >= 64) { + return i; + } + switch (codes[i]) { + case -1: + _throwInvalidSpace(i); + case 0: + return i; + case 1: + continue; + case '\n': + ++_currInputRow; + _currInputRowStart = _inputPtr; + break; + case '\r': + _skipCR(); + break; + case '/': + _skipComment(); + break; + case '#': + if (!_skipYAMLComment()) { + return i; } + break; } } // We ran out of input... @@ -1680,7 +1704,7 @@ private void _skipComment() throws IOException } char c = _inputBuffer[_inputPtr++]; if (c == '/') { - _skipCppComment(); + _skipLine(); } else if (c == '*') { _skipCComment(); } else { @@ -1720,7 +1744,16 @@ private void _skipCComment() throws IOException _reportInvalidEOF(" in a comment"); } - private void _skipCppComment() throws IOException + private boolean _skipYAMLComment() throws IOException + { + if (!isEnabled(Feature.ALLOW_YAML_COMMENTS)) { + return false; + } + _skipLine(); + return true; + } + + private void _skipLine() throws IOException { // Ok: need to find EOF or linefeed while ((_inputPtr < _inputEnd) || loadMore()) { diff --git a/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java b/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java index 426db76ef7..836c0afa12 100644 --- a/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java +++ b/src/main/java/com/fasterxml/jackson/core/json/UTF8StreamJsonParser.java @@ -19,13 +19,15 @@ public final class UTF8StreamJsonParser { final static byte BYTE_LF = (byte) '\n'; - private final static int[] sInputCodesUtf8 = CharTypes.getInputCodeUtf8(); + // This is the main input-code lookup table, fetched eagerly + private final static int[] _icUTF8 = CharTypes.getInputCodeUtf8(); - /** - * Latin1 encoding is not supported, but we do use 8-bit subset for - * pre-processing task, to simplify first pass, keep it fast. - */ - private final static int[] sInputCodesLatin1 = CharTypes.getInputCodeLatin1(); + // Latin1 encoding is not supported, but we do use 8-bit subset for + // pre-processing task, to simplify first pass, keep it fast. + protected final static int[] _icLatin1 = CharTypes.getInputCodeLatin1(); + + // White-space processing is done all the time, pre-fetch as well + private final static int[] _icWS = CharTypes.getInputCodeWS(); /* /********************************************************** @@ -1416,11 +1418,11 @@ protected Name _parseFieldName(int i) throws IOException, JsonParseException { if (i != INT_QUOTE) { - return _handleUnusualFieldName(i); + return _handleOddName(i); } // First: can we optimize out bounds checks? if ((_inputPtr + 9) > _inputEnd) { // Need 8 chars, plus one trailing (quote) - return slowParseFieldName(); + return slowParseName(); } // If so, can also unroll loops nicely @@ -1430,7 +1432,7 @@ protected Name _parseFieldName(int i) * later on), and just handle quotes and backslashes here. */ final byte[] input = _inputBuffer; - final int[] codes = sInputCodesLatin1; + final int[] codes = _icLatin1; int q = input[_inputPtr++] & 0xFF; @@ -1447,35 +1449,35 @@ protected Name _parseFieldName(int i) i = input[_inputPtr++] & 0xFF; if (codes[i] == 0) { _quad1 = q; - return parseMediumFieldName(i, codes); + return parseMediumName(i, codes); } if (i == INT_QUOTE) { // one byte/char case or broken return findName(q, 4); } - return parseFieldName(q, i, 4); + return parseName(q, i, 4); } if (i == INT_QUOTE) { // one byte/char case or broken return findName(q, 3); } - return parseFieldName(q, i, 3); + return parseName(q, i, 3); } if (i == INT_QUOTE) { // one byte/char case or broken return findName(q, 2); } - return parseFieldName(q, i, 2); + return parseName(q, i, 2); } if (i == INT_QUOTE) { // one byte/char case or broken return findName(q, 1); } - return parseFieldName(q, i, 1); + return parseName(q, i, 1); } if (q == INT_QUOTE) { // special case, "" return BytesToNameCanonicalizer.getEmptyName(); } - return parseFieldName(0, q, 0); // quoting or invalid char + return parseName(0, q, 0); // quoting or invalid char } - protected Name parseMediumFieldName(int q2, final int[] codes) + protected Name parseMediumName(int q2, final int[] codes) throws IOException, JsonParseException { // Ok, got 5 name bytes so far @@ -1484,7 +1486,7 @@ protected Name parseMediumFieldName(int q2, final int[] codes) if (i == INT_QUOTE) { // 5 bytes return findName(_quad1, q2, 1); } - return parseFieldName(_quad1, q2, i, 1); // quoting or invalid char + return parseName(_quad1, q2, i, 1); // quoting or invalid char } q2 = (q2 << 8) | i; i = _inputBuffer[_inputPtr++] & 0xFF; @@ -1492,7 +1494,7 @@ protected Name parseMediumFieldName(int q2, final int[] codes) if (i == INT_QUOTE) { // 6 bytes return findName(_quad1, q2, 2); } - return parseFieldName(_quad1, q2, i, 2); + return parseName(_quad1, q2, i, 2); } q2 = (q2 << 8) | i; i = _inputBuffer[_inputPtr++] & 0xFF; @@ -1500,7 +1502,7 @@ protected Name parseMediumFieldName(int q2, final int[] codes) if (i == INT_QUOTE) { // 7 bytes return findName(_quad1, q2, 3); } - return parseFieldName(_quad1, q2, i, 3); + return parseName(_quad1, q2, i, 3); } q2 = (q2 << 8) | i; i = _inputBuffer[_inputPtr++] & 0xFF; @@ -1508,18 +1510,18 @@ protected Name parseMediumFieldName(int q2, final int[] codes) if (i == INT_QUOTE) { // 8 bytes return findName(_quad1, q2, 4); } - return parseFieldName(_quad1, q2, i, 4); + return parseName(_quad1, q2, i, 4); } _quadBuffer[0] = _quad1; _quadBuffer[1] = q2; - return parseLongFieldName(i); + return parseLongName(i); } - protected Name parseLongFieldName(int q) + protected Name parseLongName(int q) throws IOException, JsonParseException { // As explained above, will ignore UTF-8 encoding at this point - final int[] codes = sInputCodesLatin1; + final int[] codes = _icLatin1; int qlen = 2; while (true) { @@ -1528,7 +1530,7 @@ protected Name parseLongFieldName(int q) * and may not always be possible) */ if ((_inputEnd - _inputPtr) < 4) { - return parseEscapedFieldName(_quadBuffer, qlen, 0, q, 0); + return parseEscapedName(_quadBuffer, qlen, 0, q, 0); } // Otherwise can skip boundary checks for 4 bytes in loop @@ -1537,7 +1539,7 @@ protected Name parseLongFieldName(int q) if (i == INT_QUOTE) { return findName(_quadBuffer, qlen, q, 1); } - return parseEscapedFieldName(_quadBuffer, qlen, q, i, 1); + return parseEscapedName(_quadBuffer, qlen, q, i, 1); } q = (q << 8) | i; @@ -1546,7 +1548,7 @@ protected Name parseLongFieldName(int q) if (i == INT_QUOTE) { return findName(_quadBuffer, qlen, q, 2); } - return parseEscapedFieldName(_quadBuffer, qlen, q, i, 2); + return parseEscapedName(_quadBuffer, qlen, q, i, 2); } q = (q << 8) | i; @@ -1555,7 +1557,7 @@ protected Name parseLongFieldName(int q) if (i == INT_QUOTE) { return findName(_quadBuffer, qlen, q, 3); } - return parseEscapedFieldName(_quadBuffer, qlen, q, i, 3); + return parseEscapedName(_quadBuffer, qlen, q, i, 3); } q = (q << 8) | i; @@ -1564,7 +1566,7 @@ protected Name parseLongFieldName(int q) if (i == INT_QUOTE) { return findName(_quadBuffer, qlen, q, 4); } - return parseEscapedFieldName(_quadBuffer, qlen, q, i, 4); + return parseEscapedName(_quadBuffer, qlen, q, i, 4); } // Nope, no end in sight. Need to grow quad array etc @@ -1581,7 +1583,7 @@ protected Name parseLongFieldName(int q) * to come consequtively. Happens rarely, so this is offlined; * plus we'll also do full checks for escaping etc. */ - protected Name slowParseFieldName() + protected Name slowParseName() throws IOException, JsonParseException { if (_inputPtr >= _inputEnd) { @@ -1593,20 +1595,20 @@ protected Name slowParseFieldName() if (i == INT_QUOTE) { // special case, "" return BytesToNameCanonicalizer.getEmptyName(); } - return parseEscapedFieldName(_quadBuffer, 0, 0, i, 0); + return parseEscapedName(_quadBuffer, 0, 0, i, 0); } - private Name parseFieldName(int q1, int ch, int lastQuadBytes) + private Name parseName(int q1, int ch, int lastQuadBytes) throws IOException, JsonParseException { - return parseEscapedFieldName(_quadBuffer, 0, q1, ch, lastQuadBytes); + return parseEscapedName(_quadBuffer, 0, q1, ch, lastQuadBytes); } - private Name parseFieldName(int q1, int q2, int ch, int lastQuadBytes) + private Name parseName(int q1, int q2, int ch, int lastQuadBytes) throws IOException, JsonParseException { _quadBuffer[0] = q1; - return parseEscapedFieldName(_quadBuffer, 1, q2, ch, lastQuadBytes); + return parseEscapedName(_quadBuffer, 1, q2, ch, lastQuadBytes); } /** @@ -1616,8 +1618,8 @@ private Name parseFieldName(int q1, int q2, int ch, int lastQuadBytes) * needs to be able to handle more exceptional cases, gets * slower, and hance is offlined to a separate method. */ - protected Name parseEscapedFieldName(int[] quads, int qlen, int currQuad, int ch, - int currQuadBytes) + protected Name parseEscapedName(int[] quads, int qlen, int currQuad, int ch, + int currQuadBytes) throws IOException, JsonParseException { /* 25-Nov-2008, tatu: This may seem weird, but here we do @@ -1625,7 +1627,7 @@ protected Name parseEscapedFieldName(int[] quads, int qlen, int currQuad, int ch * assume that part is ok (if not it will get caught * later on), and just handle quotes and backslashes here. */ - final int[] codes = sInputCodesLatin1; + final int[] codes = _icLatin1; while (true) { if (codes[ch] != 0) { @@ -1717,12 +1719,12 @@ protected Name parseEscapedFieldName(int[] quads, int qlen, int currQuad, int ch * In standard mode will just throw an expection; but * in non-standard modes may be able to parse name. */ - protected Name _handleUnusualFieldName(int ch) + protected Name _handleOddName(int ch) throws IOException, JsonParseException { // [JACKSON-173]: allow single quotes if (ch == '\'' && isEnabled(Feature.ALLOW_SINGLE_QUOTES)) { - return _parseApostropheFieldName(); + return _parseAposName(); } // [JACKSON-69]: allow unquoted names if feature enabled: if (!isEnabled(Feature.ALLOW_UNQUOTED_FIELD_NAMES)) { @@ -1790,7 +1792,7 @@ protected Name _handleUnusualFieldName(int ch) * for valid JSON -- more alternatives, more code, generally * bit slower execution. */ - protected Name _parseApostropheFieldName() + protected Name _parseAposName() throws IOException, JsonParseException { if (_inputPtr >= _inputEnd) { @@ -1809,7 +1811,7 @@ protected Name _parseApostropheFieldName() // Copied from parseEscapedFieldName, with minor mods: - final int[] codes = sInputCodesLatin1; + final int[] codes = _icLatin1; while (true) { if (ch == '\'') { @@ -2075,7 +2077,7 @@ protected void _finishString() throws IOException } int outPtr = 0; char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); - final int[] codes = sInputCodesUtf8; + final int[] codes = _icUTF8; final int max = Math.min(_inputEnd, (ptr + outBuf.length)); final byte[] inputBuffer = _inputBuffer; @@ -2102,7 +2104,7 @@ private void _finishString2(char[] outBuf, int outPtr) int c; // Here we do want to do full decoding, hence: - final int[] codes = sInputCodesUtf8; + final int[] codes = _icUTF8; final byte[] inputBuffer = _inputBuffer; main_loop: @@ -2190,7 +2192,7 @@ protected void _skipString() throws IOException _tokenIncomplete = false; // Need to be fully UTF-8 aware here: - final int[] codes = sInputCodesUtf8; + final int[] codes = _icUTF8; final byte[] inputBuffer = _inputBuffer; main_loop: @@ -2299,7 +2301,7 @@ protected JsonToken _handleApos() char[] outBuf = _textBuffer.emptyAndGetCurrentSegment(); // Here we do want to do full decoding, hence: - final int[] codes = sInputCodesUtf8; + final int[] codes = _icUTF8; final byte[] inputBuffer = _inputBuffer; main_loop: @@ -2476,10 +2478,10 @@ protected void _reportInvalidToken(String matchedPart, String msg) /* Internal methods, ws skipping, escape/unescape /********************************************************** */ - + private final int _skipWS() throws IOException { - final int[] codes = CharTypes.getInputCodeWS(); + final int[] codes = _icWS; while (_inputPtr < _inputEnd || loadMore()) { final int i = _inputBuffer[_inputPtr++] & 0xFF; switch (codes[i]) { @@ -2507,7 +2509,7 @@ private final int _skipWS() throws IOException _skipComment(); break; case '#': - if (!_skipHashComment()) { + if (!_skipYAMLComment()) { return i; } break; @@ -2524,7 +2526,7 @@ private final int _skipWS() throws IOException private int _skipWSOrEnd() throws IOException { - final int[] codes = CharTypes.getInputCodeWS(); + final int[] codes = _icWS; while ((_inputPtr < _inputEnd) || loadMore()) { final int i = _inputBuffer[_inputPtr++] & 0xFF; switch (codes[i]) { @@ -2552,7 +2554,7 @@ private int _skipWSOrEnd() throws IOException _skipComment(); break; case '#': - if (!_skipHashComment()) { + if (!_skipYAMLComment()) { return i; } break; @@ -2657,7 +2659,7 @@ private void _skipComment() throws IOException } int c = _inputBuffer[_inputPtr++] & 0xFF; if (c == '/') { - _skipCppComment(); + _skipLine(); } else if (c == '*') { _skipCComment(); } else { @@ -2711,7 +2713,20 @@ private void _skipCComment() throws IOException _reportInvalidEOF(" in a comment"); } - private void _skipCppComment() throws IOException + private boolean _skipYAMLComment() throws IOException + { + if (!isEnabled(Feature.ALLOW_YAML_COMMENTS)) { + return false; + } + _skipLine(); + return true; + } + + /** + * Method for skipping contents of an input line; usually for CPP + * and YAML style comments. + */ + private void _skipLine() throws IOException { // Ok: need to find EOF or linefeed final int[] codes = CharTypes.getInputCodeComment(); @@ -2739,49 +2754,13 @@ private void _skipCppComment() throws IOException _skipUtf8_4(i); break; default: // e.g. -1 - // Is this good enough error message? - _reportInvalidChar(i); - } - } - } - } - - protected boolean _skipHashComment() throws IOException - { - if (!isEnabled(Feature.ALLOW_YAML_COMMENTS)) { - return false; - } - // would plain UTF-8 work? - final int[] codes = CharTypes.getInputCodeComment(); - // Skip until line-feed - while ((_inputPtr < _inputEnd) || loadMore()) { - int i = (int) _inputBuffer[_inputPtr++] & 0xFF; - int code = codes[i]; - if (code != 0) { - switch (code) { - case '\n': - ++_currInputRow; - _currInputRowStart = _inputPtr; - return true; - case '\r': - _skipCR(); - return true; - case 2: // 2-byte UTF - _skipUtf8_2(i); - break; - case 3: // 3-byte UTF - _skipUtf8_3(i); - break; - case 4: // 4-byte UTF - _skipUtf8_4(i); - break; - default: // e.g. -1 - // Is this good enough error message? - _reportInvalidChar(i); + if (code < 0) { + // Is this good enough error message? + _reportInvalidChar(i); + } } } } - return true; } @Override