-
Notifications
You must be signed in to change notification settings - Fork 113
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adds support for macro-aware transcoding from binary to text. #1000
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
package com.amazon.ion | ||
|
||
import java.io.Closeable | ||
import java.io.IOException | ||
|
||
/** | ||
* An enhancement to an Ion reader that supports macro-aware transcoding. | ||
*/ | ||
interface MacroAwareIonReader : Closeable { | ||
|
||
/** | ||
* Performs a macro-aware transcode of the stream being read by this reader. | ||
* For Ion 1.0 streams, this functions similarly to providing a system-level | ||
* [IonReader] to [IonWriter.writeValues]. For Ion 1.1 streams, the transcoded | ||
* stream will include the same symbol tables, encoding directives, and | ||
* e-expression invocations as the source stream. In both cases, the | ||
* transcoded stream will be data-model equivalent to the source stream. | ||
* | ||
* The following limitations should be noted: | ||
* 1. Encoding directives with no effect on the encoding context may be | ||
* elided from the transcoded stream. An example would be an encoding | ||
* directive that re-exports the existing context but adds no new | ||
* macros or new symbols. | ||
* 2. When transcoding from text to text, comments will not be preserved. | ||
* 3. Open content in encoding directives (e.g. macro invocations that | ||
* expand to nothing) will not be preserved. | ||
* 4. Granular details of the binary encoding, like inlining vs. interning | ||
* for a particular symbol or length-prefixing vs. delimiting for a | ||
* particular container, may not be preserved. It is up to the user | ||
* to provide a writer configured to match these details if important. | ||
* | ||
* To get a [MacroAwareIonReader] use `_Private_IonReaderBuilder.buildMacroAware`. | ||
* To get a [MacroAwareIonWriter] use [IonEncodingVersion.textWriterBuilder] or | ||
* [IonEncodingVersion.binaryWriterBuilder]. | ||
* @param writer the writer to which the reader's stream will be transcoded. | ||
*/ | ||
@Throws(IOException::class) | ||
fun transcodeTo(writer: MacroAwareIonWriter) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,10 @@ | |
import com.amazon.ion.IonBufferConfiguration; | ||
import com.amazon.ion.IonCursor; | ||
import com.amazon.ion.IonException; | ||
import com.amazon.ion.IonReader; | ||
import com.amazon.ion.IonType; | ||
import com.amazon.ion.MacroAwareIonReader; | ||
import com.amazon.ion.MacroAwareIonWriter; | ||
import com.amazon.ion.SymbolTable; | ||
import com.amazon.ion.SymbolToken; | ||
import com.amazon.ion.Timestamp; | ||
|
@@ -21,6 +24,7 @@ | |
import com.amazon.ion.impl.macro.EncodingContext; | ||
import com.amazon.ion.impl.macro.Expression; | ||
import com.amazon.ion.impl.macro.EExpressionArgsReader; | ||
import com.amazon.ion.impl.macro.IonReaderFromReaderAdapter; | ||
import com.amazon.ion.impl.macro.Macro; | ||
import com.amazon.ion.impl.macro.MacroCompiler; | ||
import com.amazon.ion.impl.macro.MacroTable; | ||
|
@@ -32,6 +36,7 @@ | |
import com.amazon.ion.impl.macro.MacroRef; | ||
import com.amazon.ion.impl.macro.SystemMacro; | ||
|
||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.math.BigDecimal; | ||
import java.math.BigInteger; | ||
|
@@ -41,21 +46,22 @@ | |
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.Date; | ||
import java.util.HashMap; | ||
import java.util.Iterator; | ||
import java.util.LinkedHashMap; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.function.Consumer; | ||
|
||
import static com.amazon.ion.SystemSymbols.ION_ENCODING; | ||
import static com.amazon.ion.SystemSymbols.ION_SYMBOL_TABLE_SID; | ||
import static com.amazon.ion.impl.IonReaderContinuableApplicationBinary.SYMBOLS_LIST_INITIAL_CAPACITY; | ||
import static com.amazon.ion.impl.IonTypeID.SYSTEM_SYMBOL_VALUE; | ||
import static com.amazon.ion.impl.bin.Ion_1_1_Constants.*; | ||
|
||
/** | ||
* An IonCursor capable of raw parsing of binary Ion streams. | ||
*/ | ||
class IonReaderContinuableCoreBinary extends IonCursorBinary implements IonReaderContinuableCore { | ||
class IonReaderContinuableCoreBinary extends IonCursorBinary implements IonReaderContinuableCore, MacroAwareIonReader { | ||
|
||
// The UTF-8 bytes that represent the text "$ion_encoding" for quick byte-by-byte comparisons. | ||
private static final byte[] ION_ENCODING_UTF8 = ION_ENCODING.getBytes(StandardCharsets.UTF_8); | ||
|
@@ -132,6 +138,9 @@ class IonReaderContinuableCoreBinary extends IonCursorBinary implements IonReade | |
// Adapts this reader for use in code that supports multiple reader types. | ||
private final ReaderAdapter readerAdapter = new ReaderAdapterContinuable(this); | ||
|
||
// Adapts this reader for use in code that supports IonReader. | ||
private final IonReader asIonReader = new IonReaderFromReaderAdapter(readerAdapter); | ||
|
||
// Reads encoding directives from the stream. | ||
private final EncodingDirectiveReader encodingDirectiveReader = new EncodingDirectiveReader(); | ||
|
||
|
@@ -1105,7 +1114,7 @@ boolean startsWithIonEncoding() { | |
public String getSymbol(int sid) { | ||
// Only symbol IDs declared in Ion 1.1 encoding directives (not Ion 1.0 symbol tables) are resolved by the | ||
// core reader. In Ion 1.0, 'symbols' is never populated by the core reader. | ||
if (sid - 1 <= localSymbolMaxOffset) { | ||
if (sid > 0 && sid - 1 <= localSymbolMaxOffset) { | ||
return symbols[sid - 1]; | ||
} | ||
return null; | ||
|
@@ -1217,7 +1226,7 @@ private class EncodingDirectiveReader { | |
boolean isSymbolTableAppend = false; | ||
boolean isMacroTableAppend = false; | ||
List<String> newSymbols = new ArrayList<>(8); | ||
Map<MacroRef, Macro> newMacros = new HashMap<>(); | ||
Map<MacroRef, Macro> newMacros = new LinkedHashMap<>(); | ||
MacroCompiler macroCompiler = new MacroCompiler(this::resolveMacro, readerAdapter); | ||
|
||
boolean isSymbolTableAlreadyClassified = false; | ||
|
@@ -1490,6 +1499,10 @@ void readEncodingDirective() { | |
state = State.COMPILING_MACRO; | ||
Macro newMacro = macroCompiler.compileMacro(); | ||
newMacros.put(MacroRef.byId(++localMacroMaxOffset), newMacro); | ||
String macroName = macroCompiler.getMacroName(); | ||
if (macroName != null) { | ||
newMacros.put(MacroRef.byName(macroName), newMacro); | ||
} | ||
state = State.IN_MACRO_TABLE_SEXP; | ||
break; | ||
default: | ||
|
@@ -1733,6 +1746,33 @@ protected void stepOutOfEExpression() { | |
} | ||
} | ||
|
||
/** | ||
* @return true if current value has a sequence of annotations that begins with `$ion_symbol_table`; otherwise, | ||
* false. | ||
*/ | ||
protected boolean startsWithIonSymbolTable() { | ||
if (minorVersion == 0 && annotationSequenceMarker.startIndex >= 0) { | ||
long savedPeekIndex = peekIndex; | ||
peekIndex = annotationSequenceMarker.startIndex; | ||
int sid = readVarUInt_1_0(); | ||
peekIndex = savedPeekIndex; | ||
return ION_SYMBOL_TABLE_SID == sid; | ||
} else if (minorVersion == 1) { | ||
Marker marker = annotationTokenMarkers.get(0); | ||
return matchesSystemSymbol_1_1(marker, SystemSymbols_1_1.ION_SYMBOL_TABLE); | ||
} | ||
return false; | ||
} | ||
|
||
/** | ||
* @return true if the reader is positioned on a symbol table; otherwise, false. | ||
*/ | ||
protected boolean isPositionedOnSymbolTable() { | ||
return hasAnnotations && | ||
getEncodingType() == IonType.STRUCT && | ||
startsWithIonSymbolTable(); | ||
} | ||
|
||
/** | ||
* Consumes the next value (if any) from the MacroEvaluator, setting `event` based on the result. | ||
* @return true if evaluation of the current invocation has completed; otherwise, false. | ||
|
@@ -1758,6 +1798,97 @@ private boolean evaluateNext() { | |
return false; | ||
} | ||
|
||
@Override | ||
public void transcodeTo(MacroAwareIonWriter writer) throws IOException { | ||
registerIvmNotificationConsumer((major, minor) -> { | ||
resetEncodingContext(); | ||
// Which IVM to write is inherent to the writer implementation. | ||
// We don't have a single implementation that writes both formats. | ||
writer.startEncodingSegmentWithIonVersionMarker(); | ||
}); | ||
while (transcodeNextTo(writer) != Event.NEEDS_DATA); | ||
} | ||
|
||
/** | ||
* Transcodes the next value, and any encoding directives that may precede it, | ||
* to the given writer. | ||
* @param writer the writer to which the value will be transcoded. | ||
* @return the result of the operation. | ||
* @throws IOException if thrown during writing. | ||
*/ | ||
Event transcodeNextTo(MacroAwareIonWriter writer) throws IOException { | ||
jobarr-amzn marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// NOTE: this method is structured very similarly to nextValue(). During performance analysis, we should | ||
// see if the methods can be unified without sacrificing hot path performance. Performance of this method | ||
// is not considered critical. | ||
lobBytesRead = 0; | ||
while (true) { | ||
if (parent == null || state != State.READING_VALUE) { | ||
if (state != State.READING_VALUE && state != State.COMPILING_MACRO) { | ||
boolean isEncodingDirectiveFromEExpression = isEvaluatingEExpression; | ||
encodingDirectiveReader.readEncodingDirective(); | ||
if (state != State.READING_VALUE) { | ||
throw new IonException("Unexpected EOF when writing encoding-level value."); | ||
} | ||
// If the encoding directive was expanded from an e-expression, that expression has already been | ||
// written. In that case, just make sure the writer is using the new context. Otherwise, also write | ||
// the encoding directive. | ||
writer.startEncodingSegmentWithEncodingDirective( | ||
encodingDirectiveReader.newMacros, | ||
encodingDirectiveReader.isMacroTableAppend, | ||
encodingDirectiveReader.newSymbols, | ||
encodingDirectiveReader.isSymbolTableAppend, | ||
isEncodingDirectiveFromEExpression | ||
); | ||
} | ||
if (isEvaluatingEExpression) { | ||
if (evaluateNext()) { | ||
continue; | ||
} | ||
} else { | ||
event = super.nextValue(); | ||
} | ||
if (minorVersion == 1 && parent == null && isPositionedOnEncodingDirective()) { | ||
encodingDirectiveReader.resetState(); | ||
state = State.ON_ION_ENCODING_SEXP; | ||
continue; | ||
} | ||
} else if (isEvaluatingEExpression) { | ||
if (evaluateNext()) { | ||
continue; | ||
} | ||
} else { | ||
event = super.nextValue(); | ||
} | ||
if (valueTid != null && valueTid.isMacroInvocation) { | ||
expressionArgsReader.beginEvaluatingMacroInvocation(macroEvaluator); | ||
macroEvaluatorIonReader.transcodeArgumentsTo(writer); | ||
Comment on lines
+1863
to
+1864
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a full materialization of the e-expression arguments followed by a re-write. If we change to lazy evaluation of e-expression arguments, this will need to change. |
||
isEvaluatingEExpression = true; | ||
if (evaluateNext()) { | ||
continue; | ||
} | ||
if (parent == null && isPositionedOnEvaluatedEncodingDirective()) { | ||
encodingDirectiveReader.resetState(); | ||
state = State.ON_ION_ENCODING_SEXP; | ||
continue; | ||
} | ||
} | ||
if (isEvaluatingEExpression) { | ||
// EExpressions are not expanded and provided to the writer; only the raw encoding is transferred. | ||
continue; | ||
} | ||
break; | ||
} | ||
if (event != Event.NEEDS_DATA) { | ||
if (minorVersion > 0 && isPositionedOnSymbolTable()) { | ||
// TODO finalize handling of Ion 1.0-style symbol tables in Ion 1.1: https://github.com/amazon-ion/ion-java/issues/1002 | ||
throw new IonException("Macro-aware transcoding of Ion 1.1 data containing Ion 1.0-style symbol tables not yet supported."); | ||
} | ||
// The reader is now positioned on an actual encoding value. Write the value. | ||
writer.writeValue(asIonReader); | ||
} | ||
return event; | ||
} | ||
|
||
@Override | ||
public Event nextValue() { | ||
lobBytesRead = 0; | ||
|
@@ -2511,7 +2642,7 @@ IntList getAnnotationSidList() { | |
* @return a SymbolToken. | ||
*/ | ||
protected SymbolToken getSymbolToken(int sid) { | ||
return new SymbolTokenImpl(sid); | ||
return new SymbolTokenImpl(getSymbol(sid), sid); | ||
} | ||
|
||
protected final SymbolToken getSystemSymbolToken(Marker marker) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,13 @@ | ||
// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package com.amazon.ion.impl; | ||
|
||
import com.amazon.ion.IonCatalog; | ||
import com.amazon.ion.IonException; | ||
import com.amazon.ion.IonReader; | ||
import com.amazon.ion.IonTextReader; | ||
import com.amazon.ion.IonValue; | ||
import com.amazon.ion.MacroAwareIonReader; | ||
import com.amazon.ion.system.IonReaderBuilder; | ||
import com.amazon.ion.util.IonStreamUtils; | ||
|
||
|
@@ -352,4 +352,16 @@ public IonTextReader build(String ionText) { | |
return makeReaderText(validateCatalog(), ionText, lstFactory); | ||
} | ||
|
||
/** | ||
* Creates a new {@link MacroAwareIonReader} over the given data. | ||
* @param ionData the data to read. | ||
* @return a new MacroAwareIonReader instance. | ||
*/ | ||
public MacroAwareIonReader buildMacroAware(byte[] ionData) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: this is "hidden" in |
||
// TODO make this work for text too. | ||
if (!IonStreamUtils.isIonBinary(ionData)) { | ||
throw new UnsupportedOperationException("MacroAwareIonReader is not yet implemented for text data."); | ||
} | ||
return new IonReaderContinuableCoreBinary(getBufferConfiguration(), ionData, 0, ionData.length); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These are technical limitations, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could code around most of these, but it's generally not worth the effort.