From a626a8eabe359b9e5ff3b921c2b55cd951eeb82b Mon Sep 17 00:00:00 2001 From: Tyler Gregg Date: Mon, 11 Nov 2024 12:53:59 -0800 Subject: [PATCH] Adds a Macroize tool to enable conversion of Ion 1.0 data to Ion 1.1 using specified macros and text patterns. --- .../com/amazon/ion/IonEncodingVersion.java | 4 +- .../apps/macroize/InvocationSubstitute.java | 178 ++++++ .../amazon/ion/apps/macroize/Macroize.java | 599 ++++++++++++++++++ .../apps/macroize/MacroizeMacroMatcher.java | 41 ++ .../ion/apps/macroize/MacroizeSpec.java | 184 ++++++ .../apps/macroize/ManualEncodingContext.java | 200 ++++++ .../ion/apps/macroize/PrefixTextPattern.java | 52 ++ .../apps/macroize/SubstringTextPattern.java | 62 ++ .../ion/apps/macroize/SuggestedSignature.java | 58 ++ .../amazon/ion/apps/macroize/TextPattern.java | 26 + .../ion/apps/macroize/ThrowingProcedure.java | 10 + .../ion/apps/macroize/ThrowingSupplier.java | 10 + .../apps/macroize/VerbatimTextPattern.java | 44 ++ .../com/amazon/ion/impl/bin/WriteBuffer.java | 2 +- .../amazon/ion/impl/lite/IonDatagramLite.java | 9 +- .../amazon/ion/impl/macro/MacroMatcher.java | 257 ++++++++ .../java/com/amazon/ion/DatagramTest.java | 23 +- .../ion/apps/macroize/MacroizeTest.java | 75 +++ 18 files changed, 1811 insertions(+), 23 deletions(-) create mode 100644 src/main/java/com/amazon/ion/apps/macroize/InvocationSubstitute.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/Macroize.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/MacroizeMacroMatcher.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/MacroizeSpec.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/ManualEncodingContext.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/PrefixTextPattern.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/SubstringTextPattern.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/SuggestedSignature.java create mode 100644 
src/main/java/com/amazon/ion/apps/macroize/TextPattern.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/ThrowingProcedure.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/ThrowingSupplier.java create mode 100644 src/main/java/com/amazon/ion/apps/macroize/VerbatimTextPattern.java create mode 100644 src/main/java/com/amazon/ion/impl/macro/MacroMatcher.java create mode 100644 src/test/java/com/amazon/ion/apps/macroize/MacroizeTest.java diff --git a/src/main/java/com/amazon/ion/IonEncodingVersion.java b/src/main/java/com/amazon/ion/IonEncodingVersion.java index cc064915ec..da7801824a 100644 --- a/src/main/java/com/amazon/ion/IonEncodingVersion.java +++ b/src/main/java/com/amazon/ion/IonEncodingVersion.java @@ -24,7 +24,7 @@ public abstract class IonEncodingVersion * Ion 1.0, see the binary and * text specification. */ - public static IonEncodingVersion ION_1_0 = new IonEncodingVersion(0) { + public static final IonEncodingVersion ION_1_0 = new IonEncodingVersion(0) { @Override public IonBinaryWriterBuilder binaryWriterBuilder() { @@ -40,7 +40,7 @@ public IonTextWriterBuilder textWriterBuilder() { /** * Ion 1.1, TODO link to the finalized specification. */ - public static IonEncodingVersion ION_1_1 = new IonEncodingVersion(1) { + public static final IonEncodingVersion ION_1_1 = new IonEncodingVersion(1) { @Override public IonBinaryWriterBuilder_1_1 binaryWriterBuilder() { diff --git a/src/main/java/com/amazon/ion/apps/macroize/InvocationSubstitute.java b/src/main/java/com/amazon/ion/apps/macroize/InvocationSubstitute.java new file mode 100644 index 0000000000..84a2b6143b --- /dev/null +++ b/src/main/java/com/amazon/ion/apps/macroize/InvocationSubstitute.java @@ -0,0 +1,178 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0
+package com.amazon.ion.apps.macroize;
+
+import com.amazon.ion.IonContainer;
+import com.amazon.ion.IonSequence;
+import com.amazon.ion.IonSexp;
+import com.amazon.ion.IonStruct;
+import com.amazon.ion.IonSystem;
+import com.amazon.ion.IonValue;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Describes a macro invocation that should be substituted into a datagram in place of the literal value that is
+ * currently there.
+ * TODO this is needed because we currently don't have a way of describing a macro invocation in the DOM. If that
+ * changes, this may go away.
+ */
+class InvocationSubstitute {
+
+    /** Annotation placed on the s-expression that stands in for a macro invocation. */
+    static final String INVOCATION_ANNOTATION = "$ion_invocation";
+    /** Annotation placed on the empty s-expression that stands in for an empty expression group. */
+    static final String EMPTY_GROUP_ANNOTATION = "$ion_empty";
+    // 'parent' and 'indexToReplace' are not final: substitute() re-points them on deeper substitutes whose
+    // original parent has itself been replaced by an invocation.
+    private IonContainer parent;
+    private int indexToReplace;
+    private final String fieldNameToReplace;
+    private final String shapeName;
+    private final List<IonValue> parameters;
+    private final SuggestedSignature signature;
+    private final IonSystem system;
+
+    /**
+     * @param system the IonSystem that owns the parent container.
+     * @param parent the parent container that holds the value to be replaced with an invocation.
+     * @param indexToReplace the index in the parent of the value to be replaced.
+     * @param fieldNameToReplace the field name of the value to be replaced, if in a struct.
+     * @param shapeName the name of the macro to invoke.
+     * @param signature the signature of the macro to invoke.
+     * @throws IllegalArgumentException if the target value cannot be found, is not a struct, or if no signature
+     *                                  is provided.
+     */
+    InvocationSubstitute(
+        IonSystem system,
+        IonContainer parent,
+        int indexToReplace,
+        String fieldNameToReplace,
+        String shapeName,
+        SuggestedSignature signature
+    ) {
+        // 'system' and 'shapeName' must be assigned before extractArguments() runs, because that method reads them.
+        this.system = system;
+        this.parent = parent;
+        this.indexToReplace = indexToReplace;
+        this.fieldNameToReplace = fieldNameToReplace;
+        this.shapeName = shapeName;
+        this.parameters = extractArguments(parent, indexToReplace, fieldNameToReplace, signature);
+        this.signature = signature;
+    }
+
+    /**
+     * Retrieves the IonValue to be replaced with an invocation.
+     * @param parent the parent container of the value to replace.
+     * @param indexToReplace the index in the parent of the value to be replaced.
+     * @param fieldNameToReplace the field name of the value to be replaced, if in a struct.
+     * @return the selected value, or null if the parent has no child at the given index (or, for a struct parent,
+     *         no field with the given name).
+     */
+    private static IonValue select(IonContainer parent, int indexToReplace, String fieldNameToReplace) {
+        IonValue target = null;
+        if (fieldNameToReplace == null || !(parent instanceof IonStruct)) {
+            // Select by position. This path is also taken when a field name is known but the parent is no longer
+            // a struct (i.e. it has already been replaced by an invocation s-expression).
+            Iterator<IonValue> children = parent.iterator();
+            int index = 0;
+            while (index <= indexToReplace) {
+                index++;
+                if (!children.hasNext()) {
+                    return null;
+                }
+                target = children.next();
+            }
+        } else {
+            target = ((IonStruct) parent).get(fieldNameToReplace);
+        }
+        return target;
+    }
+
+    /**
+     * @return an IonSexp that is used to represent an empty expression group.
+     */
+    private IonSexp emptyExpressionGroup() {
+        IonSexp empty = system.newEmptySexp();
+        empty.addTypeAnnotation(EMPTY_GROUP_ANNOTATION);
+        return empty;
+    }
+
+    /**
+     * Extracts the values from the source data that must be passed into the invocation that will replace the current
+     * value.
+     * @param parent the parent container of the value to replace.
+     * @param indexToReplace the index in the parent of the value to be replaced.
+     * @param fieldNameToReplace the field name of the value to be replaced, if in a struct.
+     * @param signature the signature of the invocation.
+     * @return the list of arguments.
+     * @throws IllegalArgumentException if the target value cannot be found, is not a struct, or if 'signature' is null.
+     */
+    private List<IonValue> extractArguments(
+        IonContainer parent,
+        int indexToReplace,
+        String fieldNameToReplace,
+        SuggestedSignature signature
+    ) {
+        if (signature == null) {
+            throw new IllegalArgumentException("No signature available for macro " + shapeName);
+        }
+        IonValue target = select(parent, indexToReplace, fieldNameToReplace);
+        if (target == null) {
+            throw new IllegalArgumentException("Failed to extract parameters for " + fieldNameToReplace);
+        }
+        if (!(target instanceof IonStruct)) {
+            // Arguments are looked up by field name below, so only structs can be turned into invocations.
+            throw new IllegalArgumentException(
+                "Cannot extract parameters for " + fieldNameToReplace + " from a value of type " + target.getType()
+            );
+        }
+        IonStruct targetStruct = (IonStruct) target;
+        List<IonValue> parameters = new ArrayList<>();
+        for (String argument : signature.allParameters()) {
+            IonValue parameter = targetStruct.get(argument);
+            if (parameter == null) {
+                // This is a missing optional
+                parameters.add(emptyExpressionGroup());
+            } else {
+                parameters.add(parameter);
+            }
+        }
+        // Remove all the optionals that occur contiguously at the end of the invocation.
+        int tailOptionalCount = 0;
+        for (int i = parameters.size() - 1; i >= 0; i--) {
+            String[] annotations = parameters.get(i).getTypeAnnotations();
+            if (annotations.length == 1 && annotations[0].equals(EMPTY_GROUP_ANNOTATION)) {
+                tailOptionalCount++;
+            } else {
+                break;
+            }
+        }
+        if (tailOptionalCount > 0) {
+            parameters = parameters.subList(0, parameters.size() - tailOptionalCount);
+        }
+        return parameters;
+    }
+
+    /**
+     * Substitutes the target value with an invocation.
+     * @param nextDepthSubstitutes the substitutes at the next-greater depth. If the target values of those substitutes
+     *                             were children of the value substituted in this method, then their parent and index
+     *                             to replace must be updated to point at the new invocation. May be null.
+     */
+    public void substitute(List<InvocationSubstitute> nextDepthSubstitutes) {
+        IonValue target = select(parent, indexToReplace, fieldNameToReplace);
+        String fieldName = target == null ? null : target.getFieldName();
+        IonSexp invocation = system.newEmptySexp();
+        invocation.addTypeAnnotation(INVOCATION_ANNOTATION);
+        invocation.add(system.newSymbol(shapeName));
+        for (IonValue value : parameters) {
+            // A value can only be added to the invocation once it has been detached from its current container.
+            value.removeFromContainer();
+            invocation.add(value);
+        }
+        IonValue replaced;
+        if (fieldName == null) {
+            IonSequence parentSequence = ((IonSequence) parent);
+            if (indexToReplace >= parentSequence.size()) {
+                parentSequence.add(invocation);
+                replaced = null;
+            } else {
+                replaced = parentSequence.set(indexToReplace, invocation);
+            }
+        } else {
+            replaced = ((IonStruct) parent).get(fieldName);
+            ((IonStruct) parent).put(fieldName, invocation);
+        }
+
+        if (nextDepthSubstitutes != null) {
+            for (InvocationSubstitute nextDepthSubstitute : nextDepthSubstitutes) {
+                if (nextDepthSubstitute.parent == replaced) {
+                    nextDepthSubstitute.parent = invocation;
+                    // The first index of an invocation starts at 1, since the macro name comes first.
+                    nextDepthSubstitute.indexToReplace = signature.indexOf(nextDepthSubstitute.shapeName) + 1;
+                }
+            }
+        }
+    }
+}
diff --git a/src/main/java/com/amazon/ion/apps/macroize/Macroize.java b/src/main/java/com/amazon/ion/apps/macroize/Macroize.java
new file mode 100644
index 0000000000..28d5d3cfba
--- /dev/null
+++ b/src/main/java/com/amazon/ion/apps/macroize/Macroize.java
@@ -0,0 +1,599 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0 +package com.amazon.ion.apps.macroize; + +import com.amazon.ion.IonContainer; +import com.amazon.ion.IonDatagram; +import com.amazon.ion.IonException; +import com.amazon.ion.IonReader; +import com.amazon.ion.IonSystem; +import com.amazon.ion.IonType; +import com.amazon.ion.IonValue; +import com.amazon.ion.IonWriter; +import com.amazon.ion.SymbolToken; +import com.amazon.ion.Timestamp; +import com.amazon.ion.impl.BufferedOutputStreamFastAppendable; +import com.amazon.ion.impl.IonRawTextWriter_1_1; +import com.amazon.ion.impl.IonRawWriter_1_1; +import com.amazon.ion.impl._Private_IonTextAppender; +import com.amazon.ion.impl._Private_IonTextWriterBuilder_1_1; +import com.amazon.ion.impl.bin.BlockAllocatorProviders; +import com.amazon.ion.impl.bin.IonRawBinaryWriter_1_1; +import com.amazon.ion.impl.bin.WriteBuffer; +import com.amazon.ion.system.IonReaderBuilder; +import com.amazon.ion.system.IonSystemBuilder; +import com.amazon.ion.system.IonTextWriterBuilder; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.math.BigDecimal; +import java.math.BigInteger; +import java.math.MathContext; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Re-writes a stream of Ion data to the Ion 1.1 equivalent, leveraging Ion 1.1 macros. 
+ */
+public class Macroize {
+
+    private static final IonSystem SYSTEM = IonSystemBuilder.standard().build();
+
+    /**
+     * Command-line entry point. Parses the options, derives the temporary and output file locations from the
+     * input file name, and runs the conversion.
+     * @param args the command-line arguments: {@code --spec <specFile> [--format <binary|text>] <inputFile>}.
+     * @throws IOException if thrown during the conversion.
+     * @throws IllegalArgumentException if the arguments are malformed or a file cannot be read.
+     */
+    public static void main(String[] args) throws IOException {
+        // TODO replace argument handling with a library like pico CLI
+        String specFile = null;
+        String inputFile = null;
+        boolean outputBinary = false;
+        for (int i = 0; i < args.length; i++) {
+            switch (args[i]) {
+                case "--spec":
+                    specFile = requireOptionValue(args, ++i);
+                    break;
+                case "--format": {
+                    final String format = requireOptionValue(args, ++i);
+                    switch (format) {
+                        case "binary":
+                            outputBinary = true;
+                            break;
+                        case "text":
+                            outputBinary = false;
+                            break;
+                        default:
+                            throw new IllegalArgumentException("Unrecognized format: " + format);
+                    }
+                    break;
+                }
+                case "--help":
+                case "-h":
+                    System.out.println("IonJava Macroize Tool v0.1");
+                    System.out.println("Usage:\n--spec <specFile> [--format <binary|text>] <inputFile>");
+                    System.exit(0);
+                    break;
+                default:
+                    if (i != args.length - 1) {
+                        throw new IllegalArgumentException("Unrecognized option: " + args[i]);
+                    }
+                    // This is the final argument; it must be the input file name.
+                    inputFile = args[i];
+                    break;
+            }
+        }
+        if (specFile == null) {
+            throw new IllegalArgumentException("Expected a spec file to be provided via the --spec option.");
+        }
+        if (inputFile == null) {
+            // Without this check, the value of the last option (e.g. the spec file) would be used as the input.
+            throw new IllegalArgumentException("Expected an input file to be provided as the final argument.");
+        }
+
+        Path inputPath = checkPath(inputFile);
+        String outputFileSuffix = outputBinary ? ".10n" : ".ion";
+        String inputName = inputPath.toFile().getName();
+        int dotIndex = inputName.lastIndexOf('.');
+        String inputNameWithoutSuffix = (dotIndex < 0) ? inputName : inputName.substring(0, dotIndex);
+        Path specPath = checkPath(specFile);
+
+        // The intermediate files are temporary; only the converted file is kept, next to the input file.
+        Path invocationsPath = Files.createTempFile(inputNameWithoutSuffix + "-invocations", ".ion");
+        invocationsPath.toFile().deleteOnExit();
+        Path headlessPath = Files.createTempFile(inputNameWithoutSuffix + "-headless-1-1", outputFileSuffix);
+        headlessPath.toFile().deleteOnExit();
+        Path parentDirectory = inputPath.toAbsolutePath().getParent();
+        if (parentDirectory == null) {
+            throw new IllegalArgumentException("Invalid input path: " + inputPath);
+        }
+        Path convertedPath = parentDirectory.resolve(inputNameWithoutSuffix + "-1-1" + outputFileSuffix);
+
+        macroize(
+            () -> IonReaderBuilder.standard().build(Files.newInputStream(inputPath)),
+            () -> IonTextWriterBuilder.standard().build(Files.newOutputStream(invocationsPath)),
+            () -> IonReaderBuilder.standard().build(Files.newInputStream(invocationsPath)),
+            () -> Files.newOutputStream(headlessPath),
+            () -> Files.newOutputStream(convertedPath),
+            () -> appendCopy(headlessPath, convertedPath),
+            () -> IonReaderBuilder.standard().build(Files.newInputStream(specPath)),
+            outputBinary,
+            System.out
+        );
+        System.out.println("Ion 1.1 file written to: " + convertedPath.toAbsolutePath());
+    }
+
+    /**
+     * Returns the value that follows a command-line option.
+     * @param args the command-line arguments.
+     * @param valueIndex the index at which the option's value is expected.
+     * @return the option's value.
+     * @throws IllegalArgumentException if the option is the last argument and therefore has no value.
+     */
+    private static String requireOptionValue(String[] args, int valueIndex) {
+        if (valueIndex >= args.length) {
+            throw new IllegalArgumentException("Expected a value after " + args[valueIndex - 1]);
+        }
+        return args[valueIndex];
+    }
+
+    /**
+     * Re-writes a stream of Ion data to the Ion 1.1 equivalent, leveraging Ion 1.1 macros.
+     * @param inputReaderSupplier supplies an IonReader over the input data.
+     * @param invocationsWriterSupplier supplies an IonWriter to write a description of where macro invocations should be substituted into the stream.
+     * @param invocationsReaderSupplier supplies an IonReader over the macro invocation description stream.
+     * @param headlessOutputSupplier supplies an OutputStream to which the body of the converted stream will be written (i.e., without a preceding encoding context).
+     * @param fullOutputSupplier supplies an OutputStream to which the entire converted stream (including encoding context) will be written.
+     * @param assembleFullOutput the procedure for appending the headless stream to the end of the stream containing the encoding context, creating the full output.
+     * @param specReaderSupplier supplies an IonReader over the spec file that informs the conversion.
+     * @param outputBinary true if the stream will be converted to binary Ion 1.1; false if it will be converted to text Ion 1.1.
+     * @param log an appendable log of any messages produced during the conversion, such as statistics and status.
+     * @throws IOException if thrown during the conversion.
+     */
+    static void macroize(
+        ThrowingSupplier<IonReader> inputReaderSupplier,
+        ThrowingSupplier<IonWriter> invocationsWriterSupplier,
+        ThrowingSupplier<IonReader> invocationsReaderSupplier,
+        ThrowingSupplier<OutputStream> headlessOutputSupplier,
+        ThrowingSupplier<OutputStream> fullOutputSupplier,
+        ThrowingProcedure assembleFullOutput,
+        ThrowingSupplier<IonReader> specReaderSupplier,
+        boolean outputBinary,
+        Appendable log
+    ) throws IOException {
+        // Read the input data into memory.
+        IonDatagram source;
+        try (IonReader reader = inputReaderSupplier.get()) {
+            source = SYSTEM.getLoader().load(reader);
+        }
+
+        // Prepare the context and the spec to be used during the conversion.
+        ManualEncodingContext context = new ManualEncodingContext();
+        MacroizeSpec spec = new MacroizeSpec();
+        try (IonReader reader = specReaderSupplier.get()) {
+            spec.readSpec(reader, context);
+        }
+
+        // Using the spec, produce a marked up text Ion 1.0 representation of the input that
+        // indicates which structs should be replaced with macro invocations.
+        try (IonWriter writer = invocationsWriterSupplier.get()) {
+            writeMacroMatchesUsingMarkedUpIon10(writer, source, spec, log);
+        }
+
+        // Go through the marked up invocations and re-write to Ion 1.1, intercepting the special marked up
+        // Ion 1.0 values and replacing them with proper Ion 1.1 e-expressions.
+        log.append("\n\nConverting to 1.1\n");
+        // The headless body is written first because writing it populates 'context' with the symbols that the
+        // encoding context written below must declare.
+        IonRawWriter_1_1 writer = newRawWriter_1_1(headlessOutputSupplier.get(), outputBinary);
+        try (IonReader reader = invocationsReaderSupplier.get()) {
+            while (reader.next() != null) {
+                replaceMatchesWithInvocations(reader, writer, context, outputBinary, spec.textPatterns);
+            }
+        } finally {
+            writer.close();
+        }
+
+        // Write the symbol and macro tables
+        IonRawWriter_1_1 symbolTableWriter = newRawWriter_1_1(fullOutputSupplier.get(), outputBinary);
+        try {
+            symbolTableWriter.writeIVM();
+            context.writeTo(symbolTableWriter);
+        } finally {
+            symbolTableWriter.close();
+        }
+        // Now, copy the headless Ion 1.1 data to the end.
+        assembleFullOutput.execute();
+        log.append("\nDone.\n");
+    }
+
+    /**
+     * Substitute value literals that match any of the specified macros with invocation instructions, represented using
+     * annotated Ion 1.0 s-expressions of the form `$ion_invocation::(name_of_macro arguments...)`. This intermediate
+     * form is used to make it possible to mutate the existing IonValue structure, which does not support modeling
+     * macro invocations. If this is supported in the future, this can likely be simplified.
+     * @param writer the writer.
+     * @param source the source data.
+     * @param spec the spec containing the macros to match.
+     * @param log an appendable log.
+     * @throws IOException if thrown during writing.
+     */
+    private static void writeMacroMatchesUsingMarkedUpIon10(
+        IonWriter writer,
+        IonDatagram source,
+        MacroizeSpec spec,
+        Appendable log
+    ) throws IOException {
+        Map<String, SuggestedSignature> suggestedSignatures = spec.matchMacros(source, log);
+        for (int topLevelValueIndex = 0; topLevelValueIndex < source.size(); topLevelValueIndex++) {
+            IonValue topLevelValue = source.get(topLevelValueIndex);
+            if (!IonType.isContainer(topLevelValue.getType())) {
+                // Top-level scalars are copied through without being evaluated against the macro matchers.
+                topLevelValue.writeTo(writer);
+                continue;
+            }
+            // key: depth, value: invocations at that depth
+            Map<Integer, List<InvocationSubstitute>> invocationSubstitutes = new HashMap<>();
+            // NOTE(review): unlike matchMacrosRecursive, which does not descend into a child that matched a custom
+            // matcher, the children of a matched top-level value are still visited here. Confirm this is intended.
+            findMatch(topLevelValue, source, topLevelValueIndex, spec.customMatchers, suggestedSignatures, invocationSubstitutes, 0);
+            matchMacrosRecursive((IonContainer) topLevelValue, spec.customMatchers, suggestedSignatures, invocationSubstitutes, 1);
+            // Iterate over all invocation matches, sorted by depth from shallowest to deepest.
+            for (
+                Map.Entry<Integer, List<InvocationSubstitute>> substitutesByDepth
+                    : invocationSubstitutes.entrySet().stream().sorted(Map.Entry.comparingByKey()).collect(Collectors.toList())
+            ) {
+                int depth = substitutesByDepth.getKey();
+                for (InvocationSubstitute substitute : substitutesByDepth.getValue()) {
+                    // The substitutes one level deeper are passed along so that they can be re-pointed at the new
+                    // invocation if their parent is the value being replaced.
+                    substitute.substitute(invocationSubstitutes.get(depth + 1));
+                }
+                // 'topLevelValue' has been replaced with an invocation; update it with the replacement before writing.
+                if (depth == 0) {
+                    topLevelValue = source.get(topLevelValueIndex);
+                }
+            }
+            topLevelValue.writeTo(writer);
+        }
+    }
+
+    /**
+     * Attempts to match the given value with any of the given macro matchers.
+     * @param value the value to attempt to match.
+     * @param parent the value's parent container (which may be an IonDatagram if 'value' is at the top level).
+     * @param containerIndex the index of 'value' within 'parent'.
+     * @param customMacroMatchers the macro matchers to evaluate.
+     * @param suggestedSignatures the macro signatures available.
+     * @param substituteInvocations receives the invocation substitutes identified for this value, organized by depth.
+     * @param depth the depth at which the given container resides.
+     * @return true if a match was found.
+     */
+    private static boolean findMatch(
+        IonValue value,
+        IonContainer parent,
+        int containerIndex,
+        List<MacroizeMacroMatcher> customMacroMatchers,
+        Map<String, SuggestedSignature> suggestedSignatures,
+        Map<Integer, List<InvocationSubstitute>> substituteInvocations,
+        int depth
+    ) {
+        // TODO efficiency is not a main concern for the first release of this tool, but if it does become
+        // important, then it should be considered how the following might be optimized. Currently every value
+        // at every depth must be compared against all macro matchers.
+        for (MacroizeMacroMatcher customMacroMatcher : customMacroMatchers) {
+            if (customMacroMatcher.match(value)) {
+                // The first matcher that matches wins; later matchers are not consulted.
+                String name = customMacroMatcher.name();
+                InvocationSubstitute substitute = new InvocationSubstitute(SYSTEM, parent, containerIndex, value.getFieldName(), name, suggestedSignatures.get(name));
+                substituteInvocations.computeIfAbsent(depth, k -> new ArrayList<>()).add(substitute);
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Recursively visits the given container, evaluating it against the possible macro matches.
+     * @param container a container.
+     * @param customMacroMatchers the macro matchers to evaluate.
+     * @param suggestedSignatures the macro signatures available.
+     * @param substituteInvocations receives the invocation substitutes identified for this value, organized by depth.
+     * @param depth the depth at which the given container resides.
+     * @return the name of the macro that this container matched, or null if there was no match.
+     */
+    private static String matchMacrosRecursive(
+        IonContainer container,
+        List<MacroizeMacroMatcher> customMacroMatchers,
+        Map<String, SuggestedSignature> suggestedSignatures,
+        Map<Integer, List<InvocationSubstitute>> substituteInvocations,
+        int depth
+    ) {
+        Iterator<IonValue> children = container.iterator();
+        int containerIndex = 0;
+        // The field names of the children that did not match a custom matcher, in encounter order. Only
+        // populated when 'container' is a struct.
+        Set<String> childFields = new LinkedHashSet<>();
+        while (children.hasNext()) {
+            IonValue child = children.next();
+            if (findMatch(child, container, containerIndex, customMacroMatchers, suggestedSignatures, substituteInvocations, depth)) {
+                // A custom matcher was matched; don't descend further.
+                containerIndex++;
+                continue;
+            }
+            if (container.getType() == IonType.STRUCT) {
+                childFields.add(child.getFieldName());
+            }
+            switch (child.getType()) {
+                case STRUCT:
+                case LIST:
+                case SEXP:
+                    String shapeName = matchMacrosRecursive((IonContainer) child, customMacroMatchers, suggestedSignatures, substituteInvocations, depth + 1);
+                    if (shapeName != null) {
+                        // The child matched a suggested signature; record it as a substitute at this depth.
+                        InvocationSubstitute substitute = new InvocationSubstitute(SYSTEM, container, containerIndex, child.getFieldName(), shapeName, suggestedSignatures.get(shapeName));
+                        substituteInvocations.computeIfAbsent(depth, k -> new ArrayList<>()).add(substitute);
+                    }
+                    break;
+                default:
+                    break;
+            }
+            containerIndex++;
+        }
+        String shapeName = getNameOfShape(container);
+        if (shapeName == null) {
+            return null;
+        }
+        SuggestedSignature suggestedSignature = suggestedSignatures.get(shapeName);
+        if (suggestedSignature != null && suggestedSignature.isCompatible(childFields)) {
+            // Only structs are reported as matches; the caller creates the substitute for them.
+            if (container.getType() == IonType.STRUCT) {
+                return shapeName;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Iterates through a stream that may contain macro invocation markup (e.g.
+     * `$ion_invocation::(name_of_macro arguments...)`), replacing these special marked up s-expressions with
+     * actual Ion 1.1 e-expressions.
+     * TODO the structure of this method is copied from `AbstractIonWriter.writeValueRecursive`, though several changes
+     * were made to fit this purpose. Ideally the code could be shared somehow.
+     * @param reader the reader over the marked-up Ion 1.0 stream.
+     * @param writer an Ion 1.1 raw writer.
+     * @param context the encoding context, containing the symbols and macros that will be used in the Ion 1.1 stream.
+     * @param isBinary true if the output encoding is binary; false if it is text.
+     * @param textPatterns the text patterns to match and replace when writing.
+     */
+    private static void replaceMatchesWithInvocations(
+        IonReader reader,
+        IonRawWriter_1_1 writer,
+        ManualEncodingContext context,
+        boolean isBinary,
+        List<TextPattern> textPatterns
+    ) {
+        // Traversal progress is tracked using the writer's depth: every reader.stepIn()/stepOut() below is paired
+        // with a step in/out on the writer. We take note of the initial depth so we can avoid advancing the
+        // IonReader beyond the starting value.
+        int startingDepth = writer.depth();
+
+        // The writer will be at `startingDepth` when the function is first called and then again when we
+        // have finished traversing all of the starting value's children. This boolean tracks which of those two
+        // states we are in when `writer.depth() == startingDepth`.
+        boolean alreadyProcessedTheStartingValue = false;
+
+        // The IonType of the IonReader's current value.
+        IonType type;
+
+        while (true) {
+            // Each time we reach the top of the loop we are in one of three states:
+            //   1. We have not yet begun processing the starting value.
+            //   2. We are currently traversing the starting value's children.
+            //   3. We have finished processing the starting value.
+            if (writer.depth() == startingDepth) {
+                // We are at the starting depth. We're either beginning our traversal or finishing it.
+                if (alreadyProcessedTheStartingValue) {
+                    // We're finishing our traversal.
+                    break;
+                }
+                // We're beginning our traversal. Don't advance the cursor; instead, use the current
+                // value's IonType.
+                type = reader.getType();
+                // We've begun processing the starting value.
+                alreadyProcessedTheStartingValue = true;
+            } else {
+                // We're traversing the starting value's children (that is: values at greater depths). We need to
+                // advance the cursor by calling next().
+                type = reader.next();
+            }
+
+            if (type == null) {
+                // There are no more values at this level. If we're at the starting level, we're done.
+                if (writer.depth() == startingDepth) {
+                    break;
+                }
+                // Otherwise, step out once and then try to move forward again.
+                reader.stepOut();
+                writer.stepOut();
+                continue;
+            }
+
+            final SymbolToken fieldName = reader.getFieldNameSymbol();
+            if (fieldName != null && !writer._private_hasFieldName() && writer.isInStruct()) {
+                // TODO apply text patterns to field names
+                writer.writeFieldName(context.internSymbol(fieldName.getText()));
+            }
+            if (fieldName == null && writer.isInStruct()) {
+                throw new IonException("Missing field name");
+            }
+            final SymbolToken[] annotations = reader.getTypeAnnotationSymbols();
+            boolean isEexp = false;
+            boolean isEmptyExpressionGroup = false;
+            // The markup annotations are compared constant-first because a SymbolToken's text may be null.
+            if (annotations.length == 1 && InvocationSubstitute.INVOCATION_ANNOTATION.equals(annotations[0].getText())) {
+                isEexp = true;
+            } else if (annotations.length == 1 && InvocationSubstitute.EMPTY_GROUP_ANNOTATION.equals(annotations[0].getText())) {
+                isEmptyExpressionGroup = true;
+            } else {
+                for (SymbolToken annotation : annotations) {
+                    // TODO apply text patterns to annotations
+                    writer.writeAnnotations(context.internSymbol(annotation.getText()));
+                }
+            }
+            if (reader.isNullValue()) {
+                writer.writeNull(type);
+                continue;
+            }
+
+            switch (type) {
+                case BOOL:
+                    final boolean booleanValue = reader.booleanValue();
+                    writer.writeBool(booleanValue);
+                    break;
+                case INT:
+                    switch (reader.getIntegerSize()) {
+                        case INT:
+                            final int intValue = reader.intValue();
+                            writer.writeInt(intValue);
+                            break;
+                        case LONG:
+                            final long longValue = reader.longValue();
+                            writer.writeInt(longValue);
+                            break;
+                        case BIG_INTEGER:
+                            final BigInteger bigIntegerValue = reader.bigIntegerValue();
+                            writer.writeInt(bigIntegerValue);
+                            break;
+                        default:
+                            throw new IllegalStateException();
+                    }
+                    break;
+                case FLOAT:
+                    final double doubleValue = reader.doubleValue();
+                    writer.writeFloat(doubleValue);
+                    break;
+                case DECIMAL:
+                    BigDecimal decimalValue = reader.decimalValue();
+                    // NOTE(review): decimals with more than 16 digits of precision are rounded to 16 digits, so
+                    // the converted stream is not guaranteed to be equivalent to the input. Confirm this is intended.
+                    if (decimalValue.precision() > 16) {
+                        decimalValue = decimalValue.round(MathContext.DECIMAL64);
+                    }
+                    writer.writeDecimal(decimalValue);
+                    break;
+                case TIMESTAMP:
+                    final Timestamp timestampValue = reader.timestampValue();
+                    writer.writeTimestamp(timestampValue);
+                    break;
+                case SYMBOL:
+                    final SymbolToken symbolToken = reader.symbolValue();
+                    writer.writeSymbol(context.internSymbol(symbolToken.getText()));
+                    break;
+                case STRING:
+                    final String stringValue = reader.stringValue();
+                    boolean isMatched = false;
+                    // The first text pattern that matches is responsible for writing the value.
+                    for (TextPattern stringPattern : textPatterns) {
+                        if (stringPattern.matches(stringValue)) {
+                            stringPattern.invoke(stringValue, context, writer, isBinary);
+                            isMatched = true;
+                            break;
+                        }
+                    }
+                    if (!isMatched) {
+                        writer.writeString(stringValue);
+                    }
+                    break;
+                case CLOB:
+                    final byte[] clobValue = reader.newBytes();
+                    writer.writeClob(clobValue);
+                    break;
+                case BLOB:
+                    final byte[] blobValue = reader.newBytes();
+                    writer.writeBlob(blobValue);
+                    break;
+                case SEXP:
+                    reader.stepIn();
+                    if (isEmptyExpressionGroup) {
+                        writer.stepInExpressionGroup(false);
+                    } else if (isEexp) {
+                        // The first child of an invocation s-expression is the name of the macro to invoke.
+                        reader.next();
+                        String macroName = reader.stringValue();
+                        if (isBinary) {
+                            writer.stepInEExp(context.getMacroId(macroName), false, context.getMacro(macroName));
+                        } else {
+                            writer.stepInEExp(macroName);
+                        }
+                    } else {
+                        writer.stepInSExp(false);
+                    }
+                    break;
+                case LIST:
+                    reader.stepIn();
+                    writer.stepInList(false);
+                    break;
+                case STRUCT:
+                    reader.stepIn();
+                    writer.stepInStruct(false);
+                    break;
+                default:
+                    throw new IllegalStateException("Unexpected type: " + type);
+            }
+        }
+    }
+
+    /**
+     * Checks that the file with the given name exists and can be read.
+     * @param name the file name.
+     * @return a Path to the file.
+     */
+    private static Path checkPath(String name) {
+        File candidate = new File(name);
+        if (candidate.canRead()) {
+            return candidate.toPath();
+        }
+        throw new IllegalArgumentException("Cannot read file: " + name);
+    }
+
+    /**
+     * Appends the bytes of the file at 'from' to the file at 'to', leaving the existing contents of 'to' in place.
+     * @param from the path to copy from.
+     * @param to the path to append to.
+     * @throws IOException if thrown during the copy.
+     */
+    private static void appendCopy(Path from, Path to) throws IOException {
+        final boolean append = true;
+        try (OutputStream appendingSink = new FileOutputStream(to.toFile(), append)) {
+            Files.copy(from, appendingSink);
+        }
+    }
+
+    /**
+     * Creates a raw Ion 1.1 writer over the given stream.
+     * @param out the stream to write to.
+     * @param isBinary true to create a binary writer; false to create a text writer.
+     * @return a new raw writer.
+     */
+    private static IonRawWriter_1_1 newRawWriter_1_1(OutputStream out, boolean isBinary) {
+        if (isBinary) {
+            return newRawBinaryWriter_1_1(out);
+        }
+        return newRawTextWriter_1_1(out);
+    }
+
+    /**
+     * Creates a raw binary Ion 1.1 writer over the given stream.
+     * @param out the stream to write to.
+     * @return a new raw binary writer.
+     */
+    private static IonRawWriter_1_1 newRawBinaryWriter_1_1(OutputStream out) {
+        WriteBuffer buffer = new WriteBuffer(BlockAllocatorProviders.basicProvider().vendAllocator(32768), () -> {});
+        return new IonRawBinaryWriter_1_1(out, buffer, 0);
+    }
+
+    /**
+     * Creates a raw text Ion 1.1 writer over the given stream, configured for pretty-printed output with LF
+     * line endings and UTF-8 encoding.
+     * @param out the stream to write to.
+     * @return a new raw text writer.
+     */
+    private static IonRawWriter_1_1 newRawTextWriter_1_1(OutputStream out) {
+        BufferedOutputStreamFastAppendable bufferedOut = new BufferedOutputStreamFastAppendable(
+            out,
+            BlockAllocatorProviders.basicProvider().vendAllocator(4096),
+            1.0
+        );
+        _Private_IonTextWriterBuilder_1_1 prettyLfBuilder = _Private_IonTextWriterBuilder_1_1.standard()
+            .withNewLineType(IonTextWriterBuilder.NewLineType.LF)
+            .withPrettyPrinting();
+        return new IonRawTextWriter_1_1(
+            prettyLfBuilder,
+            _Private_IonTextAppender.forFastAppendable(bufferedOut, StandardCharsets.UTF_8)
+        );
+    }
+
+    /**
+     * Sanitizes the given string so that it may be used as a macro name.
+     * @param original the original string.
+     * @return the sanitized name.
+     */
+    private static String sanitizeName(String original) {
+        // '.', ':', '-' and '/' are each replaced with '_'.
+        String sanitized = original.replaceAll("[.:\\-/]", "_");
+        // An empty name (Ion permits empty field names) has no first character to inspect; it receives the same
+        // prefix as any other name that does not start with a letter.
+        if (sanitized.isEmpty() || !Character.isAlphabetic(sanitized.charAt(0))) {
+            return "z" + sanitized; // This is arbitrary.
+        }
+        return sanitized;
+    }
+
+    /**
+     * Gets a name describing the given container. This will either be its field name, if in a struct, or the field
+     * name of its parent sequence, if applicable. Otherwise, this method will return null.
+     * @param container the value for which to get a shape name.
+     * @return the sanitized name, or null if no name can be determined.
+     */
+    private static String getNameOfShape(IonContainer container) {
+        String shapeName = container.getFieldName();
+        if (shapeName == null) {
+            // Homogeneous sequences of structs are common. In this case use the field name of the sequence, if any.
+            IonContainer parentContainer = container.getContainer();
+            if (parentContainer != null && (parentContainer.getType() == IonType.LIST || parentContainer.getType() == IonType.SEXP)) {
+                shapeName = parentContainer.getFieldName();
+            }
+        }
+        if (shapeName == null) {
+            return null;
+        }
+        return sanitizeName(shapeName);
+    }
+}
diff --git a/src/main/java/com/amazon/ion/apps/macroize/MacroizeMacroMatcher.java b/src/main/java/com/amazon/ion/apps/macroize/MacroizeMacroMatcher.java
new file mode 100644
index 0000000000..f009de329a
--- /dev/null
+++ b/src/main/java/com/amazon/ion/apps/macroize/MacroizeMacroMatcher.java
@@ -0,0 +1,41 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+package com.amazon.ion.apps.macroize;
+
+import com.amazon.ion.IonReader;
+import com.amazon.ion.impl.macro.Macro;
+import com.amazon.ion.impl.macro.MacroMatcher;
+import com.amazon.ion.impl.macro.MacroRef;
+
+/**
+ * A {@link MacroMatcher} that uses a {@link ManualEncodingContext} and can produce {@link SuggestedSignature}s.
+ */ +class MacroizeMacroMatcher extends MacroMatcher { + + public MacroizeMacroMatcher(IonReader macroReader, ManualEncodingContext symbolTable) { + super(macroReader, ref -> symbolTable.getMacro(((MacroRef.ByName) ref).getName())); + symbolTable.addMacro(name(), macro()); + } + + /** + * @return the suggested signature for this matcher. + */ + SuggestedSignature getSignature() { + SuggestedSignature signature = new SuggestedSignature(); + for (Macro.Parameter parameter : macro().getSignature()) { + switch (parameter.getCardinality()) { + case ZeroOrOne: + signature.addOptional(parameter.getVariableName()); + break; + case ExactlyOne: + signature.addRequired(parameter.getVariableName()); + break; + case OneOrMore: + throw new UnsupportedOperationException("TODO: + not yet supported"); + case ZeroOrMore: + throw new UnsupportedOperationException("TODO: * not yet supported"); + } + } + return signature; + } +} diff --git a/src/main/java/com/amazon/ion/apps/macroize/MacroizeSpec.java b/src/main/java/com/amazon/ion/apps/macroize/MacroizeSpec.java new file mode 100644 index 0000000000..ac9227526d --- /dev/null +++ b/src/main/java/com/amazon/ion/apps/macroize/MacroizeSpec.java @@ -0,0 +1,184 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +package com.amazon.ion.apps.macroize; + +import com.amazon.ion.IonContainer; +import com.amazon.ion.IonDatagram; +import com.amazon.ion.IonException; +import com.amazon.ion.IonReader; +import com.amazon.ion.IonType; +import com.amazon.ion.IonValue; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Specifies how a particular stream of Ion data should be written using Ion 1.1. This spec is read from an Ion file + * that contains a struct with the following shape: + *
+ * {@code
+ *     {
+ *         macros: [(macro ...) ...] // The elements are Ion 1.1 TDL macro definitions
+ *         textPatterns: [(verbatim | prefix | substring ...) ...] // The elements refer to {@link TextPattern} types
+ *     }
+ * }
+ * 
+ * The textPattern elements may have the following shape: + *
+ * {@code
+ *     (verbatim [string...]) // Each string in the list is a string to write as a symbol using make_string
+ *     (prefix string [string...]) // The standalone string is the prefix; the optional list elements are potential suffixes.
+ *     (substring string [string...]) // The standalone string is a target substring; the optional list elements are potential prefixes or suffixes.
+ * }
+ * 
+ * Note the following known limitations, which may be fixed in the future: + *
    + *
  • Within macro definitions that expand to structs, variable names must match the field name, + * e.g., {foo: (%foo)}
  • + *
  • The tool only attempts to match suggested macros to container values.
  • + *
  • Nested macro invocations are not yet supported.
  • + *
+ */
+class MacroizeSpec {
+    final List<MacroizeMacroMatcher> customMatchers = new ArrayList<>();
+    final List<TextPattern> textPatterns = new ArrayList<>();
+
+    /**
+     * Reads the spec from the given reader. It is assumed that next() has not yet been called to position the reader
+     * on the spec struct.
+     * @param reader the reader.
+     * @param context the encoding context.
+     */
+    void readSpec(IonReader reader, ManualEncodingContext context) {
+        if (reader.next() != IonType.STRUCT) {
+            throw new IonException("Expected struct.");
+        }
+        reader.stepIn();
+        while (reader.next() != null) {
+            if (reader.getType() != IonType.LIST) {
+                throw new IonException("Expected list.");
+            }
+            switch (reader.getFieldName()) {
+                case "macros":
+                    readMacroMatchers(reader, context, customMatchers);
+                    break;
+                case "textPatterns":
+                    readTextPatterns(reader, context, textPatterns);
+                    break;
+                default:
+                    throw new IonException("Expected 'macros' or 'textPatterns'.");
+            }
+        }
+    }
+
+    private static void readMacroMatchers(IonReader reader, ManualEncodingContext symbolTable, List<MacroizeMacroMatcher> matchers) {
+        reader.stepIn();
+        while (reader.next() != null) {
+            matchers.add(new MacroizeMacroMatcher(reader, symbolTable)); // The matcher registers its macro with symbolTable.
+        }
+        reader.stepOut();
+    }
+
+    private static void readTextPatterns(IonReader reader, ManualEncodingContext symbolTable, List<TextPattern> patterns) {
+        reader.stepIn();
+        while (reader.next() != null) {
+            if (reader.getType() != IonType.SEXP) {
+                throw new IonException("Expected s-exp.");
+            }
+            reader.stepIn();
+            if (!IonType.isText(reader.next())) {
+                throw new IonException("Expected pattern type name.");
+            }
+            switch (reader.stringValue()) { // The first element of the s-exp selects the TextPattern implementation.
+                case "verbatim":
+                    patterns.add(new VerbatimTextPattern(symbolTable, readStringList(reader)));
+                    break;
+                case "prefix":
+                    if (!IonType.isText(reader.next())) {
+                        throw new IonException("Expected prefix.");
+                    }
+                    patterns.add(new PrefixTextPattern(symbolTable, reader.stringValue(), readStringList(reader)));
+                    break;
+                case "substring":
+                    if (!IonType.isText(reader.next())) {
+                        throw new IonException("Expected substring.");
+                    }
+                    patterns.add(new SubstringTextPattern(symbolTable, reader.stringValue(), readStringList(reader)));
+                    break;
+                default:
+                    throw new IonException("Expected 'verbatim', 'prefix', or 'substring'.");
+            }
+            reader.stepOut();
+        }
+        reader.stepOut();
+    }
+
+    private static List<String> readStringList(IonReader reader) {
+        List<String> strings = new ArrayList<>();
+        if (reader.next() == null) { // The list is optional; absence yields an empty result.
+            return strings;
+        }
+        if (reader.getType() != IonType.LIST) {
+            throw new IonException("Expected list of strings.");
+        }
+        reader.stepIn();
+        while (reader.next() != null) {
+            if (IonType.isText(reader.getType())) { // Non-text elements are skipped rather than rejected.
+                strings.add(reader.stringValue());
+            }
+        }
+        reader.stepOut();
+        return strings;
+    }
+
+    private void recursiveMatch(IonContainer container, Map<String, Integer> matchCounter) {
+        for (IonValue child : container) {
+            for (MacroizeMacroMatcher customMatcher : customMatchers) {
+                if (customMatcher.match(child)) {
+                    matchCounter.compute(customMatcher.name(), (key, existingValue) -> {
+                        if (existingValue == null) {
+                            existingValue = 0;
+                        }
+                        return existingValue + 1;
+                    });
+                }
+            }
+            switch (child.getType()) {
+                case STRUCT:
+                case LIST:
+                case SEXP:
+                    recursiveMatch((IonContainer) child, matchCounter); // Descend so nested containers are also counted.
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+
+    /**
+     * Match values from the given source against the macro matchers supplied by the spec. Logs the number of
+     * occurrences of each macro match and assembles suggested signatures for each matcher with at least one match.
+     * @param source the source data.
+     * @param log the log to receive messages about occurrences.
+     * @return a map from macro name to suggested signature for each name with at least one match.
+     * @throws IOException if thrown when logging occurrences.
+     */
+    Map<String, SuggestedSignature> matchMacros(IonDatagram source, Appendable log) throws IOException {
+        Map<String, Integer> customMacroMatches = new HashMap<>();
+        Map<String, SuggestedSignature> suggestedSignatures = new HashMap<>();
+        recursiveMatch(source, customMacroMatches);
+
+        for (MacroizeMacroMatcher customMacroMatcher : customMatchers) {
+            String matcherName = customMacroMatcher.name();
+            Integer occurrences = customMacroMatches.get(matcherName);
+            if (occurrences != null && occurrences > 0) {
+                suggestedSignatures.put(matcherName, customMacroMatcher.getSignature());
+                log.append(String.format("%n%n === %s (total occurrences: %d)%n", matcherName, occurrences));
+            }
+        }
+        return suggestedSignatures;
+    }
+}
diff --git a/src/main/java/com/amazon/ion/apps/macroize/ManualEncodingContext.java b/src/main/java/com/amazon/ion/apps/macroize/ManualEncodingContext.java
new file mode 100644
index 0000000000..79e9b0380b
--- /dev/null
+++ b/src/main/java/com/amazon/ion/apps/macroize/ManualEncodingContext.java
@@ -0,0 +1,200 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+package com.amazon.ion.apps.macroize;
+
+import com.amazon.ion.impl.IonRawWriter_1_1;
+import com.amazon.ion.impl.SystemSymbols_1_1;
+import com.amazon.ion.impl.macro.Expression;
+import com.amazon.ion.impl.macro.Expression.TemplateBodyExpression;
+import com.amazon.ion.impl.macro.Macro;
+import com.amazon.ion.impl.macro.SystemMacro;
+import com.amazon.ion.impl.macro.TemplateMacro;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.function.Consumer;
+
+/**
+ * An encoding context that is manipulated manually. To be used alongside an IonRawWriter_1_1.
+ * TODO consider whether this class may be replaced by something similar from the core library.
+ */ +class ManualEncodingContext { + private final Map symbolToId = new HashMap<>(); + private final Map macroNameToId = new HashMap<>(); + private final Map macroNameToMacro = new HashMap<>(); + + int symbolMaxId = 0; + int macroMaxId = -1; + + public ManualEncodingContext() { + // Intern the Ion 1.1 special symbols that aren't in the system symbol table. + // TODO these should be written inline instead of added to the symbol table. + internSymbol("%"); + internSymbol("?"); + } + + /** + * Adds the given macro to the macro table. + * @param macroName the name of the macro. + * @param macro the macro. + */ + public void addMacro(String macroName, TemplateMacro macro) { + macroNameToId.put(macroName, ++macroMaxId); + macroNameToMacro.put(macroName, macro); + // Intern the symbols that will occur in the macro signature and template body. + internSymbol(macroName); + for (Expression.TemplateBodyExpression expression : macro.getBody()) { + if (expression instanceof TemplateBodyExpression.FieldName) { + internSymbol(((TemplateBodyExpression.FieldName) expression).getValue().getText()); + } + } + for (Macro.Parameter parameter : macro.getSignature()) { + internSymbol(parameter.getVariableName()); + } + } + + /** + * Gets the mapping to the given symbol in the symbol table, or creates a mapping if none yet exists. + * @param symbol the symbol to intern. + * @return the symbol ID. + */ + public int internSymbol(String symbol) { + return symbolToId.computeIfAbsent(symbol, k -> ++symbolMaxId); + } + + /** + * @param symbol a symbol. + * @return true if the symbol already has a mapping in the symbol table. + */ + public boolean hasSymbol(String symbol) { + return symbolToId.get(symbol) != null; + } + + /** + * @param macroName the name of a macro. + * @return the ID of the given macro in the macro table, if present. + */ + public int getMacroId(String macroName) { + return macroNameToId.get(macroName); + } + + /** + * @param macroName the name of a macro. 
+ * @return the macro, if present in the macro table. + */ + public TemplateMacro getMacro(String macroName) { + return macroNameToMacro.get(macroName); + } + + /** + * Writes the encoding context to the given writer. It is assumed that the symbols in the symbol table are used + * to encode the macro table, so the symbol table is written first in its own encoding directive, followed by + * the macro table. + * @param writer the writer. + */ + public void writeTo(IonRawWriter_1_1 writer) { + // write the symbol table + writer.stepInEExp(SystemMacro.SetSymbols); + writer.stepInExpressionGroup(false); + List> symbols = new ArrayList<>(symbolToId.entrySet()); + symbols.sort(Map.Entry.comparingByValue()); + symbols.forEach(e -> writer.writeString(e.getKey())); + writer.stepOut(); + writer.stepOut(); + + // write the macro table + if (macroNameToId.isEmpty()) { + return; + } + writer.stepInEExp(SystemMacro.SetMacros); + writer.stepInExpressionGroup(false); + List> macros = new ArrayList<>(macroNameToId.entrySet()); + macros.sort(Map.Entry.comparingByValue()); + for (Map.Entry macroAndId : macros) { + TemplateMacro macro = macroNameToMacro.get(macroAndId.getKey()); + writeMacroTo(writer, macroAndId.getKey(), macro); + } + writer.stepOut(); + writer.stepOut(); + } + + /** + * Writes the given macro. + * @param writer the writer. + * @param name the name of the macro to write. + * @param macro the macro to write. + */ + private void writeMacroTo(IonRawWriter_1_1 writer, String name, TemplateMacro macro) { + writeMacroTo(writer, name, macro, symbol -> writer.writeSymbol(internSymbol(symbol)), symbol -> writer.writeFieldName(internSymbol(symbol))); + } + + /** + * Writes the given macro. + * @param writer the writer. + * @param name the name of the macro to write. + * @param macro the macro to write. + * @param symbolWriter function that writes a symbol value. + * @param fieldNameWriter function that writes a field name. 
+ */ + private static void writeMacroTo(IonRawWriter_1_1 writer, String name, TemplateMacro macro, Consumer symbolWriter, Consumer fieldNameWriter) { + writer.stepInSExp(false); + writer.writeSymbol(SystemSymbols_1_1.MACRO); + symbolWriter.accept(name); + writer.stepInSExp(false); + List signature = macro.getSignature(); + for (Macro.Parameter parameter : signature) { + symbolWriter.accept(parameter.getVariableName()); + if (parameter.getCardinality() != Macro.ParameterCardinality.ExactlyOne) { + symbolWriter.accept("?"); + } + } + writer.stepOut(); + List body = macro.getBody(); + int index = 0; + int[] numberOfTimesToStepOut = new int[body.size() + 1]; + Arrays.fill(numberOfTimesToStepOut, 0); + for (Expression.TemplateBodyExpression expression : body) { + for (int i = 0; i < numberOfTimesToStepOut[index]; i++) { + writer.stepOut(); + } + if (expression instanceof Expression.ExpressionGroup) { + // Note: assumes that template bodies are composed of either structs or system macro invocations. Will + // need to be generalized to fit other use cases as necessary. 
+ writer.stepInSExp(true); + symbolWriter.accept("."); + writer.writeAnnotations(SystemSymbols_1_1.ION); + writer.writeSymbol(SystemSymbols_1_1.MAKE_STRING); + writer.stepInSExp(true); + symbolWriter.accept(".."); + numberOfTimesToStepOut[((Expression.ExpressionGroup) expression).getEndExclusive()]++; + } else if (expression instanceof TemplateBodyExpression.FieldName) { + fieldNameWriter.accept(((TemplateBodyExpression.FieldName) expression).getValue().getText()); + } else if (expression instanceof TemplateBodyExpression.VariableRef) { + writer.stepInSExp(true); + symbolWriter.accept("%"); + symbolWriter.accept(signature.get(((TemplateBodyExpression.VariableRef) expression).getSignatureIndex()).getVariableName()); + writer.stepOut(); + } else if (expression instanceof Expression.TextValue) { + writer.writeString(((Expression.TextValue) expression).getStringValue()); + } else if (expression instanceof Expression.ListValue) { + writer.stepInList(true); + numberOfTimesToStepOut[((Expression.ListValue) expression).getEndExclusive()]++; + } else if (expression instanceof Expression.StructValue) { + writer.stepInStruct(true); + numberOfTimesToStepOut[((Expression.StructValue) expression).getEndExclusive()]++; + } else if (expression instanceof Expression.BoolValue) { + writer.writeBool(((Expression.BoolValue) expression).getValue()); + } else { + throw new UnsupportedOperationException("TODO: unsupported expression type"); + } + index++; + } + for (int i = 0; i < numberOfTimesToStepOut[body.size()]; i++) { + writer.stepOut(); + } + writer.stepOut(); + } +} diff --git a/src/main/java/com/amazon/ion/apps/macroize/PrefixTextPattern.java b/src/main/java/com/amazon/ion/apps/macroize/PrefixTextPattern.java new file mode 100644 index 0000000000..124b8789ba --- /dev/null +++ b/src/main/java/com/amazon/ion/apps/macroize/PrefixTextPattern.java @@ -0,0 +1,52 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+// SPDX-License-Identifier: Apache-2.0
+package com.amazon.ion.apps.macroize;
+
+import com.amazon.ion.impl.IonRawWriter_1_1;
+import com.amazon.ion.impl.macro.SystemMacro;
+
+import java.util.List;
+
+/**
+ * Writes a String value as a make_string invocation whose first argument is a symbol and whose second argument
+ * is either a symbol or a string. This allows for strings with common prefixes to be written compactly, even if
+ * they may have high-cardinality suffixes.
+ */
+class PrefixTextPattern implements TextPattern { // TODO unify with SubstringTextPattern?
+    private final String commonPrefix;
+
+    /**
+     * @param context the encoding context.
+     * @param commonPrefix the prefix.
+     * @param suffixes recurring suffixes, if any. May be empty. If a suffix not present in this list is encountered
+     *                 in the data, that suffix will be written as a string instead of a symbol.
+     */
+    PrefixTextPattern(ManualEncodingContext context, String commonPrefix, List<String> suffixes) {
+        this.commonPrefix = commonPrefix;
+        context.internSymbol(commonPrefix);
+        for (String suffix : suffixes) {
+            context.internSymbol(suffix);
+        }
+    }
+
+    @Override
+    public boolean matches(String candidate) {
+        return candidate.startsWith(commonPrefix);
+    }
+
+    @Override
+    public void invoke(String match, ManualEncodingContext table, IonRawWriter_1_1 writer, boolean isBinary) {
+        // TODO consider whether these could/should be written using a custom macro that itself calls make_string.
+        writer.stepInEExp(SystemMacro.MakeString);
+        writer.stepInExpressionGroup(true);
+        writer.writeSymbol(table.internSymbol(commonPrefix));
+        String suffix = match.substring(commonPrefix.length()); // Strip only the leading prefix; later occurrences belong to the suffix.
+        if (table.hasSymbol(suffix)) {
+            writer.writeSymbol(table.internSymbol(suffix));
+        } else {
+            writer.writeString(suffix);
+        }
+        writer.stepOut();
+        writer.stepOut();
+    }
+}
diff --git a/src/main/java/com/amazon/ion/apps/macroize/SubstringTextPattern.java b/src/main/java/com/amazon/ion/apps/macroize/SubstringTextPattern.java
new file mode 100644
index 0000000000..2c2481d8df
--- /dev/null
+++ b/src/main/java/com/amazon/ion/apps/macroize/SubstringTextPattern.java
@@ -0,0 +1,62 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+package com.amazon.ion.apps.macroize;
+
+import com.amazon.ion.impl.IonRawWriter_1_1;
+import com.amazon.ion.impl.macro.SystemMacro;
+
+import java.util.List;
+
+/**
+ * Writes a String value as a make_string invocation with a prefix, a recurring substring, and a suffix. This allows for
+ * strings with common substrings to be written compactly, even if they may have high-cardinality prefixes and/or
+ * suffixes.
+ */
+class SubstringTextPattern implements TextPattern {
+
+    private final String substring;
+
+    /**
+     * @param context the encoding context.
+     * @param substring the recurring substring.
+     * @param prefixesAndSuffixes recurring prefixes and/or suffixes, if any. May be empty. If a prefix or suffix
+     *                            not present in this list is encountered in the data, it will be written as a string
+     *                            instead of a symbol.
+     */
+    SubstringTextPattern(ManualEncodingContext context, String substring, List<String> prefixesAndSuffixes) {
+        this.substring = substring;
+        context.internSymbol(substring);
+        for (String prefixOrSuffix : prefixesAndSuffixes) {
+            context.internSymbol(prefixOrSuffix);
+        }
+    }
+
+    @Override
+    public boolean matches(String candidate) {
+        return candidate.contains(substring);
+    }
+
+    private void writeComponent(String component, ManualEncodingContext table, IonRawWriter_1_1 writer) {
+        if (table.hasSymbol(component)) {
+            writer.writeSymbol(table.internSymbol(component));
+        } else {
+            writer.writeString(component);
+        }
+    }
+
+    @Override
+    public void invoke(String match, ManualEncodingContext table, IonRawWriter_1_1 writer, boolean isBinary) {
+        writer.stepInEExp(SystemMacro.MakeString);
+        writer.stepInExpressionGroup(true);
+        int start = match.indexOf(substring); // Literal search on the first occurrence; split() would treat the target as a regex.
+        if (start > 0) {
+            writeComponent(match.substring(0, start), table, writer);
+        }
+        writer.writeSymbol(table.internSymbol(substring));
+        if (start + substring.length() < match.length()) { // Everything after the first occurrence is kept as the suffix.
+            writeComponent(match.substring(start + substring.length()), table, writer);
+        }
+        writer.stepOut();
+        writer.stepOut();
+    }
+}
diff --git a/src/main/java/com/amazon/ion/apps/macroize/SuggestedSignature.java b/src/main/java/com/amazon/ion/apps/macroize/SuggestedSignature.java
new file mode 100644
index 0000000000..1778286f75
--- /dev/null
+++ b/src/main/java/com/amazon/ion/apps/macroize/SuggestedSignature.java
@@ -0,0 +1,58 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+package com.amazon.ion.apps.macroize;
+
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * Represents a simple suggested macro signature. TODO support + and * cardinalities.
+ */
+class SuggestedSignature {
+
+    // Names of the required parameters (! cardinality), in the order they were added.
+    private final Set<String> required = new LinkedHashSet<>();
+    // Names of the optional parameters (? cardinality), in the order they were added.
+    private final Set<String> optional = new LinkedHashSet<>();
+    // Names of all parameters (required and optional), in the order they were added.
+    private final Set<String> all = new LinkedHashSet<>();
+
+    public void addRequired(String argument) {
+        required.add(argument);
+        all.add(argument);
+    }
+
+    public void addOptional(String argument) {
+        optional.add(argument);
+        all.add(argument);
+    }
+
+    public Set<String> allParameters() {
+        return all;
+    }
+
+    /**
+     * Gets the index of the target parameter in the sequence of all parameters. It is up to the caller to ensure
+     * the target parameter exists.
+     * @param targetParameter the target parameter
+     * @return the index of the target parameter, or the total number of parameters if it is not present.
+     */
+    public int indexOf(String targetParameter) {
+        int index = 0;
+        for (String parameter : all) {
+            if (targetParameter.equals(parameter)) {
+                return index;
+            }
+            index++;
+        }
+        return index;
+    }
+
+    /**
+     * @param candidate a set of parameter names to attempt to match to this signature.
+     * @return true if the given parameters are compatible with this signature.
+     */
+    public boolean isCompatible(Set<String> candidate) {
+        return candidate.containsAll(required) && all.containsAll(candidate);
+    }
+}
diff --git a/src/main/java/com/amazon/ion/apps/macroize/TextPattern.java b/src/main/java/com/amazon/ion/apps/macroize/TextPattern.java
new file mode 100644
index 0000000000..e150ffa7ab
--- /dev/null
+++ b/src/main/java/com/amazon/ion/apps/macroize/TextPattern.java
@@ -0,0 +1,26 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+package com.amazon.ion.apps.macroize;
+
+import com.amazon.ion.impl.IonRawWriter_1_1;
+
+/**
+ * A string pattern to match in some source data.
+ */
+interface TextPattern {
+
+    /**
+     * @param candidate a string to evaluate against the pattern.
+     * @return true if the candidate matches this pattern.
+ */ + boolean matches(String candidate); + + /** + * Writes this pattern from the given match. It is up to the caller to ensure the given string is actually a match. + * @param match the match from which to write the pattern. + * @param table the context to use when writing. + * @param writer the writer to which the pattern will be written. + * @param isBinary true if the output format is binary. + */ + void invoke(String match, ManualEncodingContext table, IonRawWriter_1_1 writer, boolean isBinary); +} diff --git a/src/main/java/com/amazon/ion/apps/macroize/ThrowingProcedure.java b/src/main/java/com/amazon/ion/apps/macroize/ThrowingProcedure.java new file mode 100644 index 0000000000..df4aec23d1 --- /dev/null +++ b/src/main/java/com/amazon/ion/apps/macroize/ThrowingProcedure.java @@ -0,0 +1,10 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +package com.amazon.ion.apps.macroize; + +import java.io.IOException; + +@FunctionalInterface +interface ThrowingProcedure { + void execute() throws IOException; +} diff --git a/src/main/java/com/amazon/ion/apps/macroize/ThrowingSupplier.java b/src/main/java/com/amazon/ion/apps/macroize/ThrowingSupplier.java new file mode 100644 index 0000000000..a89031134e --- /dev/null +++ b/src/main/java/com/amazon/ion/apps/macroize/ThrowingSupplier.java @@ -0,0 +1,10 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +package com.amazon.ion.apps.macroize; + +import java.io.IOException; + +@FunctionalInterface +interface ThrowingSupplier { + T get() throws IOException; +} diff --git a/src/main/java/com/amazon/ion/apps/macroize/VerbatimTextPattern.java b/src/main/java/com/amazon/ion/apps/macroize/VerbatimTextPattern.java new file mode 100644 index 0000000000..37d484293d --- /dev/null +++ b/src/main/java/com/amazon/ion/apps/macroize/VerbatimTextPattern.java @@ -0,0 +1,44 @@ +// Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +package com.amazon.ion.apps.macroize; + +import com.amazon.ion.impl.IonRawWriter_1_1; +import com.amazon.ion.impl.macro.SystemMacro; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +/** + * Writes a String value as a make_string invocation whose argument is a symbol. This allows recurring text to be + * added to the symbol table and encoded using an ID while retaining the String type. + */ +class VerbatimTextPattern implements TextPattern { + + // The strings to write using make_string invocations. + private final Set targets; + + /** + * @param context the encoding context. + * @param strings the strings to be written using make_string invocations. + */ + VerbatimTextPattern(ManualEncodingContext context, List strings) { + this.targets = new HashSet<>(); + targets.addAll(strings); + for (String target : strings) { + context.internSymbol(target); + } + } + + @Override + public boolean matches(String candidate) { + return targets.contains(candidate); + } + + @Override + public void invoke(String match, ManualEncodingContext table, IonRawWriter_1_1 writer, boolean isBinary) { + writer.stepInEExp(SystemMacro.MakeString); + writer.writeSymbol(table.internSymbol(match)); + writer.stepOut(); + } +} diff --git a/src/main/java/com/amazon/ion/impl/bin/WriteBuffer.java b/src/main/java/com/amazon/ion/impl/bin/WriteBuffer.java index 4049f49d62..63f19ecdac 100644 --- a/src/main/java/com/amazon/ion/impl/bin/WriteBuffer.java +++ b/src/main/java/com/amazon/ion/impl/bin/WriteBuffer.java @@ -21,7 +21,7 @@ * in an IndexOutOfBoundsException. The number 10 is chosen because it is the maximum number of bytes required to write * a long value as a FlexInt or VarInt. 
*/ -/*package*/ final class WriteBuffer implements Closeable +public final class WriteBuffer implements Closeable { private final BlockAllocator allocator; private final List blocks; diff --git a/src/main/java/com/amazon/ion/impl/lite/IonDatagramLite.java b/src/main/java/com/amazon/ion/impl/lite/IonDatagramLite.java index 6edf35a668..cd528be673 100644 --- a/src/main/java/com/amazon/ion/impl/lite/IonDatagramLite.java +++ b/src/main/java/com/amazon/ion/impl/lite/IonDatagramLite.java @@ -323,7 +323,14 @@ public ListIterator listIterator(int index) @Override public IonValue set(int index, IonValue element){ - throw new UnsupportedOperationException(); + if (((IonValueLite) element)._context.getContextSymbolTable() != getContextForIndex(null, index).getContextSymbolTable()) { + // Note: this isn't impossible to support, but it requires care in the case where 'element' may depend + // on symbol table mappings unique to its own context. In order to sidestep this complexity until a use + // case is identified for it, only setting the element at an index that uses the same symbol table is + // currently supported. + throw new UnsupportedOperationException(); + } + return super.set(index, element); } @Override diff --git a/src/main/java/com/amazon/ion/impl/macro/MacroMatcher.java b/src/main/java/com/amazon/ion/impl/macro/MacroMatcher.java new file mode 100644 index 0000000000..7bf2edcfcd --- /dev/null +++ b/src/main/java/com/amazon/ion/impl/macro/MacroMatcher.java @@ -0,0 +1,257 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 +package com.amazon.ion.impl.macro; + +import com.amazon.ion.IonReader; +import com.amazon.ion.IonType; +import com.amazon.ion.IonValue; +import com.amazon.ion.system.IonReaderBuilder; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Iterator; +import java.util.function.Function; + +/** + * Matches source data to macro definitions. 
+ * TODO not supported yet: nested invocations + */ +public class MacroMatcher { + + private final TemplateMacro macro; + private final String name; + + /** + * Creates a matcher for the given TDL text. + * @param macroText the TDL text that defines a single macro. + * @param macroTable the macro table's mapping function. + */ + public MacroMatcher(String macroText, Function macroTable) { + try (IonReader macroReader = IonReaderBuilder.standard().build(macroText)) { + MacroCompiler compiler = new MacroCompiler(macroTable::apply, new ReaderAdapterIonReader(macroReader)); + macroReader.next(); + macro = compiler.compileMacro(); + name = compiler.getMacroName(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } + + /** + * Creates a matcher for the macro on which the given reader is positioned. + * @param macroReader the reader positioned on a TDL definition of a single macro. + * @param macroTable the macro table's mapping function. + */ + public MacroMatcher(IonReader macroReader, Function macroTable) { + MacroCompiler compiler = new MacroCompiler(macroTable::apply, new ReaderAdapterIonReader(macroReader)); + macro = compiler.compileMacro(); + name = compiler.getMacroName(); + } + + /** + * @return the name of the macro. + */ + public String name() { + return name; + } + + /** + * @return the macro. + */ + public TemplateMacro macro() { + return macro; + } + + private T requireExpressionType(Expression.TemplateBodyExpression expression, Class requiredType) { + if (requiredType.isAssignableFrom(expression.getClass())) { + return requiredType.cast(expression); + } + return null; + } + + /** + * Attempts to match the value on which the reader is positioned to this matcher's macro by iterating over the value + * and the macro body in lockstep until either an incompatibility is found (no match) or the value and body end + * (match). + * @param reader a reader positioned on a value to attempt to match to this matcher's macro. 
+     * @return true if the value matches this matcher's macro.
+     * @throws IllegalStateException if the macro body contains a field name that is not followed by a value.
+     * @throws UnsupportedOperationException if the macro body contains an expression group or a nested macro
+     *  invocation; neither is handled yet.
+     */
+    public boolean match(IonReader reader) {
+        Iterator<Expression.TemplateBodyExpression> bodyIterator = macro.getBody().iterator();
+        // Position in the macro body of the next expression to be taken from bodyIterator. It must advance every
+        // time an expression is taken, otherwise the container-end bookkeeping below falls out of step.
+        int index = 0;
+        // Entry i counts the containers in the macro body that end immediately before expression i, i.e. how many
+        // times the reader must step out before expression i is compared.
+        int[] numberOfContainerEndsAtExpressionIndex = new int[macro.getBody().size() + 1];
+        while (true) {
+            // Leave every container that ends here. The source container must be exhausted as well; any value
+            // left in it is one that the macro body does not account for.
+            for (int i = 0; i < numberOfContainerEndsAtExpressionIndex[index]; i++) {
+                if (reader.next() != null) {
+                    return false;
+                }
+                reader.stepOut();
+            }
+            IonType type = reader.next();
+            boolean hasNextExpression = bodyIterator.hasNext();
+            Expression.TemplateBodyExpression expression = null;
+            if (hasNextExpression) {
+                expression = bodyIterator.next();
+            } else if (type != null) {
+                // The source has more values than the macro body.
+                return false;
+            }
+            if (type == null) {
+                // The source is exhausted at the current depth. Keep index aligned with the iterator so that, if
+                // matching continues, the enclosing container is stepped out of at the right position.
+                if (hasNextExpression) {
+                    index++;
+                }
+                if (expression instanceof Expression.FieldName) {
+                    if (!bodyIterator.hasNext()) {
+                        throw new IllegalStateException("dangling field name");
+                    }
+                    expression = bodyIterator.next();
+                    index++;
+                }
+                if (expression instanceof Expression.VariableRef) {
+                    if (macro.getSignature().get(((Expression.VariableRef) expression).getSignatureIndex()).getCardinality().canBeVoid) {
+                        // This is a trailing optional argument that is omitted in the source data, which is still
+                        // considered compatible with the signature.
+                        continue;
+                    }
+                    return false;
+                } else if (hasNextExpression) {
+                    // The macro body requires a literal value that the source does not have.
+                    return false;
+                }
+                break;
+            }
+            index++;
+            if (expression instanceof Expression.FieldName) {
+                if (!((Expression.FieldName) expression).getValue().assumeText().equals(reader.getFieldName())) {
+                    return false;
+                }
+                if (!bodyIterator.hasNext()) {
+                    throw new IllegalStateException("dangling field name");
+                }
+                expression = bodyIterator.next();
+                index++;
+            }
+            if (expression instanceof Expression.VariableRef) {
+                // For now, a variable matches any value at the current position.
+                // TODO check cardinality and encoding type.
+                continue;
+            }
+            if (expression instanceof Expression.ExpressionGroup) {
+                throw new UnsupportedOperationException("TODO: handle expression groups");
+            }
+            if (expression instanceof Expression.MacroInvocation) {
+                throw new UnsupportedOperationException("TODO: handle nested invocations");
+            }
+            if (expression instanceof Expression.DataModelValue) {
+                Expression.DataModelValue dataModelValueExpression = (Expression.DataModelValue) expression;
+                if (!Arrays.asList(reader.getTypeAnnotationSymbols()).equals(dataModelValueExpression.getAnnotations())) {
+                    return false;
+                }
+            }
+            if (type != IonType.NULL && reader.isNullValue()) {
+                // A typed null (e.g. null.int) is reported with its declared type, but the value accessors used
+                // below cannot be applied to it (getIntegerSize() returns null, which would make the INT switch
+                // throw NullPointerException). Matching literal typed nulls is not supported, so report no match.
+                return false;
+            }
+            // The expression is a literal: it must have the same type and the same content as the source value.
+            switch (type) {
+                case NULL:
+                    Expression.NullValue nullValue = requireExpressionType(expression, Expression.NullValue.class);
+                    if (nullValue == null) {
+                        return false;
+                    }
+                    break;
+                case BOOL:
+                    Expression.BoolValue boolValue = requireExpressionType(expression, Expression.BoolValue.class);
+                    if (boolValue == null || (boolValue.getValue() != reader.booleanValue())) {
+                        return false;
+                    }
+                    break;
+                case INT:
+                    switch (reader.getIntegerSize()) {
+                        case INT:
+                        case LONG:
+                            Expression.LongIntValue intValue = requireExpressionType(expression, Expression.LongIntValue.class);
+                            if (intValue == null || (intValue.getValue() != reader.longValue())) {
+                                return false;
+                            }
+                            break;
+                        case BIG_INTEGER:
+                            Expression.BigIntValue bigIntValue = requireExpressionType(expression, Expression.BigIntValue.class);
+                            if (bigIntValue == null || (!bigIntValue.getBigIntegerValue().equals(reader.bigIntegerValue()))) {
+                                return false;
+                            }
+                            break;
+                    }
+                    break;
+                case FLOAT:
+                    Expression.FloatValue floatValue = requireExpressionType(expression, Expression.FloatValue.class);
+                    if (floatValue == null || (Double.compare(floatValue.getValue(), reader.doubleValue()) != 0)) {
+                        return false;
+                    }
+                    break;
+                case DECIMAL:
+                    Expression.DecimalValue decimalValue = requireExpressionType(expression, Expression.DecimalValue.class);
+                    if (decimalValue == null || (!decimalValue.getValue().equals(reader.bigDecimalValue()))) {
+                        return false;
+                    }
+                    break;
+                case TIMESTAMP:
+                    Expression.TimestampValue timestampValue = requireExpressionType(expression, Expression.TimestampValue.class);
+                    if (timestampValue == null || (!timestampValue.getValue().equals(reader.timestampValue()))) {
+                        return false;
+                    }
+                    break;
+                case SYMBOL:
+                    Expression.SymbolValue symbolValue = requireExpressionType(expression, Expression.SymbolValue.class);
+                    if (symbolValue == null || (!symbolValue.getValue().assumeText().equals(reader.symbolValue().assumeText()))) {
+                        return false;
+                    }
+                    break;
+                case STRING:
+                    Expression.StringValue stringValue = requireExpressionType(expression, Expression.StringValue.class);
+                    if (stringValue == null || (!stringValue.getValue().equals(reader.stringValue()))) {
+                        return false;
+                    }
+                    break;
+                case CLOB:
+                    Expression.ClobValue clobValue = requireExpressionType(expression, Expression.ClobValue.class);
+                    if (clobValue == null || (!Arrays.equals(clobValue.getValue(), reader.newBytes()))) {
+                        return false;
+                    }
+                    break;
+                case BLOB:
+                    Expression.BlobValue blobValue = requireExpressionType(expression, Expression.BlobValue.class);
+                    if (blobValue == null || (!Arrays.equals(blobValue.getValue(), reader.newBytes()))) {
+                        return false;
+                    }
+                    break;
+                case LIST:
+                    reader.stepIn();
+                    Expression.ListValue listValue = requireExpressionType(expression, Expression.ListValue.class);
+                    if (listValue == null) {
+                        return false;
+                    }
+                    // Remember to step out of the source list once the body reaches the end of this list.
+                    numberOfContainerEndsAtExpressionIndex[listValue.getEndExclusive()]++;
+                    break;
+                case SEXP:
+                    reader.stepIn();
+                    Expression.SExpValue sexpValue = requireExpressionType(expression, Expression.SExpValue.class);
+                    if (sexpValue == null) {
+                        return false;
+                    }
+                    numberOfContainerEndsAtExpressionIndex[sexpValue.getEndExclusive()]++;
+                    break;
+                case STRUCT:
+                    reader.stepIn();
+                    Expression.StructValue structValue = requireExpressionType(expression, Expression.StructValue.class);
+                    if (structValue == null) {
+                        return false;
+                    }
+                    numberOfContainerEndsAtExpressionIndex[structValue.getEndExclusive()]++;
+                    break;
+                case DATAGRAM:
+                    throw new IllegalStateException();
+            }
+        }
+        return true;
+    }
+
+    /**
+     * Attempts to match the given value by reading it through a reader built over the value.
+     * @see #match(IonReader)
+     * @param value the value to attempt to match.
+     * @return true if the value matches this matcher's macro.
+     */
+    public boolean match(IonValue value) {
+        try (IonReader domReader = IonReaderBuilder.standard().build(value)) {
+            return match(domReader);
+        } catch (IOException e) {
+            throw new IllegalStateException(e);
+        }
+    }
+}
diff --git a/src/test/java/com/amazon/ion/DatagramTest.java b/src/test/java/com/amazon/ion/DatagramTest.java
index 831ccfd036..458e3b6b2c 100644
--- a/src/test/java/com/amazon/ion/DatagramTest.java
+++ b/src/test/java/com/amazon/ion/DatagramTest.java
@@ -1,18 +1,5 @@
-/*
- * Copyright 2007-2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License").
- * You may not use this file except in compliance with the License.
- * A copy of the License is located at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * or in the "license" file accompanying this file. This file is distributed
- * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
- * express or implied. See the License for the specific language governing
- * permissions and limitations under the License.
- */
-
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
 package com.amazon.ion;
 
 import static com.amazon.ion.SymbolTable.UNKNOWN_SYMBOL_ID;
@@ -673,15 +660,13 @@ public void testGetAssignedSymbolTable()
         ((_Private_IonValue)dg).getAssignedSymbolTable();
     }
 
-    /**
-     * TODO amazon-ion/ion-java/issues/50 Datagram.set() should work, but it's documented to throw
-     */
-    @Test(expected = UnsupportedOperationException.class)
+    @Test
     public void testSet()
     {
         IonDatagram dg = system().newDatagram();
         dg.add().newNull();
         dg.set(0, system().newBool(true));
+        assertEquals(system().getLoader().load("true"), dg);
     }
 
     @Test
diff --git a/src/test/java/com/amazon/ion/apps/macroize/MacroizeTest.java b/src/test/java/com/amazon/ion/apps/macroize/MacroizeTest.java
new file mode 100644
index 0000000000..47c66e18a0
--- /dev/null
+++ b/src/test/java/com/amazon/ion/apps/macroize/MacroizeTest.java
@@ -0,0 +1,96 @@
+// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+// SPDX-License-Identifier: Apache-2.0
+package com.amazon.ion.apps.macroize;
+
+import com.amazon.ion.IonDatagram;
+import com.amazon.ion.IonSystem;
+import com.amazon.ion.system.IonReaderBuilder;
+import com.amazon.ion.system.IonSystemBuilder;
+import com.amazon.ion.system.IonTextWriterBuilder;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for {@link Macroize}, which converts Ion 1.0 data to Ion 1.1 using specified macros and text patterns.
+ */
+public class MacroizeTest {
+
+    private static final IonSystem SYSTEM = IonSystemBuilder.standard().build();
+
+    /**
+     * Runs {@link Macroize#macroize} over the input, then verifies that the combined output (the context bytes
+     * followed by the headless bytes) loads to a datagram equal to the one loaded from the input, and that the log
+     * reports the expected total number of occurrences for each given name.
+     *
+     * @param input the Ion text to convert.
+     * @param spec the Ion text of the conversion spec.
+     * @param outputBinary forwarded to {@code Macroize.macroize}.
+     * @param expectedOccurrences maps a name to the occurrence total that the log must report for it.
+     * @throws IOException if conversion or reading the results fails.
+     */
+    private static void testMacroize(
+        String input,
+        String spec,
+        boolean outputBinary,
+        Map<String, Integer> expectedOccurrences
+    ) throws IOException {
+        StringBuilder invocations = new StringBuilder();
+        ByteArrayOutputStream headless = new ByteArrayOutputStream();
+        ByteArrayOutputStream context = new ByteArrayOutputStream();
+        ByteArrayOutputStream complete = new ByteArrayOutputStream();
+        StringBuilder log = new StringBuilder();
+        Macroize.macroize(
+            () -> IonReaderBuilder.standard().build(input),
+            () -> IonTextWriterBuilder.pretty().build(invocations),
+            () -> IonReaderBuilder.standard().build(invocations.toString()),
+            () -> headless,
+            () -> context,
+            () -> {
+                // The complete stream is the context bytes followed by the headless bytes.
+                complete.write(context.toByteArray());
+                complete.write(headless.toByteArray());
+            },
+            () -> IonReaderBuilder.standard().build(spec),
+            outputBinary,
+            log
+        );
+        IonDatagram from10 = SYSTEM.getLoader().load(input);
+        IonDatagram from11 = SYSTEM.getLoader().load(complete.toByteArray());
+        assertEquals(from10, from11);
+        String logText = log.toString();
+        for (Map.Entry<String, Integer> expectedOccurrence : expectedOccurrences.entrySet()) {
+            String expectedReport = String.format(
+                "%s (total occurrences: %d)",
+                expectedOccurrence.getKey(),
+                expectedOccurrence.getValue()
+            );
+            assertTrue(logText.contains(expectedReport), "Log is missing: " + expectedReport);
+        }
+        // TODO assert that the text patterns were matched as expected
+    }
+
+    /**
+     * Converts a small stream using one macro and two text patterns, expecting three occurrences of "foobar".
+     */
+    @ParameterizedTest(name = "outputBinary={0}")
+    @ValueSource(booleans = {true, false})
+    public void macroizeWithSpec(boolean outputBinary) throws IOException {
+        String spec = "{macros: [(macro foobar (foo bar?) {foo: (%foo), bar: (%bar)})], textPatterns: [(verbatim [baz]), (prefix \"/user/files/\" [a, b])]}";
+        String input = "{foo: 1, bar: 2} {foo: 3} \"baz\" {foobar: {foo: 4, bar: 5}, path: \"/user/files/a\"} \"/user/files/c\"";
+        Map<String, Integer> expectedOccurrences = new HashMap<>();
+        expectedOccurrences.put("foobar", 3);
+        testMacroize(input, spec, outputBinary, expectedOccurrences);
+    }
+
+    // TODO add tests that exercise using every Ion type in macro definitions
+    // TODO test substring text pattern
+    // TODO address known limitations, as documented in the top-level JavaDoc on MacroizeSpec
+}