diff --git a/README.md b/README.md index 798fb7b..f2a30ef 100644 --- a/README.md +++ b/README.md @@ -12,15 +12,15 @@ is inexpensive relative to the cost of parsing (and in the case of a DOM, materi ## Usage Path extractor works in two phases: -1. Configuration +1. Configuration 2. Notification ### Search Paths A `SearchPath` is a path provided to the extractor for matching. It's composed of a list of `PathComponent`s which can be one of: -* Wildcard: matches all values -* Index: match the value at that index -* Text: match all values whose field names are equivalent to that text +* Wildcard: matches all values. +* Index: matches the value at that index. +* Text: matches all values whose field names are equivalent to that text. Some examples: ``` @@ -78,11 +78,45 @@ final IonReader ionReader = IonReaderBuilder.standard().build("{foo: 1}" pathExtractor.match(ionReader); assertEquals("[1, 2, 20]", list.toString()); +``` + +## Benchmark + +Some benchmarks comparing the path extractor with fully materializing a DOM are included in this package. All benchmarks +use the JSON at https://data.nasa.gov/data.json, a publicly available data set from NASA, as their data source. + +The `dataset` struct from the original JSON is written as Ion binary and Ion text without any type coercion. The +binary file is ~81M and the text file ~95M. There are four benchmark types: +1. `dom`: fully materializes a DOM for the file using an `IonLoader`. +1. `full`: fully materializes all struct fields as `IonValue`s using a path extractor. +1. `partial`: materializes a single struct field as an `IonValue` using a path extractor. +1. `partialNoDom`: accesses the Java representation of a single struct field directly, without materializing an `IonValue`. + +There is a binary and a text version for all four benchmark types. See the `PathExtractorBenchmark` class for +more details. + +To execute the benchmarks run `gradle --no-daemon jmh`. This requires an internet connection as it downloads the data set. 
+Results below, higher is better. + +``` +Benchmark Mode Cnt Score Error Units +PathExtractorBenchmark.domBinary thrpt 10 1.128 ± 0.050 ops/s +PathExtractorBenchmark.domText thrpt 10 0.601 ± 0.019 ops/s +PathExtractorBenchmark.fullBinary thrpt 10 1.227 ± 0.014 ops/s +PathExtractorBenchmark.fullText thrpt 10 0.665 ± 0.010 ops/s +PathExtractorBenchmark.partialBinary thrpt 10 14.912 ± 0.271 ops/s +PathExtractorBenchmark.partialBinaryNoDom thrpt 10 15.650 ± 0.297 ops/s +PathExtractorBenchmark.partialText thrpt 10 1.343 ± 0.029 ops/s +PathExtractorBenchmark.partialTextNoDom thrpt 10 1.307 ± 0.015 ops/s ``` +Using the path extractor performs on par with loading the DOM for both text and binary when fully materializing the document and +can give significant performance improvements when partially materializing binary documents. This happens due to Ion's +ability to skip-scan values in the binary format as they are length-prefixed. The gains will be proportional to how +much of the document can be skipped over. + ## Ion Developer information See the developer guide on: http://amzn.github.io/ion-docs/guides/path-extractor-guide.html ## License - This library is licensed under the Apache 2.0 License. 
diff --git a/build.gradle b/build.gradle index 9d0e8f1..3bc2b21 100644 --- a/build.gradle +++ b/build.gradle @@ -18,10 +18,15 @@ buildscript { repositories { mavenCentral() + maven { + url "https://plugins.gradle.org/m2/" + } } dependencies { + classpath "me.champeau.gradle:jmh-gradle-plugin:0.4.7" classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version" + classpath "software.amazon.ion:ion-java:$ionVersion" } } @@ -37,25 +42,25 @@ repositories { mavenCentral() } -dependencies { - compile "software.amazon.ion:ion-java:$ionVersion" - - // using kotlin to make tests less verbose - testCompile "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version" - testCompile "org.jetbrains.kotlin:kotlin-test-junit:$kotlin_version" - - // JUnit 5 - testCompile "org.junit.jupiter:junit-jupiter-api:$junitVersion" - testCompile "org.junit.jupiter:junit-jupiter-params:$junitVersion" - testRuntime "org.junit.jupiter:junit-jupiter-engine:$junitVersion" -} - tasks.withType(org.jetbrains.kotlin.gradle.tasks.KotlinCompile).all { kotlinOptions { jvmTarget = "1.8" } } +apply plugin: "me.champeau.gradle.jmh" +jmh { + fork = 1 + benchmarkMode = ["thrpt"] + failOnError = true + + // warmup + warmupIterations = 5 + + // iterations + iterations = 10 +} + apply plugin: 'checkstyle' checkstyle { toolVersion = "8.12" @@ -70,3 +75,16 @@ tasks.withType(Checkstyle) { html.enabled = true } } + +dependencies { + compile "software.amazon.ion:ion-java:$ionVersion" + + // using kotlin to make tests less verbose + testCompile "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version" + testCompile "org.jetbrains.kotlin:kotlin-test-junit:$kotlin_version" + + // JUnit 5 + testCompile "org.junit.jupiter:junit-jupiter-api:$junitVersion" + testCompile "org.junit.jupiter:junit-jupiter-params:$junitVersion" + testRuntime "org.junit.jupiter:junit-jupiter-engine:$junitVersion" +} diff --git a/src/jmh/java/software/amazon/ionpathextraction/benchmarks/PathExtractorBenchmark.java 
b/src/jmh/java/software/amazon/ionpathextraction/benchmarks/PathExtractorBenchmark.java
new file mode 100644
index 0000000..7ecbb75
--- /dev/null
+++ b/src/jmh/java/software/amazon/ionpathextraction/benchmarks/PathExtractorBenchmark.java
@@ -0,0 +1,296 @@
+/*
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License").
+ * You may not use this file except in compliance with the License.
+ * A copy of the License is located at:
+ *
+ *     http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
+ * language governing permissions and limitations under the License.
+ */
+
+package software.amazon.ionpathextraction.benchmarks;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UncheckedIOException;
+import java.net.URL;
+import java.util.function.Function;
+import java.util.stream.Stream;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import software.amazon.ion.IonReader;
+import software.amazon.ion.IonSystem;
+import software.amazon.ion.IonWriter;
+import software.amazon.ion.system.IonBinaryWriterBuilder;
+import software.amazon.ion.system.IonReaderBuilder;
+import software.amazon.ion.system.IonSystemBuilder;
+import software.amazon.ion.system.IonTextWriterBuilder;
+import software.amazon.ionpathextraction.PathExtractor;
+import software.amazon.ionpathextraction.PathExtractorBuilder;
+
+/**
+ * Benchmarks comparing the PathExtractor with fully materializing the DOM.
+ *
+ * <p>Every benchmark reads one of two in-memory copies of the same data, one written as Ion binary and one as Ion
+ * text. Both copies are created once, when this class is initialized.
+ */
+public class PathExtractorBenchmark {
+    private static final IonSystem DOM_FACTORY = IonSystemBuilder.standard().build();
+
+    /**
+     * Creates a reader over the stream. It is used here for JSON, Ion text and Ion binary input alike.
+     */
+    private static IonReader newReader(final InputStream inputStream) {
+        return IonReaderBuilder.standard().build(inputStream);
+    }
+
+    /**
+     * Creates a writer built by {@link IonBinaryWriterBuilder} that writes to the stream.
+     */
+    private static IonWriter newBinaryWriter(final OutputStream outputStream) {
+        return IonBinaryWriterBuilder.standard().build(outputStream);
+    }
+
+    /**
+     * Creates a writer built by {@link IonTextWriterBuilder} that writes to the stream.
+     */
+    private static IonWriter newTextWriter(final OutputStream outputStream) {
+        return IonTextWriterBuilder.standard().build(outputStream);
+    }
+
+    /** Location of the data set; it is fetched while this class is being initialized. */
+    private static final String DATA_URL = "https://data.nasa.gov/data.json";
+
+    /** The data set as written by the binary writer; assigned by {@link #setupTestData()}. */
+    private static byte[] bytesBinary;
+
+    /** The data set as written by the text writer; assigned by {@link #setupTestData()}. */
+    private static byte[] bytesText;
+
+    /**
+     * Downloads the data set and keeps it in memory, once as Ion binary and once as Ion text.
+     *
+     * @throws IOException if the data cannot be read from {@code DATA_URL} or rewritten.
+     */
+    private static void setupTestData() throws IOException {
+        final URL url = new URL(DATA_URL);
+
+        final ByteArrayOutputStream binaryOut = new ByteArrayOutputStream();
+        try (
+            final InputStream inputStream = url.openStream();
+            final IonReader reader = newReader(inputStream);
+            final IonWriter binaryWriter = newBinaryWriter(binaryOut)
+        ) {
+            binaryWriter.writeValues(reader);
+        }
+
+        bytesBinary = binaryOut.toByteArray();
+
+        // text version. Writes from the binary memory buffer to avoid downloading the data twice
+        final ByteArrayOutputStream textOut = new ByteArrayOutputStream();
+        try (
+            final InputStream inputStream = new ByteArrayInputStream(bytesBinary);
+            final IonReader reader = newReader(inputStream);
+            final IonWriter writer = newTextWriter(textOut)
+        ) {
+            writer.writeValues(reader);
+        }
+
+        bytesText = textOut.toByteArray();
+    }
+
+    // sets up shared test data once.
+    static {
+        try {
+            setupTestData();
+        } catch (IOException e) {
+            // name the URL so that a failed download is easy to diagnose
+            throw new UncheckedIOException("Failed to load benchmark data from " + DATA_URL, e);
+        }
+    }
+
+    /**
+     * Each thread gets a single instance.
+     */
+    @State(Scope.Thread)
+    public static class ThreadState {
+
+        /** Extractor registered with twenty search paths; used by {@code fullBinary} and {@code fullText}. */
+        PathExtractor pathExtractor;
+
+        /** Extractor registered with five search paths; used by {@code partialBinary} and {@code partialText}. */
+        PathExtractor pathExtractorPartial;
+
+        /** Same search paths as {@code pathExtractorPartial}, but its callback reads a string, not a DOM value. */
+        PathExtractor pathExtractorPartialNoDom;
+
+        /**
+         * Builds the three path extractors used by the benchmarks.
+         */
+        @Setup(Level.Trial)
+        public void setup() throws Exception {
+            pathExtractor = makePathExtractor(reader -> {
+                    // reads matches as DOM doing similar work as the DOM loader
+                    DOM_FACTORY.newValue(reader);
+                    return 0;
+                },
+                "(@context)",
+                "(@type)",
+                "(conformsTo)",
+                "(describedBy)",
+                "(dataset * @type)",
+                "(dataset * accessLevel)",
+                "(dataset * accrualPeriodicity)",
+                "(dataset * bureauCode)",
+                "(dataset * contactPoint)",
+                "(dataset * description)",
+                "(dataset * distribution)",
+                "(dataset * identifier)",
+                "(dataset * issued)",
+                "(dataset * keyword)",
+                "(dataset * landingPage)",
+                "(dataset * modified)",
+                "(dataset * programCode)",
+                "(dataset * publisher)",
+                "(dataset * title)",
+                "(dataset * license)"
+            );
+
+            pathExtractorPartial = makePathExtractor(reader -> {
+                    // reads matches as DOM doing similar work as the DOM loader but only for matched values
+                    DOM_FACTORY.newValue(reader);
+                    return 0;
+                },
+                "(@context)",
+                "(@type)",
+                "(conformsTo)",
+                "(describedBy)",
+                "(dataset * accessLevel)"
+            );
+
+            pathExtractorPartialNoDom = makePathExtractor(reader -> {
+                    // reads the value without materializing a DOM object
+                    reader.stringValue(); // all matched paths are strings
+                    return 0;
+                },
+                "(@context)",
+                "(@type)",
+                "(conformsTo)",
+                "(describedBy)",
+                "(dataset * accessLevel)"
+            );
+        }
+
+        /**
+         * Builds a path extractor that registers the same callback for every search path.
+         *
+         * @param callback function called with the {@link IonReader} of each match. All callbacks in this class
+         *     return 0 (NOTE: presumably the number of levels to step out; confirm against the builder docs).
+         * @param searchPaths the search paths to register, as strings.
+         * @return the extractor produced by the builder.
+         */
+        private PathExtractor makePathExtractor(final Function<IonReader, Integer> callback,
+                                                final String... searchPaths) {
+            final PathExtractorBuilder builder = PathExtractorBuilder.standard();
+            Stream.of(searchPaths).forEach(sp -> builder.withSearchPath(sp, callback));
+            return builder.build();
+        }
+    }
+
+    /**
+     * Fully materializes all struct fields as IonValues using a path extractor.
+     */
+    @Benchmark
+    public Object fullBinary(final ThreadState threadState) {
+        // instantiate reader inside benchmark to be more comparable to dom loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesBinary));
+        threadState.pathExtractor.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Text version of {@link #fullBinary(ThreadState)}.
+     */
+    @Benchmark
+    public Object fullText(final ThreadState threadState) {
+        // instantiate reader inside benchmark to be more comparable to dom loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesText));
+        threadState.pathExtractor.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Materializes a single struct field as IonValue using a path extractor.
+     */
+    @Benchmark
+    public Object partialBinary(final ThreadState threadState) {
+        // instantiate reader inside benchmark to be more comparable to dom loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesBinary));
+        threadState.pathExtractorPartial.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Text version of {@link #partialBinary(ThreadState)}.
+     */
+    @Benchmark
+    public Object partialText(final ThreadState threadState) {
+        // instantiate reader inside benchmark to be more comparable to dom loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesText));
+        threadState.pathExtractorPartial.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Reads a single struct field as a Java value directly, without materializing an {@code IonValue}.
+     */
+    @Benchmark
+    public Object partialBinaryNoDom(final ThreadState threadState) {
+        // instantiate reader inside benchmark to be more comparable to dom loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesBinary));
+        threadState.pathExtractorPartialNoDom.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Text version of {@link #partialBinaryNoDom(ThreadState)}.
+     */
+    @Benchmark
+    public Object partialTextNoDom(final ThreadState threadState) {
+        // instantiate reader inside benchmark to be more comparable to dom loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesText));
+        threadState.pathExtractorPartialNoDom.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Fully materializes a DOM for the file using an IonLoader.
+     */
+    @Benchmark
+    public Object domBinary() {
+        return DOM_FACTORY.getLoader().load(bytesBinary);
+    }
+
+    /**
+     * Text version of {@link #domBinary()}.
+     */
+    @Benchmark
+    public Object domText() {
+        return DOM_FACTORY.getLoader().load(bytesText);
+    }
+}