amazon-ion · raganhan · Oct 31, 2018 · Oct 31, 2018 · Oct 31, 2018 · tgregg
diff --git a/README.md b/README.md
@@ -12,15 +12,15 @@ is inexpensive relative to the cost of parsing (and in the case of a DOM, materi
 
 ## Usage 
 Path extractor works in two phases: 
-1. Configuration 
+1. Configuration
 2. Notification  
 
 ### Search Paths
 A `SearchPath` is a path provided to the extractor for matching. It's composed of a list of `PathComponent`s 
 which can be one of: 
-* Wildcard: matches all values
-* Index: match the value at that index 
-* Text: match all values whose field names are equivalent to that text
+* Wildcard: matches all values.
+* Index: match the value at that index.
+* Text: match all values whose field names are equivalent to that text.
 
 Some examples: 
 ```
@@ -78,11 +78,45 @@ final IonReader ionReader = IonReaderBuilder.standard().build("{foo: 1}"
 pathExtractor.match(ionReader);
 
 assertEquals("[1, 2, 20]", list.toString());
+```
+
+## Benchmark 
+
+Some benchmarks comparing the path extractor with fully materializing a DOM are included in this package. All benchmarks
+use the JSON at https://data.nasa.gov/data.json, a publicly available data set from NASA, as their data source.
+
+The `dataset` struct from the original JSON is written as Ion binary and Ion text without any type coercion. The 
+binary file is ~81M and the text file ~95M. There are four benchmark types:
+1. `dom`: fully materializes a DOM for the file using an `IonLoader`. 
+1. `full`: fully materializes all struct fields as `IonValue`s using a path extractor.
+1. `partial`: materializes a single struct field as an `IonValue` using a path extractor.
+1. `partialNoDom`: accesses the Java representation of a single struct field directly, without materializing an `IonValue`.
+
+There is a binary and a text version for all four benchmark types. See the `PathExtractorBenchmark` class for 
+more details.
+
+To execute the benchmarks run `gradle --no-daemon jmh`. This requires an internet connection because it downloads the
+data set. Results are below; higher is better.
+
+```
+Benchmark                                   Mode  Cnt   Score   Error  Units
+PathExtractorBenchmark.domBinary           thrpt   10   1.128 ± 0.050  ops/s
+PathExtractorBenchmark.domText             thrpt   10   0.601 ± 0.019  ops/s
+PathExtractorBenchmark.fullBinary          thrpt   10   1.227 ± 0.014  ops/s
+PathExtractorBenchmark.fullText            thrpt   10   0.665 ± 0.010  ops/s
+PathExtractorBenchmark.partialBinary       thrpt   10  14.912 ± 0.271  ops/s
+PathExtractorBenchmark.partialBinaryNoDom  thrpt   10  15.650 ± 0.297  ops/s
+PathExtractorBenchmark.partialText         thrpt   10   1.343 ± 0.029  ops/s
+PathExtractorBenchmark.partialTextNoDom    thrpt   10   1.307 ± 0.015  ops/s
 ```
 
+When fully materializing the document, the path extractor performs on par with loading a DOM for both text and binary,
+and it can give significant performance improvements when partially materializing binary documents. This is due to
+Ion's ability to skip over values in the binary format, as they are length prefixed. The gains will be proportional to
+how much of the document can be skipped over.
+
 ## Ion Developer information
 See the developer guide on: http://amzn.github.io/ion-docs/guides/path-extractor-guide.html
 
 ## License
-
 This library is licensed under the Apache 2.0 License. 
diff --git a/build.gradle b/build.gradle
@@ -18,10 +18,15 @@ buildscript {
 
     repositories {
         mavenCentral()
+        maven {
+            url "https://plugins.gradle.org/m2/"
+        }
     }
 
     dependencies {
+        classpath "me.champeau.gradle:jmh-gradle-plugin:0.4.7"
         classpath "org.jetbrains.kotlin:kotlin-gradle-plugin:$kotlin_version"
+        classpath "software.amazon.ion:ion-java:$ionVersion"
     }
 }
 
@@ -37,25 +42,25 @@ repositories {
     mavenCentral()
 }
 
-dependencies {
-    compile "software.amazon.ion:ion-java:$ionVersion"
-
-    // using kotlin to make tests less verbose
-    testCompile "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version"
-    testCompile "org.jetbrains.kotlin:kotlin-test-junit:$kotlin_version"
-
-    // JUnit 5
-    testCompile "org.junit.jupiter:junit-jupiter-api:$junitVersion"
-    testCompile "org.junit.jupiter:junit-jupiter-params:$junitVersion"
-    testRuntime "org.junit.jupiter:junit-jupiter-engine:$junitVersion"
-}
-
 tasks.withType(org.jetbrains.kotlin.gradle.tasks.KotlinCompile).all {
     kotlinOptions {
         jvmTarget = "1.8"
     }
 }
 
+apply plugin: "me.champeau.gradle.jmh"
+jmh {
+    fork = 1 // number of forked JVMs used per benchmark
+    benchmarkMode = ["thrpt"] // throughput mode; results are reported in ops/s
+    failOnError = true
+
+    // warmup iterations per benchmark, run before the measured iterations
+    warmupIterations = 5
+
+    // measured iterations per benchmark
+    iterations = 10
+}
+
 apply plugin: 'checkstyle'
 checkstyle {
     toolVersion = "8.12"
@@ -70,3 +75,16 @@ tasks.withType(Checkstyle) {
         html.enabled = true
     }
 }
+
+dependencies {
+    compile "software.amazon.ion:ion-java:$ionVersion"
+
+    // Kotlin is used in tests only, to make them less verbose
+    testCompile "org.jetbrains.kotlin:kotlin-stdlib-jdk8:$kotlin_version"
+    testCompile "org.jetbrains.kotlin:kotlin-test-junit:$kotlin_version"
+
+    // JUnit 5: API and params are needed to compile tests, the engine only to run them
+    testCompile "org.junit.jupiter:junit-jupiter-api:$junitVersion"
+    testCompile "org.junit.jupiter:junit-jupiter-params:$junitVersion"
+    testRuntime "org.junit.jupiter:junit-jupiter-engine:$junitVersion"
+}
diff --git a/src/jmh/java/software/amazon/ionpathextraction/benchmarks/PathExtractorBenchmark.java b/src/jmh/java/software/amazon/ionpathextraction/benchmarks/PathExtractorBenchmark.java
@@ -0,0 +1,256 @@
+/*
+ * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+ * Licensed under the Apache License, Version 2.0 (the "License").
+ * You may not use this file except in compliance with the License.
+ * A copy of the License is located at:
+ *
+ *     http://aws.amazon.com/apache2.0/
+ *
+ * or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
+ * language governing permissions and limitations under the License.
+ */
+
+package software.amazon.ionpathextraction.benchmarks;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.net.URL;
+import java.util.function.Function;
+import java.util.stream.Stream;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import software.amazon.ion.IonReader;
+import software.amazon.ion.IonSystem;
+import software.amazon.ion.IonWriter;
+import software.amazon.ion.system.IonBinaryWriterBuilder;
+import software.amazon.ion.system.IonReaderBuilder;
+import software.amazon.ion.system.IonSystemBuilder;
+import software.amazon.ion.system.IonTextWriterBuilder;
+import software.amazon.ionpathextraction.PathExtractor;
+import software.amazon.ionpathextraction.PathExtractorBuilder;
+
+/**
+ * JMH benchmarks comparing {@link PathExtractor} matching with fully materializing a DOM through an IonLoader.
+ */
+public class PathExtractorBenchmark {
+    private static final IonSystem DOM_FACTORY = IonSystemBuilder.standard().build();
+
+    private static IonReader newReader(final InputStream inputStream) {
+        return IonReaderBuilder.standard().build(inputStream);
+    }
+
+    private static IonWriter newBinaryWriter(final OutputStream outputStream) {
+        return IonBinaryWriterBuilder.standard().build(outputStream);
+    }
+
+    private static IonWriter newTextWriter(final OutputStream outputStream) {
+        return IonTextWriterBuilder.standard().build(outputStream);
+    }
+
+    private static final String DATA_URL = "https://data.nasa.gov/data.json";
+    private static byte[] bytesBinary; // Ion binary encoding of the data downloaded from DATA_URL
+    private static byte[] bytesText; // Ion text encoding of the same data
+
+    private static void setupTestData() throws IOException {
+        final URL url = new URL(DATA_URL);
+
+        final ByteArrayOutputStream binaryOut = new ByteArrayOutputStream();
+        try (
+            final InputStream inputStream = url.openStream();
+            final IonReader reader = newReader(inputStream);
+            final IonWriter binaryWriter = newBinaryWriter(binaryOut)
+        ) {
+            binaryWriter.writeValues(reader);
+        }
+
+        bytesBinary = binaryOut.toByteArray();
+
+        // Text version: re-encoded from the in-memory binary buffer to avoid downloading the data twice.
+        final ByteArrayOutputStream textOut = new ByteArrayOutputStream();
+        try (
+            final InputStream inputStream = new ByteArrayInputStream(bytesBinary);
+            final IonReader reader = newReader(inputStream);
+            final IonWriter writer = newTextWriter(textOut)
+        ) {
+            writer.writeValues(reader);
+        }
+
+        bytesText = textOut.toByteArray();
+    }
+
+    // Downloads and encodes the shared test data once, during class initialization.
+    static {
+        try {
+            setupTestData();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /**
+     * Per-thread benchmark state: the three path extractors, built once per trial by {@link #setup()}.
+     */
+    @State(Scope.Thread)
+    public static class ThreadState {
+
+        PathExtractor pathExtractor;
+        PathExtractor pathExtractorPartial;
+        PathExtractor pathExtractorPartialNoDom;
+
+        @Setup(Level.Trial)
+        public void setup() throws Exception {
+            pathExtractor = makePathExtractor(reader -> {
+                    // materializes each match as an IonValue, doing work similar to the DOM loader
+                    DOM_FACTORY.newValue(reader);
+                    return 0; // presumably "step out of 0 containers"; confirm against PathExtractorBuilder docs
+                },
+                "(@context)",
+                "(@type)",
+                "(conformsTo)",
+                "(describedBy)",
+                "(dataset * @type)",
+                "(dataset * accessLevel)",
+                "(dataset * accrualPeriodicity)",
+                "(dataset * bureauCode)",
+                "(dataset * contactPoint)",
+                "(dataset * description)",
+                "(dataset * distribution)",
+                "(dataset * identifier)",
+                "(dataset * issued)",
+                "(dataset * keyword)",
+                "(dataset * landingPage)",
+                "(dataset * modified)",
+                "(dataset * programCode)",
+                "(dataset * publisher)",
+                "(dataset * title)",
+                "(dataset * license)"
+            );
+
+            pathExtractorPartial = makePathExtractor(reader -> {
+                    // materializes matches as IonValues like the DOM loader, but only for the five paths below
+                    DOM_FACTORY.newValue(reader);
+                    return 0;
+                },
+                "(@context)",
+                "(@type)",
+                "(conformsTo)",
+                "(describedBy)",
+                "(dataset * accessLevel)"
+            );
+
+            pathExtractorPartialNoDom = makePathExtractor(reader -> {
+                    // reads the matched value straight from the reader, without materializing a DOM object
+                    reader.stringValue(); // all matched paths are strings
+                    return 0;
+                },
+                "(@context)",
+                "(@type)",
+                "(conformsTo)",
+                "(describedBy)",
+                "(dataset * accessLevel)"
+            );
+        }
+
+        private PathExtractor makePathExtractor(final Function<IonReader, Integer> callback,
+                                                final String... searchPaths) {
+            final PathExtractorBuilder builder = PathExtractorBuilder.standard();
+            Stream.of(searchPaths).forEach(sp -> builder.withSearchPath(sp, callback));
+            return builder.build();
+        }
+    }
+
+    /**
+     * Fully materializes all struct fields as IonValues using a path extractor.
+     */
+    @Benchmark
+    public Object fullBinary(final ThreadState threadState) {
+        // create the reader inside the benchmark so the measurement is more comparable to DOM loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesBinary));
+        threadState.pathExtractor.match(reader);
+
+        return reader; // NOTE(review): reader is never closed (same in sibling benchmarks); confirm intended
+    }
+
+    /**
+     * Text version of {@link #fullBinary(ThreadState)}.
+     */
+    @Benchmark
+    public Object fullText(final ThreadState threadState) {
+        // create the reader inside the benchmark so the measurement is more comparable to DOM loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesText));
+        threadState.pathExtractor.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Materializes only a few struct fields (five search paths) as IonValues using a path extractor.
+     */
+    @Benchmark
+    public Object partialBinary(final ThreadState threadState) {
+        // create the reader inside the benchmark so the measurement is more comparable to DOM loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesBinary));
+        threadState.pathExtractorPartial.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Text version of {@link #partialBinary(ThreadState)}.
+     */
+    @Benchmark
+    public Object partialText(final ThreadState threadState) {
+        // create the reader inside the benchmark so the measurement is more comparable to DOM loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesText));
+        threadState.pathExtractorPartial.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Reads the Java representation of the matched fields directly, without materializing an {@code IonValue}.
+     */
+    @Benchmark
+    public Object partialBinaryNoDom(final ThreadState threadState) {
+        // create the reader inside the benchmark so the measurement is more comparable to DOM loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesBinary));
+        threadState.pathExtractorPartialNoDom.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Text version of {@link #partialBinaryNoDom(ThreadState)}.
+     */
+    @Benchmark
+    public Object partialTextNoDom(final ThreadState threadState) {
+        // create the reader inside the benchmark so the measurement is more comparable to DOM loading
+        IonReader reader = newReader(new ByteArrayInputStream(bytesText));
+        threadState.pathExtractorPartialNoDom.match(reader);
+
+        return reader;
+    }
+
+    /**
+     * Fully materializes a DOM for the file using an IonLoader.
+     */
+    @Benchmark
+    public Object domBinary() {
+        return DOM_FACTORY.getLoader().load(bytesBinary);
+    }
+
+    /**
+     * Text version of {@link #domBinary()}.
+     */
+    @Benchmark
+    public Object domText() {
+        return DOM_FACTORY.getLoader().load(bytesText);
+    }
+}