From e1006342b4445e528ceedd0b2c51f31f175282eb Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 24 Jan 2024 12:08:31 -0800 Subject: [PATCH 1/3] initial commit --- .../apache/datasketches/kll/KllDoublesHelper.java | 1 - .../apache/datasketches/kll/KllFloatsHelper.java | 5 ++--- .../apache/datasketches/kll/KllItemsHelper.java | 14 ++++++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java index 741092337..15e6c52f7 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java @@ -37,7 +37,6 @@ * @author Kevin Lang * @author Alexander Saydakov */ -// final class KllDoublesHelper { /** diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java index 8c35fc66a..efdb4e4b2 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java @@ -37,7 +37,6 @@ * @author Kevin Lang * @author Alexander Saydakov */ -// final class KllFloatsHelper { /** @@ -485,10 +484,10 @@ private static void populateFloatWorkArrays( if (selfPop == 0 && otherPop == 0) { continue; } if (selfPop > 0 && otherPop == 0) { System.arraycopy(myCurFloatItemsArr, myCurLevelsArr[lvl], workbuf, worklevels[lvl], selfPop); - } + } else if (selfPop == 0 && otherPop > 0) { System.arraycopy(otherFloatItemsArr, otherLevelsArr[lvl], workbuf, worklevels[lvl], otherPop); - } + } else if (selfPop > 0 && otherPop > 0) { mergeSortedFloatArrays( myCurFloatItemsArr, myCurLevelsArr[lvl], selfPop, diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java b/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java index 502d43278..c3144a8eb 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java @@ -39,6 +39,20 @@ @SuppressWarnings("unchecked") final class KllItemsHelper { + /** + * Create Items Array from given item and weight. + * Used with weighted update only. + * @param item the given item + * @param weight the given weight + * @return the Items Array. + */ + static T[] createItemsArray(final T item, final int weight) { + final int itemsArrLen = Integer.bitCount(weight); + final Object[] itemsArr = new Object[itemsArrLen]; + Arrays.fill(itemsArr, item); + return (T[]) itemsArr; + } + /** * The following code is only valid in the special case of exactly reaching capacity while updating. * It cannot be used while merging, while reducing k, or anything else. From 84c9cb216f33cae5f36f41c5f6c6b50cf42c9189 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 24 Jan 2024 18:39:49 -0800 Subject: [PATCH 2/3] Implemented weighted updates for KllItemsSketch. Made a few very minor adjustments to some of the parallel floats and doubles classes. --- .../common/ArrayOfItemsSerDe.java | 2 +- .../datasketches/kll/KllDoublesHelper.java | 6 +- .../datasketches/kll/KllFloatsHelper.java | 6 +- .../kll/KllHeapDoublesSketch.java | 2 - .../datasketches/kll/KllHeapFloatsSketch.java | 2 - .../datasketches/kll/KllHeapItemsSketch.java | 50 ++++++-- .../datasketches/kll/KllItemsHelper.java | 80 +++++++----- .../datasketches/kll/KllItemsSketch.java | 23 +++- .../kll/KllItemsSketchSortedView.java | 2 +- .../datasketches/kll/KllMiscDoublesTest.java | 2 +- .../datasketches/kll/KllMiscFloatsTest.java | 4 +- .../datasketches/kll/KllMiscItemsTest.java | 120 +++++++++++++++++- 12 files changed, 239 insertions(+), 60 deletions(-) diff --git a/src/main/java/org/apache/datasketches/common/ArrayOfItemsSerDe.java b/src/main/java/org/apache/datasketches/common/ArrayOfItemsSerDe.java index 0f6422c01..fdd14bd96 100644 --- a/src/main/java/org/apache/datasketches/common/ArrayOfItemsSerDe.java +++ b/src/main/java/org/apache/datasketches/common/ArrayOfItemsSerDe.java @@ -111,5 +111,5 @@ public int sizeOf(final T[] items) { * Returns the concrete class of type T * @return the concrete class of type T */ - public abstract Class getClassOfT(); + public abstract Class getClassOfT(); } diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java index 15e6c52f7..42c9e38b7 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java @@ -31,6 +31,7 @@ import org.apache.datasketches.memory.WritableMemory; +// // /** * Static methods to support KllDoublesSketch @@ -337,6 +338,7 @@ static void updateDouble(final KllDoublesSketch dblSk, final double item, final } else { dblSk.updateMinMax(item); final KllHeapDoublesSketch tmpSk = new KllHeapDoublesSketch(dblSk.getK(), DEFAULT_M, item, weight); + dblSk.merge(tmpSk); } } @@ -470,7 +472,7 @@ private static void populateDoubleWorkArrays( //workBuf and workLevels are modif workLevels[0] = 0; - // Note: the level zero data from "other" was already inserted into "self", + // Note: the level zero data from "other" was already inserted into "self". // This copies into workbuf. final int selfPopZero = KllHelper.currentLevelSizeItems(0, myCurNumLevels, myCurLevelsArr); System.arraycopy(myCurDoubleItemsArr, myCurLevelsArr[0], workBuf, workLevels[0], selfPopZero); @@ -480,7 +482,7 @@ private static void populateDoubleWorkArrays( //workBuf and workLevels are modif final int selfPop = KllHelper.currentLevelSizeItems(lvl, myCurNumLevels, myCurLevelsArr); final int otherPop = KllHelper.currentLevelSizeItems(lvl, otherNumLevels, otherLevelsArr); workLevels[lvl + 1] = workLevels[lvl] + selfPop + otherPop; - + assert selfPop >= 0 && otherPop >= 0; if (selfPop == 0 && otherPop == 0) { continue; } else if (selfPop > 0 && otherPop == 0) { System.arraycopy(myCurDoubleItemsArr, myCurLevelsArr[lvl], workBuf, workLevels[lvl], selfPop); diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java index efdb4e4b2..ee41f8398 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java @@ -31,6 +31,7 @@ import org.apache.datasketches.memory.WritableMemory; +// // /** * Static methods to support KllFloatsSketch @@ -337,6 +338,7 @@ static void updateFloat(final KllFloatsSketch fltSk, final float item, final int } else { fltSk.updateMinMax(item); final KllHeapFloatsSketch tmpSk = new KllHeapFloatsSketch(fltSk.getK(), DEFAULT_M, item, weight); + fltSk.merge(tmpSk); } } @@ -470,7 +472,7 @@ private static void populateFloatWorkArrays( worklevels[0] = 0; - // Note: the level zero data from "other" was already inserted into "self" + // Note: the level zero data from "other" was already inserted into "self". // This copies into workbuf. final int selfPopZero = KllHelper.currentLevelSizeItems(0, myCurNumLevels, myCurLevelsArr); System.arraycopy( myCurFloatItemsArr, myCurLevelsArr[0], workbuf, worklevels[0], selfPopZero); @@ -480,7 +482,7 @@ private static void populateFloatWorkArrays( final int selfPop = KllHelper.currentLevelSizeItems(lvl, myCurNumLevels, myCurLevelsArr); final int otherPop = KllHelper.currentLevelSizeItems(lvl, otherNumLevels, otherLevelsArr); worklevels[lvl + 1] = worklevels[lvl] + selfPop + otherPop; - + assert selfPop >= 0 && otherPop >= 0; if (selfPop == 0 && otherPop == 0) { continue; } if (selfPop > 0 && otherPop == 0) { System.arraycopy(myCurFloatItemsArr, myCurLevelsArr[lvl], workbuf, worklevels[lvl], selfPop); diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java index c83bd8d19..c6553d89e 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java @@ -59,8 +59,6 @@ final class KllHeapDoublesSketch extends KllDoublesSketch { * * @param k parameter that controls size of the sketch and accuracy of estimates. * k can be between m and 65535, inclusive. - * The default k = 200 results in a normalized rank error of about 1.65%. - * Larger k will have smaller error but the sketch will be larger (and slower). * @param m parameter controls the minimum level width in items. It can be 2, 4, 6 or 8. * The DEFAULT_M, which is 8 is recommended. Other sizes of m should be considered * experimental as they have not been as well characterized. diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java index 4388354e2..479a28f2c 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java @@ -59,8 +59,6 @@ final class KllHeapFloatsSketch extends KllFloatsSketch { * * @param k parameter that controls size of the sketch and accuracy of estimates. * k can be between m and 65535, inclusive. - * The default k = 200 results in a normalized rank error of about 1.65%. - * Larger k will have smaller error but the sketch will be larger (and slower). * @param m parameter controls the minimum level width in items. It can be 2, 4, 6 or 8. * The DEFAULT_M, which is 8 is recommended. Other sizes of m should be considered * experimental as they have not been as well characterized. diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java index 3ed776f25..eb7c853b1 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java @@ -25,6 +25,7 @@ import static org.apache.datasketches.kll.KllSketch.SketchStructure.COMPACT_EMPTY; import static org.apache.datasketches.kll.KllSketch.SketchStructure.COMPACT_FULL; import static org.apache.datasketches.kll.KllSketch.SketchStructure.COMPACT_SINGLE; +import static org.apache.datasketches.kll.KllSketch.SketchStructure.UPDATABLE; import java.lang.reflect.Array; import java.util.Comparator; @@ -34,6 +35,14 @@ import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.WritableMemory; +/** + * This class implements an on-heap doubles KllSketch. + * + *

Please refer to the documentation in the package-info:
+ * {@link org.apache.datasketches.kll}

+ * + * @author Lee Rhodes, Kevin Lang + */ @SuppressWarnings("unchecked") final class KllHeapItemsSketch extends KllItemsSketch { private final int k; // configured size of K. @@ -46,14 +55,17 @@ final class KllHeapItemsSketch extends KllItemsSketch { private Object[] itemsArr; /** - * Constructs a new empty instance of this sketch on the Java heap. + * New instance heap constructor. + * @param k parameter that controls size of the sketch and accuracy of estimates. + * @param m parameter controls the minimum level width in items. It can be 2, 4, 6 or 8. + * The DEFAULT_M, which is 8 is recommended. Other sizes of m should be considered + * experimental as they have not been as well characterized. + * @param comparator user specified comparator of type T. + * @param serDe serialization / deserialization class */ - KllHeapItemsSketch( - final int k, - final int m, - final Comparator comparator, + KllHeapItemsSketch(final int k, final int m, final Comparator comparator, final ArrayOfItemsSerDe serDe) { - super(SketchStructure.UPDATABLE, comparator, serDe); + super(UPDATABLE, comparator, serDe); KllHelper.checkM(m); KllHelper.checkK(k, m); this.levelsArr = new int[] {k, k}; @@ -69,11 +81,27 @@ final class KllHeapItemsSketch extends KllItemsSketch { } /** - * The Heapify constructor, which constructs an image of this sketch from - * a Memory (or WritableMemory) object that was created by this sketch - * and has a type T consistent with the given comparator and serDe. - * Once the data from the given Memory has been transferred into this heap sketch, - * the reference to the Memory object is no longer retained. + * Used for creating a temporary sketch for use with weighted updates. + */ + KllHeapItemsSketch(final int k, final int m, final T item, final int weight, final Comparator comparator, + final ArrayOfItemsSerDe serDe) { + super(UPDATABLE, comparator, serDe); + KllHelper.checkM(m); + KllHelper.checkK(k, m); + this.levelsArr = KllHelper.createLevelsArray(weight); + this.readOnly = false; + this.k = k; + this.m = m; + this.n = weight; + this.minK = k; + this.isLevelZeroSorted = false; + this.minItem = item; + this.maxItem = item; + this.itemsArr = KllItemsHelper.createItemsArray(serDe.getClassOfT(), item, weight); + } + + /** + * The Heapify constructor * @param srcMem the Source Memory image that contains data. * @param comparator the comparator for this sketch and given Memory. * @param serDe the serializer / deserializer for this sketch and the given Memory. diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java b/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java index c3144a8eb..fcac3258e 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java @@ -21,9 +21,11 @@ import static java.lang.Math.max; import static java.lang.Math.min; +import static java.lang.reflect.Array.newInstance; import static org.apache.datasketches.common.Util.isEven; import static org.apache.datasketches.common.Util.isOdd; import static org.apache.datasketches.kll.KllHelper.findLevelToCompact; +import static org.apache.datasketches.kll.KllSketch.DEFAULT_M; import java.util.Arrays; import java.util.Comparator; @@ -37,7 +39,7 @@ * @author Lee Rhodes */ @SuppressWarnings("unchecked") -final class KllItemsHelper { +final class KllItemsHelper { /** * Create Items Array from given item and weight. @@ -46,11 +48,11 @@ final class KllItemsHelper { * @param weight the given weight * @return the Items Array. */ - static T[] createItemsArray(final T item, final int weight) { + static T[] createItemsArray(final Class clazz, final T item, final int weight) { final int itemsArrLen = Integer.bitCount(weight); - final Object[] itemsArr = new Object[itemsArrLen]; + final T[] itemsArr = (T[])newInstance(clazz, itemsArrLen); Arrays.fill(itemsArr, item); - return (T[]) itemsArr; + return itemsArr; } /** @@ -140,12 +142,12 @@ static void mergeItemImpl(final KllItemsSketch mySketch, //MERGE: update this sketch with level0 items from the other sketch if (otherItmSk.isCompactSingleItem()) { - updateItem(mySketch, otherItmSk.getSingleItem(), comp); + updateItem(mySketch, otherItmSk.getSingleItem()); otherItemsArr = new Object[0]; } else { otherItemsArr = otherItmSk.getTotalItemsArray(); for (int i = otherLevelsArr[0]; i < otherLevelsArr[1]; i++) { - updateItem(mySketch, otherItemsArr[i], comp); + updateItem(mySketch, otherItemsArr[i]); } } @@ -164,12 +166,13 @@ static void mergeItemImpl(final KllItemsSketch mySketch, final int tmpSpaceNeeded = mySketch.getNumRetained() + KllHelper.getNumRetainedAboveLevelZero(otherNumLevels, otherLevelsArr); final Object[] workbuf = new Object[tmpSpaceNeeded]; - final int ub = KllHelper.ubOnNumLevels(finalN); - final int[] worklevels = new int[ub + 2]; // ub+1 does not work - final int[] outlevels = new int[ub + 2]; final int provisionalNumLevels = max(myCurNumLevels, otherNumLevels); + final int ub = max(KllHelper.ubOnNumLevels(finalN), provisionalNumLevels); + final int[] worklevels = new int[ub + 2]; // ub+1 does not work + final int[] outlevels = new int[ub + 2]; + populateItemWorkArrays(workbuf, worklevels, provisionalNumLevels, myCurNumLevels, myCurLevelsArr, myCurItemsArr, otherNumLevels, otherLevelsArr, otherItemsArr, comp); @@ -209,7 +212,11 @@ static void mergeItemImpl(final KllItemsSketch mySketch, //MEMORY SPACE MANAGEMENT //not used - } + //extra spaces to make comparison with other helpers easier + // + // + // + } //end of updating levels above level 0 //Update Preamble: mySketch.setN(finalN); @@ -235,7 +242,7 @@ static void mergeItemImpl(final KllItemsSketch mySketch, assert KllHelper.sumTheSampleWeights(mySketch.getNumLevels(), mySketch.levelsArr) == mySketch.getN(); } - private static void mergeSortedItemsArrays( + private static void mergeSortedItemsArrays( //only bufC is modified final Object[] bufA, final int startA, final int lenA, final Object[] bufB, final int startB, final int lenB, final Object[] bufC, final int startC, final Comparator comp) { @@ -309,30 +316,34 @@ private static void randomlyHalveUpItems(final Object[] buf, final int start, fi } //Called from KllItemsSketch::update and this - static void updateItem(final KllItemsSketch itmSk, - final Object item, final Comparator comp) { - if (item == null) { return; } //ignore - if (itmSk.isEmpty()) { - itmSk.setMinItem(item); - itmSk.setMaxItem(item); - } else { - itmSk.setMinItem(Util.minT(itmSk.getMinItem(), item, comp)); - itmSk.setMaxItem(Util.maxT(itmSk.getMaxItem(), item, comp)); - } - int level0space = itmSk.levelsArr[0]; - assert level0space >= 0; - if (level0space == 0) { + static void updateItem(final KllItemsSketch itmSk, final Object item) { + itmSk.updateMinMax((T)item); + int freeSpace = itmSk.levelsArr[0]; + assert freeSpace >= 0; + if (freeSpace == 0) { compressWhileUpdatingSketch(itmSk); - level0space = itmSk.levelsArr[0]; - assert (level0space > 0); + freeSpace = itmSk.levelsArr[0]; + assert (freeSpace > 0); } itmSk.incN(); itmSk.setLevelZeroSorted(false); - final int nextPos = level0space - 1; + final int nextPos = freeSpace - 1; itmSk.setLevelsArrayAt(0, nextPos); itmSk.setItemsArrayAt(nextPos, item); } + //Called from KllItemsSketch::update with weight + static void updateItem(final KllItemsSketch itmSk, final T item, final int weight) { + if (weight < itmSk.levelsArr[0]) { + for (int i = 0; i < weight; i++) { updateItem(itmSk, item); } + } else { + itmSk.updateMinMax(item); + final KllHeapItemsSketch tmpSk = + new KllHeapItemsSketch<>(itmSk.getK(), DEFAULT_M, item, weight, itmSk.comparator, itmSk.serDe); + itmSk.merge(tmpSk); + } + } + /** * Compression algorithm used to merge higher levels. *

Here is what we do for each level:

@@ -462,7 +473,8 @@ private static void populateItemWorkArrays( final Comparator comp) { worklevels[0] = 0; - // Note: the level zero data from "other" was already inserted into "self" + // Note: the level zero data from "other" was already inserted into "self". + // This copies into workbuf. final int selfPopZero = KllHelper.currentLevelSizeItems(0, myCurNumLevels, myCurLevelsArr); System.arraycopy( myCurItemsArr, myCurLevelsArr[0], workbuf, worklevels[0], selfPopZero); worklevels[1] = worklevels[0] + selfPopZero; @@ -471,12 +483,15 @@ private static void populateItemWorkArrays( final int selfPop = KllHelper.currentLevelSizeItems(lvl, myCurNumLevels, myCurLevelsArr); final int otherPop = KllHelper.currentLevelSizeItems(lvl, otherNumLevels, otherLevelsArr); worklevels[lvl + 1] = worklevels[lvl] + selfPop + otherPop; - - if (selfPop > 0 && otherPop == 0) { + assert selfPop >= 0 && otherPop >= 0; + if (selfPop == 0 && otherPop == 0) { continue; } + else if (selfPop > 0 && otherPop == 0) { System.arraycopy(myCurItemsArr, myCurLevelsArr[lvl], workbuf, worklevels[lvl], selfPop); - } else if (selfPop == 0 && otherPop > 0) { + } + else if (selfPop == 0 && otherPop > 0) { System.arraycopy(otherItemsArr, otherLevelsArr[lvl], workbuf, worklevels[lvl], otherPop); - } else if (selfPop > 0 && otherPop > 0) { + } + else if (selfPop > 0 && otherPop > 0) { mergeSortedItemsArrays( myCurItemsArr, myCurLevelsArr[lvl], selfPop, otherItemsArr, otherLevelsArr[lvl], otherPop, @@ -500,4 +515,3 @@ private static void populateItemWorkArrays( // } } - diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java index 76989cdaa..9580a8e12 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java @@ -30,6 +30,7 @@ import org.apache.datasketches.common.ArrayOfItemsSerDe; import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.common.Util; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.MemoryRequestServer; import org.apache.datasketches.memory.WritableMemory; @@ -288,8 +289,18 @@ public String toString(final boolean withLevels, final boolean withLevelsAndItem @Override public void update(final T item) { + if (item == null) { return; } //ignore if (readOnly) { throw new SketchesArgumentException(TGT_IS_READ_ONLY_MSG); } - KllItemsHelper.updateItem(this, item, comparator); + KllItemsHelper.updateItem(this, item); + kllItemsSV = null; + } + + public void update(final T item, final int weight) { + if (item == null) { return; } //ignore + if (readOnly) { throw new SketchesArgumentException(TGT_IS_READ_ONLY_MSG); } + if (weight < 1) { throw new SketchesArgumentException("Weight is less than one."); } + if (weight == 1) { KllItemsHelper.updateItem(this, item); } + else { KllItemsHelper.updateItem(this, item, weight); } kllItemsSV = null; } @@ -374,4 +385,14 @@ void setWritableMemory(final WritableMemory wmem) { throw new SketchesArgumentException(UNSUPPORTED_MSG + "Sketch not writable."); } + void updateMinMax(final T item) { + if (isEmpty()) { + setMinItem(item); + setMaxItem(item); + } else { + setMinItem(Util.minT(getMinItem(), item, comparator)); + setMaxItem(Util.maxT(getMaxItem(), item, comparator)); + } + } + } diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java index ee1278826..27f97e17f 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketchSortedView.java @@ -95,7 +95,7 @@ public class KllItemsSketchSortedView implements GenericSortedView, Partit this.comparator = sketch.comparator; this.maxItem = sketch.getMaxItem(); this.minItem = sketch.getMinItem(); - this.clazz = (Class)sketch.serDe.getClassOfT(); + this.clazz = sketch.serDe.getClassOfT(); if (totalN == 0) { throw new SketchesArgumentException(EMPTY_MSG); } if (!sketch.isLevelZeroSorted()) { diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java index c331c02af..41f193256 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscDoublesTest.java @@ -62,7 +62,7 @@ public void checkSortedViewConstruction() { } } - @Test + @Test //set static enablePrinting = true for visual checking public void checkBounds() { final KllDoublesSketch kll = KllDoublesSketch.newHeapInstance(); // default k = 200 for (int i = 0; i < 1000; i++) { diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscFloatsTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscFloatsTest.java index 202f2c794..e7ba7d190 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscFloatsTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscFloatsTest.java @@ -62,7 +62,7 @@ public void checkSortedViewConstruction() { } } - @Test + @Test //set static enablePrinting = true for visual checking public void checkBounds() { final KllFloatsSketch kll = KllFloatsSketch.newHeapInstance(); //default k = 200 for (int i = 0; i < 1000; i++) { @@ -398,7 +398,7 @@ public void checkIntCapAuxAux() { assertEquals(result, KllHelper.intCapAuxAux(k, i)); } } - + @Test public void checkGrowLevels() { KllFloatsSketch sk = KllFloatsSketch.newHeapInstance(20); diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java index 67db954a8..f37b69506 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java @@ -19,7 +19,10 @@ package org.apache.datasketches.kll; +import static org.apache.datasketches.common.Util.bitAt; +import static org.apache.datasketches.kll.KllHelper.getGrowthSchemeForGivenN; import static org.apache.datasketches.kll.KllSketch.SketchType.ITEMS_SKETCH; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; @@ -31,11 +34,19 @@ import org.apache.datasketches.common.ArrayOfStringsSerDe; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.common.Util; +import org.apache.datasketches.kll.KllSketch.SketchType; import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.WritableMemory; +import org.apache.datasketches.quantilescommon.DoublesSortedViewIterator; +import org.apache.datasketches.quantilescommon.GenericSortedView; +import org.apache.datasketches.quantilescommon.GenericSortedViewIterator; import org.testng.annotations.Test; +/** + * @author Lee Rhodes + */ +@SuppressWarnings("unused") public class KllMiscItemsTest { static final String LS = System.getProperty("line.separator"); public ArrayOfStringsSerDe serDe = new ArrayOfStringsSerDe(); @@ -178,6 +189,8 @@ public void visualCheckToString() { public void viewHeapCompactions() { int k = 20; int n = 108; + boolean withLevels = false; + boolean withLevelsAndItems = true; int digits = Util.numDigits(n); int compaction = 0; KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), serDe); @@ -192,6 +205,9 @@ public void viewHeapCompactions() { assertEquals(sk.getTotalItemsArray()[sk.levelsArr[0]], Util.longToFixedLengthString(i, digits)); } } + println(LS + "#<<< END STATE # >>>"); + println(sk.toString(withLevels, withLevelsAndItems)); + println(""); } @Test //set static enablePrinting = true for visual checking @@ -205,13 +221,64 @@ public void viewCompactionAndSortedView() { GenericSortedViewIterator itr = sv.iterator(); println("### SORTED VIEW"); printf("%12s%12s\n", "Value", "CumWeight"); + long[] correct = {2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}; + int i = 0; while (itr.next()) { String v = itr.getQuantile(); long wt = itr.getWeight(); printf("%12s%12d\n", v, wt); + assertEquals(wt, correct[i++]); } } + @Test //set static enablePrinting = true for visual checking + public void checkWeightedUpdates1() { + int k = 20; + int weight = 127; + String item = "10"; + KllItemsSketch sk = KllItemsSketch.newHeapInstance(20, Comparator.naturalOrder(), serDe); + println(sk.toString(true, true)); + sk.update(item, weight); + println(sk.toString(true, true)); + assertEquals(sk.getNumRetained(), 7); + assertEquals(sk.getN(), weight); + sk.update(item, weight); + println(sk.toString(true, true)); + assertEquals(sk.getNumRetained(), 14); + assertEquals(sk.getN(), 254); + } + + @Test //set static enablePrinting = true for visual checking + public void checkWeightedUpdates2() { + int k = 20; + int initial = 1000; + final int digits = 4; + int weight = 127; + String item = " 10"; + KllItemsSketch sk = KllItemsSketch.newHeapInstance(20, Comparator.naturalOrder(), serDe); + for (int i = 1; i <= initial; i++) { sk.update(Util.longToFixedLengthString(i + 1000, digits)); } + println(sk.toString(true, true)); + sk.update(item, weight); + println(sk.toString(true, true)); + assertEquals(sk.getNumRetained(), 65); + assertEquals(sk.getN(), 1127); + + GenericSortedViewIterator itr = sk.getSortedView().iterator(); + println("### SORTED VIEW"); + printf("%12s %12s %12s\n", "Value", "Weight", "NaturalRank"); + long cumWt = 0; + while (itr.next()) { + String v = itr.getQuantile(); + long wt = itr.getWeight(); + long natRank = itr.getNaturalRank(INCLUSIVE); + cumWt += wt; + assertEquals(cumWt, natRank); + printf("%12s %12d %12d\n", v, wt, natRank); + } + assertEquals(cumWt, sk.getN()); + } + + @Test public void checkGrowLevels() { KllItemsSketch sk = KllItemsSketch.newHeapInstance(20, Comparator.naturalOrder(), serDe); @@ -416,8 +483,57 @@ public void checkMemoryToStringItemsCompact() { assertEquals(compBytes, compBytes2); } - // public void checkMemoryToStringFloatUpdatable() Not Supported - // public void checkSimpleMerge() not supported + @Test //set static enablePrinting = true for visual checking + public void checkCreateItemsArray() { //used with weighted updates + String item = "10"; + int weight = 108; + String[] itemsArr = KllItemsHelper.createItemsArray(String.class, item, weight); + assertEquals(itemsArr.length, 4); + for (int i = 0; i < itemsArr.length; i++) { itemsArr[i] = item; } + outputItems(itemsArr); + } + + private static void outputItems(String[] itemsArr) { + String[] hdr2 = {"Index", "Value"}; + String hdr2fmt = "%6s %15s\n"; + String d2fmt = "%6d %15s\n"; + println("ItemsArr"); + printf(hdr2fmt, (Object[]) hdr2); + for (int i = 0; i < itemsArr.length; i++) { + printf(d2fmt, i, itemsArr[i]); + } + println(""); + } + + @Test //set static enablePrinting = true for visual checking + public void checkCreateLevelsArray() { //used with weighted updates + int weight = 108; + int[] levelsArr = KllHelper.createLevelsArray(weight); + assertEquals(levelsArr.length, 8); + int[] correct = {0,0,0,1,2,2,3,4}; + for (int i = 0; i < levelsArr.length; i++) { + assertEquals(levelsArr[i], correct[i]); + } + outputLevels(weight, levelsArr); + } + + private static void outputLevels(int weight, int[] levelsArr) { + String[] hdr = {"Lvl", "StartAdr", "BitPattern", "Weight"}; + String hdrfmt = "%3s %9s %10s %s\n"; + String dfmt = "%3d %9d %10d %d\n"; + String dfmt_2 = "%3d %9d %s\n"; + println("Count = " + weight + " => " + (Integer.toBinaryString(weight))); + println("LevelsArr"); + printf(hdrfmt, (Object[]) hdr); + for (int i = 0; i < levelsArr.length; i++) { + if (i == levelsArr.length - 1) { printf(dfmt_2, i, levelsArr[i], "ItemsArr.length"); } + else { + int j = bitAt(weight, i); + printf(dfmt, i, levelsArr[i], j, 1 << (i)); + } + } + println(""); + } @Test public void checkGetSingleItem() { From 44bb7d4563d881fc6303311d388814771ce37005 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Thu, 25 Jan 2024 15:34:54 -0800 Subject: [PATCH 3/3] Changed the weighted update method where the weight parameter is now a long. This change rippled through 3 or 4 classes each for floats, doubles, and items. --- .../org/apache/datasketches/kll/KllDoublesHelper.java | 8 ++++---- .../org/apache/datasketches/kll/KllDoublesSketch.java | 6 +++--- .../org/apache/datasketches/kll/KllFloatsHelper.java | 8 ++++---- .../org/apache/datasketches/kll/KllFloatsSketch.java | 6 +++--- .../apache/datasketches/kll/KllHeapDoublesSketch.java | 2 +- .../apache/datasketches/kll/KllHeapFloatsSketch.java | 2 +- .../apache/datasketches/kll/KllHeapItemsSketch.java | 2 +- .../java/org/apache/datasketches/kll/KllHelper.java | 7 +++++-- .../org/apache/datasketches/kll/KllItemsHelper.java | 8 ++++---- .../org/apache/datasketches/kll/KllItemsSketch.java | 11 ++++++++--- .../org/apache/datasketches/kll/KllMiscItemsTest.java | 4 ++-- 11 files changed, 36 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java index 42c9e38b7..88da49e38 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesHelper.java @@ -47,8 +47,8 @@ final class KllDoublesHelper { * @param weight the given weight * @return the Items Array. */ - static double[] createItemsArray(final double item, final int weight) { - final int itemsArrLen = Integer.bitCount(weight); + static double[] createItemsArray(final double item, final long weight) { + final int itemsArrLen = Long.bitCount(weight); final double[] itemsArr = new double[itemsArrLen]; Arrays.fill(itemsArr, item); return itemsArr; @@ -332,9 +332,9 @@ static void updateDouble(final KllDoublesSketch dblSk, final double item) { } //Called from KllDoublesSketch::update with weight - static void updateDouble(final KllDoublesSketch dblSk, final double item, final int weight) { + static void updateDouble(final KllDoublesSketch dblSk, final double item, final long weight) { if (weight < dblSk.levelsArr[0]) { - for (int i = 0; i < weight; i++) { updateDouble(dblSk, item); } + for (int i = 0; i < (int)weight; i++) { updateDouble(dblSk, item); } } else { dblSk.updateMinMax(item); final KllHeapDoublesSketch tmpSk = new KllHeapDoublesSketch(dblSk.getK(), DEFAULT_M, item, weight); diff --git a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java index caf11bfea..3cef84710 100644 --- a/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllDoublesSketch.java @@ -330,11 +330,11 @@ public void update(final double item) { * @param item the item to be repeated. NaNs are ignored. * @param weight the number of times the update of item is to be repeated. It must be ≥ one. */ - public void update(final double item, final int weight) { + public void update(final double item, final long weight) { if (Double.isNaN(item)) { return; } //ignore if (readOnly) { throw new SketchesArgumentException(TGT_IS_READ_ONLY_MSG); } - if (weight < 1) { throw new SketchesArgumentException("Weight is less than one."); } - if (weight == 1) { KllDoublesHelper.updateDouble(this, item); } + if (weight < 1L) { throw new SketchesArgumentException("Weight is less than one."); } + if (weight == 1L) { KllDoublesHelper.updateDouble(this, item); } else { KllDoublesHelper.updateDouble(this, item, weight); } kllDoublesSV = null; } diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java index ee41f8398..9e2c50e84 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsHelper.java @@ -47,8 +47,8 @@ final class KllFloatsHelper { * @param weight the given weight * @return the Items Array. */ - static float[] createItemsArray(final float item, final int weight) { - final int itemsArrLen = Integer.bitCount(weight); + static float[] createItemsArray(final float item, final long weight) { + final int itemsArrLen = Long.bitCount(weight); final float[] itemsArr = new float[itemsArrLen]; Arrays.fill(itemsArr, item); return itemsArr; @@ -332,9 +332,9 @@ static void updateFloat(final KllFloatsSketch fltSk, final float item) { } //Called from KllFloatsSketch::update with weight - static void updateFloat(final KllFloatsSketch fltSk, final float item, final int weight) { + static void updateFloat(final KllFloatsSketch fltSk, final float item, final long weight) { if (weight < fltSk.levelsArr[0]) { - for (int i = 0; i < weight; i++) { updateFloat(fltSk, item); } + for (int i = 0; i < (int)weight; i++) { updateFloat(fltSk, item); } } else { fltSk.updateMinMax(item); final KllHeapFloatsSketch tmpSk = new KllHeapFloatsSketch(fltSk.getK(), DEFAULT_M, item, weight); diff --git a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java index bde606ea2..a3a16813e 100644 --- a/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllFloatsSketch.java @@ -330,11 +330,11 @@ public void update(final float item) { * @param item the item to be repeated. NaNs are ignored. * @param weight the number of times the update of item is to be repeated. It must be ≥ one. */ - public void update(final float item, final int weight) { + public void update(final float item, final long weight) { if (Float.isNaN(item)) { return; } //ignore if (readOnly) { throw new SketchesArgumentException(TGT_IS_READ_ONLY_MSG); } - if (weight < 1) { throw new SketchesArgumentException("Weight is less than one."); } - if (weight == 1) { KllFloatsHelper.updateFloat(this, item); } + if (weight < 1L) { throw new SketchesArgumentException("Weight is less than one."); } + if (weight == 1L) { KllFloatsHelper.updateFloat(this, item); } else { KllFloatsHelper.updateFloat(this, item, weight); } kllFloatsSV = null; } diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java index c6553d89e..177a2b536 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapDoublesSketch.java @@ -82,7 +82,7 @@ final class KllHeapDoublesSketch extends KllDoublesSketch { /** * Used for creating a temporary sketch for use with weighted updates. */ - KllHeapDoublesSketch(final int k, final int m, final double item, final int weight) { + KllHeapDoublesSketch(final int k, final int m, final double item, final long weight) { super(UPDATABLE); KllHelper.checkM(m); KllHelper.checkK(k, m); diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java index 479a28f2c..afc20a8a8 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapFloatsSketch.java @@ -82,7 +82,7 @@ final class KllHeapFloatsSketch extends KllFloatsSketch { /** * Used for creating a temporary sketch for use with weighted updates. */ - KllHeapFloatsSketch(final int k, final int m, final float item, final int weight) { + KllHeapFloatsSketch(final int k, final int m, final float item, final long weight) { super(UPDATABLE); KllHelper.checkM(m); KllHelper.checkK(k, m); diff --git a/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java index eb7c853b1..83ba77907 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllHeapItemsSketch.java @@ -83,7 +83,7 @@ final class KllHeapItemsSketch extends KllItemsSketch { /** * Used for creating a temporary sketch for use with weighted updates. */ - KllHeapItemsSketch(final int k, final int m, final T item, final int weight, final Comparator comparator, + KllHeapItemsSketch(final int k, final int m, final T item, final long weight, final Comparator comparator, final ArrayOfItemsSerDe serDe) { super(UPDATABLE, comparator, serDe); KllHelper.checkM(m); diff --git a/src/main/java/org/apache/datasketches/kll/KllHelper.java b/src/main/java/org/apache/datasketches/kll/KllHelper.java index c1010cc80..f1d1df9b7 100644 --- a/src/main/java/org/apache/datasketches/kll/KllHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllHelper.java @@ -157,8 +157,11 @@ public static long convertToCumulative(final long[] array) { * @param weight the given weight * @return the Levels Array */ - static int[] createLevelsArray(final int weight) { - final int numLevels = 32 - Integer.numberOfLeadingZeros(weight); + static int[] createLevelsArray(final long weight) { + final int numLevels = 64 - Long.numberOfLeadingZeros(weight); + if (numLevels > 61) { + throw new SketchesArgumentException("The requested weight must not exceed 2^61"); + } final int[] levelsArr = new int[numLevels + 1]; //always one more than numLevels int itemsArrIndex = 0; levelsArr[0] = itemsArrIndex; diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java b/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java index fcac3258e..31c6897d2 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsHelper.java @@ -48,8 +48,8 @@ final class KllItemsHelper { * @param weight the given weight * @return the Items Array. */ - static T[] createItemsArray(final Class clazz, final T item, final int weight) { - final int itemsArrLen = Integer.bitCount(weight); + static T[] createItemsArray(final Class clazz, final T item, final long weight) { + final int itemsArrLen = Long.bitCount(weight); final T[] itemsArr = (T[])newInstance(clazz, itemsArrLen); Arrays.fill(itemsArr, item); return itemsArr; @@ -333,9 +333,9 @@ static void updateItem(final KllItemsSketch itmSk, final Object item) { } //Called from KllItemsSketch::update with weight - static void updateItem(final KllItemsSketch itmSk, final T item, final int weight) { + static void updateItem(final KllItemsSketch itmSk, final T item, final long weight) { if (weight < itmSk.levelsArr[0]) { - for (int i = 0; i < weight; i++) { updateItem(itmSk, item); } + for (int i = 0; i < (int)weight; i++) { updateItem(itmSk, item); } } else { itmSk.updateMinMax(item); final KllHeapItemsSketch tmpSk = diff --git a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java index 9580a8e12..131d03c3b 100644 --- a/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java +++ b/src/main/java/org/apache/datasketches/kll/KllItemsSketch.java @@ -295,11 +295,16 @@ public void update(final T item) { kllItemsSV = null; } - public void update(final T item, final int weight) { + /** + * Weighted update. Updates this sketch with the given item the number of times specified by the given integer weight. + * @param item the item to be repeated. NaNs are ignored. + * @param weight the number of times the update of item is to be repeated. It must be ≥ one. + */ + public void update(final T item, final long weight) { if (item == null) { return; } //ignore if (readOnly) { throw new SketchesArgumentException(TGT_IS_READ_ONLY_MSG); } - if (weight < 1) { throw new SketchesArgumentException("Weight is less than one."); } - if (weight == 1) { KllItemsHelper.updateItem(this, item); } + if (weight < 1L) { throw new SketchesArgumentException("Weight is less than one."); } + if (weight == 1L) { KllItemsHelper.updateItem(this, item); } else { KllItemsHelper.updateItem(this, item, weight); } kllItemsSV = null; } diff --git a/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java b/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java index f37b69506..fd997f974 100644 --- a/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java +++ b/src/test/java/org/apache/datasketches/kll/KllMiscItemsTest.java @@ -236,7 +236,7 @@ public void checkWeightedUpdates1() { int k = 20; int weight = 127; String item = "10"; - KllItemsSketch sk = KllItemsSketch.newHeapInstance(20, Comparator.naturalOrder(), serDe); + KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), serDe); println(sk.toString(true, true)); sk.update(item, weight); println(sk.toString(true, true)); @@ -255,7 +255,7 @@ public void checkWeightedUpdates2() { final int digits = 4; int weight = 127; String item = " 10"; - KllItemsSketch sk = KllItemsSketch.newHeapInstance(20, Comparator.naturalOrder(), serDe); + KllItemsSketch sk = KllItemsSketch.newHeapInstance(k, Comparator.naturalOrder(), serDe); for (int i = 1; i <= initial; i++) { sk.update(Util.longToFixedLengthString(i + 1000, digits)); } println(sk.toString(true, true)); sk.update(item, weight);