Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Filter Criteria for Late Filtering #123

Merged
merged 37 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
5ef6651
adds simple filter operators
net-cscience-raphael Nov 22, 2024
01552f5
adds null safety for values
net-cscience-raphael Nov 22, 2024
33e3db5
formats
net-cscience-raphael Nov 22, 2024
e04f14e
adds append to late filter
net-cscience-raphael Nov 25, 2024
6713af4
Debugs like and adds Assertions.assertTrue(result.isNotEmpty()) to test
net-cscience-raphael Nov 25, 2024
3b5ab2f
adds like for query LateFilter
net-cscience-raphael Nov 25, 2024
4b8292d
adds late limit
net-cscience-raphael Nov 25, 2024
a75268d
disables again assertion for 0 results
net-cscience-raphael Nov 25, 2024
1ee7ec2
changes name for filter transformer
net-cscience-raphael Dec 2, 2024
0dfa5f0
Adds Late filter without lookup
net-cscience-raphael Dec 3, 2024
77f2ed9
minor adjustments
net-cscience-raphael Dec 3, 2024
f803265
debugs skip strategy
net-cscience-raphael Dec 3, 2024
572dc21
changes default
net-cscience-raphael Dec 3, 2024
753014b
adjusts comments
net-cscience-raphael Dec 3, 2024
ba9a936
adds transformer for benchmark
net-cscience-raphael Dec 4, 2024
98af3e2
draft logger
net-cscience-raphael Dec 5, 2024
e66b12e
adds simple filter operators
net-cscience-raphael Nov 22, 2024
2ea8948
adds null safety for values
net-cscience-raphael Nov 22, 2024
4721ed6
formats
net-cscience-raphael Nov 22, 2024
e46ef8d
adds append to late filter
net-cscience-raphael Nov 25, 2024
cf977ba
Debugs like and adds Assertions.assertTrue(result.isNotEmpty()) to test
net-cscience-raphael Nov 25, 2024
85a8872
adds like for query LateFilter
net-cscience-raphael Nov 25, 2024
ee3cbe9
adds late limit
net-cscience-raphael Nov 25, 2024
641e5df
disables again assertion for 0 results
net-cscience-raphael Nov 25, 2024
9f39cba
changes name for filter transformer
net-cscience-raphael Dec 2, 2024
6384b43
Adds Late filter without lookup
net-cscience-raphael Dec 3, 2024
c0cbd08
minor adjustments
net-cscience-raphael Dec 3, 2024
1242c9d
debugs skip strategy
net-cscience-raphael Dec 3, 2024
2b1fad0
changes default
net-cscience-raphael Dec 3, 2024
25bd826
adjusts comments
net-cscience-raphael Dec 3, 2024
f67a1e5
adds transformer for benchmark
net-cscience-raphael Dec 4, 2024
cde944a
draft logger
net-cscience-raphael Dec 5, 2024
0fb0adc
Merge remote-tracking branch 'origin/feature/filtercriteria' into fea…
net-cscience-raphael Dec 5, 2024
ade752c
Auto stash before rebase of "feature/filtercriteria" onto "origin/dev"
net-cscience-raphael Dec 12, 2024
72f01a9
Merge branch 'feature/filtercriteria' into dev
net-cscience-raphael Dec 12, 2024
b3bfafb
Merge branch 'dev' into feature/filtercriteria
net-cscience-raphael Dec 12, 2024
15bd7f6
adds Text type
net-cscience-raphael Dec 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion example-configs/schema/dense.json
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
"clip": {
"factory": "DenseEmbedding",
"parameters": {
"host": "http://10.34.64.84:8888/",
"host": "http://10.34.64.83:8888/",
"model": "open-clip-vit-b32",
"length": "512",
"timeoutSeconds": "100",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,10 +112,9 @@ enum class ComparisonOperator(val value: String) {
when (v1) {
is Value.String,
is Value.Text -> {
(v1.value as String).replace("\\", "\\\\").replace("[", "\\[").replace("]", "\\]")
.replace("*", "\\*").replace("%", "*").toRegex().matches(v2.value as String)
(v2.value as String).replace("\\", "\\\\").replace("[", "\\[").replace("]", "\\]")
.replace("*", "\\*").replace("%", ".*").replace("_", ".?").toRegex().matches(v1.value as String)
}

else -> false
}

Expand All @@ -128,7 +127,7 @@ enum class ComparisonOperator(val value: String) {
* @param str The [String] which should be one of the [ComparisonOperator]
* @throws IllegalArgumentException In case the given string is not one of the defined ones.
*/
fun fromString(str: String): ComparisonOperator {
infix fun fromString(str: String): ComparisonOperator {
return when (str.trim()) {
EQ.value -> EQ
NEQ.value -> NEQ
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,19 @@ enum class Distance {
override fun invoke(v1: Value.DoubleVector, v2: Value.DoubleVector): Double = throw UnsupportedOperationException("Jaccard distance is not supported for float vectors.")
};

companion object {
infix fun fromString(value: String): Distance {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is wrong with Distance.fromString(value.toUpper()) ?

return when (value) {
"manhattan" -> MANHATTAN
"euclidean" -> EUCLIDEAN
"cosine" -> COSINE
"hamming" -> HAMMING
"jaccard" -> JACCARD
else -> throw IllegalArgumentException("Distance function $value is not supported.")
}
}
}

/**
* Calculates this [Distance] between two [Value.FloatVector].
*
Expand All @@ -115,4 +128,5 @@ enum class Distance {
* @return [Double]
*/
abstract operator fun invoke(v1: Value.DoubleVector, v2: Value.DoubleVector): Double
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ abstract class AbstractFileMetadataDescriptorReaderTest(schemaPath: String) : Ab

/* Check results. */
val result = reader.query(query).toList()
Assertions.assertTrue(result.isNotEmpty())
for (r in result) {
Assertions.assertTrue(r.path.value.endsWith(".jpg"))
}
Expand Down Expand Up @@ -139,6 +140,7 @@ abstract class AbstractFileMetadataDescriptorReaderTest(schemaPath: String) : Ab

/* Check results. */
val result = reader.query(query).toList()
Assertions.assertTrue(result.isNotEmpty())
for (r in result) {
Assertions.assertTrue(r.size.value > size.value)
}
Expand Down Expand Up @@ -166,6 +168,7 @@ abstract class AbstractFileMetadataDescriptorReaderTest(schemaPath: String) : Ab

/* Check results. */
val result = reader.query(query).toList()
Assertions.assertTrue(result.isNotEmpty())
for (r in result) {
Assertions.assertTrue(r.size.value < size.value)
}
Expand All @@ -191,6 +194,7 @@ abstract class AbstractFileMetadataDescriptorReaderTest(schemaPath: String) : Ab

/* Check results. */
val result = reader.query(query).toList()
// TODO enable Assertions.assertTrue(result.isNotEmpty())
for (r in result) {
Assertions.assertTrue(r.path.value.contains("var"))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ import org.vitrivr.engine.core.model.descriptor.vector.FloatVectorDescriptor
import org.vitrivr.engine.core.model.metamodel.Analyser.Companion.merge
import org.vitrivr.engine.core.model.metamodel.Schema
import org.vitrivr.engine.core.model.query.Query
import org.vitrivr.engine.core.model.query.basics.Distance
import org.vitrivr.engine.core.model.query.proximity.ProximityQuery
import org.vitrivr.engine.core.model.retrievable.Retrievable
import org.vitrivr.engine.core.model.types.Value
Expand All @@ -36,6 +37,8 @@ class DenseEmbedding : ExternalFesAnalyser<ContentElement<*>, FloatVectorDescrip
companion object {
const val LENGTH_PARAMETER_DEFAULT = 512
const val LENGTH_PARAMETER_NAME = "length"
const val DISTANCE_PARAMETER_DEFAULT = "euclidean"
const val DISTANCE_PARAMETER_NAME = "distance"
}
override val contentClasses = setOf(ImageContent::class, TextContent::class)
override val descriptorClass = FloatVectorDescriptor::class
Expand Down Expand Up @@ -103,6 +106,7 @@ class DenseEmbedding : ExternalFesAnalyser<ContentElement<*>, FloatVectorDescrip
val retries = field.parameters[RETRIES_PARAMETER_NAME]?.toIntOrNull() ?: RETRIES_PARAMETER_DEFAULT
val model = field.parameters[MODEL_PARAMETER_NAME] ?: throw IllegalStateException("Model parameter not set.")
val k = context.getProperty(field.fieldName, "limit")?.toLongOrNull() ?: 1000L
val distance = Distance fromString (field.parameters[DISTANCE_PARAMETER_NAME] ?: DISTANCE_PARAMETER_DEFAULT)
val fetchVector = context.getProperty(field.fieldName, "returnDescriptor")?.toBooleanStrictOrNull() ?: false

/* Generate vector for content element. */
Expand All @@ -116,6 +120,6 @@ class DenseEmbedding : ExternalFesAnalyser<ContentElement<*>, FloatVectorDescrip
}

/* Return retriever. */
return this.newRetrieverForQuery(field, ProximityQuery(value = vector, k = k, fetchVector = fetchVector), context)
return this.newRetrieverForQuery(field, ProximityQuery(value = vector, distance = distance, k = k, fetchVector = fetchVector), context)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ class ScalarDescriptorReader(field: Schema.Field<*, ScalarDescriptor<*, *>>, con
val descriptorId = result.getObject(DESCRIPTOR_ID_COLUMN_NAME, UUID::class.java)
val retrievableId = result.getObject(RETRIEVABLE_ID_COLUMN_NAME, UUID::class.java)
return when (this.prototype) {
is BooleanDescriptor -> BooleanDescriptor(descriptorId, retrievableId, Value.Boolean(result.getBoolean(VALUE_ATTRIBUTE_NAME)))
is ByteDescriptor -> ByteDescriptor(descriptorId, retrievableId, Value.Byte(result.getByte(VALUE_ATTRIBUTE_NAME)))
is ShortDescriptor -> ShortDescriptor(descriptorId, retrievableId, Value.Short(result.getShort(VALUE_ATTRIBUTE_NAME)))
is IntDescriptor -> IntDescriptor(descriptorId, retrievableId, Value.Int(result.getInt(VALUE_ATTRIBUTE_NAME)))
is LongDescriptor -> LongDescriptor(descriptorId, retrievableId, Value.Long(result.getLong(VALUE_ATTRIBUTE_NAME)))
is FloatDescriptor -> FloatDescriptor(descriptorId, retrievableId, Value.Float(result.getFloat(VALUE_ATTRIBUTE_NAME)))
is DoubleDescriptor -> DoubleDescriptor(descriptorId, retrievableId, Value.Double(result.getDouble(VALUE_ATTRIBUTE_NAME)))
is StringDescriptor -> StringDescriptor(descriptorId, retrievableId, Value.String(result.getString(VALUE_ATTRIBUTE_NAME)))
is TextDescriptor -> TextDescriptor(descriptorId, retrievableId, Value.Text(result.getString(VALUE_ATTRIBUTE_NAME)))
is BooleanDescriptor -> BooleanDescriptor(descriptorId, retrievableId, Value.Boolean(result.getBoolean(VALUE_ATTRIBUTE_NAME)), this.field as Schema.Field<*, BooleanDescriptor>)
is ByteDescriptor -> ByteDescriptor(descriptorId, retrievableId, Value.Byte(result.getByte(VALUE_ATTRIBUTE_NAME)), this.field as Schema.Field<*, ByteDescriptor>)
is ShortDescriptor -> ShortDescriptor(descriptorId, retrievableId, Value.Short(result.getShort(VALUE_ATTRIBUTE_NAME)), this.field as Schema.Field<*, ShortDescriptor>)
is IntDescriptor -> IntDescriptor(descriptorId, retrievableId, Value.Int(result.getInt(VALUE_ATTRIBUTE_NAME)), this.field as Schema.Field<*, IntDescriptor>)
is LongDescriptor -> LongDescriptor(descriptorId, retrievableId, Value.Long(result.getLong(VALUE_ATTRIBUTE_NAME)), this.field as Schema.Field<*, LongDescriptor>)
is FloatDescriptor -> FloatDescriptor(descriptorId, retrievableId, Value.Float(result.getFloat(VALUE_ATTRIBUTE_NAME)), this.field as Schema.Field<*, FloatDescriptor>)
is DoubleDescriptor -> DoubleDescriptor(descriptorId, retrievableId, Value.Double(result.getDouble(VALUE_ATTRIBUTE_NAME)), this.field as Schema.Field<*, DoubleDescriptor>)
is StringDescriptor -> StringDescriptor(descriptorId, retrievableId, Value.String(result.getString(VALUE_ATTRIBUTE_NAME)), this.field as Schema.Field<*, StringDescriptor>)
is TextDescriptor -> TextDescriptor(descriptorId, retrievableId, Value.Text(result.getString(VALUE_ATTRIBUTE_NAME)),this.field as Schema.Field<*, TextDescriptor>)
}
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package org.vitrivr.engine.query.operators.transform.benchmark

import io.github.oshai.kotlinlogging.KLogger
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.serialization.json.Json
import kotlinx.serialization.json.encodeToJsonElement
import java.io.*
import java.nio.file.Path
import java.util.concurrent.BlockingQueue
import java.util.concurrent.LinkedBlockingQueue


/**
 * Background worker that writes [BenchmarkMessage]s to [logfile].
 *
 * Messages are handed over through [log], buffered in an unbounded in-memory queue and consumed by [run],
 * which is meant to be executed on its own thread. Every message is logged at info level and appended to
 * [logfile] as one JSON element followed by a comma, one message per line.
 *
 * @property logfile Path of the file the messages are appended to.
 */
class BenchmarkLogger(val logfile: Path) : Runnable {
    private val logger: KLogger = KotlinLogging.logger {}

    /** Messages waiting to be written by [run]. */
    private val queue: BlockingQueue<BenchmarkMessage> = LinkedBlockingQueue()

    /**
     * Enqueues a [BenchmarkMessage] for writing. Does not block and does not perform any I/O itself.
     *
     * @param message The [BenchmarkMessage] to write.
     */
    infix fun log(message: BenchmarkMessage) {
        queue.add(message)
    }

    /**
     * Takes messages off the queue and appends them to [logfile] until the executing thread is interrupted.
     *
     * A failed write is reported through the logger and does not stop the loop, so that a transient I/O
     * problem cannot silently terminate the worker while producers keep filling the queue.
     */
    override fun run() {
        try {
            while (!Thread.currentThread().isInterrupted) {
                val message = queue.take()
                logger.info { message }
                try {
                    /* The file is opened in append mode; use {} flushes and closes the writer. */
                    FileOutputStream(File(logfile.toString()), true).bufferedWriter().use { writer ->
                        writer.appendLine("${Json.encodeToJsonElement(message)},")
                    }
                } catch (e: IOException) {
                    logger.error(e) { "Failed to write benchmark message to $logfile." }
                }
            }
        } catch (e: InterruptedException) {
            /* Interruption is the signal to stop; restore the flag for whoever owns the thread. */
            Thread.currentThread().interrupt()
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
package org.vitrivr.engine.query.operators.transform.benchmark

import kotlinx.serialization.Serializable

/**
 * A single benchmark log entry. Annotated with [Serializable] so that it can be encoded by kotlinx.serialization.
 *
 * The field descriptions below reflect how TimeBenchmark populates a message.
 *
 * @property name Name of the operator that produced the entry.
 * @property source Human-readable ("pretty") label of the operator.
 * @property timestamp Time at which the entry was created, as a string (TimeBenchmark uses `LocalDateTime.now().toString()`).
 * @property inputSize Number of items the operator received from its input.
 */
@Serializable
data class BenchmarkMessage (
val name: String,
val source: String,
val timestamp: String,
val inputSize: Int,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
package org.vitrivr.engine.query.operators.transform.benchmark

import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.emitAll
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.toList
import org.vitrivr.engine.core.database.descriptor.DescriptorReader
import org.vitrivr.engine.core.model.metamodel.Schema
import org.vitrivr.engine.core.model.retrievable.Retrievable
import org.vitrivr.engine.core.model.retrievable.Retrieved
import org.vitrivr.engine.core.model.retrievable.attributes.PropertyAttribute
import org.vitrivr.engine.core.model.types.Value
import org.vitrivr.engine.core.operators.Operator
import org.vitrivr.engine.core.operators.general.Transformer
import java.nio.file.Path
import java.time.LocalDateTime
import java.util.Timer
import javax.management.Descriptor

/**
 * A pass-through [Transformer] used for benchmarking a query pipeline.
 *
 * It collects the complete output of [input], hands a [BenchmarkMessage] containing the operator [name],
 * the [pretty] label, the current local time and the number of collected [Retrievable]s to a shared
 * [BenchmarkLogger], and then re-emits the collected [Retrievable]s unchanged and in the same order.
 *
 * The [BenchmarkLogger] is shared by all instances of this class: it is created, and its worker thread
 * started, by the first instance. The [path] of later instances is therefore not used for logging.
 *
 * @property input The [Operator] whose output is measured.
 * @property path Log file path handed to the shared [BenchmarkLogger] if this instance creates it.
 * @property pretty Human-readable label written to the log.
 * @property name Name of this operator.
 */
class TimeBenchmark(
    override val input: Operator<out Retrievable>,
    val path: Path,
    val pretty: String,
    override val name: String
) : Transformer {

    companion object {
        /** The shared [BenchmarkLogger]; null until the first [TimeBenchmark] has been created. */
        @Volatile
        private var bl: BenchmarkLogger? = null

        /**
         * Returns the shared [BenchmarkLogger], creating it and starting its worker thread on first use.
         *
         * Creation is guarded by a lock so that concurrently constructed instances cannot start more
         * than one worker thread.
         */
        private fun sharedLogger(path: Path): BenchmarkLogger = bl ?: synchronized(this) {
            bl ?: BenchmarkLogger(path).also { created ->
                bl = created
                Thread(created).start()
            }
        }
    }

    /** The logger this instance reports to. */
    private val benchmarkLogger: BenchmarkLogger = sharedLogger(path)

    /**
     * Collects the whole input, logs its size together with a timestamp, then emits every collected item.
     *
     * @param scope The [CoroutineScope] passed on to [input].
     * @return [Flow] of the unchanged input [Retrievable]s.
     */
    override fun toFlow(scope: CoroutineScope): Flow<Retrievable> = flow {
        val inputRetrieved = input.toFlow(scope).toList()
        benchmarkLogger log BenchmarkMessage(name, pretty, LocalDateTime.now().toString(), inputRetrieved.size)
        inputRetrieved.forEach { emit(it) }
    }
}


Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package org.vitrivr.engine.query.operators.transform.benchmark

import org.vitrivr.engine.core.context.Context
import org.vitrivr.engine.core.context.QueryContext
import org.vitrivr.engine.core.model.retrievable.Retrievable
import org.vitrivr.engine.core.operators.Operator
import org.vitrivr.engine.core.operators.general.TransformerFactory
import kotlin.io.path.Path

/**
 * A [TransformerFactory] that creates [TimeBenchmark] operators.
 */
class TimeBenchmarkFactory : TransformerFactory {
    /**
     * Creates a new [TimeBenchmark] for the given [input].
     *
     * Reads two optional properties for [name] from the [context]: "logfile" (falls back to "benchmark.log")
     * and "pretty" (falls back to [name]).
     *
     * @param name Name of the operator; also used to look up its properties.
     * @param input The [Operator] the new [TimeBenchmark] wraps.
     * @param context The [Context]; must be a [QueryContext].
     * @return The new [TimeBenchmark].
     * @throws IllegalArgumentException If [context] is not a [QueryContext].
     */
    override fun newTransformer(name: String, input: Operator<out Retrievable>, context: Context): TimeBenchmark {
        require(context is QueryContext)

        val logfileName = context[name, "logfile"]?.toString() ?: "benchmark.log"
        val target = Path(logfileName)
        val label = context[name, "pretty"]?.toString() ?: name

        return TimeBenchmark(input = input, path = target, pretty = label, name = name)
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
package org.vitrivr.engine.query.operators.transform.filter

import io.github.oshai.kotlinlogging.KLogger
import io.github.oshai.kotlinlogging.KotlinLogging
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.flow.Flow
import kotlinx.coroutines.flow.flow
import kotlinx.coroutines.flow.toList
import org.vitrivr.engine.core.database.descriptor.DescriptorReader
import org.vitrivr.engine.core.model.metamodel.Schema
import org.vitrivr.engine.core.model.query.basics.ComparisonOperator
import org.vitrivr.engine.core.model.retrievable.Retrievable
import org.vitrivr.engine.core.model.retrievable.Retrieved
import org.vitrivr.engine.core.model.retrievable.attributes.PropertyAttribute
import org.vitrivr.engine.core.model.types.Value
import org.vitrivr.engine.core.operators.Operator
import org.vitrivr.engine.core.operators.general.Transformer
import java.sql.Date
import javax.management.Descriptor

/**
 * A [Transformer] that filters incoming [Retrievable]s by comparing a looked-up descriptor value with a
 * fixed filter value.
 *
 * For every incoming [Retrievable] the descriptor is fetched through the [reader]. The value stored under
 * the (single) key in [keys] is compared with [value] using [comparison]; the [Retrievable] is emitted only
 * if the comparison holds. [Retrievable]s without a descriptor, without a value for the key, or with a value
 * of an unsupported type are dropped.
 *
 * @version 1.1.2
 * @author Luca Rossetto
 * @author Ralph Gasser
 */
class FieldLookupLateFilter(
    override val input: Operator<out Retrievable>,
    /* The reader for a given field. */
    private val reader: DescriptorReader<*>,
    /* Keys to filter on; exactly one key is supported at the moment. */
    val keys: List<String>,
    /* Comparison operator applied as compare(stored value, filter value). */
    val comparison: ComparisonOperator = ComparisonOperator.EQ,
    /* Filter value to compare to; parsed according to the type of the stored value. */
    val value: String,
    /* Whether the descriptor and the looked-up value are appended to the retrievable. */
    val append: Boolean,
    /* Maximum number of retrievables that are emitted. */
    val limit: Int = Int.MAX_VALUE,
    override val name: String
) : Transformer {
    private val logger: KLogger = KotlinLogging.logger {}

    /**
     * Collects the input, looks up the descriptors and emits the [Retrievable]s that pass the filter.
     *
     * @param scope The [CoroutineScope] passed on to [input].
     * @return [Flow] of at most [limit] matching [Retrievable]s, in input order.
     * @throws IllegalArgumentException If [keys] does not contain exactly one key.
     * @throws NumberFormatException If [value] cannot be parsed as the numeric type of the stored value.
     */
    override fun toFlow(scope: CoroutineScope): Flow<Retrievable> = flow {
        /* Validate the configuration before any input is consumed or any lookup is performed. */
        if (keys.size > 1)
            throw IllegalArgumentException("only one key is supported yet")
        val key = keys.firstOrNull() ?: throw IllegalArgumentException("no key to filter on was provided")

        /* Parse input IDs.*/
        val inputRetrieved = input.toFlow(scope).toList()

        /* Fetch entries for the provided IDs. */
        val ids = inputRetrieved.map { it.id }.toSet()
        val descriptors = if (ids.isEmpty()) {
            emptyMap()
        } else {
            this@FieldLookupLateFilter.reader.getAllForRetrievable(ids).associateBy { it.retrievableId!! }
        }

        var emitted = 0
        for (retrieved in inputRetrieved) {
            /* Nothing more can be emitted once the limit has been reached. */
            if (emitted >= limit) break

            val descriptor = descriptors[retrieved.id] ?: continue
            val stored = descriptor.values().toMap()[key]

            /* Pair the stored value with the filter value parsed into the same type; (null, null) if unsupported. */
            val (fieldValue, filterValue) = when (stored) {
                is Value.String -> stored to Value.of(value)
                is Value.Text -> stored to Value.of(value)
                is Value.Boolean -> stored to Value.of(value.toBoolean())
                is Value.Int -> stored to Value.of(value.toInt())
                is Value.Long -> stored to Value.of(value.toLong())
                is Value.Float -> stored to Value.of(value.toFloat())
                is Value.Double -> stored to Value.of(value.toDouble())
                is Value.Byte -> stored to Value.of(value.toByte())
                is Value.Short -> stored to Value.of(value.toShort())
                is Value.DateTime -> stored to Value.of(Date.valueOf(value))
                else -> null to null
            }

            if (append) {
                retrieved.addDescriptor(descriptor)
                /* Somewhat experimental. Goal: Attach information in a meaningful manner, such that it can be serialised */
                if (fieldValue != null) {
                    retrieved.addAttribute(PropertyAttribute(mapOf(key to fieldValue.value.toString())))
                }
            }

            /* Only items that are actually emitted count towards the limit. */
            if (fieldValue != null && filterValue != null && comparison.compare(fieldValue, filterValue)) {
                emitted++
                emit(retrieved)
            }
        }
    }
}
Loading
Loading