Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft: Vector Embeddings #121

Open
wants to merge 20 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
64a7ef2
Vector Embeddings - Initial Commit
felipeelia Jan 16, 2025
265b928
Custom table to store vectors
felipeelia Jan 20, 2025
689fedb
Fix tablename
felipeelia Jan 20, 2025
01ccdd0
Different object representation
felipeelia Jan 22, 2025
ad89f61
Simplify things a bit
felipeelia Jan 24, 2025
9eb7413
Unify dimensions calculation + meta and terms data
felipeelia Jan 24, 2025
6b7f17c
Use the weighting engine to choose which terms and meta to vectorize
felipeelia Jan 24, 2025
69049ae
Remove initial support for WP_Term_Query
felipeelia Jan 27, 2025
cc83899
Rearrange a few things and add new filters
felipeelia Jan 27, 2025
ab49737
Add new ep_openai_embeddings_should_add_vector_field_to_post filter
felipeelia Jan 28, 2025
7525dba
use the ep_embeddings_ prefix consistently
felipeelia Jan 28, 2025
ca50fe4
Merge branch 'develop' into feature/vector-embeddings
felipeelia Jan 28, 2025
58c9c5c
Merge branch 'develop' into feature/vector-embeddings
felipeelia Jan 30, 2025
caff18f
Indexables array + fix content generation
felipeelia Jan 30, 2025
b18c898
Expose dimensions as a field
felipeelia Feb 3, 2025
457b585
Merge branch 'develop' into feature/vector-embeddings
felipeelia Feb 4, 2025
33b0ea2
Draft sending chunks to EP.io so it is enqueued for processing
felipeelia Feb 17, 2025
869d1f5
Conditionally use a text_chunks field to store the raw text chunks
felipeelia Feb 24, 2025
2d56350
Make sending method filterable
felipeelia Feb 24, 2025
c7bbe71
New ep_embeddings_control field
felipeelia Mar 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 176 additions & 0 deletions includes/classes/Feature/VectorEmbeddings/Indexable.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
<?php
/**
* Vector Embeddings - Indexable
*
* As each indexable type (posts, terms, comments, users) uses different hooks, this abstract class is used to
* keep implementations independent.
*
* @since 2.4.0
* @package ElasticPressLabs
*/

namespace ElasticPressLabs\Feature\VectorEmbeddings;

use ElasticPress\Elasticsearch;

/**
* Vector Embeddings Indexable abstract class
*/
abstract class Indexable {
	/**
	 * VectorEmbeddings instance
	 *
	 * @var VectorEmbeddings
	 */
	protected $feature;

	/**
	 * Class constructor
	 *
	 * @param VectorEmbeddings $feature The VectorEmbeddings feature instance
	 */
	public function __construct( VectorEmbeddings $feature ) {
		$this->feature = $feature;
	}

	/**
	 * Add the vector-related fields to the Elasticsearch mapping.
	 *
	 * Two fields are added under `mappings.properties`:
	 * - `ep_embeddings_control`: an object with `is_processing`, `errors` and `text_chunks` sub-fields.
	 * - `chunks`: a nested field holding one `dense_vector` (`vector`) per chunk.
	 *
	 * If both fields are already present the mapping is returned untouched, without
	 * querying the Elasticsearch version.
	 *
	 * @param array $mapping      Current mapping.
	 * @param bool  $quantization Whether to use quantization for the vector field. Default true.
	 * @return array
	 */
	public function add_vector_mapping_field( array $mapping, bool $quantization = true ): array {
		// Tolerate a mapping that does not have a properties list yet.
		$properties = $mapping['mappings']['properties'] ?? [];

		// Don't add the fields if both of them already exist.
		if ( isset( $properties['chunks'], $properties['ep_embeddings_control'] ) ) {
			return $mapping;
		}

		// Base definition, valid for every Elasticsearch version supporting dense_vector.
		$vector_field = [
			'type' => 'dense_vector',
			'dims' => $this->feature->get_dimensions(),
		];

		// Only needed from here on, so it is looked up after the early return above.
		// Cast defensively in case no version string is available.
		$es_version = (string) Elasticsearch::factory()->get_elasticsearch_version();

		// Add extra vector settings for newer versions of Elasticsearch.
		if ( version_compare( $es_version, '8.0', '>=' ) ) {
			// The index and similarity (l2_norm, dot_product or cosine) settings were added in 8.0.
			// The similarity setting must be set if index is true, so both are set explicitly.
			$vector_field['index']      = true;
			$vector_field['similarity'] = 'cosine';

			// The element_type setting was added in 8.6. This can be either float (default) or byte.
			if ( version_compare( $es_version, '8.6', '>=' ) ) {
				$vector_field['element_type'] = 'float';
			}

			// The int8_hnsw type was added in 8.12.
			if ( $quantization && version_compare( $es_version, '8.12', '>=' ) ) {
				// This is supposed to result in better performance but slightly less accurate results.
				// See https://www.elastic.co/guide/en/elasticsearch/reference/8.13/knn-search.html#knn-search-quantized-example.
				// Can test with this on and off and compare results to see what works best.
				$vector_field['index_options'] = [
					'type' => 'int8_hnsw',
				];
			}
		}

		$mapping['mappings']['properties'] = array_merge(
			$properties,
			[
				'ep_embeddings_control' => [
					'properties' => [
						'is_processing' => [
							'type' => 'boolean',
						],
						'errors'        => [
							'type' => 'text',
						],
						'text_chunks'   => [
							'type' => 'text',
						],
					],
				],
				'chunks'                => [
					'type'       => 'nested',
					'properties' => [
						'vector' => $vector_field,
					],
				],
			]
		);

		return $mapping;
	}

	/**
	 * Add the embedding data to the sync args.
	 *
	 * Each embedding becomes one entry of the `chunks` field, in the form `[ 'vector' => float[] ]`.
	 * When no embeddings are given, the sync args are returned unchanged.
	 *
	 * @param array $args       The current sync args (an Elasticsearch document)
	 * @param array $embeddings The embeddings to add to the sync args (a list of arrays of numbers)
	 * @return array
	 */
	public function add_chunks_field_value( array $args, array $embeddings ): array {
		// If we don't have embeddings, return early.
		if ( empty( $embeddings ) ) {
			return $args;
		}

		// Add the embeddings data to the sync args.
		$args['chunks'] = [];

		foreach ( $embeddings as $embedding ) {
			$args['chunks'][] = [
				// Reindex so the vector is always encoded as a JSON array, even if the
				// embedding came with sparse or string keys.
				'vector' => array_values( array_map( 'floatval', $embedding ) ),
			];
		}

		return $args;
	}

	/**
	 * Determine the way to send the text chunks to the Elasticsearch server.
	 *
	 * Depending on the size of the text chunks, it can either be sent using the index action args
	 * (the JSON object where we determine the index to be used, etc.) or as a regular field in the
	 * Elasticsearch document. To avoid overhead, we prefer to send it as an index action arg, but
	 * sometimes it is just too big for it.
	 *
	 * @param array $text_chunks The text chunks
	 * @return string The method. Can be 'index_action_args' or 'es_doc_field'.
	 */
	protected function get_text_chunks_sending_method( $text_chunks ) {
		// wp_json_encode() returns false on failure, hence the cast. The '8bit' encoding
		// makes mb_strlen() count bytes rather than characters.
		$post_chunks_size = mb_strlen( (string) wp_json_encode( $text_chunks ), '8bit' );

		/**
		 * Filter to determine the threshold size for the text chunks to be sent as an index action arg.
		 *
		 * @hook ep_embeddings_sending_method_limit
		 * @since 2.4.0
		 *
		 * @param {int} $size The size limit in bytes. Defaults to 200kb.
		 * @return {int} The new $size value.
		 */
		$size_limit = apply_filters( 'ep_embeddings_sending_method_limit', 200 * KB_IN_BYTES );

		// Payloads strictly below the limit go in the index action args; anything else becomes a document field.
		$method = $post_chunks_size < $size_limit ? 'index_action_args' : 'es_doc_field';

		/**
		 * Filter to determine the method to be used to send the text chunks.
		 *
		 * Unless you are implementing a custom solution, return should be either 'index_action_args' or 'es_doc_field'.
		 *
		 * @hook ep_embeddings_sending_method
		 * @since 2.4.0
		 *
		 * @param {string} $method           The method to be used.
		 * @param {array}  $text_chunks      The text chunks being analyzed.
		 * @param {int}    $post_chunks_size The determined size of the text chunks, in bytes.
		 * @param {int}    $size_limit       The size limit in bytes. Defaults to 200kb.
		 * @return {string} The new $method value.
		 */
		return apply_filters( 'ep_embeddings_sending_method', $method, $text_chunks, $post_chunks_size, $size_limit );
	}
}
Loading
Loading