From 64a7ef2be77a243ac2e2a95fb8980c1233c6031e Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Thu, 16 Jan 2025 11:35:42 -0300 Subject: [PATCH 01/17] Vector Embeddings - Initial Commit --- .../Feature/VectorEmbeddings/Indexable.php | 33 ++ .../VectorEmbeddings/Indexables/Post.php | 123 +++++ .../VectorEmbeddings/Indexables/Term.php | 109 ++++ .../VectorEmbeddings/VectorEmbeddings.php | 493 ++++++++++++++++++ includes/functions/core.php | 3 + 5 files changed, 761 insertions(+) create mode 100644 includes/classes/Feature/VectorEmbeddings/Indexable.php create mode 100644 includes/classes/Feature/VectorEmbeddings/Indexables/Post.php create mode 100644 includes/classes/Feature/VectorEmbeddings/Indexables/Term.php create mode 100644 includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php new file mode 100644 index 0000000..e129d62 --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -0,0 +1,33 @@ +feature = $feature; + } +} diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php new file mode 100644 index 0000000..01fb95c --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -0,0 +1,123 @@ +feature->get_setting( 'ep_external_embedding' ) ) { + add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'add_vector_field_to_post_sync' ], 10, 2 ); + } + } + + /** + * Add our vector field mapping to the Elasticsearch post index. + * + * @param array $mapping Current mapping. + * @return array + */ + public function add_post_vector_field_mapping( array $mapping ): array { + return $this->feature->add_vector_mapping_field( $mapping ); + } + + /** + * Exclude our vector meta from being synced. + * + * @param array $excluded_keys Current excluded keys. 
+ * @return array + */ + public function exclude_vector_meta( array $excluded_keys ): array { + $excluded_keys[] = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); + return $excluded_keys; + } + + /** + * Add the embedding data to the post vector sync args. + * + * @param array $args Current sync args. + * @param int $post_id Post ID being synced. + * @return array + */ + public function add_vector_field_to_post_sync( array $args, int $post_id ): array { + // No need to add vector data if no content exists. + $post = get_post( $post_id ); + if ( empty( $post->post_content ) ) { + return $args; + } + $meta_field = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); + + // Try to use the stored embeddings first. + $embeddings = get_post_meta( $post_id, $meta_field, true ); + + // If they don't exist, make API requests to generate them. + if ( ! $embeddings ) { + $embeddings = []; + + $content_chunks = $this->feature->chunk_content( $post->post_content ); + + // Get the embeddings for each chunk. + if ( ! empty( $content_chunks ) ) { + foreach ( $content_chunks as $chunk ) { + $embedding = $this->feature->get_embedding( $chunk ); + + if ( $embedding && ! is_wp_error( $embedding ) ) { + $embeddings[] = array_map( 'floatval', $embedding ); + } + } + } + + // Add embeddings for title. + $title_embedding = $this->feature->get_embedding( $this->feature->normalize_content( $post->post_title ) ); + if ( $title_embedding && ! is_wp_error( $title_embedding ) ) { + $embeddings[] = array_map( 'floatval', $title_embedding ); + } + + // Add embeddings for slug. + $slug_embedding = $this->feature->get_embedding( $post->post_name ); + if ( $slug_embedding && ! is_wp_error( $slug_embedding ) ) { + $embeddings[] = array_map( 'floatval', $slug_embedding ); + } + + // Store the embeddings for future use. + if ( ! 
empty( $embeddings ) ) { + update_post_meta( $post_id, $meta_field, $embeddings ); + } + } + + // If we still don't have embeddings, return early. + if ( ! $embeddings || empty( $embeddings ) ) { + return $args; + } + + // Add the embeddings data to the sync args. + $args['chunks'] = []; + + foreach ( $embeddings as $embedding ) { + $args['chunks'][] = [ + 'vector' => array_map( 'floatval', $embedding ), + ]; + } + + return $args; + } +} diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Term.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Term.php new file mode 100644 index 0000000..766b1f2 --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Term.php @@ -0,0 +1,109 @@ +feature->get_setting( 'ep_external_embedding' ) ) { + add_filter( 'ep_term_sync_args', [ $this, 'add_vector_field_to_term_sync' ], 10, 2 ); + } + } + + /** + * Add our vector field mapping to the Elasticsearch term index. + * + * @param array $mapping Current mapping. + * @return array + */ + public function add_term_vector_field_mapping( array $mapping ): array { + return $this->feature->add_vector_mapping_field( $mapping ); + } + + /** + * Exclude our vector meta from being synced. + * + * @param array $excluded_keys Current excluded keys. + * @return array + */ + public function exclude_vector_meta( array $excluded_keys ): array { + $excluded_keys[] = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); + return $excluded_keys; + } + + /** + * Add the embedding data to the term vector sync args. + * + * @param array $args Current sync args. + * @param int $term_id Term ID being synced. + * @return array + */ + public function add_vector_field_to_term_sync( array $args, int $term_id ): array { + $meta_field = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); + // Try to use the stored embeddings first. 
+ $embeddings = get_term_meta( $term_id, $meta_field, true ); + + // If they don't exist, make API requests to generate them. + if ( ! $embeddings ) { + $term = get_term( $term_id ); + + // Build up the content we want to generate embeddings for. + $content = $term->name . ' ' . $term->slug . ' ' . $term->description; + + $embeddings = []; + $content_chunks = $this->feature->chunk_content( $content ); + + // Get the embeddings for each chunk. + if ( ! empty( $content_chunks ) ) { + foreach ( $content_chunks as $chunk ) { + $embedding = $this->feature->get_embedding( $chunk ); + + if ( $embedding && ! is_wp_error( $embedding ) ) { + $embeddings[] = array_map( 'floatval', $embedding ); + } + } + } + + // Store the embeddings for future use. + if ( ! empty( $embeddings ) ) { + update_term_meta( $term_id, $meta_field, $embeddings ); + } + } + + // If we still don't have embeddings, return early. + if ( ! $embeddings || empty( $embeddings ) ) { + return $args; + } + + // Add the embeddings data to the sync args. 
+ $args['chunks'] = []; + + foreach ( $embeddings as $embedding ) { + $args['chunks'][] = [ + 'vector' => array_map( 'floatval', $embedding ), + ]; + } + + return $args; + } +} diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php new file mode 100644 index 0000000..e8b82c2 --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -0,0 +1,493 @@ + '', + 'ep_openai_embeddings_api_url' => 'https://api.openai.com/v1/embeddings', + 'ep_openai_embedding_model' => 'text-embedding-3-small', + 'ep_vector_embeddings_meta_field' => 'vector_embeddings', + 'ep_external_embedding' => '0', + ]; + + /** + * Initialize feature setting it's config + */ + public function __construct() { + $this->slug = 'vector_embeddings'; + + $this->title = esc_html__( 'Vector Embeddings', 'elasticpress-labs' ); + + $this->requires_install_reindex = true; + + $this->summary = __( + 'This feature enables storage of vector embeddings, a numerical representation of the indexed content that can capture semantic relationships and similarities between data points. These embeddings are often used by AI models to process and understand complex information more efficiently and are used for features like natural language processing, recommendations and computer vision.', + 'elasticpress-labs' + ); + + $this->es_version = Elasticsearch::factory()->get_elasticsearch_version(); + + parent::__construct(); + } + + /** + * Connects the Module with WordPress using Hooks and/or Filters. + * + * @return void + */ + public function setup() { + $post_indexable = new Indexables\Post( $this ); + $post_indexable->setup(); + + $term_indexable = new Indexables\Term( $this ); + $term_indexable->setup(); + } + + /** + * Tell user whether requirements for feature are met or not. 
+ * + * @return FeatureRequirementsStatus Requirements object + */ + public function requirements_status() { + $status = new \ElasticPress\FeatureRequirementsStatus( 1 ); + + // Vector support was added in Elasticsearch 7.0. + if ( version_compare( $this->es_version, '7.0', '<=' ) ) { + $status->code = 2; + $status->message = esc_html__( 'You need to have Elasticsearch with version >7.0.', 'elasticpress-labs' ); + } + + return $status; + } + + /** + * Set the `settings_schema` attribute + */ + public function set_settings_schema() { + $this->settings_schema = [ + [ + 'key' => 'ep_openai_api_key', + 'label' => __( 'OpenAI API Key', 'elasticpress-labs' ), + 'help' => sprintf( + wp_kses( + /* translators: %1$s: OpenAI sign up URL */ + __( 'Don\'t have an OpenAI account yet? Sign up for one in order to get your API key.', 'elasticpress-labs' ), + [ + 'a' => [ + 'href' => [], + 'title' => [], + ], + ] + ), + esc_url( 'https://platform.openai.com/signup' ) + ), + 'type' => 'text', + ], + [ + 'help' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), + 'key' => 'ep_openai_embeddings_api_url', + 'label' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), + 'type' => 'text', + ], + [ + 'help' => __( 'OpenAI Embedding model', 'elasticpress-labs' ), + 'key' => 'ep_openai_embedding_model', + 'label' => __( 'The name of the embedding model to use', 'elasticpress-labs' ), + 'type' => 'text', + ], + [ + 'help' => __( 'Specify the postmeta field name that will hold vector embeddings and will be added as dense vector in Elasticsearch mapping.', 'elasticpress-labs' ), + 'key' => 'ep_vector_embeddings_meta_field', + 'label' => __( 'Meta field holding the vector_embeddings', 'elasticpress-labs' ), + 'type' => 'text', + ], + [ + 'key' => 'ep_external_embedding', + 'help' => __( 'Enable this if an external process is providing the vector_embeddings meta field provided above with content. 
This will disable ElasticPress\'s control over embedding generation', 'elasticpress-labs' ), + 'label' => __( 'External embedding processing', 'elasticpress-labs' ), + 'type' => 'checkbox', + ], + ]; + } + + /** + * Add a vector field to the Elasticsearch mapping. + * + * @param array $mapping Current mapping. + * @param null|int $dimensions Number of dimensions for the vector field. + * @param bool $quantization Whether to use quantization for the vector field. Default false. + * @return array + */ + public function add_vector_mapping_field( array $mapping, $dimensions = null, bool $quantization = true ): array { + // Don't add the field if it already exists. + if ( isset( $mapping['mappings']['properties']['chunks'] ) ) { + return $mapping; + } + + // This needs to match the dimensions your model uses and be between 1 and 4096. + if ( ! $dimensions ) { + $dimensions = $this->get_dimensions(); + } + $calc_dimensions = max( 1, min( 4096, $dimensions ) ); + + // Add the default vector field mapping. + $mapping['mappings']['properties']['chunks'] = [ + 'type' => 'nested', + 'properties' => [ + 'vector' => [ + 'type' => 'dense_vector', + 'dims' => (int) $calc_dimensions, + ], + ], + ]; + + // Add extra vector fields for newer versions of Elasticsearch. + if ( version_compare( $this->es_version, '8.0', '>=' ) ) { + // The index (true or false, default true) and similarity (l2_norm, dot_product or cosine) fields + // were added in 8.0. The similarity field must be set if index is true. + $mapping['mappings']['properties']['chunks']['properties']['vector'] = array_merge( + $mapping['mappings']['properties']['chunks']['properties']['vector'], + [ + 'index' => true, + 'similarity' => 'cosine', + ] + ); + + // The element_type field was added in 8.6. This can be either float (default) or byte. 
+ if ( version_compare( $this->es_version, '8.6', '>=' ) ) { + $mapping['mappings']['properties']['chunks']['properties']['vector']['element_type'] = 'float'; + } + + // The int8_hnsw type was added in 8.12. + if ( $quantization && version_compare( $this->es_version, '8.12', '>=' ) ) { + // This is supposed to result in better performance but slightly less accurate results. + // See https://www.elastic.co/guide/en/elasticsearch/reference/8.13/knn-search.html#knn-search-quantized-example. + // Can test with this on and off and compare results to see what works best. + $mapping['mappings']['properties']['chunks']['properties']['vector']['index_options']['type'] = 'int8_hnsw'; + } + } + + return $mapping; + } + + /** + * Get an embedding from a given text. + * + * @param string $text Text to get the embedding for. + * @param bool $cache Whether to cache the result. Default false. + * @return array|WP_Error + */ + public function get_embedding( string $text, bool $cache = false ) { + // Check to see if we have a stored embedding. + if ( $cache ) { + $key = 'ep_embedding_' . sanitize_title( $text ); + $query_embedding = wp_cache_get( $key, 'ep_embeddings' ); + + if ( $query_embedding ) { + return $query_embedding; + } + } + + // Generate the embedding. + $embedding = $this->generate_embedding( $text ); + + if ( is_wp_error( $embedding ) ) { + return $embedding; + } + + // Store the embedding for future use if desired. + if ( $cache ) { + wp_cache_set( $key, $embedding, 'ep_embeddings', false ); + } + + return $embedding; + } + + /** + * Generate an embedding for a particular piece of text. + * + * @param string $text Text to generate the embedding for. + * @return array|boolean|WP_Error + */ + public function generate_embedding( string $text = '' ) { + /** + * Filter the URL for the post request. + * + * @hook ep_openai_embeddings_api_url + * @since 2.4.0 + * + * @param {string} $url The URL for the request. + * + * @return {string} The URL for the request. 
+ */ + $url = apply_filters( 'ep_openai_embeddings_api_url', $this->get_setting( 'ep_openai_embeddings_api_url' ) ); + + /** + * Filter the request body before sending to OpenAI. + * + * @hook ep_openai_embeddings_request_body + * @since 2.4.0 + * + * @param {array} $body Request body that will be sent to OpenAI. + * @param {string} $text Text we are getting embeddings for. + * + * @return {array} Request body. + */ + $body = apply_filters( + 'ep_openai_embeddings_request_body', + [ + 'model' => $this->get_setting( 'ep_openai_embedding_model' ), + 'input' => $text, + 'dimensions' => $this->get_dimensions(), + ], + $text + ); + + /** + * Filter the options for the post request. + * + * @hook ep_openai_embeddings_options + * @since 2.4.0 + * + * @param {array} $options The options for the request. + * @param {string} $url The URL for the request. + * + * @return {array} The options for the request. + */ + $options = apply_filters( + 'ep_openai_embeddings_options', + [ + 'body' => wp_json_encode( $body ), + 'timeout' => 60, // phpcs:ignore WordPressVIPMinimum.Performance.RemoteRequestTimeout.timeout_timeout + ], + $url + ); + + $this->add_headers( $options ); + + // Make our API request. + $response = $this->get_result( + wp_remote_post( + $url, + $options + ) + ); + + if ( is_wp_error( $response ) ) { + return $response; + } + + if ( empty( $response['data'] ) ) { + return new WP_Error( 'no_data', esc_html__( 'No data returned from OpenAI.', 'elasticpress-labs' ) ); + } + + $return = []; + + // Parse out the embeddings response. + foreach ( $response['data'] as $data ) { + if ( ! isset( $data['embedding'] ) || ! is_array( $data['embedding'] ) ) { + continue; + } + + $return = $data['embedding']; + break; + } + + return $return; + } + + /** + * Get results from the response. + * + * @param object $response The API response. 
+ * @return array|WP_Error + */ + public function get_result( $response ) { + if ( is_wp_error( $response ) ) { + return $response; + } + + $headers = wp_remote_retrieve_headers( $response ); + $content_type = false; + + if ( ! empty( $headers ) ) { + $content_type = isset( $headers['content-type'] ) ? $headers['content-type'] : false; + } + + $body = wp_remote_retrieve_body( $response ); + $code = wp_remote_retrieve_response_code( $response ); + + if ( false === $content_type || false !== strpos( $content_type, 'application/json' ) ) { + $json = json_decode( $body, true ); + + if ( json_last_error() === JSON_ERROR_NONE ) { + if ( empty( $json['error'] ) ) { + return $json; + } else { + $message = $json['error']['message'] ?? esc_html__( 'An error occured', 'elasticpresslabs' ); + return new WP_Error( $code, $message ); + } + } else { + return new WP_Error( 'Invalid JSON: ' . json_last_error_msg(), $body ); + } + } elseif ( $content_type && false !== strpos( $content_type, 'audio/mpeg' ) ) { + return $response; + } else { + return new WP_Error( 'Invalid content type', $response ); + } + } + + /** + * Normalizes content into plain text. + * + * @param string $content Content to normalize. + * @return string + */ + public function normalize_content( string $content = '' ): string { + $content = apply_filters( 'the_content', $content ); + + // Strip shortcodes but keep internal caption text. + $content = preg_replace( '#\[.+\](.+)\[/.+\]#', '$1', $content ); + + // Strip HTML entities. + $content = preg_replace( '/&#?[a-z0-9]{2,8};/i', '', $content ); + + // Replace HTML linebreaks with newlines. + $content = preg_replace( '##', "\n\n", $content ); + + // Strip all HTML tags. + $content = wp_strip_all_tags( $content ); + + return $content; + } + + /** + * Chunk content into smaller pieces with an overlap. + * + * @param string $content Content to chunk. + * @param int $chunk_size Size of each chunk, in words. 
+ * @param int $overlap_size Overlap size for each chunk, in words. + * @return array + */ + public function chunk_content( string $content = '', int $chunk_size = 150, $overlap_size = 25 ): array { + // Normalize our content. + $content = $this->normalize_content( $content ); + + // Remove multiple whitespaces. + $content = preg_replace( '/\s+/', ' ', $content ); + + // Split text by single whitespace. + $words = explode( ' ', $content ); + + $chunks = []; + $text_count = count( $words ); + + // Iterate through & chunk data with an overlap. + for ( $i = 0; $i < $text_count; $i += $chunk_size ) { + // Join a set of words into a string. + $chunk = implode( + ' ', + array_slice( + $words, + max( $i - $overlap_size, 0 ), + $i + $chunk_size + ) + ); + + array_push( $chunks, $chunk ); + } + + return $chunks; + } + + /** + * Get the number of dimensions for the embeddings. + * + * @return int + */ + public function get_dimensions(): int { + /** + * Filter the dimensions we want for each embedding. + * + * Useful if you want to increase or decrease the length + * of each embedding. + * + * @hook ep_openai_embeddings_dimensions + * @since 2.4.0 + * + * @param {int} $dimensions The default dimensions. + * @return {int} The dimensions. + */ + return apply_filters( 'ep_openai_embeddings_dimensions', $this->dimensions ); + } + + /** + * Add the headers. + * + * @param array $options The header options, passed by reference. + */ + public function add_headers( array &$options = [] ) { + if ( empty( $options['headers'] ) ) { + $options['headers'] = []; + } + + if ( ! isset( $options['headers']['Authorization'] ) ) { + $options['headers']['Authorization'] = $this->get_auth_header(); + } + + if ( ! isset( $options['headers']['Content-Type'] ) ) { + $options['headers']['Content-Type'] = 'application/json'; + } + } + + /** + * Get the auth header. + * + * @return string + */ + public function get_auth_header() { + return 'Bearer ' . 
$this->get_setting( 'ep_openai_api_key' ); + } +} diff --git a/includes/functions/core.php b/includes/functions/core.php index f7cae24..05ffcd7 100644 --- a/includes/functions/core.php +++ b/includes/functions/core.php @@ -232,6 +232,9 @@ function maybe_load_features() { \ElasticPress\Features::factory()->register_feature( $subfeature ); } } + + $vector_embeddings = new \ElasticPressLabs\Feature\VectorEmbeddings\VectorEmbeddings(); + \ElasticPress\Features::factory()->register_feature( $vector_embeddings ); } /** From 265b928e44f1776a673f1a95af6e1c2cba669d87 Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 20 Jan 2025 18:02:13 -0300 Subject: [PATCH 02/17] Custom table to store vectors --- .../Feature/VectorEmbeddings/Indexable.php | 71 ++++++ .../VectorEmbeddings/Indexables/Post.php | 84 +++---- .../VectorEmbeddings/Storage/DbTable.php | 207 ++++++++++++++++++ .../VectorEmbeddings/VectorEmbeddings.php | 33 ++- 4 files changed, 332 insertions(+), 63 deletions(-) create mode 100644 includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php index e129d62..4187896 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexable.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -30,4 +30,75 @@ abstract class Indexable { public function __construct( VectorEmbeddings $feature ) { $this->feature = $feature; } + + /** + * Given an object and its content pieces, return the embeddings and clean up unused embeddings stored. 
+ * + * @param int $object_id The object ID + * @param string $object_type The object type + * @param array $content_pieces Content pieces to get embeddings for + * @return array + */ + public function get_updated_embeddings( int $object_id, string $object_type, array $content_pieces ): array { + $all_hashes = $this->feature->storage->get_all_object_hashes( $object_id, 'post' ); + + $hashes_in_use = []; + $embeddings = []; + foreach ( $content_pieces as $content_piece ) { + $content_chunks = $this->feature->chunk_content( $content_piece ); + + // Get the embeddings for each chunk. + if ( ! empty( $content_chunks ) ) { + foreach ( $content_chunks as $chunk ) { + $hash = $this->feature->storage->hash_content( $chunk ); + + $hashes_in_use[] = $hash; + + if ( isset( $all_hashes[ $hash ] ) ) { + $embeddings[] = $all_hashes[ $hash ]; + continue; + } + + $embedding = $this->feature->get_embedding( $object_id, $object_type, $chunk ); + if ( $embedding ) { + $embeddings[] = $embedding; + } + } + } + } + + $hashes_in_use = array_unique( $hashes_in_use ); + + $unused_hashes = array_diff( array_keys( $all_hashes ), $hashes_in_use ); + foreach ( $unused_hashes as $unused_hash ) { + $this->feature->storage->delete( $object_id, $object_type, $unused_hash ); + } + + return $embeddings; + } + + /** + * Add the embedding data to the post vector sync args. + * + * @param array $args The current sync args (an Elasticsearch document) + * @param array $embeddings The embeddings to add to the sync args + * @return array + */ + public function add_chuncks_field_value( array $args, array $embeddings ): array { + // If we still don't have embeddings, return early. + if ( empty( $embeddings ) ) { + return $args; + } + + // Add the embeddings data to the sync args. 
+ $args['chunks'] = []; + + foreach ( $embeddings as $embedding ) { + $args['chunks'][] = [ + 'vector' => array_map( 'floatval', $embedding ), + ]; + } + + return $args; + } } diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index 01fb95c..f78ea55 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -59,65 +59,43 @@ public function exclude_vector_meta( array $excluded_keys ): array { * @return array */ public function add_vector_field_to_post_sync( array $args, int $post_id ): array { - // No need to add vector data if no content exists. - $post = get_post( $post_id ); - if ( empty( $post->post_content ) ) { + if ( ! $this->should_add_vector_field_to_post( $post_id ) ) { return $args; } - $meta_field = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); - - // Try to use the stored embeddings first. - $embeddings = get_post_meta( $post_id, $meta_field, true ); - - // If they don't exist, make API requests to generate them. - if ( ! $embeddings ) { - $embeddings = []; - - $content_chunks = $this->feature->chunk_content( $post->post_content ); - - // Get the embeddings for each chunk. - if ( ! empty( $content_chunks ) ) { - foreach ( $content_chunks as $chunk ) { - $embedding = $this->feature->get_embedding( $chunk ); - - if ( $embedding && ! is_wp_error( $embedding ) ) { - $embeddings[] = array_map( 'floatval', $embedding ); - } - } - } - // Add embeddings for title. - $title_embedding = $this->feature->get_embedding( $this->feature->normalize_content( $post->post_title ) ); - if ( $title_embedding && ! 
is_wp_error( $title_embedding ) ) { - $embeddings[] = array_map( 'floatval', $title_embedding ); - } + $content_pieces = $this->get_object_content_pieces( $post_id ); + $embeddings = $this->get_updated_embeddings( $post_id, 'post', $content_pieces ); - // Add embeddings for slug. - $slug_embedding = $this->feature->get_embedding( $post->post_name ); - if ( $slug_embedding && ! is_wp_error( $slug_embedding ) ) { - $embeddings[] = array_map( 'floatval', $slug_embedding ); - } - - // Store the embeddings for future use. - if ( ! empty( $embeddings ) ) { - update_post_meta( $post_id, $meta_field, $embeddings ); - } - } - - // If we still don't have embeddings, return early. - if ( ! $embeddings || empty( $embeddings ) ) { - return $args; - } + return $this->add_chuncks_field_value( $args, $embeddings ); + } - // Add the embeddings data to the sync args. - $args['chunks'] = []; + /** + * Whether or not we should add the vector field to the post. + * + * @param int $post_id The Post ID + * @return boolean + */ + public function should_add_vector_field_to_post( int $post_id ): bool { + $post = get_post( $post_id ); + return ! empty( $post ); + } - foreach ( $embeddings as $embedding ) { - $args['chunks'][] = [ - 'vector' => array_map( 'floatval', $embedding ), - ]; - } + /** + * Return all content pieces for a given post ID. + * + * By default includes the title, the slug, and the post content, but could also add + * meta fields and taxonomy terms, for example. 
+ * + * @param int $post_id The Post ID + * @return array + */ + public function get_object_content_pieces( int $post_id ): array { + $post = get_post( $post_id ); - return $args; + return [ + $post->post_content, + $post->post_title, + $post->post_name, + ]; } } diff --git a/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php b/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php new file mode 100644 index 0000000..46950e3 --- /dev/null +++ b/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php @@ -0,0 +1,207 @@ +feature = $feature; + } + + /** + * Setup hooks + */ + public function setup() { + add_action( 'init', [ $this, 'create_table' ] ); + } + + /** + * Return the custom table name + * + * @return string + */ + public function get_table_name(): string { + global $wpdb; + + return $wpdb->prefix . 'ep_embeddings_table'; + } + + /** + * Create the table + */ + public function create_table() { + global $wpdb; + + if ( $this->table_exists() ) { + return; + } + + $table_name = $this->get_table_name(); + + $charset_collate = $wpdb->get_charset_collate(); + + $sql = "CREATE TABLE $table_name ( + id bigint(20) unsigned NOT NULL AUTO_INCREMENT, + object_id bigint(20) unsigned NOT NULL, + object_type varchar(32) NOT NULL, + hash varchar(32) NOT NULL, + vectors longtext NOT NULL, + PRIMARY KEY (id), + INDEX object (object_id, object_type) + ) $charset_collate;"; + + require_once ABSPATH . 'wp-admin/includes/upgrade.php'; + dbDelta( $sql ); + } + + /** + * Whether or not the table exists + * + * @return boolean + */ + public function table_exists(): bool { + global $wpdb; + + $table_name = $this->get_table_name(); + + $table_exists = $wpdb->get_var( + $wpdb->prepare( 'SHOW TABLES LIKE %s', $table_name ) + ); + return ! \is_wp_error( $table_exists ) && ! 
\is_null( $table_exists ); + } + + /** + * Insert a new entry in the database + * + * @param integer $object_id The object ID + * @param string $object_type The object type + * @param string $text The text. It will be hashed and used as a key + * @param array $vectors Array of vectors + * @return void + */ + public function insert( int $object_id, string $object_type, string $text, array $vectors ) { + global $wpdb; + + $table_name = $this->get_table_name(); + + $wpdb->insert( + $table_name, + [ + 'object_id' => $object_id, + 'object_type' => $object_type, + 'hash' => $this->hash_content( $text ), + 'vectors' => wp_json_encode( $vectors ), + ] + ); + } + + /** + * Given a text, return the vectors if they exist in the database + * + * @param string $text The text. It will be hashed and used as a key + * @return array|null + */ + public function get( string $text ) { + global $wpdb; + + $table_name = $this->get_table_name(); + + $vectors = $wpdb->get_var( + $wpdb->prepare( + 'SELECT vectors FROM %s WHERE hash = %s', + $table_name, + $this->hash_content( $text ) + ) + ); + + return $vectors ? 
json_decode( $vectors ) : null; + } + + /** + * Get a full list of hashes for a given object + * + * @param integer $object_id The object ID + * @param string $object_type The object type + * @return array + */ + public function get_all_object_hashes( int $object_id, string $object_type ): array { + global $wpdb; + + $table_name = $this->get_table_name(); + + $rows = $wpdb->get_results( + $wpdb->prepare( + 'SELECT hash, vectors FROM %s WHERE object_id = %d AND object_type = %s', + $table_name, + $object_id, + $object_type + ) + ); + + return array_reduce( + $rows, + function ( $carry, $row ) { + $carry[ $row->hash ] = json_decode( $row->vectors ); + return $carry; + }, + [] + ); + } + + /** + * Delete a hash of a given object + * + * @param integer $object_id The object ID + * @param string $object_type The object type + * @param string $hash The hash + * @return void + */ + public function delete( int $object_id, string $object_type, string $hash ) { + global $wpdb; + + $table_name = $this->get_table_name(); + + $wpdb->delete( + $table_name, + [ + 'object_id' => $object_id, + 'object_type' => $object_type, + 'hash' => $hash, + ] + ); + } + + /** + * Hash the content. Uses md5 by default. + * + * @param string $content The content + * @return string + */ + public function hash_content( string $content ): string { + return md5( $content ); + } +} diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index e8b82c2..ecb46ba 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -39,6 +39,13 @@ class VectorEmbeddings extends Feature { */ protected $dimensions = 512; + /** + * Storage class instance. 
+ * + * @var Storage\DbTable + */ + public $storage; + /** * Default settings * @@ -83,6 +90,9 @@ public function setup() { $term_indexable = new Indexables\Term( $this ); $term_indexable->setup(); + + $this->storage = new Storage\DbTable( $this ); + $this->storage->setup(); } /** @@ -215,18 +225,20 @@ public function add_vector_mapping_field( array $mapping, $dimensions = null, bo /** * Get an embedding from a given text. * - * @param string $text Text to get the embedding for. - * @param bool $cache Whether to cache the result. Default false. - * @return array|WP_Error + * @param int $object_id The Object ID. + * @param string $object_type The Object type. + * @param string $text Text to get the embedding for. + * @param string $return_type Return type ('array' or 'raw'). Default 'array'. + * @param bool $cache Whether to cache the result. Default true. + * @return array|null|WP_Error */ - public function get_embedding( string $text, bool $cache = false ) { + public function get_embedding( int $object_id, string $object_type, string $text, string $return_type = 'array', bool $cache = true ) { // Check to see if we have a stored embedding. if ( $cache ) { - $key = 'ep_embedding_' . sanitize_title( $text ); - $query_embedding = wp_cache_get( $key, 'ep_embeddings' ); + $cached = $this->storage->get( $text ); - if ( $query_embedding ) { - return $query_embedding; + if ( $cached ) { + return $cached; } } @@ -234,17 +246,18 @@ public function get_embedding( string $text, bool $cache = false ) { $embedding = $this->generate_embedding( $text ); if ( is_wp_error( $embedding ) ) { - return $embedding; + return 'raw' === $return_type ? $embedding : null; } // Store the embedding for future use if desired. if ( $cache ) { - wp_cache_set( $key, $embedding, 'ep_embeddings', false ); + $this->storage->insert( $object_id, $object_type, $text, $embedding ); } return $embedding; } + /** * Generate an embedding for a particular piece of text. 
* From 689fedb465a41352e518802f2374fdb566907b90 Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 20 Jan 2025 18:24:26 -0300 Subject: [PATCH 03/17] Fix tablename --- .../classes/Feature/VectorEmbeddings/Storage/DbTable.php | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php b/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php index 46950e3..e8614e9 100644 --- a/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php +++ b/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php @@ -132,8 +132,8 @@ public function get( string $text ) { $vectors = $wpdb->get_var( $wpdb->prepare( - 'SELECT vectors FROM %s WHERE hash = %s', - $table_name, + // phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared + "SELECT vectors FROM {$table_name} WHERE hash = %s", $this->hash_content( $text ) ) ); @@ -155,8 +155,8 @@ public function get_all_object_hashes( int $object_id, string $object_type ): ar $rows = $wpdb->get_results( $wpdb->prepare( - 'SELECT hash, vectors FROM %s WHERE object_id = %d AND object_type = %s', - $table_name, + // phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared + "SELECT hash, vectors FROM {$table_name} WHERE object_id = %d AND object_type = %s", $object_id, $object_type ) From 01ccdd050235618a4d6f82b0d3ab87ce7c656fb9 Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Wed, 22 Jan 2025 08:25:24 -0300 Subject: [PATCH 04/17] Different object representation --- .../Feature/VectorEmbeddings/Indexable.php | 45 ++++--------------- .../VectorEmbeddings/Indexables/Post.php | 33 +++++++++----- .../VectorEmbeddings/VectorEmbeddings.php | 38 ++++++++++------ 3 files changed, 56 insertions(+), 60 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php index 4187896..bad29ad 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexable.php +++ 
b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -34,47 +34,18 @@ public function __construct( VectorEmbeddings $feature ) { /** * Given an object and its content pieces, return the embeddings and clean up unused embeddings stored. * - * @param int $object_id The object ID - * @param string $object_type The object type - * @param array $content_pieces Content pieces to get embeddings for + * @param int $object_id The object ID + * @param string $object_type The object type + * @param string $object_representation A string representing the object * @return array */ - public function get_updated_embeddings( int $object_id, string $object_type, array $content_pieces ): array { - $all_hashes = $this->feature->storage->get_all_object_hashes( $object_id, 'post' ); + public function get_updated_embeddings( int $object_id, string $object_type, string $object_representation ): array { + $embeddings = []; + $content_chunks = $this->feature->chunk_content( $object_representation ); - $hashes_in_use = []; - $embeddings = []; - foreach ( $content_pieces as $content_piece ) { - $content_chunks = $this->feature->chunk_content( $content_piece ); + $embeddings = $this->feature->get_embedding( $object_id, $object_type, $content_chunks ); - // Get the embeddings for each chunk. - if ( ! 
empty( $content_chunks ) ) { - foreach ( $content_chunks as $chunk ) { - $hash = $this->feature->storage->hash_content( $chunk ); - - $hashes_in_use[] = $hash; - - if ( isset( $all_hashes[ $hash ] ) ) { - $embeddings[] = $all_hashes[ $hash ]; - continue; - } - - $embedding = $this->feature->get_embedding( $object_id, $object_type, $chunk ); - if ( $embedding ) { - $embeddings[] = $embedding; - } - } - } - } - - $hashes_in_use = array_unique( $hashes_in_use ); - - $unused_hashes = array_diff( array_keys( $all_hashes ), $hashes_in_use ); - foreach ( $unused_hashes as $unused_hash ) { - $this->feature->storage->delete( $object_id, $object_type, $unused_hash ); - } - - return $embeddings; + return $embeddings ?? []; } /** diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index f78ea55..c9a1f0e 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -63,8 +63,8 @@ public function add_vector_field_to_post_sync( array $args, int $post_id ): arra return $args; } - $content_pieces = $this->get_object_content_pieces( $post_id ); - $embeddings = $this->get_updated_embeddings( $post_id, 'post', $content_pieces ); + $object_representation = $this->get_object_representation( $post_id ); + $embeddings = $this->get_updated_embeddings( $post_id, 'post', $object_representation ); return $this->add_chuncks_field_value( $args, $embeddings ); } @@ -81,21 +81,34 @@ public function should_add_vector_field_to_post( int $post_id ): bool { } /** - * Return all content pieces for a given post ID. + * Return a representation of a post. * * By default includes the title, the slug, and the post content, but could also add * meta fields and taxonomy terms, for example. 
* * @param int $post_id The Post ID - * @return array + * @return string */ - public function get_object_content_pieces( int $post_id ): array { + public function get_object_representation( int $post_id ): string { $post = get_post( $post_id ); - return [ - $post->post_content, - $post->post_title, - $post->post_name, - ]; + $return = ''; + + $title = get_the_title( $post_id ); + if ( $title ) { + $return .= "# Title\n{$title}\n\n"; + } + + if ( ! empty( $post->post_excerpt ) ) { + $excerpt = get_the_excerpt( $post_id ); + $return .= "# Summary\n{$excerpt}\n\n"; + } + + $content = get_the_content( $post_id ); + if ( $content ) { + $return .= "--\n{$content}\n\n"; + } + + return $return; } } diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index ecb46ba..88c4d15 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -225,14 +225,14 @@ public function add_vector_mapping_field( array $mapping, $dimensions = null, bo /** * Get an embedding from a given text. * - * @param int $object_id The Object ID. - * @param string $object_type The Object type. - * @param string $text Text to get the embedding for. - * @param string $return_type Return type ('array' or 'raw'). Default 'array'. - * @param bool $cache Whether to cache the result. Default true. + * @param int $object_id The Object ID. + * @param string $object_type The Object type. + * @param string|array $text Text or array of strings to get the embedding for. + * @param string $return_type Return type ('array' or 'raw'). Default 'array'. + * @param bool $cache Whether to cache the result. Default true. 
* @return array|null|WP_Error */ - public function get_embedding( int $object_id, string $object_type, string $text, string $return_type = 'array', bool $cache = true ) { + public function get_embedding( int $object_id, string $object_type, $text, string $return_type = 'array', bool $cache = false ) { // Check to see if we have a stored embedding. if ( $cache ) { $cached = $this->storage->get( $text ); @@ -243,9 +243,16 @@ public function get_embedding( int $object_id, string $object_type, string $text } // Generate the embedding. + if ( defined( 'WP_CLI' ) && WP_CLI ) { + \WP_CLI::line( "Generating embedding for {$object_type} ID: {$object_id}" ); + } else { + error_log( '$object_id: ' . $object_id . ' - $object_type: ' . $object_type ); + error_log( print_r( $text, true ) ); + } $embedding = $this->generate_embedding( $text ); if ( is_wp_error( $embedding ) ) { + error_log( print_r( $embedding, true ) ); return 'raw' === $return_type ? $embedding : null; } @@ -257,14 +264,13 @@ public function get_embedding( int $object_id, string $object_type, string $text return $embedding; } - /** * Generate an embedding for a particular piece of text. * - * @param string $text Text to generate the embedding for. + * @param string|array $text Text (or array of strings) to generate the embedding for. * @return array|boolean|WP_Error */ - public function generate_embedding( string $text = '' ) { + public function generate_embedding( $text = '' ) { /** * Filter the URL for the post request. 
* @@ -292,7 +298,7 @@ public function generate_embedding( string $text = '' ) { 'ep_openai_embeddings_request_body', [ 'model' => $this->get_setting( 'ep_openai_embedding_model' ), - 'input' => $text, + 'input' => (array) $text, 'dimensions' => $this->get_dimensions(), ], $text @@ -328,6 +334,8 @@ public function generate_embedding( string $text = '' ) { ) ); + error_log( 'generating embed' ); + if ( is_wp_error( $response ) ) { return $response; } @@ -344,8 +352,12 @@ public function generate_embedding( string $text = '' ) { continue; } - $return = $data['embedding']; - break; + if ( is_string( $text ) ) { + $return = $data['embedding']; + break; + } + + $return[] = $data['embedding']; } return $return; @@ -429,7 +441,7 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove $content = $this->normalize_content( $content ); // Remove multiple whitespaces. - $content = preg_replace( '/\s+/', ' ', $content ); + $content = preg_replace( '/[ \t\r\f]+/', ' ', $content ); // Split text by single whitespace. 
$words = explode( ' ', $content ); From ad89f61a1f1834a1a12a00d1e53ce783f5ee2ffa Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Fri, 24 Jan 2025 10:09:20 -0300 Subject: [PATCH 05/17] Simplify things a bit --- .../VectorEmbeddings/Indexables/Post.php | 2 +- .../VectorEmbeddings/Storage/DbTable.php | 207 ------------------ .../VectorEmbeddings/VectorEmbeddings.php | 36 +-- 3 files changed, 6 insertions(+), 239 deletions(-) delete mode 100644 includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index c9a1f0e..e32d509 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -106,7 +106,7 @@ public function get_object_representation( int $post_id ): string { $content = get_the_content( $post_id ); if ( $content ) { - $return .= "--\n{$content}\n\n"; + $return .= "# Content\n{$content}\n\n"; } return $return; diff --git a/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php b/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php deleted file mode 100644 index e8614e9..0000000 --- a/includes/classes/Feature/VectorEmbeddings/Storage/DbTable.php +++ /dev/null @@ -1,207 +0,0 @@ -feature = $feature; - } - - /** - * Setup hooks - */ - public function setup() { - add_action( 'init', [ $this, 'create_table' ] ); - } - - /** - * Return the custom table name - * - * @return string - */ - public function get_table_name(): string { - global $wpdb; - - return $wpdb->prefix . 
'ep_embeddings_table'; - } - - /** - * Create the table - */ - public function create_table() { - global $wpdb; - - if ( $this->table_exists() ) { - return; - } - - $table_name = $this->get_table_name(); - - $charset_collate = $wpdb->get_charset_collate(); - - $sql = "CREATE TABLE $table_name ( - id bigint(20) unsigned NOT NULL AUTO_INCREMENT, - object_id bigint(20) unsigned NOT NULL, - object_type varchar(32) NOT NULL, - hash varchar(32) NOT NULL, - vectors longtext NOT NULL, - PRIMARY KEY (id), - INDEX object (object_id, object_type) - ) $charset_collate;"; - - require_once ABSPATH . 'wp-admin/includes/upgrade.php'; - dbDelta( $sql ); - } - - /** - * Whether or not the table exists - * - * @return boolean - */ - public function table_exists(): bool { - global $wpdb; - - $table_name = $this->get_table_name(); - - $table_exists = $wpdb->get_var( - $wpdb->prepare( 'SHOW TABLES LIKE %s', $table_name ) - ); - return ! \is_wp_error( $table_exists ) && ! \is_null( $table_exists ); - } - - /** - * Insert a new entry in the database - * - * @param integer $object_id The object ID - * @param string $object_type The object type - * @param string $text The text. It will be hashed and used as a key - * @param array $vectors Array of vectors - * @return void - */ - public function insert( int $object_id, string $object_type, string $text, array $vectors ) { - global $wpdb; - - $table_name = $this->get_table_name(); - - $wpdb->insert( - $table_name, - [ - 'object_id' => $object_id, - 'object_type' => $object_type, - 'hash' => $this->hash_content( $text ), - 'vectors' => wp_json_encode( $vectors ), - ] - ); - } - - /** - * Given a text, return the vectors if they exist in the database - * - * @param string $text The text. 
It will be hashed and used as a key - * @return array|null - */ - public function get( string $text ) { - global $wpdb; - - $table_name = $this->get_table_name(); - - $vectors = $wpdb->get_var( - $wpdb->prepare( - // phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared - "SELECT vectors FROM {$table_name} WHERE hash = %s", - $this->hash_content( $text ) - ) - ); - - return $vectors ? json_decode( $vectors ) : null; - } - - /** - * Get a full list of hashes for a given object - * - * @param integer $object_id The object ID - * @param string $object_type The object type - * @return array - */ - public function get_all_object_hashes( int $object_id, string $object_type ): array { - global $wpdb; - - $table_name = $this->get_table_name(); - - $rows = $wpdb->get_results( - $wpdb->prepare( - // phpcs:ignore WordPress.DB.PreparedSQL.InterpolatedNotPrepared - "SELECT hash, vectors FROM {$table_name} WHERE object_id = %d AND object_type = %s", - $object_id, - $object_type - ) - ); - - return array_reduce( - $rows, - function ( $carry, $row ) { - $carry[ $row->hash ] = json_decode( $row->vectors ); - return $carry; - }, - [] - ); - } - - /** - * Delete a hash of a given object - * - * @param integer $object_id The object ID - * @param string $object_type The object type - * @param string $hash The hash - * @return void - */ - public function delete( int $object_id, string $object_type, string $hash ) { - global $wpdb; - - $table_name = $this->get_table_name(); - - $wpdb->delete( - $table_name, - [ - 'object_id' => $object_id, - 'object_type' => $object_type, - 'hash' => $hash, - ] - ); - } - - /** - * Hash the content. Uses md5 by default. 
- * - * @param string $content The content - * @return string - */ - public function hash_content( string $content ): string { - return md5( $content ); - } -} diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index 88c4d15..2141ffc 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -39,13 +39,6 @@ class VectorEmbeddings extends Feature { */ protected $dimensions = 512; - /** - * Storage class instance. - * - * @var Storage\DbTable - */ - public $storage; - /** * Default settings * @@ -90,9 +83,6 @@ public function setup() { $term_indexable = new Indexables\Term( $this ); $term_indexable->setup(); - - $this->storage = new Storage\DbTable( $this ); - $this->storage->setup(); } /** @@ -229,38 +219,19 @@ public function add_vector_mapping_field( array $mapping, $dimensions = null, bo * @param string $object_type The Object type. * @param string|array $text Text or array of strings to get the embedding for. * @param string $return_type Return type ('array' or 'raw'). Default 'array'. - * @param bool $cache Whether to cache the result. Default true. * @return array|null|WP_Error */ - public function get_embedding( int $object_id, string $object_type, $text, string $return_type = 'array', bool $cache = false ) { - // Check to see if we have a stored embedding. - if ( $cache ) { - $cached = $this->storage->get( $text ); - - if ( $cached ) { - return $cached; - } - } - + public function get_embedding( int $object_id, string $object_type, $text, string $return_type = 'array' ) { // Generate the embedding. if ( defined( 'WP_CLI' ) && WP_CLI ) { \WP_CLI::line( "Generating embedding for {$object_type} ID: {$object_id}" ); - } else { - error_log( '$object_id: ' . $object_id . ' - $object_type: ' . 
$object_type ); - error_log( print_r( $text, true ) ); } $embedding = $this->generate_embedding( $text ); if ( is_wp_error( $embedding ) ) { - error_log( print_r( $embedding, true ) ); return 'raw' === $return_type ? $embedding : null; } - // Store the embedding for future use if desired. - if ( $cache ) { - $this->storage->insert( $object_id, $object_type, $text, $embedding ); - } - return $embedding; } @@ -443,6 +414,9 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove // Remove multiple whitespaces. $content = preg_replace( '/[ \t\r\f]+/', ' ', $content ); + // Remove multiple new lines. + $content = preg_replace( '/[\n\v]{2,}/', "\n\n", $content ); + // Split text by single whitespace. $words = explode( ' ', $content ); @@ -452,7 +426,7 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove // Iterate through & chunk data with an overlap. for ( $i = 0; $i < $text_count; $i += $chunk_size ) { // Join a set of words into a string. - $chunk = implode( + $chunk = 'search_document: ' . 
implode( ' ', array_slice( $words, From 9eb741323a0ab4bc0a02ab6d2c5b043f8f42e6dd Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Fri, 24 Jan 2025 12:07:24 -0300 Subject: [PATCH 06/17] Unify dimensions calculation + meta and terms data --- .../Feature/VectorEmbeddings/Indexable.php | 17 ---- .../VectorEmbeddings/Indexables/Post.php | 97 +++++++++++++++++-- .../VectorEmbeddings/VectorEmbeddings.php | 20 ++-- 3 files changed, 95 insertions(+), 39 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php index bad29ad..e96c184 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexable.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -31,23 +31,6 @@ public function __construct( VectorEmbeddings $feature ) { $this->feature = $feature; } - /** - * Given an object and its content pieces, return the embeddings and clean up unused embeddings stored. - * - * @param int $object_id The object ID - * @param string $object_type The object type - * @param string $object_representation A string representing the object - * @return array - */ - public function get_updated_embeddings( int $object_id, string $object_type, string $object_representation ): array { - $embeddings = []; - $content_chunks = $this->feature->chunk_content( $object_representation ); - - $embeddings = $this->feature->get_embedding( $object_id, $object_type, $content_chunks ); - - return $embeddings ?? []; - } - /** * Add the embedding data to the post vector sync args. 
* diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index e32d509..b16cc96 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -63,8 +63,8 @@ public function add_vector_field_to_post_sync( array $args, int $post_id ): arra return $args; } - $object_representation = $this->get_object_representation( $post_id ); - $embeddings = $this->get_updated_embeddings( $post_id, 'post', $object_representation ); + $post_chunks = $this->get_post_chunks( $post_id ); + $embeddings = $this->feature->get_embedding( $post_id, 'post', $post_chunks ); return $this->add_chuncks_field_value( $args, $embeddings ); } @@ -87,28 +87,105 @@ public function should_add_vector_field_to_post( int $post_id ): bool { * meta fields and taxonomy terms, for example. * * @param int $post_id The Post ID - * @return string + * @return array */ - public function get_object_representation( int $post_id ): string { + public function get_post_chunks( int $post_id ): array { $post = get_post( $post_id ); - $return = ''; + $main_content = ''; $title = get_the_title( $post_id ); if ( $title ) { - $return .= "# Title\n{$title}\n\n"; + $main_content .= "# Title\n{$title}\n\n"; } if ( ! 
empty( $post->post_excerpt ) ) { - $excerpt = get_the_excerpt( $post_id ); - $return .= "# Summary\n{$excerpt}\n\n"; + $excerpt = get_the_excerpt( $post_id ); + $main_content .= "# Summary\n{$excerpt}\n\n"; } $content = get_the_content( $post_id ); if ( $content ) { - $return .= "# Content\n{$content}\n\n"; + $main_content .= "# Content\n{$content}\n\n"; + } + + $chunks = $this->feature->chunk_content( $main_content ); + + $post_terms_str = $this->get_post_terms( $post ); + if ( $post_terms_str ) { + $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_terms_str ) ]; + } + + $post_meta_str = $this->get_post_meta( $post ); + if ( $post_meta_str ) { + $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_meta_str ) ]; + } + + return $chunks; + } + + /** + * Get the representation of the post terms. + * + * @param \WP_Post $post The post object + * @return string + */ + protected function get_post_terms( $post ): string { + $post_terms_str = ''; + $post_terms = []; + $indexable = \ElasticPress\Indexables::factory()->get( 'post' ); + $indexable_taxonomies = $indexable->get_indexable_post_taxonomies( $post ); + $taxonomy_by_names = wp_list_pluck( $indexable_taxonomies, 'label', 'name' ); + foreach ( $taxonomy_by_names as $tax_name => $tax_label ) { + $terms = get_the_terms( $post, $tax_name ); + if ( is_array( $terms ) ) { + $post_terms[ $tax_label ] = array_map( + function ( $term ) { + return $term->name; + }, + $terms + ); + } + } + + if ( ! empty( $post_terms ) ) { + $post_terms_str .= "# Taxonomy Terms\n"; + foreach ( $post_terms as $tax_label => $terms ) { + $post_terms_str .= "## {$tax_label}: "; + $post_terms_str .= implode( ', ', $terms ) . "\n"; + } + } + + return $post_terms_str; + } + + /** + * Get te representation of the post meta. 
+ * + * @param \WP_Post $post The post object + * @return string + */ + protected function get_post_meta( $post ): string { + $meta_str = ''; + $meta_to_index = [ + 'footnotes', + 'searchwp_content_pdf_metadata', + ]; + $values = []; + if ( ! empty( $meta_to_index ) ) { + foreach ( $meta_to_index as $meta_field ) { + $values[ $meta_field ] = get_post_meta( $post->ID, $meta_field, true ); + } + } + $values = array_filter( $values ); + + if ( ! empty( $values ) ) { + $meta_str .= "# Metadata\n"; + foreach ( $values as $meta_field => $value ) { + $meta_str .= "## {$meta_field}: {$value}\n"; + } } - return $return; + return $meta_str; } } diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index 2141ffc..cc9daeb 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -155,30 +155,23 @@ public function set_settings_schema() { /** * Add a vector field to the Elasticsearch mapping. * - * @param array $mapping Current mapping. - * @param null|int $dimensions Number of dimensions for the vector field. - * @param bool $quantization Whether to use quantization for the vector field. Default false. + * @param array $mapping Current mapping. + * @param bool $quantization Whether to use quantization for the vector field. Default false. * @return array */ - public function add_vector_mapping_field( array $mapping, $dimensions = null, bool $quantization = true ): array { + public function add_vector_mapping_field( array $mapping, bool $quantization = true ): array { // Don't add the field if it already exists. if ( isset( $mapping['mappings']['properties']['chunks'] ) ) { return $mapping; } - // This needs to match the dimensions your model uses and be between 1 and 4096. - if ( ! 
$dimensions ) { - $dimensions = $this->get_dimensions(); - } - $calc_dimensions = max( 1, min( 4096, $dimensions ) ); - // Add the default vector field mapping. $mapping['mappings']['properties']['chunks'] = [ 'type' => 'nested', 'properties' => [ 'vector' => [ 'type' => 'dense_vector', - 'dims' => (int) $calc_dimensions, + 'dims' => $this->get_dimensions(), ], ], ]; @@ -308,6 +301,7 @@ public function generate_embedding( $text = '' ) { error_log( 'generating embed' ); if ( is_wp_error( $response ) ) { + error_log( print_r( $response, true ) ); return $response; } @@ -447,6 +441,8 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove * @return int */ public function get_dimensions(): int { + $calc_dimensions = max( 1, min( 4096, $this->dimensions ) ); + /** * Filter the dimensions we want for each embedding. * @@ -459,7 +455,7 @@ public function get_dimensions(): int { * @param {int} $dimensions The default dimensions. * @return {int} The dimensions. */ - return apply_filters( 'ep_openai_embeddings_dimensions', $this->dimensions ); + return (int) apply_filters( 'ep_openai_embeddings_dimensions', $calc_dimensions ); } /** From 6b7f17ca27030d0aa04806f9c9bab3c18cae33ce Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Fri, 24 Jan 2025 13:27:21 -0300 Subject: [PATCH 07/17] Use the weighting engine to choose which terms and meta to vectorize --- .../VectorEmbeddings/Indexables/Post.php | 80 ++++++++++++------- 1 file changed, 52 insertions(+), 28 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index b16cc96..f48ae3d 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -111,14 +111,46 @@ public function get_post_chunks( int $post_id ): array { $chunks = $this->feature->chunk_content( $main_content ); - $post_terms_str = $this->get_post_terms( $post 
); - if ( $post_terms_str ) { - $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_terms_str ) ]; + $search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' ); + $weighting = $search_feature->weighting->get_weighting_configuration_with_defaults(); + if ( empty( $weighting[ $post->post_type ] ) ) { + return $chunks; } - $post_meta_str = $this->get_post_meta( $post ); - if ( $post_meta_str ) { - $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_meta_str ) ]; + $post_type_weighting = $weighting[ $post->post_type ]; + + $taxonomies = array_reduce( + array_keys( $post_type_weighting ), + function ( $acc, $field ) use ( $post_type_weighting ) { + if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/terms\.(.*)\.name/', $field, $matches ) ) { + $acc[] = $matches[1]; + } + return $acc; + }, + [] + ); + if ( $taxonomies ) { + $post_terms_str = $this->get_post_terms( $post, $taxonomies ); + if ( $post_terms_str ) { + $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_terms_str ) ]; + } + } + + $meta_fields = array_reduce( + array_keys( $post_type_weighting ), + function ( $acc, $field ) use ( $post_type_weighting ) { + if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/meta\.(.*)\.value/', $field, $matches ) ) { + $acc[] = $matches[1]; + } + return $acc; + }, + [] + ); + if ( $meta_fields ) { + $post_meta_str = $this->get_post_meta( $post, $meta_fields ); + if ( $post_meta_str ) { + $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_meta_str ) ]; + } } return $chunks; @@ -127,19 +159,17 @@ public function get_post_chunks( int $post_id ): array { /** * Get the representation of the post terms. * - * @param \WP_Post $post The post object + * @param \WP_Post $post The post object + * @param array $taxonomies Taxonomies to be added. 
* @return string */ - protected function get_post_terms( $post ): string { - $post_terms_str = ''; - $post_terms = []; - $indexable = \ElasticPress\Indexables::factory()->get( 'post' ); - $indexable_taxonomies = $indexable->get_indexable_post_taxonomies( $post ); - $taxonomy_by_names = wp_list_pluck( $indexable_taxonomies, 'label', 'name' ); - foreach ( $taxonomy_by_names as $tax_name => $tax_label ) { + protected function get_post_terms( $post, $taxonomies ): string { + $post_terms_str = ''; + $post_terms = []; + foreach ( $taxonomies as $tax_name ) { $terms = get_the_terms( $post, $tax_name ); if ( is_array( $terms ) ) { - $post_terms[ $tax_label ] = array_map( + $post_terms[ $tax_name ] = array_map( function ( $term ) { return $term->name; }, @@ -147,7 +177,6 @@ function ( $term ) { ); } } - if ( ! empty( $post_terms ) ) { $post_terms_str .= "# Taxonomy Terms\n"; foreach ( $post_terms as $tax_label => $terms ) { @@ -162,20 +191,15 @@ function ( $term ) { /** * Get te representation of the post meta. * - * @param \WP_Post $post The post object + * @param \WP_Post $post The post object + * @param array $meta_fields List of metafields * @return string */ - protected function get_post_meta( $post ): string { - $meta_str = ''; - $meta_to_index = [ - 'footnotes', - 'searchwp_content_pdf_metadata', - ]; - $values = []; - if ( ! 
empty( $meta_to_index ) ) { - foreach ( $meta_to_index as $meta_field ) { - $values[ $meta_field ] = get_post_meta( $post->ID, $meta_field, true ); - } + protected function get_post_meta( $post, $meta_fields ): string { + $meta_str = ''; + $values = []; + foreach ( $meta_fields as $meta_field ) { + $values[ $meta_field ] = get_post_meta( $post->ID, $meta_field, true ); } $values = array_filter( $values ); From 69049ae5aeb8c30107b6db6b45e256e1ddc16b79 Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 27 Jan 2025 15:27:15 -0300 Subject: [PATCH 08/17] Remove initial support for WP_Term_Query --- .../VectorEmbeddings/Indexables/Term.php | 109 ------------------ .../VectorEmbeddings/VectorEmbeddings.php | 3 - 2 files changed, 112 deletions(-) delete mode 100644 includes/classes/Feature/VectorEmbeddings/Indexables/Term.php diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Term.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Term.php deleted file mode 100644 index 766b1f2..0000000 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Term.php +++ /dev/null @@ -1,109 +0,0 @@ -feature->get_setting( 'ep_external_embedding' ) ) { - add_filter( 'ep_term_sync_args', [ $this, 'add_vector_field_to_term_sync' ], 10, 2 ); - } - } - - /** - * Add our vector field mapping to the Elasticsearch term index. - * - * @param array $mapping Current mapping. - * @return array - */ - public function add_term_vector_field_mapping( array $mapping ): array { - return $this->feature->add_vector_mapping_field( $mapping ); - } - - /** - * Exclude our vector meta from being synced. - * - * @param array $excluded_keys Current excluded keys. - * @return array - */ - public function exclude_vector_meta( array $excluded_keys ): array { - $excluded_keys[] = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); - return $excluded_keys; - } - - /** - * Add the embedding data to the term vector sync args. - * - * @param array $args Current sync args. 
- * @param int $term_id Term ID being synced. - * @return array - */ - public function add_vector_field_to_term_sync( array $args, int $term_id ): array { - $meta_field = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); - // Try to use the stored embeddings first. - $embeddings = get_term_meta( $term_id, $meta_field, true ); - - // If they don't exist, make API requests to generate them. - if ( ! $embeddings ) { - $term = get_term( $term_id ); - - // Build up the content we want to generate embeddings for. - $content = $term->name . ' ' . $term->slug . ' ' . $term->description; - - $embeddings = []; - $content_chunks = $this->feature->chunk_content( $content ); - - // Get the embeddings for each chunk. - if ( ! empty( $content_chunks ) ) { - foreach ( $content_chunks as $chunk ) { - $embedding = $this->feature->get_embedding( $chunk ); - - if ( $embedding && ! is_wp_error( $embedding ) ) { - $embeddings[] = array_map( 'floatval', $embedding ); - } - } - } - - // Store the embeddings for future use. - if ( ! empty( $embeddings ) ) { - update_term_meta( $term_id, $meta_field, $embeddings ); - } - } - - // If we still don't have embeddings, return early. - if ( ! $embeddings || empty( $embeddings ) ) { - return $args; - } - - // Add the embeddings data to the sync args. 
- $args['chunks'] = []; - - foreach ( $embeddings as $embedding ) { - $args['chunks'][] = [ - 'vector' => array_map( 'floatval', $embedding ), - ]; - } - - return $args; - } -} diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index cc9daeb..58051b6 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -80,9 +80,6 @@ public function __construct() { public function setup() { $post_indexable = new Indexables\Post( $this ); $post_indexable->setup(); - - $term_indexable = new Indexables\Term( $this ); - $term_indexable->setup(); } /** From cc83899a516922c1ad407c628f03c22f9d6f2b3d Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 27 Jan 2025 16:30:13 -0300 Subject: [PATCH 09/17] Rearrange a few things and add new filters --- .../Feature/VectorEmbeddings/Indexable.php | 59 ++++++- .../VectorEmbeddings/Indexables/Post.php | 158 ++++++++++++++---- .../VectorEmbeddings/VectorEmbeddings.php | 104 ++++-------- 3 files changed, 212 insertions(+), 109 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php index e96c184..304cb0a 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexable.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -11,6 +11,8 @@ namespace ElasticPressLabs\Feature\VectorEmbeddings; +use ElasticPress\Elasticsearch; + /** * Vector Embeddings Indexable abstract class */ @@ -31,6 +33,61 @@ public function __construct( VectorEmbeddings $feature ) { $this->feature = $feature; } + /** + * Add a vector field to the Elasticsearch mapping. + * + * @param array $mapping Current mapping. + * @param bool $quantization Whether to use quantization for the vector field. Default true.
+ * @return array + */ + public function add_vector_mapping_field( array $mapping, bool $quantization = true ): array { + $es_version = Elasticsearch::factory()->get_elasticsearch_version(); + + // Don't add the field if it already exists. + if ( isset( $mapping['mappings']['properties']['chunks'] ) ) { + return $mapping; + } + + // Add the default vector field mapping. + $mapping['mappings']['properties']['chunks'] = [ + 'type' => 'nested', + 'properties' => [ + 'vector' => [ + 'type' => 'dense_vector', + 'dims' => $this->feature->get_dimensions(), + ], + ], + ]; + + // Add extra vector fields for newer versions of Elasticsearch. + if ( version_compare( $es_version, '8.0', '>=' ) ) { + // The index (true or false, default true) and similarity (l2_norm, dot_product or cosine) fields + // were added in 8.0. The similarity field must be set if index is true. + $mapping['mappings']['properties']['chunks']['properties']['vector'] = array_merge( + $mapping['mappings']['properties']['chunks']['properties']['vector'], + [ + 'index' => true, + 'similarity' => 'cosine', + ] + ); + + // The element_type field was added in 8.6. This can be either float (default) or byte. + if ( version_compare( $es_version, '8.6', '>=' ) ) { + $mapping['mappings']['properties']['chunks']['properties']['vector']['element_type'] = 'float'; + } + + // The int8_hnsw type was added in 8.12. + if ( $quantization && version_compare( $es_version, '8.12', '>=' ) ) { + // This is supposed to result in better performance but slightly less accurate results. + // See https://www.elastic.co/guide/en/elasticsearch/reference/8.13/knn-search.html#knn-search-quantized-example. + // Can test with this on and off and compare results to see what works best. + $mapping['mappings']['properties']['chunks']['properties']['vector']['index_options']['type'] = 'int8_hnsw'; + } + } + + return $mapping; + } + /** * Add the embedding data to the post vector sync args. 
* @@ -38,7 +95,7 @@ public function __construct( VectorEmbeddings $feature ) { * @param array $embeddings The embeddings to add to the sync args * @return array */ - public function add_chuncks_field_value( array $args, array $embeddings ): array { + public function add_chunks_field_value( array $args, array $embeddings ): array { // If we still don't have embeddings, return early. if ( empty( $embeddings ) ) { return $args; diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index f48ae3d..ecc8844 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -37,7 +37,7 @@ public function setup() { * @return array */ public function add_post_vector_field_mapping( array $mapping ): array { - return $this->feature->add_vector_mapping_field( $mapping ); + return $this->add_vector_mapping_field( $mapping ); } /** @@ -66,7 +66,11 @@ public function add_vector_field_to_post_sync( array $args, int $post_id ): arra $post_chunks = $this->get_post_chunks( $post_id ); $embeddings = $this->feature->get_embedding( $post_id, 'post', $post_chunks ); - return $this->add_chuncks_field_value( $args, $embeddings ); + if ( ! is_array( $embeddings ) ) { + return $args; + } + + return $this->add_chunks_field_value( $args, $embeddings ); } /** @@ -109,26 +113,21 @@ public function get_post_chunks( int $post_id ): array { $main_content .= "# Content\n{$content}\n\n"; } - $chunks = $this->feature->chunk_content( $main_content ); + /** + * Filter the main content of a post before being split into chunks. + * + * @hook ep_openai_embeddings_post_main_content + * @since 2.4.0 + * + * @param {string} $main_content Title, excerpt, and content of a post. + * @param {\WP_Post} $post The post being processed. + * @return {string} The final main content representation. 
+ */ + $main_content = apply_filters( 'ep_openai_embeddings_post_main_content', $main_content, $post ); - $search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' ); - $weighting = $search_feature->weighting->get_weighting_configuration_with_defaults(); - if ( empty( $weighting[ $post->post_type ] ) ) { - return $chunks; - } - - $post_type_weighting = $weighting[ $post->post_type ]; + $chunks = $this->feature->chunk_content( $main_content ); - $taxonomies = array_reduce( - array_keys( $post_type_weighting ), - function ( $acc, $field ) use ( $post_type_weighting ) { - if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/terms\.(.*)\.name/', $field, $matches ) ) { - $acc[] = $matches[1]; - } - return $acc; - }, - [] - ); + $taxonomies = $this->get_embeddable_taxonomies( $post_id, $post->post_type ); if ( $taxonomies ) { $post_terms_str = $this->get_post_terms( $post, $taxonomies ); if ( $post_terms_str ) { @@ -136,24 +135,57 @@ function ( $acc, $field ) use ( $post_type_weighting ) { } } - $meta_fields = array_reduce( + $meta_fields = $this->get_embeddable_meta( $post_id, $post->post_type ); + if ( $meta_fields ) { + $post_meta_str = $this->get_post_meta( $post, $meta_fields ); + if ( $post_meta_str ) { + $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_meta_str ) ]; + } + } + + return $chunks; + } + + /** + * Return the list of taxonomies that should be included in the post representation. + * + * @param integer $post_id The post ID. + * @param string $post_type The post type. + * @return array + */ + protected function get_embeddable_taxonomies( int $post_id, string $post_type ): array { + $search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' ); + $weighting = $search_feature->weighting->get_weighting_configuration_with_defaults(); + if ( empty( $weighting[ $post_type ] ) ) { + /** + * Filter the list of taxonomies which terms should be included in the post representation. 
+ * + * @hook ep_openai_embeddings_post_embeddable_taxonomies + * @since 2.4.0 + * + * @param {array} $embeddable_taxonomies Array of taxonomy names. + * @param {int} $post_id The post ID. + * @param {string} $post_type The post type. + * @return {array} The list of taxonomy names. + */ + return apply_filters( 'ep_openai_embeddings_post_embeddable_taxonomies', [], $post_id, $post_type ); + } + + $post_type_weighting = $weighting[ $post_type ]; + + $taxonomies = array_reduce( array_keys( $post_type_weighting ), function ( $acc, $field ) use ( $post_type_weighting ) { - if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/meta\.(.*)\.value/', $field, $matches ) ) { + if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/terms\.(.*)\.name/', $field, $matches ) ) { $acc[] = $matches[1]; } return $acc; }, [] ); - if ( $meta_fields ) { - $post_meta_str = $this->get_post_meta( $post, $meta_fields ); - if ( $post_meta_str ) { - $chunks = [ ...$chunks, ...$this->feature->chunk_content( $post_meta_str ) ]; - } - } - return $chunks; + // This filter is documented above. + return apply_filters( 'ep_openai_embeddings_post_embeddable_taxonomies', $taxonomies, $post_id, $post_type ); } /** @@ -185,11 +217,63 @@ function ( $term ) { } } - return $post_terms_str; + /** + * Filter the string that represents the list of terms associated with this post. + * + * @hook ep_openai_embeddings_post_terms_str + * @since 2.4.0 + * + * @param {string} $post_terms_str String with post terms. + * @param {WP_Post} $post The post. + * @return {string} The string with post terms. + */ + return apply_filters( 'ep_openai_embeddings_post_terms_str', $post_terms_str, $post ); + } + + /** + * Return the list of metafields that should be included in the post representation. + * + * @param integer $post_id The post ID. + * @param string $post_type The post type. 
+ * @return array + */ + protected function get_embeddable_meta( int $post_id, string $post_type ): array { + $search_feature = \ElasticPress\Features::factory()->get_registered_feature( 'search' ); + $weighting = $search_feature->weighting->get_weighting_configuration_with_defaults(); + if ( empty( $weighting[ $post_type ] ) ) { + /** + * Filter the list of metafields which values should be included in the post representation. + * + * @hook ep_openai_embeddings_post_embeddable_meta + * @since 2.4.0 + * + * @param {array} $embeddable_meta Array of meta keys. + * @param {int} $post_id The post ID. + * @param {string} $post_type The post type. + * @return {array} The list of meta keys. + */ + return apply_filters( 'ep_openai_embeddings_post_embeddable_meta', [], $post_id, $post_type ); + } + + $post_type_weighting = $weighting[ $post_type ]; + + $meta_fields = array_reduce( + array_keys( $post_type_weighting ), + function ( $acc, $field ) use ( $post_type_weighting ) { + if ( $post_type_weighting[ $field ]['enabled'] && preg_match( '/meta\.(.*)\.value/', $field, $matches ) ) { + $acc[] = $matches[1]; + } + return $acc; + }, + [] + ); + + // This filter is documented above. + return apply_filters( 'ep_openai_embeddings_post_embeddable_meta', $meta_fields, $post_id, $post_type ); } /** - * Get te representation of the post meta. + * Get the representation of the post meta. * * @param \WP_Post $post The post object * @param array $meta_fields List of metafields @@ -210,6 +294,16 @@ protected function get_post_meta( $post, $meta_fields ): string { } } - return $meta_str; + /** + * Filter the string that represents the meta fields associated with this post. + * + * @hook ep_openai_embeddings_post_meta_str + * @since 2.4.0 + * + * @param {string} $post_terms_str String with post terms. + * @param {WP_Post} $post The post. + * @return {string} The string with post terms. 
+ */ + return apply_filters( 'ep_openai_embeddings_post_meta_str', $meta_str, $post ); } } diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index 58051b6..647e90e 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -25,13 +25,6 @@ * Vector Embeddings feature */ class VectorEmbeddings extends Feature { - /** - * Elasticsearch version. - * - * @var string $es_version - */ - protected $es_version; - /** * Number of dimensions for the embeddings. * @@ -67,8 +60,6 @@ public function __construct() { 'elasticpress-labs' ); - $this->es_version = Elasticsearch::factory()->get_elasticsearch_version(); - parent::__construct(); } @@ -91,7 +82,7 @@ public function requirements_status() { $status = new \ElasticPress\FeatureRequirementsStatus( 1 ); // Vector support was added in Elasticsearch 7.0. - if ( version_compare( $this->es_version, '7.0', '<=' ) ) { + if ( version_compare( Elasticsearch::factory()->get_elasticsearch_version(), '7.0', '<=' ) ) { $status->code = 2; $status->message = esc_html__( 'You need to have Elasticsearch with version >7.0.', 'elasticpress-labs' ); } @@ -150,79 +141,20 @@ public function set_settings_schema() { } /** - * Add a vector field to the Elasticsearch mapping. - * - * @param array $mapping Current mapping. - * @param bool $quantization Whether to use quantization for the vector field. Default false. - * @return array - */ - public function add_vector_mapping_field( array $mapping, bool $quantization = true ): array { - // Don't add the field if it already exists. - if ( isset( $mapping['mappings']['properties']['chunks'] ) ) { - return $mapping; - } - - // Add the default vector field mapping. 
- $mapping['mappings']['properties']['chunks'] = [ - 'type' => 'nested', - 'properties' => [ - 'vector' => [ - 'type' => 'dense_vector', - 'dims' => $this->get_dimensions(), - ], - ], - ]; - - // Add extra vector fields for newer versions of Elasticsearch. - if ( version_compare( $this->es_version, '8.0', '>=' ) ) { - // The index (true or false, default true) and similarity (l2_norm, dot_product or cosine) fields - // were added in 8.0. The similarity field must be set if index is true. - $mapping['mappings']['properties']['chunks']['properties']['vector'] = array_merge( - $mapping['mappings']['properties']['chunks']['properties']['vector'], - [ - 'index' => true, - 'similarity' => 'cosine', - ] - ); - - // The element_type field was added in 8.6. This can be either float (default) or byte. - if ( version_compare( $this->es_version, '8.6', '>=' ) ) { - $mapping['mappings']['properties']['chunks']['properties']['vector']['element_type'] = 'float'; - } - - // The int8_hnsw type was added in 8.12. - if ( $quantization && version_compare( $this->es_version, '8.12', '>=' ) ) { - // This is supposed to result in better performance but slightly less accurate results. - // See https://www.elastic.co/guide/en/elasticsearch/reference/8.13/knn-search.html#knn-search-quantized-example. - // Can test with this on and off and compare results to see what works best. - $mapping['mappings']['properties']['chunks']['properties']['vector']['index_options']['type'] = 'int8_hnsw'; - } - } - - return $mapping; - } - - /** - * Get an embedding from a given text. + * Get an embedding from a given string or array of strings. * * @param int $object_id The Object ID. * @param string $object_type The Object type. - * @param string|array $text Text or array of strings to get the embedding for. - * @param string $return_type Return type ('array' or 'raw'). Default 'array'. + * @param string|array $text String or array of strings to get the embedding for.
* @return array|null|WP_Error */ - public function get_embedding( int $object_id, string $object_type, $text, string $return_type = 'array' ) { + public function get_embedding( int $object_id, string $object_type, $text ) { // Generate the embedding. if ( defined( 'WP_CLI' ) && WP_CLI ) { \WP_CLI::line( "Generating embedding for {$object_type} ID: {$object_id}" ); } - $embedding = $this->generate_embedding( $text ); - if ( is_wp_error( $embedding ) ) { - return 'raw' === $return_type ? $embedding : null; - } - - return $embedding; + return $this->generate_embedding( $text ); } /** @@ -295,10 +227,19 @@ public function generate_embedding( $text = '' ) { ) ); - error_log( 'generating embed' ); + /** + * Filter the response of the request. + * + * @hook ep_openai_embeddings_request_response + * @since 2.4.0 + * + * @param {array|WP_Error} $response The request response. + * @param {array|string} $text The text that was sent to be processed. + * @return {array|WP_Error} The request response. + */ + $response = apply_filters( 'ep_openai_embeddings_request_response', $response, $text ); if ( is_wp_error( $response ) ) { - error_log( print_r( $response, true ) ); return $response; } @@ -417,7 +358,7 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove // Iterate through & chunk data with an overlap. for ( $i = 0; $i < $text_count; $i += $chunk_size ) { // Join a set of words into a string. - $chunk = 'search_document: ' . implode( + $chunk = implode( ' ', array_slice( $words, @@ -426,6 +367,17 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove ) ); + /** + * Filter a chunk of text. + * + * @hook ep_openai_embeddings_chunk + * @since 2.4.0 + * + * @param {string} $chunk The chunk being processed. + * @return {string} The modified chunk. 
+ */ + $chunk = apply_filters( 'ep_openai_embeddings_chunk', $chunk ); + array_push( $chunks, $chunk ); } From ab4973742accb6a17f28e5a969bd70a8945ef5b5 Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Tue, 28 Jan 2025 11:39:50 -0300 Subject: [PATCH 10/17] Add new ep_openai_embeddings_should_add_vector_field_to_post filter --- .../Feature/VectorEmbeddings/Indexables/Post.php | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index ecc8844..55bb326 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -81,7 +81,18 @@ public function add_vector_field_to_post_sync( array $args, int $post_id ): arra */ public function should_add_vector_field_to_post( int $post_id ): bool { $post = get_post( $post_id ); - return ! empty( $post ); + + /** + * Filter whether the vector field should or not be added to the post. + * + * @hook ep_embeddings_should_add_vector_field_to_post + * @since 2.4.0 + * + * @param {bool} $should_add Whether the vector field should or not be added to the post. + * @param {int} $post_id The post ID. + * @return {bool} The new $should_add value. + */ + return apply_filters( 'ep_openai_embeddings_should_add_vector_field_to_post', ! 
empty( $post ), $post_id ); } /** From 7525dba4c21dca06b52896c95365ff05f30dba1f Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Tue, 28 Jan 2025 11:49:24 -0300 Subject: [PATCH 11/17] use the ep_embeddings_ prefix consistently --- .../VectorEmbeddings/Indexables/Post.php | 42 +++++---------- .../VectorEmbeddings/VectorEmbeddings.php | 51 ++++++++----------- 2 files changed, 36 insertions(+), 57 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index 55bb326..4e26175 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -21,11 +21,8 @@ public function setup() { // Alter post and term mapping to store our vector embeddings add_filter( 'ep_post_mapping', [ $this, 'add_post_vector_field_mapping' ] ); - // Exclude designated meta field holding the vector embeddings from search - add_filter( 'ep_prepare_meta_excluded_public_keys', [ $this, 'exclude_vector_meta' ] ); - // Only trigger embeddings when external embeddings are turned off - if ( ! $this->feature->get_setting( 'ep_external_embedding' ) ) { + if ( ! $this->feature->get_setting( 'ep_embeddings_external_embedding' ) ) { add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'add_vector_field_to_post_sync' ], 10, 2 ); } } @@ -40,17 +37,6 @@ public function add_post_vector_field_mapping( array $mapping ): array { return $this->add_vector_mapping_field( $mapping ); } - /** - * Exclude our vector meta from being synced. - * - * @param array $excluded_keys Current excluded keys. - * @return array - */ - public function exclude_vector_meta( array $excluded_keys ): array { - $excluded_keys[] = $this->feature->get_setting( 'ep_vector_embeddings_meta_field' ); - return $excluded_keys; - } - /** * Add the embedding data to the post vector sync args. 
* @@ -92,7 +78,7 @@ public function should_add_vector_field_to_post( int $post_id ): bool { * @param {int} $post_id The post ID. * @return {bool} The new $should_add value. */ - return apply_filters( 'ep_openai_embeddings_should_add_vector_field_to_post', ! empty( $post ), $post_id ); + return apply_filters( 'ep_embeddings_should_add_vector_field_to_post', ! empty( $post ), $post_id ); } /** @@ -127,14 +113,14 @@ public function get_post_chunks( int $post_id ): array { /** * Filter the main content of a post before being split into chunks. * - * @hook ep_openai_embeddings_post_main_content + * @hook ep_embeddings_post_main_content * @since 2.4.0 * * @param {string} $main_content Title, excerpt, and content of a post. * @param {\WP_Post} $post The post being processed. * @return {string} The final main content representation. */ - $main_content = apply_filters( 'ep_openai_embeddings_post_main_content', $main_content, $post ); + $main_content = apply_filters( 'ep_embeddings_post_main_content', $main_content, $post ); $chunks = $this->feature->chunk_content( $main_content ); @@ -171,7 +157,7 @@ protected function get_embeddable_taxonomies( int $post_id, string $post_type ): /** * Filter the list of taxonomies which terms should be included in the post representation. * - * @hook ep_openai_embeddings_post_embeddable_taxonomies + * @hook ep_embeddings_post_embeddable_taxonomies * @since 2.4.0 * * @param {array} $embeddable_taxonomies Array of taxonomy names. @@ -179,7 +165,7 @@ protected function get_embeddable_taxonomies( int $post_id, string $post_type ): * @param {string} $post_type The post type. * @return {array} The list of taxonomy names. 
*/ - return apply_filters( 'ep_openai_embeddings_post_embeddable_taxonomies', [], $post_id, $post_type ); + return apply_filters( 'ep_embeddings_post_embeddable_taxonomies', [], $post_id, $post_type ); } $post_type_weighting = $weighting[ $post_type ]; @@ -196,7 +182,7 @@ function ( $acc, $field ) use ( $post_type_weighting ) { ); // This filter is documented above. - return apply_filters( 'ep_openai_embeddings_post_embeddable_taxonomies', $taxonomies, $post_id, $post_type ); + return apply_filters( 'ep_embeddings_post_embeddable_taxonomies', $taxonomies, $post_id, $post_type ); } /** @@ -231,14 +217,14 @@ function ( $term ) { /** * Filter the string that represents the list of terms associated with this post. * - * @hook ep_openai_embeddings_post_terms_str + * @hook ep_embeddings_post_terms_str * @since 2.4.0 * * @param {string} $post_terms_str String with post terms. * @param {WP_Post} $post The post. * @return {string} The string with post terms. */ - return apply_filters( 'ep_openai_embeddings_post_terms_str', $post_terms_str, $post ); + return apply_filters( 'ep_embeddings_post_terms_str', $post_terms_str, $post ); } /** @@ -255,7 +241,7 @@ protected function get_embeddable_meta( int $post_id, string $post_type ): array /** * Filter the list of metafields which values should be included in the post representation. * - * @hook ep_openai_embeddings_post_embeddable_meta + * @hook ep_embeddings_post_embeddable_meta * @since 2.4.0 * * @param {array} $embeddable_meta Array of meta keys. @@ -263,7 +249,7 @@ protected function get_embeddable_meta( int $post_id, string $post_type ): array * @param {string} $post_type The post type. * @return {array} The list of meta keys. 
*/ - return apply_filters( 'ep_openai_embeddings_post_embeddable_meta', [], $post_id, $post_type ); + return apply_filters( 'ep_embeddings_post_embeddable_meta', [], $post_id, $post_type ); } $post_type_weighting = $weighting[ $post_type ]; @@ -280,7 +266,7 @@ function ( $acc, $field ) use ( $post_type_weighting ) { ); // This filter is documented above. - return apply_filters( 'ep_openai_embeddings_post_embeddable_meta', $meta_fields, $post_id, $post_type ); + return apply_filters( 'ep_embeddings_post_embeddable_meta', $meta_fields, $post_id, $post_type ); } /** @@ -308,13 +294,13 @@ protected function get_post_meta( $post, $meta_fields ): string { /** * Filter the string that represents the meta fields associated with this post. * - * @hook ep_openai_embeddings_post_meta_str + * @hook ep_embeddings_post_meta_str * @since 2.4.0 * * @param {string} $post_terms_str String with post terms. * @param {WP_Post} $post The post. * @return {string} The string with post terms. */ - return apply_filters( 'ep_openai_embeddings_post_meta_str', $meta_str, $post ); + return apply_filters( 'ep_embeddings_post_meta_str', $meta_str, $post ); } } diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index 647e90e..7a1bd64 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -38,11 +38,10 @@ class VectorEmbeddings extends Feature { * @var array $default_settings. 
*/ public $default_settings = [ - 'ep_openai_api_key' => '', - 'ep_openai_embeddings_api_url' => 'https://api.openai.com/v1/embeddings', - 'ep_openai_embedding_model' => 'text-embedding-3-small', - 'ep_vector_embeddings_meta_field' => 'vector_embeddings', - 'ep_external_embedding' => '0', + 'ep_embeddings_api_key' => '', + 'ep_embeddings_api_url' => 'https://api.openai.com/v1/embeddings', + 'ep_embeddings_embedding_model' => 'text-embedding-3-small', + 'ep_embeddings_external_embedding' => '0', ]; /** @@ -96,7 +95,7 @@ public function requirements_status() { public function set_settings_schema() { $this->settings_schema = [ [ - 'key' => 'ep_openai_api_key', + 'key' => 'ep_embeddings_api_key', 'label' => __( 'OpenAI API Key', 'elasticpress-labs' ), 'help' => sprintf( wp_kses( @@ -115,24 +114,18 @@ public function set_settings_schema() { ], [ 'help' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), - 'key' => 'ep_openai_embeddings_api_url', + 'key' => 'ep_embeddings_api_url', 'label' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), 'type' => 'text', ], [ 'help' => __( 'OpenAI Embedding model', 'elasticpress-labs' ), - 'key' => 'ep_openai_embedding_model', + 'key' => 'ep_embeddings_embedding_model', 'label' => __( 'The name of the embedding model to use', 'elasticpress-labs' ), 'type' => 'text', ], [ - 'help' => __( 'Specify the postmeta field name that will hold vector embeddings and will be added as dense vector in Elasticsearch mapping.', 'elasticpress-labs' ), - 'key' => 'ep_vector_embeddings_meta_field', - 'label' => __( 'Meta field holding the vector_embeddings', 'elasticpress-labs' ), - 'type' => 'text', - ], - [ - 'key' => 'ep_external_embedding', + 'key' => 'ep_embeddings_external_embedding', 'help' => __( 'Enable this if an external process is providing the vector_embeddings meta field provided above with content. 
This will disable ElasticPress\'s control over embedding generation', 'elasticpress-labs' ), 'label' => __( 'External embedding processing', 'elasticpress-labs' ), 'type' => 'checkbox', @@ -167,19 +160,19 @@ public function generate_embedding( $text = '' ) { /** * Filter the URL for the post request. * - * @hook ep_openai_embeddings_api_url + * @hook ep_embeddings_api_url * @since 2.4.0 * * @param {string} $url The URL for the request. * * @return {string} The URL for the request. */ - $url = apply_filters( 'ep_openai_embeddings_api_url', $this->get_setting( 'ep_openai_embeddings_api_url' ) ); + $url = apply_filters( 'ep_embeddings_api_url', $this->get_setting( 'ep_embeddings_api_url' ) ); /** * Filter the request body before sending to OpenAI. * - * @hook ep_openai_embeddings_request_body + * @hook ep_embeddings_request_body * @since 2.4.0 * * @param {array} $body Request body that will be sent to OpenAI. @@ -188,9 +181,9 @@ public function generate_embedding( $text = '' ) { * @return {array} Request body. */ $body = apply_filters( - 'ep_openai_embeddings_request_body', + 'ep_embeddings_request_body', [ - 'model' => $this->get_setting( 'ep_openai_embedding_model' ), + 'model' => $this->get_setting( 'ep_embeddings_embedding_model' ), 'input' => (array) $text, 'dimensions' => $this->get_dimensions(), ], @@ -200,7 +193,7 @@ public function generate_embedding( $text = '' ) { /** * Filter the options for the post request. * - * @hook ep_openai_embeddings_options + * @hook ep_embeddings_options * @since 2.4.0 * * @param {array} $options The options for the request. @@ -209,7 +202,7 @@ public function generate_embedding( $text = '' ) { * @return {array} The options for the request. 
*/ $options = apply_filters( - 'ep_openai_embeddings_options', + 'ep_embeddings_options', [ 'body' => wp_json_encode( $body ), 'timeout' => 60, // phpcs:ignore WordPressVIPMinimum.Performance.RemoteRequestTimeout.timeout_timeout @@ -230,14 +223,14 @@ public function generate_embedding( $text = '' ) { /** * Filter the response of the request. * - * @hook ep_openai_embeddings_request_response + * @hook ep_embeddings_request_response * @since 2.4.0 * * @param {array|WP_Error} $response The request response. * @param {array|string} $text The text that was sent to be processed. * @return {array|WP_Error} The request response. */ - $response = apply_filters( 'ep_openai_embeddings_request_response', $response, $text ); + $response = apply_filters( 'ep_embeddings_request_response', $response, $text ); if ( is_wp_error( $response ) ) { return $response; @@ -370,13 +363,13 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove /** * Filter a chunk of text. * - * @hook ep_openai_embeddings_chunk + * @hook ep_embeddings_chunk * @since 2.4.0 * * @param {string} $chunk The chunk being processed. * @return {string} The modified chunk. */ - $chunk = apply_filters( 'ep_openai_embeddings_chunk', $chunk ); + $chunk = apply_filters( 'ep_embeddings_chunk', $chunk ); array_push( $chunks, $chunk ); } @@ -398,13 +391,13 @@ public function get_dimensions(): int { * Useful if you want to increase or decrease the length * of each embedding. * - * @hook ep_openai_embeddings_dimensions + * @hook ep_embeddings_dimensions * @since 2.4.0 * * @param {int} $dimensions The default dimensions. * @return {int} The dimensions. */ - return (int) apply_filters( 'ep_openai_embeddings_dimensions', $calc_dimensions ); + return (int) apply_filters( 'ep_embeddings_dimensions', $calc_dimensions ); } /** @@ -432,6 +425,6 @@ public function add_headers( array &$options = [] ) { * @return string */ public function get_auth_header() { - return 'Bearer ' . 
$this->get_setting( 'ep_openai_api_key' ); + return 'Bearer ' . $this->get_setting( 'ep_embeddings_api_key' ); } } From caff18f9223749935cd0cd88ccba7d953662261c Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Thu, 30 Jan 2025 15:13:06 -0300 Subject: [PATCH 12/17] Indexables array + fix content generation --- .../VectorEmbeddings/Indexables/Post.php | 2 +- .../VectorEmbeddings/VectorEmbeddings.php | 20 +++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index 4e26175..34d8734 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -105,7 +105,7 @@ public function get_post_chunks( int $post_id ): array { $main_content .= "# Summary\n{$excerpt}\n\n"; } - $content = get_the_content( $post_id ); + $content = get_the_content( null, false, $post_id ); if ( $content ) { $main_content .= "# Content\n{$content}\n\n"; } diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index 7a1bd64..23e0431 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -32,6 +32,13 @@ class VectorEmbeddings extends Feature { */ protected $dimensions = 512; + /** + * Array of VectorEmbeddings\Indexable objects + * + * @var array + */ + protected $indexables = []; + /** * Default settings * @@ -68,8 +75,8 @@ public function __construct() { * @return void */ public function setup() { - $post_indexable = new Indexables\Post( $this ); - $post_indexable->setup(); + $this->indexables['post'] = new Indexables\Post( $this ); + $this->indexables['post']->setup(); } /** @@ -400,6 +407,15 @@ public function get_dimensions(): int { return (int) apply_filters( 'ep_embeddings_dimensions', 
$calc_dimensions ); } + /** + * Return the array of indexables. + * + * @return array + */ + public function get_indexables() { + return $this->indexables; + } + /** * Add the headers. * From b18c8982ca51d24a8a5501bfabcaaff9a334901d Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 3 Feb 2025 11:49:23 -0300 Subject: [PATCH 13/17] Expose dimensions as a field --- .../VectorEmbeddings/VectorEmbeddings.php | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index 23e0431..7276e17 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -25,13 +25,6 @@ * Vector Embeddings feature */ class VectorEmbeddings extends Feature { - /** - * Number of dimensions for the embeddings. - * - * @var int - */ - protected $dimensions = 512; - /** * Array of VectorEmbeddings\Indexable objects * @@ -48,6 +41,7 @@ class VectorEmbeddings extends Feature { 'ep_embeddings_api_key' => '', 'ep_embeddings_api_url' => 'https://api.openai.com/v1/embeddings', 'ep_embeddings_embedding_model' => 'text-embedding-3-small', + 'ep_embeddings_dimensions' => 512, 'ep_embeddings_external_embedding' => '0', ]; @@ -120,16 +114,25 @@ public function set_settings_schema() { 'type' => 'text', ], [ - 'help' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), - 'key' => 'ep_embeddings_api_url', - 'label' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), - 'type' => 'text', + 'help' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), + 'key' => 'ep_embeddings_api_url', + 'label' => __( 'OpenAI Embeddings API Url', 'elasticpress-labs' ), + 'type' => 'text', + 'default' => $this->default_settings['ep_embeddings_api_url'], ], [ - 'help' => __( 'OpenAI Embedding model', 'elasticpress-labs' ), - 'key' => 'ep_embeddings_embedding_model', - 
'label' => __( 'The name of the embedding model to use', 'elasticpress-labs' ), - 'type' => 'text', + 'help' => __( 'OpenAI Embedding model', 'elasticpress-labs' ), + 'key' => 'ep_embeddings_embedding_model', + 'label' => __( 'The name of the embedding model to use', 'elasticpress-labs' ), + 'type' => 'text', + 'default' => $this->default_settings['ep_embeddings_embedding_model'], + ], + [ + 'help' => __( 'Embedding model dimensions', 'elasticpress-labs' ), + 'key' => 'ep_embeddings_dimensions', + 'label' => __( 'The number of dimensions supported by your embedding model', 'elasticpress-labs' ), + 'type' => 'number', + 'default' => $this->default_settings['ep_embeddings_dimensions'], ], [ 'key' => 'ep_embeddings_external_embedding', @@ -390,7 +393,7 @@ public function chunk_content( string $content = '', int $chunk_size = 150, $ove * @return int */ public function get_dimensions(): int { - $calc_dimensions = max( 1, min( 4096, $this->dimensions ) ); + $calc_dimensions = max( 1, min( 4096, $this->get_setting( 'ep_embeddings_dimensions' ) ) ); /** * Filter the dimensions we want for each embedding. From 33b0ea2fb46bf12bb336827133dfdc2d9bacc914 Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 17 Feb 2025 13:28:20 -0300 Subject: [PATCH 14/17] Draft sending chunks to EP.io so it is enqueued for processing --- .../VectorEmbeddings/Indexables/Post.php | 25 ++++++++++++++++++- .../VectorEmbeddings/VectorEmbeddings.php | 11 ++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index 34d8734..dc87a96 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -23,7 +23,11 @@ public function setup() { // Only trigger embeddings when external embeddings are turned off if ( ! 
$this->feature->get_setting( 'ep_embeddings_external_embedding' ) ) { - add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'add_vector_field_to_post_sync' ], 10, 2 ); + if ( $this->feature->get_setting( 'ep_embeddings_use_epio' ) ) { + add_filter( 'ep_bulk_index_action_args', [ $this, 'add_chunks_to_bulk_index_action_args' ], 10, 2 ); + } else { + add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'add_vector_field_to_post_sync' ], 10, 2 ); + } } } @@ -37,6 +41,25 @@ public function add_post_vector_field_mapping( array $mapping ): array { return $this->add_vector_mapping_field( $mapping ); } + /** + * Add the content chunks to the index action args. + * + * This will be picked up by EP.io servers so it is enqueued for processing. + * + * @param array $args Current index action args. + * @param array $post The post being indexed. + * @return array + */ + public function add_chunks_to_bulk_index_action_args( array $args, array $post ) { + if ( ! $this->should_add_vector_field_to_post( $post['ID'] ) ) { + return $args; + } + + $args['epio-content-chunks'] = $this->get_post_chunks( $post['ID'] ); + + return $args; + } + /** * Add the embedding data to the post vector sync args. * diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index 7276e17..9cb50a8 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -15,6 +15,7 @@ use ElasticPress\Feature; use ElasticPress\Elasticsearch; +use ElasticPress\Utils; use WP_Error; if ( ! 
defined( 'ABSPATH' ) ) { @@ -43,6 +44,7 @@ class VectorEmbeddings extends Feature { 'ep_embeddings_embedding_model' => 'text-embedding-3-small', 'ep_embeddings_dimensions' => 512, 'ep_embeddings_external_embedding' => '0', + 'ep_embeddings_use_epio' => '0', ]; /** @@ -141,6 +143,15 @@ public function set_settings_schema() { 'type' => 'checkbox', ], ]; + + if ( Utils\is_epio() ) { + $this->settings_schema[] = [ + 'key' => 'ep_embeddings_use_epio', + 'help' => __( 'Enable this if you want to use ElasticPress.io to vectorize your content.', 'elasticpress-labs' ), + 'label' => __( 'Use EP.io', 'elasticpress-labs' ), + 'type' => 'checkbox', + ]; + } } /** From 869d1f595e686c83e1fc98ed9cdad54000c71fdc Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 24 Feb 2025 12:55:19 -0300 Subject: [PATCH 15/17] Conditionally use a text_chunks field to store the raw text chunks --- .../Feature/VectorEmbeddings/Indexable.php | 4 ++- .../VectorEmbeddings/Indexables/Post.php | 36 +++++++++++++++++-- .../VectorEmbeddings/VectorEmbeddings.php | 10 +++++- 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php index 304cb0a..408e667 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexable.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -44,7 +44,7 @@ public function add_vector_mapping_field( array $mapping, bool $quantization = t $es_version = Elasticsearch::factory()->get_elasticsearch_version(); // Don't add the field if it already exists. 
- if ( isset( $mapping['mappings']['properties']['chunks'] ) ) { + if ( isset( $mapping['mappings']['properties']['chunks'], $mapping['mappings']['properties']['text_chunks'] ) ) { return $mapping; } @@ -59,6 +59,8 @@ public function add_vector_mapping_field( array $mapping, bool $quantization = t ], ]; + $mapping['mappings']['properties']['text_chunks']['type'] = 'text'; + // Add extra vector fields for newer versions of Elasticsearch. if ( version_compare( $es_version, '8.0', '>=' ) ) { // The index (true or false, default true) and similarity (l2_norm, dot_product or cosine) fields diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index dc87a96..47299e4 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -24,7 +24,8 @@ public function setup() { // Only trigger embeddings when external embeddings are turned off if ( ! $this->feature->get_setting( 'ep_embeddings_external_embedding' ) ) { if ( $this->feature->get_setting( 'ep_embeddings_use_epio' ) ) { - add_filter( 'ep_bulk_index_action_args', [ $this, 'add_chunks_to_bulk_index_action_args' ], 10, 2 ); + add_filter( 'ep_bulk_index_action_args', [ $this, 'maybe_add_chunks_to_bulk_index_action_args' ], 10, 2 ); + add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'maybe_add_chunks_to_text_chunks_fields' ], 10, 2 ); } else { add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'add_vector_field_to_post_sync' ], 10, 2 ); } @@ -50,12 +51,41 @@ public function add_post_vector_field_mapping( array $mapping ): array { * @param array $post The post being indexed. * @return array */ - public function add_chunks_to_bulk_index_action_args( array $args, array $post ) { + public function maybe_add_chunks_to_bulk_index_action_args( array $args, array $post ) { if ( ! 
$this->should_add_vector_field_to_post( $post['ID'] ) ) { return $args; } - $args['epio-content-chunks'] = $this->get_post_chunks( $post['ID'] ); + $post_chunks = $this->get_post_chunks( $post['ID'] ); + $post_chunks_size = mb_strlen( wp_json_encode( $post_chunks ), '8bit' ); + if ( $post_chunks_size >= 200 * KB_IN_BYTES ) { + return $args; + } + + $args['epio-content-chunks'] = $post_chunks; + + return $args; + } + + /** + * Add text chunks to their field in the post sync args. + * + * @param array $args Current sync args. + * @param int $post_id Post ID being synced. + * @return array + */ + public function maybe_add_chunks_to_text_chunks_fields( array $args, int $post_id ) { + if ( ! $this->should_add_vector_field_to_post( $post_id ) ) { + return $args; + } + + $post_chunks = $this->get_post_chunks( $post_id ); + $post_chunks_size = mb_strlen( wp_json_encode( $post_chunks ), '8bit' ); + if ( $post_chunks_size < 200 * KB_IN_BYTES ) { + return $args; + } + + $args['text_chunks'] = $post_chunks; return $args; } diff --git a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php index 9cb50a8..c795a1c 100644 --- a/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php +++ b/includes/classes/Feature/VectorEmbeddings/VectorEmbeddings.php @@ -331,7 +331,12 @@ public function normalize_content( string $content = '' ): string { $content = apply_filters( 'the_content', $content ); // Strip shortcodes but keep internal caption text. - $content = preg_replace( '#\[.+\](.+)\[/.+\]#', '$1', $content ); + // Revert it if shortcodes are not balanced and preg_replace errors out. + $pre_content = $content; + $content = preg_replace( '#\[.+\](.+)\[/.+\]#', '$1', $content ); + if ( null === $content ) { + $content = $pre_content; + } // Strip HTML entities. 
$content = preg_replace( '/&#?[a-z0-9]{2,8};/i', '', $content ); @@ -356,6 +361,9 @@ public function normalize_content( string $content = '' ): string { public function chunk_content( string $content = '', int $chunk_size = 150, $overlap_size = 25 ): array { // Normalize our content. $content = $this->normalize_content( $content ); + if ( ! $content ) { + return []; + } // Remove multiple whitespaces. $content = preg_replace( '/[ \t\r\f]+/', ' ', $content ); From 2d563503a6f78ef3b78f3fbfd744c3e53b34a3cd Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 24 Feb 2025 13:48:12 -0300 Subject: [PATCH 16/17] Make sending method filterable --- .../Feature/VectorEmbeddings/Indexable.php | 43 +++++++++++++++++++ .../VectorEmbeddings/Indexables/Post.php | 18 +++----- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php b/includes/classes/Feature/VectorEmbeddings/Indexable.php index 408e667..b416a8d 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexable.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -114,4 +114,47 @@ public function add_chunks_field_value( array $args, array $embeddings ): array return $args; } + + /** + * Determine the way to send the text chunks to the Elasticsearch server. + * + * Depending on the size of the text chunks, it can either be sent using the index action args + * (the JSON object where we determine the index to be used, etc.) or as a regular field in the Elasticsearch. + * To avoid overhead, we prefer to send it as an index action arg, but sometimes it is just too big for it. + * + * @param array $text_chunks The text chunks + * @return string The method. Can be 'index_action_args' or 'es_doc_field'. 
+ */ + protected function get_text_chunks_sending_method( $text_chunks ) { + $post_chunks_size = mb_strlen( wp_json_encode( $text_chunks ), '8bit' ); + + /** + * Filter to determine the threshold size for the text chunks to be sent as an index action arg. + * + * @hook ep_embeddings_sending_method_limit + * @since 2.4.0 + * + * @param {int} $size The size limit in bytes. Defaults to 200kb. + * @return {int} The new $size value. + */ + $size_limit = apply_filters( 'ep_embeddings_sending_method_limit', 200 * KB_IN_BYTES ); + + $method = $post_chunks_size < $size_limit ? 'index_action_args' : 'es_doc_field'; + + /** + * Filter to determine the method to be used to send the text chunks. + * + * Unless you are implementing a custom solution, return should be either 'index_action_args' or 'es_doc_field'. + * + * @hook ep_embeddings_sending_method + * @since 2.4.0 + * + * @param {string} $method The method to be used. + * @param {array} $text_chunks The text chunks being analyzed. + * @param {int} $post_chunks_size The determined size of the text chunks. + * @param {int} $size_limit The size limit in bytes. Defaults to 200kb. + * @return {string} The new $method value. 
+ */ + return apply_filters( 'ep_embeddings_sending_method', $method, $text_chunks, $post_chunks_size, $size_limit ); + } } diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index 47299e4..07fdd19 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -56,14 +56,11 @@ public function maybe_add_chunks_to_bulk_index_action_args( array $args, array $ return $args; } - $post_chunks = $this->get_post_chunks( $post['ID'] ); - $post_chunks_size = mb_strlen( wp_json_encode( $post_chunks ), '8bit' ); - if ( $post_chunks_size >= 200 * KB_IN_BYTES ) { - return $args; + $post_chunks = $this->get_post_chunks( $post['ID'] ); + if ( 'index_action_args' === $this->get_text_chunks_sending_method( $post_chunks ) ) { + $args['epio-content-chunks'] = $post_chunks; } - $args['epio-content-chunks'] = $post_chunks; - return $args; } @@ -79,14 +76,11 @@ public function maybe_add_chunks_to_text_chunks_fields( array $args, int $post_i return $args; } - $post_chunks = $this->get_post_chunks( $post_id ); - $post_chunks_size = mb_strlen( wp_json_encode( $post_chunks ), '8bit' ); - if ( $post_chunks_size < 200 * KB_IN_BYTES ) { - return $args; + $post_chunks = $this->get_post_chunks( $post_id ); + if ( 'es_doc_field' === $this->get_text_chunks_sending_method( $post_chunks ) ) { + $args['text_chunks'] = $post_chunks; } - $args['text_chunks'] = $post_chunks; - return $args; } From c7bbe718133467cf7bc375190a924db15d2986c5 Mon Sep 17 00:00:00 2001 From: Felipe Elia Date: Mon, 3 Mar 2025 11:01:11 -0300 Subject: [PATCH 17/17] New ep_embeddings_control field --- .../Feature/VectorEmbeddings/Indexable.php | 38 +++++++++++----- .../VectorEmbeddings/Indexables/Post.php | 44 ++++++++++++++++++- 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/includes/classes/Feature/VectorEmbeddings/Indexable.php 
b/includes/classes/Feature/VectorEmbeddings/Indexable.php index b416a8d..ffbe620 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexable.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexable.php @@ -44,22 +44,38 @@ public function add_vector_mapping_field( array $mapping, bool $quantization = t $es_version = Elasticsearch::factory()->get_elasticsearch_version(); // Don't add the field if it already exists. - if ( isset( $mapping['mappings']['properties']['chunks'], $mapping['mappings']['properties']['text_chunks'] ) ) { + if ( isset( $mapping['mappings']['properties']['chunks'], $mapping['mappings']['properties']['ep_embeddings_control'] ) ) { return $mapping; } // Add the default vector field mapping. - $mapping['mappings']['properties']['chunks'] = [ - 'type' => 'nested', - 'properties' => [ - 'vector' => [ - 'type' => 'dense_vector', - 'dims' => $this->feature->get_dimensions(), + $mapping['mappings']['properties'] = array_merge( + $mapping['mappings']['properties'], + [ + 'ep_embeddings_control' => [ + 'properties' => [ + 'is_processing' => [ + 'type' => 'boolean', + ], + 'errors' => [ + 'type' => 'text', + ], + 'text_chunks' => [ + 'type' => 'text', + ], + ], ], - ], - ]; - - $mapping['mappings']['properties']['text_chunks']['type'] = 'text'; + 'chunks' => [ + 'type' => 'nested', + 'properties' => [ + 'vector' => [ + 'type' => 'dense_vector', + 'dims' => $this->feature->get_dimensions(), + ], + ], + ], + ] + ); // Add extra vector fields for newer versions of Elasticsearch. 
if ( version_compare( $es_version, '8.0', '>=' ) ) { diff --git a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php index 07fdd19..3d97d1a 100644 --- a/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php +++ b/includes/classes/Feature/VectorEmbeddings/Indexables/Post.php @@ -26,6 +26,7 @@ public function setup() { if ( $this->feature->get_setting( 'ep_embeddings_use_epio' ) ) { add_filter( 'ep_bulk_index_action_args', [ $this, 'maybe_add_chunks_to_bulk_index_action_args' ], 10, 2 ); add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'maybe_add_chunks_to_text_chunks_fields' ], 10, 2 ); + add_filter( 'ep_doc_status', [ $this, 'maybe_set_doc_status' ], 10, 3 ); } else { add_filter( 'ep_post_sync_args_post_prepare_meta', [ $this, 'add_vector_field_to_post_sync' ], 10, 2 ); } @@ -76,14 +77,55 @@ public function maybe_add_chunks_to_text_chunks_fields( array $args, int $post_i return $args; } + $args['ep_embeddings_control'] = [ + 'is_processing' => true, + 'errors' => [], + 'text_chunks' => [], + ]; + $post_chunks = $this->get_post_chunks( $post_id ); if ( 'es_doc_field' === $this->get_text_chunks_sending_method( $post_chunks ) ) { - $args['text_chunks'] = $post_chunks; + $args['ep_embeddings_control']['text_chunks'] = $post_chunks; } return $args; } + /** + * Change the doc status indicator depending on the Vector Embeddings process status + * + * @param array $status The status array containing status, message and explanation + * @param int $post_id The post ID being checked + * @param array $es_doc The Elasticsearch document + * @return array + */ + public function maybe_set_doc_status( array $status, int $post_id, array $es_doc ): array { + if ( ! isset( $es_doc['ep_embeddings_control'] ) ) { + return $status; + } + + if ( ! 
empty( $es_doc['ep_embeddings_control']['is_processing'] ) ) { + $status = [ + 'status' => 'warning', + 'message' => esc_html__( 'Processing vector embeddings', 'elasticpress-labs' ), + 'explanation' => esc_html__( 'Vector embeddings are still being processed.', 'elasticpress' ), + ]; + } + + if ( ! empty( $es_doc['ep_embeddings_control']['errors'] ) ) { + $status = [ + 'status' => 'error', + 'message' => esc_html__( 'Vector embeddings failed', 'elasticpress-labs' ), + 'explanation' => wp_sprintf( + esc_html__( 'Vector embeddings failed with the following error(s): %l', 'elasticpress-labs' ), + $es_doc['ep_embeddings_control']['errors'] + ), + ]; + } + + return $status; + } + /** * Add the embedding data to the post vector sync args. *