From 495df4b61160e2b9e0e35937f4b831f68944bb48 Mon Sep 17 00:00:00 2001 From: luzhang Date: Tue, 15 Oct 2024 16:07:17 +0800 Subject: [PATCH 1/5] add json key inverted index in stats to speed up json expr Signed-off-by: luzhang Co-authored-by: luzhang --- internal/core/src/common/Consts.h | 1 + internal/core/src/common/Json.h | 129 +++++ internal/core/src/common/jsmn.h | 498 ++++++++++++++++++ internal/core/src/exec/expression/Expr.h | 14 + .../core/src/exec/expression/TermExpr.cpp | 279 ++++++---- internal/core/src/exec/expression/TermExpr.h | 4 + .../core/src/index/JsonKeyInvertedIndex.cpp | 247 +++++++++ .../core/src/index/JsonKeyInvertedIndex.h | 93 ++++ internal/core/src/indexbuilder/index_c.cpp | 68 +++ internal/core/src/indexbuilder/index_c.h | 5 + internal/core/src/mmap/ChunkedColumn.h | 8 +- internal/core/src/mmap/Column.h | 8 +- .../src/segcore/ChunkedSegmentSealedImpl.h | 28 + internal/core/src/segcore/SegmentInterface.h | 1 + internal/core/src/segcore/SegmentSealed.h | 10 + .../core/src/segcore/SegmentSealedImpl.cpp | 25 + internal/core/src/segcore/SegmentSealedImpl.h | 16 + internal/core/src/segcore/segment_c.cpp | 54 ++ internal/core/src/segcore/segment_c.h | 6 + .../core/src/storage/DiskFileManagerImpl.cpp | 197 +++---- .../core/src/storage/DiskFileManagerImpl.h | 41 +- internal/core/src/storage/Util.cpp | 22 + internal/core/src/storage/Util.h | 13 + .../tantivy-binding/include/tantivy-binding.h | 2 +- .../tantivy/tantivy-binding/src/array.rs | 10 +- .../tantivy-binding/src/docid_collector.rs | 8 +- .../tantivy-binding/src/index_reader.rs | 32 +- .../tantivy-binding/src/index_reader_text.rs | 2 +- .../tantivy-binding/src/vec_collector.rs | 10 +- internal/core/unittest/CMakeLists.txt | 1 + internal/core/unittest/test_c_api.cpp | 384 ++++++++++++++ .../core/unittest/test_json_key_index.cpp | 222 ++++++++ internal/datacoord/job_manager.go | 31 ++ internal/datacoord/meta.go | 1 + internal/datacoord/segment_operator.go | 12 + internal/datacoord/task_stats.go | 7 + internal/indexnode/indexnode_service.go | 23 +- internal/indexnode/task_stats.go | 104 ++++ internal/indexnode/taskinfo_ops.go | 66 ++- internal/proto/data_coord.proto | 11 + internal/proto/index_cgo_msg.proto | 10 + internal/proto/index_coord.proto | 1 + internal/proto/query_coord.proto | 1 + internal/proto/worker.proto | 1 + internal/querycoordv2/utils/types.go | 33 +- internal/querynodev2/segments/segment.go | 34 ++ .../querynodev2/segments/segment_loader.go | 25 +- internal/util/indexcgowrapper/index.go | 37 ++ pkg/util/typeutil/field_schema.go | 7 + 49 files changed, 2543 insertions(+), 299 deletions(-) create mode 100644 internal/core/src/common/jsmn.h create mode 100644 internal/core/src/index/JsonKeyInvertedIndex.cpp create mode 100644 internal/core/src/index/JsonKeyInvertedIndex.h create mode 100644 internal/core/unittest/test_json_key_index.cpp diff --git a/internal/core/src/common/Consts.h b/internal/core/src/common/Consts.h index c8e10347db8f4..3a6a0ee6659e2 100644 --- a/internal/core/src/common/Consts.h +++ b/internal/core/src/common/Consts.h @@ -49,6 +49,7 @@ const char PAGE_RETAIN_ORDER[] = "page_retain_order"; const char TEXT_LOG_ROOT_PATH[] = "text_log"; const char ITERATIVE_FILTER[] = "iterative_filter"; const char HINTS[] = "hints"; +const char JSON_KEY_INDEX_LOG_ROOT_PATH[] = "json_key_index_log"; const char DEFAULT_PLANNODE_ID[] = "0"; const char DEAFULT_QUERY_ID[] = "0"; diff --git a/internal/core/src/common/Json.h b/internal/core/src/common/Json.h index 0570bdb56dd2c..55c736e05da1c 100644 --- 
a/internal/core/src/common/Json.h
+++ b/internal/core/src/common/Json.h
@@ -71,6 +71,45 @@ ExtractSubJson(const std::string& json, const std::vector& keys) {
     return buffer.GetString();
 }
 
+inline std::pair
+ParseTopLevelKey(const std::string& json_pointer, bool escaped = false) {
+    if (json_pointer.empty()) {
+        return {"", ""};
+    }
+
+    Assert(json_pointer[0] == '/');
+    size_t start = 1;
+    size_t end = json_pointer.find('/', start);
+
+    std::string top_key = (end == std::string::npos)
+                              ? json_pointer.substr(start)
+                              : json_pointer.substr(start, end - start);
+
+    if (escaped) {
+        // unescape per RFC 6901: replace every "~1" with "/" first,
+        // then every "~0" with "~"
+        size_t pos = 0;
+        while ((pos = top_key.find("~1", pos)) != std::string::npos) {
+            top_key.replace(pos, 2, "/");
+            pos += 1;
+        }
+        pos = 0;
+        while ((pos = top_key.find("~0", pos)) != std::string::npos) {
+            top_key.replace(pos, 2, "~");
+            pos += 1;
+        }
+    }
+
+    std::string remaining_path =
+        (end == std::string::npos) ? "" : json_pointer.substr(end);
+
+    return {top_key, remaining_path};
+}
+
+static std::string
+ToLower(const std::string_view& str) {
+    std::string result(str);
+    std::transform(
+        result.begin(), result.end(), result.begin(), [](unsigned char c) {
+            return std::tolower(c);
+        });
+    return result;
+}
+
 using document = simdjson::ondemand::document;
 template
 using value_result = simdjson::simdjson_result;
@@ -149,6 +188,25 @@ class Json {
         return doc;
     }
 
+    value_result
+    doc(uint16_t offset, uint16_t length) const {
+        thread_local simdjson::ondemand::parser parser;
+
+        // it's always safe to add the padding,
+        // as we have allocated the memory with this padding
+        auto doc = parser.iterate(
+            data_.data() + offset, length, length + simdjson::SIMDJSON_PADDING);
+        AssertInfo(doc.error() == simdjson::SUCCESS,
+                   "failed to parse the json {} offset {}, length {}: {}, "
+                   "total_json:{}",
+                   std::string(data_.data() + offset, length),
+                   offset,
+                   length,
+                   simdjson::error_message(doc.error()),
+                   data_);
+        return doc;
+    }
+
     value_result
     dom_doc() const {
         if (data_.size() == 0) {
@@ -166,6 +224,21 @@ class Json {
         return doc;
     }
 
+    value_result
+    dom_doc(uint16_t offset, uint16_t length) const {
+        thread_local simdjson::dom::parser parser;
+
+        // it's always safe to add the padding,
+        // as we have allocated the memory with this padding
+        auto doc = parser.parse(data_.data() + offset,
+                                length + simdjson::SIMDJSON_PADDING);
+        AssertInfo(doc.error() == simdjson::SUCCESS,
+                   "failed to parse the json {}: {}",
+                   std::string(data_.data() + offset, length),
+                   simdjson::error_message(doc.error()));
+        return doc;
+    }
+
     bool
     exist(std::string_view pointer) const {
         return doc().at_pointer(pointer).error() == simdjson::SUCCESS;
@@ -205,6 +278,62 @@ class Json {
         return doc().at_pointer(pointer).get();
     }
 
+    template
+    value_result
+    at(uint16_t offset, uint16_t length) const {
+        if constexpr (std::is_same_v ||
+                      std::is_same_v) {
+            return value_result(T(data_.data() + offset, length));
+        }
+        return doc(offset, length).get();
+    }
+
+    template
+    std::pair
+    at_pos(uint16_t offset, uint16_t length) const {
+        const char* pos = data_.data() + offset;
+        std::string_view str(pos, length);
+        if constexpr (std::is_same_v) {
+            if (milvus::ToLower(str) == "true") {
+                return {true, ""};
+            } else if (milvus::ToLower(str) == "false") {
+                return {false, ""};
+            } else {
+                return {false, "invalid boolean value"};
+            }
+        } else if constexpr (std::is_same_v) {
+            try {
+                size_t parsed_chars;
+                // parse only the length-byte slice, not whatever bytes
+                // happen to follow it in the document
+                int64_t int_value =
+                    std::stoll(std::string(str), &parsed_chars, 10);
+                if (parsed_chars == length) {
+                    return {int_value, ""};
+                }
+                return {0, "string contains non-integer characters"};
+            } catch (...) {
+                return {0, "invalid integer string"};
+            }
+        } else if constexpr (std::is_same_v) {
+            try {
+                size_t parsed_chars;
+                double double_value =
+                    std::stod(std::string(str), &parsed_chars);
+                if (parsed_chars == length) {
+                    return {double_value, ""};
+                }
+                return {0, "string contains non-numeric characters"};
+            } catch (...) {
+                return {0, "invalid double string"};
+            }
+        } else {
+            static_assert(std::is_same_v);
+            return {str, ""};
+        }
+    }
+
+    value_result
+    array_at(uint16_t offset, uint16_t length) const {
+        return dom_doc(offset, length).get_array();
+    }
+
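The offset/length accessors above exist so that expression evaluation can re-parse only the byte slice a key-index posting points at, instead of re-walking the whole document by JSON pointer. A minimal usage sketch (the row payload and the offset/length pair are made up for illustration; real values come from index postings):

    // Hypothetical row payload; a posting for "/price" would record
    // offset=10, length=2 (the bytes `42`).
    const char* row = R"({"price": 42})";
    milvus::Json json(row, strlen(row));
    auto [value, err] = json.at_pos<int64_t>(10, 2);
    if (err.empty()) {
        assert(value == 42);  // parsed straight from the 2-byte slice
    }
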
     // get dom array by JSON pointer,
     // call `size()` to get array size,
     // call `at()` to get array element by index,
diff --git a/internal/core/src/common/jsmn.h b/internal/core/src/common/jsmn.h
new file mode 100644
index 0000000000000..f20b56ba48a68
--- /dev/null
+++ b/internal/core/src/common/jsmn.h
@@ -0,0 +1,498 @@
+/*
+ * MIT License
+ *
+ * Copyright (c) 2010 Serge Zaitsev
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef JSMN_H
+#define JSMN_H
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define JSMN_STATIC
+#ifdef JSMN_STATIC
+#define JSMN_API static
+#else
+#define JSMN_API extern
+#endif
+
+/**
+ * JSON type identifier. Basic types are:
+ *   o Object
+ *   o Array
+ *   o String
+ *   o Other primitive: number, boolean (true/false) or null
+ */
+typedef enum {
+    JSMN_UNDEFINED = 0,
+    JSMN_OBJECT = 1 << 0,
+    JSMN_ARRAY = 1 << 1,
+    JSMN_STRING = 1 << 2,
+    JSMN_PRIMITIVE = 1 << 3
+} jsmntype_t;
+
+enum jsmnerr {
+    /* Not enough tokens were provided */
+    JSMN_ERROR_NOMEM = -1,
+    /* Invalid character inside JSON string */
+    JSMN_ERROR_INVAL = -2,
+    /* The string is not a full JSON packet, more bytes expected */
+    JSMN_ERROR_PART = -3
+};
+
+/**
+ * JSON token description.
+ * type   type (object, array, string etc.)
+ * start  start position in JSON data string
+ * end    end position in JSON data string
+ */
+typedef struct jsmntok {
+    jsmntype_t type;
+    int start;
+    int end;
+    int size;
+#ifdef JSMN_PARENT_LINKS
+    int parent;
+#endif
+} jsmntok_t;
+
+/**
+ * JSON parser. Contains an array of token blocks available. Also stores
+ * the string being parsed now and current position in that string.
+ */
+typedef struct jsmn_parser {
+    unsigned int pos;     /* offset in the JSON string */
+    unsigned int toknext; /* next token to allocate */
+    int toksuper;         /* superior token node, e.g.
parent object or array */ +} jsmn_parser; + +/** + * Create JSON parser over an array of tokens + */ +JSMN_API void +jsmn_init(jsmn_parser* parser); + +/** + * Run JSON parser. It parses a JSON data string into and array of tokens, each + * describing + * a single JSON object. + */ +JSMN_API int +jsmn_parse(jsmn_parser* parser, + const char* js, + const size_t len, + jsmntok_t* tokens, + const unsigned int num_tokens); + +#ifndef JSMN_HEADER +/** + * Allocates a fresh unused token from the token pool. + */ +static jsmntok_t* +jsmn_alloc_token(jsmn_parser* parser, + jsmntok_t* tokens, + const size_t num_tokens) { + jsmntok_t* tok; + if (parser->toknext >= num_tokens) { + return NULL; + } + tok = &tokens[parser->toknext++]; + tok->start = tok->end = -1; + tok->size = 0; +#ifdef JSMN_PARENT_LINKS + tok->parent = -1; +#endif + return tok; +} + +/** + * Fills token type and boundaries. + */ +static void +jsmn_fill_token(jsmntok_t* token, + const jsmntype_t type, + const int start, + const int end) { + token->type = type; + token->start = start; + token->end = end; + token->size = 0; +} + +/** + * Fills next available token with JSON primitive. + */ +static int +jsmn_parse_primitive(jsmn_parser* parser, + const char* js, + const size_t len, + jsmntok_t* tokens, + const size_t num_tokens) { + jsmntok_t* token; + int start; + + start = parser->pos; + + for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { + switch (js[parser->pos]) { +#ifndef JSMN_STRICT + /* In strict mode primitive must be followed by "," or "}" or "]" */ + case ':': +#endif + case '\t': + case '\r': + case '\n': + case ' ': + case ',': + case ']': + case '}': + goto found; + default: + /* to quiet a warning from gcc*/ + break; + } + if (js[parser->pos] < 32 || js[parser->pos] >= 127) { + parser->pos = start; + return JSMN_ERROR_INVAL; + } + } +#ifdef JSMN_STRICT + /* In strict mode primitive must be followed by a comma/object/array */ + parser->pos = start; + return JSMN_ERROR_PART; +#endif + +found: + if (tokens == NULL) { + parser->pos--; + return 0; + } + token = jsmn_alloc_token(parser, tokens, num_tokens); + if (token == NULL) { + parser->pos = start; + return JSMN_ERROR_NOMEM; + } + jsmn_fill_token(token, JSMN_PRIMITIVE, start, parser->pos); +#ifdef JSMN_PARENT_LINKS + token->parent = parser->toksuper; +#endif + parser->pos--; + return 0; +} + +/** + * Fills next token with JSON string. 
+ */ +static int +jsmn_parse_string(jsmn_parser* parser, + const char* js, + const size_t len, + jsmntok_t* tokens, + const size_t num_tokens) { + jsmntok_t* token; + + int start = parser->pos; + + /* Skip starting quote */ + parser->pos++; + + for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { + char c = js[parser->pos]; + + /* Quote: end of string */ + if (c == '\"') { + if (tokens == NULL) { + return 0; + } + token = jsmn_alloc_token(parser, tokens, num_tokens); + if (token == NULL) { + parser->pos = start; + return JSMN_ERROR_NOMEM; + } + jsmn_fill_token(token, JSMN_STRING, start + 1, parser->pos); +#ifdef JSMN_PARENT_LINKS + token->parent = parser->toksuper; +#endif + return 0; + } + + /* Backslash: Quoted symbol expected */ + if (c == '\\' && parser->pos + 1 < len) { + int i; + parser->pos++; + switch (js[parser->pos]) { + /* Allowed escaped symbols */ + case '\"': + case '/': + case '\\': + case 'b': + case 'f': + case 'r': + case 'n': + case 't': + break; + /* Allows escaped symbol \uXXXX */ + case 'u': + parser->pos++; + for (i = 0; + i < 4 && parser->pos < len && js[parser->pos] != '\0'; + i++) { + /* If it isn't a hex character we have an error */ + if (!((js[parser->pos] >= 48 && + js[parser->pos] <= 57) || /* 0-9 */ + (js[parser->pos] >= 65 && + js[parser->pos] <= 70) || /* A-F */ + (js[parser->pos] >= 97 && + js[parser->pos] <= 102))) { /* a-f */ + parser->pos = start; + return JSMN_ERROR_INVAL; + } + parser->pos++; + } + parser->pos--; + break; + /* Unexpected symbol */ + default: + parser->pos = start; + return JSMN_ERROR_INVAL; + } + } + } + parser->pos = start; + return JSMN_ERROR_PART; +} + +/** + * Parse JSON string and fill tokens. + */ +JSMN_API int +jsmn_parse(jsmn_parser* parser, + const char* js, + const size_t len, + jsmntok_t* tokens, + const unsigned int num_tokens) { + int r; + int i; + jsmntok_t* token; + int count = parser->toknext; + + for (; parser->pos < len && js[parser->pos] != '\0'; parser->pos++) { + char c; + jsmntype_t type; + + c = js[parser->pos]; + switch (c) { + case '{': + case '[': + count++; + if (tokens == NULL) { + break; + } + token = jsmn_alloc_token(parser, tokens, num_tokens); + if (token == NULL) { + return JSMN_ERROR_NOMEM; + } + if (parser->toksuper != -1) { + jsmntok_t* t = &tokens[parser->toksuper]; +#ifdef JSMN_STRICT + /* In strict mode an object or array can't become a key */ + if (t->type == JSMN_OBJECT) { + return JSMN_ERROR_INVAL; + } +#endif + t->size++; +#ifdef JSMN_PARENT_LINKS + token->parent = parser->toksuper; +#endif + } + token->type = (c == '{' ? JSMN_OBJECT : JSMN_ARRAY); + token->start = parser->pos; + parser->toksuper = parser->toknext - 1; + break; + case '}': + case ']': + if (tokens == NULL) { + break; + } + type = (c == '}' ? 
JSMN_OBJECT : JSMN_ARRAY); +#ifdef JSMN_PARENT_LINKS + if (parser->toknext < 1) { + return JSMN_ERROR_INVAL; + } + token = &tokens[parser->toknext - 1]; + for (;;) { + if (token->start != -1 && token->end == -1) { + if (token->type != type) { + return JSMN_ERROR_INVAL; + } + token->end = parser->pos + 1; + parser->toksuper = token->parent; + break; + } + if (token->parent == -1) { + if (token->type != type || parser->toksuper == -1) { + return JSMN_ERROR_INVAL; + } + break; + } + token = &tokens[token->parent]; + } +#else + for (i = parser->toknext - 1; i >= 0; i--) { + token = &tokens[i]; + if (token->start != -1 && token->end == -1) { + if (token->type != type) { + return JSMN_ERROR_INVAL; + } + parser->toksuper = -1; + token->end = parser->pos + 1; + break; + } + } + /* Error if unmatched closing bracket */ + if (i == -1) { + return JSMN_ERROR_INVAL; + } + for (; i >= 0; i--) { + token = &tokens[i]; + if (token->start != -1 && token->end == -1) { + parser->toksuper = i; + break; + } + } +#endif + break; + case '\"': + r = jsmn_parse_string(parser, js, len, tokens, num_tokens); + if (r < 0) { + return r; + } + count++; + if (parser->toksuper != -1 && tokens != NULL) { + tokens[parser->toksuper].size++; + } + break; + case '\t': + case '\r': + case '\n': + case ' ': + break; + case ':': + parser->toksuper = parser->toknext - 1; + break; + case ',': + if (tokens != NULL && parser->toksuper != -1 && + tokens[parser->toksuper].type != JSMN_ARRAY && + tokens[parser->toksuper].type != JSMN_OBJECT) { +#ifdef JSMN_PARENT_LINKS + parser->toksuper = tokens[parser->toksuper].parent; +#else + for (i = parser->toknext - 1; i >= 0; i--) { + if (tokens[i].type == JSMN_ARRAY || + tokens[i].type == JSMN_OBJECT) { + if (tokens[i].start != -1 && tokens[i].end == -1) { + parser->toksuper = i; + break; + } + } + } +#endif + } + break; +#ifdef JSMN_STRICT + /* In strict mode primitives are: numbers and booleans */ + case '-': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case 't': + case 'f': + case 'n': + /* And they must not be keys of the object */ + if (tokens != NULL && parser->toksuper != -1) { + const jsmntok_t* t = &tokens[parser->toksuper]; + if (t->type == JSMN_OBJECT || + (t->type == JSMN_STRING && t->size != 0)) { + return JSMN_ERROR_INVAL; + } + } +#else + /* In non-strict mode every unquoted value is a primitive */ + default: +#endif + r = jsmn_parse_primitive(parser, js, len, tokens, num_tokens); + if (r < 0) { + return r; + } + count++; + if (parser->toksuper != -1 && tokens != NULL) { + tokens[parser->toksuper].size++; + } + break; + +#ifdef JSMN_STRICT + /* Unexpected char in strict mode */ + default: + return JSMN_ERROR_INVAL; +#endif + } + } + + if (tokens != NULL) { + for (i = parser->toknext - 1; i >= 0; i--) { + /* Unmatched opened object or array */ + if (tokens[i].start != -1 && tokens[i].end == -1) { + return JSMN_ERROR_PART; + } + } + } + + return count; +} + +/** + * Creates a new parser based over a given buffer with an array of tokens + * available. 
+ */
+JSMN_API void
+jsmn_init(jsmn_parser* parser) {
+    parser->pos = 0;
+    parser->toknext = 0;
+    parser->toksuper = -1;
+}
+
+#endif /* JSMN_HEADER */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* JSMN_H */
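jsmn returns JSMN_ERROR_NOMEM when the caller's token buffer is too small, and it keeps its position state across calls, so the caller can grow the buffer and simply call jsmn_parse again; AddJson in JsonKeyInvertedIndex.cpp below relies on exactly that. A self-contained sketch of the retry loop (the helper name is illustrative, not part of the patch):

    #include <cstring>
    #include <vector>
    #include "common/jsmn.h"

    // Tokenize `json`, doubling the token buffer until jsmn_parse succeeds.
    static std::vector<jsmntok_t> TokenizeJson(const char* json) {
        std::vector<jsmntok_t> tokens(16);
        jsmn_parser parser;
        jsmn_init(&parser);
        int r;
        while ((r = jsmn_parse(&parser, json, std::strlen(json), tokens.data(),
                               static_cast<unsigned int>(tokens.size()))) ==
               JSMN_ERROR_NOMEM) {
            tokens.resize(tokens.size() * 2);  // parser state is preserved
        }
        tokens.resize(r > 0 ? r : 0);  // r >= 0 is the token count
        return tokens;
    }
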
diff --git a/internal/core/src/exec/expression/Expr.h b/internal/core/src/exec/expression/Expr.h
index f2bbc8cd7f6bb..58e248a2c64b2 100644
--- a/internal/core/src/exec/expression/Expr.h
+++ b/internal/core/src/exec/expression/Expr.h
@@ -27,6 +27,7 @@
 #include "exec/QueryContext.h"
 #include "expr/ITypeExpr.h"
 #include "query/PlanProto.h"
+#include "segcore/SegmentSealedImpl.h"
 
 namespace milvus {
 namespace exec {
@@ -1060,6 +1061,19 @@ class SegmentExpr : public Expr {
         use_index_ = false;
     }
 
+    bool
+    CanUseJsonKeyIndex(FieldId field_id) const {
+        if (segment_->type() == SegmentType::Sealed) {
+            auto sealed_seg =
+                dynamic_cast(segment_);
+            Assert(sealed_seg != nullptr);
+            if (sealed_seg->GetJsonKeyIndex(field_id) != nullptr) {
+                return true;
+            }
+        }
+        return false;
+    }
+
 protected:
     const segcore::SegmentInternalInterface* segment_;
     const FieldId field_id_;
diff --git a/internal/core/src/exec/expression/TermExpr.cpp b/internal/core/src/exec/expression/TermExpr.cpp
index 960d9731c9604..3c0e812fb42b3 100644
--- a/internal/core/src/exec/expression/TermExpr.cpp
+++ b/internal/core/src/exec/expression/TermExpr.cpp
@@ -282,27 +288,27 @@ PhyTermFilterExpr::ExecTermArrayVariableInField(OffsetVector* input) {
                              TargetBitmapView res,
                              TargetBitmapView valid_res,
                              const ValueType& target_val) {
-        auto executor = [&](size_t offset) {
-            for (int i = 0; i < data[offset].length(); i++) {
-                auto val = data[offset].template get_data(i);
-                if (val == target_val) {
-                    return true;
+            auto executor = [&](size_t offset) {
+                for (int i = 0; i < data[offset].length(); i++) {
+                    auto val = data[offset].template get_data(i);
+                    if (val == target_val) {
+                        return true;
+                    }
+                }
+                return false;
+            };
+            for (int i = 0; i < size; ++i) {
+                auto offset = i;
+                if constexpr (filter_type == FilterType::random) {
+                    offset = (offsets) ? offsets[i] : i;
                 }
+                if (valid_data != nullptr && !valid_data[offset]) {
+                    res[i] = valid_res[i] = false;
+                    continue;
+                }
+                res[i] = executor(offset);
             }
-            return false;
         };
-        for (int i = 0; i < size; ++i) {
-            auto offset = i;
-            if constexpr (filter_type == FilterType::random) {
-                offset = (offsets) ? offsets[i] : i;
-            }
-            if (valid_data != nullptr && !valid_data[offset]) {
-                res[i] = valid_res[i] = false;
-                continue;
-            }
-            res[i] = executor(offset);
-        }
-    };
 
     int64_t processed_size;
     if (has_offset_input_) {
@@ -369,23 +375,23 @@ PhyTermFilterExpr::ExecTermArrayFieldInVariable(OffsetVector* input) {
                              TargetBitmapView valid_res,
                              int index,
                              const std::unordered_set& term_set) {
-        for (int i = 0; i < size; ++i) {
-            auto offset = i;
-            if constexpr (filter_type == FilterType::random) {
-                offset = (offsets) ? offsets[i] : i;
-            }
-            if (valid_data != nullptr && !valid_data[offset]) {
-                res[i] = valid_res[i] = false;
-                continue;
-            }
-            if (term_set.empty() || index >= data[offset].length()) {
-                res[i] = false;
-                continue;
+            for (int i = 0; i < size; ++i) {
+                auto offset = i;
+                if constexpr (filter_type == FilterType::random) {
+                    offset = (offsets) ? offsets[i] : i;
+                }
+                if (valid_data != nullptr && !valid_data[offset]) {
+                    res[i] = valid_res[i] = false;
+                    continue;
+                }
+                if (term_set.empty() || index >= data[offset].length()) {
+                    res[i] = false;
+                    continue;
+                }
+                auto value = data[offset].get_data(index);
+                res[i] = term_set.find(ValueType(value)) != term_set.end();
             }
-            auto value = data[offset].get_data(index);
-            res[i] = term_set.find(ValueType(value)) != term_set.end();
-        }
-    };
+        };
 
     int64_t processed_size;
     if (has_offset_input_) {
@@ -446,34 +452,34 @@ PhyTermFilterExpr::ExecTermJsonVariableInField(OffsetVector* input) {
                              TargetBitmapView valid_res,
                              const std::string pointer,
                              const ValueType& target_val) {
-        auto executor = [&](size_t i) {
-            auto doc = data[i].doc();
-            auto array = doc.at_pointer(pointer).get_array();
-            if (array.error())
-                return false;
-            for (auto it = array.begin(); it != array.end(); ++it) {
-                auto val = (*it).template get();
-                if (val.error()) {
+            auto executor = [&](size_t i) {
+                auto doc = data[i].doc();
+                auto array = doc.at_pointer(pointer).get_array();
+                if (array.error())
                     return false;
+                for (auto it = array.begin(); it != array.end(); ++it) {
+                    auto val = (*it).template get();
+                    if (val.error()) {
+                        return false;
+                    }
+                    if (val.value() == target_val) {
+                        return true;
+                    }
+                }
+                return false;
+            };
+            for (size_t i = 0; i < size; ++i) {
+                auto offset = i;
+                if constexpr (filter_type == FilterType::random) {
+                    offset = (offsets) ? offsets[i] : i;
                 }
-                if (val.value() == target_val) {
-                    return true;
+                if (valid_data != nullptr && !valid_data[offset]) {
+                    res[i] = valid_res[i] = false;
+                    continue;
                 }
+                res[i] = executor(offset);
             }
-            return false;
         };
-        for (size_t i = 0; i < size; ++i) {
-            auto offset = i;
-            if constexpr (filter_type == FilterType::random) {
-                offset = (offsets) ? offsets[i] : i;
-            }
-            if (valid_data != nullptr && !valid_data[offset]) {
-                res[i] = valid_res[i] = false;
-                continue;
-            }
-            res[i] = executor(offset);
-        }
-    };
 
     int64_t processed_size;
     if (has_offset_input_) {
        processed_size = ProcessDataByOffsets(execute_sub_batch,
@@ -495,12 +501,87 @@ PhyTermFilterExpr::ExecTermJsonVariableInField(OffsetVector* input) {
     return res_vec;
 }
 
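The fast path added below evaluates the whole segment in one shot: FilterByPath hands every posting stored under the key to a callback as a decoded (row_id, offset, length) triple, and the resulting bitmap is cached in cached_index_chunk_res_ and then sliced batch by batch. The callback contract, as a trimmed sketch (assumes a sealed segment `seg`, a `field_id`, and an std::unordered_set<int64_t> `term_set` in scope):

    auto filter = [&](uint32_t row_id, uint16_t offset, uint16_t length) {
        auto [bytes, valid] = seg->GetJsonData(field_id, row_id);
        if (!valid) {
            return false;  // null JSON rows never match
        }
        milvus::Json json(bytes.data(), bytes.size());
        // re-parse only the value slice the index recorded for this row
        auto val = json.at<int64_t>(offset, length);
        return !val.error() && term_set.count(val.value()) > 0;
    };
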
+template
+VectorPtr
+PhyTermFilterExpr::ExecJsonInVariableByKeyIndex() {
+    using GetType = std::conditional_t,
+                                       std::string_view,
+                                       ValueType>;
+    Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1);
+    auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_
+                               ? active_count_ - current_data_chunk_pos_
+                               : batch_size_;
+
+    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
+    std::unordered_set term_set;
+    for (const auto& element : expr_->vals_) {
+        term_set.insert(GetValueFromProto(element));
+    }
+
+    if (term_set.empty()) {
+        MoveCursor();
+        return std::make_shared(
+            TargetBitmap(real_batch_size, false),
+            TargetBitmap(real_batch_size, true));
+    }
+
+    if (cached_index_chunk_id_ != 0) {
+        const auto* sealed_seg =
+            dynamic_cast(segment_);
+        auto field_id = expr_->column_.field_id_;
+        auto* index = sealed_seg->GetJsonKeyIndex(field_id);
+        Assert(index != nullptr);
+
+        auto filter_func = [sealed_seg, &term_set, &field_id](uint32_t row_id,
+                                                              uint16_t offset,
+                                                              uint16_t size) {
+            auto json_pair = sealed_seg->GetJsonData(field_id, row_id);
+            if (!json_pair.second) {
+                return false;
+            }
+            auto json =
+                milvus::Json(json_pair.first.data(), json_pair.first.size());
+            auto val = json.at(offset, size);
+            if (val.error()) {
+                return false;
+            }
+            return term_set.find(ValueType(val.value())) != term_set.end();
+        };
+        cached_index_chunk_res_ =
+            index->FilterByPath(pointer, filter_func).clone();
+        cached_index_chunk_id_ = 0;
+    }
+
+    TargetBitmap result;
+    result.append(
+        cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
+    current_data_chunk_pos_ += real_batch_size;
+    return std::make_shared(std::move(result),
+                            TargetBitmap(real_batch_size, true));
+}
+
 template
 VectorPtr
 PhyTermFilterExpr::ExecTermJsonFieldInVariable(OffsetVector* input) {
     using GetType = std::conditional_t,
                                        std::string_view,
                                        ValueType>;
+    FieldId field_id = expr_->column_.field_id_;
+    if (CanUseJsonKeyIndex(field_id)) {
+        return ExecJsonInVariableByKeyIndex();
+    }
+
     auto real_batch_size =
         has_offset_input_ ? input->size() : GetNextBatchSize();
     if (real_batch_size == 0) {
@@ -535,40 +616,40 @@ PhyTermFilterExpr::ExecTermJsonFieldInVariable(OffsetVector* input) {
                              TargetBitmapView valid_res,
                              const std::string pointer,
                              const std::unordered_set& terms) {
-        auto executor = [&](size_t i) {
-            auto x = data[i].template at(pointer);
-            if (x.error()) {
-                if constexpr (std::is_same_v) {
-                    auto x = data[i].template at(pointer);
-                    if (x.error()) {
-                        return false;
+            auto executor = [&](size_t i) {
+                auto x = data[i].template at(pointer);
+                if (x.error()) {
+                    if constexpr (std::is_same_v) {
+                        auto x = data[i].template at(pointer);
+                        if (x.error()) {
+                            return false;
+                        }
+
+                        auto value = x.value();
+                        // if the term set is {1}, and the value is 1.1, we should not return true.
+                        return std::floor(value) == value &&
+                               terms.find(ValueType(value)) != terms.end();
                     }
-
-                    auto value = x.value();
-                    // if the term set is {1}, and the value is 1.1, we should not return true.
-                    return std::floor(value) == value &&
-                           terms.find(ValueType(value)) != terms.end();
+                    return false;
                 }
-                return false;
+                return terms.find(ValueType(x.value())) != terms.end();
+            };
+            for (size_t i = 0; i < size; ++i) {
+                auto offset = i;
+                if constexpr (filter_type == FilterType::random) {
+                    offset = (offsets) ?
offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + if (terms.empty()) { + res[i] = false; + continue; + } + res[i] = executor(offset); } - return terms.find(ValueType(x.value())) != terms.end(); }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - if (terms.empty()) { - res[i] = false; - continue; - } - res[i] = executor(offset); - } - }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, @@ -695,19 +776,19 @@ PhyTermFilterExpr::ExecVisitorImplForData(OffsetVector* input) { TargetBitmapView res, TargetBitmapView valid_res, const std::unordered_set& vals) { - TermElementFuncSet func; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; + TermElementFuncSet func; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = func(vals, data[offset]); } - res[i] = func(vals, data[offset]); - } - }; + }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, diff --git a/internal/core/src/exec/expression/TermExpr.h b/internal/core/src/exec/expression/TermExpr.h index 19f03b131b9c3..c06db53ec991b 100644 --- a/internal/core/src/exec/expression/TermExpr.h +++ b/internal/core/src/exec/expression/TermExpr.h @@ -117,6 +117,10 @@ class PhyTermFilterExpr : public SegmentExpr { VectorPtr ExecTermArrayFieldInVariable(OffsetVector* input = nullptr); + template + VectorPtr + ExecJsonInVariableByKeyIndex(); + private: std::shared_ptr expr_; milvus::Timestamp query_timestamp_; diff --git a/internal/core/src/index/JsonKeyInvertedIndex.cpp b/internal/core/src/index/JsonKeyInvertedIndex.cpp new file mode 100644 index 0000000000000..e76644c2d5349 --- /dev/null +++ b/internal/core/src/index/JsonKeyInvertedIndex.cpp @@ -0,0 +1,247 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations under the License
+
+#include
+#include
+
+#include "index/JsonKeyInvertedIndex.h"
+#include "index/InvertedIndexUtil.h"
+#include "index/Utils.h"
+
+namespace milvus::index {
+constexpr const char* TMP_JSON_INVERTED_LOG_PREFIX =
+    "/tmp/milvus/json-key-inverted-index-log/";
+
+void
+JsonKeyInvertedIndex::AddInvertedRecord(const std::vector& paths,
+                                        uint32_t row_id,
+                                        uint16_t offset,
+                                        uint16_t length) {
+    auto key = std::string("/") + Join(paths, ".");
+    LOG_DEBUG(
+        "insert inverted key: {}, row_id: {}, offset: "
+        "{}, length:{}",
+        key,
+        row_id,
+        offset,
+        length);
+    int64_t combine_id = EncodeOffset(row_id, offset, length);
+    wrapper_->add_multi_data(&key, 1, combine_id);
+}
+
+void
+JsonKeyInvertedIndex::TravelJson(const char* json,
+                                 jsmntok* tokens,
+                                 int& index,
+                                 std::vector& path,
+                                 int32_t offset) {
+    jsmntok current = tokens[0];
+    Assert(current.type != JSMN_UNDEFINED);
+    if (current.type == JSMN_OBJECT) {
+        int j = 1;
+        for (int i = 0; i < current.size; i++) {
+            Assert(tokens[j].type == JSMN_STRING && tokens[j].size != 0);
+            std::string key(json + tokens[j].start,
+                            tokens[j].end - tokens[j].start);
+            path.push_back(key);
+            j++;
+            int consumed = 0;
+            TravelJson(json, tokens + j, consumed, path, offset);
+            path.pop_back();
+            j += consumed;
+        }
+        index = j;
+    } else if (current.type == JSMN_PRIMITIVE) {
+        AddInvertedRecord(
+            path, offset, current.start, current.end - current.start);
+        index++;
+    } else if (current.type == JSMN_ARRAY) {
+        AddInvertedRecord(
+            path, offset, current.start, current.end - current.start);
+        // skip array parse
+        int count = current.size;
+        int j = 1;
+        while (count > 0) {
+            if (tokens[j].size == 0) {
+                count--;
+            } else {
+                count += tokens[j].size;
+            }
+            j++;
+        }
+        index = j;
+
+    } else if (current.type == JSMN_STRING) {
+        Assert(current.size == 0);
+        AddInvertedRecord(
+            path, offset, current.start, current.end - current.start);
+        index++;
+    }
+}
+
+void
+JsonKeyInvertedIndex::AddJson(const char* json, int64_t offset) {
+    jsmn_parser parser;
+    jsmntok_t* tokens = (jsmntok_t*)malloc(16 * sizeof(jsmntok_t));
+    if (!tokens) {
+        PanicInfo(ErrorCode::UnexpectedError, "malloc failed");
+    }
+    int num_tokens = 0;
+    int token_capacity = 16;
+
+    jsmn_init(&parser);
+
+    while (1) {
+        int r = jsmn_parse(&parser, json, strlen(json), tokens, token_capacity);
+        if (r < 0) {
+            if (r == JSMN_ERROR_NOMEM) {
+                // Reallocate tokens array if not enough space
+                token_capacity *= 2;
+                tokens = (jsmntok_t*)realloc(
+                    tokens, token_capacity * sizeof(jsmntok_t));
+                if (!tokens) {
+                    PanicInfo(ErrorCode::UnexpectedError, "realloc failed");
+                }
+                continue;
+            } else {
+                free(tokens);
+                PanicInfo(ErrorCode::UnexpectedError,
+                          "Failed to parse Json: {}, error: {}",
+                          json,
+                          int(r));
+            }
+        }
+        num_tokens = r;
+        break;
+    }
+
+    int index = 0;
+    std::vector paths;
+    TravelJson(json, tokens, index, paths, offset);
+    free(tokens);
+}
+
+JsonKeyInvertedIndex::JsonKeyInvertedIndex(
+    const storage::FileManagerContext& ctx, bool is_load) {
+    schema_ = ctx.fieldDataMeta.field_schema;
+    field_id_ = ctx.fieldDataMeta.field_id;
+    mem_file_manager_ = std::make_shared(ctx);
+    disk_file_manager_ = std::make_shared(ctx);
+
+    if (is_load) {
+        auto prefix = disk_file_manager_->GetLocalJsonKeyIndexPrefix();
+        path_ = prefix;
+    } else {
+        auto prefix = disk_file_manager_->GetJsonKeyIndexIdentifier();
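TravelJson above walks the token stream depth-first and emits one posting per terminal token: objects recurse and extend the key path, strings and numeric primitives are recorded individually, and arrays are recorded as a single byte range with their elements skipped. Derived from that logic, a row like the following yields these postings:

    // {"k1": 5, "k2": {"k3": "abc"}, "k4": [1, 2]}
    //
    //   "/k1"    -> offset/length of `5`       (primitive token)
    //   "/k2.k3" -> offset/length of `abc`     (string; quotes excluded)
    //   "/k4"    -> offset/length of `[1, 2]`  (array indexed as a whole)
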
path_ = std::string(TMP_JSON_INVERTED_LOG_PREFIX) + prefix; + + boost::filesystem::create_directories(path_); + std::string field_name = + std::to_string(disk_file_manager_->GetFieldDataMeta().field_id); + d_type_ = TantivyDataType::Keyword; + wrapper_ = std::make_shared( + field_name.c_str(), d_type_, path_.c_str()); + } +} + +BinarySet +JsonKeyInvertedIndex::Upload(const Config& config) { + finish(); + boost::filesystem::path p(path_); + boost::filesystem::directory_iterator end_iter; + + for (boost::filesystem::directory_iterator iter(p); iter != end_iter; + iter++) { + if (boost::filesystem::is_directory(*iter)) { + LOG_WARN("{} is a directory", iter->path().string()); + } else { + LOG_INFO("trying to add json key inverted index log: {}", + iter->path().string()); + AssertInfo( + disk_file_manager_->AddJsonKeyIndexLog(iter->path().string()), + "failed to add json key inverted index log: {}", + iter->path().string()); + LOG_INFO("json key inverted index log: {} added", + iter->path().string()); + } + } + + BinarySet ret; + + auto remote_paths_to_size = disk_file_manager_->GetRemotePathsToFileSize(); + for (auto& file : remote_paths_to_size) { + ret.Append(file.first, nullptr, file.second); + } + + return ret; +} + +void +JsonKeyInvertedIndex::Load(milvus::tracer::TraceContext ctx, + const Config& config) { + auto index_files = + GetValueFromConfig>(config, "index_files"); + AssertInfo(index_files.has_value(), + "index file paths is empty when load json key index"); + + disk_file_manager_->CacheJsonKeyIndexToDisk(index_files.value()); + AssertInfo( + tantivy_index_exist(path_.c_str()), "index not exist: {}", path_); + wrapper_ = std::make_shared(path_.c_str()); + LOG_INFO("load json key index done for field id:{} with dir:{}", + field_id_, + path_); +} + +void +JsonKeyInvertedIndex::BuildWithFieldData( + const std::vector& field_datas) { + AssertInfo(schema_.data_type() == proto::schema::DataType::JSON, + "schema data type is {}", + schema_.data_type()); + if (schema_.nullable()) { + int64_t total = 0; + for (const auto& data : field_datas) { + total += data->get_null_count(); + } + null_offset.reserve(total); + } + int64_t offset = 0; + if (schema_.nullable()) { + for (const auto& data : field_datas) { + auto n = data->get_num_rows(); + for (int i = 0; i < n; i++) { + if (!data->is_valid(i)) { + null_offset.push_back(i); + } + AddJson(static_cast(data->RawValue(i)) + ->data() + .data(), + offset++); + } + } + } else { + for (const auto& data : field_datas) { + auto n = data->get_num_rows(); + for (int i = 0; i < n; i++) { + AddJson(static_cast(data->RawValue(i)) + ->data() + .data(), + offset++); + } + } + } + LOG_INFO("build json key index done for field id:{}", field_id_); +} + +} // namespace milvus::index diff --git a/internal/core/src/index/JsonKeyInvertedIndex.h b/internal/core/src/index/JsonKeyInvertedIndex.h new file mode 100644 index 0000000000000..90f8ba1f84db0 --- /dev/null +++ b/internal/core/src/index/JsonKeyInvertedIndex.h @@ -0,0 +1,93 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations under the License + +#pragma once + +#include +#include + +#include "index/InvertedIndexTantivy.h" +#include "common/jsmn.h" + +namespace milvus::index { + +using stdclock = std::chrono::high_resolution_clock; +class JsonKeyInvertedIndex : public InvertedIndexTantivy { + public: + explicit JsonKeyInvertedIndex(const storage::FileManagerContext& ctx, + bool is_load); + + ~JsonKeyInvertedIndex() override {}; + + public: + BinarySet + Upload(const Config& config) override; + + void + Load(milvus::tracer::TraceContext ctx, const Config& config) override; + + void + BuildWithFieldData(const std::vector& datas) override; + + const TargetBitmap + FilterByPath(const std::string& path, + std::function filter) { + TargetBitmap bitset(Count()); + auto array = wrapper_->term_query(path); + LOG_DEBUG("json key filter size:{}", array.array_.len); + + for (size_t j = 0; j < array.array_.len; j++) { + auto the_offset = array.array_.array[j]; + auto tuple = DecodeOffset(the_offset); + auto row_id = std::get<0>(tuple); + bitset[row_id] = filter( + std::get<0>(tuple), std::get<1>(tuple), std::get<2>(tuple)); + } + + return std::move(bitset); + } + + private: + void + AddJson(const char* json, int64_t offset); + + void + TravelJson(const char* json, + jsmntok* tokens, + int& index, + std::vector& path, + int32_t offset); + + void + AddInvertedRecord(const std::vector& paths, + uint32_t row_id, + uint16_t offset, + uint16_t length); + + int64_t + EncodeOffset(uint32_t row_id, uint16_t row_offset, uint16_t size) { + return static_cast(row_id) << 32 | + static_cast(row_offset) << 16 | + static_cast(size); + } + + std::tuple + DecodeOffset(int64_t encode_offset) { + uint32_t row_id = (encode_offset >> 32) & 0xFFFFFFFF; + uint16_t row_offset = (encode_offset >> 16) & 0xFFFF; + uint16_t size = encode_offset & 0xFFFF; + return std::make_tuple(row_id, row_offset, size); + } + + private: + int64_t field_id_; +}; +} // namespace milvus::index diff --git a/internal/core/src/indexbuilder/index_c.cpp b/internal/core/src/indexbuilder/index_c.cpp index df1cc9ae59c1a..8d3b74cda2943 100644 --- a/internal/core/src/indexbuilder/index_c.cpp +++ b/internal/core/src/indexbuilder/index_c.cpp @@ -34,6 +34,7 @@ #include "pb/index_cgo_msg.pb.h" #include "storage/Util.h" #include "index/Meta.h" +#include "index/JsonKeyInvertedIndex.h" using namespace milvus; CStatus @@ -237,6 +238,73 @@ CreateIndex(CIndex* res_index, } } +CStatus +BuildJsonKeyIndex(CBinarySet* c_binary_set, + const uint8_t* serialized_build_index_info, + const uint64_t len) { + try { + auto build_index_info = + std::make_unique(); + auto res = + build_index_info->ParseFromArray(serialized_build_index_info, len); + AssertInfo(res, "Unmarshall build index info failed"); + + auto field_type = + static_cast(build_index_info->field_schema().data_type()); + + auto storage_config = + get_storage_config(build_index_info->storage_config()); + auto config = get_config(build_index_info); + + // init file manager + milvus::storage::FieldDataMeta field_meta{ + build_index_info->collectionid(), + build_index_info->partitionid(), + build_index_info->segmentid(), + build_index_info->field_schema().fieldid(), + build_index_info->field_schema()}; + + milvus::storage::IndexMeta index_meta{ + build_index_info->segmentid(), + build_index_info->field_schema().fieldid(), + build_index_info->buildid(), + build_index_info->index_version(), + "", + build_index_info->field_schema().name(), + field_type, + 
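EncodeOffset above packs an entire posting into the single i64 payload that tantivy stores per term: the row id in the high 32 bits, the value's byte offset in the next 16, and its byte length in the low 16, which also means a value must start within the first 64 KiB of its document. A quick roundtrip of the packing:

    int64_t posting = (static_cast<int64_t>(7) << 32) |    // row_id = 7
                      (static_cast<int64_t>(130) << 16) |  // offset = 130
                      static_cast<int64_t>(3);             // length = 3
    uint32_t row_id = (posting >> 32) & 0xFFFFFFFF;  // 7
    uint16_t offset = (posting >> 16) & 0xFFFF;      // 130
    uint16_t length = posting & 0xFFFF;              // 3
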
build_index_info->dim(), + }; + auto chunk_manager = + milvus::storage::CreateChunkManager(storage_config); + + milvus::storage::FileManagerContext fileManagerContext( + field_meta, index_meta, chunk_manager); + + auto field_schema = + FieldMeta::ParseFrom(build_index_info->field_schema()); + auto index = + std::make_unique(fileManagerContext, false); + index->Build(config); + auto binary = + std::make_unique(index->Upload(config)); + *c_binary_set = binary.release(); + auto status = CStatus(); + status.error_code = Success; + status.error_msg = ""; + return status; + } catch (SegcoreError& e) { + auto status = CStatus(); + status.error_code = e.get_error_code(); + status.error_msg = strdup(e.what()); + return status; + } catch (std::exception& e) { + auto status = CStatus(); + status.error_code = UnexpectedError; + status.error_msg = strdup(e.what()); + return status; + } +} + CStatus BuildTextIndex(CBinarySet* c_binary_set, const uint8_t* serialized_build_index_info, diff --git a/internal/core/src/indexbuilder/index_c.h b/internal/core/src/indexbuilder/index_c.h index 6d26adc3442d9..c9b3e34d1f20b 100644 --- a/internal/core/src/indexbuilder/index_c.h +++ b/internal/core/src/indexbuilder/index_c.h @@ -35,6 +35,11 @@ CreateIndex(CIndex* res_index, CStatus DeleteIndex(CIndex index); +CStatus +BuildJsonKeyIndex(CBinarySet* c_binary_set, + const uint8_t* serialized_build_index_info, + const uint64_t len); + CStatus BuildTextIndex(CBinarySet* c_binary_set, const uint8_t* serialized_build_index_info, diff --git a/internal/core/src/mmap/ChunkedColumn.h b/internal/core/src/mmap/ChunkedColumn.h index a75f648460923..80ffabd9ce19e 100644 --- a/internal/core/src/mmap/ChunkedColumn.h +++ b/internal/core/src/mmap/ChunkedColumn.h @@ -149,6 +149,12 @@ class ChunkedColumnBase : public ColumnBase { "GetBatchBuffer only supported for VariableColumn"); } + virtual std::string_view + RawAt(const size_t i) const { + PanicInfo(ErrorCode::Unsupported, + "RawAt only supported for VariableColumn"); + } + virtual std::pair, FixedVector> StringViews(int64_t chunk_id) const { PanicInfo(ErrorCode::Unsupported, @@ -383,7 +389,7 @@ class ChunkedVariableColumn : public ChunkedColumnBase { } std::string_view - RawAt(const int i) const { + RawAt(const size_t i) const { return std::string_view((*this)[i]); } }; diff --git a/internal/core/src/mmap/Column.h b/internal/core/src/mmap/Column.h index d0e2ed3690f1e..c2ccdb57e455c 100644 --- a/internal/core/src/mmap/Column.h +++ b/internal/core/src/mmap/Column.h @@ -327,6 +327,12 @@ class SingleChunkColumnBase : public ColumnBase { "viewsbyoffsets only supported for VariableColumn"); } + virtual std::string_view + RawAt(const size_t i) const { + PanicInfo(ErrorCode::Unsupported, + "RawAt only supported for VariableColumn"); + } + virtual void AppendBatch(const FieldDataPtr data) override { size_t required_size = data_size_ + data->DataSize(); @@ -795,7 +801,7 @@ class SingleChunkVariableColumn : public SingleChunkColumnBase { } std::string_view - RawAt(const int i) const { + RawAt(const size_t i) const { return std::string_view((*this)[i]); } diff --git a/internal/core/src/segcore/ChunkedSegmentSealedImpl.h b/internal/core/src/segcore/ChunkedSegmentSealedImpl.h index 7f53f9d94a09d..b1fd793b9e0ef 100644 --- a/internal/core/src/segcore/ChunkedSegmentSealedImpl.h +++ b/internal/core/src/segcore/ChunkedSegmentSealedImpl.h @@ -97,6 +97,31 @@ class ChunkedSegmentSealedImpl : public SegmentSealed { void LoadTextIndex(FieldId field_id, std::unique_ptr index) override; + void + 
LoadJsonKeyIndex( + FieldId field_id, + std::unique_ptr index) override { + std::unique_lock lck(mutex_); + const auto& field_meta = schema_->operator[](field_id); + json_key_indexes_[field_id] = std::move(index); + } + + index::JsonKeyInvertedIndex* + GetJsonKeyIndex(FieldId field_id) const override { + std::shared_lock lck(mutex_); + auto iter = json_key_indexes_.find(field_id); + if (iter == json_key_indexes_.end()) { + return nullptr; + } + return iter->second.get(); + } + + std::pair + GetJsonData(FieldId field_id, size_t offset) const override { + auto column = fields_.at(field_id); + bool is_valid = column->IsValid(offset); + return std::make_pair(std::move(column->RawAt(offset)), is_valid); + } public: size_t @@ -391,6 +416,9 @@ class ChunkedSegmentSealedImpl : public SegmentSealed { // whether the segment is sorted by the pk bool is_sorted_by_pk_ = false; + // used for json expr optimization + std::unordered_map> + json_key_indexes_; }; } // namespace milvus::segcore diff --git a/internal/core/src/segcore/SegmentInterface.h b/internal/core/src/segcore/SegmentInterface.h index 90e34ce78a7b2..214003485668a 100644 --- a/internal/core/src/segcore/SegmentInterface.h +++ b/internal/core/src/segcore/SegmentInterface.h @@ -37,6 +37,7 @@ #include "index/SkipIndex.h" #include "mmap/Column.h" #include "index/TextMatchIndex.h" +#include "index/JsonKeyInvertedIndex.h" namespace milvus::segcore { diff --git a/internal/core/src/segcore/SegmentSealed.h b/internal/core/src/segcore/SegmentSealed.h index 5078fbc11a5c6..fd6cc13d0a4a9 100644 --- a/internal/core/src/segcore/SegmentSealed.h +++ b/internal/core/src/segcore/SegmentSealed.h @@ -55,6 +55,16 @@ class SegmentSealed : public SegmentInternalInterface { virtual InsertRecord& get_insert_record() = 0; + virtual void + LoadJsonKeyIndex(FieldId field_id, + std::unique_ptr index) = 0; + + virtual index::JsonKeyInvertedIndex* + GetJsonKeyIndex(FieldId field_id) const = 0; + + virtual std::pair + GetJsonData(FieldId field_id, size_t offset) const = 0; + SegmentType type() const override { return SegmentType::Sealed; diff --git a/internal/core/src/segcore/SegmentSealedImpl.cpp b/internal/core/src/segcore/SegmentSealedImpl.cpp index 1944ca1086337..5fba00ce94009 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.cpp +++ b/internal/core/src/segcore/SegmentSealedImpl.cpp @@ -2051,4 +2051,29 @@ SegmentSealedImpl::LoadTextIndex(FieldId field_id, text_indexes_[field_id] = std::move(index); } +void +SegmentSealedImpl::LoadJsonKeyIndex( + FieldId field_id, std::unique_ptr index) { + std::unique_lock lck(mutex_); + const auto& field_meta = schema_->operator[](field_id); + json_key_indexes_[field_id] = std::move(index); +} + +index::JsonKeyInvertedIndex* +SegmentSealedImpl::GetJsonKeyIndex(FieldId field_id) const { + std::shared_lock lck(mutex_); + auto iter = json_key_indexes_.find(field_id); + if (iter == json_key_indexes_.end()) { + return nullptr; + } + return iter->second.get(); +} + +std::pair +SegmentSealedImpl::GetJsonData(FieldId field_id, size_t offset) const { + auto column = fields_.at(field_id); + bool is_valid = column->IsValid(offset); + return std::make_pair(std::move(column->RawAt(offset)), is_valid); +} + } // namespace milvus::segcore diff --git a/internal/core/src/segcore/SegmentSealedImpl.h b/internal/core/src/segcore/SegmentSealedImpl.h index 03a33d014c9e5..020fca5fdf3ea 100644 --- a/internal/core/src/segcore/SegmentSealedImpl.h +++ b/internal/core/src/segcore/SegmentSealedImpl.h @@ -36,6 +36,7 @@ #include "common/Types.h" #include 
"common/IndexMeta.h" #include "index/TextMatchIndex.h" +#include "index/JsonKeyInvertedIndex.h" namespace milvus::segcore { @@ -100,6 +101,17 @@ class SegmentSealedImpl : public SegmentSealed { LoadTextIndex(FieldId field_id, std::unique_ptr index) override; + void + LoadJsonKeyIndex( + FieldId field_id, + std::unique_ptr index) override; + + index::JsonKeyInvertedIndex* + GetJsonKeyIndex(FieldId field_id) const override; + + std::pair + GetJsonData(FieldId field_id, size_t offset) const override; + public: size_t GetMemoryUsageInBytes() const override { @@ -397,6 +409,10 @@ class SegmentSealedImpl : public SegmentSealed { // whether the segment is sorted by the pk bool is_sorted_by_pk_ = false; + + // used for json expr optimization + std::unordered_map> + json_key_indexes_; }; inline SegmentSealedUPtr diff --git a/internal/core/src/segcore/segment_c.cpp b/internal/core/src/segcore/segment_c.cpp index fd7180d1ef184..09ab999810a14 100644 --- a/internal/core/src/segcore/segment_c.cpp +++ b/internal/core/src/segcore/segment_c.cpp @@ -479,6 +479,60 @@ LoadTextIndex(CSegmentInterface c_segment, } } +CStatus +LoadJsonKeyIndex(CTraceContext c_trace, + CSegmentInterface c_segment, + const uint8_t* serialized_load_json_key_index_info, + const uint64_t len) { + try { + auto ctx = milvus::tracer::TraceContext{ + c_trace.traceID, c_trace.spanID, c_trace.traceFlags}; + auto segment_interface = + reinterpret_cast(c_segment); + auto segment = + dynamic_cast(segment_interface); + AssertInfo(segment != nullptr, "segment conversion failed"); + + auto info_proto = + std::make_unique(); + info_proto->ParseFromArray(serialized_load_json_key_index_info, len); + + milvus::storage::FieldDataMeta field_meta{info_proto->collectionid(), + info_proto->partitionid(), + segment->get_segment_id(), + info_proto->fieldid(), + info_proto->schema()}; + milvus::storage::IndexMeta index_meta{segment->get_segment_id(), + info_proto->fieldid(), + info_proto->buildid(), + info_proto->version()}; + auto remote_chunk_manager = + milvus::storage::RemoteChunkManagerSingleton::GetInstance() + .GetRemoteChunkManager(); + + milvus::Config config; + std::vector files; + for (const auto& f : info_proto->files()) { + files.push_back(f); + } + config["index_files"] = files; + + milvus::storage::FileManagerContext file_ctx( + field_meta, index_meta, remote_chunk_manager); + + auto index = std::make_unique( + file_ctx, true); + index->Load(ctx, config); + + segment->LoadJsonKeyIndex(milvus::FieldId(info_proto->fieldid()), + std::move(index)); + + return milvus::SuccessCStatus(); + } catch (std::exception& e) { + return milvus::FailureCStatus(&e); + } +} + CStatus UpdateFieldRawDataSize(CSegmentInterface c_segment, int64_t field_id, diff --git a/internal/core/src/segcore/segment_c.h b/internal/core/src/segcore/segment_c.h index 80bb099886405..007f14f2cf37b 100644 --- a/internal/core/src/segcore/segment_c.h +++ b/internal/core/src/segcore/segment_c.h @@ -122,6 +122,12 @@ LoadTextIndex(CSegmentInterface c_segment, const uint8_t* serialized_load_text_index_info, const uint64_t len); +CStatus +LoadJsonKeyIndex(CTraceContext c_trace, + CSegmentInterface c_segment, + const uint8_t* serialied_load_json_key_index_info, + const uint64_t len); + CStatus UpdateFieldRawDataSize(CSegmentInterface c_segment, int64_t field_id, diff --git a/internal/core/src/storage/DiskFileManagerImpl.cpp b/internal/core/src/storage/DiskFileManagerImpl.cpp index e54fa6d748825..3b022ef76947a 100644 --- a/internal/core/src/storage/DiskFileManagerImpl.cpp +++ 
b/internal/core/src/storage/DiskFileManagerImpl.cpp @@ -79,8 +79,18 @@ DiskFileManagerImpl::GetRemoteTextLogPath(const std::string& file_name, return remote_prefix + "/" + file_name + "_" + std::to_string(slice_num); } +std::string +DiskFileManagerImpl::GetRemoteJsonKeyIndexPath(const std::string& file_name, + int64_t slice_num) { + auto remote_prefix = GetRemoteJsonKeyLogPrefix(); + return remote_prefix + "/" + file_name + "_" + std::to_string(slice_num); +} + bool -DiskFileManagerImpl::AddFile(const std::string& file) noexcept { +DiskFileManagerImpl::AddFileInternal( + const std::string& file, + const std::function& + get_remote_path) noexcept { auto local_chunk_manager = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); FILEMANAGER_TRY @@ -115,8 +125,7 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept { } auto batch_size = std::min(FILE_SLICE_SIZE, int64_t(fileSize) - offset); - batch_remote_files.emplace_back( - GetRemoteIndexPath(fileName, slice_num)); + batch_remote_files.emplace_back(get_remote_path(fileName, slice_num)); remote_file_sizes.emplace_back(batch_size); local_file_offsets.emplace_back(offset); offset += batch_size; @@ -132,56 +141,28 @@ DiskFileManagerImpl::AddFile(const std::string& file) noexcept { } // namespace knowhere bool -DiskFileManagerImpl::AddTextLog(const std::string& file) noexcept { - auto local_chunk_manager = - LocalChunkManagerSingleton::GetInstance().GetChunkManager(); - FILEMANAGER_TRY - if (!local_chunk_manager->Exist(file)) { - LOG_ERROR("local file {} not exists", file); - return false; - } - - // record local file path - local_paths_.emplace_back(file); - - auto fileName = GetFileName(file); - auto fileSize = local_chunk_manager->Size(file); - - std::vector batch_remote_files; - std::vector remote_file_sizes; - std::vector local_file_offsets; - - int slice_num = 0; - auto parallel_degree = - uint64_t(DEFAULT_FIELD_MAX_MEMORY_LIMIT / FILE_SLICE_SIZE); - for (int64_t offset = 0; offset < fileSize; slice_num++) { - if (batch_remote_files.size() >= parallel_degree) { - AddBatchIndexFiles(file, - local_file_offsets, - batch_remote_files, - - remote_file_sizes); - batch_remote_files.clear(); - remote_file_sizes.clear(); - local_file_offsets.clear(); - } +DiskFileManagerImpl::AddFile(const std::string& file) noexcept { + return AddFileInternal(file, + [this](const std::string& file_name, int slice_num) { + return GetRemoteIndexPath(file_name, slice_num); + }); +} - auto batch_size = std::min(FILE_SLICE_SIZE, int64_t(fileSize) - offset); - batch_remote_files.emplace_back( - GetRemoteTextLogPath(fileName, slice_num)); - remote_file_sizes.emplace_back(batch_size); - local_file_offsets.emplace_back(offset); - offset += batch_size; - } - if (batch_remote_files.size() > 0) { - AddBatchIndexFiles( - file, local_file_offsets, batch_remote_files, remote_file_sizes); - } - FILEMANAGER_CATCH - FILEMANAGER_END +bool +DiskFileManagerImpl::AddJsonKeyIndexLog(const std::string& file) noexcept { + return AddFileInternal( + file, [this](const std::string& file_name, int slice_num) { + return GetRemoteJsonKeyIndexPath(file_name, slice_num); + }); +} - return true; -} // namespace knowhere +bool +DiskFileManagerImpl::AddTextLog(const std::string& file) noexcept { + return AddFileInternal( + file, [this](const std::string& file_name, int slice_num) { + return GetRemoteTextLogPath(file_name, slice_num); + }); +} void DiskFileManagerImpl::AddBatchIndexFiles( @@ -236,8 +217,9 @@ DiskFileManagerImpl::AddBatchIndexFiles( } void 
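AddFileInternal above uploads one local file as a series of FILE_SLICE_SIZE-sized remote objects named "<remote_prefix>/<file_name>_<slice_num>", batching up to a parallel-degree's worth of slices per AddBatchIndexFiles call. The slicing arithmetic, isolated (sizes are illustrative):

    const int64_t kSliceSize = int64_t(16) << 20;  // stand-in for FILE_SLICE_SIZE
    int64_t file_size = int64_t(40) << 20;
    int slice_num = 0;
    for (int64_t offset = 0; offset < file_size; slice_num++) {
        int64_t batch = std::min(kSliceSize, file_size - offset);
        // upload bytes [offset, offset + batch) as "<file_name>_<slice_num>"
        offset += batch;
    }
    // 40 MiB at 16 MiB slices => 3 remote objects: 16 MiB, 16 MiB, 8 MiB
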
-DiskFileManagerImpl::CacheIndexToDisk( - const std::vector& remote_files) { +DiskFileManagerImpl::CacheIndexToDiskInternal( + const std::vector& remote_files, + const std::function& get_local_index_prefix) noexcept { auto local_chunk_manager = LocalChunkManagerSingleton::GetInstance().GetChunkManager(); @@ -263,7 +245,7 @@ DiskFileManagerImpl::CacheIndexToDisk( for (auto& slices : index_slices) { auto prefix = slices.first; auto local_index_file_name = - GetLocalIndexObjectPrefix() + + get_local_index_prefix() + prefix.substr(prefix.find_last_of('/') + 1); local_chunk_manager->CreateFile(local_index_file_name); auto file = @@ -304,57 +286,24 @@ DiskFileManagerImpl::CacheIndexToDisk( } void -DiskFileManagerImpl::CacheTextLogToDisk( +DiskFileManagerImpl::CacheIndexToDisk( const std::vector& remote_files) { - auto local_chunk_manager = - LocalChunkManagerSingleton::GetInstance().GetChunkManager(); - - std::map> index_slices; - for (auto& file_path : remote_files) { - auto pos = file_path.find_last_of("_"); - AssertInfo(pos > 0, "invalided index file path:{}", file_path); - try { - auto idx = std::stoi(file_path.substr(pos + 1)); - index_slices[file_path.substr(0, pos)].emplace_back(idx); - } catch (const std::logic_error& e) { - auto err_message = fmt::format( - "invalided text log path:{}, error:{}", file_path, e.what()); - LOG_ERROR(err_message); - throw std::logic_error(err_message); - } - } - - for (auto& slices : index_slices) { - std::sort(slices.second.begin(), slices.second.end()); - } - - for (auto& slices : index_slices) { - auto prefix = slices.first; - auto local_index_file_name = - GetLocalTextIndexPrefix() + "/" + - prefix.substr(prefix.find_last_of('/') + 1); - local_chunk_manager->CreateFile(local_index_file_name); - auto file = - File::Open(local_index_file_name, O_CREAT | O_RDWR | O_TRUNC); + return CacheIndexToDiskInternal( + remote_files, [this]() { return GetLocalIndexObjectPrefix(); }); +} - // Get the remote files - std::vector batch_remote_files; - batch_remote_files.reserve(slices.second.size()); - for (int& iter : slices.second) { - auto origin_file = prefix + "_" + std::to_string(iter); - batch_remote_files.push_back(origin_file); - } +void +DiskFileManagerImpl::CacheTextLogToDisk( + const std::vector& remote_files) { + return CacheIndexToDiskInternal( + remote_files, [this]() { return GetLocalTextIndexPrefix(); }); +} - auto index_chunks = GetObjectData(rcm_.get(), batch_remote_files); - for (auto& chunk : index_chunks) { - auto index_data = chunk.get()->GetFieldData(); - auto index_size = index_data->Size(); - auto chunk_data = reinterpret_cast( - const_cast(index_data->Data())); - file.Write(chunk_data, index_size); - } - local_paths_.emplace_back(local_index_file_name); - } +void +DiskFileManagerImpl::CacheJsonKeyIndexToDisk( + const std::vector& remote_files) { + return CacheIndexToDiskInternal( + remote_files, [this]() { return GetLocalJsonKeyIndexPrefix(); }); } void @@ -693,6 +642,12 @@ DiskFileManagerImpl::GetFileName(const std::string& localfile) { return localPath.filename().string(); } +std::string +DiskFileManagerImpl::GetIndexIdentifier() { + return GenIndexPathIdentifier(index_meta_.build_id, + index_meta_.index_version); +} + std::string DiskFileManagerImpl::GetLocalIndexObjectPrefix() { auto local_chunk_manager = @@ -701,6 +656,14 @@ DiskFileManagerImpl::GetLocalIndexObjectPrefix() { local_chunk_manager, index_meta_.build_id, index_meta_.index_version); } +std::string +DiskFileManagerImpl::GetTextIndexIdentifier() { + return 
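CacheIndexToDiskInternal above reverses the slicing: it groups remote objects by everything before the final "_<n>" suffix, sorts each group by slice number, and streams the slices back into one local file. The grouping step, isolated:

    // "a/b/index_file_12" -> prefix "a/b/index_file", slice 12
    std::map<std::string, std::vector<int>> index_slices;
    for (const std::string& file_path : remote_files) {
        auto pos = file_path.find_last_of('_');
        index_slices[file_path.substr(0, pos)].push_back(
            std::stoi(file_path.substr(pos + 1)));
    }
    for (auto& [prefix, slices] : index_slices) {
        std::sort(slices.begin(), slices.end());  // restore byte order
    }
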
std::to_string(index_meta_.build_id) + "/" + + std::to_string(index_meta_.index_version) + "/" + + std::to_string(field_meta_.segment_id) + "/" + + std::to_string(field_meta_.field_id); +} + std::string DiskFileManagerImpl::GetLocalTextIndexPrefix() { auto local_chunk_manager = @@ -713,17 +676,31 @@ DiskFileManagerImpl::GetLocalTextIndexPrefix() { } std::string -DiskFileManagerImpl::GetIndexIdentifier() { - return GenIndexPathIdentifier(index_meta_.build_id, - index_meta_.index_version); +DiskFileManagerImpl::GetJsonKeyIndexIdentifier() { + return GenJsonKeyIndexPathIdentifier(index_meta_.build_id, + index_meta_.index_version, + field_meta_.segment_id, + field_meta_.field_id); } std::string -DiskFileManagerImpl::GetTextIndexIdentifier() { - return std::to_string(index_meta_.build_id) + "/" + - std::to_string(index_meta_.index_version) + "/" + - std::to_string(field_meta_.segment_id) + - std::to_string(field_meta_.field_id); +DiskFileManagerImpl::GetLocalJsonKeyIndexPrefix() { + auto local_chunk_manager = + LocalChunkManagerSingleton::GetInstance().GetChunkManager(); + return GenJsonKeyIndexPathPrefix(local_chunk_manager, + index_meta_.build_id, + index_meta_.index_version, + field_meta_.segment_id, + field_meta_.field_id); +} + +std::string +DiskFileManagerImpl::GetRemoteJsonKeyLogPrefix() { + return GenJsonKeyIndexPathPrefix(rcm_, + index_meta_.build_id, + index_meta_.index_version, + field_meta_.segment_id, + field_meta_.field_id); } std::string diff --git a/internal/core/src/storage/DiskFileManagerImpl.h b/internal/core/src/storage/DiskFileManagerImpl.h index b2c87b1ff78db..aee9bca069b9e 100644 --- a/internal/core/src/storage/DiskFileManagerImpl.h +++ b/internal/core/src/storage/DiskFileManagerImpl.h @@ -51,27 +51,42 @@ class DiskFileManagerImpl : public FileManagerImpl { bool AddTextLog(const std::string& filename) noexcept; + bool + AddJsonKeyIndexLog(const std::string& filename) noexcept; + public: std::string GetName() const override { return "DiskFileManagerImpl"; } + std::string + GetIndexIdentifier(); + std::string GetLocalIndexObjectPrefix(); + // Different from user index, a text index task may have multiple text fields sharing same build_id/task_id. So + // segment_id and field_id are required to identify a unique text index, in case that we support multiple index task + // in the same indexnode at the same time later. + std::string + GetTextIndexIdentifier(); + // Similar to GetTextIndexIdentifier, segment_id and field_id is also required. std::string GetLocalTextIndexPrefix(); + // Used for building index, using this index identifier mode to construct tmp building-index dir. std::string - GetIndexIdentifier(); + GetJsonKeyIndexIdentifier(); - // Different from user index, a text index task may have multiple text fields sharing same build_id/task_id. So - // segment_id and field_id are required to identify a unique text index, in case that we support multiple index task - // in the same indexnode at the same time later. + // Used for loading index, using this index prefix dir to store index. 
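For concreteness, assuming the json-key log root constant resolves to "json_key_index_log" and an illustrative local root of "/var/lib/milvus", the prefix for build 1000, index version 1, segment 3, field 101 composes as:

// Illustrative only; the real code goes through GenJsonKeyIndexPathPrefix.
std::string prefix = std::string("/var/lib/milvus") + "/json_key_index_log/" +
                     std::to_string(1000) + "/" +  // build_id
                     std::to_string(1) + "/" +     // index_version
                     std::to_string(3) + "/" +     // segment_id
                     std::to_string(101) + "/";    // field_id
// -> "/var/lib/milvus/json_key_index_log/1000/1/3/101/"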
std::string - GetTextIndexIdentifier(); + GetLocalJsonKeyIndexPrefix(); + + // Used for upload index to remote storage, using this index prefix dir as remote storage directory + std::string + GetRemoteJsonKeyLogPrefix(); std::string GetLocalRawDataObjectPrefix(); @@ -92,6 +107,9 @@ class DiskFileManagerImpl : public FileManagerImpl { void CacheTextLogToDisk(const std::vector& remote_files); + void + CacheJsonKeyIndexToDisk(const std::vector& remote_files); + void AddBatchIndexFiles(const std::string& local_file_name, const std::vector& local_file_offsets, @@ -125,6 +143,19 @@ class DiskFileManagerImpl : public FileManagerImpl { std::string GetRemoteTextLogPath(const std::string& file_name, int64_t slice_num) const; + std::string + GetRemoteJsonKeyIndexPath(const std::string& file_name, int64_t slice_num); + + bool + AddFileInternal(const std::string& file_name, + const std::function& + get_remote_path) noexcept; + + void + CacheIndexToDiskInternal( + const std::vector& remote_files, + const std::function& get_local_index_prefix) noexcept; + private: // local file path (abs path) std::vector local_paths_; diff --git a/internal/core/src/storage/Util.cpp b/internal/core/src/storage/Util.cpp index 5e137207722f7..babd929b48775 100644 --- a/internal/core/src/storage/Util.cpp +++ b/internal/core/src/storage/Util.cpp @@ -522,6 +522,28 @@ GenTextIndexPathPrefix(ChunkManagerPtr cm, return (prefix / path / path1).string(); } +std::string +GenJsonKeyIndexPathIdentifier(int64_t build_id, + int64_t index_version, + int64_t segment_id, + int64_t field_id) { + return std::to_string(build_id) + "/" + std::to_string(index_version) + + "/" + std::to_string(segment_id) + "/" + std::to_string(field_id) + + "/"; +} + +std::string +GenJsonKeyIndexPathPrefix(ChunkManagerPtr cm, + int64_t build_id, + int64_t index_version, + int64_t segment_id, + int64_t field_id) { + return cm->GetRootPath() + "/" + std::string(JSON_KEY_INDEX_LOG_ROOT_PATH) + + "/" + + GenJsonKeyIndexPathIdentifier( + build_id, index_version, segment_id, field_id); +} + std::string GetIndexPathPrefixWithBuildID(ChunkManagerPtr cm, int64_t build_id) { boost::filesystem::path prefix = cm->GetRootPath(); diff --git a/internal/core/src/storage/Util.h b/internal/core/src/storage/Util.h index b3a6a124fbe70..638ad9b08de5b 100644 --- a/internal/core/src/storage/Util.h +++ b/internal/core/src/storage/Util.h @@ -92,6 +92,19 @@ GenTextIndexPathPrefix(ChunkManagerPtr cm, int64_t segment_id, int64_t field_id); +std::string +GenJsonKeyIndexPathIdentifier(int64_t build_id, + int64_t index_version, + int64_t segment_id, + int64_t field_id); + +std::string +GenJsonKeyIndexPathPrefix(ChunkManagerPtr cm, + int64_t build_id, + int64_t index_version, + int64_t segment_id, + int64_t field_id); + std::string GenFieldRawDataPathPrefix(ChunkManagerPtr cm, int64_t segment_id, diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h index 61a2088a2d26f..2f9890ece2e04 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h +++ b/internal/core/thirdparty/tantivy/tantivy-binding/include/tantivy-binding.h @@ -15,7 +15,7 @@ enum class TantivyDataType : uint8_t { }; struct RustArray { - uint32_t *array; + int64_t *array; size_t len; size_t cap; }; diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs index 8584907a38308..c5e03c34e4103 100644 --- 
a/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/array.rs @@ -13,13 +13,13 @@ use crate::util::free_binding; #[repr(C)] pub struct RustArray { - array: *mut u32, + array: *mut i64, len: size_t, cap: size_t, } impl RustArray { - pub fn from_vec(vec: Vec) -> RustArray { + pub fn from_vec(vec: Vec) -> RustArray { let len = vec.len(); let cap = vec.capacity(); let v = vec.leak(); @@ -41,8 +41,8 @@ impl std::default::Default for RustArray { } } -impl From> for RustArray { - fn from(vec: Vec) -> Self { +impl From> for RustArray { + fn from(vec: Vec) -> Self { RustArray::from_vec(vec) } } @@ -75,7 +75,7 @@ macro_rules! impl_from_for_enum { }; } -impl_from_for_enum!(Value, None => (), RustArray => RustArray, RustArray => Vec, U32 => u32, Ptr => *mut c_void); +impl_from_for_enum!(Value, None => (), RustArray => RustArray, RustArray => Vec, U32 => u32, Ptr => *mut c_void); #[repr(C)] pub struct RustResult { diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs index 95d585b436d16..f04aa5b34adfd 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/docid_collector.rs @@ -7,7 +7,7 @@ use tantivy::{ pub(crate) struct DocIdCollector; impl Collector for DocIdCollector { - type Fruit = Vec; + type Fruit = Vec; type Child = DocIdChildCollector; fn for_segment( @@ -41,16 +41,16 @@ impl Collector for DocIdCollector { } pub(crate) struct DocIdChildCollector { - docs: Vec, + docs: Vec, column: Column, } impl SegmentCollector for DocIdChildCollector { - type Fruit = Vec; + type Fruit = Vec; fn collect(&mut self, doc: DocId, _score: Score) { self.column.values_for_doc(doc).for_each(|doc_id| { - self.docs.push(doc_id as u32); + self.docs.push(doc_id); }) } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs index cb5f989070d10..d92b5bba4bd6c 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader.rs @@ -67,7 +67,7 @@ impl IndexReaderWrapper { Ok(sum) } - pub(crate) fn search(&self, q: &dyn Query) -> Result> { + pub(crate) fn search(&self, q: &dyn Query) -> Result> { let searcher = self.reader.searcher(); match self.id_field { Some(_) => { @@ -85,7 +85,7 @@ impl IndexReaderWrapper { } } - pub fn term_query_i64(&self, term: i64) -> Result> { + pub fn term_query_i64(&self, term: i64) -> Result> { let q = TermQuery::new( Term::from_field_i64(self.field, term), IndexRecordOption::Basic, @@ -97,7 +97,7 @@ impl IndexReaderWrapper { &self, lower_bound: i64, inclusive: bool, - ) -> Result> { + ) -> Result> { let q = RangeQuery::new_i64_bounds( self.field_name.to_string(), make_bounds(lower_bound, inclusive), @@ -110,7 +110,7 @@ impl IndexReaderWrapper { &self, upper_bound: i64, inclusive: bool, - ) -> Result> { + ) -> Result> { let q = RangeQuery::new_i64_bounds( self.field_name.to_string(), Bound::Unbounded, @@ -125,14 +125,14 @@ impl IndexReaderWrapper { upper_bound: i64, lb_inclusive: bool, ub_inclusive: bool, - ) -> Result> { + ) -> Result> { let lb = make_bounds(lower_bound, lb_inclusive); let ub = make_bounds(upper_bound, ub_inclusive); let q = RangeQuery::new_i64_bounds(self.field_name.to_string(), lb, ub); self.search(&q) } - pub fn term_query_f64(&self, 
term: f64) -> Result> { + pub fn term_query_f64(&self, term: f64) -> Result> { let q = TermQuery::new( Term::from_field_f64(self.field, term), IndexRecordOption::Basic, @@ -144,7 +144,7 @@ impl IndexReaderWrapper { &self, lower_bound: f64, inclusive: bool, - ) -> Result> { + ) -> Result> { let q = RangeQuery::new_f64_bounds( self.field_name.to_string(), make_bounds(lower_bound, inclusive), @@ -157,7 +157,7 @@ impl IndexReaderWrapper { &self, upper_bound: f64, inclusive: bool, - ) -> Result> { + ) -> Result> { let q = RangeQuery::new_f64_bounds( self.field_name.to_string(), Bound::Unbounded, @@ -172,14 +172,14 @@ impl IndexReaderWrapper { upper_bound: f64, lb_inclusive: bool, ub_inclusive: bool, - ) -> Result> { + ) -> Result> { let lb = make_bounds(lower_bound, lb_inclusive); let ub = make_bounds(upper_bound, ub_inclusive); let q = RangeQuery::new_f64_bounds(self.field_name.to_string(), lb, ub); self.search(&q) } - pub fn term_query_bool(&self, term: bool) -> Result> { + pub fn term_query_bool(&self, term: bool) -> Result> { let q = TermQuery::new( Term::from_field_bool(self.field, term), IndexRecordOption::Basic, @@ -187,7 +187,7 @@ impl IndexReaderWrapper { self.search(&q) } - pub fn term_query_keyword(&self, term: &str) -> Result> { + pub fn term_query_keyword(&self, term: &str) -> Result> { let q = TermQuery::new( Term::from_field_text(self.field, term), IndexRecordOption::Basic, @@ -199,7 +199,7 @@ impl IndexReaderWrapper { &self, lower_bound: &str, inclusive: bool, - ) -> Result> { + ) -> Result> { let q = RangeQuery::new_str_bounds( self.field_name.to_string(), make_bounds(lower_bound, inclusive), @@ -212,7 +212,7 @@ impl IndexReaderWrapper { &self, upper_bound: &str, inclusive: bool, - ) -> Result> { + ) -> Result> { let q = RangeQuery::new_str_bounds( self.field_name.to_string(), Bound::Unbounded, @@ -227,20 +227,20 @@ impl IndexReaderWrapper { upper_bound: &str, lb_inclusive: bool, ub_inclusive: bool, - ) -> Result> { + ) -> Result> { let lb = make_bounds(lower_bound, lb_inclusive); let ub = make_bounds(upper_bound, ub_inclusive); let q = RangeQuery::new_str_bounds(self.field_name.to_string(), lb, ub); self.search(&q) } - pub fn prefix_query_keyword(&self, prefix: &str) -> Result> { + pub fn prefix_query_keyword(&self, prefix: &str) -> Result> { let escaped = regex::escape(prefix); let pattern = format!("{}(.|\n)*", escaped); self.regex_query(&pattern) } - pub fn regex_query(&self, pattern: &str) -> Result> { + pub fn regex_query(&self, pattern: &str) -> Result> { let q = RegexQuery::from_pattern(&pattern, self.field)?; self.search(&q) } diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs index f83df709c4098..52d78450febf9 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/index_reader_text.rs @@ -10,7 +10,7 @@ use crate::{index_reader::IndexReaderWrapper, tokenizer::standard_analyzer}; impl IndexReaderWrapper { // split the query string into multiple tokens using index's default tokenizer, // and then execute the disconjunction of term query. - pub(crate) fn match_query(&self, q: &str) -> Result> { + pub(crate) fn match_query(&self, q: &str) -> Result> { // clone the tokenizer to make `match_query` thread-safe. 
let mut tokenizer = self .index diff --git a/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs b/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs index 73299f24779e0..64d1c558205c6 100644 --- a/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs +++ b/internal/core/thirdparty/tantivy/tantivy-binding/src/vec_collector.rs @@ -7,7 +7,7 @@ use tantivy::{ pub struct VecCollector; impl Collector for VecCollector { - type Fruit = Vec; + type Fruit = Vec; type Child = VecChildCollector; @@ -23,7 +23,7 @@ impl Collector for VecCollector { false } - fn merge_fruits(&self, segment_fruits: Vec>) -> tantivy::Result> { + fn merge_fruits(&self, segment_fruits: Vec>) -> tantivy::Result> { if segment_fruits.len() == 1 { Ok(segment_fruits.into_iter().next().unwrap()) } else { @@ -44,14 +44,14 @@ impl Collector for VecCollector { } pub struct VecChildCollector { - docs: Vec, + docs: Vec, } impl SegmentCollector for VecChildCollector { - type Fruit = Vec; + type Fruit = Vec; fn collect(&mut self, doc: DocId, _score: tantivy::Score) { - self.docs.push(doc); + self.docs.push(doc as i64); } fn harvest(self) -> Self::Fruit { diff --git a/internal/core/unittest/CMakeLists.txt b/internal/core/unittest/CMakeLists.txt index 0fa63b45c264b..c0c12ade0176c 100644 --- a/internal/core/unittest/CMakeLists.txt +++ b/internal/core/unittest/CMakeLists.txt @@ -90,6 +90,7 @@ set(MILVUS_TEST_FILES test_chunked_column.cpp test_rust_result.cpp test_cached_search_iterator.cpp + test_json_key_index.cpp ) if ( INDEX_ENGINE STREQUAL "cardinal" ) diff --git a/internal/core/unittest/test_c_api.cpp b/internal/core/unittest/test_c_api.cpp index 61a2d86cde29e..df8365f3da28d 100644 --- a/internal/core/unittest/test_c_api.cpp +++ b/internal/core/unittest/test_c_api.cpp @@ -46,6 +46,7 @@ #include "segcore/load_index_c.h" #include "test_utils/c_api_test_utils.h" #include "segcore/vector_index_c.h" +#include "common/jsmn.h" namespace chrono = std::chrono; @@ -5281,3 +5282,386 @@ TEST(CApiTest, RANGE_SEARCH_WITH_RADIUS_AND_RANGE_FILTER_WHEN_IP_BFLOAT16) { TEST(CApiTest, IsLoadWithDisk) { ASSERT_TRUE(IsLoadWithDisk(INVERTED_INDEX_TYPE, 0)); } + +// 1000 keys +std::string +GenerateJson(int N) { + std::vector data(N); + std::default_random_engine er(67); + std::normal_distribution<> distr(0, 1); + std::vector keys; + for (int i = 0; i < N; i++) { + keys.push_back("keys" + std::to_string(i)); + } + std::string json_string; + std::vector values(N); + for (int i = 0; i < N; i++) { + if (i % 7 == 0 || i % 7 == 4) { + values[i] = std::to_string(er()); + } else if (i % 7 == 1 || i % 7 == 5) { + values[i] = std::to_string(static_cast(er())); + } else if (i % 7 == 2 || i % 7 == 6) { + values[i] = er() / 2 == 0 ? 
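Net effect of the Rust changes above: tantivy's per-segment u32 DocId is widened to int64_t at the FFI boundary so hit lists can carry 64-bit row offsets without truncation. A hypothetical C++ consumer; FreeRustArray stands in for whatever deallocator the binding actually exposes:

#include <cstddef>
#include <cstdint>
#include <vector>

struct RustArray {        // mirrors the struct in tantivy-binding.h
    int64_t* array;
    size_t len;
    size_t cap;
};

void FreeRustArray(RustArray arr);  // assumed binding function

std::vector<int64_t> CollectHits(RustArray hits) {
    // array points at int64_t after this patch, so offsets > UINT32_MAX survive
    std::vector<int64_t> rows(hits.array, hits.array + hits.len);
    FreeRustArray(hits);  // ownership crossed the FFI boundary
    return rows;
}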
"true" : "false"; + } else if (i % 7 == 3) { + values[i] = "\"xxxx" + std::to_string(i) + "\""; + // } else if (i % 7 == 4) { + // std::vector intvec(10); + // for (int j = 0; j < 10; j++) { + // intvec[j] = std::to_string(i + j); + // } + // values[i] = "[" + join(intvec, ",") + "]"; + // } else if (i % 7 == 5) { + // std::vector doublevec(10); + // for (int j = 0; j < 10; j++) { + // doublevec[j] = + // std::to_string(static_cast(i + j + er())); + // } + // values[i] = "[" + join(doublevec, ",") + "]"; + // } else if (i % 7 == 6) { + // std::vector stringvec(10); + // for (int j = 0; j < 10; j++) { + // stringvec[j] = "\"xxx" + std::to_string(j) + "\""; + // } + // values[i] = "[" + join(stringvec, ",") + "]"; + } + } + json_string += "{"; + for (int i = 0; i < N - 1; i++) { + json_string += R"(")" + keys[i] + R"(":)" + values[i] + R"(,)"; + } + json_string += R"(")" + keys[N - 1] + R"(":)" + values[N - 1]; + json_string += "}"; + return json_string; +} + +void +ParseJson(const std::string& json) { + jsmn_parser p; + jsmntok_t t[2002]; + + jsmn_init(&p); + int r = jsmn_parse( + &p, json.c_str(), strlen(json.c_str()), t, sizeof(t) / sizeof(t[0])); + if (r < 0) { + printf("Failed to parse JSON: %d\n", r); + return; + } + if (r < 1 || t[0].type != JSMN_OBJECT) { + printf("Object expected\n"); + return; + } + //std::cout << r << std::endl; +} + +TEST(CApiTest, test_parse_perform) { + for (int i = 0; i < 10000; i++) { + { + int64_t all_cost = 0; + for (int j = 0; j < 10000; j++) { + auto json_string = GenerateJson(1000); + if (j == 0) { + std::cout << json_string.size() << std::endl; + } + //std::cout << json_string << std::endl; + auto start = std::chrono::steady_clock::now(); + ParseJson(json_string); + all_cost += + std::chrono::duration_cast( + std::chrono::steady_clock::now() - start) + .count(); + } + std::cout << "cost: " << all_cost << "us" << std::endl; + } + { + int64_t all_cost = 0; + for (int j = 0; j < 10000; j++) { + auto json_string = GenerateJson(100); + if (j == 0) { + std::cout << json_string.size() << std::endl; + } + //std::cout << json_string << std::endl; + auto start = std::chrono::steady_clock::now(); + ParseJson(json_string); + all_cost += + std::chrono::duration_cast( + std::chrono::steady_clock::now() - start) + .count(); + } + std::cout << "cost: " << all_cost << "us" << std::endl; + } + { + int64_t all_cost = 0; + for (int j = 0; j < 10000; j++) { + auto json_string = GenerateJson(50); + if (j == 0) { + std::cout << json_string.size() << std::endl; + } + auto start = std::chrono::steady_clock::now(); + ParseJson(json_string); + all_cost += + std::chrono::duration_cast( + std::chrono::steady_clock::now() - start) + .count(); + } + std::cout << "cost: " << all_cost << "us" << std::endl; + } + } +} + +void +extract_key_value_pairs(const char* json, size_t len) { + jsmn_parser parser; + jsmntok_t* tokens = + (jsmntok_t*)malloc(16 * sizeof(jsmntok_t)); // Initial allocation + if (!tokens) { + fprintf(stderr, "Memory allocation failed\n"); + return; + } + int num_tokens = 0; + int token_capacity = 16; + + // Initialize the parser + jsmn_init(&parser); + + size_t pos = 0; + while (pos < len) { + size_t chunk_size = + len - pos > 256 ? 
256 : len - pos; // Read in chunks of 256 bytes + int r = + jsmn_parse(&parser, json + pos, chunk_size, tokens, token_capacity); + if (r < 0) { + if (r == JSMN_ERROR_NOMEM) { + // Reallocate tokens array if not enough space + token_capacity *= 2; // Double the capacity + tokens = (jsmntok_t*)realloc( + tokens, token_capacity * sizeof(jsmntok_t)); + if (!tokens) { + fprintf(stderr, "Memory reallocation failed\n"); + return; + } + continue; // Try parsing again + } else { + fprintf(stderr, "Failed to parse JSON: %d\n", r); + free(tokens); + return; + } + } + + // Update the position + pos += chunk_size; + } + + // Iterate through the tokens + for (int i = 0; i < parser.toknext; i++) { + if (tokens[i].type == JSMN_OBJECT) { + for (int j = 0; j < tokens[i].size; j++) { + // The next token is the key (string) + j++; + printf("Key: %.*s\n", + tokens[j].end - tokens[j].start, + json + tokens[j].start); + + // The next token is the value + j++; + printf("Value: %.*s\n", + tokens[j].end - tokens[j].start, + json + tokens[j].start); + } + } + } + + // Clean up + free(tokens); +} + +void +TravelJson(const char* json, + jsmntok* tokens, + int& index, + std::vector& path) { + jsmntok current = tokens[0]; + if (current.type == JSMN_OBJECT) { + int j = 1; + for (int i = 0; i < current.size; i++) { + assert(tokens[j].type == JSMN_STRING && tokens[j].size != 0); + std::string key(json + tokens[j].start, + tokens[j].end - tokens[j].start); + path.push_back(key); + j++; + int consumed = 0; + TravelJson(json, tokens + j, consumed, path); + path.pop_back(); + j += consumed; + } + index = j; + } else if (current.type == JSMN_PRIMITIVE) { + std::cout << "key:" << Join(path, ".") << "values:" + << std::string(json + current.start, + current.end - current.start) + << std::endl; + index++; + } else if (current.type == JSMN_ARRAY) { + std::cout << "key:" << Join(path, ".") << "values:" + << std::string(json + current.start, + current.end - current.start) + << std::endl; + // skip next array parse + int count = current.size; + int j = 1; + while (count > 0) { + if (tokens[j].size == 0) { + count--; + } else { + count += tokens[j].size; + } + j++; + } + index = j; + + } else if (current.type == JSMN_STRING) { + if (current.size == 0) { + std::cout << "key:" << Join(path, ".") << " values:" + << std::string(json + current.start, + current.end - current.start) + << std::endl; + index++; + } else { + throw std::runtime_error("not should happen"); + } + } else { + throw std::runtime_error("not should happen"); + } +} + +void +extract_key_value_pairs(const char* json) { + jsmn_parser parser; + jsmntok_t* tokens = + (jsmntok_t*)malloc(16 * sizeof(jsmntok_t)); // Initial allocation + if (!tokens) { + fprintf(stderr, "Memory allocation failed\n"); + return; + } + int num_tokens = 0; + int token_capacity = 16; + + // Initialize the parser + jsmn_init(&parser); + + // Parse the JSON string + while (1) { + int r = jsmn_parse(&parser, json, strlen(json), tokens, token_capacity); + if (r < 0) { + if (r == JSMN_ERROR_NOMEM) { + // Reallocate tokens array if not enough space + token_capacity *= 2; // Double the capacity + tokens = (jsmntok_t*)realloc( + tokens, token_capacity * sizeof(jsmntok_t)); + if (!tokens) { + fprintf(stderr, "Memory reallocation failed\n"); + return; + } + continue; // Try parsing again + } else { + fprintf(stderr, "Failed to parse JSON: %d\n", r); + free(tokens); + return; + } + } + num_tokens = r; + break; // Exit the loop if parsing was successful + } + + std::cout << "num_tokens:" << num_tokens << 
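For reference, TravelJson does a depth-first walk that keeps the key path on a stack and emits one line per leaf, treating a whole array as a single value. Assuming Join concatenates path segments with ".", a sample input and its output:

// With json = R"({"a": {"b": 1}, "c": [2, 3]})" the walk prints one line
// per leaf (note the primitive/array branches currently omit the space
// before "values:"):
//   key:a.bvalues:1
//   key:cvalues:[2, 3]
// Arrays end the descent: their whole token range is counted and skipped,
// so nested elements are emitted as one value rather than one per element.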
std::endl; + // Iterate through the tokens + for (int i = 0; i < num_tokens; i++) { + std::cout << "i:" << i << "type: " << tokens[i].type + << "token size:" << tokens[i].size << std::endl; + printf("value: %.*s\n", + tokens[i].end - tokens[i].start, + json + tokens[i].start); + } + + std::cout << "-----------------" << std::endl; + int index = 0; + std::vector path; + TravelJson(json, tokens, index, path); + + // Clean up + free(tokens); +} + +void +extract_json(const char* json) { + jsmn_parser parser; + jsmntok_t* tokens = + (jsmntok_t*)malloc(16 * sizeof(jsmntok_t)); // Initial allocation + if (!tokens) { + fprintf(stderr, "Memory allocation failed\n"); + return; + } + int num_tokens = 0; + int token_capacity = 16; + + // Initialize the parser + jsmn_init(&parser); + + // Parse the JSON string + while (1) { + int r = jsmn_parse(&parser, json, strlen(json), tokens, token_capacity); + if (r < 0) { + if (r == JSMN_ERROR_NOMEM) { + // Reallocate tokens array if not enough space + token_capacity *= 2; // Double the capacity + tokens = (jsmntok_t*)realloc( + tokens, token_capacity * sizeof(jsmntok_t)); + if (!tokens) { + fprintf(stderr, "Memory reallocation failed\n"); + return; + } + continue; // Try parsing again + } else { + fprintf(stderr, "Failed to parse JSON: %d\n", r); + free(tokens); + return; + } + } + num_tokens = r; + break; // Exit the loop if parsing was successful + } + + // assert(tokens[0].type == JSMN_OBJECT); + + // Iterate through the tokens + for (int i = 0; i < num_tokens; i++) { + std::cout << "i:" << i << "type: " << tokens[i].type + << "token size:" << tokens[i].size << std::endl; + printf("value: %.*s\n", + tokens[i].end - tokens[i].start, + json + tokens[i].start); + } + + // Clean up + free(tokens); +} + +TEST(CApiTest, test_jsmn_function) { + int64_t all_cost = 0; + // auto json_string = GenerateJson(50); + // std::cout << json_string << std::endl; + // extract_key_value_pairs(json_string.c_str()); + + std::string json_string = + R"({"keys0": ["value0", 234, "values1"], "keys1": ["value3", 1235]})"; + std::cout << json_string << std::endl; + extract_key_value_pairs(json_string.c_str()); + + json_string = + R"({"keys0": [{"keys1": 1234, "keys2": "xxx"}, {"keys3": 567, "keys4": "xxxxx"}]})"; + std::cout << json_string << std::endl; + extract_key_value_pairs(json_string.c_str()); + + json_string = R"({"keys0": {"keys1": { "keys2": "xxx", "keys3" :1234}}})"; + std::cout << json_string << std::endl; + extract_key_value_pairs(json_string.c_str()); +} diff --git a/internal/core/unittest/test_json_key_index.cpp b/internal/core/unittest/test_json_key_index.cpp new file mode 100644 index 0000000000000..137718eb2f794 --- /dev/null +++ b/internal/core/unittest/test_json_key_index.cpp @@ -0,0 +1,222 @@ +// Copyright(C) 2019 - 2020 Zilliz.All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations under the License + +#include +#include +#include +#include +#include + +#include "common/Tracer.h" +#include "index/BitmapIndex.h" +#include "storage/Util.h" +#include "storage/InsertData.h" +#include "indexbuilder/IndexFactory.h" +#include "index/IndexFactory.h" +#include "test_utils/indexbuilder_test_utils.h" +#include "index/Meta.h" +#include "index/JsonKeyInvertedIndex.h" + +using namespace milvus::index; +using namespace milvus::indexbuilder; +using namespace milvus; +using namespace milvus::index; + +// 1000 keys +static std::string +GenerateJson(int N) { + std::vector data(N); + std::default_random_engine er(67); + std::normal_distribution<> distr(0, 1); + std::vector keys; + for (int i = 0; i < N; i++) { + keys.push_back("keys" + std::to_string(i)); + } + std::string json_string; + std::vector values(N); + for (int i = 0; i < N; i++) { + if (i % 7 == 0 || i % 7 == 4) { + values[i] = std::to_string(er()); + } else if (i % 7 == 1 || i % 7 == 5) { + values[i] = std::to_string(static_cast(er())); + } else if (i % 7 == 2 || i % 7 == 6) { + values[i] = er() / 2 == 0 ? "true" : "false"; + } else if (i % 7 == 3) { + values[i] = "\"xxxx" + std::to_string(i) + "\""; + // } else if (i % 7 == 4) { + // std::vector intvec(10); + // for (int j = 0; j < 10; j++) { + // intvec[j] = std::to_string(i + j); + // } + // values[i] = "[" + join(intvec, ",") + "]"; + // } else if (i % 7 == 5) { + // std::vector doublevec(10); + // for (int j = 0; j < 10; j++) { + // doublevec[j] = + // std::to_string(static_cast(i + j + er())); + // } + // values[i] = "[" + join(doublevec, ",") + "]"; + // } else if (i % 7 == 6) { + // std::vector stringvec(10); + // for (int j = 0; j < 10; j++) { + // stringvec[j] = "\"xxx" + std::to_string(j) + "\""; + // } + // values[i] = "[" + join(stringvec, ",") + "]"; + } + } + json_string += "{"; + for (int i = 0; i < N - 1; i++) { + json_string += R"(")" + keys[i] + R"(":)" + values[i] + R"(,)"; + } + json_string += R"(")" + keys[N - 1] + R"(":)" + values[N - 1]; + json_string += "}"; + return json_string; +} + +static std::vector +GenerateJsons(int size, int dim) { + std::vector jsons; + for (int i = 0; i < size; ++i) { + jsons.push_back( + milvus::Json(simdjson::padded_string(GenerateJson(dim)))); + } + return jsons; +} + +class JsonKeyIndexTest : public testing::Test { + protected: + void + Init(int64_t collection_id, + int64_t partition_id, + int64_t segment_id, + int64_t field_id, + int64_t index_build_id, + int64_t index_version) { + proto::schema::FieldSchema field_schema; + field_schema.set_data_type(proto::schema::DataType::JSON); + + auto field_meta = storage::FieldDataMeta{ + collection_id, partition_id, segment_id, field_id, field_schema}; + auto index_meta = storage::IndexMeta{ + segment_id, field_id, index_build_id, index_version}; + + data_ = std::move(GenerateJsons(10000, 100)); + auto field_data = storage::CreateFieldData(DataType::JSON); + field_data->FillFieldData(data_.data(), data_.size()); + storage::InsertData insert_data(field_data); + insert_data.SetFieldDataMeta(field_meta); + insert_data.SetTimestamps(0, 100); + + auto serialized_bytes = insert_data.Serialize(storage::Remote); + + auto log_path = fmt::format("/{}/{}/{}/{}/{}/{}", + "/tmp/test-jsonkey-index/", + collection_id, + partition_id, + segment_id, + field_id, + 0); + chunk_manager_->Write( + log_path, serialized_bytes.data(), serialized_bytes.size()); + + storage::FileManagerContext ctx(field_meta, index_meta, 
chunk_manager_); + std::vector<std::string> index_files; + + Config config; + config["insert_files"] = std::vector<std::string>{log_path}; + + auto build_index = std::make_shared<JsonKeyInvertedIndex>(ctx, false); + build_index->Build(config); + + auto binary_set = build_index->Upload(config); + for (const auto& [key, _] : binary_set.binary_map_) { + index_files.push_back(key); + } + + index::CreateIndexInfo index_info{}; + config["index_files"] = index_files; + + index_ = std::make_shared<JsonKeyInvertedIndex>(ctx, true); + index_->Load(milvus::tracer::TraceContext{}, config); + } + + virtual void + SetParam() { + } + void + SetUp() override { + SetParam(); + + type_ = DataType::JSON; + int64_t collection_id = 1; + int64_t partition_id = 2; + int64_t segment_id = 3; + int64_t field_id = 101; + int64_t index_build_id = 1000; + int64_t index_version = 10000; + std::string root_path = "/tmp/test-jsonkey-index/"; + + storage::StorageConfig storage_config; + storage_config.storage_type = "local"; + storage_config.root_path = root_path; + chunk_manager_ = storage::CreateChunkManager(storage_config); + + Init(collection_id, + partition_id, + segment_id, + field_id, + index_build_id, + index_version); + } + + virtual ~JsonKeyIndexTest() override { + boost::filesystem::remove_all(chunk_manager_->GetRootPath()); + } + + public: + void + TestTermInFunc() { + std::set<std::string> term_set = {"xxxxx"}; + auto filter_func = [&term_set, this](uint32_t row_id, + uint16_t offset, + uint16_t size) { + //std::cout << row_id << " " << offset << " " << size << std::endl; + + auto val = this->data_[row_id].template at_pos<std::string_view>( + offset, size); + if (val.second != "") { + //std::cout << val.error() << std::endl; + return false; + } + return term_set.find((std::string(val.first))) != term_set.end(); + }; + index_->FilterByPath("/keys0", filter_func); + } + + public: + std::shared_ptr<JsonKeyInvertedIndex> index_; + DataType type_; + size_t nb_; + std::vector<milvus::Json> data_; + std::shared_ptr<storage::ChunkManager> chunk_manager_; +}; + +TEST_F(JsonKeyIndexTest, CountFuncTest) { + int all_cost = 0; + for (int i = 0; i < 10; ++i) { + auto start = std::chrono::steady_clock::now(); + TestTermInFunc(); + all_cost += std::chrono::duration_cast<std::chrono::microseconds>( + std::chrono::steady_clock::now() - start) + .count(); + std::cout << "all_cost: " << all_cost << "us" << std::endl; + } +} \ No newline at end of file diff --git a/internal/datacoord/job_manager.go b/internal/datacoord/job_manager.go index 9db974a9e1105..4ffc24c6702bf 100644 --- a/internal/datacoord/job_manager.go +++ b/internal/datacoord/job_manager.go @@ -8,6 +8,7 @@ import ( "github.com/cockroachdb/errors" "go.uber.org/zap" + "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" "github.com/milvus-io/milvus/internal/datacoord/allocator" "github.com/milvus-io/milvus/internal/proto/datapb" "github.com/milvus-io/milvus/internal/proto/indexpb" @@ -82,6 +83,7 @@ func (jm *statsJobManager) triggerStatsTaskLoop() { jm.triggerSortStatsTask() jm.triggerTextStatsTask() jm.triggerBM25StatsTask() + jm.triggerJsonKeyIndexStatsTask() case segID := <-getStatsTaskChSingleton(): log.Info("receive new segment to trigger stats task", zap.Int64("segmentID", segID)) @@ -139,6 +141,13 @@ func needDoTextIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool { return false } +func needDoJsonKeyIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool { + if !isFlush(segment) { + return false + } + return true +} + func needDoBM25(segment *SegmentInfo, fieldIDs []UniqueID) bool { // TODO: docking bm25 stats task return false @@ -170,6 +179,28 @@ func (jm *statsJobManager) triggerTextStatsTask() { } } +func (jm *statsJobManager) triggerJsonKeyIndexStatsTask() { 
collections := jm.mt.GetCollections() + for _, collection := range collections { + needTriggerFieldIDs := make([]UniqueID, 0) + for _, field := range collection.Schema.GetFields() { + if field.GetDataType() == schemapb.DataType_JSON { + needTriggerFieldIDs = append(needTriggerFieldIDs, field.GetFieldID()) + } + } + segments := jm.mt.SelectSegments(jm.ctx, WithCollection(collection.ID), SegmentFilterFunc(func(seg *SegmentInfo) bool { + return needDoJsonKeyIndex(seg, needTriggerFieldIDs) + })) + for _, segment := range segments { + if err := jm.SubmitStatsTask(segment.GetID(), segment.GetID(), indexpb.StatsSubJob_JsonKeyIndexJob, true); err != nil { + log.Warn("create stats task with json key index for segment failed, wait for retry:", + zap.Int64("segmentID", segment.GetID()), zap.Error(err)) + continue + } + } + } +} + func (jm *statsJobManager) triggerBM25StatsTask() { collections := jm.mt.GetCollections() for _, collection := range collections { diff --git a/internal/datacoord/meta.go b/internal/datacoord/meta.go index aab1f9a41e78e..a2e9207f71f4f 100644 --- a/internal/datacoord/meta.go +++ b/internal/datacoord/meta.go @@ -2058,6 +2058,7 @@ func (m *meta) SaveStatsResultSegment(oldSegmentID int64, result *workerpb.Stats Statslogs: result.GetStatsLogs(), TextStatsLogs: result.GetTextStatsLogs(), Bm25Statslogs: result.GetBm25Logs(), + JsonKeyStats: result.GetJsonKeyStatsLogs(), Deltalogs: nil, CompactionFrom: []int64{oldSegmentID}, IsSorted: true, diff --git a/internal/datacoord/segment_operator.go b/internal/datacoord/segment_operator.go index 91b4da67ba8ce..2558865d51061 100644 --- a/internal/datacoord/segment_operator.go +++ b/internal/datacoord/segment_operator.go @@ -43,6 +43,18 @@ func SetTextIndexLogs(textIndexLogs map[int64]*datapb.TextIndexStats) SegmentOpe } } +func SetJsonKeyIndexLogs(jsonKeyIndexLogs map[int64]*datapb.JsonKeyStats) SegmentOperator { + return func(segment *SegmentInfo) bool { + if segment.JsonKeyStats == nil { + segment.JsonKeyStats = make(map[int64]*datapb.JsonKeyStats) + } + for field, logs := range jsonKeyIndexLogs { + segment.JsonKeyStats[field] = logs + } + return true + } +} + type segmentCriterion struct { collectionID int64 channel string diff --git a/internal/datacoord/task_stats.go b/internal/datacoord/task_stats.go index ca061b1e17da2..21feca1efb8c5 100644 --- a/internal/datacoord/task_stats.go +++ b/internal/datacoord/task_stats.go @@ -313,6 +313,13 @@ func (st *statsTask) SetJobInfo(meta *meta) error { zap.Int64("segmentID", st.segmentID), zap.Error(err)) return err } + case indexpb.StatsSubJob_JsonKeyIndexJob: + err := meta.UpdateSegment(st.taskInfo.GetSegmentID(), SetJsonKeyIndexLogs(st.taskInfo.GetJsonKeyStatsLogs())) + if err != nil { + log.Warn("save json key index stats result failed", zap.Int64("taskId", st.taskID), + zap.Int64("segmentID", st.segmentID), zap.Error(err)) + return err + } case indexpb.StatsSubJob_BM25Job: // TODO: support bm25 job } diff --git a/internal/indexnode/indexnode_service.go b/internal/indexnode/indexnode_service.go index 69c96b4162cb5..2948c87829f50 100644 --- a/internal/indexnode/indexnode_service.go +++ b/internal/indexnode/indexnode_service.go @@ -506,18 +506,19 @@ func (i *IndexNode) QueryJobsV2(ctx context.Context, req *workerpb.QueryJobsV2Re info := i.getStatsTaskInfo(req.GetClusterID(), taskID) if info != nil { results = append(results, &workerpb.StatsResult{ - TaskID: taskID, - State: info.state, - FailReason: info.failReason, - CollectionID: info.collID, - PartitionID: info.partID, - SegmentID: info.segID, 
- Channel: info.insertChannel, - InsertLogs: info.insertLogs, - StatsLogs: info.statsLogs, - TextStatsLogs: info.textStatsLogs, + TaskID: taskID, + State: info.state, + FailReason: info.failReason, + CollectionID: info.collID, + PartitionID: info.partID, + SegmentID: info.segID, + Channel: info.insertChannel, + InsertLogs: info.insertLogs, + StatsLogs: info.statsLogs, + TextStatsLogs: info.textStatsLogs, Bm25Logs: info.bm25Logs, - NumRows: info.numRows, + NumRows: info.numRows, + JsonKeyStatsLogs: info.jsonKeyStatsLogs, }) } } diff --git a/internal/indexnode/task_stats.go b/internal/indexnode/task_stats.go index 40c4756877dd5..92233de1e2a58 100644 --- a/internal/indexnode/task_stats.go +++ b/internal/indexnode/task_stats.go @@ -332,6 +332,13 @@ func (st *statsTask) Execute(ctx context.Context) error { log.Ctx(ctx).Warn("stats wrong, failed to create text index", zap.Error(err)) return err } + } else if st.req.GetSubJobType() == indexpb.StatsSubJob_JsonKeyIndexJob { + err = st.createJsonKeyIndex(ctx, st.req.GetStorageConfig(), st.req.GetCollectionID(), + st.req.GetPartitionID(), st.req.GetTargetSegmentID(), st.req.GetTaskVersion(), st.req.GetTaskID(), insertLogs) + if err != nil { + log.Ctx(ctx).Warn("stats wrong, failed to create json key index", zap.Error(err)) + return err + } } return nil @@ -715,3 +722,100 @@ func (st *statsTask) createTextIndex(ctx context.Context, textIndexLogs) return nil } + +func (st *statsTask) createJsonKeyIndex(ctx context.Context, + storageConfig *indexpb.StorageConfig, + collectionID int64, + partitionID int64, + segmentID int64, + version int64, + taskID int64, + insertBinlogs []*datapb.FieldBinlog, +) error { + log := log.Ctx(ctx).With( + zap.String("clusterID", st.req.GetClusterID()), + zap.Int64("taskID", st.req.GetTaskID()), + zap.Int64("collectionID", st.req.GetCollectionID()), + zap.Int64("partitionID", st.req.GetPartitionID()), + zap.Int64("segmentID", st.req.GetSegmentID()), + zap.Any("statsJobType", st.req.GetSubJobType()), + ) + + fieldBinlogs := lo.GroupBy(insertBinlogs, func(binlog *datapb.FieldBinlog) int64 { + return binlog.GetFieldID() + }) + + getInsertFiles := func(fieldID int64) ([]string, error) { + binlogs, ok := fieldBinlogs[fieldID] + if !ok { + return nil, fmt.Errorf("field binlog not found for field %d", fieldID) + } + result := make([]string, 0, len(binlogs)) + for _, binlog := range binlogs { + for _, file := range binlog.GetBinlogs() { + result = append(result, metautil.BuildInsertLogPath(storageConfig.GetRootPath(), collectionID, partitionID, segmentID, fieldID, file.GetLogID())) + } + } + return result, nil + } + + newStorageConfig, err := ParseStorageConfig(storageConfig) + if err != nil { + return err + } + + jsonKeyIndexStats := make(map[int64]*datapb.JsonKeyStats) + for _, field := range st.req.GetSchema().GetFields() { + h := typeutil.CreateFieldSchemaHelper(field) + if !h.EnableJsonKeyIndex() { + continue + } + log.Info("field enable json key index, ready to create json key index", zap.Int64("field id", field.GetFieldID())) + // create json key index and upload the json key index files. 
+ files, err := getInsertFiles(field.GetFieldID()) + if err != nil { + return err + } + + buildIndexParams := &indexcgopb.BuildIndexInfo{ + BuildID: taskID, + CollectionID: collectionID, + PartitionID: partitionID, + SegmentID: segmentID, + IndexVersion: version, + InsertFiles: files, + FieldSchema: field, + StorageConfig: newStorageConfig, + } + + uploaded, err := indexcgowrapper.CreateJsonKeyIndex(ctx, buildIndexParams) + if err != nil { + return err + } + jsonKeyIndexStats[field.GetFieldID()] = &datapb.JsonKeyStats{ + FieldID: field.GetFieldID(), + Version: version, + BuildID: taskID, + Files: lo.Keys(uploaded), + } + log.Info("field enable json key index, create json key index done", + zap.Int64("field id", field.GetFieldID()), + zap.Strings("files", lo.Keys(uploaded)), + ) + } + + totalElapse := st.tr.RecordSpan() + + st.node.storeJsonKeyIndexResult(st.req.GetClusterID(), + st.req.GetTaskID(), + st.req.GetCollectionID(), + st.req.GetPartitionID(), + st.req.GetTargetSegmentID(), + st.req.GetInsertChannel(), + jsonKeyIndexStats) + + log.Info("create json key index done", + zap.Int64("target segmentID", st.req.GetTargetSegmentID()), + zap.Duration("total elapse", totalElapse)) + return nil +} diff --git a/internal/indexnode/taskinfo_ops.go b/internal/indexnode/taskinfo_ops.go index 8a0431f76bf42..f15050af4eb68 100644 --- a/internal/indexnode/taskinfo_ops.go +++ b/internal/indexnode/taskinfo_ops.go @@ -312,18 +312,19 @@ func (i *IndexNode) waitTaskFinish() { } type statsTaskInfo struct { - cancel context.CancelFunc - state indexpb.JobState - failReason string - collID UniqueID - partID UniqueID - segID UniqueID - insertChannel string - numRows int64 - insertLogs []*datapb.FieldBinlog - statsLogs []*datapb.FieldBinlog - textStatsLogs map[int64]*datapb.TextIndexStats + cancel context.CancelFunc + state indexpb.JobState + failReason string + collID UniqueID + partID UniqueID + segID UniqueID + insertChannel string + numRows int64 + insertLogs []*datapb.FieldBinlog + statsLogs []*datapb.FieldBinlog + textStatsLogs map[int64]*datapb.TextIndexStats bm25Logs []*datapb.FieldBinlog + jsonKeyStatsLogs map[int64]*datapb.JsonKeyStats } func (i *IndexNode) loadOrStoreStatsTask(clusterID string, taskID UniqueID, info *statsTaskInfo) *statsTaskInfo { @@ -410,24 +411,45 @@ func (i *IndexNode) storeStatsTextIndexResult( } } +func (i *IndexNode) storeJsonKeyIndexResult( + clusterID string, + taskID UniqueID, + collID UniqueID, + partID UniqueID, + segID UniqueID, + channel string, + jsonKeyIndexLogs map[int64]*datapb.JsonKeyStats) { + key := taskKey{ClusterID: clusterID, TaskID: taskID} + i.stateLock.Lock() + defer i.stateLock.Unlock() + if info, ok := i.statsTasks[key]; ok { + info.jsonKeyStatsLogs = jsonKeyIndexLogs + info.segID = segID + info.collID = collID + info.partID = partID + info.insertChannel = channel + } +} + func (i *IndexNode) getStatsTaskInfo(clusterID string, taskID UniqueID) *statsTaskInfo { i.stateLock.Lock() defer i.stateLock.Unlock() if info, ok := i.statsTasks[taskKey{ClusterID: clusterID, TaskID: taskID}]; ok { return &statsTaskInfo{ - cancel: info.cancel, - state: info.state, - failReason: info.failReason, - collID: info.collID, - partID: info.partID, - segID: info.segID, - insertChannel: info.insertChannel, - numRows: info.numRows, - insertLogs: info.insertLogs, - statsLogs: info.statsLogs, - textStatsLogs: info.textStatsLogs, + cancel: info.cancel, + state: info.state, + failReason: info.failReason, + collID: info.collID, + partID: info.partID, + segID: info.segID, + 
insertChannel: info.insertChannel, + numRows: info.numRows, + insertLogs: info.insertLogs, + statsLogs: info.statsLogs, + textStatsLogs: info.textStatsLogs, bm25Logs: info.bm25Logs, + jsonKeyStatsLogs: info.jsonKeyStatsLogs, } } return nil diff --git a/internal/proto/data_coord.proto b/internal/proto/data_coord.proto index 5d78f28cf0ddf..552eddf4b20b8 100644 --- a/internal/proto/data_coord.proto +++ b/internal/proto/data_coord.proto @@ -361,6 +361,8 @@ message SegmentInfo { // This field is used to indicate that some intermediate state segments should not be loaded. // For example, segments that have been clustered but haven't undergone stats yet. bool is_invisible = 28; + // jsonKeyStats is used to record json key index for fields. + map jsonKeyStats = 29; } message SegmentStartPosition { @@ -448,6 +450,15 @@ message TextIndexStats { int64 buildID = 6; } +message JsonKeyStats { + int64 fieldID = 1; + int64 version = 2; + repeated string files = 3; + int64 log_size = 4; + int64 memory_size = 5; + int64 buildID = 6; +} + message Binlog { int64 entries_num = 1; uint64 timestamp_from = 2; diff --git a/internal/proto/index_cgo_msg.proto b/internal/proto/index_cgo_msg.proto index 92e98100f35ec..4e1be8c6991ae 100644 --- a/internal/proto/index_cgo_msg.proto +++ b/internal/proto/index_cgo_msg.proto @@ -92,3 +92,13 @@ message LoadTextIndexInfo { int64 collectionID = 6; int64 partitionID = 7; } + +message LoadJsonKeyIndexInfo { + int64 FieldID = 1; + int64 version = 2; + int64 buildID = 3; + repeated string files = 4; + schema.FieldSchema schema = 5; + int64 collectionID = 6; + int64 partitionID = 7; +} diff --git a/internal/proto/index_coord.proto b/internal/proto/index_coord.proto index adcd0aed7bd37..9d062a0a60e6c 100644 --- a/internal/proto/index_coord.proto +++ b/internal/proto/index_coord.proto @@ -322,4 +322,5 @@ enum StatsSubJob { Sort = 1; TextIndexJob = 2; BM25Job=3; + JsonKeyIndexJob = 4; } diff --git a/internal/proto/query_coord.proto b/internal/proto/query_coord.proto index d469841c525ff..a38fdc0c535fd 100644 --- a/internal/proto/query_coord.proto +++ b/internal/proto/query_coord.proto @@ -373,6 +373,7 @@ message SegmentLoadInfo { bool is_sorted = 19; map textStatsLogs = 20; repeated data.FieldBinlog bm25logs = 21; + map jsonKeyStatsLogs = 22; } message FieldIndexInfo { diff --git a/internal/proto/worker.proto b/internal/proto/worker.proto index 6f8f72a0a441f..ca3d1160fadcb 100644 --- a/internal/proto/worker.proto +++ b/internal/proto/worker.proto @@ -199,6 +199,7 @@ message StatsResult { map text_stats_logs = 10; int64 num_rows = 11; repeated data.FieldBinlog bm25_logs = 12; + map json_key_stats_logs = 13; } message StatsResults { diff --git a/internal/querycoordv2/utils/types.go b/internal/querycoordv2/utils/types.go index 511081d73763b..8056449ff07c7 100644 --- a/internal/querycoordv2/utils/types.go +++ b/internal/querycoordv2/utils/types.go @@ -74,22 +74,23 @@ func PackSegmentLoadInfo(segment *datapb.SegmentInfo, channelCheckpoint *msgpb.M zap.Duration("tsLag", tsLag)) } loadInfo := &querypb.SegmentLoadInfo{ - SegmentID: segment.ID, - PartitionID: segment.PartitionID, - CollectionID: segment.CollectionID, - BinlogPaths: segment.Binlogs, - NumOfRows: segment.NumOfRows, - Statslogs: segment.Statslogs, - Deltalogs: segment.Deltalogs, - Bm25Logs: segment.Bm25Statslogs, - InsertChannel: segment.InsertChannel, - IndexInfos: indexes, - StartPosition: segment.GetStartPosition(), - DeltaPosition: channelCheckpoint, - Level: segment.GetLevel(), - StorageVersion: segment.GetStorageVersion(), - 
IsSorted: segment.GetIsSorted(), - TextStatsLogs: segment.GetTextStatsLogs(), + SegmentID: segment.ID, + PartitionID: segment.PartitionID, + CollectionID: segment.CollectionID, + BinlogPaths: segment.Binlogs, + NumOfRows: segment.NumOfRows, + Statslogs: segment.Statslogs, + Deltalogs: segment.Deltalogs, + Bm25Logs: segment.Bm25Statslogs, + InsertChannel: segment.InsertChannel, + IndexInfos: indexes, + StartPosition: segment.GetStartPosition(), + DeltaPosition: channelCheckpoint, + Level: segment.GetLevel(), + StorageVersion: segment.GetStorageVersion(), + IsSorted: segment.GetIsSorted(), + TextStatsLogs: segment.GetTextStatsLogs(), + JsonKeyStatsLogs: segment.GetJsonKeyStats(), } return loadInfo } diff --git a/internal/querynodev2/segments/segment.go b/internal/querynodev2/segments/segment.go index 78401e5e73a0c..848c682d5b763 100644 --- a/internal/querynodev2/segments/segment.go +++ b/internal/querynodev2/segments/segment.go @@ -1094,6 +1094,40 @@ func (s *LocalSegment) LoadTextIndex(ctx context.Context, textLogs *datapb.TextI return HandleCStatus(ctx, &status, "LoadTextIndex failed") } +func (s *LocalSegment) LoadJsonKeyIndex(ctx context.Context, jsonKeyStats *datapb.JsonKeyStats, schemaHelper *typeutil.SchemaHelper) error { + log.Ctx(ctx).Info("load json key index", zap.Int64("field id", jsonKeyStats.GetFieldID()), zap.Any("json key logs", jsonKeyStats)) + + f, err := schemaHelper.GetFieldFromID(jsonKeyStats.GetFieldID()) + if err != nil { + return err + } + + cgoProto := &indexcgopb.LoadJsonKeyIndexInfo{ + FieldID: jsonKeyStats.GetFieldID(), + Version: jsonKeyStats.GetVersion(), + BuildID: jsonKeyStats.GetBuildID(), + Files: jsonKeyStats.GetFiles(), + Schema: f, + CollectionID: s.Collection(), + PartitionID: s.Partition(), + } + + marshaled, err := proto.Marshal(cgoProto) + if err != nil { + return err + } + + var status C.CStatus + _, _ = GetLoadPool().Submit(func() (any, error) { + traceCtx := ParseCTraceContext(ctx) + status = C.LoadJsonKeyIndex(traceCtx.ctx, s.ptr, (*C.uint8_t)(unsafe.Pointer(&marshaled[0])), (C.uint64_t)(len(marshaled))) + return nil, nil + }).Await() + + return HandleCStatus(ctx, &status, "Load JsonKeyStats failed") + +} + func (s *LocalSegment) UpdateIndexInfo(ctx context.Context, indexInfo *querypb.FieldIndexInfo, info *LoadIndexInfo) error { log := log.Ctx(ctx).With( zap.Int64("collectionID", s.Collection()), diff --git a/internal/querynodev2/segments/segment_loader.go b/internal/querynodev2/segments/segment_loader.go index 390f4a460d6ef..2d7163fecff9b 100644 --- a/internal/querynodev2/segments/segment_loader.go +++ b/internal/querynodev2/segments/segment_loader.go @@ -689,6 +689,7 @@ func separateLoadInfoV2(loadInfo *querypb.SegmentLoadInfo, schema *schemapb.Coll []*datapb.FieldBinlog, // fields info map[int64]*datapb.TextIndexStats, // text indexed info map[int64]struct{}, // unindexed text fields + map[int64]*datapb.JsonKeyStats, // json key stats info ) { fieldID2IndexInfo := make(map[int64]*querypb.FieldIndexInfo) for _, indexInfo := range loadInfo.IndexInfos { @@ -725,6 +726,16 @@ func separateLoadInfoV2(loadInfo *querypb.SegmentLoadInfo, schema *schemapb.Coll } } + jsonKeyIndexInfo := make(map[int64]*datapb.JsonKeyStats, len(loadInfo.GetJsonKeyStatsLogs())) + for _, fieldStatsLog := range loadInfo.GetJsonKeyStatsLogs() { + jsonKeyLog, ok := jsonKeyIndexInfo[fieldStatsLog.FieldID] + if !ok { + jsonKeyIndexInfo[fieldStatsLog.FieldID] = fieldStatsLog + } else if fieldStatsLog.GetVersion() > jsonKeyLog.GetVersion() { + 
jsonKeyIndexInfo[fieldStatsLog.FieldID] = fieldStatsLog + } + } + unindexedTextFields := make(map[int64]struct{}) for _, field := range schema.GetFields() { h := typeutil.CreateFieldSchemaHelper(field) @@ -734,7 +745,7 @@ func separateLoadInfoV2(loadInfo *querypb.SegmentLoadInfo, schema *schemapb.Coll } } - return indexedFieldInfos, fieldBinlogs, textIndexedInfo, unindexedTextFields + return indexedFieldInfos, fieldBinlogs, textIndexedInfo, unindexedTextFields, jsonKeyIndexInfo } func (loader *segmentLoader) loadSealedSegment(ctx context.Context, loadInfo *querypb.SegmentLoadInfo, segment *LocalSegment) (err error) { @@ -758,7 +769,7 @@ func (loader *segmentLoader) loadSealedSegment(ctx context.Context, loadInfo *qu collection := segment.GetCollection() schemaHelper, _ := typeutil.CreateSchemaHelper(collection.Schema()) - indexedFieldInfos, fieldBinlogs, textIndexes, unindexedTextFields := separateLoadInfoV2(loadInfo, collection.Schema()) + indexedFieldInfos, fieldBinlogs, textIndexes, unindexedTextFields, jsonKeyStats := separateLoadInfoV2(loadInfo, collection.Schema()) if err := segment.AddFieldDataInfo(ctx, loadInfo.GetNumOfRows(), loadInfo.GetBinlogPaths()); err != nil { return err } @@ -769,6 +780,7 @@ func (loader *segmentLoader) loadSealedSegment(ctx context.Context, loadInfo *qu zap.Int64s("indexedFields", lo.Keys(indexedFieldInfos)), zap.Int64s("indexed text fields", lo.Keys(textIndexes)), zap.Int64s("unindexed text fields", lo.Keys(unindexedTextFields)), + zap.Int64s("indexed json key fields", lo.Keys(jsonKeyStats)), ) if err := loader.loadFieldsIndex(ctx, schemaHelper, segment, loadInfo.GetNumOfRows(), indexedFieldInfos); err != nil { return err @@ -815,6 +827,13 @@ func (loader *segmentLoader) loadSealedSegment(ctx context.Context, loadInfo *qu } } + for _, info := range jsonKeyStats { + if err := segment.LoadJsonKeyIndex(ctx, info, schemaHelper); err != nil { + return err + } + } + loadJsonKeyIndexesSpan := tr.RecordSpan() + // 4. 
rectify entries number for binlog in very rare cases // https://github.com/milvus-io/milvus/23654 // legacy entry num = 0 @@ -828,6 +847,7 @@ func (loader *segmentLoader) loadSealedSegment(ctx context.Context, loadInfo *qu zap.Duration("loadRawDataSpan", loadRawDataSpan), zap.Duration("patchEntryNumberSpan", patchEntryNumberSpan), zap.Duration("loadTextIndexesSpan", loadTextIndexesSpan), + zap.Duration("loadJsonKeyIndexSpan", loadJsonKeyIndexesSpan), ) return nil } @@ -846,6 +866,7 @@ func (loader *segmentLoader) LoadSegment(ctx context.Context, zap.String("shard", segment.Shard().VirtualName()), zap.Int64("segmentID", segment.ID()), ) + log.Info("start loading segment files", zap.Int64("rowNum", loadInfo.GetNumOfRows()), zap.String("segmentType", segment.Type().String())) diff --git a/internal/util/indexcgowrapper/index.go b/internal/util/indexcgowrapper/index.go index c87a3801feeae..255fc99d43e4a 100644 --- a/internal/util/indexcgowrapper/index.go +++ b/internal/util/indexcgowrapper/index.go @@ -163,6 +163,43 @@ func CreateTextIndex(ctx context.Context, buildIndexInfo *indexcgopb.BuildIndexI return res, nil } +func CreateJsonKeyIndex(ctx context.Context, buildIndexInfo *indexcgopb.BuildIndexInfo) (map[string]int64, error) { + buildIndexInfoBlob, err := proto.Marshal(buildIndexInfo) + if err != nil { + log.Ctx(ctx).Warn("marshal buildIndexInfo failed", + zap.String("clusterID", buildIndexInfo.GetClusterID()), + zap.Int64("buildID", buildIndexInfo.GetBuildID()), + zap.Error(err)) + return nil, err + } + var cBinarySet C.CBinarySet + status := C.BuildJsonKeyIndex(&cBinarySet, (*C.uint8_t)(unsafe.Pointer(&buildIndexInfoBlob[0])), (C.uint64_t)(len(buildIndexInfoBlob))) + if err := HandleCStatus(&status, "failed to build json key index"); err != nil { + return nil, err + } + + defer func() { + if cBinarySet != nil { + C.DeleteBinarySet(cBinarySet) + } + }() + + res := make(map[string]int64) + indexFilePaths, err := GetBinarySetKeys(cBinarySet) + if err != nil { + return nil, err + } + for _, path := range indexFilePaths { + size, err := GetBinarySetSize(cBinarySet, path) + if err != nil { + return nil, err + } + res[path] = size + } + + return res, nil +} + // TODO: this seems to be used only for test. We should mark the method // name with ForTest, or maybe move to test file. 
func (index *CgoIndex) Build(dataset *Dataset) error { diff --git a/pkg/util/typeutil/field_schema.go b/pkg/util/typeutil/field_schema.go index d07bf0696dce7..460b9adf25efc 100644 --- a/pkg/util/typeutil/field_schema.go +++ b/pkg/util/typeutil/field_schema.go @@ -53,6 +53,13 @@ func (h *FieldSchemaHelper) EnableMatch() bool { return err == nil && enable } +func (h *FieldSchemaHelper) EnableJsonKeyIndex() bool { + if IsJSONType(h.schema.GetDataType()) { + return true + } + return false +} + func (h *FieldSchemaHelper) EnableAnalyzer() bool { if !IsStringType(h.schema.GetDataType()) { return false From 2056abdce1d8aaa9bd54b949061bd6eebd0a2a6d Mon Sep 17 00:00:00 2001 From: "Xianhui.Lin" Date: Tue, 26 Nov 2024 21:15:55 +0800 Subject: [PATCH 2/5] jsoncontainexpr unaryexpr binaryexpr json index optimization Signed-off-by: Xianhui.Lin improve jsonContainSExpr Signed-off-by: Xianhui.Lin add jsonindex test Signed-off-by: Xianhui.Lin --- .../src/exec/expression/BinaryRangeExpr.cpp | 137 ++- .../src/exec/expression/BinaryRangeExpr.h | 4 + .../core/src/exec/expression/ExistsExpr.cpp | 62 +- .../core/src/exec/expression/ExistsExpr.h | 3 + .../src/exec/expression/JsonContainsExpr.cpp | 1001 +++++++++++++---- .../src/exec/expression/JsonContainsExpr.h | 20 + .../core/src/exec/expression/TermExpr.cpp | 2 +- .../core/src/exec/expression/UnaryExpr.cpp | 453 +++++--- internal/core/src/exec/expression/UnaryExpr.h | 4 + .../core/src/index/JsonKeyInvertedIndex.h | 3 +- .../core/unittest/test_json_key_index.cpp | 309 ++++- internal/indexnode/task_stats.go | 12 +- 12 files changed, 1517 insertions(+), 493 deletions(-) diff --git a/internal/core/src/exec/expression/BinaryRangeExpr.cpp b/internal/core/src/exec/expression/BinaryRangeExpr.cpp index 7dd0943794703..5c3ff818977f5 100644 --- a/internal/core/src/exec/expression/BinaryRangeExpr.cpp +++ b/internal/core/src/exec/expression/BinaryRangeExpr.cpp @@ -261,17 +261,17 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForData(OffsetVector* input) { TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); valid_res.set(); - auto execute_sub_batch = - [ lower_inclusive, - upper_inclusive ]( - const T* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - HighPrecisionType val1, - HighPrecisionType val2) { + auto execute_sub_batch = [lower_inclusive, + upper_inclusive]( + const T* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + HighPrecisionType val1, + HighPrecisionType val2) { if (lower_inclusive && upper_inclusive) { BinaryRangeElementFunc func; func(val1, val2, data, size, res, offsets); @@ -345,6 +345,10 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(OffsetVector* input) { using GetType = std::conditional_t, std::string_view, ValueType>; + FieldId field_id = expr_->column_.field_id_; + if (CanUseJsonKeyIndex(field_id)) { + return ExecRangeVisitorImplForJsonForIndex(); + } auto real_batch_size = has_offset_input_ ? 
input->size() : GetNextBatchSize(); if (real_batch_size == 0) { @@ -362,17 +366,18 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(OffsetVector* input) { ValueType val2 = GetValueFromProto(expr_->upper_val_); auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); - auto execute_sub_batch = - [ lower_inclusive, upper_inclusive, - pointer ]( - const milvus::Json* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - ValueType val1, - ValueType val2) { + auto execute_sub_batch = [lower_inclusive, + upper_inclusive, + pointer]( + const milvus::Json* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + ValueType val1, + ValueType val2) { if (lower_inclusive && upper_inclusive) { BinaryRangeElementFuncForJson func; @@ -445,6 +450,70 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(OffsetVector* input) { return res_vec; } +template +VectorPtr +PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJsonForIndex() { + using GetType = std::conditional_t, + std::string_view, + ValueType>; + Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ + ? active_count_ - current_data_chunk_pos_ + : batch_size_; + auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); + bool lower_inclusive = expr_->lower_inclusive_; + bool upper_inclusive = expr_->upper_inclusive_; + ValueType val1 = GetValueFromProto(expr_->lower_val_); + ValueType val2 = GetValueFromProto(expr_->upper_val_); + if (cached_index_chunk_id_ != 0) { + const auto* sealed_seg = + dynamic_cast(segment_); + auto field_id = expr_->column_.field_id_; + auto* index = sealed_seg->GetJsonKeyIndex(field_id); + Assert(index != nullptr); + auto filter_func = + [sealed_seg, + &field_id, + val1, + val2, + lower_inclusive, + upper_inclusive](uint32_t row_id, uint16_t offset, uint16_t size) { + auto json_pair = sealed_seg->GetJsonData(field_id, row_id); + if (!json_pair.second) { + return false; + } + auto json = milvus::Json(json_pair.first.data(), + json_pair.first.size()); + auto val = json.at(offset, size); + if (val.error()) { + return false; + } + if (lower_inclusive && upper_inclusive) { + return val1 <= ValueType(val.value()) && + ValueType(val.value()) <= val2; + } else if (lower_inclusive && !upper_inclusive) { + return val1 <= ValueType(val.value()) && + ValueType(val.value()) < val2; + } else if (!lower_inclusive && upper_inclusive) { + return val1 < ValueType(val.value()) && + ValueType(val.value()) <= val2; + } else { + return val1 < ValueType(val.value()) && + ValueType(val.value()) < val2; + } + }; + cached_index_chunk_res_ = + index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + cached_index_chunk_id_ = 0; + } + TargetBitmap result; + result.append( + cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size); + current_data_chunk_pos_ += real_batch_size; + return std::make_shared(std::move(result), + TargetBitmap(real_batch_size, true)); +} + template VectorPtr PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForArray(OffsetVector* input) { @@ -471,18 +540,18 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForArray(OffsetVector* input) { index = std::stoi(expr_->column_.nested_path_[0]); } - auto execute_sub_batch = - [ lower_inclusive, - upper_inclusive ]( - const milvus::ArrayView* data, - const bool* valid_data, - const int32_t* 
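// The filter_func in ExecRangeVisitorImplForJsonForIndex above re-reads the
// indexed value straight from the stored JSON bytes via json.at(offset, size)
// and applies one of four bound checks. A minimal standalone sketch of that
// predicate (the name in_range is illustrative; Milvus keeps the branches
// inline):
template <typename T>
bool in_range(const T& x, const T& lo, const T& hi,
              bool lo_inclusive, bool hi_inclusive) {
    bool above = lo_inclusive ? (lo <= x) : (lo < x);  // lower bound check
    bool below = hi_inclusive ? (x <= hi) : (x < hi);  // upper bound check
    return above && below;
}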
offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - ValueType val1, - ValueType val2, - int index) { + auto execute_sub_batch = [lower_inclusive, + upper_inclusive]( + const milvus::ArrayView* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + ValueType val1, + ValueType val2, + int index) { if (lower_inclusive && upper_inclusive) { BinaryRangeElementFuncForArray func; diff --git a/internal/core/src/exec/expression/BinaryRangeExpr.h b/internal/core/src/exec/expression/BinaryRangeExpr.h index 1babfc6fd044e..7696dae12c76f 100644 --- a/internal/core/src/exec/expression/BinaryRangeExpr.h +++ b/internal/core/src/exec/expression/BinaryRangeExpr.h @@ -270,6 +270,10 @@ class PhyBinaryRangeFilterExpr : public SegmentExpr { VectorPtr ExecRangeVisitorImplForJson(OffsetVector* input = nullptr); + template + VectorPtr + ExecRangeVisitorImplForJsonForIndex(); + template VectorPtr ExecRangeVisitorImplForArray(OffsetVector* input = nullptr); diff --git a/internal/core/src/exec/expression/ExistsExpr.cpp b/internal/core/src/exec/expression/ExistsExpr.cpp index a4163e46aa0f7..cc64dcb57c43b 100644 --- a/internal/core/src/exec/expression/ExistsExpr.cpp +++ b/internal/core/src/exec/expression/ExistsExpr.cpp @@ -42,6 +42,10 @@ PhyExistsFilterExpr::Eval(EvalCtx& context, VectorPtr& result) { VectorPtr PhyExistsFilterExpr::EvalJsonExistsForDataSegment(OffsetVector* input) { + FieldId field_id = expr_->column_.field_id_; + if (CanUseJsonKeyIndex(field_id)) { + return EvalJsonExistsForDataSegmentForIndex(); + } auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { @@ -63,18 +67,18 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(OffsetVector* input) { TargetBitmapView res, TargetBitmapView valid_res, const std::string& pointer) { - for (int i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; + for (int i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = data[offset].exist(pointer); } - res[i] = data[offset].exist(pointer); - } - }; + }; int64_t processed_size; if (has_offset_input_) { @@ -96,5 +100,41 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(OffsetVector* input) { return res_vec; } +VectorPtr +PhyExistsFilterExpr::EvalJsonExistsForDataSegmentForIndex() { + Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ + ? 
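// Every *ForIndex/*ByKeyIndex method in this patch shares one caching scheme:
// the inverted index is consulted once per sealed segment (guarded by
// cached_index_chunk_id_), and each subsequent Eval call serves its batch by
// copying a window of the cached segment-wide bitmap. A simplified sketch,
// with std::vector<bool> standing in for TargetBitmap:
#include <algorithm>
#include <cstddef>
#include <vector>

struct CachedSegmentFilter {
    std::vector<bool> cached;  // one bit per segment row, filled on first use
    size_t pos = 0;            // plays the role of current_data_chunk_pos_

    std::vector<bool> next_batch(size_t batch_size) {
        size_t n = std::min(batch_size, cached.size() - pos);
        std::vector<bool> out(cached.begin() + pos, cached.begin() + pos + n);
        pos += n;  // advance the cursor, as current_data_chunk_pos_ does
        return out;
    }
};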
active_count_ - current_data_chunk_pos_ + : batch_size_; + auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); + if (cached_index_chunk_id_ != 0) { + const auto* sealed_seg = + dynamic_cast(segment_); + auto field_id = expr_->column_.field_id_; + auto* index = sealed_seg->GetJsonKeyIndex(field_id); + Assert(index != nullptr); + auto filter_func = [sealed_seg, field_id, pointer](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto json_pair = sealed_seg->GetJsonData(field_id, row_id); + if (!json_pair.second) { + return false; + } + auto json = + milvus::Json(json_pair.first.data(), json_pair.first.size()); + return json.exist(pointer); + }; + cached_index_chunk_res_ = + index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + cached_index_chunk_id_ = 0; + } + TargetBitmap result; + result.append( + cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size); + current_data_chunk_pos_ += real_batch_size; + return std::make_shared(std::move(result), + TargetBitmap(real_batch_size, true)); +} + } //namespace exec } // namespace milvus diff --git a/internal/core/src/exec/expression/ExistsExpr.h b/internal/core/src/exec/expression/ExistsExpr.h index dc00f883c7400..1b6dac2b1c272 100644 --- a/internal/core/src/exec/expression/ExistsExpr.h +++ b/internal/core/src/exec/expression/ExistsExpr.h @@ -59,6 +59,9 @@ class PhyExistsFilterExpr : public SegmentExpr { VectorPtr EvalJsonExistsForDataSegment(OffsetVector* input = nullptr); + VectorPtr + EvalJsonExistsForDataSegmentForIndex(); + private: std::shared_ptr expr_; }; diff --git a/internal/core/src/exec/expression/JsonContainsExpr.cpp b/internal/core/src/exec/expression/JsonContainsExpr.cpp index 3318a4822865f..46cd3fc220a91 100644 --- a/internal/core/src/exec/expression/JsonContainsExpr.cpp +++ b/internal/core/src/exec/expression/JsonContainsExpr.cpp @@ -196,27 +196,28 @@ PhyJsonContainsFilterExpr::ExecArrayContains(OffsetVector* input) { TargetBitmapView res, TargetBitmapView valid_res, const std::unordered_set& elements) { - auto executor = [&](size_t i) { - const auto& array = data[i]; - for (int j = 0; j < array.length(); ++j) { - if (elements.count(array.template get_data(j)) > 0) { - return true; + auto executor = [&](size_t i) { + const auto& array = data[i]; + for (int j = 0; j < array.length(); ++j) { + if (elements.count(array.template get_data(j)) > + 0) { + return true; + } + } + return false; + }; + for (int i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; } + res[i] = executor(offset); } - return false; }; - for (int i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = executor(offset); - } - }; int64_t processed_size; if (has_offset_input_) { @@ -246,6 +247,12 @@ PhyJsonContainsFilterExpr::ExecJsonContains(OffsetVector* input) { std::conditional_t, std::string_view, ExprValueType>; + + FieldId field_id = expr_->column_.field_id_; + if (CanUseJsonKeyIndex(field_id)) { + return ExecJsonContainsByKeyIndex(); + } + auto real_batch_size = has_offset_input_ ? 
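// A mock of the FilterByPath contract these fast paths rely on: for a given
// key path, the JSON-key inverted index (via wrapper_->term_query) yields
// postings of (row_id, offset, length) locating the value inside the stored
// JSON row, and the caller's predicate re-checks the actual bytes. Types
// here are simplified stand-ins for the tantivy-backed implementation:
#include <cstdint>
#include <functional>
#include <vector>

struct Posting { uint32_t row_id; uint16_t offset; uint16_t length; };

std::vector<bool>
filter_by_path(const std::vector<Posting>& postings,  // term_query(path)
               int32_t row_count,                     // batch rows, not index size
               const std::function<bool(uint32_t, uint16_t, uint16_t)>& pred) {
    std::vector<bool> bitset(row_count, false);
    for (const auto& p : postings) {
        if (p.row_id < bitset.size() && pred(p.row_id, p.offset, p.length)) {
            bitset[p.row_id] = true;
        }
    }
    return bitset;
}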
input->size() : GetNextBatchSize(); if (real_batch_size == 0) { @@ -273,35 +280,35 @@ PhyJsonContainsFilterExpr::ExecJsonContains(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::unordered_set& elements) { - auto executor = [&](size_t i) { - auto doc = data[i].doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { + auto executor = [&](size_t i) { + auto doc = data[i].doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; + } + for (auto&& it : array) { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (elements.count(val.value()) > 0) { + return true; + } + } return false; - } - for (auto&& it : array) { - auto val = it.template get(); - if (val.error()) { - continue; + }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; } - if (elements.count(val.value()) > 0) { - return true; + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; } + res[i] = executor(offset); } - return false; }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = executor(offset); - } - }; int64_t processed_size; if (has_offset_input_) { @@ -328,8 +335,77 @@ PhyJsonContainsFilterExpr::ExecJsonContains(OffsetVector* input) { return res_vec; } +template +VectorPtr +PhyJsonContainsFilterExpr::ExecJsonContainsByKeyIndex() { + using GetType = + std::conditional_t, + std::string_view, + ExprValueType>; + Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ + ? 
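// ExecJsonContains reduces to a contains-any test per row. The same logic in
// isolation, with std::vector standing in for the simdjson array:
#include <unordered_set>
#include <vector>

template <typename T>
bool contains_any(const std::vector<T>& array_values,
                  const std::unordered_set<T>& elements) {
    for (const auto& v : array_values) {
        if (elements.count(v) > 0) {
            return true;  // one hit suffices for json_contains / contains_any
        }
    }
    return false;
}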
active_count_ - current_data_chunk_pos_ + : batch_size_; + std::unordered_set elements; + auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); + for (auto const& element : expr_->vals_) { + elements.insert(GetValueFromProto(element)); + } + if (elements.empty()) { + MoveCursor(); + return std::make_shared( + TargetBitmap(real_batch_size, false), + TargetBitmap(real_batch_size, true)); + } + if (cached_index_chunk_id_ != 0) { + const auto* sealed_seg = + dynamic_cast(segment_); + auto field_id = expr_->column_.field_id_; + auto* index = sealed_seg->GetJsonKeyIndex(field_id); + Assert(index != nullptr); + auto filter_func = [sealed_seg, &elements, &field_id](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto json_pair = sealed_seg->GetJsonData(field_id, row_id); + if (!json_pair.second) { + return false; + } + auto json = + milvus::Json(json_pair.first.data(), json_pair.first.size()); + auto array = json.array_at(offset, size); + + if (array.error()) { + return false; + } + for (auto&& it : array) { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (elements.count(val.value()) > 0) { + return true; + } + } + return false; + }; + cached_index_chunk_res_ = + index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + cached_index_chunk_id_ = 0; + } + TargetBitmap result; + result.append( + cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size); + current_data_chunk_pos_ += real_batch_size; + return std::make_shared(std::move(result), + TargetBitmap(real_batch_size, true)); +} + VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsArray(OffsetVector* input) { + FieldId field_id = expr_->column_.field_id_; + if (CanUseJsonKeyIndex(field_id)) { + return ExecJsonContainsArrayByKeyIndex(); + } auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { @@ -357,44 +433,44 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements) { - auto executor = [&](size_t i) -> bool { - auto doc = data[i].doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { - return false; - } - for (auto&& it : array) { - auto val = it.get_array(); - if (val.error()) { - continue; + auto executor = [&](size_t i) -> bool { + auto doc = data[i].doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; } - std::vector< - simdjson::simdjson_result> - json_array; - json_array.reserve(val.count_elements()); - for (auto&& e : val) { - json_array.emplace_back(e); - } - for (auto const& element : elements) { - if (CompareTwoJsonArray(json_array, element)) { - return true; + for (auto&& it : array) { + auto val = it.get_array(); + if (val.error()) { + continue; } + std::vector< + simdjson::simdjson_result> + json_array; + json_array.reserve(val.count_elements()); + for (auto&& e : val) { + json_array.emplace_back(e); + } + for (auto const& element : elements) { + if (CompareTwoJsonArray(json_array, element)) { + return true; + } + } + } + return false; + }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? 
offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; } + res[i] = executor(offset); } - return false; }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = executor(offset); - } - }; int64_t processed_size; if (has_offset_input_) { @@ -421,6 +497,67 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(OffsetVector* input) { return res_vec; } +VectorPtr +PhyJsonContainsFilterExpr::ExecJsonContainsArrayByKeyIndex() { + Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ + ? active_count_ - current_data_chunk_pos_ + : batch_size_; + std::vector elements; + auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); + for (auto const& element : expr_->vals_) { + elements.emplace_back(GetValueFromProto(element)); + } + if (elements.empty()) { + MoveCursor(); + return std::make_shared( + TargetBitmap(real_batch_size, false), + TargetBitmap(real_batch_size, true)); + } + if (cached_index_chunk_id_ != 0) { + const auto* sealed_seg = + dynamic_cast(segment_); + auto field_id = expr_->column_.field_id_; + auto* index = sealed_seg->GetJsonKeyIndex(field_id); + Assert(index != nullptr); + auto filter_func = [sealed_seg, &elements, &field_id](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto json_pair = sealed_seg->GetJsonData(field_id, row_id); + if (!json_pair.second) { + return false; + } + auto json = + milvus::Json(json_pair.first.data(), json_pair.first.size()); + auto array = json.array_at(offset, size); + if (array.error()) { + return false; + } + for (auto&& it : array) { + auto val = it.get_array(); + if (val.error()) { + continue; + } + for (auto const& element : elements) { + if (CompareTwoJsonArray(val, element)) { + return true; + } + } + } + return false; + }; + cached_index_chunk_res_ = + index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + cached_index_chunk_id_ = 0; + } + TargetBitmap result; + result.append( + cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size); + current_data_chunk_pos_ += real_batch_size; + return std::make_shared(std::move(result), + TargetBitmap(real_batch_size, true)); +} + template VectorPtr PhyJsonContainsFilterExpr::ExecArrayContainsAll(OffsetVector* input) { @@ -456,29 +593,29 @@ PhyJsonContainsFilterExpr::ExecArrayContainsAll(OffsetVector* input) { TargetBitmapView res, TargetBitmapView valid_res, const std::unordered_set& elements) { - auto executor = [&](size_t i) { - std::unordered_set tmp_elements(elements); - // Note: array can only be iterated once - for (int j = 0; j < data[i].length(); ++j) { - tmp_elements.erase(data[i].template get_data(j)); - if (tmp_elements.size() == 0) { - return true; + auto executor = [&](size_t i) { + std::unordered_set tmp_elements(elements); + // Note: array can only be iterated once + for (int j = 0; j < data[i].length(); ++j) { + tmp_elements.erase(data[i].template get_data(j)); + if (tmp_elements.size() == 0) { + return true; + } } + return tmp_elements.size() == 0; + }; + for (int i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? 
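// The *Array variants match whole sub-arrays instead of scalars: a query
// array hits only if some element of the candidate array is itself an array
// with equal values in equal order (the decision CompareTwoJsonArray makes).
// Nested vectors stand in for simdjson arrays / proto::plan::Array here:
#include <vector>

template <typename T>
bool contains_array(const std::vector<std::vector<T>>& candidate,
                    const std::vector<T>& query) {
    for (const auto& inner : candidate) {
        if (inner == query) {  // element-wise, order-sensitive equality
            return true;
        }
    }
    return false;
}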
offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); } - return tmp_elements.size() == 0; }; - for (int i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = executor(offset); - } - }; int64_t processed_size; if (has_offset_input_) { @@ -508,6 +645,11 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(OffsetVector* input) { std::conditional_t, std::string_view, ExprValueType>; + + FieldId field_id = expr_->column_.field_id_; + if (CanUseJsonKeyIndex(field_id)) { + return ExecJsonContainsAllByKeyIndex(); + } auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { @@ -536,38 +678,38 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::unordered_set& elements) { - auto executor = [&](const size_t i) -> bool { - auto doc = data[i].doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { - return false; - } - std::unordered_set tmp_elements(elements); - // Note: array can only be iterated once - for (auto&& it : array) { - auto val = it.template get(); - if (val.error()) { - continue; + auto executor = [&](const size_t i) -> bool { + auto doc = data[i].doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; } - tmp_elements.erase(val.value()); - if (tmp_elements.size() == 0) { - return true; + std::unordered_set tmp_elements(elements); + // Note: array can only be iterated once + for (auto&& it : array) { + auto val = it.template get(); + if (val.error()) { + continue; + } + tmp_elements.erase(val.value()); + if (tmp_elements.size() == 0) { + return true; + } + } + return tmp_elements.size() == 0; + }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; } + res[i] = executor(offset); } - return tmp_elements.size() == 0; }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = executor(offset); - } - }; int64_t processed_size; if (has_offset_input_) { @@ -594,9 +736,79 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(OffsetVector* input) { return res_vec; } +template +VectorPtr +PhyJsonContainsFilterExpr::ExecJsonContainsAllByKeyIndex() { + using GetType = + std::conditional_t, + std::string_view, + ExprValueType>; + Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ + ? 
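// json_contains_all drains a scratch copy of the query set instead of
// counting matches, because the ondemand simdjson array can only be iterated
// once. The same pattern in isolation:
#include <unordered_set>
#include <vector>

template <typename T>
bool contains_all(const std::vector<T>& array_values,
                  std::unordered_set<T> remaining) {  // by value: scratch copy
    for (const auto& v : array_values) {
        remaining.erase(v);
        if (remaining.empty()) {
            return true;  // every query element has been seen
        }
    }
    return remaining.empty();
}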
active_count_ - current_data_chunk_pos_ + : batch_size_; + std::unordered_set elements; + auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); + for (auto const& element : expr_->vals_) { + elements.insert(GetValueFromProto(element)); + } + if (elements.empty()) { + MoveCursor(); + return std::make_shared( + TargetBitmap(real_batch_size, false), + TargetBitmap(real_batch_size, true)); + } + if (cached_index_chunk_id_ != 0) { + const auto* sealed_seg = + dynamic_cast(segment_); + auto field_id = expr_->column_.field_id_; + auto* index = sealed_seg->GetJsonKeyIndex(field_id); + Assert(index != nullptr); + auto filter_func = [sealed_seg, &elements, &field_id](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto json_pair = sealed_seg->GetJsonData(field_id, row_id); + if (!json_pair.second) { + return false; + } + auto json = + milvus::Json(json_pair.first.data(), json_pair.first.size()); + auto array = json.array_at(offset, size); + if (array.error()) { + return false; + } + std::unordered_set tmp_elements(elements); + for (auto&& it : array) { + auto val = it.template get(); + if (val.error()) { + continue; + } + tmp_elements.erase(val.value()); + if (tmp_elements.size() == 0) { + return true; + } + } + return tmp_elements.empty(); + }; + cached_index_chunk_res_ = + index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + cached_index_chunk_id_ = 0; + } + TargetBitmap result; + result.append( + cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size); + current_data_chunk_pos_ += real_batch_size; + return std::make_shared(std::move(result), + TargetBitmap(real_batch_size, true)); +} + VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType( OffsetVector* input) { + FieldId field_id = expr_->column_.field_id_; + if (CanUseJsonKeyIndex(field_id)) { + return ExecJsonContainsAllWithDiffTypeByKeyIndex(); + } auto real_batch_size = has_offset_input_ ? 
input->size() : GetNextBatchSize(); if (real_batch_size == 0) { @@ -629,14 +841,168 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType( const std::string& pointer, const std::vector& elements, const std::unordered_set elements_index) { - auto executor = [&](size_t i) -> bool { - const auto& json = data[i]; - auto doc = json.doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { + auto executor = [&](size_t i) -> bool { + const auto& json = data[i]; + auto doc = json.doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; + } + std::unordered_set tmp_elements_index(elements_index); + for (auto&& it : array) { + int i = -1; + for (auto& element : elements) { + i++; + switch (element.val_case()) { + case proto::plan::GenericValue::kBoolVal: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.bool_val()) { + tmp_elements_index.erase(i); + } + break; + } + case proto::plan::GenericValue::kInt64Val: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.int64_val()) { + tmp_elements_index.erase(i); + } + break; + } + case proto::plan::GenericValue::kFloatVal: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.float_val()) { + tmp_elements_index.erase(i); + } + break; + } + case proto::plan::GenericValue::kStringVal: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.string_val()) { + tmp_elements_index.erase(i); + } + break; + } + case proto::plan::GenericValue::kArrayVal: { + auto val = it.get_array(); + if (val.error()) { + continue; + } + if (CompareTwoJsonArray(val, + element.array_val())) { + tmp_elements_index.erase(i); + } + break; + } + default: + PanicInfo( + DataTypeInvalid, + fmt::format("unsupported data type {}", + element.val_case())); + } + if (tmp_elements_index.size() == 0) { + return true; + } + } + if (tmp_elements_index.size() == 0) { + return true; + } + } + return tmp_elements_index.size() == 0; + }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; + + int64_t processed_size; + if (has_offset_input_) { + processed_size = ProcessDataByOffsets(execute_sub_batch, + std::nullptr_t{}, + input, + res, + valid_res, + pointer, + elements, + elements_index); + } else { + processed_size = ProcessDataChunks(execute_sub_batch, + std::nullptr_t{}, + res, + valid_res, + pointer, + elements, + elements_index); + } + AssertInfo(processed_size == real_batch_size, + "internal error: expr processed rows {} not equal " + "expect batch size {}", + processed_size, + real_batch_size); + return res_vec; +} + +VectorPtr +PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffTypeByKeyIndex() { + Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ + ? 
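// The WithDiffType variants carry a heterogeneous query list and match each
// array element by switching on GenericValue::val_case(). A compact
// std::variant rendering of the same dispatch (the real code instead parses
// the simdjson element as the query element's type before comparing):
#include <cstddef>
#include <cstdint>
#include <string>
#include <unordered_set>
#include <variant>
#include <vector>

using Element = std::variant<bool, int64_t, double, std::string>;

bool contains_all_diff_type(const std::vector<Element>& array_values,
                            const std::vector<Element>& queries) {
    std::unordered_set<size_t> remaining;  // indices of unmatched queries
    for (size_t i = 0; i < queries.size(); ++i) remaining.insert(i);
    for (const auto& v : array_values) {
        for (size_t i = 0; i < queries.size(); ++i) {
            if (v == queries[i]) remaining.erase(i);  // type and value agree
        }
        if (remaining.empty()) return true;
    }
    return remaining.empty();
}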
active_count_ - current_data_chunk_pos_
+                                   : batch_size_;
+    auto pointer = milvus::Json::pointer(expr_->column_.nested_path_);
+    auto elements = expr_->vals_;
+    std::unordered_set elements_index;
+    int i = 0;
+    for (auto& element : elements) {
+        elements_index.insert(i);
+        i++;
+    }
+    if (elements.empty()) {
+        MoveCursor();
+        return std::make_shared(
+            TargetBitmap(real_batch_size, false),
+            TargetBitmap(real_batch_size, true));
+    }
+    if (cached_index_chunk_id_ != 0) {
+        const auto* sealed_seg =
+            dynamic_cast(segment_);
+        auto field_id = expr_->column_.field_id_;
+        auto* index = sealed_seg->GetJsonKeyIndex(field_id);
+        Assert(index != nullptr);
+        auto filter_func = [sealed_seg, &elements, &elements_index, &field_id](
+                               uint32_t row_id,
+                               uint16_t offset,
+                               uint16_t size) {
+            auto json_pair = sealed_seg->GetJsonData(field_id, row_id);
+            if (!json_pair.second) {
                return false;
            }
+            auto json =
+                milvus::Json(json_pair.first.data(), json_pair.first.size());
            std::unordered_set tmp_elements_index(elements_index);
+            auto array = json.array_at(offset, size);
+            if (array.error()) {
+                return false;
+            }
            for (auto&& it : array) {
                int i = -1;
                for (auto& element : elements) {
                    i++;
                    switch (element.val_case()) {
@@ -707,48 +1073,24 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType(
             }
             return tmp_elements_index.size() == 0;
         };
-        for (size_t i = 0; i < size; ++i) {
-            auto offset = i;
-            if constexpr (filter_type == FilterType::random) {
-                offset = (offsets) ? offsets[i] : i;
-            }
-            if (valid_data != nullptr && !valid_data[offset]) {
-                res[i] = valid_res[i] = false;
-                continue;
-            }
-            res[i] = executor(offset);
-        }
-    };
-
-    int64_t processed_size;
-    if (has_offset_input_) {
-        processed_size = ProcessDataByOffsets(execute_sub_batch,
-                                              std::nullptr_t{},
-                                              input,
-                                              res,
-                                              valid_res,
-                                              pointer,
-                                              elements,
-                                              elements_index);
-    } else {
-        processed_size = ProcessDataChunks(execute_sub_batch,
-                                           std::nullptr_t{},
-                                           res,
-                                           valid_res,
-                                           pointer,
-                                           elements,
-                                           elements_index);
+        cached_index_chunk_res_ =
+            index->FilterByPath(pointer, real_batch_size, filter_func).clone();
+        cached_index_chunk_id_ = 0;
     }
-    AssertInfo(processed_size == real_batch_size,
-               "internal error: expr processed rows {} not equal "
-               "expect batch size {}",
-               processed_size,
-               real_batch_size);
-    return res_vec;
+    TargetBitmap result;
+    result.append(
+        cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size);
+    current_data_chunk_pos_ += real_batch_size;
+    return std::make_shared(std::move(result),
+                                          TargetBitmap(real_batch_size, true));
 }
 
 VectorPtr
 PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(OffsetVector* input) {
+    FieldId field_id = expr_->column_.field_id_;
+    if (CanUseJsonKeyIndex(field_id)) {
+        return ExecJsonContainsAllArrayByKeyIndex();
+    }
     auto real_batch_size = has_offset_input_ ?
input->size() : GetNextBatchSize(); if (real_batch_size == 0) { @@ -777,48 +1119,48 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements) { - auto executor = [&](const size_t i) { - auto doc = data[i].doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { - return false; - } - std::unordered_set exist_elements_index; - for (auto&& it : array) { - auto val = it.get_array(); - if (val.error()) { - continue; + auto executor = [&](const size_t i) { + auto doc = data[i].doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; } - std::vector< - simdjson::simdjson_result> - json_array; - json_array.reserve(val.count_elements()); - for (auto&& e : val) { - json_array.emplace_back(e); - } - for (int index = 0; index < elements.size(); ++index) { - if (CompareTwoJsonArray(json_array, elements[index])) { - exist_elements_index.insert(index); + std::unordered_set exist_elements_index; + for (auto&& it : array) { + auto val = it.get_array(); + if (val.error()) { + continue; + } + std::vector< + simdjson::simdjson_result> + json_array; + json_array.reserve(val.count_elements()); + for (auto&& e : val) { + json_array.emplace_back(e); + } + for (int index = 0; index < elements.size(); ++index) { + if (CompareTwoJsonArray(json_array, elements[index])) { + exist_elements_index.insert(index); + } + } + if (exist_elements_index.size() == elements.size()) { + return true; } } - if (exist_elements_index.size() == elements.size()) { - return true; + return exist_elements_index.size() == elements.size(); + }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; } + res[i] = executor(offset); } - return exist_elements_index.size() == elements.size(); }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = executor(offset); - } - }; int64_t processed_size; if (has_offset_input_) { @@ -845,8 +1187,77 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(OffsetVector* input) { return res_vec; } +VectorPtr +PhyJsonContainsFilterExpr::ExecJsonContainsAllArrayByKeyIndex() { + Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ + ? 
active_count_ - current_data_chunk_pos_ + : batch_size_; + auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); + std::vector elements; + for (auto const& element : expr_->vals_) { + elements.emplace_back(GetValueFromProto(element)); + } + if (elements.empty()) { + MoveCursor(); + return std::make_shared( + TargetBitmap(real_batch_size, false), + TargetBitmap(real_batch_size, true)); + } + if (cached_index_chunk_id_ != 0) { + const auto* sealed_seg = + dynamic_cast(segment_); + auto field_id = expr_->column_.field_id_; + auto* index = sealed_seg->GetJsonKeyIndex(field_id); + Assert(index != nullptr); + auto filter_func = [sealed_seg, &elements, &field_id](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto json_pair = sealed_seg->GetJsonData(field_id, row_id); + if (!json_pair.second) { + return false; + } + auto json = + milvus::Json(json_pair.first.data(), json_pair.first.size()); + auto array = json.array_at(offset, size); + if (array.error()) { + return false; + } + std::unordered_set exist_elements_index; + for (auto&& it : array) { + auto json_array = it.get_array(); + if (json_array.error()) { + continue; + } + for (int index = 0; index < elements.size(); ++index) { + if (CompareTwoJsonArray(json_array, elements[index])) { + exist_elements_index.insert(index); + } + } + if (exist_elements_index.size() == elements.size()) { + return true; + } + } + return exist_elements_index.size() == elements.size(); + }; + cached_index_chunk_res_ = + index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + cached_index_chunk_id_ = 0; + } + TargetBitmap result; + result.append( + cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size); + current_data_chunk_pos_ += real_batch_size; + return std::make_shared(std::move(result), + TargetBitmap(real_batch_size, true)); +} + VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(OffsetVector* input) { + FieldId field_id = expr_->column_.field_id_; + if (CanUseJsonKeyIndex(field_id)) { + return ExecJsonContainsWithDiffTypeByKeyIndex(); + } auto real_batch_size = has_offset_input_ ? 
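// contains_all over sub-arrays tracks which query arrays have been seen via
// an index set (exist_elements_index above), since one candidate element may
// match several queries. Minimal equivalent:
#include <cstddef>
#include <unordered_set>
#include <vector>

template <typename T>
bool contains_all_arrays(const std::vector<std::vector<T>>& candidate,
                         const std::vector<std::vector<T>>& queries) {
    std::unordered_set<size_t> seen;
    for (const auto& inner : candidate) {
        for (size_t i = 0; i < queries.size(); ++i) {
            if (inner == queries[i]) {
                seen.insert(i);
            }
        }
        if (seen.size() == queries.size()) {
            return true;  // all query arrays found; stop early
        }
    }
    return seen.size() == queries.size();
}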
input->size() : GetNextBatchSize(); if (real_batch_size == 0) { @@ -879,10 +1290,146 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements) { - auto executor = [&](const size_t i) { - auto& json = data[i]; - auto doc = json.doc(); - auto array = doc.at_pointer(pointer).get_array(); + auto executor = [&](const size_t i) { + auto& json = data[i]; + auto doc = json.doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; + } + // Note: array can only be iterated once + for (auto&& it : array) { + for (auto const& element : elements) { + switch (element.val_case()) { + case proto::plan::GenericValue::kBoolVal: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.bool_val()) { + return true; + } + break; + } + case proto::plan::GenericValue::kInt64Val: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.int64_val()) { + return true; + } + break; + } + case proto::plan::GenericValue::kFloatVal: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.float_val()) { + return true; + } + break; + } + case proto::plan::GenericValue::kStringVal: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.string_val()) { + return true; + } + break; + } + case proto::plan::GenericValue::kArrayVal: { + auto val = it.get_array(); + if (val.error()) { + continue; + } + if (CompareTwoJsonArray(val, + element.array_val())) { + return true; + } + break; + } + default: + PanicInfo( + DataTypeInvalid, + fmt::format("unsupported data type {}", + element.val_case())); + } + } + } + return false; + }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; + + int64_t processed_size; + if (has_offset_input_) { + processed_size = ProcessDataByOffsets(execute_sub_batch, + std::nullptr_t{}, + input, + res, + valid_res, + pointer, + elements); + } else { + processed_size = ProcessDataChunks(execute_sub_batch, + std::nullptr_t{}, + res, + valid_res, + pointer, + elements); + } + AssertInfo(processed_size == real_batch_size, + "internal error: expr processed rows {} not equal " + "expect batch size {}", + processed_size, + real_batch_size); + return res_vec; +} + +VectorPtr +PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffTypeByKeyIndex() { + Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ + ? 
active_count_ - current_data_chunk_pos_ + : batch_size_; + auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); + auto elements = expr_->vals_; + if (elements.empty()) { + MoveCursor(); + return std::make_shared( + TargetBitmap(real_batch_size, false), + TargetBitmap(real_batch_size, true)); + } + if (cached_index_chunk_id_ != 0) { + const auto* sealed_seg = + dynamic_cast(segment_); + auto field_id = expr_->column_.field_id_; + auto* index = sealed_seg->GetJsonKeyIndex(field_id); + Assert(index != nullptr); + auto filter_func = [sealed_seg, &elements, &field_id](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto json_pair = sealed_seg->GetJsonData(field_id, row_id); + if (!json_pair.second) { + return false; + } + auto json = + milvus::Json(json_pair.first.data(), json_pair.first.size()); + auto array = json.array_at(offset, size); if (array.error()) { return false; } @@ -949,42 +1496,16 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(OffsetVector* input) { } return false; }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = executor(offset); - } - }; - - int64_t processed_size; - if (has_offset_input_) { - processed_size = ProcessDataByOffsets(execute_sub_batch, - std::nullptr_t{}, - input, - res, - valid_res, - pointer, - elements); - } else { - processed_size = ProcessDataChunks(execute_sub_batch, - std::nullptr_t{}, - res, - valid_res, - pointer, - elements); + cached_index_chunk_res_ = + index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + cached_index_chunk_id_ = 0; } - AssertInfo(processed_size == real_batch_size, - "internal error: expr processed rows {} not equal " - "expect batch size {}", - processed_size, - real_batch_size); - return res_vec; + TargetBitmap result; + result.append( + cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size); + current_data_chunk_pos_ += real_batch_size; + return std::make_shared(std::move(result), + TargetBitmap(real_batch_size, true)); } VectorPtr diff --git a/internal/core/src/exec/expression/JsonContainsExpr.h b/internal/core/src/exec/expression/JsonContainsExpr.h index a0c8848cba188..71c2bf780f0d6 100644 --- a/internal/core/src/exec/expression/JsonContainsExpr.h +++ b/internal/core/src/exec/expression/JsonContainsExpr.h @@ -56,6 +56,10 @@ class PhyJsonContainsFilterExpr : public SegmentExpr { VectorPtr ExecJsonContains(OffsetVector* input = nullptr); + template + VectorPtr + ExecJsonContainsByKeyIndex(); + template VectorPtr ExecArrayContains(OffsetVector* input = nullptr); @@ -64,6 +68,10 @@ class PhyJsonContainsFilterExpr : public SegmentExpr { VectorPtr ExecJsonContainsAll(OffsetVector* input = nullptr); + template + VectorPtr + ExecJsonContainsAllByKeyIndex(); + template VectorPtr ExecArrayContainsAll(OffsetVector* input = nullptr); @@ -71,15 +79,27 @@ class PhyJsonContainsFilterExpr : public SegmentExpr { VectorPtr ExecJsonContainsArray(OffsetVector* input = nullptr); + VectorPtr + ExecJsonContainsArrayByKeyIndex(); + VectorPtr ExecJsonContainsAllArray(OffsetVector* input = nullptr); + VectorPtr + ExecJsonContainsAllArrayByKeyIndex(); + VectorPtr ExecJsonContainsAllWithDiffType(OffsetVector* input = nullptr); + VectorPtr + ExecJsonContainsAllWithDiffTypeByKeyIndex(); + VectorPtr ExecJsonContainsWithDiffType(OffsetVector* input = nullptr); + VectorPtr + 
ExecJsonContainsWithDiffTypeByKeyIndex(); + VectorPtr EvalArrayContainsForIndexSegment(); diff --git a/internal/core/src/exec/expression/TermExpr.cpp b/internal/core/src/exec/expression/TermExpr.cpp index 3c0e812fb42b3..4682442bef02a 100644 --- a/internal/core/src/exec/expression/TermExpr.cpp +++ b/internal/core/src/exec/expression/TermExpr.cpp @@ -559,7 +559,7 @@ PhyTermFilterExpr::ExecJsonInVariableByKeyIndex() { return term_set.find(ValueType(val.value())) != term_set.end(); }; cached_index_chunk_res_ = - index->FilterByPath(pointer, filter_func).clone(); + index->FilterByPath(pointer, real_batch_size, filter_func).clone(); cached_index_chunk_id_ = 0; } diff --git a/internal/core/src/exec/expression/UnaryExpr.cpp b/internal/core/src/exec/expression/UnaryExpr.cpp index d3453d1f66f69..88e2cac622b41 100644 --- a/internal/core/src/exec/expression/UnaryExpr.cpp +++ b/internal/core/src/exec/expression/UnaryExpr.cpp @@ -276,144 +276,145 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray(OffsetVector* input) { if (expr_->column_.nested_path_.size() > 0) { index = std::stoi(expr_->column_.nested_path_[0]); } - auto execute_sub_batch = [op_type]( - const milvus::ArrayView* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - ValueType val, - int index) { - switch (op_type) { - case proto::plan::GreaterThan: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::GreaterEqual: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::LessThan: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::LessEqual: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::Equal: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::NotEqual: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::PrefixMatch: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::Match: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; + auto execute_sub_batch = + [op_type]( + const milvus::ArrayView* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + ValueType val, + int index) { + switch (op_type) { + case proto::plan::GreaterThan: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::GreaterEqual: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::LessThan: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::LessEqual: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + 
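// Note on the TermExpr change above: FilterByPath now takes the row count of
// the current batch. Previously the bitmap was sized with Count() (the number
// of entries in the inverted index), which need not equal the number of rows
// a batch covers; sizing it by the caller-supplied row count keeps the
// returned bitmap aligned with the segment offsets the expression framework
// appends from.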
valid_res, + offsets); + break; + } + case proto::plan::Equal: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::NotEqual: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::PrefixMatch: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::Match: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + default: + PanicInfo( + OpTypeInvalid, + fmt::format( + "unsupported operator type for unary expr: {}", + op_type)); } - default: - PanicInfo( - OpTypeInvalid, - fmt::format("unsupported operator type for unary expr: {}", - op_type)); - } - }; + }; int64_t processed_size; if (has_offset_input_) { processed_size = @@ -479,7 +480,7 @@ PhyUnaryRangeFilterExpr::ExecArrayEqualForIndex(bool reverse) { }; } else { auto size_per_chunk = segment_->size_per_chunk(); - retrieve = [ size_per_chunk, this ](int64_t offset) -> auto { + retrieve = [size_per_chunk, this](int64_t offset) -> auto { auto chunk_idx = offset / size_per_chunk; auto chunk_offset = offset % size_per_chunk; const auto& chunk = @@ -553,6 +554,12 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(OffsetVector* input) { std::conditional_t, std::string_view, ExprValueType>; + + FieldId field_id = expr_->column_.field_id_; + if (CanUseJsonKeyIndex(field_id)) { + return ExecRangeVisitorImplJsonForIndex(); + } + auto real_batch_size = has_offset_input_ ? input->size() : GetNextBatchSize(); if (real_batch_size == 0) { @@ -598,15 +605,15 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(OffsetVector* input) { res[i] = (cmp); \ } while (false) - auto execute_sub_batch = - [ op_type, pointer ]( - const milvus::Json* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - ExprValueType val) { + auto execute_sub_batch = [op_type, pointer]( + const milvus::Json* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + ExprValueType val) { switch (op_type) { case proto::plan::GreaterThan: { for (size_t i = 0; i < size; ++i) { @@ -793,6 +800,144 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(OffsetVector* input) { return res_vec; } +template +VectorPtr +PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonForIndex() { + using GetType = + std::conditional_t, + std::string_view, + ExprValueType>; + Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ + ? 
active_count_ - current_data_chunk_pos_ + : batch_size_; + auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); + ExprValueType val = GetValueFromProto(expr_->val_); + auto op_type = expr_->op_type_; + if (cached_index_chunk_id_ != 0) { + const auto* sealed_seg = + dynamic_cast(segment_); + auto field_id = expr_->column_.field_id_; + auto* index = sealed_seg->GetJsonKeyIndex(field_id); + Assert(index != nullptr); + auto filter_func = [sealed_seg, field_id, op_type, val](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto json_pair = sealed_seg->GetJsonData(field_id, row_id); + if (!json_pair.second) { + return false; + } + auto json = + milvus::Json(json_pair.first.data(), json_pair.first.size()); + switch (op_type) { + case proto::plan::GreaterThan: + if constexpr (std::is_same_v) { + return false; + } else { + auto x = json.at(offset, size); + if (x.error()) { + return false; + } + return ExprValueType(x.value()) > val; + } + case proto::plan::GreaterEqual: + if constexpr (std::is_same_v) { + return false; + } else { + auto x = json.at(offset, size); + if (x.error()) { + return false; + } + return ExprValueType(x.value()) >= val; + } + case proto::plan::LessThan: + if constexpr (std::is_same_v) { + return false; + } else { + auto x = json.at(offset, size); + if (x.error()) { + return false; + } + return ExprValueType(x.value()) < val; + } + case proto::plan::LessEqual: + if constexpr (std::is_same_v) { + return false; + } else { + auto x = json.at(offset, size); + if (x.error()) { + return false; + } + return ExprValueType(x.value()) <= val; + } + case proto::plan::Equal: + if constexpr (std::is_same_v) { + auto array = json.array_at(offset, size); + if (array.error()) { + return false; + } + return CompareTwoJsonArray(array.value(), val); + } else { + auto x = json.at(offset, size); + if (x.error()) { + return false; + } + return ExprValueType(x.value()) == val; + } + case proto::plan::NotEqual: + if constexpr (std::is_same_v) { + auto array = json.array_at(offset, size); + if (array.error()) { + return false; + } + return !CompareTwoJsonArray(array.value(), val); + } else { + auto x = json.at(offset, size); + if (x.error()) { + return false; + } + return ExprValueType(x.value()) != val; + } + case proto::plan::PrefixMatch: + if constexpr (std::is_same_v) { + return false; + } else { + auto x = json.at(offset, size); + if (x.error()) { + return false; + } + return milvus::query::Match( + ExprValueType(x.value()), val, op_type); + } + case proto::plan::Match: + if constexpr (std::is_same_v) { + return false; + } else { + auto x = json.at(offset, size); + if (x.error()) { + return false; + } + PatternMatchTranslator translator; + auto regex_pattern = translator(val); + RegexMatcher matcher(regex_pattern); + return matcher(ExprValueType(x.value())); + } + default: + return false; + } + }; + cached_index_chunk_res_ = + index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + cached_index_chunk_id_ = 0; + } + TargetBitmap result; + result.append( + cached_index_chunk_res_, current_data_chunk_pos_, real_batch_size); + current_data_chunk_pos_ += real_batch_size; + return std::make_shared(std::move(result), + TargetBitmap(real_batch_size, true)); +} + template VectorPtr PhyUnaryRangeFilterExpr::ExecRangeVisitorImpl(OffsetVector* input) { @@ -978,13 +1123,13 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData(OffsetVector* input) { auto execute_sub_batch = [expr_type]( - const T* data, - const bool* valid_data, - const int32_t* offsets, - const int size, 
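// The Match branch above funnels through PatternMatchTranslator/RegexMatcher.
// A self-contained approximation of that pipeline (the real translator also
// escapes regex metacharacters in the literal parts; assume none appear here):
#include <regex>
#include <string>

std::string like_to_regex(const std::string& pattern) {
    std::string out;
    for (char c : pattern) {
        if (c == '%') {
            out += ".*";  // LIKE-style '%' wildcard -> any run of characters
        } else {
            out += c;
        }
    }
    return out;
}

bool match(const std::string& value, const std::string& pattern) {
    return std::regex_match(value, std::regex(like_to_regex(pattern)));
}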
- TargetBitmapView res, - TargetBitmapView valid_res, - IndexInnerType val) { + const T* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + IndexInnerType val) { switch (expr_type) { case proto::plan::GreaterThan: { UnaryElementFunc func; diff --git a/internal/core/src/exec/expression/UnaryExpr.h b/internal/core/src/exec/expression/UnaryExpr.h index 159fe5abb4091..f4c368ab58f50 100644 --- a/internal/core/src/exec/expression/UnaryExpr.h +++ b/internal/core/src/exec/expression/UnaryExpr.h @@ -353,6 +353,10 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr { VectorPtr ExecRangeVisitorImplJson(OffsetVector* input = nullptr); + template + VectorPtr + ExecRangeVisitorImplJsonForIndex(); + template VectorPtr ExecRangeVisitorImplArray(OffsetVector* input = nullptr); diff --git a/internal/core/src/index/JsonKeyInvertedIndex.h b/internal/core/src/index/JsonKeyInvertedIndex.h index 90f8ba1f84db0..b686f63897f2e 100644 --- a/internal/core/src/index/JsonKeyInvertedIndex.h +++ b/internal/core/src/index/JsonKeyInvertedIndex.h @@ -39,8 +39,9 @@ class JsonKeyInvertedIndex : public InvertedIndexTantivy { const TargetBitmap FilterByPath(const std::string& path, + int32_t row, std::function filter) { - TargetBitmap bitset(Count()); + TargetBitmap bitset(row); auto array = wrapper_->term_query(path); LOG_DEBUG("json key filter size:{}", array.array_.len); diff --git a/internal/core/unittest/test_json_key_index.cpp b/internal/core/unittest/test_json_key_index.cpp index 137718eb2f794..c128c78909535 100644 --- a/internal/core/unittest/test_json_key_index.cpp +++ b/internal/core/unittest/test_json_key_index.cpp @@ -30,11 +30,23 @@ using namespace milvus::indexbuilder; using namespace milvus; using namespace milvus::index; +std::string +join(const std::vector& vec, const std::string& delimiter) { + std::ostringstream oss; + for (size_t i = 0; i < vec.size(); ++i) { + oss << vec[i]; + if (i != vec.size() - 1) { + oss << delimiter; + } + } + return oss.str(); +} + // 1000 keys static std::string GenerateJson(int N) { std::vector data(N); - std::default_random_engine er(67); + std::default_random_engine er(42); std::normal_distribution<> distr(0, 1); std::vector keys; for (int i = 0; i < N; i++) { @@ -43,33 +55,33 @@ GenerateJson(int N) { std::string json_string; std::vector values(N); for (int i = 0; i < N; i++) { - if (i % 7 == 0 || i % 7 == 4) { + if (i % 7 == 0) { values[i] = std::to_string(er()); - } else if (i % 7 == 1 || i % 7 == 5) { + } else if (i % 7 == 1) { values[i] = std::to_string(static_cast(er())); - } else if (i % 7 == 2 || i % 7 == 6) { + } else if (i % 7 == 2) { values[i] = er() / 2 == 0 ? 
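// For orientation: one document produced by GenerateJson(N) above has keys
// "keys0".."keys{N-1}", and the value type cycles with i % 7 through a
// double, an int64, a "true"/"false" literal, a string, an int array, a
// double array, and a string array (values vary with the seeded RNG; the
// shapes do not).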
"true" : "false"; } else if (i % 7 == 3) { values[i] = "\"xxxx" + std::to_string(i) + "\""; - // } else if (i % 7 == 4) { - // std::vector intvec(10); - // for (int j = 0; j < 10; j++) { - // intvec[j] = std::to_string(i + j); - // } - // values[i] = "[" + join(intvec, ",") + "]"; - // } else if (i % 7 == 5) { - // std::vector doublevec(10); - // for (int j = 0; j < 10; j++) { - // doublevec[j] = - // std::to_string(static_cast(i + j + er())); - // } - // values[i] = "[" + join(doublevec, ",") + "]"; - // } else if (i % 7 == 6) { - // std::vector stringvec(10); - // for (int j = 0; j < 10; j++) { - // stringvec[j] = "\"xxx" + std::to_string(j) + "\""; - // } - // values[i] = "[" + join(stringvec, ",") + "]"; + } else if (i % 7 == 4) { + std::vector intvec(10); + for (int j = 0; j < 10; j++) { + intvec[j] = std::to_string(i + j); + } + values[i] = "[" + join(intvec, ",") + "]"; + } else if (i % 7 == 5) { + std::vector doublevec(10); + for (int j = 0; j < 10; j++) { + doublevec[j] = + std::to_string(static_cast(i + j + er())); + } + values[i] = "[" + join(doublevec, ",") + "]"; + } else if (i % 7 == 6) { + std::vector stringvec(10); + for (int j = 0; j < 10; j++) { + stringvec[j] = "\"xxx" + std::to_string(j) + "\""; + } + values[i] = "[" + join(stringvec, ",") + "]"; } } json_string += "{"; @@ -85,6 +97,7 @@ static std::vector GenerateJsons(int size, int dim) { std::vector jsons; for (int i = 0; i < size; ++i) { + std::cout << GenerateJson(dim) << std::endl; jsons.push_back( milvus::Json(simdjson::padded_string(GenerateJson(dim)))); } @@ -99,7 +112,9 @@ class JsonKeyIndexTest : public testing::Test { int64_t segment_id, int64_t field_id, int64_t index_build_id, - int64_t index_version) { + int64_t index_version, + int64_t size, + int64_t dim) { proto::schema::FieldSchema field_schema; field_schema.set_data_type(proto::schema::DataType::JSON); @@ -108,7 +123,7 @@ class JsonKeyIndexTest : public testing::Test { auto index_meta = storage::IndexMeta{ segment_id, field_id, index_build_id, index_version}; - data_ = std::move(GenerateJsons(10000, 100)); + data_ = std::move(GenerateJsons(size, dim)); auto field_data = storage::CreateFieldData(DataType::JSON); field_data->FillFieldData(data_.data(), data_.size()); storage::InsertData insert_data(field_data); @@ -162,6 +177,8 @@ class JsonKeyIndexTest : public testing::Test { int64_t field_id = 101; int64_t index_build_id = 1000; int64_t index_version = 10000; + size_ = 10; + dim_ = 10; std::string root_path = "/tmp/test-jsonkey-index/"; storage::StorageConfig storage_config; @@ -174,7 +191,9 @@ class JsonKeyIndexTest : public testing::Test { segment_id, field_id, index_build_id, - index_version); + index_version, + size_, + dim_); } virtual ~JsonKeyIndexTest() override { @@ -184,39 +203,231 @@ class JsonKeyIndexTest : public testing::Test { public: void TestTermInFunc() { - std::set term_set = {"xxxxx"}; - auto filter_func = [&term_set, this](uint32_t row_id, - uint16_t offset, - uint16_t size) { - //std::cout << row_id << " " << offset << " " << size << std::endl; - - auto val = this->data_[row_id].template at_pos( - offset, size); - if (val.second != "") { - //std::cout << val.error() << std::endl; - return false; + { + std::vector> testcases{{"705894"}}; + for (auto testcase : testcases) { + auto check = [&](std::string value) { + std::unordered_set term_set(testcase.begin(), + testcase.end()); + return term_set.find(value) != term_set.end(); + }; + std::unordered_set term_set(testcase.begin(), + testcase.end()); + auto filter_func = [&term_set, 
@@ -184,39 +203,231 @@ class JsonKeyIndexTest : public testing::Test {
 public:
    void
    TestTermInFunc() {
-        std::set<std::string> term_set = {"xxxxx"};
-        auto filter_func = [&term_set, this](uint32_t row_id,
-                                             uint16_t offset,
-                                             uint16_t size) {
-            //std::cout << row_id << " " << offset << " " << size << std::endl;
-
-            auto val = this->data_[row_id].template at_pos<std::string_view>(
-                offset, size);
-            if (val.second != "") {
-                //std::cout << val.error() << std::endl;
-                return false;
+        {
+            std::vector<std::vector<std::string>> testcases{{"705894"}};
+            for (auto testcase : testcases) {
+                auto check = [&](std::string value) {
+                    std::unordered_set<std::string> term_set(testcase.begin(),
+                                                             testcase.end());
+                    return term_set.find(value) != term_set.end();
+                };
+                std::unordered_set<std::string> term_set(testcase.begin(),
+                                                         testcase.end());
+                auto filter_func = [&term_set, this](uint32_t row_id,
+                                                     uint16_t offset,
+                                                     uint16_t size) {
+                    auto val =
+                        this->data_[row_id].template at_pos<std::string_view>(
+                            offset, size);
+                    if (val.second != "") {
+                        return false;
+                    }
+                    return term_set.find((std::string(val.first))) !=
+                           term_set.end();
+                };
+                auto bitset =
+                    index_->FilterByPath("/keys0", size_, filter_func);
+
+                ASSERT_EQ(bitset.size(), size_);
+                for (int i = 0; i < bitset.size(); ++i) {
+                    auto ans = bitset[i];
+                    auto ref = check("705894");
+                    ASSERT_EQ(ans, ref);
+                }
+            }
+        }
+        {
+            std::vector<std::string> testcases{"true"};
+            for (auto& value : testcases) {
+                auto filter_func = [this, &value](uint32_t row_id,
+                                                  uint16_t offset,
+                                                  uint16_t size) {
+                    auto val =
+                        this->data_[row_id].template at_pos<std::string_view>(
+                            offset, size);
+                    if (val.second != "") {
+                        return false;
+                    }
+                    return std::string(val.first) == value;
+                };
+
+                auto bitset =
+                    index_->FilterByPath("/keys2", size_, filter_func);
+                ASSERT_EQ(bitset.size(), size_);
+                for (int i = 0; i < bitset.size(); ++i) {
+                    auto ans = bitset[i];
+                    auto ref = (value == "false");
+                    ASSERT_EQ(ans, ref);
+                }
+            }
+        }
+    }
+    void
+    TestUnaryRangeInFunc() {
+        std::vector<std::string> testcases{"10", "705894", "805894"};
+        std::vector<OpType> ops{
+            OpType::Equal,
+            OpType::NotEqual,
+            OpType::GreaterThan,
+            OpType::GreaterEqual,
+            OpType::LessThan,
+            OpType::LessEqual,
+        };
+        for (const auto& testcase : testcases) {
+            auto check = [&](std::string value) { return value == testcase; };
+            std::function<bool(std::string)> f = check;
+            for (auto& op : ops) {
+                switch (op) {
+                    case OpType::Equal: {
+                        f = [&](std::string value) {
+                            return value == testcase;
+                        };
+                        break;
+                    }
+                    case OpType::NotEqual: {
+                        f = [&](std::string value) {
+                            return value != testcase;
+                        };
+                        break;
+                    }
+                    case OpType::GreaterEqual: {
+                        f = [&](std::string value) {
+                            return value >= testcase;
+                        };
+                        break;
+                    }
+                    case OpType::GreaterThan: {
+                        f = [&](std::string value) { return value > testcase; };
+                        break;
+                    }
+                    case OpType::LessEqual: {
+                        f = [&](std::string value) {
+                            return value <= testcase;
+                        };
+                        break;
+                    }
+                    case OpType::LessThan: {
+                        f = [&](std::string value) { return value < testcase; };
+                        break;
+                    }
+                    default: {
+                        PanicInfo(Unsupported, "unsupported range node");
+                    }
+                }
+
+                auto filter_func = [&op, &testcase, this](uint32_t row_id,
+                                                          uint16_t offset,
+                                                          uint16_t size) {
+                    auto val =
+                        this->data_[row_id].template at_pos<std::string_view>(
+                            offset, size);
+                    if (val.second != "") {
+                        return false;
+                    }
+                    switch (op) {
+                        case OpType::GreaterThan:
+                            return std::string(val.first) > testcase;
+                        case OpType::GreaterEqual:
+                            return std::string(val.first) >= testcase;
+                        case OpType::LessThan:
+                            return std::string(val.first) < testcase;
+                        case OpType::LessEqual:
+                            return std::string(val.first) <= testcase;
+                        case OpType::Equal:
+                            return std::string(val.first) == testcase;
+                        case OpType::NotEqual:
+                            return std::string(val.first) != testcase;
+                        default:
+                            return false;
+                    }
+                };
+                auto bitset =
+                    index_->FilterByPath("/keys0", size_, filter_func);
+                ASSERT_EQ(bitset.size(), size_);
+                for (int i = 0; i < bitset.size(); ++i) {
+                    auto ans = bitset[i];
+                    auto ref = f("705894");
+                    ASSERT_EQ(ans, ref);
+                }
            }
-            return term_set.find((std::string(val.first))) != term_set.end();
+        }
+    }
+
+    void
+    TestBinaryRangeInFunc() {
+        struct Testcase {
+            bool lower_inclusive;
+            bool upper_inclusive;
+            std::string lower;
+            std::string upper;
+        };
+        std::vector<Testcase> testcases{
+            {true, false, "10", "20"},
+            {true, true, "20", "30"},
+            {false, true, "30", "40"},
+            {false, false, "40", "50"},
        };
-        index_->FilterByPath("/keys0", filter_func);
+        for (const auto& testcase : testcases) {
+            auto check = [&](std::string value) {
+                if (testcase.lower_inclusive && testcase.upper_inclusive) {
+                    return testcase.lower <= value && value <= testcase.upper;
+                } else if (testcase.lower_inclusive &&
+                           !testcase.upper_inclusive) {
+                    return testcase.lower <= value && value < testcase.upper;
+                } else if (!testcase.lower_inclusive &&
+                           testcase.upper_inclusive) {
+                    return testcase.lower < value && value <= testcase.upper;
+                } else {
+                    return testcase.lower < value && value < testcase.upper;
+                }
+            };
+
+            auto filter_func = [&testcase, this](uint32_t row_id,
+                                                 uint16_t offset,
+                                                 uint16_t size) {
+                auto val =
+                    this->data_[row_id].template at_pos<std::string_view>(
+                        offset, size);
+                if (val.second != "") {
+                    return false;
+                }
+                if (testcase.lower_inclusive && testcase.upper_inclusive) {
+                    return testcase.lower <= std::string(val.first) &&
+                           std::string(val.first) <= testcase.upper;
+                } else if (testcase.lower_inclusive &&
+                           !testcase.upper_inclusive) {
+                    return testcase.lower <= std::string(val.first) &&
+                           std::string(val.first) < testcase.upper;
+                } else if (!testcase.lower_inclusive &&
+                           testcase.upper_inclusive) {
+                    return testcase.lower < std::string(val.first) &&
+                           std::string(val.first) <= testcase.upper;
+                } else {
+                    return testcase.lower < std::string(val.first) &&
+                           std::string(val.first) < testcase.upper;
+                }
+            };
+            auto bitset = index_->FilterByPath("/keys7", size_, filter_func);
+            ASSERT_EQ(bitset.size(), size_);
+            for (int i = 0; i < bitset.size(); ++i) {
+                auto ans = bitset[i];
+                auto ref = check("970724117");
+                ASSERT_EQ(ans, ref);
+            }
+        }
+    }

 public:
     std::shared_ptr<JsonKeyInvertedIndex> index_;
     DataType type_;
-    size_t nb_;
+    size_t size_;
+    size_t dim_;
     std::vector<milvus::Json> data_;
     std::shared_ptr<storage::ChunkManager> chunk_manager_;
 };

 TEST_F(JsonKeyIndexTest, CountFuncTest) {
-    int all_cost = 0;
-    while (true) {
-        auto start = std::chrono::steady_clock::now();
-        TestTermInFunc();
-        all_cost += std::chrono::duration_cast<std::chrono::milliseconds>(
-                        std::chrono::steady_clock::now() - start)
-                        .count();
-        std::cout << "all_cost" << all_cost << std::endl;
-    }
+    TestTermInFunc();
+    TestUnaryRangeInFunc();
+    TestBinaryRangeInFunc();
 }
\ No newline at end of file
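
Both check and filter_func in TestBinaryRangeInFunc above branch four ways on (lower_inclusive, upper_inclusive). For readers, an equivalent compact form; InRange is a hypothetical helper shown only for clarity, not something the patch introduces:

    // Hypothetical helper: interval membership with configurable bounds.
    // Comparisons are lexicographic string comparisons, matching how the
    // test treats the raw JSON bytes.
    static bool InRange(const std::string& v,
                        const std::string& lo, bool lo_inclusive,
                        const std::string& hi, bool hi_inclusive) {
        bool lower_ok = lo_inclusive ? lo <= v : lo < v;
        bool upper_ok = hi_inclusive ? v <= hi : v < hi;
        return lower_ok && upper_ok;
    }
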
diff --git a/internal/indexnode/task_stats.go b/internal/indexnode/task_stats.go
index 92233de1e2a58..787fc32d6daa9 100644
--- a/internal/indexnode/task_stats.go
+++ b/internal/indexnode/task_stats.go
@@ -333,10 +333,16 @@ func (st *statsTask) Execute(ctx context.Context) error {
 			return err
 		}
 	} else if st.req.GetSubJobType() == indexpb.StatsSubJob_JsonKeyIndexJob {
-		err = st.createJsonKeyIndex(ctx, st.req.GetStorageConfig(), st.req.GetCollectionID(),
-			st.req.GetPartitionID(), st.req.GetTargetSegmentID(), st.req.GetTaskVersion(), st.req.GetTaskID(), insertLogs)
+		err = st.createJsonKeyIndex(ctx,
+			st.req.GetStorageConfig(),
+			st.req.GetCollectionID(),
+			st.req.GetPartitionID(),
+			st.req.GetTargetSegmentID(),
+			st.req.GetTaskVersion(),
+			st.req.GetTaskID(),
+			insertLogs)
 		if err != nil {
-			log.Warn("stats wrong, failed to create text index", zap.Error(err))
+			log.Warn("stats wrong, failed to create json index", zap.Error(err))
 			return err
 		}
 	}
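
The dominant fix in the next patch: every JSON-key-index expression evaluates the whole sealed segment once, caches the resulting bitmap, and serves later batches out of the cache, so FilterByPath must be sized by active_count_ (rows in the segment), not by the per-batch real_batch_size. A sketch of that caching shape, using member names taken from the expression classes in the diffs below; the per-batch slicing step is paraphrased in comments because it lies outside the diff context shown:

    // Evaluate once per segment, then reuse for every batch.
    if (cached_index_chunk_id_ != 0) {
        // filter_func(row_id, offset, size) decides a single row, so the
        // bitmap needs one bit per segment row -> active_count_.
        cached_index_chunk_res_ =
            index->FilterByPath(pointer, active_count_, filter_func).clone();
        cached_index_chunk_id_ = 0;
    }
    // Each Eval call then copies the window
    // [current_data_chunk_pos_, current_data_chunk_pos_ + real_batch_size)
    // out of cached_index_chunk_res_ into the batch result.

This is also why the single-chunk condition (num_data_chunk_ == 1) disappears from the Assert calls below: the cached bitmap now covers the whole segment regardless of how it is chunked.
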
From 623e3b67d67b8ae1496a5e95158dfc73770eff4d Mon Sep 17 00:00:00 2001
From: "Xianhui.Lin"
Date: Tue, 10 Dec 2024 09:55:30 +0800
Subject: [PATCH 3/5] support load json index after loadsegment

Signed-off-by: Xianhui.Lin

improve statschecker unittest

Signed-off-by: Xianhui.Lin

jsonindex expr code format

Signed-off-by: Xianhui.Lin

fix go format

Signed-off-by: Xianhui.Lin

fix controllerbasetest fail

Signed-off-by: Xianhui.Lin

fix jsonindex memory leak

Signed-off-by: Xianhui.Lin

fix jsonkey go format

Signed-off-by: Xianhui.Lin

fix jsonindex go code format

Signed-off-by: Xianhui.Lin

improve jsoninvert unittest

Signed-off-by: Xianhui.Lin

delete unused code

Signed-off-by: Xianhui.Lin

refine test_json_key_index

Signed-off-by: Xianhui.Lin

fix cpp unittest

Signed-off-by: Xianhui.Lin

delete log info

Signed-off-by: Xianhui.Lin

fix compile error

Signed-off-by: Xianhui.Lin

fix code format

Signed-off-by: Xianhui.Lin

fix createindex again hang up

Signed-off-by: Xianhui.Lin

fix unittest

Signed-off-by: Xianhui.Lin

fix createindex hang

Signed-off-by: Xianhui.Lin

fix triggerstatstask go ut

Signed-off-by: Xianhui.Lin

fix jsonindex filter error

Signed-off-by: Xianhui.Lin

fix jsonindex filter error

Signed-off-by: Xianhui.Lin

fix format

Signed-off-by: Xianhui.Lin

improve jsonkey unittest

Signed-off-by: Xianhui.Lin

remove sealsegment chunknum assert

Signed-off-by: Xianhui.Lin
---
 internal/core/src/common/Json.h               |  83 +--
 internal/core/src/common/jsmn.h               |  11 +
 .../src/exec/expression/BinaryRangeExpr.cpp   | 144 ++--
 .../core/src/exec/expression/ExistsExpr.cpp   |  26 +-
 .../src/exec/expression/JsonContainsExpr.cpp  | 641 +++++++++---------
 .../core/src/exec/expression/TermExpr.cpp     | 204 +++---
 .../core/src/exec/expression/UnaryExpr.cpp    | 381 ++++++-----
 internal/core/src/exec/expression/UnaryExpr.h |   2 +-
 .../core/src/index/JsonKeyInvertedIndex.cpp   |   8 +-
 .../core/src/index/JsonKeyInvertedIndex.h     |   2 +-
 internal/core/src/indexbuilder/index_c.cpp    |   4 +-
 internal/core/unittest/test_c_api.cpp         | 385 +----------
 .../core/unittest/test_json_key_index.cpp     | 624 +++++++++--------
 internal/datacoord/job_manager.go             |  13 +-
 internal/datacoord/job_manager_test.go        |   7 +-
 internal/indexnode/indexnode_service.go       |   2 +-
 internal/indexnode/task_stats.go              |  10 +-
 internal/indexnode/taskinfo_ops.go            |   9 +-
 internal/proto/query_coord.proto              |   2 +
 internal/querycoordv2/checkers/controller.go  |   3 +
 .../checkers/controller_base_test.go          |   2 +-
 .../querycoordv2/checkers/stats_checker.go    | 185 +++++
 .../checkers/stats_checker_test.go            | 280 ++++++++
 internal/querycoordv2/dist/dist_handler.go    |   1 +
 .../querycoordv2/meta/segment_dist_manager.go |   1 +
 internal/querycoordv2/ops_service_test.go     |   2 +-
 internal/querycoordv2/task/action.go          |  10 +-
 internal/querycoordv2/task/executor.go        |   5 +-
 internal/querycoordv2/task/scheduler.go       |  10 +-
 internal/querycoordv2/task/utils.go           |   6 +
 internal/querycoordv2/utils/checker.go        |   3 +
 internal/querynodev2/handlers.go              |  39 ++
 internal/querynodev2/segments/mock_loader.go  |  48 ++
 internal/querynodev2/segments/mock_segment.go |  47 ++
 internal/querynodev2/segments/segment.go      |  22 +-
 .../querynodev2/segments/segment_interface.go |   2 +
 internal/querynodev2/segments/segment_l0.go   |   4 +
 .../querynodev2/segments/segment_loader.go    |  39 +-
 internal/querynodev2/services.go              |   5 +
 internal/util/indexcgowrapper/index.go        |   2 +-
 pkg/util/typeutil/field_schema.go             |   7 +-
 41 files changed, 1793 insertions(+), 1488 deletions(-)
 create mode 100644 internal/querycoordv2/checkers/stats_checker.go
 create mode 100644 internal/querycoordv2/checkers/stats_checker_test.go

diff --git a/internal/core/src/common/Json.h b/internal/core/src/common/Json.h
index 55c736e05da1c..992d45646b64f 100644
--- a/internal/core/src/common/Json.h
+++ b/internal/core/src/common/Json.h
@@ -71,45 +71,6 @@ ExtractSubJson(const std::string& json, const std::vector<std::string>& keys) {
     return buffer.GetString();
 }
 
-inline std::pair<std::string, std::string>
-ParseTopLevelKey(const std::string& json_pointer, bool escaped = false) {
-    if (json_pointer.empty()) {
-        return {"", ""};
-    }
-
-    
Assert(json_pointer[0] == '/'); - size_t start = 1; - size_t end = json_pointer.find('/', start); - - std::string top_key = (end == std::string::npos) - ? json_pointer.substr(start) - : json_pointer.substr(start, end - start); - - if (escaped) { - if (top_key.find("~0") != std::string::npos) { - top_key.replace(top_key.find("~0"), 2, "~"); - } - if (top_key.find("~1") != std::string::npos) { - top_key.replace(top_key.find("~1"), 2, "/"); - } - } - - std::string remaining_path = - (end == std::string::npos) ? "" : json_pointer.substr(end); - - return {top_key, remaining_path}; -} - -static std::string -ToLower(const std::string_view& str) { - std::string result(str); - std::transform( - result.begin(), result.end(), result.begin(), [](unsigned char c) { - return std::tolower(c); - }); - return result; -} - using document = simdjson::ondemand::document; template using value_result = simdjson::simdjson_result; @@ -230,8 +191,7 @@ class Json { // it's always safe to add the padding, // as we have allocated the memory with this padding - auto doc = parser.parse(data_.data() + offset, - length + simdjson::SIMDJSON_PADDING); + auto doc = parser.parse(data_.data() + offset, length); AssertInfo(doc.error() == simdjson::SUCCESS, "failed to parse the json {}: {}", std::string(data_.data() + offset, length), @@ -288,47 +248,6 @@ class Json { return doc(offset, length).get(); } - template - std::pair - at_pos(uint16_t offset, uint16_t length) const { - const char* pos = data_.data() + offset; - std::string_view str(pos, length); - if constexpr (std::is_same_v) { - if (milvus::ToLower(str) == "true") { - return {true, ""}; - } else if (milvus::ToLower(str) == "false") { - return {false, ""}; - } else { - return {false, "invalid boolean value"}; - } - } else if constexpr (std::is_same_v) { - try { - size_t parsed_chars; - int64_t int_value = std::stoll(pos, &parsed_chars, 10); - if (parsed_chars == length) { - return {int_value, ""}; - } - return {0, "string contains non-integer characters"}; - } catch (...) { - return {0, "invalid integer string"}; - } - } else if constexpr (std::is_same_v) { - try { - size_t parsed_chars; - double double_value = std::stod(pos, &parsed_chars); - if (parsed_chars == length) { - return {double_value, ""}; - } - return {0, "string contains non-integer characters"}; - } catch (...) { - return {0, "invalid double string"}; - } - } else { - static_assert(std::is_same_v); - return {str, ""}; - } - } - value_result array_at(uint16_t offset, uint16_t length) const { return dom_doc(offset, length).get_array(); diff --git a/internal/core/src/common/jsmn.h b/internal/core/src/common/jsmn.h index f20b56ba48a68..3843d6efe2d86 100644 --- a/internal/core/src/common/jsmn.h +++ b/internal/core/src/common/jsmn.h @@ -1,3 +1,14 @@ +// Copyright (C) 2019-2020 Zilliz. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software distributed under the License +// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +// or implied. 
See the License for the specific language governing permissions and limitations under the License + /* * MIT License * diff --git a/internal/core/src/exec/expression/BinaryRangeExpr.cpp b/internal/core/src/exec/expression/BinaryRangeExpr.cpp index 5c3ff818977f5..8b4a01d369393 100644 --- a/internal/core/src/exec/expression/BinaryRangeExpr.cpp +++ b/internal/core/src/exec/expression/BinaryRangeExpr.cpp @@ -261,17 +261,17 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForData(OffsetVector* input) { TargetBitmapView valid_res(res_vec->GetValidRawData(), real_batch_size); valid_res.set(); - auto execute_sub_batch = [lower_inclusive, - upper_inclusive]( - const T* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - HighPrecisionType val1, - HighPrecisionType val2) { + auto execute_sub_batch = + [ lower_inclusive, + upper_inclusive ]( + const T* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + HighPrecisionType val1, + HighPrecisionType val2) { if (lower_inclusive && upper_inclusive) { BinaryRangeElementFunc func; func(val1, val2, data, size, res, offsets); @@ -366,18 +366,17 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJson(OffsetVector* input) { ValueType val2 = GetValueFromProto(expr_->upper_val_); auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); - auto execute_sub_batch = [lower_inclusive, - upper_inclusive, - pointer]( - const milvus::Json* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - ValueType val1, - ValueType val2) { + auto execute_sub_batch = + [ lower_inclusive, upper_inclusive, + pointer ]( + const milvus::Json* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + ValueType val1, + ValueType val2) { if (lower_inclusive && upper_inclusive) { BinaryRangeElementFuncForJson func; @@ -456,11 +455,23 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJsonForIndex() { using GetType = std::conditional_t, std::string_view, ValueType>; - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + Assert(segment_->type() == SegmentType::Sealed); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? 
active_count_ - current_data_chunk_pos_ : batch_size_; auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); +#define BinaryRangeJSONIndexCompare(cmp) \ + do { \ + auto val = json.at(offset, size); \ + if (val.error()) { \ + if constexpr (std::is_same_v) { \ + auto val = json.at(offset, size); \ + return !val.error() && (cmp); \ + } \ + return false; \ + } \ + return (cmp); \ + } while (false) bool lower_inclusive = expr_->lower_inclusive_; bool upper_inclusive = expr_->upper_inclusive_; ValueType val1 = GetValueFromProto(expr_->lower_val_); @@ -471,39 +482,36 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForJsonForIndex() { auto field_id = expr_->column_.field_id_; auto* index = sealed_seg->GetJsonKeyIndex(field_id); Assert(index != nullptr); - auto filter_func = - [sealed_seg, - &field_id, - val1, - val2, - lower_inclusive, - upper_inclusive](uint32_t row_id, uint16_t offset, uint16_t size) { - auto json_pair = sealed_seg->GetJsonData(field_id, row_id); - if (!json_pair.second) { - return false; - } - auto json = milvus::Json(json_pair.first.data(), - json_pair.first.size()); - auto val = json.at(offset, size); - if (val.error()) { - return false; - } - if (lower_inclusive && upper_inclusive) { - return val1 <= ValueType(val.value()) && - ValueType(val.value()) <= val2; - } else if (lower_inclusive && !upper_inclusive) { - return val1 <= ValueType(val.value()) && - ValueType(val.value()) < val2; - } else if (!lower_inclusive && upper_inclusive) { - return val1 < ValueType(val.value()) && - ValueType(val.value()) <= val2; - } else { - return val1 < ValueType(val.value()) && - ValueType(val.value()) < val2; - } - }; + auto filter_func = [sealed_seg, + &field_id, + val1, + val2, + lower_inclusive, + upper_inclusive](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto json_pair = sealed_seg->GetJsonData(field_id, row_id); + if (!json_pair.second) { + return false; + } + auto json = + milvus::Json(json_pair.first.data(), json_pair.first.size()); + if (lower_inclusive && upper_inclusive) { + BinaryRangeJSONIndexCompare(val1 <= ValueType(val.value()) && + ValueType(val.value()) <= val2); + } else if (lower_inclusive && !upper_inclusive) { + BinaryRangeJSONIndexCompare(val1 <= ValueType(val.value()) && + ValueType(val.value()) < val2); + } else if (!lower_inclusive && upper_inclusive) { + BinaryRangeJSONIndexCompare(val1 < ValueType(val.value()) && + ValueType(val.value()) <= val2); + } else { + BinaryRangeJSONIndexCompare(val1 < ValueType(val.value()) && + ValueType(val.value()) < val2); + } + }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; @@ -540,18 +548,18 @@ PhyBinaryRangeFilterExpr::ExecRangeVisitorImplForArray(OffsetVector* input) { index = std::stoi(expr_->column_.nested_path_[0]); } - auto execute_sub_batch = [lower_inclusive, - upper_inclusive]( - const milvus::ArrayView* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - ValueType val1, - ValueType val2, - int index) { + auto execute_sub_batch = + [ lower_inclusive, + upper_inclusive ]( + const milvus::ArrayView* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + ValueType val1, + ValueType val2, + int index) { if (lower_inclusive && upper_inclusive) { BinaryRangeElementFuncForArray 
func; diff --git a/internal/core/src/exec/expression/ExistsExpr.cpp b/internal/core/src/exec/expression/ExistsExpr.cpp index cc64dcb57c43b..d15339815b08d 100644 --- a/internal/core/src/exec/expression/ExistsExpr.cpp +++ b/internal/core/src/exec/expression/ExistsExpr.cpp @@ -67,18 +67,18 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(OffsetVector* input) { TargetBitmapView res, TargetBitmapView valid_res, const std::string& pointer) { - for (int i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = data[offset].exist(pointer); + for (int i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; } - }; + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = data[offset].exist(pointer); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -102,7 +102,7 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegment(OffsetVector* input) { VectorPtr PhyExistsFilterExpr::EvalJsonExistsForDataSegmentForIndex() { - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + Assert(segment_->type() == SegmentType::Sealed); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? active_count_ - current_data_chunk_pos_ : batch_size_; @@ -125,7 +125,7 @@ PhyExistsFilterExpr::EvalJsonExistsForDataSegmentForIndex() { return json.exist(pointer); }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; diff --git a/internal/core/src/exec/expression/JsonContainsExpr.cpp b/internal/core/src/exec/expression/JsonContainsExpr.cpp index 46cd3fc220a91..321a4259fcee9 100644 --- a/internal/core/src/exec/expression/JsonContainsExpr.cpp +++ b/internal/core/src/exec/expression/JsonContainsExpr.cpp @@ -196,28 +196,27 @@ PhyJsonContainsFilterExpr::ExecArrayContains(OffsetVector* input) { TargetBitmapView res, TargetBitmapView valid_res, const std::unordered_set& elements) { - auto executor = [&](size_t i) { - const auto& array = data[i]; - for (int j = 0; j < array.length(); ++j) { - if (elements.count(array.template get_data(j)) > - 0) { - return true; - } - } - return false; - }; - for (int i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; + auto executor = [&](size_t i) { + const auto& array = data[i]; + for (int j = 0; j < array.length(); ++j) { + if (elements.count(array.template get_data(j)) > 0) { + return true; } - res[i] = executor(offset); } + return false; }; + for (int i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? 
offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -280,35 +279,35 @@ PhyJsonContainsFilterExpr::ExecJsonContains(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::unordered_set& elements) { - auto executor = [&](size_t i) { - auto doc = data[i].doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { - return false; - } - for (auto&& it : array) { - auto val = it.template get(); - if (val.error()) { - continue; - } - if (elements.count(val.value()) > 0) { - return true; - } - } + auto executor = [&](size_t i) { + auto doc = data[i].doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { return false; - }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; + } + for (auto&& it : array) { + auto val = it.template get(); + if (val.error()) { continue; } - res[i] = executor(offset); + if (elements.count(val.value()) > 0) { + return true; + } } + return false; }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -342,7 +341,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsByKeyIndex() { std::conditional_t, std::string_view, ExprValueType>; - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + Assert(segment_->type() == SegmentType::Sealed); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? active_count_ - current_data_chunk_pos_ : batch_size_; @@ -389,7 +388,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsByKeyIndex() { return false; }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; @@ -433,44 +432,44 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements) { - auto executor = [&](size_t i) -> bool { - auto doc = data[i].doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { - return false; - } - for (auto&& it : array) { - auto val = it.get_array(); - if (val.error()) { - continue; - } - std::vector< - simdjson::simdjson_result> - json_array; - json_array.reserve(val.count_elements()); - for (auto&& e : val) { - json_array.emplace_back(e); - } - for (auto const& element : elements) { - if (CompareTwoJsonArray(json_array, element)) { - return true; - } - } - } + auto executor = [&](size_t i) -> bool { + auto doc = data[i].doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { return false; - }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? 
offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; + } + for (auto&& it : array) { + auto val = it.get_array(); + if (val.error()) { continue; } - res[i] = executor(offset); + std::vector< + simdjson::simdjson_result> + json_array; + json_array.reserve(val.count_elements()); + for (auto&& e : val) { + json_array.emplace_back(e); + } + for (auto const& element : elements) { + if (CompareTwoJsonArray(json_array, element)) { + return true; + } + } } + return false; }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -499,7 +498,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArray(OffsetVector* input) { VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsArrayByKeyIndex() { - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + Assert(segment_->type() == SegmentType::Sealed); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? active_count_ - current_data_chunk_pos_ : batch_size_; @@ -547,7 +546,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsArrayByKeyIndex() { return false; }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; @@ -593,29 +592,29 @@ PhyJsonContainsFilterExpr::ExecArrayContainsAll(OffsetVector* input) { TargetBitmapView res, TargetBitmapView valid_res, const std::unordered_set& elements) { - auto executor = [&](size_t i) { - std::unordered_set tmp_elements(elements); - // Note: array can only be iterated once - for (int j = 0; j < data[i].length(); ++j) { - tmp_elements.erase(data[i].template get_data(j)); - if (tmp_elements.size() == 0) { - return true; - } - } - return tmp_elements.size() == 0; - }; - for (int i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; + auto executor = [&](size_t i) { + std::unordered_set tmp_elements(elements); + // Note: array can only be iterated once + for (int j = 0; j < data[i].length(); ++j) { + tmp_elements.erase(data[i].template get_data(j)); + if (tmp_elements.size() == 0) { + return true; } - res[i] = executor(offset); } + return tmp_elements.size() == 0; }; + for (int i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? 
offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -678,38 +677,38 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAll(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::unordered_set& elements) { - auto executor = [&](const size_t i) -> bool { - auto doc = data[i].doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { - return false; - } - std::unordered_set tmp_elements(elements); - // Note: array can only be iterated once - for (auto&& it : array) { - auto val = it.template get(); - if (val.error()) { - continue; - } - tmp_elements.erase(val.value()); - if (tmp_elements.size() == 0) { - return true; - } - } - return tmp_elements.size() == 0; - }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; + auto executor = [&](const size_t i) -> bool { + auto doc = data[i].doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; + } + std::unordered_set tmp_elements(elements); + // Note: array can only be iterated once + for (auto&& it : array) { + auto val = it.template get(); + if (val.error()) { continue; } - res[i] = executor(offset); + tmp_elements.erase(val.value()); + if (tmp_elements.size() == 0) { + return true; + } } + return tmp_elements.size() == 0; }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -743,7 +742,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllByKeyIndex() { std::conditional_t, std::string_view, ExprValueType>; - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + Assert(segment_->type() == SegmentType::Sealed); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? 
active_count_ - current_data_chunk_pos_ : batch_size_; @@ -791,7 +790,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllByKeyIndex() { return tmp_elements.empty(); }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; @@ -841,98 +840,96 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType( const std::string& pointer, const std::vector& elements, const std::unordered_set elements_index) { - auto executor = [&](size_t i) -> bool { - const auto& json = data[i]; - auto doc = json.doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { - return false; - } - std::unordered_set tmp_elements_index(elements_index); - for (auto&& it : array) { - int i = -1; - for (auto& element : elements) { - i++; - switch (element.val_case()) { - case proto::plan::GenericValue::kBoolVal: { - auto val = it.template get(); - if (val.error()) { - continue; - } - if (val.value() == element.bool_val()) { - tmp_elements_index.erase(i); - } - break; + auto executor = [&](size_t i) -> bool { + const auto& json = data[i]; + auto doc = json.doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; + } + std::unordered_set tmp_elements_index(elements_index); + for (auto&& it : array) { + int i = -1; + for (auto& element : elements) { + i++; + switch (element.val_case()) { + case proto::plan::GenericValue::kBoolVal: { + auto val = it.template get(); + if (val.error()) { + continue; } - case proto::plan::GenericValue::kInt64Val: { - auto val = it.template get(); - if (val.error()) { - continue; - } - if (val.value() == element.int64_val()) { - tmp_elements_index.erase(i); - } - break; + if (val.value() == element.bool_val()) { + tmp_elements_index.erase(i); } - case proto::plan::GenericValue::kFloatVal: { - auto val = it.template get(); - if (val.error()) { - continue; - } - if (val.value() == element.float_val()) { - tmp_elements_index.erase(i); - } - break; + break; + } + case proto::plan::GenericValue::kInt64Val: { + auto val = it.template get(); + if (val.error()) { + continue; } - case proto::plan::GenericValue::kStringVal: { - auto val = it.template get(); - if (val.error()) { - continue; - } - if (val.value() == element.string_val()) { - tmp_elements_index.erase(i); - } - break; + if (val.value() == element.int64_val()) { + tmp_elements_index.erase(i); + } + break; + } + case proto::plan::GenericValue::kFloatVal: { + auto val = it.template get(); + if (val.error()) { + continue; } - case proto::plan::GenericValue::kArrayVal: { - auto val = it.get_array(); - if (val.error()) { - continue; - } - if (CompareTwoJsonArray(val, - element.array_val())) { - tmp_elements_index.erase(i); - } - break; + if (val.value() == element.float_val()) { + tmp_elements_index.erase(i); } - default: - PanicInfo( - DataTypeInvalid, - fmt::format("unsupported data type {}", - element.val_case())); + break; } - if (tmp_elements_index.size() == 0) { - return true; + case proto::plan::GenericValue::kStringVal: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.string_val()) { + tmp_elements_index.erase(i); + } + break; + } + case proto::plan::GenericValue::kArrayVal: { + auto val = it.get_array(); + if (val.error()) { + continue; + } + if (CompareTwoJsonArray(val, element.array_val())) { + tmp_elements_index.erase(i); + } + break; } + default: + 
PanicInfo(DataTypeInvalid, + fmt::format("unsupported data type {}", + element.val_case())); } if (tmp_elements_index.size() == 0) { return true; } } - return tmp_elements_index.size() == 0; - }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; + if (tmp_elements_index.size() == 0) { + return true; } - res[i] = executor(offset); } + return tmp_elements_index.size() == 0; }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -963,7 +960,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffType( VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffTypeByKeyIndex() { - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + Assert(segment_->type() == SegmentType::Sealed); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? active_count_ - current_data_chunk_pos_ : batch_size_; @@ -1074,7 +1071,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllWithDiffTypeByKeyIndex() { return tmp_elements_index.size() == 0; }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; @@ -1119,48 +1116,48 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements) { - auto executor = [&](const size_t i) { - auto doc = data[i].doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { - return false; + auto executor = [&](const size_t i) { + auto doc = data[i].doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; + } + std::unordered_set exist_elements_index; + for (auto&& it : array) { + auto val = it.get_array(); + if (val.error()) { + continue; } - std::unordered_set exist_elements_index; - for (auto&& it : array) { - auto val = it.get_array(); - if (val.error()) { - continue; - } - std::vector< - simdjson::simdjson_result> - json_array; - json_array.reserve(val.count_elements()); - for (auto&& e : val) { - json_array.emplace_back(e); - } - for (int index = 0; index < elements.size(); ++index) { - if (CompareTwoJsonArray(json_array, elements[index])) { - exist_elements_index.insert(index); - } - } - if (exist_elements_index.size() == elements.size()) { - return true; - } + std::vector< + simdjson::simdjson_result> + json_array; + json_array.reserve(val.count_elements()); + for (auto&& e : val) { + json_array.emplace_back(e); } - return exist_elements_index.size() == elements.size(); - }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? 
offsets[i] : i; + for (int index = 0; index < elements.size(); ++index) { + if (CompareTwoJsonArray(json_array, elements[index])) { + exist_elements_index.insert(index); + } } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; + if (exist_elements_index.size() == elements.size()) { + return true; } - res[i] = executor(offset); } + return exist_elements_index.size() == elements.size(); }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -1189,7 +1186,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArray(OffsetVector* input) { VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsAllArrayByKeyIndex() { - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + Assert(segment_->type() == SegmentType::Sealed); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? active_count_ - current_data_chunk_pos_ : batch_size_; @@ -1241,7 +1238,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsAllArrayByKeyIndex() { return exist_elements_index.size() == elements.size(); }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; @@ -1290,90 +1287,88 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(OffsetVector* input) { TargetBitmapView valid_res, const std::string& pointer, const std::vector& elements) { - auto executor = [&](const size_t i) { - auto& json = data[i]; - auto doc = json.doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) { - return false; - } - // Note: array can only be iterated once - for (auto&& it : array) { - for (auto const& element : elements) { - switch (element.val_case()) { - case proto::plan::GenericValue::kBoolVal: { - auto val = it.template get(); - if (val.error()) { - continue; - } - if (val.value() == element.bool_val()) { - return true; - } - break; + auto executor = [&](const size_t i) { + auto& json = data[i]; + auto doc = json.doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) { + return false; + } + // Note: array can only be iterated once + for (auto&& it : array) { + for (auto const& element : elements) { + switch (element.val_case()) { + case proto::plan::GenericValue::kBoolVal: { + auto val = it.template get(); + if (val.error()) { + continue; } - case proto::plan::GenericValue::kInt64Val: { - auto val = it.template get(); - if (val.error()) { - continue; - } - if (val.value() == element.int64_val()) { - return true; - } - break; + if (val.value() == element.bool_val()) { + return true; + } + break; + } + case proto::plan::GenericValue::kInt64Val: { + auto val = it.template get(); + if (val.error()) { + continue; } - case proto::plan::GenericValue::kFloatVal: { - auto val = it.template get(); - if (val.error()) { - continue; - } - if (val.value() == element.float_val()) { - return true; - } - break; + if (val.value() == element.int64_val()) { + return true; } - case proto::plan::GenericValue::kStringVal: { - auto val = it.template get(); - if (val.error()) { - continue; - } - if (val.value() == element.string_val()) { - return true; - } - break; + break; + } + 
case proto::plan::GenericValue::kFloatVal: { + auto val = it.template get(); + if (val.error()) { + continue; } - case proto::plan::GenericValue::kArrayVal: { - auto val = it.get_array(); - if (val.error()) { - continue; - } - if (CompareTwoJsonArray(val, - element.array_val())) { - return true; - } - break; + if (val.value() == element.float_val()) { + return true; } - default: - PanicInfo( - DataTypeInvalid, - fmt::format("unsupported data type {}", - element.val_case())); + break; } + case proto::plan::GenericValue::kStringVal: { + auto val = it.template get(); + if (val.error()) { + continue; + } + if (val.value() == element.string_val()) { + return true; + } + break; + } + case proto::plan::GenericValue::kArrayVal: { + auto val = it.get_array(); + if (val.error()) { + continue; + } + if (CompareTwoJsonArray(val, element.array_val())) { + return true; + } + break; + } + default: + PanicInfo(DataTypeInvalid, + fmt::format("unsupported data type {}", + element.val_case())); } } - return false; - }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = executor(offset); } + return false; }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -1402,7 +1397,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffType(OffsetVector* input) { VectorPtr PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffTypeByKeyIndex() { - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + Assert(segment_->type() == SegmentType::Sealed); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? active_count_ - current_data_chunk_pos_ : batch_size_; @@ -1497,7 +1492,7 @@ PhyJsonContainsFilterExpr::ExecJsonContainsWithDiffTypeByKeyIndex() { return false; }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; diff --git a/internal/core/src/exec/expression/TermExpr.cpp b/internal/core/src/exec/expression/TermExpr.cpp index 4682442bef02a..e5419114e8626 100644 --- a/internal/core/src/exec/expression/TermExpr.cpp +++ b/internal/core/src/exec/expression/TermExpr.cpp @@ -288,27 +288,27 @@ PhyTermFilterExpr::ExecTermArrayVariableInField(OffsetVector* input) { TargetBitmapView res, TargetBitmapView valid_res, const ValueType& target_val) { - auto executor = [&](size_t offset) { - for (int i = 0; i < data[offset].length(); i++) { - auto val = data[offset].template get_data(i); - if (val == target_val) { - return true; - } - } - return false; - }; - for (int i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? 
offsets[i] : i; + auto executor = [&](size_t offset) { + for (int i = 0; i < data[offset].length(); i++) { + auto val = data[offset].template get_data(i); + if (val == target_val) { + return true; } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = executor(offset); } + return false; }; + for (int i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -375,23 +375,23 @@ PhyTermFilterExpr::ExecTermArrayFieldInVariable(OffsetVector* input) { TargetBitmapView valid_res, int index, const std::unordered_set& term_set) { - for (int i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - if (term_set.empty() || index >= data[offset].length()) { - res[i] = false; - continue; - } - auto value = data[offset].get_data(index); - res[i] = term_set.find(ValueType(value)) != term_set.end(); + for (int i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; } - }; + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + if (term_set.empty() || index >= data[offset].length()) { + res[i] = false; + continue; + } + auto value = data[offset].get_data(index); + res[i] = term_set.find(ValueType(value)) != term_set.end(); + } + }; int64_t processed_size; if (has_offset_input_) { @@ -452,34 +452,34 @@ PhyTermFilterExpr::ExecTermJsonVariableInField(OffsetVector* input) { TargetBitmapView valid_res, const std::string pointer, const ValueType& target_val) { - auto executor = [&](size_t i) { - auto doc = data[i].doc(); - auto array = doc.at_pointer(pointer).get_array(); - if (array.error()) - return false; - for (auto it = array.begin(); it != array.end(); ++it) { - auto val = (*it).template get(); - if (val.error()) { - return false; - } - if (val.value() == target_val) { - return true; - } - } + auto executor = [&](size_t i) { + auto doc = data[i].doc(); + auto array = doc.at_pointer(pointer).get_array(); + if (array.error()) return false; - }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; + for (auto it = array.begin(); it != array.end(); ++it) { + auto val = (*it).template get(); + if (val.error()) { + return false; } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; + if (val.value() == target_val) { + return true; } - res[i] = executor(offset); } + return false; }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? 
offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, @@ -516,7 +516,7 @@ PhyTermFilterExpr::ExecJsonInVariableByKeyIndex() { using GetType = std::conditional_t, std::string_view, ValueType>; - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); + Assert(segment_->type() == SegmentType::Sealed); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? active_count_ - current_data_chunk_pos_ : batch_size_; @@ -544,7 +544,6 @@ PhyTermFilterExpr::ExecJsonInVariableByKeyIndex() { auto filter_func = [sealed_seg, &term_set, &field_id](uint32_t row_id, uint16_t offset, uint16_t size) { - //std::cout << row_id << " " << offset << " " << size << std::endl; auto json_pair = sealed_seg->GetJsonData(field_id, row_id); if (!json_pair.second) { return false; @@ -553,13 +552,12 @@ PhyTermFilterExpr::ExecJsonInVariableByKeyIndex() { milvus::Json(json_pair.first.data(), json_pair.first.size()); auto val = json.at(offset, size); if (val.error()) { - //std::cout << val.error() << std::endl; return false; } return term_set.find(ValueType(val.value())) != term_set.end(); }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } @@ -616,40 +614,40 @@ PhyTermFilterExpr::ExecTermJsonFieldInVariable(OffsetVector* input) { TargetBitmapView valid_res, const std::string pointer, const std::unordered_set& terms) { - auto executor = [&](size_t i) { - auto x = data[i].template at(pointer); - if (x.error()) { - if constexpr (std::is_same_v) { - auto x = data[i].template at(pointer); - if (x.error()) { - return false; - } - - auto value = x.value(); - // if the term set is {1}, and the value is 1.1, we should not return true. - return std::floor(value) == value && - terms.find(ValueType(value)) != terms.end(); + auto executor = [&](size_t i) { + auto x = data[i].template at(pointer); + if (x.error()) { + if constexpr (std::is_same_v) { + auto x = data[i].template at(pointer); + if (x.error()) { + return false; } - return false; - } - return terms.find(ValueType(x.value())) != terms.end(); - }; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - if (terms.empty()) { - res[i] = false; - continue; + + auto value = x.value(); + // if the term set is {1}, and the value is 1.1, we should not return true. + return std::floor(value) == value && + terms.find(ValueType(value)) != terms.end(); } - res[i] = executor(offset); + return false; } + return terms.find(ValueType(x.value())) != terms.end(); }; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? 
offsets[i] : i; + } + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + if (terms.empty()) { + res[i] = false; + continue; + } + res[i] = executor(offset); + } + }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, @@ -776,19 +774,19 @@ PhyTermFilterExpr::ExecVisitorImplForData(OffsetVector* input) { TargetBitmapView res, TargetBitmapView valid_res, const std::unordered_set& vals) { - TermElementFuncSet func; - for (size_t i = 0; i < size; ++i) { - auto offset = i; - if constexpr (filter_type == FilterType::random) { - offset = (offsets) ? offsets[i] : i; - } - if (valid_data != nullptr && !valid_data[offset]) { - res[i] = valid_res[i] = false; - continue; - } - res[i] = func(vals, data[offset]); + TermElementFuncSet func; + for (size_t i = 0; i < size; ++i) { + auto offset = i; + if constexpr (filter_type == FilterType::random) { + offset = (offsets) ? offsets[i] : i; } - }; + if (valid_data != nullptr && !valid_data[offset]) { + res[i] = valid_res[i] = false; + continue; + } + res[i] = func(vals, data[offset]); + } + }; int64_t processed_size; if (has_offset_input_) { processed_size = ProcessDataByOffsets(execute_sub_batch, diff --git a/internal/core/src/exec/expression/UnaryExpr.cpp b/internal/core/src/exec/expression/UnaryExpr.cpp index 88e2cac622b41..e7e175c642da1 100644 --- a/internal/core/src/exec/expression/UnaryExpr.cpp +++ b/internal/core/src/exec/expression/UnaryExpr.cpp @@ -276,145 +276,144 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplArray(OffsetVector* input) { if (expr_->column_.nested_path_.size() > 0) { index = std::stoi(expr_->column_.nested_path_[0]); } - auto execute_sub_batch = - [op_type]( - const milvus::ArrayView* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - ValueType val, - int index) { - switch (op_type) { - case proto::plan::GreaterThan: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::GreaterEqual: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::LessThan: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::LessEqual: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::Equal: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::NotEqual: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::PrefixMatch: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - case proto::plan::Match: { - UnaryElementFuncForArray - func; - func(data, - valid_data, - size, - val, - index, - res, - valid_res, - offsets); - break; - } - default: - PanicInfo( - OpTypeInvalid, - fmt::format( - "unsupported operator type for unary expr: {}", - op_type)); + auto execute_sub_batch = [op_type]( + const milvus::ArrayView* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, 
+ TargetBitmapView valid_res, + ValueType val, + int index) { + switch (op_type) { + case proto::plan::GreaterThan: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; } - }; + case proto::plan::GreaterEqual: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::LessThan: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::LessEqual: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::Equal: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::NotEqual: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::PrefixMatch: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + case proto::plan::Match: { + UnaryElementFuncForArray + func; + func(data, + valid_data, + size, + val, + index, + res, + valid_res, + offsets); + break; + } + default: + PanicInfo( + OpTypeInvalid, + fmt::format("unsupported operator type for unary expr: {}", + op_type)); + } + }; int64_t processed_size; if (has_offset_input_) { processed_size = @@ -480,7 +479,7 @@ PhyUnaryRangeFilterExpr::ExecArrayEqualForIndex(bool reverse) { }; } else { auto size_per_chunk = segment_->size_per_chunk(); - retrieve = [size_per_chunk, this](int64_t offset) -> auto { + retrieve = [ size_per_chunk, this ](int64_t offset) -> auto { auto chunk_idx = offset / size_per_chunk; auto chunk_offset = offset % size_per_chunk; const auto& chunk = @@ -605,15 +604,15 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJson(OffsetVector* input) { res[i] = (cmp); \ } while (false) - auto execute_sub_batch = [op_type, pointer]( - const milvus::Json* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - ExprValueType val) { + auto execute_sub_batch = + [ op_type, pointer ]( + const milvus::Json* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, + ExprValueType val) { switch (op_type) { case proto::plan::GreaterThan: { for (size_t i = 0; i < size; ++i) { @@ -807,11 +806,36 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonForIndex() { std::conditional_t, std::string_view, ExprValueType>; - Assert(segment_->type() == SegmentType::Sealed && num_data_chunk_ == 1); auto real_batch_size = current_data_chunk_pos_ + batch_size_ > active_count_ ? 
active_count_ - current_data_chunk_pos_ : batch_size_; auto pointer = milvus::Json::pointer(expr_->column_.nested_path_); + +#define UnaryRangeJSONIndexCompare(cmp) \ + do { \ + auto x = json.at(offset, size); \ + if (x.error()) { \ + if constexpr (std::is_same_v) { \ + auto x = json.at(offset, size); \ + return !x.error() && (cmp); \ + } \ + return false; \ + } \ + return (cmp); \ + } while (false) + +#define UnaryRangeJSONIndexCompareNotEqual(cmp) \ + do { \ + auto x = json.at(offset, size); \ + if (x.error()) { \ + if constexpr (std::is_same_v) { \ + auto x = json.at(offset, size); \ + return x.error() || (cmp); \ + } \ + return true; \ + } \ + return (cmp); \ + } while (false) ExprValueType val = GetValueFromProto(expr_->val_); auto op_type = expr_->op_type_; if (cached_index_chunk_id_ != 0) { @@ -834,42 +858,31 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonForIndex() { if constexpr (std::is_same_v) { return false; } else { - auto x = json.at(offset, size); - if (x.error()) { - return false; - } - return ExprValueType(x.value()) > val; + UnaryRangeJSONIndexCompare(ExprValueType(x.value()) > + val); } case proto::plan::GreaterEqual: if constexpr (std::is_same_v) { return false; } else { - auto x = json.at(offset, size); - if (x.error()) { - return false; - } - return ExprValueType(x.value()) >= val; + UnaryRangeJSONIndexCompare(ExprValueType(x.value()) >= + val); } case proto::plan::LessThan: if constexpr (std::is_same_v) { return false; } else { - auto x = json.at(offset, size); - if (x.error()) { - return false; - } - return ExprValueType(x.value()) < val; + UnaryRangeJSONIndexCompare(ExprValueType(x.value()) < + val); } case proto::plan::LessEqual: if constexpr (std::is_same_v) { return false; } else { - auto x = json.at(offset, size); - if (x.error()) { - return false; - } - return ExprValueType(x.value()) <= val; + UnaryRangeJSONIndexCompare(ExprValueType(x.value()) <= + val); } + case proto::plan::Equal: if constexpr (std::is_same_v) { auto array = json.array_at(offset, size); @@ -878,11 +891,8 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonForIndex() { } return CompareTwoJsonArray(array.value(), val); } else { - auto x = json.at(offset, size); - if (x.error()) { - return false; - } - return ExprValueType(x.value()) == val; + UnaryRangeJSONIndexCompare(ExprValueType(x.value()) == + val); } case proto::plan::NotEqual: if constexpr (std::is_same_v) { @@ -892,11 +902,8 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonForIndex() { } return !CompareTwoJsonArray(array.value(), val); } else { - auto x = json.at(offset, size); - if (x.error()) { - return false; - } - return ExprValueType(x.value()) != val; + UnaryRangeJSONIndexCompareNotEqual( + ExprValueType(x.value()) != val); } case proto::plan::PrefixMatch: if constexpr (std::is_same_v) { @@ -927,7 +934,7 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplJsonForIndex() { } }; cached_index_chunk_res_ = - index->FilterByPath(pointer, real_batch_size, filter_func).clone(); + index->FilterByPath(pointer, active_count_, filter_func).clone(); cached_index_chunk_id_ = 0; } TargetBitmap result; @@ -1123,13 +1130,13 @@ PhyUnaryRangeFilterExpr::ExecRangeVisitorImplForData(OffsetVector* input) { auto execute_sub_batch = [expr_type]( - const T* data, - const bool* valid_data, - const int32_t* offsets, - const int size, - TargetBitmapView res, - TargetBitmapView valid_res, - IndexInnerType val) { + const T* data, + const bool* valid_data, + const int32_t* offsets, + const int size, + TargetBitmapView res, + TargetBitmapView valid_res, 
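
// Illustrative sketch (not from the patch): the two macros above,
// specialized to the Equal/NotEqual pair and written out as a function
// over the patch's Json::at<T>(offset, size) accessor. compare_at is
// hypothetical; the int64 -> double retry covers values stored as "1.0".
#include <cstdint>
#include <type_traits>

template <typename GetType>
bool
compare_at(const milvus::Json& json,  // assumes the patch's milvus::Json
           uint16_t offset,
           uint16_t size,
           GetType val,
           bool not_equal) {
    auto x = json.at<GetType>(offset, size);
    if (x.error()) {
        if constexpr (std::is_same_v<GetType, int64_t>) {
            auto d = json.at<double>(offset, size);  // retry as double
            if (!d.error()) {
                return not_equal ? double(d.value()) != double(val)
                                 : double(d.value()) == double(val);
            }
        }
        return not_equal;  // parse failure: Equal -> false, NotEqual -> true
    }
    return not_equal ? GetType(x.value()) != val
                     : GetType(x.value()) == val;
}
// Note the related sizing fix above: FilterByPath is now called with
// active_count_, so the cached bitmap covers the whole segment rather
// than a single batch.
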
+                                  IndexInnerType val) {
        switch (expr_type) {
            case proto::plan::GreaterThan: {
                UnaryElementFunc<T, proto::plan::GreaterThan, filter_type>
                    func;
diff --git a/internal/core/src/exec/expression/UnaryExpr.h b/internal/core/src/exec/expression/UnaryExpr.h
index f4c368ab58f50..d69e8c0270c95 100644
--- a/internal/core/src/exec/expression/UnaryExpr.h
+++ b/internal/core/src/exec/expression/UnaryExpr.h
@@ -356,7 +356,7 @@ class PhyUnaryRangeFilterExpr : public SegmentExpr {
     template <typename ExprValueType>
     VectorPtr
     ExecRangeVisitorImplJsonForIndex();
-    
+
     template <typename ValueType>
     VectorPtr
     ExecRangeVisitorImplArray(OffsetVector* input = nullptr);
diff --git a/internal/core/src/index/JsonKeyInvertedIndex.cpp b/internal/core/src/index/JsonKeyInvertedIndex.cpp
index e76644c2d5349..f549bf271624e 100644
--- a/internal/core/src/index/JsonKeyInvertedIndex.cpp
+++ b/internal/core/src/index/JsonKeyInvertedIndex.cpp
@@ -25,9 +25,7 @@ JsonKeyInvertedIndex::AddInvertedRecord(const std::vector<std::string>& paths,
                                         uint32_t row_id,
                                         uint16_t offset,
                                         uint16_t length) {
-    auto key = std::string("/") + Join(paths, ".");
-    std::cout << "xxx insert inverted key" << key << "rowid" << row_id
-              << "offset" << offset << "length" << length << std::endl;
+    auto key = std::string("/") + Join(paths, "/");
     LOG_DEBUG(
         "insert inverted key: {}, row_id: {}, offset: "
         "{}, length:{}",
@@ -129,6 +127,7 @@ JsonKeyInvertedIndex::AddJson(const char* json, int64_t offset) {
     int index = 0;
     std::vector<std::string> paths;
     TravelJson(json, tokens, index, paths, offset);
+    free(tokens);
 }
 
 JsonKeyInvertedIndex::JsonKeyInvertedIndex(
@@ -223,6 +222,9 @@ JsonKeyInvertedIndex::BuildWithFieldData(
         for (int i = 0; i < n; i++) {
             if (!data->is_valid(i)) {
                 null_offset.push_back(i);
+                std::string empty = "";
+                wrapper_->add_multi_data(&empty, 0, offset++);
+                continue;
             }
             AddJson(static_cast<const milvus::Json*>(data->RawValue(i))
                         ->data()
diff --git a/internal/core/src/index/JsonKeyInvertedIndex.h b/internal/core/src/index/JsonKeyInvertedIndex.h
index b686f63897f2e..220374ccb504f 100644
--- a/internal/core/src/index/JsonKeyInvertedIndex.h
+++ b/internal/core/src/index/JsonKeyInvertedIndex.h
@@ -25,7 +25,7 @@ class JsonKeyInvertedIndex : public InvertedIndexTantivy<std::string> {
     explicit JsonKeyInvertedIndex(const storage::FileManagerContext& ctx,
                                   bool is_load);
 
-    ~JsonKeyInvertedIndex() override {};
+    ~JsonKeyInvertedIndex() override{};
 
 public:
     BinarySet
diff --git a/internal/core/src/indexbuilder/index_c.cpp b/internal/core/src/indexbuilder/index_c.cpp
index 8d3b74cda2943..0daaab7cf76a9 100644
--- a/internal/core/src/indexbuilder/index_c.cpp
+++ b/internal/core/src/indexbuilder/index_c.cpp
@@ -282,8 +282,8 @@ BuildJsonKeyIndex(CBinarySet* c_binary_set,
         auto field_schema =
             FieldMeta::ParseFrom(build_index_info->field_schema());
 
-        auto index =
-            std::make_unique<index::JsonKeyInvertedIndex>(fileManagerContext, false);
+        auto index = std::make_unique<index::JsonKeyInvertedIndex>(
+            fileManagerContext, false);
         index->Build(config);
 
         auto binary = std::make_unique<knowhere::BinarySet>(index->Upload(config));
diff --git a/internal/core/unittest/test_c_api.cpp b/internal/core/unittest/test_c_api.cpp
index df8365f3da28d..6a069b859abec 100644
--- a/internal/core/unittest/test_c_api.cpp
+++ b/internal/core/unittest/test_c_api.cpp
@@ -5281,387 +5281,4 @@ TEST(CApiTest, RANGE_SEARCH_WITH_RADIUS_AND_RANGE_FILTER_WHEN_IP_BFLOAT16) {
 
 TEST(CApiTest, IsLoadWithDisk) {
     ASSERT_TRUE(IsLoadWithDisk(INVERTED_INDEX_TYPE, 0));
-}
-
-// 1000 keys
-std::string
-GenerateJson(int N) {
-    std::vector data(N);
-    std::default_random_engine er(67);
-    std::normal_distribution<> distr(0, 1);
-    std::vector keys;
-    for (int i = 0; i < N; i++) {
-        keys.push_back("keys" + std::to_string(i));
-    }
-    
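
// Illustrative sketch (not from the patch): why the null branch in
// JsonKeyInvertedIndex::BuildWithFieldData above still consumes an
// offset. Tantivy doc ids must stay aligned with segment row offsets,
// so a null row writes an empty placeholder instead of being skipped --
// and the loop must continue rather than return, or every later row
// would be dropped from the index. Names follow the patch; the
// surrounding types are elided.
int64_t offset = 0;
for (int i = 0; i < n; ++i) {
    if (!data->is_valid(i)) {
        null_offset.push_back(i);
        std::string empty = "";
        wrapper_->add_multi_data(&empty, 0, offset++);  // placeholder row
        continue;
    }
    AddJson(static_cast<const milvus::Json*>(data->RawValue(i))
                ->data()
                .data(),
            offset++);
}
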
std::string json_string; - std::vector values(N); - for (int i = 0; i < N; i++) { - if (i % 7 == 0 || i % 7 == 4) { - values[i] = std::to_string(er()); - } else if (i % 7 == 1 || i % 7 == 5) { - values[i] = std::to_string(static_cast(er())); - } else if (i % 7 == 2 || i % 7 == 6) { - values[i] = er() / 2 == 0 ? "true" : "false"; - } else if (i % 7 == 3) { - values[i] = "\"xxxx" + std::to_string(i) + "\""; - // } else if (i % 7 == 4) { - // std::vector intvec(10); - // for (int j = 0; j < 10; j++) { - // intvec[j] = std::to_string(i + j); - // } - // values[i] = "[" + join(intvec, ",") + "]"; - // } else if (i % 7 == 5) { - // std::vector doublevec(10); - // for (int j = 0; j < 10; j++) { - // doublevec[j] = - // std::to_string(static_cast(i + j + er())); - // } - // values[i] = "[" + join(doublevec, ",") + "]"; - // } else if (i % 7 == 6) { - // std::vector stringvec(10); - // for (int j = 0; j < 10; j++) { - // stringvec[j] = "\"xxx" + std::to_string(j) + "\""; - // } - // values[i] = "[" + join(stringvec, ",") + "]"; - } - } - json_string += "{"; - for (int i = 0; i < N - 1; i++) { - json_string += R"(")" + keys[i] + R"(":)" + values[i] + R"(,)"; - } - json_string += R"(")" + keys[N - 1] + R"(":)" + values[N - 1]; - json_string += "}"; - return json_string; -} - -void -ParseJson(const std::string& json) { - jsmn_parser p; - jsmntok_t t[2002]; - - jsmn_init(&p); - int r = jsmn_parse( - &p, json.c_str(), strlen(json.c_str()), t, sizeof(t) / sizeof(t[0])); - if (r < 0) { - printf("Failed to parse JSON: %d\n", r); - return; - } - if (r < 1 || t[0].type != JSMN_OBJECT) { - printf("Object expected\n"); - return; - } - //std::cout << r << std::endl; -} - -TEST(CApiTest, test_parse_perform) { - for (int i = 0; i < 10000; i++) { - { - int64_t all_cost = 0; - for (int j = 0; j < 10000; j++) { - auto json_string = GenerateJson(1000); - if (j == 0) { - std::cout << json_string.size() << std::endl; - } - //std::cout << json_string << std::endl; - auto start = std::chrono::steady_clock::now(); - ParseJson(json_string); - all_cost += - std::chrono::duration_cast( - std::chrono::steady_clock::now() - start) - .count(); - } - std::cout << "cost: " << all_cost << "us" << std::endl; - } - { - int64_t all_cost = 0; - for (int j = 0; j < 10000; j++) { - auto json_string = GenerateJson(100); - if (j == 0) { - std::cout << json_string.size() << std::endl; - } - //std::cout << json_string << std::endl; - auto start = std::chrono::steady_clock::now(); - ParseJson(json_string); - all_cost += - std::chrono::duration_cast( - std::chrono::steady_clock::now() - start) - .count(); - } - std::cout << "cost: " << all_cost << "us" << std::endl; - } - { - int64_t all_cost = 0; - for (int j = 0; j < 10000; j++) { - auto json_string = GenerateJson(50); - if (j == 0) { - std::cout << json_string.size() << std::endl; - } - auto start = std::chrono::steady_clock::now(); - ParseJson(json_string); - all_cost += - std::chrono::duration_cast( - std::chrono::steady_clock::now() - start) - .count(); - } - std::cout << "cost: " << all_cost << "us" << std::endl; - } - } -} - -void -extract_key_value_pairs(const char* json, size_t len) { - jsmn_parser parser; - jsmntok_t* tokens = - (jsmntok_t*)malloc(16 * sizeof(jsmntok_t)); // Initial allocation - if (!tokens) { - fprintf(stderr, "Memory allocation failed\n"); - return; - } - int num_tokens = 0; - int token_capacity = 16; - - // Initialize the parser - jsmn_init(&parser); - - size_t pos = 0; - while (pos < len) { - size_t chunk_size = - len - pos > 256 ? 
256 : len - pos; // Read in chunks of 256 bytes - int r = - jsmn_parse(&parser, json + pos, chunk_size, tokens, token_capacity); - if (r < 0) { - if (r == JSMN_ERROR_NOMEM) { - // Reallocate tokens array if not enough space - token_capacity *= 2; // Double the capacity - tokens = (jsmntok_t*)realloc( - tokens, token_capacity * sizeof(jsmntok_t)); - if (!tokens) { - fprintf(stderr, "Memory reallocation failed\n"); - return; - } - continue; // Try parsing again - } else { - fprintf(stderr, "Failed to parse JSON: %d\n", r); - free(tokens); - return; - } - } - - // Update the position - pos += chunk_size; - } - - // Iterate through the tokens - for (int i = 0; i < parser.toknext; i++) { - if (tokens[i].type == JSMN_OBJECT) { - for (int j = 0; j < tokens[i].size; j++) { - // The next token is the key (string) - j++; - printf("Key: %.*s\n", - tokens[j].end - tokens[j].start, - json + tokens[j].start); - - // The next token is the value - j++; - printf("Value: %.*s\n", - tokens[j].end - tokens[j].start, - json + tokens[j].start); - } - } - } - - // Clean up - free(tokens); -} - -void -TravelJson(const char* json, - jsmntok* tokens, - int& index, - std::vector& path) { - jsmntok current = tokens[0]; - if (current.type == JSMN_OBJECT) { - int j = 1; - for (int i = 0; i < current.size; i++) { - assert(tokens[j].type == JSMN_STRING && tokens[j].size != 0); - std::string key(json + tokens[j].start, - tokens[j].end - tokens[j].start); - path.push_back(key); - j++; - int consumed = 0; - TravelJson(json, tokens + j, consumed, path); - path.pop_back(); - j += consumed; - } - index = j; - } else if (current.type == JSMN_PRIMITIVE) { - std::cout << "key:" << Join(path, ".") << "values:" - << std::string(json + current.start, - current.end - current.start) - << std::endl; - index++; - } else if (current.type == JSMN_ARRAY) { - std::cout << "key:" << Join(path, ".") << "values:" - << std::string(json + current.start, - current.end - current.start) - << std::endl; - // skip next array parse - int count = current.size; - int j = 1; - while (count > 0) { - if (tokens[j].size == 0) { - count--; - } else { - count += tokens[j].size; - } - j++; - } - index = j; - - } else if (current.type == JSMN_STRING) { - if (current.size == 0) { - std::cout << "key:" << Join(path, ".") << " values:" - << std::string(json + current.start, - current.end - current.start) - << std::endl; - index++; - } else { - throw std::runtime_error("not should happen"); - } - } else { - throw std::runtime_error("not should happen"); - } -} - -void -extract_key_value_pairs(const char* json) { - jsmn_parser parser; - jsmntok_t* tokens = - (jsmntok_t*)malloc(16 * sizeof(jsmntok_t)); // Initial allocation - if (!tokens) { - fprintf(stderr, "Memory allocation failed\n"); - return; - } - int num_tokens = 0; - int token_capacity = 16; - - // Initialize the parser - jsmn_init(&parser); - - // Parse the JSON string - while (1) { - int r = jsmn_parse(&parser, json, strlen(json), tokens, token_capacity); - if (r < 0) { - if (r == JSMN_ERROR_NOMEM) { - // Reallocate tokens array if not enough space - token_capacity *= 2; // Double the capacity - tokens = (jsmntok_t*)realloc( - tokens, token_capacity * sizeof(jsmntok_t)); - if (!tokens) { - fprintf(stderr, "Memory reallocation failed\n"); - return; - } - continue; // Try parsing again - } else { - fprintf(stderr, "Failed to parse JSON: %d\n", r); - free(tokens); - return; - } - } - num_tokens = r; - break; // Exit the loop if parsing was successful - } - - std::cout << "num_tokens:" << num_tokens << 
std::endl; - // Iterate through the tokens - for (int i = 0; i < num_tokens; i++) { - std::cout << "i:" << i << "type: " << tokens[i].type - << "token size:" << tokens[i].size << std::endl; - printf("value: %.*s\n", - tokens[i].end - tokens[i].start, - json + tokens[i].start); - } - - std::cout << "-----------------" << std::endl; - int index = 0; - std::vector path; - TravelJson(json, tokens, index, path); - - // Clean up - free(tokens); -} - -void -extract_json(const char* json) { - jsmn_parser parser; - jsmntok_t* tokens = - (jsmntok_t*)malloc(16 * sizeof(jsmntok_t)); // Initial allocation - if (!tokens) { - fprintf(stderr, "Memory allocation failed\n"); - return; - } - int num_tokens = 0; - int token_capacity = 16; - - // Initialize the parser - jsmn_init(&parser); - - // Parse the JSON string - while (1) { - int r = jsmn_parse(&parser, json, strlen(json), tokens, token_capacity); - if (r < 0) { - if (r == JSMN_ERROR_NOMEM) { - // Reallocate tokens array if not enough space - token_capacity *= 2; // Double the capacity - tokens = (jsmntok_t*)realloc( - tokens, token_capacity * sizeof(jsmntok_t)); - if (!tokens) { - fprintf(stderr, "Memory reallocation failed\n"); - return; - } - continue; // Try parsing again - } else { - fprintf(stderr, "Failed to parse JSON: %d\n", r); - free(tokens); - return; - } - } - num_tokens = r; - break; // Exit the loop if parsing was successful - } - - // assert(tokens[0].type == JSMN_OBJECT); - - // Iterate through the tokens - for (int i = 0; i < num_tokens; i++) { - std::cout << "i:" << i << "type: " << tokens[i].type - << "token size:" << tokens[i].size << std::endl; - printf("value: %.*s\n", - tokens[i].end - tokens[i].start, - json + tokens[i].start); - } - - // Clean up - free(tokens); -} - -TEST(CApiTest, test_jsmn_function) { - int64_t all_cost = 0; - // auto json_string = GenerateJson(50); - // std::cout << json_string << std::endl; - // extract_key_value_pairs(json_string.c_str()); - - std::string json_string = - R"({"keys0": ["value0", 234, "values1"], "keys1": ["value3", 1235]})"; - std::cout << json_string << std::endl; - extract_key_value_pairs(json_string.c_str()); - - json_string = - R"({"keys0": [{"keys1": 1234, "keys2": "xxx"}, {"keys3": 567, "keys4": "xxxxx"}]})"; - std::cout << json_string << std::endl; - extract_key_value_pairs(json_string.c_str()); - - json_string = R"({"keys0": {"keys1": { "keys2": "xxx", "keys3" :1234}}})"; - std::cout << json_string << std::endl; - extract_key_value_pairs(json_string.c_str()); -} +} \ No newline at end of file diff --git a/internal/core/unittest/test_json_key_index.cpp b/internal/core/unittest/test_json_key_index.cpp index c128c78909535..29b9a675a0ecf 100644 --- a/internal/core/unittest/test_json_key_index.cpp +++ b/internal/core/unittest/test_json_key_index.cpp @@ -24,87 +24,29 @@ #include "test_utils/indexbuilder_test_utils.h" #include "index/Meta.h" #include "index/JsonKeyInvertedIndex.h" - +#include "common/Json.h" +#include "common/Types.h" using namespace milvus::index; using namespace milvus::indexbuilder; using namespace milvus; using namespace milvus::index; -std::string -join(const std::vector& vec, const std::string& delimiter) { - std::ostringstream oss; - for (size_t i = 0; i < vec.size(); ++i) { - oss << vec[i]; - if (i != vec.size() - 1) { - oss << delimiter; - } - } - return oss.str(); -} - -// 1000 keys -static std::string -GenerateJson(int N) { - std::vector data(N); - std::default_random_engine er(42); - std::normal_distribution<> distr(0, 1); - std::vector keys; - for (int i = 
0; i < N; i++) { - keys.push_back("keys" + std::to_string(i)); - } - std::string json_string; - std::vector values(N); - for (int i = 0; i < N; i++) { - if (i % 7 == 0) { - values[i] = std::to_string(er()); - } else if (i % 7 == 1) { - values[i] = std::to_string(static_cast(er())); - } else if (i % 7 == 2) { - values[i] = er() / 2 == 0 ? "true" : "false"; - } else if (i % 7 == 3) { - values[i] = "\"xxxx" + std::to_string(i) + "\""; - } else if (i % 7 == 4) { - std::vector intvec(10); - for (int j = 0; j < 10; j++) { - intvec[j] = std::to_string(i + j); - } - values[i] = "[" + join(intvec, ",") + "]"; - } else if (i % 7 == 5) { - std::vector doublevec(10); - for (int j = 0; j < 10; j++) { - doublevec[j] = - std::to_string(static_cast(i + j + er())); - } - values[i] = "[" + join(doublevec, ",") + "]"; - } else if (i % 7 == 6) { - std::vector stringvec(10); - for (int j = 0; j < 10; j++) { - stringvec[j] = "\"xxx" + std::to_string(j) + "\""; - } - values[i] = "[" + join(stringvec, ",") + "]"; - } - } - json_string += "{"; - for (int i = 0; i < N - 1; i++) { - json_string += R"(")" + keys[i] + R"(":)" + values[i] + R"(,)"; - } - json_string += R"(")" + keys[N - 1] + R"(":)" + values[N - 1]; - json_string += "}"; - return json_string; -} - -static std::vector -GenerateJsons(int size, int dim) { +static std::vector +GenerateJsons(int size) { std::vector jsons; - for (int i = 0; i < size; ++i) { - std::cout << GenerateJson(dim) << std::endl; - jsons.push_back( - milvus::Json(simdjson::padded_string(GenerateJson(dim)))); + std::default_random_engine random(42); + std::normal_distribution<> distr(0, 1); + for (int i = 0; i < size; i++) { + auto str = R"({"int":)" + std::to_string(random()) + R"(,"double":)" + + std::to_string(static_cast(random())) + + R"(,"string":")" + std::to_string(random()) + + R"(","bool": true)" + R"(, "array": [1,2,3])" + "}"; + jsons.push_back(milvus::Json(simdjson::padded_string(str))); } return jsons; } -class JsonKeyIndexTest : public testing::Test { +class JsonKeyIndexTest : public ::testing::TestWithParam { protected: void Init(int64_t collection_id, @@ -113,19 +55,42 @@ class JsonKeyIndexTest : public testing::Test { int64_t field_id, int64_t index_build_id, int64_t index_version, - int64_t size, - int64_t dim) { + int64_t size) { proto::schema::FieldSchema field_schema; field_schema.set_data_type(proto::schema::DataType::JSON); - + field_schema.set_nullable(nullable_); auto field_meta = storage::FieldDataMeta{ collection_id, partition_id, segment_id, field_id, field_schema}; auto index_meta = storage::IndexMeta{ segment_id, field_id, index_build_id, index_version}; - data_ = std::move(GenerateJsons(size, dim)); - auto field_data = storage::CreateFieldData(DataType::JSON); - field_data->FillFieldData(data_.data(), data_.size()); + data_ = std::move(GenerateJsons(size)); + auto field_data = storage::CreateFieldData(DataType::JSON, nullable_); + if (nullable_) { + valid_data.reserve(size_); + for (size_t i = 0; i < size_; i++) { + valid_data.push_back(false); + } + } + if (nullable_) { + int byteSize = (size_ + 7) / 8; + uint8_t* valid_data_ = new uint8_t[byteSize]; + for (int i = 0; i < size_; i++) { + bool value = valid_data[i]; + int byteIndex = i / 8; + int bitIndex = i % 8; + if (value) { + valid_data_[byteIndex] |= (1 << bitIndex); + } else { + valid_data_[byteIndex] &= ~(1 << bitIndex); + } + } + field_data->FillFieldData(data_.data(), valid_data_, data_.size()); + delete[] valid_data_; + } else { + field_data->FillFieldData(data_.data(), data_.size()); + } + 
storage::InsertData insert_data(field_data); insert_data.SetFieldDataMeta(field_meta); insert_data.SetTimestamps(0, 100); @@ -163,13 +128,9 @@ class JsonKeyIndexTest : public testing::Test { index_->Load(milvus::tracer::TraceContext{}, config); } - virtual void - SetParam() { - } void SetUp() override { - SetParam(); - + nullable_ = GetParam(); type_ = DataType::JSON; int64_t collection_id = 1; int64_t partition_id = 2; @@ -177,8 +138,7 @@ class JsonKeyIndexTest : public testing::Test { int64_t field_id = 101; int64_t index_build_id = 1000; int64_t index_version = 10000; - size_ = 10; - dim_ = 10; + size_ = 1; std::string root_path = "/tmp/test-jsonkey-index/"; storage::StorageConfig storage_config; @@ -192,8 +152,7 @@ class JsonKeyIndexTest : public testing::Test { field_id, index_build_id, index_version, - size_, - dim_); + size_); } virtual ~JsonKeyIndexTest() override { @@ -201,233 +160,330 @@ class JsonKeyIndexTest : public testing::Test { } public: - void - TestTermInFunc() { - { - std::vector> testcases{{"705894"}}; - for (auto testcase : testcases) { - auto check = [&](std::string value) { - std::unordered_set term_set(testcase.begin(), - testcase.end()); - return term_set.find(value) != term_set.end(); - }; - std::unordered_set term_set(testcase.begin(), - testcase.end()); - auto filter_func = [&term_set, this](uint32_t row_id, - uint16_t offset, - uint16_t size) { - auto val = - this->data_[row_id].template at_pos( - offset, size); - if (val.second != "") { - return false; - } - return term_set.find((std::string(val.first))) != - term_set.end(); - }; - auto bitset = - index_->FilterByPath("/keys0", size_, filter_func); + std::shared_ptr index_; + DataType type_; + bool nullable_; + size_t size_; + FixedVector valid_data; + std::vector data_; + std::vector json_col; + std::shared_ptr chunk_manager_; +}; - ASSERT_EQ(bitset.size(), size_); - for (int i = 0; i < bitset.size(); ++i) { - auto ans = bitset[i]; - auto ref = check("705894"); - ASSERT_EQ(ans, ref); - } +INSTANTIATE_TEST_SUITE_P(JsonKeyIndexTestSuite, + JsonKeyIndexTest, + ::testing::Values(true, false)); + +TEST_P(JsonKeyIndexTest, TestTermInFunc) { + struct Testcase { + std::vector term; + std::vector nested_path; + }; + std::vector testcases{ + {{1, 2, 3, 4}, {"int"}}, + {{10, 100, 1000, 10000}, {"int"}}, + {{100, 10000, 9999, 444}, {"int"}}, + {{23, 42, 66, 17, 25}, {"int"}}, + }; + for (auto testcase : testcases) { + auto check = [&](int64_t value) { + std::unordered_set term_set(testcase.term.begin(), + testcase.term.end()); + return term_set.find(value) != term_set.end(); + }; + std::unordered_set term_set(testcase.term.begin(), + testcase.term.end()); + auto filter_func = [&term_set, this](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto val = this->data_[row_id].template at(offset, size); + if (val.error()) { + return false; + } + return term_set.find(int64_t(val.value())) != term_set.end(); + }; + auto pointer = milvus::Json::pointer(testcase.nested_path); + auto bitset = index_->FilterByPath(pointer, size_, filter_func); + ASSERT_EQ(bitset.size(), size_); + for (int i = 0; i < bitset.size(); ++i) { + if (nullable_ && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { + auto val = data_[i].template at(pointer).value(); + auto ans = bitset[i]; + auto ref = check(val); + ASSERT_EQ(ans, ref); } } - { - std::vector testcases{"true"}; - for (auto& value : testcases) { - auto filter_func = [this, &value](uint32_t row_id, - uint16_t offset, - uint16_t size) { - auto val = - 
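
// Illustrative sketch (not from the patch): the contract every test
// below exercises. FilterByPath resolves a JSON pointer to
// (row_id, offset, length) triples and the caller re-parses only that
// slice of the original document, so the index itself stays
// type-agnostic. index, data and total_rows are assumed to be set up
// as in Init above.
auto filter = [&](uint32_t row_id, uint16_t offset, uint16_t size) {
    auto val = data[row_id].at<int64_t>(offset, size);  // slice parse only
    return !val.error() && val.value() > 0;  // any predicate fits here
};
auto bitset = index->FilterByPath("/int", total_rows, filter);
// bitset[i] is true iff row i has key "/int" and the predicate holds.
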
this->data_[row_id].template at_pos( - offset, size); - if (val.second != "") { - return false; - } - return std::string(val.first) == value; - }; + } +} + +TEST_P(JsonKeyIndexTest, TestUnaryRangeInFunc) { + struct Testcase { + int64_t val; + std::vector nested_path; + }; + std::vector testcases{ + {10, {"int"}}, + {20, {"int"}}, + {30, {"int"}}, + {40, {"int"}}, + }; + std::vector ops{ + OpType::Equal, + OpType::NotEqual, + OpType::GreaterThan, + OpType::GreaterEqual, + OpType::LessThan, + OpType::LessEqual, + }; + for (const auto& testcase : testcases) { + auto check = [&](int64_t value) { return value == testcase.val; }; + std::function f = check; + for (auto& op : ops) { + switch (op) { + case OpType::Equal: { + f = [&](int64_t value) { return value == testcase.val; }; + break; + } + case OpType::NotEqual: { + f = [&](int64_t value) { return value != testcase.val; }; + break; + } + case OpType::GreaterEqual: { + f = [&](int64_t value) { return value >= testcase.val; }; + break; + } + case OpType::GreaterThan: { + f = [&](int64_t value) { return value > testcase.val; }; + break; + } + case OpType::LessEqual: { + f = [&](int64_t value) { return value <= testcase.val; }; + break; + } + case OpType::LessThan: { + f = [&](int64_t value) { return value < testcase.val; }; + break; + } + default: { + PanicInfo(Unsupported, "unsupported range node"); + } + } - auto bitset = - index_->FilterByPath("/keys2", size_, filter_func); - ASSERT_EQ(bitset.size(), size_); - for (int i = 0; i < bitset.size(); ++i) { + auto filter_func = [&op, &testcase, this](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto val = + this->data_[row_id].template at(offset, size); + if (val.error()) { + return false; + } + switch (op) { + case OpType::GreaterThan: + return int64_t(val.value()) > testcase.val; + case OpType::GreaterEqual: + return int64_t(val.value()) >= testcase.val; + case OpType::LessThan: + return int64_t(val.value()) < testcase.val; + case OpType::LessEqual: + return int64_t(val.value()) <= testcase.val; + case OpType::Equal: + return int64_t(val.value()) == testcase.val; + case OpType::NotEqual: + return int64_t(val.value()) != testcase.val; + default: + return false; + } + }; + auto pointer = milvus::Json::pointer(testcase.nested_path); + auto bitset = index_->FilterByPath(pointer, size_, filter_func); + ASSERT_EQ(bitset.size(), size_); + for (int i = 0; i < bitset.size(); ++i) { + if (nullable_ && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { auto ans = bitset[i]; - auto ref = (value == "false"); - ASSERT_EQ(ans, ref); + if (testcase.nested_path[0] == "int") { + auto val = + data_[i].template at(pointer).value(); + auto ref = f(val); + ASSERT_EQ(ans, ref); + } else { + auto val = + data_[i].template at(pointer).value(); + auto ref = f(val); + ASSERT_EQ(ans, ref); + } } } } } - void - TestUnaryRangeInFunc() { - std::vector testcases{"10", "705894", "805894"}; - std::vector ops{ - OpType::Equal, - OpType::NotEqual, - OpType::GreaterThan, - OpType::GreaterEqual, - OpType::LessThan, - OpType::LessEqual, +} + +TEST_P(JsonKeyIndexTest, TestBinaryRangeInFunc) { + struct Testcase { + bool lower_inclusive; + bool upper_inclusive; + int64_t lower; + int64_t upper; + std::vector nested_path; + }; + std::vector testcases{ + {true, false, 10, 20, {"int"}}, + {true, true, 20, 30, {"int"}}, + {false, true, 30, 40, {"int"}}, + {false, false, 40, 50, {"int"}}, + {true, false, 10, 20, {"double"}}, + {true, true, 20, 30, {"double"}}, + {false, true, 30, 40, {"double"}}, + {false, false, 40, 50, 
{"double"}}, + }; + for (const auto& testcase : testcases) { + auto check = [&](int64_t value) { + if (testcase.lower_inclusive && testcase.upper_inclusive) { + return testcase.lower <= value && value <= testcase.upper; + } else if (testcase.lower_inclusive && !testcase.upper_inclusive) { + return testcase.lower <= value && value < testcase.upper; + } else if (!testcase.lower_inclusive && testcase.upper_inclusive) { + return testcase.lower < value && value <= testcase.upper; + } else { + return testcase.lower < value && value < testcase.upper; + } }; - for (const auto& testcase : testcases) { - auto check = [&](std::string value) { return value == testcase; }; - std::function f = check; - for (auto& op : ops) { - switch (op) { - case OpType::Equal: { - f = [&](std::string value) { - return value == testcase; - }; - break; - } - case OpType::NotEqual: { - f = [&](std::string value) { - return value != testcase; - }; - break; - } - case OpType::GreaterEqual: { - f = [&](std::string value) { - return value >= testcase; - }; - break; - } - case OpType::GreaterThan: { - f = [&](std::string value) { return value > testcase; }; - break; - } - case OpType::LessEqual: { - f = [&](std::string value) { - return value <= testcase; - }; - break; - } - case OpType::LessThan: { - f = [&](std::string value) { return value < testcase; }; - break; - } - default: { - PanicInfo(Unsupported, "unsupported range node"); - } - } - auto filter_func = [&op, &testcase, this](uint32_t row_id, - uint16_t offset, - uint16_t size) { - auto val = - this->data_[row_id].template at_pos( - offset, size); - if (val.second != "") { - return false; - } - switch (op) { - case OpType::GreaterThan: - return std::string(val.first) > testcase; - case OpType::GreaterEqual: - return std::string(val.first) >= testcase; - case OpType::LessThan: - return std::string(val.first) < testcase; - case OpType::LessEqual: - return std::string(val.first) <= testcase; - case OpType::Equal: - return std::string(val.first) == testcase; - case OpType::NotEqual: - return std::string(val.first) != testcase; - default: - return false; - } - }; - auto bitset = - index_->FilterByPath("/keys0", size_, filter_func); - ASSERT_EQ(bitset.size(), size_); - for (int i = 0; i < bitset.size(); ++i) { - auto ans = bitset[i]; - auto ref = f("705894"); + auto filter_func = [&testcase, this](uint32_t row_id, + uint16_t offset, + uint16_t size) { + auto val = this->data_[row_id].template at(offset, size); + if (val.error()) { + return false; + } + if (testcase.lower_inclusive && testcase.upper_inclusive) { + return testcase.lower <= int64_t(val.value()) && + int64_t(val.value()) <= testcase.upper; + } else if (testcase.lower_inclusive && !testcase.upper_inclusive) { + return testcase.lower <= int64_t(val.value()) && + int64_t(val.value()) < testcase.upper; + } else if (!testcase.lower_inclusive && testcase.upper_inclusive) { + return testcase.lower < int64_t(val.value()) && + int64_t(val.value()) <= testcase.upper; + } else { + return testcase.lower < int64_t(val.value()) && + int64_t(val.value()) < testcase.upper; + } + }; + auto pointer = milvus::Json::pointer(testcase.nested_path); + auto bitset = index_->FilterByPath(pointer, size_, filter_func); + ASSERT_EQ(bitset.size(), size_); + for (int i = 0; i < bitset.size(); ++i) { + if (nullable_ && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { + auto ans = bitset[i]; + if (testcase.nested_path[0] == "int") { + auto val = data_[i].template at(pointer).value(); + auto ref = check(val); + ASSERT_EQ(ans, ref); 
+ } else { + auto val = data_[i].template at(pointer).value(); + auto ref = check(val); ASSERT_EQ(ans, ref); } } } } +} - void - TestBinaryRangeInFunc() { - struct Testcase { - bool lower_inclusive; - bool upper_inclusive; - std::string lower; - std::string upper; - }; +TEST_P(JsonKeyIndexTest, TestExistInFunc) { + struct Testcase { + std::vector nested_path; + }; + std::vector testcases{ + {{"A"}}, + {{"int"}}, + {{"double"}}, + {{"B"}}, + }; + for (const auto& testcase : testcases) { + auto pointer = milvus::Json::pointer(testcase.nested_path); + auto filter_func = + [&pointer, this](uint32_t row_id, uint16_t offset, uint16_t size) { + return this->data_[row_id].exist(pointer); + }; + + auto bitset = index_->FilterByPath(pointer, size_, filter_func); + ASSERT_EQ(bitset.size(), size_); + for (int i = 0; i < bitset.size(); ++i) { + if (nullable_ && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { + auto ans = bitset[i]; + auto val = data_[i].exist(pointer); + ASSERT_EQ(ans, val); + } + } + } +} +TEST_P(JsonKeyIndexTest, TestJsonContainsAllFunc) { + struct Testcase { + std::vector term; + std::vector nested_path; + }; + { std::vector testcases{ - {true, false, "10", "20"}, - {true, true, "20", "30"}, - {false, true, "30", "40"}, - {false, false, "40", "50"}, + {{1, 2, 3}, {"array"}}, + {{10, 100}, {"array"}}, + {{100, 1000}, {"array"}}, }; for (const auto& testcase : testcases) { - auto check = [&](std::string value) { - if (testcase.lower_inclusive && testcase.upper_inclusive) { - return testcase.lower <= value && value <= testcase.upper; - } else if (testcase.lower_inclusive && - !testcase.upper_inclusive) { - return testcase.lower <= value && value < testcase.upper; - } else if (!testcase.lower_inclusive && - testcase.upper_inclusive) { - return testcase.lower < value && value <= testcase.upper; - } else { - return testcase.lower < value && value < testcase.upper; + auto check = [&](const std::vector& values) { + for (auto const& e : testcase.term) { + if (std::find(values.begin(), values.end(), e) == + values.end()) { + return false; + } } + return true; }; - - auto filter_func = [&testcase, this](uint32_t row_id, + auto pointer = milvus::Json::pointer(testcase.nested_path); + std::unordered_set elements; + for (auto const& element : testcase.term) { + elements.insert(element); + } + auto filter_func = [&elements, this](uint32_t row_id, uint16_t offset, uint16_t size) { - auto val = - this->data_[row_id].template at_pos( - offset, size); - if (val.second != "") { - return false; - } - if (testcase.lower_inclusive && testcase.upper_inclusive) { - return testcase.lower <= std::string(val.first) && - std::string(val.first) <= testcase.upper; - } else if (testcase.lower_inclusive && - !testcase.upper_inclusive) { - return testcase.lower <= std::string(val.first) && - std::string(val.first) < testcase.upper; - } else if (!testcase.lower_inclusive && - testcase.upper_inclusive) { - return testcase.lower < std::string(val.first) && - std::string(val.first) <= testcase.upper; - } else { - return testcase.lower < std::string(val.first) && - std::string(val.first) < testcase.upper; + auto array = this->data_[row_id].array_at(offset, size); + std::unordered_set tmp_elements(elements); + for (auto&& it : array) { + auto val = it.template get(); + if (val.error()) { + continue; + } + tmp_elements.erase(val.value()); + if (tmp_elements.size() == 0) { + return true; + } } + return tmp_elements.empty(); }; - auto bitset = index_->FilterByPath("/keys7", size_, filter_func); + + auto bitset = 
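
// Illustrative sketch (not from the patch): the scratch-set erase used
// by the contains-all filter above, isolated. Copying the term set per
// row allows an early exit as soon as every requested element has been
// seen. want is a hypothetical term set; doc is a milvus::Json row.
#include <unordered_set>

std::unordered_set<int64_t> want{1, 2, 3};
auto contains_all = [&](const milvus::Json& doc, uint16_t off, uint16_t len) {
    auto scratch = want;
    for (auto&& item : doc.array_at(off, len)) {
        auto v = item.template get<int64_t>();
        if (!v.error()) {
            scratch.erase(v.value());
            if (scratch.empty()) {
                return true;  // all requested elements found
            }
        }
    }
    return scratch.empty();
};
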
index_->FilterByPath(pointer, size_, filter_func); ASSERT_EQ(bitset.size(), size_); for (int i = 0; i < bitset.size(); ++i) { - auto ans = bitset[i]; - auto ref = check("970724117"); - ASSERT_EQ(ans, ref); + if (nullable_ && !valid_data[i]) { + ASSERT_EQ(bitset[i], false); + } else { + auto ans = bitset[i]; + auto array = data_[i].array_at(pointer); + std::vector res; + for (const auto& element : array) { + res.push_back(element.template get()); + } + ASSERT_EQ(ans, check(res)); + } } } } - - public: - std::shared_ptr index_; - DataType type_; - size_t size_; - size_t dim_; - std::vector data_; - std::shared_ptr chunk_manager_; -}; - -TEST_F(JsonKeyIndexTest, CountFuncTest) { - TestTermInFunc(); - TestUnaryRangeInFunc(); - TestBinaryRangeInFunc(); } \ No newline at end of file diff --git a/internal/datacoord/job_manager.go b/internal/datacoord/job_manager.go index 4ffc24c6702bf..ba74d71a4a226 100644 --- a/internal/datacoord/job_manager.go +++ b/internal/datacoord/job_manager.go @@ -142,10 +142,19 @@ func needDoTextIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool { } func needDoJsonKeyIndex(segment *SegmentInfo, fieldIDs []UniqueID) bool { - if !isFlush(segment) { + if !(isFlush(segment) && segment.GetLevel() != datapb.SegmentLevel_L0 && + segment.GetIsSorted()) { return false } - return true + for _, fieldID := range fieldIDs { + if segment.GetJsonKeyStats() == nil { + return true + } + if segment.GetJsonKeyStats()[fieldID] == nil { + return true + } + } + return false } func needDoBM25(segment *SegmentInfo, fieldIDs []UniqueID) bool { diff --git a/internal/datacoord/job_manager_test.go b/internal/datacoord/job_manager_test.go index a0d95e4cd5b3d..3627f027a703a 100644 --- a/internal/datacoord/job_manager_test.go +++ b/internal/datacoord/job_manager_test.go @@ -62,6 +62,11 @@ func (s *jobManagerSuite) TestJobManager_triggerStatsTaskLoop() { }, }, }, + { + FieldID: 102, + Name: "json", + DataType: schemapb.DataType_JSON, + }, }, }, }, @@ -117,5 +122,5 @@ func (s *jobManagerSuite) TestJobManager_triggerStatsTaskLoop() { jm.loopWg.Wait() - s.Equal(2, len(jm.scheduler.tasks)) + s.Equal(3, len(jm.scheduler.tasks)) } diff --git a/internal/indexnode/indexnode_service.go b/internal/indexnode/indexnode_service.go index 2948c87829f50..53aee5afdb765 100644 --- a/internal/indexnode/indexnode_service.go +++ b/internal/indexnode/indexnode_service.go @@ -516,7 +516,7 @@ func (i *IndexNode) QueryJobsV2(ctx context.Context, req *workerpb.QueryJobsV2Re InsertLogs: info.insertLogs, StatsLogs: info.statsLogs, TextStatsLogs: info.textStatsLogs, - Bm25Logs: info.bm25Logs, + Bm25Logs: info.bm25Logs, NumRows: info.numRows, JsonKeyStatsLogs: info.jsonKeyStatsLogs, }) diff --git a/internal/indexnode/task_stats.go b/internal/indexnode/task_stats.go index 787fc32d6daa9..40f2dc93a1d4e 100644 --- a/internal/indexnode/task_stats.go +++ b/internal/indexnode/task_stats.go @@ -333,7 +333,7 @@ func (st *statsTask) Execute(ctx context.Context) error { return err } } else if st.req.GetSubJobType() == indexpb.StatsSubJob_JsonKeyIndexJob { - err = st.createJsonKeyIndex(ctx, + err = st.createJSONKeyIndex(ctx, st.req.GetStorageConfig(), st.req.GetCollectionID(), st.req.GetPartitionID(), @@ -729,7 +729,7 @@ func (st *statsTask) createTextIndex(ctx context.Context, return nil } -func (st *statsTask) createJsonKeyIndex(ctx context.Context, +func (st *statsTask) createJSONKeyIndex(ctx context.Context, storageConfig *indexpb.StorageConfig, collectionID int64, partitionID int64, @@ -773,7 +773,7 @@ func (st *statsTask) 
createJsonKeyIndex(ctx context.Context, jsonKeyIndexStats := make(map[int64]*datapb.JsonKeyStats) for _, field := range st.req.GetSchema().GetFields() { h := typeutil.CreateFieldSchemaHelper(field) - if !h.EnableJsonKeyIndex() { + if !h.EnableJSONKeyIndex() { continue } log.Info("field enable json key index, ready to create json key index", zap.Int64("field id", field.GetFieldID())) @@ -794,7 +794,7 @@ func (st *statsTask) createJsonKeyIndex(ctx context.Context, StorageConfig: newStorageConfig, } - uploaded, err := indexcgowrapper.CreateJsonKeyIndex(ctx, buildIndexParams) + uploaded, err := indexcgowrapper.CreateJSONKeyIndex(ctx, buildIndexParams) if err != nil { return err } @@ -812,7 +812,7 @@ func (st *statsTask) createJsonKeyIndex(ctx context.Context, totalElapse := st.tr.RecordSpan() - st.node.storeJsonKeyIndexResult(st.req.GetClusterID(), + st.node.storeJSONKeyIndexResult(st.req.GetClusterID(), st.req.GetTaskID(), st.req.GetCollectionID(), st.req.GetPartitionID(), diff --git a/internal/indexnode/taskinfo_ops.go b/internal/indexnode/taskinfo_ops.go index f15050af4eb68..41538b0f74953 100644 --- a/internal/indexnode/taskinfo_ops.go +++ b/internal/indexnode/taskinfo_ops.go @@ -323,7 +323,7 @@ type statsTaskInfo struct { insertLogs []*datapb.FieldBinlog statsLogs []*datapb.FieldBinlog textStatsLogs map[int64]*datapb.TextIndexStats - bm25Logs []*datapb.FieldBinlog + bm25Logs []*datapb.FieldBinlog jsonKeyStatsLogs map[int64]*datapb.JsonKeyStats } @@ -411,14 +411,15 @@ func (i *IndexNode) storeStatsTextIndexResult( } } -func (i *IndexNode) storeJsonKeyIndexResult( +func (i *IndexNode) storeJSONKeyIndexResult( clusterID string, taskID UniqueID, collID UniqueID, partID UniqueID, segID UniqueID, channel string, - jsonKeyIndexLogs map[int64]*datapb.JsonKeyStats) { + jsonKeyIndexLogs map[int64]*datapb.JsonKeyStats, +) { key := taskKey{ClusterID: clusterID, TaskID: taskID} i.stateLock.Lock() defer i.stateLock.Unlock() @@ -448,7 +449,7 @@ func (i *IndexNode) getStatsTaskInfo(clusterID string, taskID UniqueID) *statsTa insertLogs: info.insertLogs, statsLogs: info.statsLogs, textStatsLogs: info.textStatsLogs, - bm25Logs: info.bm25Logs, + bm25Logs: info.bm25Logs, jsonKeyStatsLogs: info.jsonKeyStatsLogs, } } diff --git a/internal/proto/query_coord.proto b/internal/proto/query_coord.proto index a38fdc0c535fd..0d0d56dcacd71 100644 --- a/internal/proto/query_coord.proto +++ b/internal/proto/query_coord.proto @@ -396,6 +396,7 @@ enum LoadScope { Full = 0; Delta = 1; Index = 2; + Stats = 3; } message LoadSegmentsRequest { @@ -645,6 +646,7 @@ message SegmentVersionInfo { map index_info = 7; data.SegmentLevel level = 8; bool is_sorted = 9; + repeated int64 field_json_index_stats = 10; } message ChannelVersionInfo { diff --git a/internal/querycoordv2/checkers/controller.go b/internal/querycoordv2/checkers/controller.go index 2cc46e5f1f11b..1baa0acbffe2a 100644 --- a/internal/querycoordv2/checkers/controller.go +++ b/internal/querycoordv2/checkers/controller.go @@ -72,6 +72,7 @@ func NewCheckerController( // todo temporary work around must fix // utils.LeaderChecker: NewLeaderChecker(meta, dist, targetMgr, nodeMgr, true), utils.LeaderChecker: NewLeaderChecker(meta, dist, targetMgr, nodeMgr), + utils.StatsChecker: NewStatsChecker(meta, dist, broker, nodeMgr, targetMgr), } manualCheckChs := map[utils.CheckerType]chan struct{}{ @@ -112,6 +113,8 @@ func getCheckerInterval(checker utils.CheckerType) time.Duration { return Params.QueryCoordCfg.IndexCheckInterval.GetAsDuration(time.Millisecond) case 
utils.LeaderChecker:
 		return Params.QueryCoordCfg.LeaderViewUpdateInterval.GetAsDuration(time.Second)
+	case utils.StatsChecker:
+		return Params.QueryCoordCfg.IndexCheckInterval.GetAsDuration(time.Millisecond)
 	default:
 		return Params.QueryCoordCfg.CheckInterval.GetAsDuration(time.Millisecond)
 	}
diff --git a/internal/querycoordv2/checkers/controller_base_test.go b/internal/querycoordv2/checkers/controller_base_test.go
index 0d8e301492b51..e66123b2aa38a 100644
--- a/internal/querycoordv2/checkers/controller_base_test.go
+++ b/internal/querycoordv2/checkers/controller_base_test.go
@@ -103,7 +103,7 @@ func (s *ControllerBaseTestSuite) TestActivation() {
 
 func (s *ControllerBaseTestSuite) TestListCheckers() {
 	checkers := s.controller.Checkers()
-	s.Equal(5, len(checkers))
+	s.Equal(6, len(checkers))
 }
 
 func TestControllerBaseTestSuite(t *testing.T) {
diff --git a/internal/querycoordv2/checkers/stats_checker.go b/internal/querycoordv2/checkers/stats_checker.go
new file mode 100644
index 0000000000000..0d266b17b2395
--- /dev/null
+++ b/internal/querycoordv2/checkers/stats_checker.go
@@ -0,0 +1,185 @@
+// Licensed to the LF AI & Data foundation under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package checkers
+
+import (
+	"context"
+	"time"
+
+	"github.com/samber/lo"
+	"go.uber.org/zap"
+
+	"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
+	"github.com/milvus-io/milvus/internal/proto/datapb"
+	"github.com/milvus-io/milvus/internal/proto/querypb"
+	"github.com/milvus-io/milvus/internal/querycoordv2/meta"
+	"github.com/milvus-io/milvus/internal/querycoordv2/params"
+	"github.com/milvus-io/milvus/internal/querycoordv2/session"
+	"github.com/milvus-io/milvus/internal/querycoordv2/task"
+	"github.com/milvus-io/milvus/internal/querycoordv2/utils"
+	"github.com/milvus-io/milvus/pkg/log"
+	"github.com/milvus-io/milvus/pkg/util/typeutil"
+)
+
+var _ Checker = (*StatsChecker)(nil)
+
+// StatsChecker performs segment stats index checks.
+type StatsChecker struct { + *checkerActivation + meta *meta.Meta + dist *meta.DistributionManager + broker meta.Broker + nodeMgr *session.NodeManager + + targetMgr meta.TargetManagerInterface +} + +func NewStatsChecker( + meta *meta.Meta, + dist *meta.DistributionManager, + broker meta.Broker, + nodeMgr *session.NodeManager, + targetMgr meta.TargetManagerInterface, +) *StatsChecker { + return &StatsChecker{ + checkerActivation: newCheckerActivation(), + meta: meta, + dist: dist, + broker: broker, + nodeMgr: nodeMgr, + targetMgr: targetMgr, + } +} + +func (c *StatsChecker) ID() utils.CheckerType { + return utils.StatsChecker +} + +func (c *StatsChecker) Description() string { + return "StatsChecker checks stats state change of segments and generates load stats task" +} + +func (c *StatsChecker) Check(ctx context.Context) []task.Task { + if !c.IsActive() { + return nil + } + collectionIDs := c.meta.CollectionManager.GetAll(ctx) + var tasks []task.Task + + for _, collectionID := range collectionIDs { + resp, err := c.broker.DescribeCollection(ctx, collectionID) + if err != nil { + log.Warn("describeCollection during check stats", zap.Int64("collection", collectionID)) + continue + } + collection := c.meta.CollectionManager.GetCollection(ctx, collectionID) + if collection == nil { + log.Warn("collection released during check stats", zap.Int64("collection", collectionID)) + continue + } + replicas := c.meta.ReplicaManager.GetByCollection(ctx, collectionID) + for _, replica := range replicas { + tasks = append(tasks, c.checkReplica(ctx, collection, replica, resp)...) + } + } + + return tasks +} + +func (c *StatsChecker) checkReplica(ctx context.Context, collection *meta.Collection, replica *meta.Replica, resp *milvuspb.DescribeCollectionResponse) []task.Task { + var tasks []task.Task + segments := c.dist.SegmentDistManager.GetByFilter(meta.WithCollectionID(replica.GetCollectionID()), meta.WithReplica(replica)) + idSegments := make(map[int64]*meta.Segment) + roNodeSet := typeutil.NewUniqueSet(replica.GetRONodes()...) 
+ targets := make(map[int64][]int64) // segmentID => FieldID + for _, segment := range segments { + // skip update index in read only node + if roNodeSet.Contain(segment.Node) { + continue + } + + // skip update index for l0 segment + segmentInTarget := c.targetMgr.GetSealedSegment(ctx, collection.GetCollectionID(), segment.GetID(), meta.CurrentTargetFirst) + if segmentInTarget == nil || segmentInTarget.GetLevel() == datapb.SegmentLevel_L0 { + continue + } + missing := c.checkSegment(segment, resp) + if len(missing) > 0 { + targets[segment.GetID()] = missing + idSegments[segment.GetID()] = segment + } + } + + segmentsToUpdate := typeutil.NewSet[int64]() + for _, segmentIDs := range lo.Chunk(lo.Keys(targets), MaxSegmentNumPerGetIndexInfoRPC) { + for _, segmentID := range segmentIDs { + segmentsToUpdate.Insert(segmentID) + } + } + + tasks = lo.FilterMap(segmentsToUpdate.Collect(), func(segmentID int64, _ int) (task.Task, bool) { + return c.createSegmentUpdateTask(ctx, idSegments[segmentID], replica) + }) + + return tasks +} + +func (c *StatsChecker) checkSegment(segment *meta.Segment, resp *milvuspb.DescribeCollectionResponse) (missFieldIDs []int64) { + var result []int64 + for _, field := range resp.GetSchema().GetFields() { + h := typeutil.CreateFieldSchemaHelper(field) + if h.EnableJSONKeyIndex() { + exists := false + for i := 0; i < len(segment.JSONIndexField); i++ { + if segment.JSONIndexField[i] == field.FieldID { + exists = true + break + } + } + + if !exists { + result = append(result, field.FieldID) + continue + } + } + } + return result +} + +func (c *StatsChecker) createSegmentUpdateTask(ctx context.Context, segment *meta.Segment, replica *meta.Replica) (task.Task, bool) { + action := task.NewSegmentActionWithScope(segment.Node, task.ActionTypeStatsUpdate, segment.GetInsertChannel(), segment.GetID(), querypb.DataScope_Historical) + t, err := task.NewSegmentTask( + ctx, + params.Params.QueryCoordCfg.SegmentTaskTimeout.GetAsDuration(time.Millisecond), + c.ID(), + segment.GetCollectionID(), + replica, + action, + ) + if err != nil { + log.Warn("create segment stats update task failed", + zap.Int64("collection", segment.GetCollectionID()), + zap.String("channel", segment.GetInsertChannel()), + zap.Int64("node", segment.Node), + zap.Error(err), + ) + return nil, false + } + t.SetPriority(task.TaskPriorityLow) + t.SetReason("missing json stats") + return t, true +} diff --git a/internal/querycoordv2/checkers/stats_checker_test.go b/internal/querycoordv2/checkers/stats_checker_test.go new file mode 100644 index 0000000000000..db2500fedb8bf --- /dev/null +++ b/internal/querycoordv2/checkers/stats_checker_test.go @@ -0,0 +1,280 @@ +// Licensed to the LF AI & Data foundation under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package checkers + +import ( + "context" + "testing" + + "github.com/cockroachdb/errors" + "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/suite" + + "github.com/milvus-io/milvus-proto/go-api/v2/milvuspb" + "github.com/milvus-io/milvus-proto/go-api/v2/schemapb" + etcdkv "github.com/milvus-io/milvus/internal/kv/etcd" + "github.com/milvus-io/milvus/internal/metastore/kv/querycoord" + "github.com/milvus-io/milvus/internal/proto/datapb" + "github.com/milvus-io/milvus/internal/querycoordv2/meta" + "github.com/milvus-io/milvus/internal/querycoordv2/params" + "github.com/milvus-io/milvus/internal/querycoordv2/session" + "github.com/milvus-io/milvus/internal/querycoordv2/task" + "github.com/milvus-io/milvus/internal/querycoordv2/utils" + "github.com/milvus-io/milvus/pkg/kv" + "github.com/milvus-io/milvus/pkg/util/etcd" + "github.com/milvus-io/milvus/pkg/util/merr" + "github.com/milvus-io/milvus/pkg/util/paramtable" +) + +type StatsCheckerSuite struct { + suite.Suite + kv kv.MetaKv + checker *StatsChecker + meta *meta.Meta + broker *meta.MockBroker + nodeMgr *session.NodeManager + targetMgr *meta.MockTargetManager +} + +func (suite *StatsCheckerSuite) SetupSuite() { + paramtable.Init() +} + +func (suite *StatsCheckerSuite) SetupTest() { + var err error + config := params.GenerateEtcdConfig() + cli, err := etcd.GetEtcdClient( + config.UseEmbedEtcd.GetAsBool(), + config.EtcdUseSSL.GetAsBool(), + config.Endpoints.GetAsStrings(), + config.EtcdTLSCert.GetValue(), + config.EtcdTLSKey.GetValue(), + config.EtcdTLSCACert.GetValue(), + config.EtcdTLSMinVersion.GetValue()) + suite.Require().NoError(err) + suite.kv = etcdkv.NewEtcdKV(cli, config.MetaRootPath.GetValue()) + + // meta + store := querycoord.NewCatalog(suite.kv) + idAllocator := params.RandomIncrementIDAllocator() + suite.nodeMgr = session.NewNodeManager() + suite.meta = meta.NewMeta(idAllocator, store, suite.nodeMgr) + distManager := meta.NewDistributionManager() + suite.broker = meta.NewMockBroker(suite.T()) + + suite.targetMgr = meta.NewMockTargetManager(suite.T()) + suite.checker = NewStatsChecker(suite.meta, distManager, suite.broker, suite.nodeMgr, suite.targetMgr) + + suite.targetMgr.EXPECT().GetSealedSegment(mock.Anything, mock.Anything, mock.Anything, mock.Anything).RunAndReturn(func(ctx context.Context, cid, sid int64, i3 int32) *datapb.SegmentInfo { + return &datapb.SegmentInfo{ + ID: sid, + Level: datapb.SegmentLevel_L1, + } + }).Maybe() +} + +func (suite *StatsCheckerSuite) TearDownTest() { + suite.kv.Close() +} + +func (suite *StatsCheckerSuite) TestLoadJsonIndex() { + checker := suite.checker + ctx := context.Background() + + // meta + coll := utils.CreateTestCollection(1, 1) + coll.FieldIndexID = map[int64]int64{101: 1000} + checker.meta.CollectionManager.PutCollection(ctx, coll) + checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2})) + suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{ + NodeID: 1, + Address: "localhost", + Hostname: "localhost", + })) + suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{ + NodeID: 2, + Address: "localhost", + Hostname: "localhost", + })) + checker.meta.ResourceManager.HandleNodeUp(ctx, 1) + checker.meta.ResourceManager.HandleNodeUp(ctx, 2) + + // dist + checker.dist.SegmentDistManager.Update(1, utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel")) + + // broker + suite.broker.EXPECT().DescribeCollection(mock.Anything, int64(1)). 
+ Return(&milvuspb.DescribeCollectionResponse{ + Status: merr.Success(), + Schema: &schemapb.CollectionSchema{ + Name: "test_loadJsonIndex", + Fields: []*schemapb.FieldSchema{ + {FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"}, + }, + }, + CollectionID: 1, + CollectionName: "test_loadJsonIndex", + }, nil) + + tasks := checker.Check(context.Background()) + suite.Require().Len(tasks, 1) + + t := tasks[0] + suite.Require().Len(t.Actions(), 1) + + action, ok := t.Actions()[0].(*task.SegmentAction) + suite.Require().True(ok) + suite.EqualValues(200, t.ReplicaID()) + suite.Equal(task.ActionTypeStatsUpdate, action.Type()) + suite.EqualValues(2, action.GetSegmentID()) + + // test skip load json index for read only node + suite.nodeMgr.Stopping(1) + suite.nodeMgr.Stopping(2) + suite.meta.ResourceManager.HandleNodeStopping(ctx, 1) + suite.meta.ResourceManager.HandleNodeStopping(ctx, 2) + utils.RecoverAllCollection(suite.meta) + tasks = checker.Check(context.Background()) + suite.Require().Len(tasks, 0) +} + +func (suite *StatsCheckerSuite) TestJsonIndexNotMatch() { + checker := suite.checker + ctx := context.Background() + + // meta + coll := utils.CreateTestCollection(1, 1) + coll.FieldIndexID = map[int64]int64{101: 1000} + checker.meta.CollectionManager.PutCollection(ctx, coll) + checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2})) + suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{ + NodeID: 1, + Address: "localhost", + Hostname: "localhost", + })) + suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{ + NodeID: 2, + Address: "localhost", + Hostname: "localhost", + })) + checker.meta.ResourceManager.HandleNodeUp(ctx, 1) + checker.meta.ResourceManager.HandleNodeUp(ctx, 2) + + // dist + checker.dist.SegmentDistManager.Update(1, utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel")) + + // broker + suite.broker.EXPECT().DescribeCollection(mock.Anything, int64(1)). + Return(&milvuspb.DescribeCollectionResponse{ + Status: merr.Success(), + Schema: &schemapb.CollectionSchema{ + Name: "test_loadJsonIndex", + Fields: []*schemapb.FieldSchema{ + {FieldID: 101, DataType: schemapb.DataType_Int16, Name: "int"}, + }, + }, + CollectionID: 1, + CollectionName: "test_loadJsonIndex", + }, nil) + + tasks := checker.Check(context.Background()) + suite.Require().Len(tasks, 0) +} + +func (suite *StatsCheckerSuite) TestDescribeCollectionFailed() { + checker := suite.checker + ctx := context.Background() + + // meta + coll := utils.CreateTestCollection(1, 1) + coll.FieldIndexID = map[int64]int64{101: 1000} + checker.meta.CollectionManager.PutCollection(ctx, coll) + checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2})) + suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{ + NodeID: 1, + Address: "localhost", + Hostname: "localhost", + })) + suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{ + NodeID: 2, + Address: "localhost", + Hostname: "localhost", + })) + checker.meta.ResourceManager.HandleNodeUp(ctx, 1) + checker.meta.ResourceManager.HandleNodeUp(ctx, 2) + + // dist + checker.dist.SegmentDistManager.Update(1, utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel")) + + // broker + suite.broker.EXPECT().DescribeCollection(mock.Anything, int64(1)). 
+ Return(nil, errors.New("mocked error")) + + tasks := checker.Check(context.Background()) + suite.Require().Len(tasks, 0) +} + +func (suite *StatsCheckerSuite) TestCreateNewJsonIndex() { + checker := suite.checker + ctx := context.Background() + + // meta + coll := utils.CreateTestCollection(1, 1) + coll.FieldIndexID = map[int64]int64{101: 1000} + checker.meta.CollectionManager.PutCollection(ctx, coll) + checker.meta.ReplicaManager.Put(ctx, utils.CreateTestReplica(200, 1, []int64{1, 2})) + suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{ + NodeID: 1, + Address: "localhost", + Hostname: "localhost", + })) + suite.nodeMgr.Add(session.NewNodeInfo(session.ImmutableNodeInfo{ + NodeID: 2, + Address: "localhost", + Hostname: "localhost", + })) + checker.meta.ResourceManager.HandleNodeUp(ctx, 1) + checker.meta.ResourceManager.HandleNodeUp(ctx, 2) + + // dist + segment := utils.CreateTestSegment(1, 1, 2, 1, 1, "test-insert-channel") + segment.JSONIndexField = []int64{102} + checker.dist.SegmentDistManager.Update(1, segment) + + // broker + suite.broker.EXPECT().DescribeCollection(mock.Anything, int64(1)). + Return(&milvuspb.DescribeCollectionResponse{ + Status: merr.Success(), + Schema: &schemapb.CollectionSchema{ + Name: "test_loadJsonIndex", + Fields: []*schemapb.FieldSchema{ + {FieldID: 101, DataType: schemapb.DataType_JSON, Name: "JSON"}, + }, + }, + CollectionID: 1, + CollectionName: "test_loadJsonIndex", + }, nil) + + tasks := checker.Check(context.Background()) + suite.Len(tasks, 1) + suite.Len(tasks[0].Actions(), 1) + suite.Equal(tasks[0].Actions()[0].(*task.SegmentAction).Type(), task.ActionTypeStatsUpdate) +} + +func TestStatsChecker(t *testing.T) { + suite.Run(t, new(StatsCheckerSuite)) +} diff --git a/internal/querycoordv2/dist/dist_handler.go b/internal/querycoordv2/dist/dist_handler.go index 828cedc6e5ce3..caec69800bc75 100644 --- a/internal/querycoordv2/dist/dist_handler.go +++ b/internal/querycoordv2/dist/dist_handler.go @@ -160,6 +160,7 @@ func (dh *distHandler) updateSegmentsDistribution(ctx context.Context, resp *que Version: s.GetVersion(), LastDeltaTimestamp: s.GetLastDeltaTimestamp(), IndexInfo: s.GetIndexInfo(), + JSONIndexField: s.GetFieldJsonIndexStats(), }) } diff --git a/internal/querycoordv2/meta/segment_dist_manager.go b/internal/querycoordv2/meta/segment_dist_manager.go index 85519b7770360..e8de376db9299 100644 --- a/internal/querycoordv2/meta/segment_dist_manager.go +++ b/internal/querycoordv2/meta/segment_dist_manager.go @@ -125,6 +125,7 @@ type Segment struct { Version int64 // Version is the timestamp of loading segment LastDeltaTimestamp uint64 // The timestamp of the last delta record IndexInfo map[int64]*querypb.FieldIndexInfo // index info of loaded segment + JSONIndexField []int64 // json index info of loaded segment } func SegmentFromInfo(info *datapb.SegmentInfo) *Segment { diff --git a/internal/querycoordv2/ops_service_test.go b/internal/querycoordv2/ops_service_test.go index db56c8ded85d4..ce263ef10ed23 100644 --- a/internal/querycoordv2/ops_service_test.go +++ b/internal/querycoordv2/ops_service_test.go @@ -184,7 +184,7 @@ func (suite *OpsServiceSuite) TestActiveCheckers() { resp, err = suite.server.ListCheckers(ctx, &querypb.ListCheckersRequest{}) suite.NoError(err) suite.True(merr.Ok(resp.Status)) - suite.Len(resp.GetCheckerInfos(), 5) + suite.Len(resp.GetCheckerInfos(), 6) resp4, err := suite.server.DeactivateChecker(ctx, &querypb.DeactivateCheckerRequest{ CheckerID: int32(utils.ChannelChecker), diff --git 
a/internal/querycoordv2/task/action.go b/internal/querycoordv2/task/action.go index dfbc4c44ddf52..1ab2be6200487 100644 --- a/internal/querycoordv2/task/action.go +++ b/internal/querycoordv2/task/action.go @@ -34,12 +34,14 @@ const ( ActionTypeGrow ActionType = iota + 1 ActionTypeReduce ActionTypeUpdate + ActionTypeStatsUpdate ) var ActionTypeName = map[ActionType]string{ - ActionTypeGrow: "Grow", - ActionTypeReduce: "Reduce", - ActionTypeUpdate: "Update", + ActionTypeGrow: "Grow", + ActionTypeReduce: "Reduce", + ActionTypeUpdate: "Update", + ActionTypeStatsUpdate: "StatsUpdate", } func (t ActionType) String() string { @@ -154,7 +156,7 @@ func (action *SegmentAction) IsFinished(distMgr *meta.DistributionManager) bool return true } return action.rpcReturned.Load() - } else if action.Type() == ActionTypeUpdate { + } else if action.Type() == ActionTypeUpdate || action.Type() == ActionTypeStatsUpdate { return action.rpcReturned.Load() } diff --git a/internal/querycoordv2/task/executor.go b/internal/querycoordv2/task/executor.go index b66d26b0f722e..7ff26e8b6f99b 100644 --- a/internal/querycoordv2/task/executor.go +++ b/internal/querycoordv2/task/executor.go @@ -156,7 +156,7 @@ func (ex *Executor) removeTask(task Task, step int) { func (ex *Executor) executeSegmentAction(task *SegmentTask, step int) { switch task.Actions()[step].Type() { - case ActionTypeGrow, ActionTypeUpdate: + case ActionTypeGrow, ActionTypeUpdate, ActionTypeStatsUpdate: ex.loadSegment(task, step) case ActionTypeReduce: @@ -469,6 +469,9 @@ func (ex *Executor) executeLeaderAction(task *LeaderTask, step int) { case ActionTypeUpdate: ex.updatePartStatsVersions(task, step) + + case ActionTypeStatsUpdate: + ex.updatePartStatsVersions(task, step) } } diff --git a/internal/querycoordv2/task/scheduler.go b/internal/querycoordv2/task/scheduler.go index 316f1a552be71..99ba1a03e40f7 100644 --- a/internal/querycoordv2/task/scheduler.go +++ b/internal/querycoordv2/task/scheduler.go @@ -47,13 +47,15 @@ const ( TaskTypeReduce TaskTypeMove TaskTypeUpdate + TaskTypeStatsUpdate ) var TaskTypeName = map[Type]string{ - TaskTypeGrow: "Grow", - TaskTypeReduce: "Reduce", - TaskTypeMove: "Move", - TaskTypeUpdate: "Update", + TaskTypeGrow: "Grow", + TaskTypeReduce: "Reduce", + TaskTypeMove: "Move", + TaskTypeUpdate: "Update", + TaskTypeStatsUpdate: "StatsUpdate", } type Type int32 diff --git a/internal/querycoordv2/task/utils.go b/internal/querycoordv2/task/utils.go index c4f4df26e5332..400d945073cb3 100644 --- a/internal/querycoordv2/task/utils.go +++ b/internal/querycoordv2/task/utils.go @@ -95,6 +95,8 @@ func GetTaskType(task Task) Type { return TaskTypeReduce case task.Actions()[0].Type() == ActionTypeUpdate: return TaskTypeUpdate + case task.Actions()[0].Type() == ActionTypeStatsUpdate: + return TaskTypeStatsUpdate } return 0 } @@ -132,6 +134,10 @@ func packLoadSegmentRequest( loadScope = querypb.LoadScope_Index } + if action.Type() == ActionTypeStatsUpdate { + loadScope = querypb.LoadScope_Stats + } + if task.Source() == utils.LeaderChecker { loadScope = querypb.LoadScope_Delta } diff --git a/internal/querycoordv2/utils/checker.go b/internal/querycoordv2/utils/checker.go index 0234ff2e98d8a..b201837d26e6c 100644 --- a/internal/querycoordv2/utils/checker.go +++ b/internal/querycoordv2/utils/checker.go @@ -28,6 +28,7 @@ const ( IndexCheckerName = "index_checker" LeaderCheckerName = "leader_checker" ManualBalanceName = "manual_balance" + StatsCheckerName = "stats_checker" ) type CheckerType int32 @@ -39,6 +40,7 @@ const ( IndexChecker 
LeaderChecker
 	ManualBalance
+	StatsChecker
 )
 
 var checkerNames = map[CheckerType]string{
@@ -48,6 +50,7 @@ var checkerNames = map[CheckerType]string{
 	IndexChecker:   IndexCheckerName,
 	LeaderChecker:  LeaderCheckerName,
 	ManualBalance:  ManualBalanceName,
+	StatsChecker:   StatsCheckerName,
 }
 
 func (s CheckerType) String() string {
diff --git a/internal/querynodev2/handlers.go b/internal/querynodev2/handlers.go
index 9c3cc113de236..24d21a8063777 100644
--- a/internal/querynodev2/handlers.go
+++ b/internal/querynodev2/handlers.go
@@ -186,6 +186,45 @@ func (node *QueryNode) loadIndex(ctx context.Context, req *querypb.LoadSegmentsR
 	return status
 }
 
+func (node *QueryNode) loadStats(ctx context.Context, req *querypb.LoadSegmentsRequest) *commonpb.Status {
+	log := log.Ctx(ctx).With(
+		zap.Int64("collectionID", req.GetCollectionID()),
+		zap.Int64s("segmentIDs", lo.Map(req.GetInfos(), func(info *querypb.SegmentLoadInfo, _ int) int64 { return info.GetSegmentID() })),
+	)
+
+	status := merr.Success()
+	log.Info("start to load stats")
+
+	for _, info := range req.GetInfos() {
+		log := log.With(zap.Int64("segmentID", info.GetSegmentID()))
+		segment := node.manager.Segment.GetSealed(info.GetSegmentID())
+		if segment == nil {
+			log.Warn("segment not found for load stats operation")
+			continue
+		}
+		localSegment, ok := segment.(*segments.LocalSegment)
+		if !ok {
+			log.Warn("segment not local for load stats operation")
+			continue
+		}
+
+		if localSegment.IsLazyLoad() {
+			localSegment.SetLoadInfo(info)
+			localSegment.SetNeedUpdatedVersion(req.GetVersion())
+			node.manager.DiskCache.MarkItemNeedReload(ctx, localSegment.ID())
+			return nil
+		}
+		err := node.loader.LoadJSONIndex(ctx, localSegment, info)
+		if err != nil {
+			log.Warn("failed to load stats", zap.Error(err))
+			status = merr.Status(err)
+			break
+		}
+	}
+
+	return status
+}
+
 func (node *QueryNode) queryChannel(ctx context.Context, req *querypb.QueryRequest, channel string) (*internalpb.RetrieveResults, error) {
 	msgID := req.Req.Base.GetMsgID()
 	traceID := trace.SpanFromContext(ctx).SpanContext().TraceID()
diff --git a/internal/querynodev2/segments/mock_loader.go b/internal/querynodev2/segments/mock_loader.go
index d50d52b9c077a..34e7b9a5768db 100644
--- a/internal/querynodev2/segments/mock_loader.go
+++ b/internal/querynodev2/segments/mock_loader.go
@@ -355,6 +355,54 @@ func (_c *MockLoader_LoadIndex_Call) RunAndReturn(run func(context.Context, Segm
 	return _c
 }
 
+// LoadJSONIndex provides a mock function with given fields: ctx, segment, info
+func (_m *MockLoader) LoadJSONIndex(ctx context.Context, segment Segment, info *querypb.SegmentLoadInfo) error {
+	ret := _m.Called(ctx, segment, info)
+
+	if len(ret) == 0 {
+		panic("no return value specified for LoadJSONIndex")
+	}
+
+	var r0 error
+	if rf, ok := ret.Get(0).(func(context.Context, Segment, *querypb.SegmentLoadInfo) error); ok {
+		r0 = rf(ctx, segment, info)
+	} else {
+		r0 = ret.Error(0)
+	}
+
+	return r0
+}
+
+// MockLoader_LoadJSONIndex_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'LoadJSONIndex'
+type MockLoader_LoadJSONIndex_Call struct {
+	*mock.Call
+}
+
+// LoadJSONIndex is a helper method to define mock.On call
+//   - ctx context.Context
+//   - segment Segment
+//   - info *querypb.SegmentLoadInfo
+func (_e *MockLoader_Expecter) LoadJSONIndex(ctx interface{}, segment interface{}, info interface{}) *MockLoader_LoadJSONIndex_Call {
+	return &MockLoader_LoadJSONIndex_Call{Call: _e.mock.On("LoadJSONIndex", ctx, segment, info)}
+}
+
+func (_c 
*MockLoader_LoadJSONIndex_Call) Run(run func(ctx context.Context, segment Segment, info *querypb.SegmentLoadInfo)) *MockLoader_LoadJSONIndex_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(context.Context), args[1].(Segment), args[2].(*querypb.SegmentLoadInfo)) + }) + return _c +} + +func (_c *MockLoader_LoadJSONIndex_Call) Return(_a0 error) *MockLoader_LoadJSONIndex_Call { + _c.Call.Return(_a0) + return _c +} + +func (_c *MockLoader_LoadJSONIndex_Call) RunAndReturn(run func(context.Context, Segment, *querypb.SegmentLoadInfo) error) *MockLoader_LoadJSONIndex_Call { + _c.Call.Return(run) + return _c +} + // LoadLazySegment provides a mock function with given fields: ctx, segment, loadInfo func (_m *MockLoader) LoadLazySegment(ctx context.Context, segment Segment, loadInfo *querypb.SegmentLoadInfo) error { ret := _m.Called(ctx, segment, loadInfo) diff --git a/internal/querynodev2/segments/mock_segment.go b/internal/querynodev2/segments/mock_segment.go index 1af3012ed3038..9cb06d475e937 100644 --- a/internal/querynodev2/segments/mock_segment.go +++ b/internal/querynodev2/segments/mock_segment.go @@ -363,6 +363,53 @@ func (_c *MockSegment_GetBM25Stats_Call) RunAndReturn(run func() map[int64]*stor return _c } +// GetFieldJSONIndexStats provides a mock function with given fields: +func (_m *MockSegment) GetFieldJSONIndexStats() []int64 { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetFieldJSONIndexStats") + } + + var r0 []int64 + if rf, ok := ret.Get(0).(func() []int64); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]int64) + } + } + + return r0 +} + +// MockSegment_GetFieldJSONIndexStats_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'GetFieldJSONIndexStats' +type MockSegment_GetFieldJSONIndexStats_Call struct { + *mock.Call +} + +// GetFieldJSONIndexStats is a helper method to define mock.On call +func (_e *MockSegment_Expecter) GetFieldJSONIndexStats() *MockSegment_GetFieldJSONIndexStats_Call { + return &MockSegment_GetFieldJSONIndexStats_Call{Call: _e.mock.On("GetFieldJSONIndexStats")} +} + +func (_c *MockSegment_GetFieldJSONIndexStats_Call) Run(run func()) *MockSegment_GetFieldJSONIndexStats_Call { + _c.Call.Run(func(args mock.Arguments) { + run() + }) + return _c +} + +func (_c *MockSegment_GetFieldJSONIndexStats_Call) Return(_a0 []int64) *MockSegment_GetFieldJSONIndexStats_Call { + _c.Call.Return(_a0) + return _c +} + +func (_c *MockSegment_GetFieldJSONIndexStats_Call) RunAndReturn(run func() []int64) *MockSegment_GetFieldJSONIndexStats_Call { + _c.Call.Return(run) + return _c +} + // GetIndex provides a mock function with given fields: fieldID func (_m *MockSegment) GetIndex(fieldID int64) *IndexedFieldInfo { ret := _m.Called(fieldID) diff --git a/internal/querynodev2/segments/segment.go b/internal/querynodev2/segments/segment.go index 848c682d5b763..51ae77742cbf1 100644 --- a/internal/querynodev2/segments/segment.go +++ b/internal/querynodev2/segments/segment.go @@ -281,6 +281,7 @@ type LocalSegment struct { fields *typeutil.ConcurrentMap[int64, *FieldInfo] fieldIndexes *typeutil.ConcurrentMap[int64, *IndexedFieldInfo] warmupDispatcher *AsyncWarmupDispatcher + fieldJSONStats []int64 } func NewSegment(ctx context.Context, @@ -1094,9 +1095,19 @@ func (s *LocalSegment) LoadTextIndex(ctx context.Context, textLogs *datapb.TextI return HandleCStatus(ctx, &status, "LoadTextIndex failed") } -func (s *LocalSegment) LoadJsonKeyIndex(ctx context.Context, jsonKeyStats 
*datapb.JsonKeyStats, schemaHelper *typeutil.SchemaHelper) error { +func (s *LocalSegment) LoadJSONKeyIndex(ctx context.Context, jsonKeyStats *datapb.JsonKeyStats, schemaHelper *typeutil.SchemaHelper) error { log.Ctx(ctx).Info("load json key index", zap.Int64("field id", jsonKeyStats.GetFieldID()), zap.Any("json key logs", jsonKeyStats)) - + exists := false + for _, field := range s.fieldJSONStats { + if field == jsonKeyStats.GetFieldID() { + exists = true + break + } + } + if exists { + log.Warn("JsonKeyIndexStats already loaded") + return nil + } f, err := schemaHelper.GetFieldFromID(jsonKeyStats.GetFieldID()) if err != nil { return err @@ -1123,9 +1134,8 @@ func (s *LocalSegment) LoadJsonKeyIndex(ctx context.Context, jsonKeyStats *datap status = C.LoadJsonKeyIndex(traceCtx.ctx, s.ptr, (*C.uint8_t)(unsafe.Pointer(&marshaled[0])), (C.uint64_t)(len(marshaled))) return nil, nil }).Await() - + s.fieldJSONStats = append(s.fieldJSONStats, jsonKeyStats.GetFieldID()) return HandleCStatus(ctx, &status, "Load JsonKeyStats failed") - } func (s *LocalSegment) UpdateIndexInfo(ctx context.Context, indexInfo *querypb.FieldIndexInfo, info *LoadIndexInfo) error { @@ -1438,3 +1448,7 @@ func (d *AsyncWarmupDispatcher) Run(ctx context.Context) { } } } + +func (s *LocalSegment) GetFieldJSONIndexStats() []int64 { + return s.fieldJSONStats +} diff --git a/internal/querynodev2/segments/segment_interface.go b/internal/querynodev2/segments/segment_interface.go index 400886ccd5edf..950fe4034e559 100644 --- a/internal/querynodev2/segments/segment_interface.go +++ b/internal/querynodev2/segments/segment_interface.go @@ -102,4 +102,6 @@ type Segment interface { // lazy load related NeedUpdatedVersion() int64 RemoveUnusedFieldFiles() error + + GetFieldJSONIndexStats() []int64 } diff --git a/internal/querynodev2/segments/segment_l0.go b/internal/querynodev2/segments/segment_l0.go index cab1f64b7645a..f7b16af500b86 100644 --- a/internal/querynodev2/segments/segment_l0.go +++ b/internal/querynodev2/segments/segment_l0.go @@ -188,3 +188,7 @@ func (s *L0Segment) Release(ctx context.Context, opts ...releaseOption) { func (s *L0Segment) RemoveUnusedFieldFiles() error { panic("not implemented") } + +func (s *L0Segment) GetFieldJSONIndexStats() []int64 { + return nil +} diff --git a/internal/querynodev2/segments/segment_loader.go b/internal/querynodev2/segments/segment_loader.go index 2d7163fecff9b..cc9b52d2919ee 100644 --- a/internal/querynodev2/segments/segment_loader.go +++ b/internal/querynodev2/segments/segment_loader.go @@ -93,6 +93,10 @@ type Loader interface { segment Segment, loadInfo *querypb.SegmentLoadInfo, ) error + + LoadJSONIndex(ctx context.Context, + segment Segment, + info *querypb.SegmentLoadInfo) error } type ResourceEstimate struct { @@ -828,11 +832,11 @@ func (loader *segmentLoader) loadSealedSegment(ctx context.Context, loadInfo *qu } for _, info := range jsonKeyStats { - if err := segment.LoadJsonKeyIndex(ctx, info, schemaHelper); err != nil { + if err := segment.LoadJSONKeyIndex(ctx, info, schemaHelper); err != nil { return err } } - loadJsonKeyIndexesSpan := tr.RecordSpan() + loadJSONKeyIndexesSpan := tr.RecordSpan() // 4. 
rectify entries number for binlog in very rare cases // https://github.com/milvus-io/milvus/23654 @@ -847,7 +851,7 @@ func (loader *segmentLoader) loadSealedSegment(ctx context.Context, loadInfo *qu zap.Duration("loadRawDataSpan", loadRawDataSpan), zap.Duration("patchEntryNumberSpan", patchEntryNumberSpan), zap.Duration("loadTextIndexesSpan", loadTextIndexesSpan), - zap.Duration("loadJsonKeyIndexSpan", loadJsonKeyIndexesSpan), + zap.Duration("loadJsonKeyIndexSpan", loadJSONKeyIndexesSpan), ) return nil } @@ -1691,6 +1695,35 @@ func (loader *segmentLoader) LoadIndex(ctx context.Context, return loader.waitSegmentLoadDone(ctx, commonpb.SegmentState_SegmentStateNone, []int64{loadInfo.GetSegmentID()}, version) } +func (loader *segmentLoader) LoadJSONIndex(ctx context.Context, + seg Segment, + loadInfo *querypb.SegmentLoadInfo, +) error { + segment, ok := seg.(*LocalSegment) + if !ok { + return merr.WrapErrParameterInvalid("LocalSegment", fmt.Sprintf("%T", seg)) + } + + collection := segment.GetCollection() + schemaHelper, _ := typeutil.CreateSchemaHelper(collection.Schema()) + + jsonKeyIndexInfo := make(map[int64]*datapb.JsonKeyStats, len(loadInfo.GetJsonKeyStatsLogs())) + for _, fieldStatsLog := range loadInfo.GetJsonKeyStatsLogs() { + jsonKeyLog, ok := jsonKeyIndexInfo[fieldStatsLog.FieldID] + if !ok { + jsonKeyIndexInfo[fieldStatsLog.FieldID] = fieldStatsLog + } else if fieldStatsLog.GetVersion() > jsonKeyLog.GetVersion() { + jsonKeyIndexInfo[fieldStatsLog.FieldID] = fieldStatsLog + } + } + for _, info := range jsonKeyIndexInfo { + if err := segment.LoadJSONKeyIndex(ctx, info, schemaHelper); err != nil { + return err + } + } + return nil +} + func getBinlogDataDiskSize(fieldBinlog *datapb.FieldBinlog) int64 { fieldSize := int64(0) for _, binlog := range fieldBinlog.Binlogs { diff --git a/internal/querynodev2/services.go b/internal/querynodev2/services.go index 446f5fded4fa0..ec19c64e92edd 100644 --- a/internal/querynodev2/services.go +++ b/internal/querynodev2/services.go @@ -484,6 +484,9 @@ func (node *QueryNode) LoadSegments(ctx context.Context, req *querypb.LoadSegmen if req.GetLoadScope() == querypb.LoadScope_Index { return node.loadIndex(ctx, req), nil } + if req.GetLoadScope() == querypb.LoadScope_Stats { + return node.loadStats(ctx, req), nil + } // Actual load segment log.Info("start to load segments...") @@ -1160,6 +1163,7 @@ func (node *QueryNode) GetDataDistribution(ctx context.Context, req *querypb.Get sealedSegments := node.manager.Segment.GetBy(segments.WithType(commonpb.SegmentState_Sealed)) segmentVersionInfos := make([]*querypb.SegmentVersionInfo, 0, len(sealedSegments)) for _, s := range sealedSegments { + log.Info("GetDataDistribution", zap.Any("JsonKeyStatsLogs", s.LoadInfo().GetJsonKeyStatsLogs()), zap.Any("Indexes", s.Indexes())) segmentVersionInfos = append(segmentVersionInfos, &querypb.SegmentVersionInfo{ ID: s.ID(), Collection: s.Collection(), @@ -1172,6 +1176,7 @@ func (node *QueryNode) GetDataDistribution(ctx context.Context, req *querypb.Get IndexInfo: lo.SliceToMap(s.Indexes(), func(info *segments.IndexedFieldInfo) (int64, *querypb.FieldIndexInfo) { return info.IndexInfo.FieldID, info.IndexInfo }), + FieldJsonIndexStats: s.GetFieldJSONIndexStats(), }) } diff --git a/internal/util/indexcgowrapper/index.go b/internal/util/indexcgowrapper/index.go index 255fc99d43e4a..a813be05a5cf3 100644 --- a/internal/util/indexcgowrapper/index.go +++ b/internal/util/indexcgowrapper/index.go @@ -163,7 +163,7 @@ func CreateTextIndex(ctx context.Context, buildIndexInfo 
*indexcgopb.BuildIndexI return res, nil } -func CreateJsonKeyIndex(ctx context.Context, buildIndexInfo *indexcgopb.BuildIndexInfo) (map[string]int64, error) { +func CreateJSONKeyIndex(ctx context.Context, buildIndexInfo *indexcgopb.BuildIndexInfo) (map[string]int64, error) { buildIndexInfoBlob, err := proto.Marshal(buildIndexInfo) if err != nil { log.Ctx(ctx).Warn("marshal buildIndexInfo failed", diff --git a/pkg/util/typeutil/field_schema.go b/pkg/util/typeutil/field_schema.go index 460b9adf25efc..4b14fd40078de 100644 --- a/pkg/util/typeutil/field_schema.go +++ b/pkg/util/typeutil/field_schema.go @@ -53,11 +53,8 @@ func (h *FieldSchemaHelper) EnableMatch() bool { return err == nil && enable } -func (h *FieldSchemaHelper) EnableJsonKeyIndex() bool { - if IsJSONType(h.schema.GetDataType()) { - return true - } - return false +func (h *FieldSchemaHelper) EnableJSONKeyIndex() bool { + return IsJSONType(h.schema.GetDataType()) } func (h *FieldSchemaHelper) EnableAnalyzer() bool { From c71e7fbe2f52f4ef9790b12c90f296b2bde203b3 Mon Sep 17 00:00:00 2001 From: "Xianhui.Lin" Date: Tue, 7 Jan 2025 17:12:17 +0800 Subject: [PATCH 4/5] fix codereview Signed-off-by: Xianhui.Lin --- internal/core/src/exec/expression/TermExpr.cpp | 5 ----- internal/core/src/index/JsonKeyInvertedIndex.cpp | 8 ++------ internal/indexnode/task_stats.go | 1 - 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/internal/core/src/exec/expression/TermExpr.cpp b/internal/core/src/exec/expression/TermExpr.cpp index e5419114e8626..3921b0ed257de 100644 --- a/internal/core/src/exec/expression/TermExpr.cpp +++ b/internal/core/src/exec/expression/TermExpr.cpp @@ -93,11 +93,6 @@ PhyTermFilterExpr::Eval(EvalCtx& context, VectorPtr& result) { default: PanicInfo(DataTypeInvalid, "unknown data type: {}", type); } - std::cout << "optimize cost:" - << std::chrono::duration_cast( - std::chrono::steady_clock::now() - start) - .count() - << std::endl; break; } case DataType::ARRAY: { diff --git a/internal/core/src/index/JsonKeyInvertedIndex.cpp b/internal/core/src/index/JsonKeyInvertedIndex.cpp index f549bf271624e..395753ae817d1 100644 --- a/internal/core/src/index/JsonKeyInvertedIndex.cpp +++ b/internal/core/src/index/JsonKeyInvertedIndex.cpp @@ -92,7 +92,7 @@ JsonKeyInvertedIndex::AddJson(const char* json, int64_t offset) { jsmn_parser parser; jsmntok_t* tokens = (jsmntok_t*)malloc(16 * sizeof(jsmntok_t)); if (!tokens) { - fprintf(stderr, "Memory allocation failed\n"); + PanicInfo(ErrorCode::UnexpectedError, "alloc jsmn token failed"); return; } int num_tokens = 0; @@ -213,7 +213,6 @@ JsonKeyInvertedIndex::BuildWithFieldData( for (const auto& data : field_datas) { total += data->get_null_count(); } - null_offset.reserve(total); } int64_t offset = 0; if (schema_.nullable()) { @@ -221,10 +220,7 @@ JsonKeyInvertedIndex::BuildWithFieldData( auto n = data->get_num_rows(); for (int i = 0; i < n; i++) { if (!data->is_valid(i)) { - null_offset.push_back(i); - std::string empty = ""; - wrapper_->add_multi_data(&empty, 0, offset++); - return; + continue; } AddJson(static_cast(data->RawValue(i)) ->data() diff --git a/internal/indexnode/task_stats.go b/internal/indexnode/task_stats.go index 40f2dc93a1d4e..f153360dde62f 100644 --- a/internal/indexnode/task_stats.go +++ b/internal/indexnode/task_stats.go @@ -777,7 +777,6 @@ func (st *statsTask) createJSONKeyIndex(ctx context.Context, continue } log.Info("field enable json key index, ready to create json key index", zap.Int64("field id", field.GetFieldID())) - // create text index and upload the 
text index files.
 		files, err := getInsertFiles(field.GetFieldID())
 		if err != nil {
 			return err

From ac45a92b10a4b8838a7100cbbdf46593e363d1c4 Mon Sep 17 00:00:00 2001
From: "Xianhui.Lin" 
Date: Wed, 8 Jan 2025 12:59:31 +0800
Subject: [PATCH 5/5] fix jsonindex parse string to int

Signed-off-by: Xianhui.Lin 

fix unittest

Signed-off-by: Xianhui.Lin 

improve

Signed-off-by: Xianhui.Lin 

fix

Signed-off-by: Xianhui.Lin 

fix

Signed-off-by: Xianhui.Lin 

fix go test error

Signed-off-by: Xianhui.Lin 

improve

Signed-off-by: Xianhui.Lin 

fix

Signed-off-by: Xianhui.Lin 
---
 internal/core/src/common/Json.h                  | 4 ----
 internal/core/src/index/JsonKeyInvertedIndex.cpp | 4 ++--
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/internal/core/src/common/Json.h b/internal/core/src/common/Json.h
index 992d45646b64f..b2a403c569691 100644
--- a/internal/core/src/common/Json.h
+++ b/internal/core/src/common/Json.h
@@ -241,10 +241,6 @@ class Json {
     template <typename T>
     value_result<T>
     at(uint16_t offset, uint16_t length) const {
-        if constexpr (std::is_same_v<T, std::string> ||
-                      std::is_same_v<T, std::string_view>) {
-            return value_result<T>(T(data_.data() + offset, length));
-        }
         return doc(offset, length).get<T>();
     }
 
diff --git a/internal/core/src/index/JsonKeyInvertedIndex.cpp b/internal/core/src/index/JsonKeyInvertedIndex.cpp
index 395753ae817d1..3d18f4478123b 100644
--- a/internal/core/src/index/JsonKeyInvertedIndex.cpp
+++ b/internal/core/src/index/JsonKeyInvertedIndex.cpp
@@ -26,7 +26,7 @@ JsonKeyInvertedIndex::AddInvertedRecord(const std::vector<std::string>& paths,
                                         uint16_t offset,
                                         uint16_t length) {
     auto key = std::string("/") + Join(paths, "/");
-    LOG_DEBUG(
+    LOG_INFO(
         "insert inverted key: {}, row_id: {}, offset: "
         "{}, length:{}",
         key,
@@ -82,7 +82,7 @@ JsonKeyInvertedIndex::TravelJson(const char* json,
     } else if (current.type == JSMN_STRING) {
         Assert(current.size == 0);
         AddInvertedRecord(
-            path, offset, current.start, current.end - current.start);
+            path, offset, current.start - 1, current.end - current.start + 2);
         index++;
     }
 }
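
A note on the final hunk above: jsmn reports a string token's start/end strictly inside its quotes, so the verbatim span is not a self-contained JSON value, and a stored "123" could re-parse as the integer 123 (the bug named in the commit subject). Widening the span by one byte on each side keeps the quotes, so the recorded slice re-parses as a string through doc(offset, length); that is also why the raw string_view shortcut in at() is dropped in the same commit. Below is a minimal standalone sketch of the offset arithmetic; the literal and the indices are illustrative, not taken from the patch.

#include <cassert>
#include <string>

int main() {
    const std::string json = R"({"name":"milvus"})";
    // A jsmn string token for the value carries start=9, end=15:
    // the bytes strictly between the opening and closing quote.
    const int start = 9, end = 15;

    const std::string bare = json.substr(start, end - start);
    const std::string widened = json.substr(start - 1, end - start + 2);

    assert(bare == "milvus");         // not a valid JSON document on its own
    assert(widened == "\"milvus\"");  // a self-contained JSON string value
    return 0;
}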
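
Context for the PATCH 4/5 hunk that replaces fprintf with PanicInfo in AddJson: the diff shows only the initial 16-token allocation and its failure path, not the parse loop around it. That loop is presumably the standard jsmn grow-on-JSMN_ERROR_NOMEM idiom; the sketch below reconstructs that idiom under this assumption, with a hypothetical function name, include path, and return-code handling rather than the patch's actual code.

#include <cstddef>
#include <cstdlib>

#include "jsmn.h"

// When the token array is too small, jsmn_parse returns JSMN_ERROR_NOMEM
// and may be called again with a larger array, resuming from the saved
// parser state instead of restarting the scan.
static int
ParseAllTokens(const char* json, size_t len, jsmntok_t** out_tokens) {
    jsmn_parser parser;
    jsmn_init(&parser);

    unsigned int cap = 16;  // mirrors the malloc(16 * sizeof(jsmntok_t)) in AddJson
    jsmntok_t* tokens = (jsmntok_t*)malloc(cap * sizeof(jsmntok_t));
    if (tokens == nullptr) {
        return -1;  // AddJson panics here via PanicInfo instead
    }

    int r;
    while ((r = jsmn_parse(&parser, json, len, tokens, cap)) ==
           JSMN_ERROR_NOMEM) {
        cap *= 2;
        jsmntok_t* grown = (jsmntok_t*)realloc(tokens, cap * sizeof(jsmntok_t));
        if (grown == nullptr) {
            free(tokens);
            return -1;
        }
        tokens = grown;
    }
    if (r < 0) {  // JSMN_ERROR_INVAL / JSMN_ERROR_PART: malformed input
        free(tokens);
        return r;
    }
    *out_tokens = tokens;  // caller frees; r is the number of tokens parsed
    return r;
}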