Blame: internal/cpp/rag_analyzer.cpp - infiniflow/ragflow

infiniflow / ragflow UNCLAIMED

RAGFlow is a leading open-source Retrieval-Augmented Generation (RAG) engine that fuses cutting-edge RAG with Agent capabilities to create a superior context layer for LLMs

76430 0 0 Python

Normal View History Raw

-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
 								//
 								// Licensed under the Apache License, Version 2.0 (the "License");
 								// you may not use this file except in compliance with the License.
 								// You may obtain a copy of the License at
 								//
 								//     https://www.apache.org/licenses/LICENSE-2.0
 								//
 								// Unless required by applicable law or agreed to in writing, software
 								// distributed under the License is distributed on an "AS IS" BASIS,
 								// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								// See the License for the specific language governing permissions and
 								// limitations under the License.
 								#define PCRE2_CODE_UNIT_WIDTH 8
 								#include "opencc/openccxx.h"
 								#include "pcre2.h"
 								#include "string_utils.h"
 								#include "rag_analyzer.h"
 								#include "re2/re2.h"
 								#include <cassert>
 								#include <cstdint>
 								#include <filesystem>
 								#include <iostream>
 								#include <cmath>
 								#include <fstream>
 								// import :term;
 								// import :stemmer;
 								// import :analyzer;
 								// import :darts_trie;
 								// import :wordnet_lemmatizer;
 								// import :stemmer;
 								// import :term;
 								//
 								// import std.compat;
 								// Short alias for the standard filesystem namespace used throughout this file.
 								namespace fs = std::filesystem;
 								// Dictionary file; RAGAnalyzer::Load() resolves it against the analyzer root directory.
 								static const std::string DICT_PATH = "rag/huqie.txt";
 								// Part-of-speech definition file; RAGAnalyzer::Load() resolves it against the analyzer root directory.
 								static const std::string POS_DEF_PATH = "rag/pos-id.def";
 								// Presumably the serialized trie built from the dictionary, also under the root - TODO confirm against Load().
 								static const std::string TRIE_PATH = "rag/huqie.trie";
 								// Presumably the WordNet data directory under the analyzer root - TODO confirm.
 								static const std::string WORDNET_PATH = "wordnet";
 								// Presumably the OpenCC configuration directory under the analyzer root - TODO confirm.
 								static const std::string OPENCC_PATH = "opencc";
 								// Delimiter pattern with a single capture group and three alternatives: a run of
 								// ASCII/CJK punctuation and spaces, a run of Latin letters with '.'/'-', or a run of
 								// digits with ','/'.'/'-'.
 								// NOTE(review): the "$$" inside the first character class is redundant ('$' is already
 								// listed) while '(' and ')' are absent; this may be a mangled "\(\)" - confirm upstream.
 								static const std::string REGEX_SPLIT_CHAR =
 								    R"#(([ ,\.<>/?;'\[\]\`!@#$%^&*$$\{\}\|_+=《》，。？、；‘’：“”【】~！￥%……（）——-]+|[a-zA-Z\.-]+|[0-9,\.-]+))#";
 								// Token pattern compiled by RegexTokenizer with PCRE2. It contains no capturing groups,
 								// only non-capturing groups and lookaheads, so each match yields exactly one token.
 								static const std::string NLTK_TOKENIZE_PATTERN =
 								    R"((?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)|(?=[^\(\"\`{\[:;&\#\*@\)}\]\-,])\S+?(?=\s|$|(?:[)\";}\]\*:@\'\({\[\?!])|(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)|,(?=$|\s|(?:[)\";}\]\*:@\'\({\[\?!])|(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)))|\S)";
 								// Upper bound on sentence length; the unit and the consumer are outside this excerpt.
 								static constexpr std::size_t MAX_SENTENCE_LEN = 100;
 								// Packs a signed frequency and a POS index into a single 32-bit value.
 								//
 								// Layout of the result:
 								//   bits  0-22  magnitude of freq (truncated to 23 bits)
 								//   bit   23    sign flag, set when freq is negative
 								//   bits 24-31  idx (only its low 8 bits survive the shift)
 								//
 								// The magnitude is masked in both branches so an out-of-range frequency can never
 								// spill into the sign flag or the POS-index bits.
 								static inline int32_t Encode(int32_t freq, int32_t idx) {
 								    constexpr uint32_t kMagnitudeMask = 0x7FFFFF;
 								    constexpr uint32_t kSignFlag = 1U << 23;
 								    uint32_t encoded_value = 0;
 								    if (freq < 0) {
 								        // Negate in unsigned arithmetic: well-defined even for INT32_MIN, unlike -freq.
 								        const uint32_t magnitude = 0U - static_cast<uint32_t>(freq);
 								        encoded_value = (magnitude & kMagnitudeMask) | kSignFlag;
 								    } else {
 								        encoded_value = static_cast<uint32_t>(freq) & kMagnitudeMask;
 								    }
 								    encoded_value |= static_cast<uint32_t>(idx) << 24;
 								    return static_cast<int32_t>(encoded_value);
 								}
 								// Recovers the signed frequency from a packed value: the low 23 bits hold the
 								// magnitude and bit 23 marks a negative frequency. Bits 24-31 are ignored.
 								static inline int32_t DecodeFreq(int32_t value) {
 								    const uint32_t low24 = static_cast<uint32_t>(value) & 0xFFFFFF;
 								    const bool is_negative = (low24 & (1U << 23)) != 0;
 								    const int32_t magnitude = static_cast<int32_t>(low24 & 0x7FFFFF);
 								    return is_negative ? -magnitude : magnitude;
 								}
 								// Extracts the POS index, which occupies the top byte (bits 24-31) of a packed value.
 								static inline int32_t DecodePOSIndex(int32_t value) {
 								    const uint32_t bits = static_cast<uint32_t>(value);
 								    const uint32_t pos_index = bits >> 24; // logical shift: no sign extension
 								    return static_cast<int32_t>(pos_index);
 								}
 								// Splits `input` on every match of `split_pattern` and appends the pieces to `result`.
 								//
 								// split_pattern  RE2 pattern; it must expose exactly one capture group covering the
 								//                delimiter, because one sub-match is requested from FindAndConsume.
 								// result         receives the non-empty text between delimiters, in order.
 								// keep_delim     when true, each matched delimiter is appended as its own element.
 								//
 								// The whole string is scanned by length, so embedded NUL bytes do not truncate it.
 								void Split(const std::string &input, const std::string &split_pattern, std::vector<std::string> &result, bool keep_delim = false) {
 								    re2::RE2 pattern(split_pattern);
 								    re2::StringPiece leftover(input.data(), input.size());
 								    re2::StringPiece last_end = leftover;
 								    re2::StringPiece extracted_delim_token;
 								    std::size_t remaining = leftover.size();
 								    while (RE2::FindAndConsume(&leftover, pattern, &extracted_delim_token)) {
 								        if (leftover.size() == remaining) {
 								            // A zero-width match consumed nothing; stop instead of looping forever.
 								            break;
 								        }
 								        std::string_view token(last_end.data(), extracted_delim_token.data() - last_end.data());
 								        if (!token.empty()) {
 								            result.emplace_back(token.data(), token.size());
 								        }
 								        if (keep_delim) {
 								            result.emplace_back(extracted_delim_token.data(), extracted_delim_token.size());
 								        }
 								        last_end = leftover;
 								        remaining = leftover.size();
 								    }
 								    // Whatever follows the last delimiter is the final piece.
 								    if (!leftover.empty()) {
 								        result.emplace_back(leftover.data(), leftover.size());
 								    }
 								}
 								// Splits `input` on every match of an already-compiled `pattern` and appends the
 								// pieces to `result`.
 								//
 								// pattern     compiled RE2; it must expose exactly one capture group covering the
 								//             delimiter, because one sub-match is requested from FindAndConsume.
 								// result      receives the non-empty text between delimiters, in order.
 								// keep_delim  when true, each matched delimiter is appended as its own element.
 								//
 								// The whole string is scanned by length, so embedded NUL bytes do not truncate it.
 								void Split(const std::string &input, const re2::RE2 &pattern, std::vector<std::string> &result, bool keep_delim = false) {
 								    re2::StringPiece leftover(input.data(), input.size());
 								    re2::StringPiece last_end = leftover;
 								    re2::StringPiece extracted_delim_token;
 								    std::size_t remaining = leftover.size();
 								    while (RE2::FindAndConsume(&leftover, pattern, &extracted_delim_token)) {
 								        if (leftover.size() == remaining) {
 								            // A zero-width match consumed nothing; stop instead of looping forever.
 								            break;
 								        }
 								        std::string_view token(last_end.data(), extracted_delim_token.data() - last_end.data());
 								        if (!token.empty()) {
 								            result.emplace_back(token.data(), token.size());
 								        }
 								        if (keep_delim) {
 								            result.emplace_back(extracted_delim_token.data(), extracted_delim_token.size());
 								        }
 								        last_end = leftover;
 								        remaining = leftover.size();
 								    }
 								    // Whatever follows the last delimiter is the final piece.
 								    if (!leftover.empty()) {
 								        result.emplace_back(leftover.data(), leftover.size());
 								    }
 								}
 								// Returns a copy of `input` in which every match of `re` has been rewritten with
 								// `replacement` (RE2 rewrite syntax). `input` itself is left untouched.
 								std::string Replace(const re2::RE2 &re, const std::string &replacement, const std::string &input) {
 								    std::string rewritten(input);
 								    re2::RE2::GlobalReplace(&rewritten, re, replacement);
 								    return rewritten;
 								}
 								// Streams tokens[start, end) into one string, placing `delim` between consecutive
 								// elements. An empty range (start >= end) produces an empty string. T must be
 								// streamable to std::ostream.
 								template <typename T>
 								std::string Join(const std::vector<T> &tokens, int start, int end, const std::string &delim = " ") {
 								    std::ostringstream joined;
 								    int pos = start;
 								    if (pos < end) {
 								        // First element goes out bare; every later one is preceded by the delimiter.
 								        joined << tokens[pos];
 								        ++pos;
 								    }
 								    while (pos < end) {
 								        joined << delim << tokens[pos];
 								        ++pos;
 								    }
 								    return std::move(joined).str();
 								}
 								// Convenience overload: joins everything from `start` to the end of `tokens`.
 								template <typename T>
 								std::string Join(const std::vector<T> &tokens, int start, const std::string &delim = " ") {
 								    const auto token_count = tokens.size();
 								    return Join(tokens, start, token_count, delim);
 								}
 								// Concatenates the text_ of terms in tokens[start, end), placing `delim` between
 								// consecutive terms. An empty range produces an empty string.
 								std::string Join(const TermList &tokens, int start, int end, const std::string &delim = " ") {
 								    std::ostringstream joined;
 								    int pos = start;
 								    if (pos < end) {
 								        // First term goes out bare; every later one is preceded by the delimiter.
 								        joined << tokens[pos].text_;
 								        ++pos;
 								    }
 								    while (pos < end) {
 								        joined << delim << tokens[pos].text_;
 								        ++pos;
 								    }
 								    return std::move(joined).str();
 								}
 								// Returns true when `str` contains at least one three-byte UTF-8 sequence whose lead
 								// byte is 0xE4-0xE9 followed by two continuation bytes (code points U+4000-U+9FFF).
 								// Sequences cut off by the end of the string are ignored.
 								bool IsChinese(const std::string &str) {
 								    const auto is_continuation = [](unsigned char byte) { return byte >= 0x80 && byte <= 0xBF; };
 								    const std::size_t len = str.length();
 								    // pos + 2 < len guarantees all three bytes of a candidate sequence are in range.
 								    for (std::size_t pos = 0; pos + 2 < len; ++pos) {
 								        const unsigned char lead = str[pos];
 								        if (lead < 0xE4 || lead > 0xE9) {
 								            continue;
 								        }
 								        if (is_continuation(str[pos + 1]) && is_continuation(str[pos + 2])) {
 								            return true;
 								        }
 								    }
 								    return false;
 								}
 								// Returns true when every byte of `str` is 7-bit ASCII (<= 0x7F). Despite the name,
 								// digits, punctuation and the empty string all qualify.
 								bool IsAlphabet(const std::string &str) {
 								    for (const char raw : str) {
 								        const unsigned char byte = static_cast<unsigned char>(raw);
 								        if (byte > 0x7F) {
 								            return false;
 								        }
 								    }
 								    return true;
 								}
 								// Returns true when `str` contains a three-byte UTF-8 sequence E1 84..87 80..BF,
 								// i.e. a code point in U+1100-U+11FF (Hangul Jamo). Sequences cut off by the end of
 								// the string are ignored.
 								// NOTE(review): precomposed Hangul syllables (U+AC00-U+D7A3, lead bytes EA-ED) are
 								// not matched by this check - confirm that is intended.
 								bool IsKorean(const std::string &str) {
 								    const std::size_t len = str.length();
 								    // pos + 2 < len guarantees all three bytes of a candidate sequence are in range.
 								    for (std::size_t pos = 0; pos + 2 < len; ++pos) {
 								        if (static_cast<unsigned char>(str[pos]) != 0xE1) {
 								            continue;
 								        }
 								        const unsigned char second = str[pos + 1];
 								        const unsigned char third = str[pos + 2];
 								        const bool second_ok = second >= 0x84 && second <= 0x87;
 								        const bool third_ok = third >= 0x80 && third <= 0xBF;
 								        if (second_ok && third_ok) {
 								            return true;
 								        }
 								    }
 								    return false;
 								}
 								// Returns true when `str` contains at least one kana character, detected as a
 								// three-byte UTF-8 sequence E3 81..83 xx covering U+3041-U+30FF (Hiragana and
 								// Katakana). Sequences cut off by the end of the string are ignored.
 								//
 								// The third byte may be 0x80 in the E3 82 and E3 83 rows (U+3080, U+30C0 are real
 								// kana); only the E3 81 row starts at 0x81, because U+3040 is unassigned.
 								bool IsJapanese(const std::string &str) {
 								    const std::size_t len = str.length();
 								    // i + 2 < len guarantees all three bytes of a candidate sequence are in range.
 								    for (std::size_t i = 0; i + 2 < len; ++i) {
 								        const unsigned char c = str[i];
 								        if (c != 0xE3) {
 								            continue;
 								        }
 								        const unsigned char c2 = str[i + 1];
 								        const unsigned char c3 = str[i + 2];
 								        if (c2 < 0x81 || c2 > 0x83) {
 								            continue;
 								        }
 								        const unsigned char c3_min = (c2 == 0x81) ? 0x81 : 0x80;
 								        if (c3 >= c3_min && c3 <= 0xBF) {
 								            return true;
 								        }
 								    }
 								    return false;
 								}
 								// Returns true when `str` contains at least one three-byte UTF-8 sequence from any of
 								// these ranges:
 								//   Chinese   E4..E9 80..BF 80..BF   (U+4000-U+9FFF)
 								//   Japanese  E3 81..83 xx           (U+3041-U+30FF, Hiragana and Katakana)
 								//   Korean    E1 84..87 80..BF       (U+1100-U+11FF, Hangul Jamo)
 								// Sequences cut off by the end of the string are ignored.
 								//
 								// For kana the third byte may be 0x80 in the E3 82 and E3 83 rows (U+3080, U+30C0 are
 								// real kana); only the E3 81 row starts at 0x81, because U+3040 is unassigned.
 								bool IsCJK(const std::string &str) {
 								    const std::size_t len = str.length();
 								    // i + 2 < len guarantees all three bytes of a candidate sequence are in range.
 								    for (std::size_t i = 0; i + 2 < len; ++i) {
 								        const unsigned char c = str[i];
 								        const unsigned char c2 = str[i + 1];
 								        const unsigned char c3 = str[i + 2];
 								        const bool c3_is_continuation = c3 >= 0x80 && c3 <= 0xBF;
 								        // Check Chinese
 								        if (c >= 0xE4 && c <= 0xE9 && c2 >= 0x80 && c2 <= 0xBF && c3_is_continuation) {
 								            return true;
 								        }
 								        // Check Japanese
 								        if (c == 0xE3 && c2 >= 0x81 && c2 <= 0x83) {
 								            const unsigned char c3_min = (c2 == 0x81) ? 0x81 : 0x80;
 								            if (c3 >= c3_min && c3 <= 0xBF) {
 								                return true;
 								            }
 								        }
 								        // Check Korean
 								        if (c == 0xE1 && c2 >= 0x84 && c2 <= 0x87 && c3_is_continuation) {
 								            return true;
 								        }
 								    }
 								    return false;
 								}
 								class RegexTokenizer {
 								public:
 								    RegexTokenizer() {
 								        int errorcode = 0;
 								        PCRE2_SIZE erroffset = 0;
 								        re_ = pcre2_compile((PCRE2_SPTR)(NLTK_TOKENIZE_PATTERN.c_str()),
 								                            PCRE2_ZERO_TERMINATED,
 								                            PCRE2_MULTILINE | PCRE2_UTF,
 								                            &errorcode,
 								                            &erroffset,
 								                            nullptr);
 								    }
 								    ~RegexTokenizer() {
 								        pcre2_code_free(re_);
 								    }
 								    void RegexTokenize(const std::string &input, TermList &tokens) {
 								        PCRE2_SPTR subject = (PCRE2_SPTR)input.c_str();
 								        PCRE2_SIZE subject_length = input.length();
 								        pcre2_match_data_8 *match_data = pcre2_match_data_create_8(1024, nullptr);
 								        PCRE2_SIZE start_offset = 0;
 								        while (start_offset < subject_length) {
 								            int res = pcre2_match(re_, subject, subject_length, start_offset, 0, match_data, nullptr);
 								            if (res < 0) {
 								                if (res == PCRE2_ERROR_NOMATCH) {
 								                    break; // No more matches
 								                } else {
 								                    std::cerr << "Matching error code: " << res << std::endl;
 								                    break; // Other error
 								                }
 								            }
 								            // Extract matched substring
 								            PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data);
 								            for (int i = 0; i < res; ++i) {
 								                PCRE2_SIZE start = ovector[2 * i];
 								                PCRE2_SIZE end = ovector[2 * i + 1];
 								                tokens.Add(input.c_str() + start, end - start, start, end);
 								            }
 								            // Update the start offset for the next search
 								            start_offset = ovector[1]; // Move to the end of the last match
 								        }
 								        // Free memory
 								        pcre2_match_data_free(match_data);
 								    }
 								private:
 								    pcre2_code_8 *re_{nullptr};
 								};
 								// Table of regular-expression sources for splitting English contractions.
 								// Every pattern is case-insensitive ((?i)); "(?#X)" is an inline regex comment.
 								class MacIntyreContractions {
 								public:
 								    // List of contractions adapted from Robert MacIntyre's tokenizer.
 								    // Two-part contractions: group 1 and group 2 are the halves to separate
 								    // (e.g. "cannot" -> "can" + "not"). The last entry requires following whitespace.
 								    std::vector<std::string> CONTRACTIONS2 = {R"((?i)\b(can)(?#X)(not)\b)",
 								                                              R"((?i)\b(d)(?#X)('ye)\b)",
 								                                              R"((?i)\b(gim)(?#X)(me)\b)",
 								                                              R"((?i)\b(gon)(?#X)(na)\b)",
 								                                              R"((?i)\b(got)(?#X)(ta)\b)",
 								                                              R"((?i)\b(lem)(?#X)(me)\b)",
 								                                              R"((?i)\b(more)(?#X)('n)\b)",
 								                                              R"((?i)\b(wan)(?#X)(na)(?=\s))"};
 								    // Two-part contractions that start with a space followed by 't ("'tis", "'twas").
 								    std::vector<std::string> CONTRACTIONS3 = {R"((?i) ('t)(?#X)(is)\b)", R"((?i) ('t)(?#X)(was)\b)"};
 								    // Three-part contractions ("whaddya", "whatcha").
 								    // NOTE(review): NLTKWordTokenizer::CompileRegexPatterns in this file only compiles
 								    // CONTRACTIONS2 and CONTRACTIONS3; this list appears unused - confirm.
 								    std::vector<std::string> CONTRACTIONS4 = {R"((?i)\b(whad)(dd)(ya)\b)", R"((?i)\b(wha)(t)(cha)\b)"};
 								};
 								// Pairs a compiled PCRE2 pattern with the replacement text to use with it.
 								// The struct frees `re` on destruction; it is move-only, and a moved-from
 								// instance is left with a null `re`.
 								struct CompiledRegex {
 								    pcre2_code *re{nullptr};
 								    std::string substitution;
 								    // Takes ownership of `r` (which may be nullptr) and stores the replacement text.
 								    CompiledRegex(pcre2_code *r, std::string sub) : re(r), substitution(std::move(sub)) {
 								    }
 								    // Copying would duplicate ownership of the raw handle, so it is forbidden.
 								    CompiledRegex(const CompiledRegex &) = delete;
 								    CompiledRegex &operator=(const CompiledRegex &) = delete;
 								    // Steals the handle and the replacement text from `other`.
 								    CompiledRegex(CompiledRegex &&other) noexcept : re(other.re), substitution(std::move(other.substitution)) {
 								        other.re = nullptr;
 								    }
 								    // Releases the currently held pattern, then steals from `other`.
 								    CompiledRegex &operator=(CompiledRegex &&other) noexcept {
 								        if (this == &other) {
 								            return *this;
 								        }
 								        if (re != nullptr) {
 								            pcre2_code_free(re);
 								        }
 								        re = other.re;
 								        substitution = std::move(other.substitution);
 								        other.re = nullptr;
 								        return *this;
 								    }
 								    ~CompiledRegex() {
 								        if (re != nullptr) {
 								            pcre2_code_free(re);
 								        }
 								    }
 								};
 								class NLTKWordTokenizer {
 								    MacIntyreContractions contractions_;
 								    // Static singleton instance
 								    static std::unique_ptr<NLTKWordTokenizer> instance_;
 								    static std::once_flag init_flag_;
 								public:
 								    // Static method to get the singleton instance
 								    static NLTKWordTokenizer &GetInstance() {
 								        std::call_once(init_flag_, []() { instance_ = std::make_unique<NLTKWordTokenizer>(); });
 								        return *instance_;
 								    }
 								    // Starting quotes.
 								    std::vector<std::pair<std::string, std::string>> STARTING_QUOTES = {
 								        {std::string(R"(([«“‘„]|[`]+))"), std::string(R"( $1 )")},
 								        {std::string(R"(^\")"), std::string(R"(``)")},
 								        {std::string(R"((``))"), std::string(R"( $1 )")},
 								        {std::string(R"(([ \(\[{<])(\"|\'{2}))"), std::string(R"($1 `` )")},
 								        {std::string(R"((?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\b)"), std::string(R"($1 $2)")}};
 								    // Ending quotes.
 								    std::vector<std::pair<std::string, std::string>> ENDING_QUOTES = {
 								        {std::string(R"(([»”’]))"), std::string(R"( $1 )")},
 								        {std::string(R"('')"), std::string(R"( '' )")},
 								        {std::string(R"(")"), std::string(R"( '' )")},
 								        {std::string(R"(\s+)"), std::string(R"( )")},
 								        {std::string(R"(([^' ])('[sS]|'[mM]|'[dD]|') )"), std::string(R"($1 $2 )")},
 								        {std::string(R"(([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) )"), std::string(R"($1 $2 )")}};
 								    // Punctuation.
 								    std::vector<std::pair<std::string, std::string>> PUNCTUATION = {
 								        {std::string(R"(([^\.])(\.)([\]\)}>"\'»”’ ]*)\s*$)"), std::string(R"($1 $2 $3 )")},
 								        {std::string(R"(([:,])([^\d]))"), std::string(R"( $1 $2)")},
 								        {std::string(R"(([:,])$)"), std::string(R"($1 )")},
 								        {std::string(R"(\.{2,})"), std::string(R"($0 )")},
 								        {std::string(R"([;@#$%&])"), std::string(R"($0 )")},
 								        {std::string(R"(([^\.])(\.)([\]\)}>"\']*)\s*$)"), std::string(R"($1 $2 $3 )")},
 								        {std::string(R"([?!])"), std::string(R"($0 )")},
 								        {std::string(R"(([^'])' )"), std::string(R"($1 ' )")},
 								        {std::string(R"([*])"), std::string(R"($0 )")}};
 								    // Pads parentheses
 								    std::pair<std::string, std::string> PARENS_BRACKETS = {std::string(R"([\]\[\(\)\{\}\<\>])"), std::string(R"( $0 )")};
 								    std::vector<std::pair<std::string, std::string>> CONVERT_PARENTHESES = {{std::string(R"(\()"), std::string("-LRB-")},
 								                                                                            {std::string(R"(\))"), std::string("-RRB-")},
 								                                                                            {std::string(R"(\[)"), std::string("-LSB-")},
 								                                                                            {std::string(R"(\])"), std::string("-RSB-")},
 								                                                                            {std::string(R"(\{)"), std::string("-LCB-")},
 								                                                                            {std::string(R"(\})"), std::string("-RCB-")}};
 								    std::pair<std::string, std::string> DOUBLE_DASHES = {std::string(R"(--)"), std::string(R"( -- )")};
 								    // Cache for compiled regex patterns
 								    std::vector<CompiledRegex> compiled_starting_quotes_;
 								    std::vector<CompiledRegex> compiled_ending_quotes_;
 								    std::vector<CompiledRegex> compiled_punctuation_;
 								    CompiledRegex compiled_parens_brackets_;
 								    std::vector<CompiledRegex> compiled_convert_parentheses_;
 								    CompiledRegex compiled_double_dashes_;
 								    std::vector<CompiledRegex> compiled_contractions2_;
 								    std::vector<CompiledRegex> compiled_contractions3_;
 								    // Constructor that precompiles all regex patterns
 								    NLTKWordTokenizer() : compiled_parens_brackets_(nullptr, ""), compiled_double_dashes_(nullptr, "") { CompileRegexPatterns(); }
 								    void Tokenize(const std::string &text, std::vector<std::string> &tokens, bool convert_parentheses = false) {
 								        std::string result = text;
 								        for (const auto &compiled : compiled_starting_quotes_) {
 								            result = ApplyRegex(result, compiled);
 								        }
 								        for (const auto &compiled : compiled_punctuation_) {
 								            result = ApplyRegex(result, compiled);
 								        }
 								        // Handles parentheses.
 								        result = ApplyRegex(result, compiled_parens_brackets_);
 								        // Optionally convert parentheses
 								        if (convert_parentheses) {
 								            for (const auto &compiled : compiled_convert_parentheses_) {
 								                result = ApplyRegex(result, compiled);
 								            }
 								        }
 								        // Handles double dash.
 								        result = ApplyRegex(result, compiled_double_dashes_);
 								        // Add extra space to make things easier
 								        result = " " + result + " ";
 								        for (const auto &compiled : compiled_ending_quotes_) {
 								            result = ApplyRegex(result, compiled);
 								        }
 								        for (const auto &compiled : compiled_contractions2_) {
 								            result = ApplyRegex(result, compiled);
 								        }
 								        for (const auto &compiled : compiled_contractions3_) {
 								            result = ApplyRegex(result, compiled);
 								        }
 								        // Split the result into tokens
 								        size_t start = 0;
 								        size_t end = result.find(' ');
 								        while (end != std::string::npos) {
 								            if (end != start) {
 								                std::string token = result.substr(start, end - start);
 								                // Handle underscore tokens properly
 								                if (token == "_") {
 								                    // Single underscore token
 								                    tokens.push_back("_");
 								                } else if (token.find('_') != std::string::npos) {
 								                    // Split tokens containing underscores and keep underscores as separate tokens
 								                    std::stringstream ss(token);
 								                    std::string sub_token;
 								                    bool first = true;
 								                    while (std::getline(ss, sub_token, '_')) {
 								                        if (!first) {
 								                            tokens.push_back("_");
 								                        }
 								                        if (!sub_token.empty()) {
 								                            tokens.push_back(sub_token);
 								                        }
 								                        first = false;
 								                    }
 								                    // Handle case where token ends with underscore
 								                    if (token.back() == '_') {
 								                        tokens.push_back("_");
 								                    }
 								                } else {
 								                    tokens.push_back(token);
 								                }
 								            }
 								            start = end + 1;
 								            end = result.find(' ', start);
 								        }
 								        if (start != result.length()) {
 								            std::string token = result.substr(start);
 								            // Handle underscore tokens properly
 								            if (token == "_") {
 								                // Single underscore token
 								                tokens.push_back("_");
 								            } else if (token.find('_') != std::string::npos) {
 								                // Split tokens containing underscores and keep underscores as separate tokens
 								                std::stringstream ss(token);
 								                std::string sub_token;
 								                bool first = true;
 								                while (std::getline(ss, sub_token, '_')) {
 								                    if (!first) {
 								                        tokens.push_back("_");
 								                    }
 								                    if (!sub_token.empty()) {
 								                        tokens.push_back(sub_token);
 								                    }
 								                    first = false;
 								                }
 								                // Handle case where token ends with underscore
 								                if (token.back() == '_') {
 								                    tokens.push_back("_");
 								                }
 								            } else {
 								                tokens.push_back(token);
 								            }
 								        }
 								    }
 								private:
 								    void CompileRegexPatterns() {
 								        compiled_starting_quotes_.reserve(STARTING_QUOTES.size());
 								        for (const auto &[pattern, substitution] : STARTING_QUOTES) {
 								            compiled_starting_quotes_.emplace_back(CompilePattern(pattern), substitution);
 								        }
 								        compiled_ending_quotes_.reserve(ENDING_QUOTES.size());
 								        for (const auto &[pattern, substitution] : ENDING_QUOTES) {
 								            compiled_ending_quotes_.emplace_back(CompilePattern(pattern), substitution);
 								        }
 								        compiled_punctuation_.reserve(PUNCTUATION.size());
 								        for (const auto &[pattern, substitution] : PUNCTUATION) {
 								            compiled_punctuation_.emplace_back(CompilePattern(pattern), substitution);
 								        }
 								        compiled_parens_brackets_ = CompiledRegex(CompilePattern(PARENS_BRACKETS.first), PARENS_BRACKETS.second);
 								        compiled_convert_parentheses_.reserve(CONVERT_PARENTHESES.size());
 								        for (const auto &[pattern, substitution] : CONVERT_PARENTHESES) {
 								            compiled_convert_parentheses_.emplace_back(CompilePattern(pattern), substitution);
 								        }
 								        compiled_double_dashes_ = CompiledRegex(CompilePattern(DOUBLE_DASHES.first), DOUBLE_DASHES.second);
 								        compiled_contractions2_.reserve(contractions_.CONTRACTIONS2.size());
 								        for (const auto &pattern : contractions_.CONTRACTIONS2) {
 								            compiled_contractions2_.emplace_back(CompilePattern(pattern), R"( $1 $2 )");
 								        }
 								        compiled_contractions3_.reserve(contractions_.CONTRACTIONS3.size());
 								        for (const auto &pattern : contractions_.CONTRACTIONS3) {
 								            compiled_contractions3_.emplace_back(CompilePattern(pattern), R"( $1 $2 )");
 								        }
 								    }
 								    pcre2_code *CompilePattern(const std::string &pattern) {
 								        int errorcode = 0;
 								        PCRE2_SIZE erroffset = 0;
 								        pcre2_code *re = pcre2_compile(reinterpret_cast<PCRE2_SPTR>(pattern.c_str()),
 								                                       PCRE2_ZERO_TERMINATED,
 								                                       PCRE2_MULTILINE | PCRE2_UTF,
 								                                       &errorcode,
 								                                       &erroffset,
 								                                       nullptr);
 								        if (re == nullptr) {
 								            PCRE2_UCHAR buffer[256];
 								            pcre2_get_error_message(errorcode, buffer, sizeof(buffer));
 								            std::cerr << "PCRE2 compilation failed at offset " << erroffset << ": " << buffer << std::endl;
 								            return nullptr;
 								        }
 								        return re;
 								    }
 								    std::string ApplyRegex(const std::string &text, const CompiledRegex &compiled) {
 								        if (compiled.re == nullptr) {
 								            return text;
 								        }
 								        PCRE2_SPTR pcre2_subject = reinterpret_cast<PCRE2_SPTR>(text.c_str());
 								        PCRE2_SPTR pcre2_replacement = reinterpret_cast<PCRE2_SPTR>(compiled.substitution.c_str());
 								        size_t outlength = text.length() * 2 < 1024 ? 1024 : text.length() * 2;
 								        auto buffer = std::make_unique<PCRE2_UCHAR[]>(outlength);
 								        int rc = pcre2_substitute(compiled.re,
 								                                  pcre2_subject,
 								                                  text.length(),
 ,
 								                                  PCRE2_SUBSTITUTE_GLOBAL,
 								                                  nullptr,
 								                                  nullptr,
 								                                  pcre2_replacement,
 								                                  PCRE2_ZERO_TERMINATED,
 								                                  buffer.get(),
 								                                  &outlength);
 								        if (rc < 0) {
 								            return text;
 								        }
 								        return std::string(reinterpret_cast<char *>(buffer.get()), outlength);
 								    }
 								};
 								// Static member definitions for NLTKWordTokenizer singleton
 								std::unique_ptr<NLTKWordTokenizer> NLTKWordTokenizer::instance_ = nullptr;
 								std::once_flag NLTKWordTokenizer::init_flag_;
 								void SentenceSplitter(const std::string &text, std::vector<std::string> &result) {
 								    int error_code;
 								    PCRE2_SIZE error_offset;
 								    const char *pattern = R"( *[\.\?!]['"\)\]]* *)";
 								    pcre2_code *re = pcre2_compile((PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, PCRE2_MULTILINE | PCRE2_UTF, &error_code, &error_offset, nullptr);
 								    if (re == nullptr) {
 								        PCRE2_UCHAR buffer[256];
 								        pcre2_get_error_message(error_code, buffer, sizeof(buffer));
 								        std::cerr << "PCRE2 compilation failed at offset " << error_offset << ": " << buffer << std::endl;
 								        return;
 								    }
 								    pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, nullptr);
 								    PCRE2_SIZE start_offset = 0;
 								    while (start_offset < text.size()) {
 								        int rc = pcre2_match(re, (PCRE2_SPTR)text.c_str(), text.size(), start_offset, 0, match_data, nullptr);
 								        if (rc < 0) {
 								            result.push_back(text.substr(start_offset));
 								            break;
 								        }
 								        PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data);
 								        PCRE2_SIZE match_start = ovector[0];
 								        PCRE2_SIZE match_end = ovector[1];
 								        if (match_start > start_offset) {
 								            result.push_back(text.substr(start_offset, match_end - start_offset));
 								        }
 								        start_offset = match_end;
 								    }
 								    pcre2_match_data_free(match_data);
 								    pcre2_code_free(re);
 								}
 								RAGAnalyzer::RAGAnalyzer(const std::string &path)
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

The tokenizer in Infinity was modified in
https://github.com/infiniflow/infinity/pull/3330; this PR syncs that code
change to the C++ files in RAGFlow.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								    : dict_path_(path), stemmer_(std::make_unique<Stemmer>()), lowercase_string_buffer_(term_string_buffer_limit_) {
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								    InitStemmer(STEM_LANG_ENGLISH);
 								}
 								// Copy constructor: copies the trie_, pos_table_, wordnet_lemma_ and opencc_ pointers
 								// from `other` and sets own_dict_ to false, so this instance's destructor will not
 								// delete them. A new Stemmer and a new lowercase_string_buffer_ are created for this
 								// instance, fine_grained_ is copied, and the stemmer is initialized for English.
 								// NOTE(review): the shared pointers presumably must stay alive for as long as the
 								// copy is used - confirm against callers.
 								RAGAnalyzer::RAGAnalyzer(const RAGAnalyzer &other)
 								    : own_dict_(false), trie_(other.trie_), pos_table_(other.pos_table_), wordnet_lemma_(other.wordnet_lemma_), stemmer_(std::make_unique<Stemmer>()),
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

Tokenzier in Infinity is modified in
https://github.com/infiniflow/infinity/pull/3330, sync the code change
to cpp files in ragflow

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								      opencc_(other.opencc_), lowercase_string_buffer_(term_string_buffer_limit_), fine_grained_(other.fine_grained_) {
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								    InitStemmer(STEM_LANG_ENGLISH);
 								}
 								// Frees trie_, pos_table_, wordnet_lemma_ and opencc_, but only when own_dict_ is set;
 								// an instance that does not own them leaves them untouched.
 								RAGAnalyzer::~RAGAnalyzer() {
 								    if (!own_dict_) {
 								        return;
 								    }
 								    delete trie_;
 								    delete pos_table_;
 								    delete wordnet_lemma_;
 								    delete opencc_;
 								}
 								int32_t RAGAnalyzer::Load() {
 								    fs::path root(dict_path_);
 								    fs::path dict_path(root / DICT_PATH);
 								    if (!fs::exists(dict_path)) {
 								        printf("Invalid analyzer file: %s", dict_path.string().c_str());
 								        // return Status::InvalidAnalyzerFile(dict_path);
 								        return -1;
 								    }
 								    fs::path pos_def_path(root / POS_DEF_PATH);
 								    if (!fs::exists(pos_def_path)) {
 								        printf("Invalid post file: %s", pos_def_path.string().c_str());
 								        // return Status::InvalidAnalyzerFile(pos_def_path);
 								        return -1;
 								    }
 								    own_dict_ = true;
 								    trie_ = new DartsTrie();
 								    pos_table_ = new POSTable(pos_def_path.string());
 								    if (pos_table_->Load() != 0) {
 								        printf("Fail to load post table: %s", pos_def_path.string().c_str());
 								        return -1;
 								        // return Status::InvalidAnalyzerFile("Failed to load RAGAnalyzer POS definition");
 								    }
 								    fs::path trie_path(root / TRIE_PATH);
 								    if (fs::exists(trie_path)) {
 								        trie_->Load(trie_path.string());
 								    } else {
 								        // Build trie
 								        try {
 								            std::ifstream from(dict_path.string());
 								            std::string line;
 								            re2::RE2 re_pattern(R"([\r\n]+)");
 								            std::string split_pattern("([ \t])");
 								            while (getline(from, line)) {
 								                line = line.substr(0, line.find('\r'));
 								                if (line.empty())
 								                    continue;
 								                line = Replace(re_pattern, "", line);
 								                std::vector<std::string> results;
 								                Split(line, split_pattern, results);
 								                if (results.size() != 3)
 								                    throw std::runtime_error("Invalid dictionary format");
 								                int32_t freq = std::stoi(results[1]);
 								                freq = int32_t(std::log(float(freq) / DENOMINATOR) + 0.5);
 								                int32_t pos_idx = pos_table_->GetPOSIndex(results[2]);
 								                int value = Encode(freq, pos_idx);
 								                trie_->Add(results[0], value);
 								                std::string rkey = RKey(results[0]);
 								                trie_->Add(rkey, Encode(1, 0));
 								            }
 								            trie_->Build();
 								        } catch (const std::exception &e) {
 								            return -1;
 								            // return Status::InvalidAnalyzerFile("Failed to load RAGAnalyzer analyzer");
 								        }
 								        trie_->Save(trie_path.string());
 								    }
 								    fs::path lemma_path(root / WORDNET_PATH);
 								    if (!fs::exists(lemma_path)) {
 								        printf("Fail to load wordnet: %s", lemma_path.string().c_str());
 								        return -1;
 								        // return Status::InvalidAnalyzerFile(lemma_path);
 								    }
 								    wordnet_lemma_ = new WordNetLemmatizer(lemma_path.string());
 								    fs::path opencc_path(root / OPENCC_PATH);
 								    if (!fs::exists(opencc_path)) {
 								        printf("Fail to load opencc_path: %s", opencc_path.string().c_str());
 								        return -1;
 								        // return Status::InvalidAnalyzerFile(opencc_path);
 								    }
 								    try {
 								        opencc_ = new ::OpenCC(opencc_path.string());
 								    } catch (const std::exception &e) {
 								        return -1;
 								        // return Status::InvalidAnalyzerFile("Failed to load OpenCC");
 								    }
 								    // return Status::OK();
 								    return 0;
 								}
 								// Fills pos_mapping with one entry per byte of `converted` plus one trailing entry.
 								// Both strings are walked character by character in lockstep; every byte of a
 								// converted character is mapped to the byte offset at which the matching original
 								// character starts. Entries that are left over once either string is exhausted are
 								// set to original.size().
 								// NOTE(review): assumes UTF8_BYTE_LENGTH_TABLE gives the byte length of the character
 								// introduced by a lead byte - confirm against its definition.
 								void RAGAnalyzer::BuildPositionMapping(const std::string &original, const std::string &converted, std::vector<unsigned> &pos_mapping) {
 								    pos_mapping.assign(converted.size() + 1, 0U);
 								    const size_t slot_count = pos_mapping.size();
 								    size_t src_off = 0;
 								    size_t dst_off = 0;
 								    for (; src_off < original.size() && dst_off < converted.size();) {
 								        const size_t src_len = UTF8_BYTE_LENGTH_TABLE[static_cast<uint8_t>(original[src_off])];
 								        const size_t dst_len = UTF8_BYTE_LENGTH_TABLE[static_cast<uint8_t>(converted[dst_off])];
 								        // Every byte of the converted character points at the start of the original one.
 								        const size_t stop = std::min(dst_off + dst_len, slot_count);
 								        for (size_t slot = dst_off; slot < stop; ++slot) {
 								            pos_mapping[slot] = static_cast<unsigned>(src_off);
 								        }
 								        src_off += src_len;
 								        dst_off += dst_len;
 								    }
 								    // Whatever was not reached maps to the end of the original string.
 								    const unsigned tail_value = static_cast<unsigned>(original.size());
 								    for (size_t slot = dst_off; slot < slot_count; ++slot) {
 								        pos_mapping[slot] = tail_value;
 								    }
 								}
 								// Converts full-width characters in a UTF-8 string to their half-width form:
 								//   U+FF01..U+FF5E  ->  the ASCII character 0xFEE0 below it ('!'..'~')
 								//   U+3000          ->  ' '
 								// Every other 1-, 2- or 3-byte character is decoded and re-encoded unchanged.
 								// Bytes that do not start a 1- to 3-byte sequence (4-byte lead bytes, continuation
 								// bytes) are copied through one at a time. A 2- or 3-byte sequence that is cut off by
 								// the end of the input is also copied through byte by byte, so the function never
 								// reads past the end of `input`.
 								std::string RAGAnalyzer::StrQ2B(const std::string &input) {
 								    std::string output;
 								    output.reserve(input.size());
 								    const size_t size = input.size();
 								    size_t i = 0;
 								    while (i < size) {
 								        const unsigned char c = static_cast<unsigned char>(input[i]);
 								        // Sequence length announced by the lead byte; 0 means "not handled here".
 								        size_t seq_len = 0;
 								        if (c < 0x80) {
 								            seq_len = 1;
 								        } else if ((c & 0xE0) == 0xC0) {
 								            seq_len = 2;
 								        } else if ((c & 0xF0) == 0xE0) {
 								            seq_len = 3;
 								        }
 								        if (seq_len == 0 || seq_len > size - i) {
 								            // Unsupported lead byte or truncated sequence: keep the byte as it is.
 								            output += static_cast<char>(c);
 								            i += 1;
 								            continue;
 								        }
 								        uint32_t codepoint = 0;
 								        if (seq_len == 1) {
 								            codepoint = c;
 								        } else if (seq_len == 2) {
 								            codepoint = (c & 0x1F) << 6;
 								            codepoint |= (input[i + 1] & 0x3F);
 								        } else {
 								            codepoint = (c & 0x0F) << 12;
 								            codepoint |= (input[i + 1] & 0x3F) << 6;
 								            codepoint |= (input[i + 2] & 0x3F);
 								        }
 								        i += seq_len;
 								        if (codepoint >= 0xFF01 && codepoint <= 0xFF5E) {
 								            output += static_cast<char>(codepoint - 0xFEE0);
 								        } else if (codepoint == 0x3000) {
 								            output += ' ';
 								        } else if (codepoint < 0x80) {
 								            output += static_cast<char>(codepoint);
 								        } else if (codepoint < 0x800) {
 								            output += static_cast<char>(0xC0 | (codepoint >> 6));
 								            output += static_cast<char>(0x80 | (codepoint & 0x3F));
 								        } else if (codepoint < 0x10000) {
 								            output += static_cast<char>(0xE0 | (codepoint >> 12));
 								            output += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
 								            output += static_cast<char>(0x80 | (codepoint & 0x3F));
 								        }
 								    }
 								    return output;
 								}
 								// Looks `key` up in the trie, decodes the stored frequency field and maps it back
 								// through exp(v) * DENOMINATOR, rounded to the nearest integer. This is the inverse of
 								// the log(freq / DENOMINATOR) transform applied when the trie is built in Load().
 								// The key is passed to the trie as given; no lower-casing is applied here.
 								int32_t RAGAnalyzer::Freq(const std::string_view key) const {
 								    const int32_t encoded = trie_->Get(key);
 								    const int32_t log_freq = DecodeFreq(encoded);
 								    const auto restored = std::exp(log_freq) * DENOMINATOR + 0.5;
 								    return static_cast<int32_t>(restored);
 								}
 								// Returns the part-of-speech tag stored for `key`, which is lower-cased via Key()
 								// before the lookup. An empty string is returned when the key is not in the trie, when
 								// no POS table is available, or when the table has no tag for the decoded index.
 								std::string RAGAnalyzer::Tag(std::string_view key) const {
 								    const std::string lookup_key = Key(std::string(key));
 								    const int32_t encoded = trie_->Get(lookup_key);
 								    if (encoded == -1) {
 								        return "";
 								    }
 								    const int32_t tag_index = DecodePOSIndex(encoded);
 								    if (pos_table_ == nullptr) {
 								        return "";
 								    }
 								    const char *tag = pos_table_->GetPOS(tag_index);
 								    if (tag == nullptr) {
 								        return "";
 								    }
 								    return std::string(tag);
 								}
 								// Builds the forward trie key for `line`: its lower-cased form.
 								std::string RAGAnalyzer::Key(const std::string_view line) {
 								    std::string lowered = ToLowerString(line);
 								    return lowered;
 								}
 								// Builds the reverse trie key for `line`: the literal prefix "DD" followed by the
 								// characters of `line` in reverse order, lower-cased. Reversal works on whole UTF-8
 								// characters: bytes of the form 10xxxxxx are treated as continuation bytes and stay
 								// attached to the byte that precedes them.
 								std::string RAGAnalyzer::RKey(const std::string_view line) {
 								    std::string rkey;
 								    rkey.reserve(line.size() + 2);
 								    rkey.append("DD");
 								    size_t char_end = line.size();
 								    while (char_end != 0) {
 								        // Step back to the first byte of the character ending at char_end.
 								        size_t char_begin = char_end - 1;
 								        for (; char_begin != 0 && (line[char_begin] & 0xC0) == 0x80; --char_begin) {
 								        }
 								        rkey.append(line.data() + char_begin, char_end - char_begin);
 								        char_end = char_begin;
 								    }
 								    // Only the part after the "DD" marker is lower-cased.
 								    ToLower(rkey.data() + 2, rkey.size() - 2);
 								    return rkey;
 								}
 								// Scores one candidate segmentation.
 								//   token_freqs: (token text, encoded trie value) pairs in order.
 								// Returns the token texts together with
 								//   B / N + L / N + F
 								// where N is the number of tokens, L the number of tokens with at least two UTF-8
 								// characters, F the sum of the decoded frequencies and B the constant 30.
 								// An empty segmentation yields an empty token list with score 0.0; without this guard
 								// the divisions by N == 0 produce NaN, which is not a valid key for the ordering used
 								// by SortTokens.
 								std::pair<std::vector<std::string>, double> RAGAnalyzer::Score(const std::vector<std::pair<std::string, int>> &token_freqs) {
 								    if (token_freqs.empty()) {
 								        return {std::vector<std::string>{}, 0.0};
 								    }
 								    constexpr int64_t B = 30;
 								    int64_t F = 0, L = 0;
 								    std::vector<std::string> tokens;
 								    tokens.reserve(token_freqs.size());
 								    for (const auto &[token, freq_tag] : token_freqs) {
 								        F += DecodeFreq(freq_tag);
 								        L += (UTF8Length(token) < 2) ? 0 : 1;
 								        tokens.push_back(token);
 								    }
 								    const auto token_count = static_cast<double>(tokens.size());
 								    const auto score = B / token_count + L / token_count + F;
 								    return {std::move(tokens), score};
 								}
 								// Scores every segmentation in token_list, appends the (tokens, score) pairs to `res`
 								// and then sorts the whole of `res` by descending score.
 								void RAGAnalyzer::SortTokens(const std::vector<std::vector<std::pair<std::string, int>>> &token_list,
 								                             std::vector<std::pair<std::vector<std::string>, double>> &res) {
 								    for (auto it = token_list.cbegin(); it != token_list.cend(); ++it) {
 								        res.push_back(Score(*it));
 								    }
 								    const auto by_score_desc = [](const auto &lhs, const auto &rhs) { return lhs.second > rhs.second; };
 								    std::sort(res.begin(), res.end(), by_score_desc);
 								}
 								// Greedy left-to-right segmentation of `line` (positions are UTF-8 character indices).
 								// From each start position the candidate is grown while it is still a prefix of some
 								// trie key, then shrunk until it is an actual key or a single character remains.
 								// Tokens absent from the trie are recorded with value 0. The resulting segmentation
 								// is returned together with its Score().
 								std::pair<std::vector<std::string>, double> RAGAnalyzer::MaxForward(const std::string &line) const {
 								    std::vector<std::pair<std::string, int>> pieces;
 								    const std::size_t total = UTF8Length(line);
 								    std::size_t begin = 0;
 								    while (begin < total) {
 								        std::size_t end = begin + 1;
 								        std::string piece = UTF8Substr(line, begin, end - begin);
 								        // Grow while the candidate can still be extended into a dictionary key.
 								        for (; end < total && trie_->HasKeysWithPrefix(Key(piece));) {
 								            ++end;
 								            piece = UTF8Substr(line, begin, end - begin);
 								        }
 								        // Shrink back to the longest candidate that is itself a key.
 								        for (; end - 1 > begin && trie_->Get(Key(piece)) == -1;) {
 								            --end;
 								            piece = UTF8Substr(line, begin, end - begin);
 								        }
 								        const int encoded = trie_->Get(Key(piece));
 								        pieces.emplace_back(std::move(piece), encoded != -1 ? encoded : 0);
 								        begin = end;
 								    }
 								    return Score(pieces);
 								}
 								// Greedy right-to-left segmentation of `line` (positions are UTF-8 character indices).
 								// From each end position the candidate is extended to the left while its reversed key
 								// (RKey) is still a prefix in the trie, then trimmed from the left until it is an
 								// actual key or a single character remains. Tokens absent from the trie are recorded
 								// with value 0. Tokens are collected back to front and reversed before scoring.
 								std::pair<std::vector<std::string>, double> RAGAnalyzer::MaxBackward(const std::string &line) const {
 								    std::vector<std::pair<std::string, int>> pieces;
 								    for (int head = UTF8Length(line) - 1; head >= 0; head -= 1) {
 								        const int tail = head + 1;
 								        std::string piece = UTF8Substr(line, head, tail - head);
 								        // Extend to the left while the reversed candidate is a dictionary prefix.
 								        for (; head > 0 && trie_->HasKeysWithPrefix(RKey(piece));) {
 								            head -= 1;
 								            piece = UTF8Substr(line, head, tail - head);
 								        }
 								        // Trim from the left until the candidate is itself a key.
 								        for (; head + 1 < tail && trie_->Get(Key(piece)) == -1;) {
 								            head += 1;
 								            piece = UTF8Substr(line, head, tail - head);
 								        }
 								        const int encoded = trie_->Get(Key(piece));
 								        pieces.emplace_back(std::move(piece), encoded != -1 ? encoded : 0);
 								    }
 								    std::reverse(pieces.begin(), pieces.end());
 								    return Score(pieces);
 								}
-												Limit max recursion depth for rag analyzer#3318 (#13637)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-16 22:49:56 +08:00
 								// Recursion limit for RAGAnalyzer::DFS: once `depth` exceeds this value, DFS stops
 								// splitting and emits the rest of the input as a single token.
+								static constexpr int MAX_DFS_DEPTH = 10;
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
 								// Recursively enumerates dictionary-based segmentations of `chars`, starting at UTF-8
 								// character index `s`.
 								//   pre_tokens : tokens chosen so far as (text, encoded trie value) pairs
 								//   token_list : when memo_all is true, every completed segmentation is appended here
 								//   best_tokens / max_score : when memo_all is false, overwritten whenever a completed
 								//                             segmentation scores strictly higher than max_score
 								//   depth      : current recursion depth, limited by MAX_DFS_DEPTH
 								// Returns the largest character index reached by any explored branch.
+								int RAGAnalyzer::DFS(const std::string &chars,
 								                     const int s,
 								                     std::vector<std::pair<std::string, int>> &pre_tokens,
 								                     std::vector<std::vector<std::pair<std::string, int>>> &token_list,
 								                     std::vector<std::string> &best_tokens,
 								                     double &max_score,
-												Limit max recursion depth for rag analyzer#3318 (#13637)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-16 22:49:56 +08:00
+								                     const bool memo_all,
 								                     const int depth) const {
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								    int res = s;
 								    const int len = UTF8Length(chars);
-												Limit max recursion depth for rag analyzer#3318 (#13637)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-16 22:49:56 +08:00
 								    // Check max recursion depth - graceful degradation like Python version
 								    // Past the limit, the unconsumed remainder becomes one token carrying the same
 								    // Encode(-12, 0) value that is used below for characters missing from the trie.
 								    if (depth > MAX_DFS_DEPTH) {
 								        if (s < len) {
 								            auto pretks = pre_tokens;
 								            std::string remaining = UTF8Substr(chars, s, len - s);
 								            pretks.emplace_back(std::move(remaining), Encode(-12, 0));
 								            if (memo_all) {
 								                token_list.push_back(std::move(pretks));
 								            } else if (auto [vec_str, current_score] = Score(pretks); current_score > max_score) {
 								                best_tokens = std::move(vec_str);
 								                max_score = current_score;
 								            }
 								        }
 								        return len;
 								    }
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								    if (s >= len) {
 								        // The whole input is consumed: record or score the finished segmentation.
 								        if (memo_all) {
 								            token_list.push_back(pre_tokens);
 								        } else if (auto [vec_str, current_score] = Score(pre_tokens); current_score > max_score) {
 								            best_tokens = std::move(vec_str);
 								            max_score = current_score;
 								        }
 								        return res;
 								    }
 								    // pruning
 								    // S is the first candidate end index tried by the loop below; s + 2 means the
 								    // one-character candidate is skipped.
 								    int S = s + 1;
 								    if (s + 2 <= len) {
 								        std::string t1 = UTF8Substr(chars, s, 1);
 								        std::string t2 = UTF8Substr(chars, s, 2);
 								        // The single character is a key prefix but the two-character string is not.
 								        if (trie_->HasKeysWithPrefix(Key(t1)) && !trie_->HasKeysWithPrefix(Key(t2))) {
 								            S = s + 2;
 								        }
 								    }
 								    // The last three tokens are all single characters and the previous token joined
 								    // with the next character is a key prefix.
 								    if (pre_tokens.size() > 2 && UTF8Length(pre_tokens[pre_tokens.size() - 1].first) == 1 &&
 								        UTF8Length(pre_tokens[pre_tokens.size() - 2].first) == 1 && UTF8Length(pre_tokens[pre_tokens.size() - 3].first) == 1) {
 								        std::string t1 = pre_tokens[pre_tokens.size() - 1].first + UTF8Substr(chars, s, 1);
 								        if (trie_->HasKeysWithPrefix(Key(t1))) {
 								            S = s + 2;
 								        }
 								    }
 								    for (int e = S; e <= len; ++e) {
 								        std::string t = UTF8Substr(chars, s, e - s);
 								        std::string k = Key(t);
 								        // Stop extending once a multi-character candidate is no longer a key prefix.
 								        if (e > s + 1 && !trie_->HasKeysWithPrefix(k)) {
 								            break;
 								        }
 								        if (const int v = trie_->Get(k); v != -1) {
 								            // Recurse on a copy so that sibling candidates start from the same pre_tokens.
 								            auto pretks = pre_tokens;
 								            pretks.emplace_back(std::move(t), v);
-												Limit max recursion depth for rag analyzer#3318 (#13637)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-16 22:49:56 +08:00
+								            res = std::max(res, DFS(chars, e, pretks, token_list, best_tokens, max_score, memo_all, depth + 1));
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								        }
 								    }
 								    // At least one dictionary token starting at s led further into the input.
 								    if (res > s) {
 								        return res;
 								    }
 								    // No dictionary token advanced from s: take the single character, with its trie
 								    // value if it has one and Encode(-12, 0) otherwise. Note that this appends to the
 								    // caller's pre_tokens rather than to a copy.
 								    std::string t = UTF8Substr(chars, s, 1);
 								    if (const int v = trie_->Get(Key(t)); v != -1) {
 								        pre_tokens.emplace_back(std::move(t), v);
 								    } else {
 								        pre_tokens.emplace_back(std::move(t), Encode(-12, 0));
 								    }
-												Limit max recursion depth for rag analyzer#3318 (#13637)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-16 22:49:56 +08:00
+								    return DFS(chars, s + 1, pre_tokens, token_list, best_tokens, max_score, memo_all, depth + 1);
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								}
 								// Node of a backward-linked token chain: `token` is the last token of a segmentation
 								// and `prev` points at the node holding the tokens before it. The node with a null
 								// `prev` is the empty start of the chain and carries no token.
 								struct TokensList {
 								    const TokensList *prev = nullptr;
 								    std::string_view token = {};
 								};
 								// One partial segmentation tracked by the DP in GetBestTokensTopN.
 								struct BestTokenCandidate {
 								    static constexpr int64_t B = 30;
 								    TokensList tl{};
 								    // N: token num
 								    // L: num of tokens with length >= 2
 								    // F: sum of freq
 								    uint32_t N{};
 								    uint32_t L{};
 								    int64_t F{};
 								    // Grouping key: candidates with the same key compete on v() only.
 								    auto k() const {
 								#ifdef DIVIDE_F_BY_N
 								        return N;
 								#else
 								        return std::make_pair(N, L);
 								#endif
 								    }
 								    // Value compared among candidates that share a key.
 								    auto v() const { return F; }
 								    // Final score of the segmentation; divides by N, so N must be non-zero.
 								    auto score() const {
 								#ifdef DIVIDE_F_BY_N
 								        return static_cast<double>(B + L + F) / N;
 								#else
 								        return F + (static_cast<double>(B + L) / N);
 								#endif
 								    }
 								    // Returns this candidate extended by one token. The new node links back to this
 								    // object's `tl` by address, so this object must stay alive and must not move
 								    // while the returned candidate (or anything derived from it) is in use.
 								    BestTokenCandidate update(const std::string_view new_token_sv, const int32_t key_f, const uint32_t add_l) const {
 								        return {{&tl, new_token_sv}, N + 1, L + add_l, F + key_f};
 								    }
 								};
 								// Keeps, for every distinct key k(), at most top_n candidates with the largest v().
 								// `candidates` stays sorted by key.
 								struct GrowingBestTokenCandidatesTopN {
 								    int32_t top_n{};
 								    std::vector<BestTokenCandidate> candidates{};
 								    explicit GrowingBestTokenCandidatesTopN(const int32_t top_n) : top_n(top_n) {
 								    }
 								    // Offers a candidate. It is inserted while its key group has room; once the group
 								    // is full it replaces the group's weakest member only if its v() is strictly
 								    // larger. With top_n <= 0 nothing is ever kept.
 								    void AddBestTokenCandidateTopN(const BestTokenCandidate &add_candidate) {
 								        if (top_n <= 0) {
 								            // A zero-sized group can hold nothing; without this guard the "group is
 								            // full" branch would dereference min_element of an empty range.
 								            return;
 								        }
 								        const auto [it_b, it_e] =
 								            std::equal_range(candidates.begin(), candidates.end(), add_candidate, [](const auto &a, const auto &b) { return a.k() < b.k(); });
 								        auto target_it = it_b;
 								        bool do_replace = false;
 								        if (const auto match_cnt = std::distance(it_b, it_e); match_cnt >= top_n) {
 								            assert(match_cnt == top_n);
 								            const auto it = std::min_element(it_b, it_e, [](const auto &a, const auto &b) { return a.v() < b.v(); });
 								            if (it->v() >= add_candidate.v()) {
 								                return;
 								            }
 								            target_it = it;
 								            do_replace = true;
 								        }
 								        if (do_replace) {
 								            *target_it = add_candidate;
 								        } else {
 								            candidates.insert(target_it, add_candidate);
 								        }
 								    }
 								};
 								// Dynamic-programming search for the `n` best-scoring segmentations of `chars`.
 								// dp_vec[k] collects the candidates that cover the first k UTF-8 characters; each
 								// start position i extends the candidates of dp_vec[i] by every token that begins
 								// there and is found while traversing the trie. A single character that is not in the
 								// dictionary is still accepted as a token with frequency -12.
 								// Returns up to n (tokens, score) pairs. The token views point into `chars`, so the
 								// result must not outlive the buffer behind `chars`.
 								// n == 0 returns an empty result.
 								std::vector<std::pair<std::vector<std::string_view>, double>> RAGAnalyzer::GetBestTokensTopN(const std::string_view chars, const uint32_t n) const {
 								    if (n == 0) {
 								        // Nothing can be kept. Without this guard the selection below calls back() on
 								        // an empty vector and the DP insertion inspects an empty candidate group.
 								        return {};
 								    }
 								    const auto utf8_len = UTF8Length(chars);
 								    std::vector<GrowingBestTokenCandidatesTopN> dp_vec(utf8_len + 1, GrowingBestTokenCandidatesTopN(n));
 								    // Seed: one empty candidate covering zero characters.
 								    dp_vec[0].candidates.resize(1);
 								    const char *current_utf8_ptr = chars.data();
 								    uint32_t current_left_chars = chars.size();
 								    std::string growing_key; // in lower case
 								    for (uint32_t i = 0; i < utf8_len; ++i) {
 								        const std::string_view current_chars{current_utf8_ptr, current_left_chars};
 								        const uint32_t left_utf8_cnt = utf8_len - i;
 								        growing_key.clear();
 								        const char *lookup_until = current_utf8_ptr;
 								        uint32_t lookup_left_chars = current_left_chars;
 								        // Traversal state handed back to Traverse on every call for this start position.
 								        std::size_t reuse_node_pos = 0;
 								        std::size_t reuse_key_pos = 0;
 								        for (uint32_t j = 1; j <= left_utf8_cnt; ++j) {
 								            {
 								                // handle growing_key
 								                // Only ASCII 'A'..'Z' is lower-cased, so growing_key always has the same
 								                // byte length as the original text it was built from.
 								                const auto next_one_utf8 = UTF8Substrview({lookup_until, lookup_left_chars}, 0, 1);
 								                if (next_one_utf8.size() == 1 && next_one_utf8[0] >= 'A' && next_one_utf8[0] <= 'Z') {
 								                    growing_key.push_back(next_one_utf8[0] - 'A' + 'a');
 								                } else {
 								                    growing_key.append(next_one_utf8);
 								                }
 								                lookup_until += next_one_utf8.size();
 								                lookup_left_chars -= next_one_utf8.size();
 								            }
 								            // Extends every candidate ending at i by the token [i, i + j); the token view
 								            // refers to the original (not lower-cased) bytes.
 								            auto dp_f = [&dp_vec, i, j, original_sv = std::string_view{current_utf8_ptr, growing_key.size()}](
 								                const int32_t key_f,
 								                const uint32_t add_l) {
 								                auto &target_dp = dp_vec[i + j];
 								                for (const auto &c : dp_vec[i].candidates) {
 								                    target_dp.AddBestTokenCandidateTopN(c.update(original_sv, key_f, add_l));
 								                }
 								            };
 								            if (const auto traverse_result = trie_->Traverse(growing_key.data(), reuse_node_pos, reuse_key_pos, growing_key.size());
 								                traverse_result >= 0) {
 								                // in dictionary
 								                const int32_t key_f = DecodeFreq(traverse_result);
 								                const auto add_l = static_cast<uint32_t>(j >= 2);
 								                dp_f(key_f, add_l);
 								            } else {
 								                // not in dictionary
 								                if (j == 1) {
 								                    // also give a score: -12
 								                    dp_f(-12, 0);
 								                }
 								                if (traverse_result == -2) {
 								                    // no more results
 								                    break;
 								                }
 								            }
 								        }
 								        // update current_utf8_ptr and current_left_chars
 								        const auto forward_cnt = UTF8Substrview(current_chars, 0, 1).size();
 								        current_utf8_ptr += forward_cnt;
 								        current_left_chars -= forward_cnt;
 								    }
 								    // Select the n best complete candidates; once n entries are held, mid_result is
 								    // kept sorted by descending score by the insertion below.
 								    std::vector<std::pair<const TokensList *, double>> mid_result;
 								    mid_result.reserve(n);
 								    for (const auto &c : dp_vec.back().candidates) {
 								        const auto new_pair = std::make_pair(&(c.tl), c.score());
 								        if (mid_result.size() < n) {
 								            mid_result.push_back(new_pair);
 								        } else {
 								            assert(mid_result.size() == n);
 								            if (new_pair.second > mid_result.back().second) {
 								                mid_result.pop_back();
 								                const auto insert_pos = std::lower_bound(mid_result.begin(),
 								                                                         mid_result.end(),
 								                                                         new_pair,
 								                                                         [](const auto &a, const auto &b) {
 								                                                             return a.second > b.second;
 								                                                         });
 								                mid_result.insert(insert_pos, new_pair);
 								            }
 								        }
 								    }
 								    // Walks a TokensList chain back to its start and returns the tokens in forward order.
 								    class HelperFunc {
 								        uint32_t cnt = 0;
 								        std::vector<std::string_view> result{};
 								        void GetTokensInner(const TokensList *tl) {
 								            if (!tl->prev) {
 								                // Start of the chain reached: the token count is known now.
 								                result.reserve(cnt);
 								                return;
 								            }
 								            ++cnt;
 								            GetTokensInner(tl->prev);
 								            result.push_back(tl->token);
 								        }
 								    public:
 								        std::vector<std::string_view> GetTokens(const TokensList *tl) {
 								            GetTokensInner(tl);
 								            return std::move(result);
 								        }
 								    };
 								    std::vector<std::pair<std::vector<std::string_view>, double>> result;
 								    result.reserve(mid_result.size());
 								    for (const auto [tl, score] : mid_result) {
 								        result.emplace_back(HelperFunc{}.GetTokens(tl), score);
 								    }
 								    return result;
 								}
 								// TODO: for test
 								// #ifndef INFINITY_DEBUG
 								// #define INFINITY_DEBUG 1
 								// #endif
 								#ifdef INFINITY_DEBUG
 								namespace dp_debug {
 								template <typename T>
 								std::string TestPrintTokens(const std::vector<T> &tokens) {
 								    std::ostringstream oss;
 								    for (std::size_t i = 0; i < tokens.size(); ++i) {
 								        oss << (i ? " #" : "#") << tokens[i] << "#";
 								    }
 								    return std::move(oss).str();
 								}
 								auto print_1 = [](const bool b) { return b ? "✅" : "❌"; };
 								auto print_2 = [](const bool b) { return b ? "equal" : "not equal"; };
 								void compare_score_and_tokens(const std::vector<std::string> &dfs_tokens,
 								                              const double dfs_score,
 								                              const std::vector<std::string_view> &dp_tokens,
 								                              const double dp_score,
 								                              const std::string &prefix) {
 								    std::ostringstream oss;
 								    const auto b_score_eq = dp_score == dfs_score;
 								    oss << fmt::format("\n{} {} DFS and DP score {}:\nDFS: {}\nDP : {}\n", print_1(b_score_eq), prefix, print_2(b_score_eq), dfs_score, dp_score);
 								    bool vec_equal = true;
 								    if (dp_tokens.size() != dfs_tokens.size()) {
 								        vec_equal = false;
 								    } else {
 								        for (std::size_t k = 0; k < dp_tokens.size(); ++k) {
 								            if (dp_tokens[k] != dfs_tokens[k]) {
 								                vec_equal = false;
 								                break;
 								            }
 								        }
 								    }
 								    oss << fmt::format("{} {} DFS and DP result {}:\nDFS: {}\nDP : {}\n",
 								                       print_1(vec_equal),
 								                       prefix,
 								                       print_2(vec_equal),
 								                       TestPrintTokens(dfs_tokens),
 								                       TestPrintTokens(dp_tokens));
 								    std::cerr << std::move(oss).str() << std::endl;
 								}
 								// Debug helper: runs the DP tokenizer (top-1) on `input_str`, prints its timing next to
 								// the timing of a DFS run measured by the caller, and compares both results.
 								//   this_ptr   - analyzer whose GetBestTokensTopN() is exercised
 								//   input_str  - text the caller has already segmented with DFS
 								//   dfs_tokens - token sequence chosen by the DFS run
 								//   dfs_score  - score of that sequence
 								//   t0, t1     - time points the caller took right before / right after its DFS call
 								inline void CheckDP(const RAGAnalyzer *this_ptr,
 								                    const std::string_view input_str,
 								                    const std::vector<std::string> &dfs_tokens,
 								                    const double dfs_score,
 								                    const auto t0,
 								                    const auto t1) {
 								    const auto dp_result = this_ptr->GetBestTokensTopN(input_str, 1);
 								    const auto t2 = std::chrono::high_resolution_clock::now();
 								    const auto dfs_duration = std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(t1 - t0);
 								    // The DP interval starts at the caller's t1, so it also includes the call into this helper.
 								    const auto dp_duration = std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(t2 - t1);
 								    const auto dp_faster = dp_duration < dfs_duration;
 								    std::cerr << "\n!!! " << print_1(dp_faster) << "\nTOP1 DFS duration: " << dfs_duration << " \nDP  duration: " << dp_duration;
 								    if (dp_result.empty()) {
 								        // Nothing to compare; reading dp_result[0] here would be undefined behavior.
 								        std::cerr << "\n[1 in top1] DP returned no result" << std::endl;
 								        return;
 								    }
 								    const auto &[dp_vec, dp_score] = dp_result[0];
 								    compare_score_and_tokens(dfs_tokens, dfs_score, dp_vec, dp_score, "[1 in top1]");
 								}
 								// Debug helper: runs the DP tokenizer asking for the top 2 results on `input_str`, prints
 								// its timing next to the timing of a DFS run measured by the caller, and compares the
 								// results rank by rank.
 								//   this_ptr              - analyzer whose GetBestTokensTopN() is exercised
 								//   input_str             - text the caller has already segmented with DFS
 								//   get_dfs_sorted_tokens - callable returning the DFS results as an indexable sequence of
 								//                           (tokens, score) pairs; it is invoked after the timings are printed
 								//   t0, t1                - time points taken by the caller around its DFS call
 								inline void CheckDP2(const RAGAnalyzer *this_ptr, const std::string_view input_str, auto get_dfs_sorted_tokens, const auto t0, const auto t1) {
 								    constexpr int topn = 2;
 								    const auto dp_result = this_ptr->GetBestTokensTopN(input_str, topn);
 								    const auto t2 = std::chrono::high_resolution_clock::now();
 								    const auto dfs_duration = std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(t1 - t0);
 								    const auto dp_duration = std::chrono::duration_cast<std::chrono::duration<float, std::milli>>(t2 - t1);
 								    const auto dp_faster = dp_duration < dfs_duration;
 								    std::cerr << "\n!!! " << print_1(dp_faster) << "\nTOP2 DFS duration: " << dfs_duration << " \nTOP2 DP  duration: " << dp_duration;
 								    const auto dfs_sorted_tokens = get_dfs_sorted_tokens();
 								    // Compare only the ranks that BOTH sides produced; bounding the loop by the DFS size
 								    // alone would index past the end of dp_result when DP returns fewer entries.
 								    const int dfs_count = static_cast<int>(dfs_sorted_tokens.size());
 								    const int dp_count = static_cast<int>(dp_result.size());
 								    const int comparable = std::min(std::min(topn, dfs_count), dp_count);
 								    for (int i = 0; i < comparable; ++i) {
 								        compare_score_and_tokens(dfs_sorted_tokens[i].first,
 								                                 dfs_sorted_tokens[i].second,
 								                                 dp_result[i].first,
 								                                 dp_result[i].second,
 								                                 std::format("[{} in top{}]", i + 1, topn));
 								    }
 								}
 								} // namespace dp_debug
 								#endif
 								// Re-joins neighbouring tokens of a token string.
 								//
 								// `tks_str` is first passed through Replace() with replace_space_pattern_ and a single
 								// space, then split with blank_pattern_. Starting at each position, spans of 2 to 5
 								// consecutive pieces are concatenated without a separator; the longest span whose
 								// concatenation matches regex_split_pattern_ and has Freq() > 0 is emitted as one
 								// token, otherwise the single piece is emitted. The emitted tokens are returned
 								// joined by Join()'s default delimiter.
 								std::string RAGAnalyzer::Merge(const std::string &tks_str) const {
 								    std::string normalized = tks_str;
 								    normalized = Replace(replace_space_pattern_, " ", normalized);
 								    std::vector<std::string> pieces;
 								    Split(normalized, blank_pattern_, pieces);
 								    std::vector<std::string> merged;
 								    for (std::size_t begin = 0; begin < pieces.size();) {
 								        // One-past-the-end of the span that will be emitted; defaults to a single piece.
 								        std::size_t best_end = begin + 1;
 								        const std::size_t limit = std::min(pieces.size() + 1, begin + 6);
 								        for (std::size_t end = begin + 2; end < limit; ++end) {
 								            std::string candidate = Join(pieces, begin, end, "");
 								            if (re2::RE2::PartialMatch(candidate, regex_split_pattern_) && Freq(candidate) > 0) {
 								                // Later (longer) spans overwrite earlier ones, so the longest hit wins.
 								                best_end = end;
 								            }
 								        }
 								        merged.push_back(Join(pieces, begin, best_end, ""));
 								        begin = best_end;
 								    }
 								    return Join(merged, 0, merged.size());
 								}
 								// Position-aware counterpart of Merge().
 								//
 								//   tokens           - input tokens
 								//   positions        - one (start, end) pair per entry of `tokens`; indexed in lockstep
 								//   merged_tokens    - receives the merged tokens (previous content is replaced)
 								//   merged_positions - receives one (start, end) pair per merged token: the start of the
 								//                      first and the end of the last token that went into it
 								//
 								// Empty tokens and tokens equal to a single space are dropped first. Then, exactly as in
 								// Merge(), spans of 2 to 5 consecutive tokens are concatenated and the longest span
 								// whose concatenation matches regex_split_pattern_ and has Freq() > 0 becomes one token.
 								void RAGAnalyzer::MergeWithPosition(const std::vector<std::string> &tokens,
 								                                    const std::vector<std::pair<unsigned, unsigned>> &positions,
 								                                    std::vector<std::string> &merged_tokens,
 								                                    std::vector<std::pair<unsigned, unsigned>> &merged_positions) const {
 								    std::vector<std::string> kept_tokens;
 								    std::vector<std::pair<unsigned, unsigned>> kept_positions;
 								    for (size_t idx = 0; idx < tokens.size(); ++idx) {
 								        const std::string &current = tokens[idx];
 								        if (current.empty() || current == " ") {
 								            continue;
 								        }
 								        kept_tokens.push_back(current);
 								        kept_positions.push_back(positions[idx]);
 								    }
 								    // Results are built in locals and moved out at the end.
 								    std::vector<std::string> out_tokens;
 								    std::vector<std::pair<unsigned, unsigned>> out_positions;
 								    for (std::size_t begin = 0; begin < kept_tokens.size();) {
 								        std::size_t best_end = begin + 1;
 								        const std::size_t limit = std::min(kept_tokens.size() + 1, begin + 6);
 								        for (std::size_t end = begin + 2; end < limit; ++end) {
 								            std::string candidate = Join(kept_tokens, begin, end, "");
 								            if (re2::RE2::PartialMatch(candidate, regex_split_pattern_) && Freq(candidate) > 0) {
 								                best_end = end;
 								            }
 								        }
 								        out_tokens.push_back(Join(kept_tokens, begin, best_end, ""));
 								        const unsigned span_start = kept_positions[begin].first;
 								        const unsigned span_end = kept_positions[best_end - 1].second;
 								        out_positions.emplace_back(span_start, span_end);
 								        begin = best_end;
 								    }
 								    merged_tokens = std::move(out_tokens);
 								    merged_positions = std::move(out_positions);
 								}
 								void RAGAnalyzer::EnglishNormalize(const std::vector<std::string> &tokens, std::vector<std::string> &res) const {
 								    for (auto &t : tokens) {
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

Tokenzier in Infinity is modified in
https://github.com/infiniflow/infinity/pull/3330, sync the code change
to cpp files in ragflow

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								        if (re2::RE2::PartialMatch(t, pattern1_)) { //"[a-zA-Z_-]+$"
 								            // Apply lowercase before lemmatization to match Python NLTK behavior
 								            char *lowercase_term = lowercase_string_buffer_.data();
 								            ToLower(t.c_str(), t.size(), lowercase_term, term_string_buffer_limit_);
 								            std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								            std::string stem_term;
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

Tokenzier in Infinity is modified in
https://github.com/infiniflow/infinity/pull/3330, sync the code change
to cpp files in ragflow

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								            stemmer_->Stem(lemma_term, stem_term);
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								            res.push_back(stem_term);
 								        } else {
 								            res.push_back(t);
 								        }
 								    }
 								}
 								// Splits `line` into maximal runs of characters that are either all Chinese or all
 								// non-Chinese and appends them to `txt_lang_pairs` as (text, is_chinese) pairs.
 								//
 								// `line` is first cut into pieces with Split() on regex_split_pattern_ (the trailing
 								// `true` is forwarded to Split() as in the original; its meaning is defined there).
 								// Each non-empty piece is then scanned one UTF-8 character at a time, and a new pair
 								// is started whenever IsChinese() changes its answer.
 								void RAGAnalyzer::SplitByLang(const std::string &line, std::vector<std::pair<std::string, bool>> &txt_lang_pairs) const {
 								    std::vector<std::string> arr;
 								    Split(line, regex_split_pattern_, arr, true);
 								    for (const auto &a : arr) {
 								        if (a.empty()) {
 								            continue;
 								        }
 								        // The character count does not change while scanning, so compute it once
 								        // instead of once per loop iteration.
 								        const auto char_count = UTF8Length(a);
 								        std::size_t run_start = 0;
 								        std::size_t cursor = run_start + 1;
 								        bool run_is_zh = IsChinese(UTF8Substr(a, run_start, 1));
 								        while (cursor < char_count) {
 								            const bool cursor_is_zh = IsChinese(UTF8Substr(a, cursor, 1));
 								            if (cursor_is_zh == run_is_zh) {
 								                cursor++;
 								                continue;
 								            }
 								            // Language flipped: close the current run and start a new one at `cursor`.
 								            std::string segment = UTF8Substr(a, run_start, cursor - run_start);
 								            txt_lang_pairs.emplace_back(segment, run_is_zh);
 								            run_start = cursor;
 								            cursor = run_start + 1;
 								            run_is_zh = cursor_is_zh;
 								        }
 								        if (run_start >= char_count) {
 								            continue;
 								        }
 								        // Final run of the piece.
 								        std::string segment = UTF8Substr(a, run_start, cursor - run_start);
 								        txt_lang_pairs.emplace_back(segment, run_is_zh);
 								    }
 								}
 								void RAGAnalyzer::TokenizeInner(std::vector<std::string> &res, const std::string &L) const {
 								    auto [tks, s] = MaxForward(L);
 								    auto [tks1, s1] = MaxBackward(L);
 								#if 0
 								    std::size_t i = 0, j = 0, _i = 0, _j = 0, same = 0;
 								    while ((i + same < tks1.size()) && (j + same < tks.size()) && tks1[i + same] == tks[j + same]) {
 								        same++;
 								    }
 								    if (same > 0) {
 								        res.push_back(Join(tks, j, j + same));
 								    }
 								    _i = i + same;
 								    _j = j + same;
 								    j = _j + 1;
 								    i = _i + 1;
 								    while (i < tks1.size() && j < tks.size()) {
 								        std::string tk1 = Join(tks1, _i, i, "");
 								        std::string tk = Join(tks, _j, j, "");
 								        if (tk1 != tk) {
 								            if (tk1.length() > tk.length()) {
 								                j++;
 								            } else {
 								                i++;
 								            }
 								            continue;
 								        }
 								        if (tks1[i] != tks[j]) {
 								            i++;
 								            j++;
 								            continue;
 								        }
 								        std::vector<std::pair<std::string, int>> pre_tokens;
 								        std::vector<std::vector<std::pair<std::string, int>>> token_list;
 								        std::vector<std::string> best_tokens;
 								        double max_score = std::numeric_limits<double>::lowest();
 								        const auto str_for_dfs = Join(tks, _j, j, "");
 								#ifdef INFINITY_DEBUG
 								    const auto t0 = std::chrono::high_resolution_clock::now();
 								#endif
 								    DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false);
 								#ifdef INFINITY_DEBUG
 								    const auto t1 = std::chrono::high_resolution_clock::now();
 								    dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1);
 								#endif
 								    res.push_back(Join(best_tokens, 0));
 								    same = 1;
 								    while (i + same < tks1.size() && j + same < tks.size() && tks1[i + same] == tks[j + same])
 								        same++;
 								    res.push_back(Join(tks, j, j + same));
 								    _i = i + same;
 								    _j = j + same;
 								    j = _j + 1;
 								    i = _i + 1;
 								    }
 								    if (_i < tks1.size()) {
 								        std::vector<std::pair<std::string, int>> pre_tokens;
 								        std::vector<std::vector<std::pair<std::string, int>>> token_list;
 								        std::vector<std::string> best_tokens;
 								        double max_score = std::numeric_limits<double>::lowest();
 								        const auto str_for_dfs = Join(tks, _j, tks.size(), "");
 								#ifdef INFINITY_DEBUG
 								    const auto t0 = std::chrono::high_resolution_clock::now();
 								#endif
 								    DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false);
 								#ifdef INFINITY_DEBUG
 								    const auto t1 = std::chrono::high_resolution_clock::now();
 								    dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1);
 								#endif
 								    res.push_back(Join(best_tokens, 0));
 								    }
 								#else
 								    std::size_t i = 0, j = 0, _i = 0, _j = 0, same = 0;
 								    while ((i + same < tks1.size()) && (j + same < tks.size()) && tks1[i + same] == tks[j + same]) {
 								        same++;
 								    }
 								    if (same > 0) {
 								        res.push_back(Join(tks, j, j + same));
 								    }
 								    _i = i + same;
 								    _j = j + same;
 								    j = _j + 1;
 								    i = _i + 1;
 								    while (i < tks1.size() && j < tks.size()) {
 								        std::string tk1 = Join(tks1, _i, i, "");
 								        std::string tk = Join(tks, _j, j, "");
 								        if (tk1 != tk) {
 								            if (tk1.length() > tk.length()) {
 								                j++;
 								            } else {
 								                i++;
 								            }
 								            continue;
 								        }
 								        if (tks1[i] != tks[j]) {
 								            i++;
 								            j++;
 								            continue;
 								        }
 								        std::vector<std::pair<std::string, int>> pre_tokens;
 								        std::vector<std::vector<std::pair<std::string, int>>> token_list;
 								        std::vector<std::string> best_tokens;
 								        double max_score = std::numeric_limits<double>::lowest();
 								        const auto str_for_dfs = Join(tks, _j, j, "");
 								#ifdef INFINITY_DEBUG
 								        const auto t0 = std::chrono::high_resolution_clock::now();
 								#endif
 								        DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false);
 								#ifdef INFINITY_DEBUG
 								        const auto t1 = std::chrono::high_resolution_clock::now();
 								        dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1);
 								#endif
 								        res.push_back(Join(best_tokens, 0));
 								        same = 1;
 								        while (i + same < tks1.size() && j + same < tks.size() && tks1[i + same] == tks[j + same])
 								            same++;
 								        res.push_back(Join(tks, j, j + same));
 								        _i = i + same;
 								        _j = j + same;
 								        j = _j + 1;
 								        i = _i + 1;
 								    }
 								    if (_i < tks1.size()) {
 								        std::vector<std::pair<std::string, int>> pre_tokens;
 								        std::vector<std::vector<std::pair<std::string, int>>> token_list;
 								        std::vector<std::string> best_tokens;
 								        double max_score = std::numeric_limits<double>::lowest();
 								        const auto str_for_dfs = Join(tks, _j, tks.size(), "");
 								#ifdef INFINITY_DEBUG
 								        const auto t0 = std::chrono::high_resolution_clock::now();
 								#endif
 								        DFS(str_for_dfs, 0, pre_tokens, token_list, best_tokens, max_score, false);
 								#ifdef INFINITY_DEBUG
 								        const auto t1 = std::chrono::high_resolution_clock::now();
 								        dp_debug::CheckDP(this, str_for_dfs, best_tokens, max_score, t0, t1);
 								#endif
 								        res.push_back(Join(best_tokens, 0));
 								    }
 								#endif
 								}
 								// Cuts a long text into pieces of roughly MAX_SENTENCE_LEN characters and appends
 								// them to `sublines`.
 								//
 								//   L        - text to split
 								//   length   - number of UTF-8 characters in `L`, as supplied by the caller
 								//   sublines - receives the pieces in order
 								//
 								// Around each nominal cut (a multiple of MAX_SENTENCE_LEN) a window of up to 10
 								// characters, starting 5 characters before the cut, is segmented with MaxForward()
 								// and MaxBackward(). The cut is placed after the leading tokens on which both
 								// segmentations agree or, if there are none, before the trailing tokens on which
 								// they agree.
 								void RAGAnalyzer::SplitLongText(const std::string &L, uint32_t length, std::vector<std::string> &sublines) const {
 								    uint32_t slice_count = length / MAX_SENTENCE_LEN + 1;
 								    sublines.reserve(slice_count);
 								    std::size_t last_sentence_start = 0;
 								    std::size_t next_sentence_start = 0;
 								    for (unsigned i = 0; i < slice_count; ++i) {
 								        next_sentence_start = MAX_SENTENCE_LEN * (i + 1) - 5;
 								        if (next_sentence_start + 5 < length) {
 								            // The window is 10 characters unless the text ends inside it.
 								            std::size_t sentence_length = MAX_SENTENCE_LEN * (i + 1) + 5 > length ? length - next_sentence_start : 10;
 								            std::string substr = UTF8Substr(L, next_sentence_start, sentence_length);
 								            auto [tks, s] = MaxForward(substr);
 								            auto [tks1, s1] = MaxBackward(substr);
 								            // diff[j] == 1 where the j-th forward and backward tokens differ.
 								            std::vector<int> diff(std::max(tks.size(), tks1.size()), 0);
 								            for (std::size_t j = 0; j < std::min(tks.size(), tks1.size()); ++j) {
 								                if (tks[j] != tks1[j]) {
 								                    diff[j] = 1;
 								                }
 								            }
 								            // Continue with the segmentation that has the higher score.
 								            if (s1 > s) {
 								                tks = tks1;
 								            }
 								            std::size_t start = 0;
 								            std::size_t forward_same_len = 0;
 								            while (start < tks.size() && diff[start] == 0) {
 								                forward_same_len += UTF8Length(tks[start]);
 								                start++;
 								            }
 								            if (forward_same_len == 0) {
 								                // Walk back from the last token while both segmentations agree.
 								                // `end` is kept one past the token under inspection: with an unsigned
 								                // index, testing `end >= 0` is always true and decrementing past 0
 								                // (or starting from size() - 1 on an empty vector) wraps around and
 								                // reads out of bounds.
 								                std::size_t end = tks.size();
 								                std::size_t backward_same_len = 0;
 								                while (end > 0 && diff[end - 1] == 0) {
 								                    backward_same_len += UTF8Length(tks[end - 1]);
 								                    --end;
 								                }
 								                next_sentence_start += sentence_length - backward_same_len;
 								            } else
 								                next_sentence_start += forward_same_len;
 								        } else
 								            next_sentence_start = length;
 								        // Skip slices that would be empty.
 								        if (next_sentence_start == last_sentence_start)
 								            continue;
 								        std::string str = UTF8Substr(L, last_sentence_start, next_sentence_start - last_sentence_start);
 								        sublines.push_back(str);
 								        last_sentence_start = next_sentence_start;
 								    }
 								}
 								// PCRE2-based replacement function to match Python's re.sub behavior
 								// Returns processed string and position mapping from processed to original
 								std::pair<std::string, std::vector<std::pair<unsigned, unsigned>>>
 								PCRE2GlobalReplaceWithPosition(const std::string &text, const std::string &pattern, const std::string &replacement) {
 								    std::vector<std::pair<unsigned, unsigned>> pos_mapping;
 								    std::string result;
 								    pcre2_code *re;
 								    PCRE2_SPTR pcre2_pattern = reinterpret_cast<PCRE2_SPTR>(pattern.c_str());
 								    PCRE2_SPTR pcre2_subject = reinterpret_cast<PCRE2_SPTR>(text.c_str());
 								    // Note: pcre2_replacement is used in the replacement logic below
 								    int errorcode;
 								    PCRE2_SIZE erroroffset;
 								    // Compile the pattern with UTF and UCP flags for Unicode support
 								    re = pcre2_compile(pcre2_pattern, PCRE2_ZERO_TERMINATED, PCRE2_UCP | PCRE2_UTF, &errorcode, &erroroffset, nullptr);
 								    if (re == nullptr) {
 								        PCRE2_UCHAR buffer[256];
 								        pcre2_get_error_message(errorcode, buffer, sizeof(buffer));
 								        std::cerr << "PCRE2 compilation failed at offset " << erroroffset << ": " << buffer << std::endl;
 								        return {text, {}};
 								    }
 								    pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(re, nullptr);
 								    PCRE2_SIZE current_pos = 0;
 								    PCRE2_SIZE last_match_end = 0;
 								    // Process the string match by match
 								    while (current_pos < text.length()) {
 								        int rc = pcre2_match(re, pcre2_subject, text.length(), current_pos, 0, match_data, nullptr);
 								        if (rc < 0) {
 								            // No more matches, copy remaining text
 								            if (last_match_end < text.length()) {
 								                std::string remaining = text.substr(last_match_end);
 								                result += remaining;
 								                // Map each character in remaining text
 								                for (size_t i = 0; i < remaining.length(); ++i) {
 								                    pos_mapping.emplace_back(last_match_end + i, last_match_end + i);
 								                }
 								            }
 								            break;
 								        }
 								        PCRE2_SIZE *ovector = pcre2_get_ovector_pointer(match_data);
 								        PCRE2_SIZE match_start = ovector[0];
 								        PCRE2_SIZE match_end = ovector[1];
 								        // Copy text before the match
 								        if (last_match_end < match_start) {
 								            std::string before_match = text.substr(last_match_end, match_start - last_match_end);
 								            result += before_match;
 								            // Map each character in before_match
 								            for (size_t i = 0; i < before_match.length(); ++i) {
 								                pos_mapping.emplace_back(last_match_end + i, last_match_end + i);
 								            }
 								        }
 								        // Add the replacement string
 								        result += replacement;
 								        // Map each character in replacement to the start of the match
 								        for (size_t i = 0; i < replacement.length(); ++i) {
 								            pos_mapping.emplace_back(match_start, match_start);
 								        }
 								        last_match_end = match_end;
 								        current_pos = match_end;
 								        // If the match was zero-length, move forward one character to avoid infinite loop
 								        if (match_start == match_end) {
 								            if (current_pos < text.length()) {
 								                current_pos++;
 								            } else {
 								                break;
 								            }
 								        }
 								    }
 								    pcre2_match_data_free(match_data);
 								    pcre2_code_free(re);
 								    return {result, pos_mapping};
 								}
 								// Original PCRE2GlobalReplace for backward compatibility: same replacement as
 								// PCRE2GlobalReplaceWithPosition(), but only the processed string is returned and
 								// the position mapping is discarded.
 								std::string PCRE2GlobalReplace(const std::string &text, const std::string &pattern, const std::string &replacement) {
 								    // `.first` of the returned temporary is an xvalue, so the string is moved into the
 								    // return value; returning a structured binding would copy it.
 								    return PCRE2GlobalReplaceWithPosition(text, pattern, replacement).first;
 								}
 								std::string RAGAnalyzer::Tokenize(const std::string &line) const {
 								    // Python-style simple tokenization: re.sub(r"\\W+", " ", line)
 								    std::string processed_line = PCRE2GlobalReplace(line, R"#(\W+)#", " ");
 								    std::string str1 = StrQ2B(processed_line);
 								    std::string strline;
 								    opencc_->convert(str1, strline);
 								    std::vector<std::string> res;
 								    // Use SplitByLang to separate by language
 								    std::vector<std::pair<std::string, bool>> arr;
 								    SplitByLang(strline, arr);
 								    for (const auto &[L, lang] : arr) {
 								        if (!lang) {
 								            // Non-Chinese text: use NLTK tokenizer, lemmatize and stem
 								            std::vector<std::string> term_list;
 								            std::vector<std::string> sentences;
 								            SentenceSplitter(L, sentences);
 								            for (auto &sentence : sentences) {
 								                NLTKWordTokenizer::GetInstance().Tokenize(sentence, term_list);
 								            }
 								            for (unsigned i = 0; i < term_list.size(); ++i) {
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

Tokenzier in Infinity is modified in
https://github.com/infiniflow/infinity/pull/3330, sync the code change
to cpp files in ragflow

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								                // Apply lowercase before lemmatization to match Python NLTK behavior
 								                char *lowercase_term = lowercase_string_buffer_.data();
 								                ToLower(term_list[i].c_str(), term_list[i].size(), lowercase_term, term_string_buffer_limit_);
 								                std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								                std::string stem_term;
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

Tokenzier in Infinity is modified in
https://github.com/infiniflow/infinity/pull/3330, sync the code change
to cpp files in ragflow

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								                stemmer_->Stem(lemma_term, stem_term);
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								                res.push_back(stem_term);
 								            }
 								            continue;
 								        }
 								        auto length = UTF8Length(L);
 								        if (length < 2 || re2::RE2::PartialMatch(L, pattern2_) || re2::RE2::PartialMatch(L, pattern3_)) {
 								            //[a-z\\.-]+$  [0-9\\.-]+$
 								            res.push_back(L);
 								            continue;
 								        }
 								        // Chinese processing: use TokenizeInner
 								#if 0
 								        if (length > MAX_SENTENCE_LEN) {
 								            std::vector<std::string> sublines;
 								            SplitLongText(L, length, sublines);
 								            for (auto &l : sublines) {
 								                TokenizeInner(res, l);
 								            }
 								        } else
 								#endif
 								        TokenizeInner(res, L);
 								    }
 								    // std::vector<std::string> normalize_res;
 								    // EnglishNormalize(res, normalize_res);
 								    std::string r = Join(res, 0);
 								    std::string ret = Merge(r);
 								    return ret;
 								}
 								std::pair<std::vector<std::string>, std::vector<std::pair<unsigned, unsigned>>> RAGAnalyzer::TokenizeWithPosition(const std::string &line) const {
 								    // Python-style simple tokenization: re.sub(r"\W+", " ", line)
 								    // Get processed line and position mapping from PCRE2GlobalReplace
 								    auto [processed_line, pcre2_pos_mapping] = PCRE2GlobalReplaceWithPosition(line, R"#(\W+)#", " ");
 								    std::string str1 = StrQ2B(processed_line);
 								    std::string strline;
 								    opencc_->convert(str1, strline);
 								    std::vector<std::string> tokens;
 								    std::vector<std::pair<unsigned, unsigned>> positions;
 								    // Build character position mapping from StrQ2B conversion
 								    std::vector<unsigned> strq2b_pos_mapping;
 								    BuildPositionMapping(processed_line, str1, strq2b_pos_mapping);
 								    // Build character position mapping from OpenCC conversion
 								    std::vector<unsigned> opencc_pos_mapping;
 								    BuildPositionMapping(str1, strline, opencc_pos_mapping);
 								    // Combine all position mappings: strline -> str1 -> processed_line -> line
 								    std::vector<unsigned> final_pos_mapping;
 								    final_pos_mapping.resize(strline.size() + 1);
 								    for (size_t i = 0; i < strline.size(); ++i) {
 								        if (i < opencc_pos_mapping.size()) {
 								            unsigned str1_pos = opencc_pos_mapping[i];
 								            if (str1_pos < strq2b_pos_mapping.size()) {
 								                unsigned processed_pos = strq2b_pos_mapping[str1_pos];
 								                if (processed_pos < pcre2_pos_mapping.size()) {
 								                    final_pos_mapping[i] = pcre2_pos_mapping[processed_pos].first;
 								                } else {
 								                    final_pos_mapping[i] = static_cast<unsigned>(line.size());
 								                }
 								            } else {
 								                final_pos_mapping[i] = static_cast<unsigned>(line.size());
 								            }
 								        } else {
 								            final_pos_mapping[i] = static_cast<unsigned>(line.size());
 								        }
 								    }
 								    // Fill the last position
 								    if (strline.size() < final_pos_mapping.size()) {
 								        final_pos_mapping[strline.size()] = static_cast<unsigned>(line.size());
 								    }
 								    // Use SplitByLang to separate by language
 								    std::vector<std::pair<std::string, bool>> arr;
 								    SplitByLang(strline, arr);
 								    unsigned current_pos = 0;
 								    for (const auto &[L, lang] : arr) {
 								        if (L.empty()) {
 								            continue;
 								        }
 								        std::size_t processed_pos = strline.find(L, current_pos);
 								        if (processed_pos == std::string::npos) {
 								            continue;
 								        }
 								        unsigned original_start = current_pos;
 								        current_pos = original_start + static_cast<unsigned>(L.size());
 								        if (!lang) {
 								            // Non-Chinese text: use NLTK tokenizer, lemmatize and stem
 								            std::vector<std::string> term_list;
 								            std::vector<std::string> sentences;
 								            SentenceSplitter(L, sentences);
 								            unsigned sentence_start_pos = original_start;
 								            for (auto &sentence : sentences) {
 								                std::vector<std::string> sentence_terms;
 								                NLTKWordTokenizer::GetInstance().Tokenize(sentence, sentence_terms);
 								                unsigned current_search_pos = 0;
 								                for (auto &term : sentence_terms) {
 								                    size_t pos_in_sentence = sentence.find(term, current_search_pos);
 								                    if (pos_in_sentence != std::string::npos) {
 								                        unsigned start_pos = sentence_start_pos + static_cast<unsigned>(pos_in_sentence);
 								                        unsigned end_pos = start_pos + static_cast<unsigned>(term.size());
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

The tokenizer in Infinity was modified in
https://github.com/infiniflow/infinity/pull/3330; this syncs that code change
to the cpp files in ragflow.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								                        // Apply lowercase before lemmatization to match Python NLTK behavior
 								                        char *lowercase_term = lowercase_string_buffer_.data();
 								                        ToLower(term.c_str(), term.size(), lowercase_term, term_string_buffer_limit_);
 								                        std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								                        std::string stem_term;
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

The tokenizer in Infinity was modified in
https://github.com/infiniflow/infinity/pull/3330; this syncs that code change
to the cpp files in ragflow.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								                        stemmer_->Stem(lemma_term, stem_term);
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
 								                        tokens.push_back(stem_term);
 								                        // Map positions back to original string using final_pos_mapping
 								                        if (start_pos < final_pos_mapping.size()) {
 								                            positions.emplace_back(final_pos_mapping[start_pos], final_pos_mapping[end_pos]);
 								                        } else {
 								                            positions.emplace_back(static_cast<unsigned>(line.size()), static_cast<unsigned>(line.size()));
 								                        }
 								                        current_search_pos = pos_in_sentence + term.size();
 								                    }
 								                }
 								                sentence_start_pos += static_cast<unsigned>(sentence.size());
 								            }
 								            continue;
 								        }
 								        auto length = UTF8Length(L);
 								        if (length < 2 || re2::RE2::PartialMatch(L, pattern2_) || re2::RE2::PartialMatch(L, pattern3_)) {
 								            tokens.push_back(L);
 								            // Map positions back to original string using final_pos_mapping
 								            unsigned start_pos = original_start;
 								            unsigned end_pos = original_start + static_cast<unsigned>(L.size());
 								            if (start_pos < final_pos_mapping.size() && end_pos < final_pos_mapping.size()) {
 								                positions.emplace_back(final_pos_mapping[start_pos], final_pos_mapping[end_pos]);
 								            } else {
 								                positions.emplace_back(static_cast<unsigned>(line.size()), static_cast<unsigned>(line.size()));
 								            }
 								            continue;
 								        }
 								        // Chinese processing: use TokenizeInnerWithPosition
 								#if 0
 								        if (length > MAX_SENTENCE_LEN) {
 								            std::vector<std::string> sublines;
 								            SplitLongText(L, length, sublines);
 								            unsigned subline_start_pos = original_start;
 								            for (auto &l : sublines) {
 								                TokenizeInnerWithPosition(l, tokens, positions, subline_start_pos, &final_pos_mapping);
 								                subline_start_pos += static_cast<unsigned>(l.size());
 								            }
 								        } else
 								#endif
 								        TokenizeInnerWithPosition(L, tokens, positions, original_start, &final_pos_mapping);
 								    }
 								    // std::vector<std::string> normalize_tokens;
 								    // std::vector<std::pair<unsigned, unsigned>> normalize_positions;
 								    // EnglishNormalizeWithPosition(tokens, positions, normalize_tokens, normalize_positions);
 								    // Apply MergeWithPosition to match Tokenize behavior
 								    std::vector<std::string> merged_tokens;
 								    std::vector<std::pair<unsigned, unsigned>> merged_positions;
 								    MergeWithPosition(tokens, positions, merged_tokens, merged_positions);
 								    tokens = std::move(merged_tokens);
 								    positions = std::move(merged_positions);
 								    return {std::move(tokens), std::move(positions)};
 								}
 								// Looks up `processed_pos` in `mapping`, whose entries are (original, processed) pairs, and
 								// returns the original offset of the first entry whose processed offset matches.
 								// When no entry matches, `processed_pos` itself is returned.
 								unsigned RAGAnalyzer::MapToOriginalPosition(unsigned processed_pos, const std::vector<std::pair<unsigned, unsigned>> &mapping) const {
 								    for (auto entry = mapping.begin(); entry != mapping.end(); ++entry) {
 								        if (entry->second == processed_pos) {
 								            return entry->first;
 								        }
 								    }
 								    // Nothing recorded for this offset: hand the input back unchanged.
 								    return processed_pos;
 								}
 								// Sums the byte sizes of tokens[start] .. tokens[end - 1].
 								// Returns 0 when the range is empty (start >= end). The indices are not validated,
 								// so the caller must keep them inside `tokens`.
 								static unsigned CalculateTokensLength(const std::vector<std::string> &tokens, int start, int end) {
 								    unsigned byte_count = 0;
 								    int idx = start;
 								    while (idx < end) {
 								        byte_count += static_cast<unsigned>(tokens[idx].size());
 								        ++idx;
 								    }
 								    return byte_count;
 								}
 								// Segments `L` by reconciling forward (MaxForward) and backward (MaxBackward) maximum
 								// matching, appending the resulting tokens to `tokens` and their ranges to `positions`.
 								//
 								// Stretches on which both segmentations agree are emitted as-is; stretches on which they
 								// disagree are re-segmented with DFS and the best-scoring split is emitted.
 								//
 								// Offsets are byte offsets derived from the sizes of the forward tokens, shifted by
 								// `base_pos`. When `pos_mapping` is non-null every offset is translated through it before
 								// being stored; an offset at or beyond the end of the mapping is stored as 0. When it is
 								// null the raw offsets are stored.
 								void RAGAnalyzer::TokenizeInnerWithPosition(const std::string &L,
 								                                            std::vector<std::string> &tokens,
 								                                            std::vector<std::pair<unsigned, unsigned>> &positions,
 								                                            unsigned base_pos,
 								                                            const std::vector<unsigned> *pos_mapping) const {
 								    // NOTE: the helpers below are defined before the structured bindings on purpose, so they
 								    // cannot capture them (capturing structured bindings in a lambda is not allowed in C++17).
 								
 								    // Translates an offset through pos_mapping when one was supplied.
 								    const auto map_offset = [pos_mapping](unsigned offset) -> unsigned {
 								        if (pos_mapping == nullptr) {
 								            return offset;
 								        }
 								        return offset < pos_mapping->size() ? (*pos_mapping)[offset] : 0;
 								    };
 								
 								    // Appends `text` covering [start_pos, end_pos). If `text` contains a space it is split on
 								    // blank_pattern_ and each non-empty piece is appended separately; piece offsets advance by
 								    // the piece sizes only (the separating blanks are not counted) and `end_pos` is not used.
 								    const auto emit = [&](const std::string &text, unsigned start_pos, unsigned end_pos) {
 								        if (text.find(' ') == std::string::npos) {
 								            tokens.push_back(text);
 								            positions.emplace_back(map_offset(start_pos), map_offset(end_pos));
 								            return;
 								        }
 								        std::vector<std::string> pieces;
 								        Split(text, blank_pattern_, pieces, false);
 								        unsigned cursor = start_pos;
 								        for (const auto &piece : pieces) {
 								            if (piece.empty()) {
 								                continue;
 								            }
 								            const unsigned piece_len = static_cast<unsigned>(piece.size());
 								            tokens.push_back(piece);
 								            positions.emplace_back(map_offset(cursor), map_offset(cursor + piece_len));
 								            cursor += piece_len;
 								        }
 								    };
 								
 								    // Re-segments an ambiguous stretch with DFS and emits the best split. The emitted range
 								    // always spans the whole original stretch: [start_pos, start_pos + segment.size()).
 								    const auto resolve_with_dfs = [&](const std::string &segment, unsigned start_pos) {
 								        std::vector<std::pair<std::string, int>> pre_tokens;
 								        std::vector<std::vector<std::pair<std::string, int>>> token_list;
 								        std::vector<std::string> best_tokens;
 								        double max_score = std::numeric_limits<double>::lowest();
 								#ifdef INFINITY_DEBUG
 								        const auto t0 = std::chrono::high_resolution_clock::now();
 								#endif
 								        DFS(segment, 0, pre_tokens, token_list, best_tokens, max_score, false);
 								#ifdef INFINITY_DEBUG
 								        const auto t1 = std::chrono::high_resolution_clock::now();
 								        dp_debug::CheckDP(this, segment, best_tokens, max_score, t0, t1);
 								#endif
 								        const std::string best_token_str = Join(best_tokens, 0);
 								        emit(best_token_str, start_pos, start_pos + static_cast<unsigned>(segment.size()));
 								    };
 								
 								    auto [tks, s] = MaxForward(L);
 								    auto [tks1, s1] = MaxBackward(L);
 								    // Use the same algorithm as Python version
 								    std::size_t i = 0, j = 0, _i = 0, _j = 0, same = 0;
 								
 								    // Leading run on which both segmentations agree.
 								    while ((i + same < tks1.size()) && (j + same < tks.size()) && tks1[i + same] == tks[j + same]) {
 								        same++;
 								    }
 								    if (same > 0) {
 								        const std::string token_str = Join(tks, j, j + same);
 								        const unsigned start_pos = base_pos + CalculateTokensLength(tks, 0, j);
 								        emit(token_str, start_pos, start_pos + static_cast<unsigned>(token_str.size()));
 								    }
 								    _i = i + same;
 								    _j = j + same;
 								    j = _j + 1;
 								    i = _i + 1;
 								
 								    while (i < tks1.size() && j < tks.size()) {
 								        // [_i, i) of the backward tokens and [_j, j) of the forward tokens are the current
 								        // disagreeing windows; grow the shorter one until both cover the same text.
 								        const std::string tk1 = Join(tks1, _i, i, "");
 								        const std::string tk = Join(tks, _j, j, "");
 								        if (tk1 != tk) {
 								            if (tk1.length() > tk.length()) {
 								                j++;
 								            } else {
 								                i++;
 								            }
 								            continue;
 								        }
 								        if (tks1[i] != tks[j]) {
 								            i++;
 								            j++;
 								            continue;
 								        }
 								
 								        // Handle different part with DFS (tk is exactly the forward window's text).
 								        resolve_with_dfs(tk, base_pos + CalculateTokensLength(tks, 0, _j));
 								
 								        // Handle same part after different tokens
 								        same = 1;
 								        while (i + same < tks1.size() && j + same < tks.size() && tks1[i + same] == tks[j + same]) {
 								            same++;
 								        }
 								        const std::string token_str = Join(tks, j, j + same);
 								        const unsigned start_pos = base_pos + CalculateTokensLength(tks, 0, j);
 								        emit(token_str, start_pos, start_pos + static_cast<unsigned>(token_str.size()));
 								
 								        _i = i + same;
 								        _j = j + same;
 								        j = _j + 1;
 								        i = _i + 1;
 								    }
 								
 								    // Handle remaining part
 								    if (_i < tks1.size()) {
 								        const std::string remainder = Join(tks, _j, tks.size(), "");
 								        resolve_with_dfs(remainder, base_pos + CalculateTokensLength(tks, 0, _j));
 								    }
 								}
 								void RAGAnalyzer::EnglishNormalizeWithPosition(const std::vector<std::string> &tokens,
 								                                               const std::vector<std::pair<unsigned, unsigned>> &positions,
 								                                               std::vector<std::string> &normalize_tokens,
 								                                               std::vector<std::pair<unsigned, unsigned>> &normalize_positions) const {
 								    for (size_t i = 0; i < tokens.size(); ++i) {
 								        const auto &token = tokens[i];
 								        const auto &[start_pos, end_pos] = positions[i];
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

Tokenzier in Infinity is modified in
https://github.com/infiniflow/infinity/pull/3330, sync the code change
to cpp files in ragflow

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								        if (re2::RE2::PartialMatch(token, pattern1_)) { //"[a-zA-Z_-]+$"
 								            // Apply lowercase before lemmatization to match Python NLTK behavior
 								            char *lowercase_term = lowercase_string_buffer_.data();
 								            ToLower(token.c_str(), token.size(), lowercase_term, term_string_buffer_limit_);
 								            std::string lemma_term = wordnet_lemma_->Lemmatize(lowercase_term);
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
+								            std::string stem_term;
-												Fix tokenizer in cpp (#13735)

### What problem does this PR solve?

Tokenzier in Infinity is modified in
https://github.com/infiniflow/infinity/pull/3330, sync the code change
to cpp files in ragflow

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
											
										
										
											2026-03-23 15:40:35 +08:00
+								            stemmer_->Stem(lemma_term, stem_term);
-												RAGFlow go API server (#13240)

# RAGFlow Go Implementation Plan 🚀

This repository tracks the progress of porting RAGFlow to Go. We'll
implement core features and provide performance comparisons between
Python and Go versions.

## Implementation Checklist

- [x] User Management APIs
- [x] Dataset Management Operations
- [x] Retrieval Test
- [x] Chat Management Operations
- [x] Infinity Go SDK

---------

Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Yingfeng Zhang <yingfeng.zhang@gmail.com>
											
										
										
											2026-03-04 19:17:16 +08:00
 								            normalize_tokens.push_back(stem_term);
 								            normalize_positions.emplace_back(start_pos, end_pos);
 								        } else {
 								            normalize_tokens.push_back(token);
 								            normalize_positions.emplace_back(start_pos, end_pos);
 								        }
 								    }
 								}
 								// Position-carrying fine-grained tokenization.
 								//
 								// `tokens_str` is split on blank_pattern_; the i-th resulting token is paired with
 								// positions[i]. Results are appended to `fine_tokens` / `fine_positions` (index-aligned).
 								//
 								//  * If the number of Chinese characters is below 20% of the token count, each token is
 								//    split on '/' and every non-empty piece is emitted with its byte range inside the
 								//    token. No lowercasing or stemming is applied on this path.
 								//  * Otherwise tokens shorter than 3 characters, tokens matching pattern4_, and tokens
 								//    longer than 10 characters are emitted whole; the remaining tokens are re-segmented
 								//    with DFS and the second-ranked segmentation is emitted, unless it is purely
 								//    per-character or (for tokens matching pattern5_) contains a piece shorter than 3
 								//    characters, in which case the token is emitted whole.
 								//
 								// Tokens without a matching entry in `positions` are not processed, so the function never
 								// reads past the end of `positions`.
 								void RAGAnalyzer::FineGrainedTokenizeWithPosition(const std::string &tokens_str,
 								                                                  const std::vector<std::pair<unsigned, unsigned>> &positions,
 								                                                  std::vector<std::string> &fine_tokens,
 								                                                  std::vector<std::pair<unsigned, unsigned>> &fine_positions) const {
 								    std::vector<std::string> tks;
 								    Split(tokens_str, blank_pattern_, tks);
 								    // positions is indexed in parallel with tks; only the common prefix can be paired up.
 								    const std::size_t paired_count = tks.size() < positions.size() ? tks.size() : positions.size();
 								    std::size_t zh_num = 0;
 								    for (auto &token : tks) {
 								        int len = UTF8Length(token);
 								        for (int i = 0; i < len; ++i) {
 								            std::string t = UTF8Substr(token, i, 1);
 								            if (IsChinese(t)) {
 								                zh_num++;
 								            }
 								        }
 								    }
 								    if (zh_num < tks.size() * 0.2) {
 								        // English text processing: split each token on '/'.
 								        for (size_t i = 0; i < paired_count; ++i) {
 								            const auto &token = tks[i];
 								            std::istringstream iss(token);
 								            std::string sub_token;
 								            unsigned sub_start = positions[i].first;
 								            while (std::getline(iss, sub_token, '/')) {
 								                const unsigned sub_len = static_cast<unsigned>(sub_token.size());
 								                if (!sub_token.empty()) {
 								                    fine_tokens.push_back(sub_token);
 								                    fine_positions.emplace_back(sub_start, sub_start + sub_len);
 								                }
 								                // Always step over the piece and its '/' delimiter, including for empty
 								                // pieces ("a//b", "/a"); otherwise later pieces would be placed too early.
 								                sub_start += sub_len + 1;
 								            }
 								        }
 								    } else {
 								        // Chinese or mixed text processing - match FineGrainedTokenize behavior
 								        for (size_t i = 0; i < paired_count; ++i) {
 								            const auto &token = tks[i];
 								            const auto &[start_pos, end_pos] = positions[i];
 								            const auto token_len = UTF8Length(token);
 								            if (token_len < 3 || re2::RE2::PartialMatch(token, pattern4_)) {
 								                fine_tokens.push_back(token);
 								                fine_positions.emplace_back(start_pos, end_pos);
 								                continue;
 								            }
 								            std::vector<std::vector<std::pair<std::string, int>>> token_list;
 								            if (token_len > 10) {
 								                // Long tokens get a single candidate, so they are emitted whole below.
 								                std::vector<std::pair<std::string, int>> tk;
 								                tk.emplace_back(token, Encode(-1, 0));
 								                token_list.push_back(tk);
 								            } else {
 								                std::vector<std::pair<std::string, int>> pre_tokens;
 								                std::vector<std::string> best_tokens;
 								                double max_score = 0.0F;
 								                DFS(token, 0, pre_tokens, token_list, best_tokens, max_score, true);
 								            }
 								            if (token_list.size() < 2) {
 								                fine_tokens.push_back(token);
 								                fine_positions.emplace_back(start_pos, end_pos);
 								                continue;
 								            }
 								            std::vector<std::pair<std::vector<std::string>, double>> sorted_tokens;
 								            SortTokens(token_list, sorted_tokens);
 								            // Use the second-ranked segmentation as the finer-grained alternative.
 								            const auto &stk = sorted_tokens[1].first;
 								
 								            // Keep the token whole when the alternative is one piece per character, or when
 								            // the token matches pattern5_ and the alternative has a piece under 3 characters.
 								            bool keep_whole = (stk.size() == token_len);
 								            if (!keep_whole && re2::RE2::PartialMatch(token, pattern5_)) {
 								                for (auto &t : stk) {
 								                    if (UTF8Length(t) < 3) {
 								                        keep_whole = true;
 								                        break;
 								                    }
 								                }
 								            }
 								            if (keep_whole) {
 								                fine_tokens.push_back(token);
 								                fine_positions.emplace_back(start_pos, end_pos);
 								                continue;
 								            }
 								            // Lay the pieces out back to back from start_pos, advancing by byte size so the
 								            // offsets use the same unit as the incoming positions.
 								            unsigned sub_pos = start_pos;
 								            for (auto &t : stk) {
 								                unsigned sub_end = sub_pos + static_cast<unsigned>(t.size());
 								                fine_tokens.push_back(t);
 								                fine_positions.emplace_back(sub_pos, sub_end);
 								                sub_pos = sub_end;
 								            }
 								        }
 								    }
 								}
 								void RAGAnalyzer::FineGrainedTokenize(const std::string &tokens, std::vector<std::string> &result) const {
 								    std::vector<std::string> tks;
 								    Split(tokens, blank_pattern_, tks);
 								    std::vector<std::string> res;
 								    std::size_t zh_num = 0;
 								    for (auto &token : tks) {
 								        int len = UTF8Length(token);
 								        for (int i = 0; i < len; ++i) {
 								            std::string t = UTF8Substr(token, i, 1);
 								            if (IsChinese(t)) {
 								                zh_num++;
 								            }
 								        }
 								    }
 								    if (zh_num < tks.size() * 0.2) {
 								        for (auto &token : tks) {
 								            std::istringstream iss(token);
 								            std::string sub_token;
 								            while (std::getline(iss, sub_token, '/')) {
 								                result.push_back(sub_token);
 								            }
 								        }
 								        // std::string ret = Join(res, 0);
 								        return;
 								    }
 								    for (auto &token : tks) {
 								        const auto token_len = UTF8Length(token);
 								        if (token_len < 3 || re2::RE2::PartialMatch(token, pattern4_)) {
 								            //[0-9,\\.-]+$
 								            res.push_back(token);
 								            continue;
 								        }
 								        std::vector<std::vector<std::pair<std::string, int>>> token_list;
 								        if (token_len > 10) {
 								            std::vector<std::pair<std::string, int>> tk;
 								            tk.emplace_back(token, Encode(-1, 0));
 								            token_list.push_back(tk);
 								        } else {
 								            std::vector<std::pair<std::string, int>> pre_tokens;
 								            std::vector<std::string> best_tokens;
 								            double max_score = 0.0F;
 								#ifdef INFINITY_DEBUG
 								            const auto t0 = std::chrono::high_resolution_clock::now();
 								#endif
 								            DFS(token, 0, pre_tokens, token_list, best_tokens, max_score, true);
 								#ifdef INFINITY_DEBUG
 								            const auto t1 = std::chrono::high_resolution_clock::now();
 								            auto get_dfs_sorted_tokens = [&]() {
 								                std::vector<std::pair<std::vector<std::string>, double>> sorted_tokens;
 								                SortTokens(token_list, sorted_tokens);
 								                return sorted_tokens;
 								            };
 								            dp_debug::CheckDP2(this, token, get_dfs_sorted_tokens, t0, t1);
 								#endif
 								        }
 								        if (token_list.size() < 2) {
 								            res.push_back(token);
 								            continue;
 								        }
 								        std::vector<std::pair<std::vector<std::string>, double>> sorted_tokens;
 								        SortTokens(token_list, sorted_tokens);
 								        const auto &stk = sorted_tokens[1].first;
 								        if (stk.size() == token_len) {
 								            res.push_back(token);
 								        } else if (re2::RE2::PartialMatch(token, pattern5_)) {
 								            // [a-z\\.-]+
 								            bool need_append_stk = true;
 								            for (auto &t : stk) {
 								                if (UTF8Length(t) < 3) {
 								                    res.push_back(token);
 								                    need_append_stk = false;
 								                    break;
 								                }
 								            }
 								            if (need_append_stk) {
 								                for (auto &t : stk) {
 								                    res.push_back(t);
 								                }
 								            }
 								        } else {
 								            for (auto &t : stk) {
 								                res.push_back(t);
 								            }
 								        }
 								    }
 								    EnglishNormalize(res, result);
 								    // std::string ret = Join(normalize_res, 0);
 								    // return ret;
 								}
 								int RAGAnalyzer::AnalyzeImpl(const Term &input, void *data, bool fine_grained, bool enable_position, HookType func) const {
 								    if (enable_position) {
 								        auto [tokens, positions] = TokenizeWithPosition(input.text_);
 								        if (fine_grained) {
 								            std::vector<std::string> fine_tokens;
 								            std::vector<std::pair<unsigned, unsigned>> fine_positions;
 								            FineGrainedTokenizeWithPosition(Join(tokens, 0), positions, fine_tokens, fine_positions);
 								            tokens = std::move(fine_tokens);
 								            positions = std::move(fine_positions);
 								        }
 								        for (size_t i = 0; i < tokens.size(); ++i) {
 								            if (tokens[i].empty())
 								                continue;
 								            const auto &[start_pos, end_pos] = positions[i];
 								            func(data, tokens[i].c_str(), tokens[i].size(), start_pos, end_pos, false, 0);
 								        }
 								    } else {
 								        std::string result = Tokenize(input.text_);
 								        std::vector<std::string> tokens;
 								        if (fine_grained) {
 								            FineGrainedTokenize(result, tokens);
 								        } else {
 								            Split(result, blank_pattern_, tokens);
 								        }
 								        unsigned offset = 0;
 								        for (auto &t : tokens) {
 								            if (t.empty())
 								                continue;
 								            func(data, t.c_str(), t.size(), offset++, 0, false, 0);
 								        }
 								    }
 								    return 0;
 								}